summaryrefslogtreecommitdiffstats
path: root/freebsd/sys/netinet/tcp_timer.c
diff options
context:
space:
mode:
Diffstat (limited to 'freebsd/sys/netinet/tcp_timer.c')
-rw-r--r--freebsd/sys/netinet/tcp_timer.c583
1 files changed, 431 insertions, 152 deletions
diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c
index db952e42..edfc3829 100644
--- a/freebsd/sys/netinet/tcp_timer.c
+++ b/freebsd/sys/netinet/tcp_timer.c
@@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_tcpdebug.h>
+#include <rtems/bsd/local/opt_rss.h>
#include <rtems/bsd/sys/param.h>
#include <sys/kernel.h>
@@ -52,24 +53,40 @@ __FBSDID("$FreeBSD$");
#include <net/if.h>
#include <net/route.h>
+#include <net/rss_config.h>
#include <net/vnet.h>
+#include <net/netisr.h>
-#include <netinet/cc.h>
#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
+#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
+#include <netinet/cc/cc.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
+int tcp_persmin;
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");
+
+int tcp_persmax;
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");
+
int tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
&tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
@@ -121,17 +138,110 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
/* max idle probes */
int tcp_maxpersistidle;
-static int tcp_rexmit_drop_options = 1;
+static int tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
&tcp_rexmit_drop_options, 0,
"Drop TCP options from 3rd and later retransmitted SYN");
+static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
+#define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
+ CTLFLAG_RW|CTLFLAG_VNET,
+ &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
+ "Path MTU Discovery Black Hole Detection Enabled");
+
+static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
+#define V_tcp_pmtud_blackhole_activated \
+ VNET(tcp_pmtud_blackhole_activated)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
+ CTLFLAG_RD|CTLFLAG_VNET,
+ &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
+ "Path MTU Discovery Black Hole Detection, Activation Count");
+
+static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
+#define V_tcp_pmtud_blackhole_activated_min_mss \
+ VNET(tcp_pmtud_blackhole_activated_min_mss)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
+ CTLFLAG_RD|CTLFLAG_VNET,
+ &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
+ "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");
+
+static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
+#define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
+ CTLFLAG_RD|CTLFLAG_VNET,
+ &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
+ "Path MTU Discovery Black Hole Detection, Failure Count");
+
+#ifdef INET
+static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
+#define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
+ CTLFLAG_RW|CTLFLAG_VNET,
+ &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
+ "Path MTU Discovery Black Hole Detection lowered MSS");
+#endif
+
+#ifdef INET6
+static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
+#define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
+ CTLFLAG_RW|CTLFLAG_VNET,
+ &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
+ "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
+#endif
+
+#ifdef RSS
+static int per_cpu_timers = 1;
+#else
static int per_cpu_timers = 0;
+#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
&per_cpu_timers , 0, "run tcp timers on all cpus");
+#if 0
#define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
+#endif
+
+/*
+ * Map the given inp to a CPU id.
+ *
+ * With per_cpu_timers set this queries RSS if it's compiled in, else it
+ * hashes the flowid; unmappable ids use curcpu. Otherwise it returns 0.
+ */
+static inline int
+inp_to_cpuid(struct inpcb *inp)
+{
+ u_int cpuid;
+
+#ifdef RSS
+ if (per_cpu_timers) {
+ cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
+ if (cpuid == NETISR_CPUID_NONE)
+ return (curcpu); /* XXX */
+ else
+ return (cpuid);
+ }
+#else
+ /* Legacy, pre-RSS behaviour */
+ if (per_cpu_timers) {
+ /*
+ * We don't have a flowid -> cpuid mapping, so cheat and
+ * just map unknown cpuids to curcpu. Not the best, but
+ * apparently better than defaulting to swi 0.
+ */
+ cpuid = inp->inp_flowid % (mp_maxid + 1);
+ if (! CPU_ABSENT(cpuid))
+ return (cpuid);
+ return (curcpu);
+ }
+#endif
+ /* Default for RSS and non-RSS - cpuid 0 */
+ else {
+ return (0);
+ }
+}
/*
* Tcp protocol timeout routine called every 500 ms.
@@ -146,9 +256,7 @@ tcp_slowtimo(void)
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- INP_INFO_WLOCK(&V_tcbinfo);
(void) tcp_tw_2msl_scan(0);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
@@ -162,10 +270,6 @@ int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
-static int tcp_timer_race;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race,
- 0, "Count of t_inpcb races on tcp_discardcb");
-
/*
* TCP timer processing.
*/
@@ -178,18 +282,7 @@ tcp_timer_delack(void *xtp)
CURVNET_SET(tp->t_vnet);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_delack) ||
!callout_active(&tp->t_timers->tt_delack)) {
@@ -203,14 +296,65 @@ tcp_timer_delack(void *xtp)
CURVNET_RESTORE();
return;
}
-
tp->t_flags |= TF_ACKNOW;
TCPSTAT_INC(tcps_delack);
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
INP_WUNLOCK(inp);
CURVNET_RESTORE();
}
+/*
+ * When a timer wants to remove a TCB it must
+ * hold the INP_INFO_RLOCK(). The timer function
+ * should only have grabbed the INP_WLOCK() when
+ * it entered. To safely switch to holding both the
+ * INP_INFO_RLOCK() and the INP_WLOCK() we must first
+ * grab a reference on the inp, which will hold the inp
+ * so that it can't be removed. We then unlock the INP_WLOCK(),
+ * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK()
+ * we proceed again to get the INP_WLOCK() (this preserves proper
+ * lock order). After acquiring the INP_WLOCK we must check if someone
+ * else deleted the pcb i.e. the inp_flags check.
+ * If so we return 1 otherwise we return 0.
+ *
+ * No matter what the tcp_inpinfo_lock_add() function
+ * returns the caller must afterwards call tcp_inpinfo_lock_del()
+ * to drop the locks and reference properly.
+ */
+
+int
+tcp_inpinfo_lock_add(struct inpcb *inp)
+{
+ in_pcbref(inp);
+ INP_WUNLOCK(inp);
+ INP_INFO_RLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ return(1);
+ }
+ return(0);
+
+}
+
+void
+tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp)
+{
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ if (inp && (tp == NULL)) {
+ /*
+ * If tcp_close/drop() gets called and tp
+ * returns NULL, then the function dropped
+ * the inp lock, we hold a reference keeping
+ * this around, so we must re-acquire the
+ * INP_WLOCK() in order to proceed with
+ * our dropping the inp reference.
+ */
+ INP_WLOCK(inp);
+ }
+ if (inp && in_pcbrele_wlocked(inp) == 0)
+ INP_WUNLOCK(inp);
+}
+
void
tcp_timer_2msl(void *xtp)
{
@@ -222,62 +366,66 @@ tcp_timer_2msl(void *xtp)
ostate = tp->t_state;
#endif
- /*
- * XXXRW: Does this actually happen?
- */
- INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
tcp_free_sackholes(tp);
if (callout_pending(&tp->t_timers->tt_2msl) ||
!callout_active(&tp->t_timers->tt_2msl)) {
INP_WUNLOCK(tp->t_inpcb);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_2msl);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
/*
* 2 MSL timeout in shutdown went off. If we're closed but
* still waiting for peer to close and connection has been idle
- * too long, or if 2MSL time is up from TIME_WAIT, delete connection
- * control block. Otherwise, check again in a bit.
+ * too long, delete the connection control block. Otherwise, check
+ * again in a bit.
+ *
+ * If in TIME_WAIT state just ignore as this timeout is handled in
+ * tcp_tw_2msl_scan().
*
* If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
* there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
* Ignore fact that there were recent incoming segments.
*/
+ if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
+ INP_WUNLOCK(inp);
+ CURVNET_RESTORE();
+ return;
+ }
if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
tp->t_inpcb && tp->t_inpcb->inp_socket &&
(tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
TCPSTAT_INC(tcps_finwait2_drops);
+ if (tcp_inpinfo_lock_add(inp)) {
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
+ }
tp = tcp_close(tp);
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
} else {
- if (tp->t_state != TCPS_TIME_WAIT &&
- ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
- callout_reset_on(&tp->t_timers->tt_2msl,
- TP_KEEPINTVL(tp), tcp_timer_2msl, tp, INP_CPU(inp));
- else
- tp = tcp_close(tp);
+ if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
+ callout_reset(&tp->t_timers->tt_2msl,
+ TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
+ } else {
+ if (tcp_inpinfo_lock_add(inp)) {
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
+ }
+ tp = tcp_close(tp);
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
+ }
}
#ifdef TCPDEBUG
@@ -285,9 +433,11 @@ tcp_timer_2msl(void *xtp)
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
+ TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
+
if (tp != NULL)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+out:
CURVNET_RESTORE();
}
@@ -303,36 +453,23 @@ tcp_timer_keep(void *xtp)
ostate = tp->t_state;
#endif
- INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_keep) ||
!callout_active(&tp->t_timers->tt_keep)) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_keep);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
/*
* Keep-alive timer went off; send something
* or drop connection if idle for too long.
@@ -364,24 +501,29 @@ tcp_timer_keep(void *xtp)
tp->rcv_nxt, tp->snd_una - 1, 0);
free(t_template, M_TEMP);
}
- callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
- tcp_timer_keep, tp, INP_CPU(inp));
+ callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
+ tcp_timer_keep, tp);
} else
- callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
- tcp_timer_keep, tp, INP_CPU(inp));
+ callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
+ tcp_timer_keep, tp);
#ifdef TCPDEBUG
if (inp->inp_socket->so_options & SO_DEBUG)
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
+ TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
dropit:
TCPSTAT_INC(tcps_keepdrops);
+
+ if (tcp_inpinfo_lock_add(inp)) {
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
+ }
tp = tcp_drop(tp, ETIMEDOUT);
#ifdef TCPDEBUG
@@ -389,9 +531,9 @@ dropit:
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
- if (tp != NULL)
- INP_WUNLOCK(tp->t_inpcb);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
+ tcp_inpinfo_lock_del(inp, tp);
+out:
CURVNET_RESTORE();
}
@@ -406,38 +548,25 @@ tcp_timer_persist(void *xtp)
ostate = tp->t_state;
#endif
- INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_persist) ||
!callout_active(&tp->t_timers->tt_persist)) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_persist);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
/*
- * Persistance timer into zero window.
+ * Persistence timer into zero window.
* Force a byte to be output, if possible.
*/
TCPSTAT_INC(tcps_persisttimeo);
@@ -452,7 +581,12 @@ tcp_timer_persist(void *xtp)
(ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
TCPSTAT_INC(tcps_persistdrop);
+ if (tcp_inpinfo_lock_add(inp)) {
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
+ }
tp = tcp_drop(tp, ETIMEDOUT);
+ tcp_inpinfo_lock_del(inp, tp);
goto out;
}
/*
@@ -462,22 +596,26 @@ tcp_timer_persist(void *xtp)
if (tp->t_state > TCPS_CLOSE_WAIT &&
(ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
TCPSTAT_INC(tcps_persistdrop);
+ if (tcp_inpinfo_lock_add(inp)) {
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
+ }
tp = tcp_drop(tp, ETIMEDOUT);
+ tcp_inpinfo_lock_del(inp, tp);
goto out;
}
tcp_setpersist(tp);
tp->t_flags |= TF_FORCEDATA;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
tp->t_flags &= ~TF_FORCEDATA;
-out:
#ifdef TCPDEBUG
if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
- if (tp != NULL)
- INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
+ INP_WUNLOCK(inp);
+out:
CURVNET_RESTORE();
}
@@ -487,44 +625,34 @@ tcp_timer_rexmt(void * xtp)
struct tcpcb *tp = xtp;
CURVNET_SET(tp->t_vnet);
int rexmt;
- int headlocked;
struct inpcb *inp;
#ifdef TCPDEBUG
int ostate;
ostate = tp->t_state;
#endif
- INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_RUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
INP_WLOCK(inp);
if (callout_pending(&tp->t_timers->tt_rexmt) ||
!callout_active(&tp->t_timers->tt_rexmt)) {
INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_rexmt);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
+ KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+ ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
tcp_free_sackholes(tp);
+ if (tp->t_fb->tfb_tcp_rexmit_tmr) {
+ /* The stack has a timer action too. */
+ (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
+ }
/*
* Retransmission timer went off. Message has not
* been acked within retransmit interval. Back off
@@ -533,30 +661,15 @@ tcp_timer_rexmt(void * xtp)
if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
tp->t_rxtshift = TCP_MAXRXTSHIFT;
TCPSTAT_INC(tcps_timeoutdrop);
- in_pcbref(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
- INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- INP_WLOCK(inp);
- if (in_pcbrele_wlocked(inp)) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
- if (inp->inp_flags & INP_DROPPED) {
- INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
+ if (tcp_inpinfo_lock_add(inp)) {
+ tcp_inpinfo_lock_del(inp, tp);
+ goto out;
}
-
tp = tcp_drop(tp, tp->t_softerror ?
tp->t_softerror : ETIMEDOUT);
- headlocked = 1;
+ tcp_inpinfo_lock_del(inp, tp);
goto out;
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
- headlocked = 0;
if (tp->t_state == TCPS_SYN_SENT) {
/*
* If the SYN was retransmitted, indicate CWND to be
@@ -589,12 +702,120 @@ tcp_timer_rexmt(void * xtp)
} else
tp->t_flags &= ~TF_PREVVALID;
TCPSTAT_INC(tcps_rexmttimeo);
- if (tp->t_state == TCPS_SYN_SENT)
+ if ((tp->t_state == TCPS_SYN_SENT) ||
+ (tp->t_state == TCPS_SYN_RECEIVED))
rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
else
rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
TCPT_RANGESET(tp->t_rxtcur, rexmt,
tp->t_rttmin, TCPTV_REXMTMAX);
+
+ /*
+ * We enter the PLPMTUD path if the connection is ESTABLISHED or in
+ * FIN_WAIT_1. The latter is included because, if the amount of data
+ * we send is very small, we could send it in a couple of packets and
+ * proceed straight to FIN. In that case we won't catch the
+ * ESTABLISHED state.
+ */
+ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
+ || (tp->t_state == TCPS_FIN_WAIT_1))) {
+#ifdef INET6
+ int isipv6;
+#endif
+
+ /*
+ * The idea here is that each stage of the MTU probe (usually 1448
+ * -> 1188 -> 524) should be given 2 chances to recover before
+ * further clamping down. 'tp->t_rxtshift % 2 == 0' should
+ * take care of that.
+ */
+ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
+ (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
+ (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
+ /*
+ * Enter Path MTU Black-hole Detection mechanism:
+ * - Disable Path MTU Discovery (IP "DF" bit).
+ * - Reduce MTU to lower value than what we
+ * negotiated with peer.
+ */
+ /* Record that we may have found a black hole. */
+ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
+
+ /* Keep track of previous MSS. */
+ tp->t_pmtud_saved_maxseg = tp->t_maxseg;
+
+ /*
+ * Reduce the MSS to blackhole value or to the default
+ * in an attempt to retransmit.
+ */
+#ifdef INET6
+ isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
+ if (isipv6 &&
+ tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
+ /* Use the sysctl tuneable blackhole MSS. */
+ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
+ V_tcp_pmtud_blackhole_activated++;
+ } else if (isipv6) {
+ /* Use the default MSS. */
+ tp->t_maxseg = V_tcp_v6mssdflt;
+ /*
+ * Disable Path MTU Discovery when we switch to
+ * minmss.
+ */
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ V_tcp_pmtud_blackhole_activated_min_mss++;
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
+ /* Use the sysctl tuneable blackhole MSS. */
+ tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
+ V_tcp_pmtud_blackhole_activated++;
+ } else {
+ /* Use the default MSS. */
+ tp->t_maxseg = V_tcp_mssdflt;
+ /*
+ * Disable Path MTU Discovery when we switch to
+ * minmss.
+ */
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ V_tcp_pmtud_blackhole_activated_min_mss++;
+ }
+#endif
+ /*
+ * Reset the slow-start flight size
+ * as it may depend on the new MSS.
+ */
+ if (CC_ALGO(tp)->conn_init != NULL)
+ CC_ALGO(tp)->conn_init(tp->ccv);
+ } else {
+ /*
+ * If further retransmissions are still unsuccessful
+ * with a lowered MTU, maybe this isn't a blackhole and
+ * we restore the previous MSS and blackhole detection
+ * flags.
+ * The limit '6' is determined by giving each probe
+ * stage (1448, 1188, 524) 2 chances to recover.
+ */
+ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
+ (tp->t_rxtshift > 6)) {
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
+ tp->t_maxseg = tp->t_pmtud_saved_maxseg;
+ V_tcp_pmtud_blackhole_failed++;
+ /*
+ * Reset the slow-start flight size as it
+ * may depend on the new MSS.
+ */
+ if (CC_ALGO(tp)->conn_init != NULL)
+ CC_ALGO(tp)->conn_init(tp->ccv);
+ }
+ }
+ }
+
/*
* Disable RFC1323 and SACK if we haven't got any response to
* our third SYN to work-around some broken terminal servers
@@ -615,7 +836,9 @@ tcp_timer_rexmt(void * xtp)
#ifdef INET6
if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
in6_losing(tp->t_inpcb);
+ else
#endif
+ in_losing(tp->t_inpcb);
tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
tp->t_srtt = 0;
}
@@ -632,34 +855,35 @@ tcp_timer_rexmt(void * xtp)
cc_cong_signal(tp, NULL, CC_RTO);
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
-out:
#ifdef TCPDEBUG
if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
- if (tp != NULL)
- INP_WUNLOCK(inp);
- if (headlocked)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
+ INP_WUNLOCK(inp);
+out:
CURVNET_RESTORE();
}
void
-tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
+tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
struct callout *t_callout;
- void *f_callout;
+ timeout_t *f_callout;
struct inpcb *inp = tp->t_inpcb;
- int cpu = INP_CPU(inp);
+ int cpu = inp_to_cpuid(inp);
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
return;
#endif
+ if (tp->t_timers->tt_flags & TT_STOPPED)
+ return;
+
switch (timer_type) {
case TT_DELACK:
t_callout = &tp->t_timers->tt_delack;
@@ -682,7 +906,11 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
f_callout = tcp_timer_2msl;
break;
default:
- panic("bad timer_type");
+ if (tp->t_fb->tfb_tcp_timer_activate) {
+ tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
+ return;
+ }
+ panic("tp %p bad timer_type %#x", tp, timer_type);
}
if (delta == 0) {
callout_stop(t_callout);
@@ -692,7 +920,7 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
}
int
-tcp_timer_active(struct tcpcb *tp, int timer_type)
+tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
struct callout *t_callout;
@@ -713,28 +941,79 @@ tcp_timer_active(struct tcpcb *tp, int timer_type)
t_callout = &tp->t_timers->tt_2msl;
break;
default:
- panic("bad timer_type");
+ if (tp->t_fb->tfb_tcp_timer_active) {
+ return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
+ }
+ panic("tp %p bad timer_type %#x", tp, timer_type);
}
return callout_active(t_callout);
}
+void
+tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
+{
+ struct callout *t_callout;
+
+ tp->t_timers->tt_flags |= TT_STOPPED;
+ switch (timer_type) {
+ case TT_DELACK:
+ t_callout = &tp->t_timers->tt_delack;
+ break;
+ case TT_REXMT:
+ t_callout = &tp->t_timers->tt_rexmt;
+ break;
+ case TT_PERSIST:
+ t_callout = &tp->t_timers->tt_persist;
+ break;
+ case TT_KEEP:
+ t_callout = &tp->t_timers->tt_keep;
+ break;
+ case TT_2MSL:
+ t_callout = &tp->t_timers->tt_2msl;
+ break;
+ default:
+ if (tp->t_fb->tfb_tcp_timer_stop) {
+ /*
+ * XXXrrs we need to look at this with the
+ * stop case below (flags).
+ */
+ tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
+ return;
+ }
+ panic("tp %p bad timer_type %#x", tp, timer_type);
+ }
+
+ if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
+ /*
+ * Can't stop the callout, defer tcpcb actual deletion
+ * to the last one. We do this using the async drain
+ * function and incrementing the count in tt_draincnt.
+ */
+ tp->t_timers->tt_draincnt++;
+ }
+}
+
#define ticks_to_msecs(t) (1000*(t) / hz)
void
-tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer)
+tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
+ struct xtcp_timer *xtimer)
{
- bzero(xtimer, sizeof(struct xtcp_timer));
+ sbintime_t now;
+
+ bzero(xtimer, sizeof(*xtimer));
if (timer == NULL)
return;
+ now = getsbinuptime();
if (callout_active(&timer->tt_delack))
- xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks);
+ xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
if (callout_active(&timer->tt_rexmt))
- xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks);
+ xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
if (callout_active(&timer->tt_persist))
- xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks);
+ xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
if (callout_active(&timer->tt_keep))
- xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks);
+ xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
if (callout_active(&timer->tt_2msl))
- xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks);
+ xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
}