diff options
Diffstat (limited to 'freebsd/sys/netinet/tcp_timer.c')
-rw-r--r-- | freebsd/sys/netinet/tcp_timer.c | 583 |
1 files changed, 431 insertions, 152 deletions
diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c index db952e42..edfc3829 100644 --- a/freebsd/sys/netinet/tcp_timer.c +++ b/freebsd/sys/netinet/tcp_timer.c @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_tcpdebug.h> +#include <rtems/bsd/local/opt_rss.h> #include <rtems/bsd/sys/param.h> #include <sys/kernel.h> @@ -52,24 +53,40 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/route.h> +#include <net/rss_config.h> #include <net/vnet.h> +#include <net/netisr.h> -#include <netinet/cc.h> #include <netinet/in.h> +#include <netinet/in_kdtrace.h> #include <netinet/in_pcb.h> +#include <netinet/in_rss.h> #include <netinet/in_systm.h> #ifdef INET6 #include <netinet6/in6_pcb.h> #endif #include <netinet/ip_var.h> +#include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> +#include <netinet/cc/cc.h> +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif #include <netinet/tcpip.h> #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif +int tcp_persmin; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, + &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); + +int tcp_persmax; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, + &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); + int tcp_keepinit; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); @@ -121,17 +138,110 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, /* max idle probes */ int tcp_maxpersistidle; -static int tcp_rexmit_drop_options = 1; +static int tcp_rexmit_drop_options = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); +static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); +#define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, + CTLFLAG_RW|CTLFLAG_VNET, + &VNET_NAME(tcp_pmtud_blackhole_detect), 0, + "Path MTU Discovery Black Hole Detection Enabled"); + +static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); +#define V_tcp_pmtud_blackhole_activated \ + VNET(tcp_pmtud_blackhole_activated) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, + CTLFLAG_RD|CTLFLAG_VNET, + &VNET_NAME(tcp_pmtud_blackhole_activated), 0, + "Path MTU Discovery Black Hole Detection, Activation Count"); + +static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); +#define V_tcp_pmtud_blackhole_activated_min_mss \ + VNET(tcp_pmtud_blackhole_activated_min_mss) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, + CTLFLAG_RD|CTLFLAG_VNET, + &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, + "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); + +static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); +#define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, + CTLFLAG_RD|CTLFLAG_VNET, + &VNET_NAME(tcp_pmtud_blackhole_failed), 0, + "Path MTU Discovery Black Hole Detection, Failure Count"); + +#ifdef INET +static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; +#define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, + CTLFLAG_RW|CTLFLAG_VNET, + &VNET_NAME(tcp_pmtud_blackhole_mss), 0, + "Path MTU Discovery Black Hole Detection lowered MSS"); +#endif + +#ifdef INET6 +static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; +#define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, + CTLFLAG_RW|CTLFLAG_VNET, + &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, + "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); +#endif + +#ifdef RSS +static int per_cpu_timers = 1; +#else static int per_cpu_timers = 0; +#endif SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, &per_cpu_timers , 0, "run tcp timers on all cpus"); +#if 0 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) +#endif + +/* + * Map the given inp to a CPU id. + * + * This queries RSS if it's compiled in, else it defaults to the current + * CPU ID. + */ +static inline int +inp_to_cpuid(struct inpcb *inp) +{ + u_int cpuid; + +#ifdef RSS + if (per_cpu_timers) { + cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); + if (cpuid == NETISR_CPUID_NONE) + return (curcpu); /* XXX */ + else + return (cpuid); + } +#else + /* Legacy, pre-RSS behaviour */ + if (per_cpu_timers) { + /* + * We don't have a flowid -> cpuid mapping, so cheat and + * just map unknown cpuids to curcpu. Not the best, but + * apparently better than defaulting to swi 0. + */ + cpuid = inp->inp_flowid % (mp_maxid + 1); + if (! CPU_ABSENT(cpuid)) + return (cpuid); + return (curcpu); + } +#endif + /* Default for RSS and non-RSS - cpuid 0 */ + else { + return (0); + } +} /* * Tcp protocol timeout routine called every 500 ms. @@ -146,9 +256,7 @@ tcp_slowtimo(void) VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - INP_INFO_WLOCK(&V_tcbinfo); (void) tcp_tw_2msl_scan(0); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); @@ -162,10 +270,6 @@ int tcp_backoff[TCP_MAXRXTSHIFT + 1] = static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ -static int tcp_timer_race; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race, - 0, "Count of t_inpcb races on tcp_discardcb"); - /* * TCP timer processing. */ @@ -178,18 +282,7 @@ tcp_timer_delack(void *xtp) CURVNET_SET(tp->t_vnet); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - CURVNET_RESTORE(); - return; - } + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_delack) || !callout_active(&tp->t_timers->tt_delack)) { @@ -203,14 +296,65 @@ tcp_timer_delack(void *xtp) CURVNET_RESTORE(); return; } - tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(inp); CURVNET_RESTORE(); } +/* + * When a timer wants to remove a TCB it must + * hold the INP_INFO_RLOCK(). The timer function + * should only have grabbed the INP_WLOCK() when + * it entered. To safely switch to holding both the + * INP_INFO_RLOCK() and the INP_WLOCK() we must first + * grab a reference on the inp, which will hold the inp + * so that it can't be removed. We then unlock the INP_WLOCK(), + * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK() + * we proceed again to get the INP_WLOCK() (this preserves proper + * lock order). After acquiring the INP_WLOCK we must check if someone + * else deleted the pcb i.e. the inp_flags check. + * If so we return 1 otherwise we return 0. + * + * No matter what the tcp_inpinfo_lock_add() function + * returns the caller must afterwards call tcp_inpinfo_lock_del() + * to drop the locks and reference properly. + */ + +int +tcp_inpinfo_lock_add(struct inpcb *inp) +{ + in_pcbref(inp); + INP_WUNLOCK(inp); + INP_INFO_RLOCK(&V_tcbinfo); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + return(1); + } + return(0); + +} + +void +tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp) +{ + INP_INFO_RUNLOCK(&V_tcbinfo); + if (inp && (tp == NULL)) { + /* + * If tcp_close/drop() gets called and tp + * returns NULL, then the function dropped + * the inp lock, we hold a reference keeping + * this around, so we must re-aquire the + * INP_WLOCK() in order to proceed with + * our dropping the inp reference. + */ + INP_WLOCK(inp); + } + if (inp && in_pcbrele_wlocked(inp) == 0) + INP_WUNLOCK(inp); +} + void tcp_timer_2msl(void *xtp) { @@ -222,62 +366,66 @@ tcp_timer_2msl(void *xtp) ostate = tp->t_state; #endif - /* - * XXXRW: Does this actually happen? - */ - INP_INFO_WLOCK(&V_tcbinfo); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); tcp_free_sackholes(tp); if (callout_pending(&tp->t_timers->tt_2msl) || !callout_active(&tp->t_timers->tt_2msl)) { INP_WUNLOCK(tp->t_inpcb); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_2msl); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } + KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, + ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle - * too long, or if 2MSL time is up from TIME_WAIT, delete connection - * control block. Otherwise, check again in a bit. + * too long delete connection control block. Otherwise, check + * again in a bit. + * + * If in TIME_WAIT state just ignore as this timeout is handled in + * tcp_tw_2msl_scan(). * * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. * Ignore fact that there were recent incoming segments. */ + if ((inp->inp_flags & INP_TIMEWAIT) != 0) { + INP_WUNLOCK(inp); + CURVNET_RESTORE(); + return; + } if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); + if (tcp_inpinfo_lock_add(inp)) { + tcp_inpinfo_lock_del(inp, tp); + goto out; + } tp = tcp_close(tp); + tcp_inpinfo_lock_del(inp, tp); + goto out; } else { - if (tp->t_state != TCPS_TIME_WAIT && - ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) - callout_reset_on(&tp->t_timers->tt_2msl, - TP_KEEPINTVL(tp), tcp_timer_2msl, tp, INP_CPU(inp)); - else - tp = tcp_close(tp); + if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { + callout_reset(&tp->t_timers->tt_2msl, + TP_KEEPINTVL(tp), tcp_timer_2msl, tp); + } else { + if (tcp_inpinfo_lock_add(inp)) { + tcp_inpinfo_lock_del(inp, tp); + goto out; + } + tp = tcp_close(tp); + tcp_inpinfo_lock_del(inp, tp); + goto out; + } } #ifdef TCPDEBUG @@ -285,9 +433,11 @@ tcp_timer_2msl(void *xtp) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); + if (tp != NULL) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); +out: CURVNET_RESTORE(); } @@ -303,36 +453,23 @@ tcp_timer_keep(void *xtp) ostate = tp->t_state; #endif - INP_INFO_WLOCK(&V_tcbinfo); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_keep) || !callout_active(&tp->t_timers->tt_keep)) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_keep); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } + KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, + ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * Keep-alive timer went off; send something * or drop connection if idle for too long. @@ -364,24 +501,29 @@ tcp_timer_keep(void *xtp) tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } - callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), - tcp_timer_keep, tp, INP_CPU(inp)); + callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), + tcp_timer_keep, tp); } else - callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), - tcp_timer_keep, tp, INP_CPU(inp)); + callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), + tcp_timer_keep, tp); #ifdef TCPDEBUG if (inp->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; dropit: TCPSTAT_INC(tcps_keepdrops); + + if (tcp_inpinfo_lock_add(inp)) { + tcp_inpinfo_lock_del(inp, tp); + goto out; + } tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG @@ -389,9 +531,9 @@ dropit: tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif - if (tp != NULL) - INP_WUNLOCK(tp->t_inpcb); - INP_INFO_WUNLOCK(&V_tcbinfo); + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); + tcp_inpinfo_lock_del(inp, tp); +out: CURVNET_RESTORE(); } @@ -406,38 +548,25 @@ tcp_timer_persist(void *xtp) ostate = tp->t_state; #endif - INP_INFO_WLOCK(&V_tcbinfo); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_persist) || !callout_active(&tp->t_timers->tt_persist)) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_persist); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } + KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, + ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* - * Persistance timer into zero window. + * Persistence timer into zero window. * Force a byte to be output, if possible. */ TCPSTAT_INC(tcps_persisttimeo); @@ -452,7 +581,12 @@ tcp_timer_persist(void *xtp) (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); + if (tcp_inpinfo_lock_add(inp)) { + tcp_inpinfo_lock_del(inp, tp); + goto out; + } tp = tcp_drop(tp, ETIMEDOUT); + tcp_inpinfo_lock_del(inp, tp); goto out; } /* @@ -462,22 +596,26 @@ tcp_timer_persist(void *xtp) if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); + if (tcp_inpinfo_lock_add(inp)) { + tcp_inpinfo_lock_del(inp, tp); + goto out; + } tp = tcp_drop(tp, ETIMEDOUT); + tcp_inpinfo_lock_del(inp, tp); goto out; } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; -out: #ifdef TCPDEBUG if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); #endif - if (tp != NULL) - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); + INP_WUNLOCK(inp); +out: CURVNET_RESTORE(); } @@ -487,44 +625,34 @@ tcp_timer_rexmt(void * xtp) struct tcpcb *tp = xtp; CURVNET_SET(tp->t_vnet); int rexmt; - int headlocked; struct inpcb *inp; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif - INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - INP_INFO_RUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_rexmt) || !callout_active(&tp->t_timers->tt_rexmt)) { INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_rexmt); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } + KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, + ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); tcp_free_sackholes(tp); + if (tp->t_fb->tfb_tcp_rexmit_tmr) { + /* The stack has a timer action too. */ + (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); + } /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off @@ -533,30 +661,15 @@ tcp_timer_rexmt(void * xtp) if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); - in_pcbref(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); - INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) { - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } - if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; + if (tcp_inpinfo_lock_add(inp)) { + tcp_inpinfo_lock_del(inp, tp); + goto out; } - tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); - headlocked = 1; + tcp_inpinfo_lock_del(inp, tp); goto out; } - INP_INFO_RUNLOCK(&V_tcbinfo); - headlocked = 0; if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be @@ -589,12 +702,120 @@ tcp_timer_rexmt(void * xtp) } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); - if (tp->t_state == TCPS_SYN_SENT) + if ((tp->t_state == TCPS_SYN_SENT) || + (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); + + /* + * We enter the path for PLMTUD if connection is established or, if + * connection is FIN_WAIT_1 status, reason for the last is that if + * amount of data we send is very small, we could send it in couple of + * packets and process straight to FIN. In that case we won't catch + * ESTABLISHED state. + */ + if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) + || (tp->t_state == TCPS_FIN_WAIT_1))) { +#ifdef INET6 + int isipv6; +#endif + + /* + * Idea here is that at each stage of mtu probe (usually, 1448 + * -> 1188 -> 524) should be given 2 chances to recover before + * further clamping down. 'tp->t_rxtshift % 2 == 0' should + * take care of that. + */ + if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == + (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && + (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) { + /* + * Enter Path MTU Black-hole Detection mechanism: + * - Disable Path MTU Discovery (IP "DF" bit). + * - Reduce MTU to lower value than what we + * negotiated with peer. + */ + /* Record that we may have found a black hole. */ + tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; + + /* Keep track of previous MSS. */ + tp->t_pmtud_saved_maxseg = tp->t_maxseg; + + /* + * Reduce the MSS to blackhole value or to the default + * in an attempt to retransmit. + */ +#ifdef INET6 + isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; + if (isipv6 && + tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { + /* Use the sysctl tuneable blackhole MSS. */ + tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; + V_tcp_pmtud_blackhole_activated++; + } else if (isipv6) { + /* Use the default MSS. */ + tp->t_maxseg = V_tcp_v6mssdflt; + /* + * Disable Path MTU Discovery when we switch to + * minmss. + */ + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + V_tcp_pmtud_blackhole_activated_min_mss++; + } +#endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { + /* Use the sysctl tuneable blackhole MSS. */ + tp->t_maxseg = V_tcp_pmtud_blackhole_mss; + V_tcp_pmtud_blackhole_activated++; + } else { + /* Use the default MSS. */ + tp->t_maxseg = V_tcp_mssdflt; + /* + * Disable Path MTU Discovery when we switch to + * minmss. + */ + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + V_tcp_pmtud_blackhole_activated_min_mss++; + } +#endif + /* + * Reset the slow-start flight size + * as it may depend on the new MSS. + */ + if (CC_ALGO(tp)->conn_init != NULL) + CC_ALGO(tp)->conn_init(tp->ccv); + } else { + /* + * If further retransmissions are still unsuccessful + * with a lowered MTU, maybe this isn't a blackhole and + * we restore the previous MSS and blackhole detection + * flags. + * The limit '6' is determined by giving each probe + * stage (1448, 1188, 524) 2 chances to recover. + */ + if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && + (tp->t_rxtshift > 6)) { + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; + tp->t_maxseg = tp->t_pmtud_saved_maxseg; + V_tcp_pmtud_blackhole_failed++; + /* + * Reset the slow-start flight size as it + * may depend on the new MSS. + */ + if (CC_ALGO(tp)->conn_init != NULL) + CC_ALGO(tp)->conn_init(tp->ccv); + } + } + } + /* * Disable RFC1323 and SACK if we haven't got any response to * our third SYN to work-around some broken terminal servers @@ -615,7 +836,9 @@ tcp_timer_rexmt(void * xtp) #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); + else #endif + in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } @@ -632,34 +855,35 @@ tcp_timer_rexmt(void * xtp) cc_cong_signal(tp, NULL, CC_RTO); - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); -out: #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif - if (tp != NULL) - INP_WUNLOCK(inp); - if (headlocked) - INP_INFO_WUNLOCK(&V_tcbinfo); + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); + INP_WUNLOCK(inp); +out: CURVNET_RESTORE(); } void -tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) +tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) { struct callout *t_callout; - void *f_callout; + timeout_t *f_callout; struct inpcb *inp = tp->t_inpcb; - int cpu = INP_CPU(inp); + int cpu = inp_to_cpuid(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return; #endif + if (tp->t_timers->tt_flags & TT_STOPPED) + return; + switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; @@ -682,7 +906,11 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) f_callout = tcp_timer_2msl; break; default: - panic("bad timer_type"); + if (tp->t_fb->tfb_tcp_timer_activate) { + tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); + return; + } + panic("tp %p bad timer_type %#x", tp, timer_type); } if (delta == 0) { callout_stop(t_callout); @@ -692,7 +920,7 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) } int -tcp_timer_active(struct tcpcb *tp, int timer_type) +tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; @@ -713,28 +941,79 @@ tcp_timer_active(struct tcpcb *tp, int timer_type) t_callout = &tp->t_timers->tt_2msl; break; default: - panic("bad timer_type"); + if (tp->t_fb->tfb_tcp_timer_active) { + return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); + } + panic("tp %p bad timer_type %#x", tp, timer_type); } return callout_active(t_callout); } +void +tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) +{ + struct callout *t_callout; + + tp->t_timers->tt_flags |= TT_STOPPED; + switch (timer_type) { + case TT_DELACK: + t_callout = &tp->t_timers->tt_delack; + break; + case TT_REXMT: + t_callout = &tp->t_timers->tt_rexmt; + break; + case TT_PERSIST: + t_callout = &tp->t_timers->tt_persist; + break; + case TT_KEEP: + t_callout = &tp->t_timers->tt_keep; + break; + case TT_2MSL: + t_callout = &tp->t_timers->tt_2msl; + break; + default: + if (tp->t_fb->tfb_tcp_timer_stop) { + /* + * XXXrrs we need to look at this with the + * stop case below (flags). + */ + tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); + return; + } + panic("tp %p bad timer_type %#x", tp, timer_type); + } + + if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { + /* + * Can't stop the callout, defer tcpcb actual deletion + * to the last one. We do this using the async drain + * function and incrementing the count in + */ + tp->t_timers->tt_draincnt++; + } +} + #define ticks_to_msecs(t) (1000*(t) / hz) void -tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer) +tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, + struct xtcp_timer *xtimer) { - bzero(xtimer, sizeof(struct xtcp_timer)); + sbintime_t now; + + bzero(xtimer, sizeof(*xtimer)); if (timer == NULL) return; + now = getsbinuptime(); if (callout_active(&timer->tt_delack)) - xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks); + xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_rexmt)) - xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks); + xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_persist)) - xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks); + xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_keep)) - xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks); + xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_2msl)) - xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks); + xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); } |