Diffstat (limited to 'freebsd/sys/netinet/tcp_input.c')
-rw-r--r--  freebsd/sys/netinet/tcp_input.c  1134
1 file changed, 639 insertions(+), 495 deletions(-)
diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c
index f9512eb3..eaa3eb3d 100644
--- a/freebsd/sys/netinet/tcp_input.c
+++ b/freebsd/sys/netinet/tcp_input.c
@@ -52,7 +52,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <rtems/bsd/local/opt_ipfw.h> /* for ipfw_fwd */
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_ipsec.h>
@@ -65,6 +64,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
#include <sys/protosw.h>
+#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
@@ -77,16 +77,16 @@ __FBSDID("$FreeBSD$");
#include <vm/uma.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
#define TCPSTATES /* for logging */
-#include <netinet/cc.h>
#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
-#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h> /* required for icmp_var.h */
#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
@@ -95,14 +95,23 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/in6_pcb.h>
+#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
+#ifdef TCP_RFC7413
+#include <netinet/tcp_fastopen.h>
+#endif
+#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
+#include <netinet/cc/cc.h>
+#ifdef TCPPCAP
+#include <netinet/tcp_pcap.h>
+#endif
#include <netinet/tcp_syncache.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
@@ -122,11 +131,6 @@ __FBSDID("$FreeBSD$");
const int tcprexmtthresh = 3;
-VNET_DEFINE(struct tcpstat, tcpstat);
-SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
- &VNET_NAME(tcpstat), tcpstat,
- "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
-
int tcp_log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
&tcp_log_in_vain, 0,
@@ -134,88 +138,96 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
VNET_DEFINE(int, blackhole) = 0;
#define V_blackhole VNET(blackhole)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(blackhole), 0,
"Do not send RST on segments to closed ports");
VNET_DEFINE(int, tcp_delack_enabled) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_delack_enabled), 0,
"Delay ACK to try and piggyback it onto a data packet");
VNET_DEFINE(int, drop_synfin) = 0;
#define V_drop_synfin VNET(drop_synfin)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(drop_synfin), 0,
"Drop TCP packets with SYN+FIN set");
+VNET_DEFINE(int, tcp_do_rfc6675_pipe) = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc6675_pipe, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(tcp_do_rfc6675_pipe), 0,
+ "Use calculated pipe/in-flight bytes per RFC 6675");
+
VNET_DEFINE(int, tcp_do_rfc3042) = 1;
#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_rfc3042), 0,
"Enable RFC 3042 (Limited Transmit)");
VNET_DEFINE(int, tcp_do_rfc3390) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_rfc3390), 0,
"Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, experimental, CTLFLAG_RW, 0,
- "Experimental TCP extensions");
-
-VNET_DEFINE(int, tcp_do_initcwnd10) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_RW,
- &VNET_NAME(tcp_do_initcwnd10), 0,
- "Enable RFC 6928 (Increasing initial CWND to 10)");
+VNET_DEFINE(int, tcp_initcwnd_segments) = 10;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0,
+ "Slow-start flight size (initial congestion window) in number of segments");
VNET_DEFINE(int, tcp_do_rfc3465) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_rfc3465), 0,
"Enable RFC 3465 (Appropriate Byte Counting)");
VNET_DEFINE(int, tcp_abc_l_var) = 2;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_abc_l_var), 2,
"Cap the max cwnd increment during slow-start to this number of segments");
static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
-VNET_DEFINE(int, tcp_do_ecn) = 0;
-SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
+VNET_DEFINE(int, tcp_do_ecn) = 2;
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_ecn), 0,
"TCP ECN support");
VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_ecn_maxretries), 0,
"Max retries before giving up on ECN");
+VNET_DEFINE(int, tcp_insecure_syn) = 0;
+#define V_tcp_insecure_syn VNET(tcp_insecure_syn)
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(tcp_insecure_syn), 0,
+ "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets");
+
VNET_DEFINE(int, tcp_insecure_rst) = 0;
#define V_tcp_insecure_rst VNET(tcp_insecure_rst)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_insecure_rst), 0,
- "Follow the old (insecure) criteria for accepting RST packets");
+ "Follow RFC793 instead of RFC5961 criteria for accepting RST packets");
VNET_DEFINE(int, tcp_recvspace) = 1024*64;
#define V_tcp_recvspace VNET(tcp_recvspace)
-SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size");
VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_do_autorcvbuf), 0,
"Enable automatic receive buffer sizing");
VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024;
#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_autorcvbuf_inc), 0,
"Incrementor step size of automatic receive buffer");
VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024;
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_autorcvbuf_max), 0,
"Max size of automatic receive buffer");
@@ -223,47 +235,55 @@ VNET_DEFINE(struct inpcbhead, tcb);
#define tcb6 tcb /* for KAME src sync over BSD*'s */
VNET_DEFINE(struct inpcbinfo, tcbinfo);
-static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
-static void tcp_do_segment(struct mbuf *, struct tcphdr *,
- struct socket *, struct tcpcb *, int, int, uint8_t,
- int);
-static void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
- struct tcpcb *, int, int);
-static void tcp_pulloutofband(struct socket *,
- struct tcphdr *, struct mbuf *, int);
-static void tcp_xmit_timer(struct tcpcb *, int);
-static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
-static void inline tcp_fields_to_host(struct tcphdr *);
-#ifdef TCP_SIGNATURE
-static void inline tcp_fields_to_net(struct tcphdr *);
-static int inline tcp_signature_verify_input(struct mbuf *, int, int,
- int, struct tcpopt *, struct tcphdr *, u_int);
-#endif
-static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
- uint16_t type);
-static void inline cc_conn_init(struct tcpcb *tp);
-static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
-static void inline hhook_run_tcp_est_in(struct tcpcb *tp,
- struct tcphdr *th, struct tcpopt *to);
+/*
+ * TCP statistics are stored in an array of counter(9)s, whose size matches
+ * the size of struct tcpstat. The TCP running connection count is a regular array.
+ */
+VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat);
+SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat,
+ tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
+VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]);
+SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD |
+ CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES,
+ "TCP connection counts by TCP state");
+
+static void
+tcp_vnet_init(const void *unused)
+{
+
+ COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK);
+ VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK);
+}
+VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
+ tcp_vnet_init, NULL);
+
+#ifdef VIMAGE
+static void
+tcp_vnet_uninit(const void *unused)
+{
+
+ COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES);
+ VNET_PCPUSTAT_FREE(tcpstat);
+}
+VNET_SYSUNINIT(tcp_vnet_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
+ tcp_vnet_uninit, NULL);
+#endif /* VIMAGE */
/*
* Kernel module interface for updating tcpstat. The argument is an index
- * into tcpstat treated as an array of u_long. While this encodes the
- * general layout of tcpstat into the caller, it doesn't encode its location,
- * so that future changes to add, for example, per-CPU stats support won't
- * cause binary compatibility problems for kernel modules.
+ * into tcpstat treated as an array.
*/
void
kmod_tcpstat_inc(int statnum)
{
- (*((u_long *)&V_tcpstat + statnum))++;
+ counter_u64_add(VNET(tcpstat)[statnum], 1);
}
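
With the counter(9) conversion above, a module increments a statistic by passing the field's position in counter-sized units. The companion macro in tcp_var.h follows roughly this convention (a sketch, assuming the index is the field offset divided by the counter width):

/* Sketch of the module-side indexing convention (see netinet/tcp_var.h). */
#define KMOD_TCPSTAT_INC(name)					\
	kmod_tcpstat_inc(offsetof(struct tcpstat, name) /	\
	    sizeof(uint64_t))

/* Usage: KMOD_TCPSTAT_INC(tcps_rcvtotal); */
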
/*
* Wrapper for the TCP established input helper hook.
*/
-static void inline
+void
hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
{
struct tcp_hhook_data hhook_data;
@@ -281,7 +301,7 @@ hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
/*
* CC wrapper hook functions
*/
-static void inline
+void
cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -295,7 +315,7 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
if (type == CC_ACK) {
if (tp->snd_cwnd > tp->snd_ssthresh) {
tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
- V_tcp_abc_l_var * tp->t_maxseg);
+ V_tcp_abc_l_var * tcp_maxseg(tp));
if (tp->t_bytes_acked >= tp->snd_cwnd) {
tp->t_bytes_acked -= tp->snd_cwnd;
tp->ccv->flags |= CCF_ABC_SENTAWND;
@@ -313,16 +333,18 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
}
}
-static void inline
+void
cc_conn_init(struct tcpcb *tp)
{
struct hc_metrics_lite metrics;
struct inpcb *inp = tp->t_inpcb;
+ u_int maxseg;
int rtt;
INP_WLOCK_ASSERT(tp->t_inpcb);
tcp_hc_get(&inp->inp_inc, &metrics);
+ maxseg = tcp_maxseg(tp);
if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
tp->t_srtt = rtt;
@@ -344,10 +366,10 @@ cc_conn_init(struct tcpcb *tp)
/*
* There's some sort of gateway or interface
* buffer limit on the path. Use this to set
- * the slow start threshhold, but set the
+ * the slow start threshold, but set the
* threshold to no less than 2*mss.
*/
- tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh);
+ tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh);
TCPSTAT_INC(tcps_usedssthresh);
}
@@ -357,27 +379,27 @@ cc_conn_init(struct tcpcb *tp)
* RFC5681 Section 3.1 specifies the default conservative values.
* RFC3390 specifies slightly more aggressive values.
* RFC6928 increases it to ten segments.
+ * A user-specified value for the initial flight size is also supported.
*
* If a SYN or SYN/ACK was lost and retransmitted, we have to
* reduce the initial CWND to one segment as congestion is likely
* requiring us to be cautious.
*/
if (tp->snd_cwnd == 1)
- tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */
- else if (V_tcp_do_initcwnd10)
- tp->snd_cwnd = min(10 * tp->t_maxseg,
- max(2 * tp->t_maxseg, 14600));
+ tp->snd_cwnd = maxseg; /* SYN(-ACK) lost */
+ else if (V_tcp_initcwnd_segments)
+ tp->snd_cwnd = min(V_tcp_initcwnd_segments * maxseg,
+ max(2 * maxseg, V_tcp_initcwnd_segments * 1460));
else if (V_tcp_do_rfc3390)
- tp->snd_cwnd = min(4 * tp->t_maxseg,
- max(2 * tp->t_maxseg, 4380));
+ tp->snd_cwnd = min(4 * maxseg, max(2 * maxseg, 4380));
else {
/* Per RFC5681 Section 3.1 */
- if (tp->t_maxseg > 2190)
- tp->snd_cwnd = 2 * tp->t_maxseg;
- else if (tp->t_maxseg > 1095)
- tp->snd_cwnd = 3 * tp->t_maxseg;
+ if (maxseg > 2190)
+ tp->snd_cwnd = 2 * maxseg;
+ else if (maxseg > 1095)
+ tp->snd_cwnd = 3 * maxseg;
else
- tp->snd_cwnd = 4 * tp->t_maxseg;
+ tp->snd_cwnd = 4 * maxseg;
}
if (CC_ALGO(tp)->conn_init != NULL)
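
A standalone sketch of the three initial-window branches above, assuming a typical maxseg of 1460 and the default tcp_initcwnd_segments of 10 (illustration only, not kernel code):

#include <stdio.h>

static unsigned long umin(unsigned long a, unsigned long b) { return (a < b ? a : b); }
static unsigned long umax(unsigned long a, unsigned long b) { return (a > b ? a : b); }

int
main(void)
{
	unsigned long maxseg = 1460;	/* assumed Ethernet-sized MSS */
	unsigned long segs = 10;	/* default V_tcp_initcwnd_segments */

	/* initcwnd_segments branch: min(10*1460, max(2*1460, 10*1460)) */
	printf("segments: %lu\n", umin(segs * maxseg, umax(2 * maxseg, segs * 1460)));
	/* RFC 3390 branch: min(4*1460, max(2*1460, 4380)) */
	printf("rfc3390:  %lu\n", umin(4 * maxseg, umax(2 * maxseg, 4380)));
	/* RFC 5681 fallback for 1095 < maxseg <= 2190: 3 segments */
	printf("rfc5681:  %lu\n", 3 * maxseg);
	return (0);
}

This prints 14600, 4380 and 4380 respectively.
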
@@ -387,6 +409,8 @@ cc_conn_init(struct tcpcb *tp)
void inline
cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
{
+ u_int maxseg;
+
INP_WLOCK_ASSERT(tp->t_inpcb);
switch(type) {
@@ -406,12 +430,13 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
}
break;
case CC_RTO:
+ maxseg = tcp_maxseg(tp);
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
EXIT_RECOVERY(tp->t_flags);
tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
- tp->t_maxseg) * tp->t_maxseg;
- tp->snd_cwnd = tp->t_maxseg;
+ maxseg) * maxseg;
+ tp->snd_cwnd = maxseg;
break;
case CC_RTO_ERR:
TCPSTAT_INC(tcps_sndrexmitbad);
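
As a numeric illustration of the CC_RTO branch above (assumed values): with snd_wnd = 40000, snd_cwnd = 30000 and maxseg = 1460, the integer arithmetic gives:

/*
 * ssthresh = max(2, min(40000, 30000) / 2 / 1460) * 1460
 *          = max(2, 10) * 1460 = 14600
 * cwnd     = 1460   (one segment; slow start restarts)
 */
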
@@ -436,7 +461,7 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
}
}
-static void inline
+void inline
cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -451,27 +476,7 @@ cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
tp->t_bytes_acked = 0;
}
-static inline void
-tcp_fields_to_host(struct tcphdr *th)
-{
-
- th->th_seq = ntohl(th->th_seq);
- th->th_ack = ntohl(th->th_ack);
- th->th_win = ntohs(th->th_win);
- th->th_urp = ntohs(th->th_urp);
-}
-
#ifdef TCP_SIGNATURE
-static inline void
-tcp_fields_to_net(struct tcphdr *th)
-{
-
- th->th_seq = htonl(th->th_seq);
- th->th_ack = htonl(th->th_ack);
- th->th_win = htons(th->th_win);
- th->th_urp = htons(th->th_urp);
-}
-
static inline int
tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen,
struct tcpopt *to, struct tcphdr *th, u_int tcpbflag)
@@ -485,34 +490,56 @@ tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen,
}
#endif
-/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
-#ifdef INET6
-#define ND6_HINT(tp) \
-do { \
- if ((tp) && (tp)->t_inpcb && \
- ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
- nd6_nud_hint(NULL, NULL, 0); \
-} while (0)
-#else
-#define ND6_HINT(tp)
-#endif
-
/*
* Indicate whether this ack should be delayed. We can delay the ack if
- * - there is no delayed ack timer in progress and
- * - our last ack wasn't a 0-sized window. We never want to delay
- * the ack that opens up a 0-sized window and
- * - delayed acks are enabled or
- * - this is a half-synchronized T/TCP connection.
- * - the segment size is not larger than the MSS and LRO wasn't used
- * for this segment.
+ * the following conditions are met:
+ * - There is no delayed ack timer in progress.
+ * - Our last ack wasn't a 0-sized window. We never want to delay
+ * the ack that opens up a 0-sized window.
+ * - LRO wasn't used for this segment. We make sure by checking that the
+ * segment size is not larger than the MSS.
*/
#define DELAY_ACK(tp, tlen) \
((!tcp_timer_active(tp, TT_DELACK) && \
(tp->t_flags & TF_RXWIN0SENT) == 0) && \
- (tlen <= tp->t_maxopd) && \
+ (tlen <= tp->t_maxseg) && \
(V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
+static void inline
+cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
+{
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (CC_ALGO(tp)->ecnpkt_handler != NULL) {
+ switch (iptos & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->ccv->flags |= CCF_IPHDR_CE;
+ break;
+ case IPTOS_ECN_ECT0:
+ tp->ccv->flags &= ~CCF_IPHDR_CE;
+ break;
+ case IPTOS_ECN_ECT1:
+ tp->ccv->flags &= ~CCF_IPHDR_CE;
+ break;
+ }
+
+ if (th->th_flags & TH_CWR)
+ tp->ccv->flags |= CCF_TCPHDR_CWR;
+ else
+ tp->ccv->flags &= ~CCF_TCPHDR_CWR;
+
+ if (tp->t_flags & TF_DELACK)
+ tp->ccv->flags |= CCF_DELACK;
+ else
+ tp->ccv->flags &= ~CCF_DELACK;
+
+ CC_ALGO(tp)->ecnpkt_handler(tp->ccv);
+
+ if (tp->ccv->flags & CCF_ACKNOW)
+ tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ }
+}
+
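
The CCF_* flags set above are consumed by a congestion control module's ecnpkt_handler method; a hypothetical consumer might look like this (a sketch only, not the in-tree cc_dctcp code):

/* Hypothetical consumer of the flags set by cc_ecnpkt_handler(). */
static void
example_ecnpkt_handler(struct cc_var *ccv)
{
	if (ccv->flags & CCF_IPHDR_CE) {
		/*
		 * CE mark seen: request an immediate ACK so the sender
		 * learns about congestion without delayed-ACK latency.
		 */
		ccv->flags |= CCF_ACKNOW;
	}
	if (ccv->flags & CCF_TCPHDR_CWR) {
		/*
		 * Peer signalled a cwnd reduction; a real module would
		 * clear its ECE-echo state here.
		 */
	}
}
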
/*
* TCP input handling is split into multiple parts:
* tcp6_input is a thin wrapper around tcp_input for the extended
@@ -528,6 +555,7 @@ tcp6_input(struct mbuf **mp, int *offp, int proto)
{
struct mbuf *m = *mp;
struct in6_ifaddr *ia6;
+ struct ip6_hdr *ip6;
IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
@@ -535,7 +563,8 @@ tcp6_input(struct mbuf **mp, int *offp, int proto)
* draft-itojun-ipv6-tcp-to-anycast
* better place to put this in?
*/
- ia6 = ip6_getdstifaddr(m);
+ ip6 = mtod(m, struct ip6_hdr *);
+ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
struct ip6_hdr *ip6;
@@ -543,28 +572,26 @@ tcp6_input(struct mbuf **mp, int *offp, int proto)
ip6 = mtod(m, struct ip6_hdr *);
icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
(caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
- return IPPROTO_DONE;
+ return (IPPROTO_DONE);
}
if (ia6)
ifa_free(&ia6->ia_ifa);
- tcp_input(m, *offp);
- return IPPROTO_DONE;
+ return (tcp_input(mp, offp, proto));
}
#endif /* INET6 */
-void
-tcp_input(struct mbuf *m, int off0)
+int
+tcp_input(struct mbuf **mp, int *offp, int proto)
{
+ struct mbuf *m = *mp;
struct tcphdr *th = NULL;
struct ip *ip = NULL;
-#ifdef INET
- struct ipovly *ipov;
-#endif
struct inpcb *inp = NULL;
struct tcpcb *tp = NULL;
struct socket *so = NULL;
u_char *optp = NULL;
+ int off0;
int optlen = 0;
#ifdef INET
int len;
@@ -587,9 +614,6 @@ tcp_input(struct mbuf *m, int off0)
struct tcpopt to; /* options in this segment */
char *s = NULL; /* address and port logging */
int ti_locked;
-#define TI_UNLOCKED 1
-#define TI_WLOCKED 2
-
#ifdef TCPDEBUG
/*
* The size of tcp_saveipgen must be the size of the max ip header,
@@ -604,6 +628,9 @@ tcp_input(struct mbuf *m, int off0)
isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
+ off0 = *offp;
+ m = *mp;
+ *mp = NULL;
to.to_flags = 0;
TCPSTAT_INC(tcps_rcvtotal);
@@ -615,7 +642,7 @@ tcp_input(struct mbuf *m, int off0)
m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
if (m == NULL) {
TCPSTAT_INC(tcps_rcvshort);
- return;
+ return (IPPROTO_DONE);
}
}
@@ -660,45 +687,43 @@ tcp_input(struct mbuf *m, int off0)
* Note: IP leaves IP header in first mbuf.
*/
if (off0 > sizeof (struct ip)) {
- ip_stripoptions(m, (struct mbuf *)0);
+ ip_stripoptions(m);
off0 = sizeof(struct ip);
}
if (m->m_len < sizeof (struct tcpiphdr)) {
if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
== NULL) {
TCPSTAT_INC(tcps_rcvshort);
- return;
+ return (IPPROTO_DONE);
}
}
ip = mtod(m, struct ip *);
- ipov = (struct ipovly *)ip;
th = (struct tcphdr *)((caddr_t)ip + off0);
- tlen = ip->ip_len;
+ tlen = ntohs(ip->ip_len) - off0;
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
else
th->th_sum = in_pseudo(ip->ip_src.s_addr,
- ip->ip_dst.s_addr,
- htonl(m->m_pkthdr.csum_data +
- ip->ip_len +
- IPPROTO_TCP));
+ ip->ip_dst.s_addr,
+ htonl(m->m_pkthdr.csum_data + tlen +
+ IPPROTO_TCP));
th->th_sum ^= 0xffff;
-#ifdef TCPDEBUG
- ipov->ih_len = (u_short)tlen;
- ipov->ih_len = htons(ipov->ih_len);
-#endif
} else {
+ struct ipovly *ipov = (struct ipovly *)ip;
+
/*
* Checksum extended TCP header and data.
*/
- len = sizeof (struct ip) + tlen;
+ len = off0 + tlen;
bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
- ipov->ih_len = (u_short)tlen;
- ipov->ih_len = htons(ipov->ih_len);
+ ipov->ih_len = htons(tlen);
th->th_sum = in_cksum(m, len);
+ /* Reset length for SDT probes. */
+ ip->ip_len = htons(tlen + off0);
}
+
if (th->th_sum) {
TCPSTAT_INC(tcps_rcvbadsum);
goto drop;
@@ -732,7 +757,7 @@ tcp_input(struct mbuf *m, int off0)
if (off > sizeof (struct tcphdr)) {
#ifdef INET6
if (isipv6) {
- IP6_EXTHDR_CHECK(m, off0, off, );
+ IP6_EXTHDR_CHECK(m, off0, off, IPPROTO_DONE);
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)((caddr_t)ip6 + off0);
}
@@ -746,10 +771,9 @@ tcp_input(struct mbuf *m, int off0)
if ((m = m_pullup(m, sizeof (struct ip) + off))
== NULL) {
TCPSTAT_INC(tcps_rcvshort);
- return;
+ return (IPPROTO_DONE);
}
ip = mtod(m, struct ip *);
- ipov = (struct ipovly *)ip;
th = (struct tcphdr *)((caddr_t)ip + off0);
}
}
@@ -771,26 +795,17 @@ tcp_input(struct mbuf *m, int off0)
/*
* Locate pcb for segment; if we're likely to add or remove a
- * connection then first acquire pcbinfo lock. There are two cases
+ * connection then first acquire pcbinfo lock. There are three cases
* where we might discover later we need a write lock despite the
- * flags: ACKs moving a connection out of the syncache, and ACKs for
- * a connection in TIMEWAIT.
+ * flags: ACKs moving a connection out of the syncache, ACKs for a
+ * connection in TIMEWAIT and SYNs not targeting a listening socket.
*/
- if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) {
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
+ if ((thflags & (TH_FIN | TH_RST)) != 0) {
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
} else
ti_locked = TI_UNLOCKED;
-findpcb:
-#ifdef INVARIANTS
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- } else {
- INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- }
-#endif
-
/*
* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
*/
@@ -807,6 +822,14 @@ findpcb:
)
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
+findpcb:
+#ifdef INVARIANTS
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ } else {
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
#ifdef INET6
if (isipv6 && fwd_tag != NULL) {
struct sockaddr_in6 *next_hop6;
@@ -831,10 +854,6 @@ findpcb:
th->th_dport, INPLOOKUP_WILDCARD |
INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
}
- /* Remove the tag from the packet. We don't need it anymore. */
- m_tag_delete(m, fwd_tag);
- m->m_flags &= ~M_IP6_NEXTHOP;
- fwd_tag = NULL;
} else if (isipv6) {
inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src,
th->th_sport, &ip6->ip6_dst, th->th_dport,
@@ -869,10 +888,6 @@ findpcb:
th->th_dport, INPLOOKUP_WILDCARD |
INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
}
- /* Remove the tag from the packet. We don't need it anymore. */
- m_tag_delete(m, fwd_tag);
- m->m_flags &= ~M_IP_NEXTHOP;
- fwd_tag = NULL;
} else
inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
th->th_sport, ip->ip_dst, th->th_dport,
@@ -908,23 +923,20 @@ findpcb:
goto dropwithreset;
}
INP_WLOCK_ASSERT(inp);
- if (!(inp->inp_flags & INP_HW_FLOWID)
- && (m->m_flags & M_FLOWID)
- && ((inp->inp_socket == NULL)
- || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) {
- inp->inp_flags |= INP_HW_FLOWID;
- inp->inp_flags &= ~INP_SW_FLOWID;
+ if ((inp->inp_flowtype == M_HASHTYPE_NONE) &&
+ (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) &&
+ ((inp->inp_socket == NULL) ||
+ (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) {
inp->inp_flowid = m->m_pkthdr.flowid;
+ inp->inp_flowtype = M_HASHTYPE_GET(m);
}
#ifdef IPSEC
#ifdef INET6
if (isipv6 && ipsec6_in_reject(m, inp)) {
- IPSEC6STAT_INC(in_polvio);
goto dropunlock;
} else
#endif /* INET6 */
if (ipsec4_in_reject(m, inp) != 0) {
- IPSECSTAT_INC(in_polvio);
goto dropunlock;
}
#endif /* IPSEC */
@@ -934,9 +946,10 @@ findpcb:
*/
if (inp->inp_ip_minttl != 0) {
#ifdef INET6
- if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim)
- goto dropunlock;
- else
+ if (isipv6) {
+ if (inp->inp_ip_minttl > ip6->ip6_hlim)
+ goto dropunlock;
+ } else
#endif
if (inp->inp_ip_minttl > ip->ip_ttl)
goto dropunlock;
@@ -945,7 +958,7 @@ findpcb:
/*
* A previous connection in TIMEWAIT state is supposed to catch stray
* or duplicate segments arriving late. If this segment was a
- * legitimate new connection attempt the old INPCB gets removed and
+ * legitimate new connection attempt, the old INPCB gets removed and
* we can try again to find a listening socket.
*
* At this point, due to earlier optimism, we may hold only an inpcb
@@ -961,20 +974,20 @@ findpcb:
relocked:
if (inp->inp_flags & INP_TIMEWAIT) {
if (ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
in_pcbref(inp);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
inp = NULL;
goto findpcb;
}
} else
- ti_locked = TI_WLOCKED;
+ ti_locked = TI_RLOCKED;
}
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
if (thflags & TH_SYN)
tcp_dooptions(&to, optp, optlen, TO_SYN);
@@ -983,8 +996,8 @@ relocked:
*/
if (tcp_twcheck(inp, &to, th, m, tlen))
goto findpcb;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- return;
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ return (IPPROTO_DONE);
}
/*
* The TCPCB may no longer exist if the connection is winding
@@ -1013,16 +1026,18 @@ relocked:
* now be in TIMEWAIT.
*/
#ifdef INVARIANTS
- if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0)
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ if ((thflags & (TH_FIN | TH_RST)) != 0)
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
#endif
- if (tp->t_state != TCPS_ESTABLISHED) {
+ if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) ||
+ (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) &&
+ !(tp->t_flags & TF_FASTOPEN)))) {
if (ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
in_pcbref(inp);
INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- ti_locked = TI_WLOCKED;
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
inp = NULL;
@@ -1030,9 +1045,9 @@ relocked:
}
goto relocked;
} else
- ti_locked = TI_WLOCKED;
+ ti_locked = TI_RLOCKED;
}
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
}
#ifdef MAC
@@ -1057,17 +1072,13 @@ relocked:
/*
* When the socket is accepting connections (the INPCB is in LISTEN
* state) we look into the SYN cache if this is a new connection
- * attempt or the completion of a previous one. Because listen
- * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be
- * held in this case.
+ * attempt or the completion of a previous one.
*/
if (so->so_options & SO_ACCEPTCONN) {
struct in_conninfo inc;
KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but "
"tp not listening", __func__));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
-
bzero(&inc, sizeof(inc));
#ifdef INET6
if (isipv6) {
@@ -1090,6 +1101,8 @@ relocked:
* socket appended to the listen queue in SYN_RECEIVED state.
*/
if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
+
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
/*
* Parse the TCP options here because
* syncookies need access to the reflected
@@ -1110,6 +1123,9 @@ relocked:
rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
}
+#ifdef TCP_RFC7413
+new_tfo_socket:
+#endif
if (so == NULL) {
/*
* We completed the 3-way handshake
@@ -1141,7 +1157,11 @@ relocked:
*/
INP_WUNLOCK(inp); /* listen socket */
inp = sotoinpcb(so);
- INP_WLOCK(inp); /* new connection */
+ /*
+ * New connection inpcb is already locked by
+ * syncache_expand().
+ */
+ INP_WLOCK_ASSERT(inp);
tp = intotcpcb(inp);
KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
("%s: ", __func__));
@@ -1170,10 +1190,10 @@ relocked:
* contains. tcp_do_segment() consumes
* the mbuf chain and unlocks the inpcb.
*/
- tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
+ tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
iptos, ti_locked);
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- return;
+ return (IPPROTO_DONE);
}
/*
* Segment flag validation for new connection attempts:
@@ -1277,7 +1297,7 @@ relocked:
if (isipv6 && !V_ip6_use_deprecated) {
struct in6_ifaddr *ia6;
- ia6 = ip6_getdstifaddr(m);
+ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
if (ia6 != NULL &&
(ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
ifa_free(&ia6->ia_ifa);
@@ -1366,14 +1386,24 @@ relocked:
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
tcp_dooptions(&to, optp, optlen, TO_SYN);
- syncache_add(&inc, &to, th, inp, &so, m);
+#ifdef TCP_RFC7413
+ if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL))
+ goto new_tfo_socket;
+#else
+ syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL);
+#endif
/*
* Entry added to syncache and mbuf consumed.
- * Everything already unlocked by syncache_add().
+ * Only the listen socket is unlocked by syncache_add().
*/
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ ti_locked = TI_UNLOCKED;
+ }
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- return;
+ return (IPPROTO_DONE);
} else if (tp->t_state == TCPS_LISTEN) {
/*
* When a listen socket is torn down the SO_ACCEPTCONN
@@ -1404,18 +1434,22 @@ relocked:
}
#endif
+ TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
+
/*
* Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
* state. tcp_do_segment() always consumes the mbuf chain, unlocks
* the inpcb, and unlocks pcbinfo.
*/
- tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
+ tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
- return;
+ return (IPPROTO_DONE);
dropwithreset:
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
+
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
#ifdef INVARIANTS
@@ -1435,8 +1469,11 @@ dropwithreset:
goto drop;
dropunlock:
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (m != NULL)
+ TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th);
+
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
#ifdef INVARIANTS
@@ -1456,18 +1493,23 @@ drop:
free(s, M_TCPLOG);
if (m != NULL)
m_freem(m);
+ return (IPPROTO_DONE);
}
-static void
+void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
int ti_locked)
{
- int thflags, acked, ourfinisacked, needoutput = 0;
+ int thflags, acked, ourfinisacked, needoutput = 0, sack_changed;
int rstreason, todrop, win;
u_long tiwin;
+ char *s;
+ struct in_conninfo *inc;
+ struct mbuf *mfree;
struct tcpopt to;
-
+ int tfo_syn;
+
#ifdef TCPDEBUG
/*
* The size of tcp_saveipgen must be the size of the max ip header,
@@ -1478,30 +1520,25 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
short ostate = 0;
#endif
thflags = th->th_flags;
+ inc = &tp->t_inpcb->inp_inc;
tp->sackhint.last_sack_ack = 0;
+ sack_changed = 0;
/*
* If this is either a state-changing packet or current state isn't
* established, we require a write lock on tcbinfo. Otherwise, we
- * allow either a read lock or a write lock, as we may have acquired
- * a write lock due to a race.
- *
- * Require a global write lock for SYN/FIN/RST segments or
- * non-established connections; otherwise accept either a read or
- * write lock, as we may have conservatively acquired a write lock in
- * certain cases in tcp_input() (is this still true?). Currently we
- * will never enter with no lock, so we try to drop it quickly in the
- * common pure ack/pure data cases.
+ * allow the tcbinfo to be either locked or unlocked, as the
+ * caller may have unnecessarily acquired a write lock due to a race.
*/
if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
tp->t_state != TCPS_ESTABLISHED) {
- KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
+ KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
"SYN/FIN/RST/!EST", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
} else {
#ifdef INVARIANTS
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
else {
KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
"ti_locked: %d", __func__, ti_locked));
@@ -1515,6 +1552,11 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
__func__));
+#ifdef TCPPCAP
+ /* Save segment, if requested. */
+ tcp_pcap_add(th, m, &(tp->t_inpkts));
+#endif
+
/*
* Segment received on connection.
* Reset idle time and keep-alive timer.
@@ -1526,7 +1568,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
/*
- * Unscale the window into a 32-bit value.
+ * Scale up the window into a 32-bit value.
* For the SYN_SENT state the scale is zero.
*/
tiwin = th->th_win << tp->snd_scale;
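
For illustration: a raw th_win of 512 with snd_scale = 7 advertises 512 << 7 = 65536 bytes; in SYN_SENT the scale is zero and the raw 16-bit value is used as-is.

/* Example: th_win = 512, snd_scale = 7  ->  tiwin = 512 << 7 = 65536 */
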
@@ -1549,6 +1591,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
TCPSTAT_INC(tcps_ecn_ect1);
break;
}
+
+ /* Let congestion control handle the ECN marks (may differ from RFC 3168). */
+ cc_ecnpkt_handler(tp, th, iptos);
+
/* Congestion experienced. */
if (thflags & TH_ECE) {
cc_cong_signal(tp, th, CC_ECN);
@@ -1573,6 +1619,24 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
to.to_tsecr = 0;
}
+ /*
+ * If timestamps were negotiated during SYN/ACK they should
+ * appear on every segment during this session and vice versa.
+ */
+ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp missing, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
+ if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
+ "no action\n", s, __func__);
+ free(s, M_TCPLOG);
+ }
+ }
/*
* Process options only when we get SYN/ACK back. The SYN case
@@ -1652,8 +1716,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
/*
* This is a pure ack for outstanding data.
*/
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
TCPSTAT_INC(tcps_predack);
@@ -1720,7 +1784,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_wl2 = th->th_ack;
tp->t_dupacks = 0;
m_freem(m);
- ND6_HINT(tp); /* Some progress has been made. */
/*
* If all outstanding data are acked, stop
@@ -1737,14 +1800,16 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
(void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
+ TCP_PROBE3(debug__input, tp, th,
+ mtod(m, const char *));
if (tp->snd_una == tp->snd_max)
tcp_timer_activate(tp, TT_REXMT, 0);
else if (!tcp_timer_active(tp, TT_PERSIST))
tcp_timer_activate(tp, TT_REXMT,
tp->t_rxtcur);
sowwakeup(so);
- if (so->so_snd.sb_cc)
- (void) tcp_output(tp);
+ if (sbavail(&so->so_snd))
+ (void) tp->t_fb->tfb_tcp_output(tp);
goto check_delack;
}
} else if (th->th_ack == tp->snd_una &&
@@ -1756,8 +1821,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* nothing on the reassembly queue and we have enough
* buffer space to take it.
*/
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
/* Clean receiver SACK report if present */
@@ -1777,12 +1842,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->rcv_up = tp->rcv_nxt;
TCPSTAT_INC(tcps_rcvpack);
TCPSTAT_ADD(tcps_rcvbyte, tlen);
- ND6_HINT(tp); /* Some progress has been made */
#ifdef TCPDEBUG
if (so->so_options & SO_DEBUG)
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
+
/*
* Automatic sizing of receive socket buffer. Often the send
* buffer size is not optimally adjusted to the actual network
@@ -1802,11 +1868,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* reassembly queue.
*
* The criteria to step up the receive buffer one notch are:
- * 1. the number of bytes received during the time it takes
+ * 1. Application has not set receive buffer size with
+ * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
+ * 2. the number of bytes received during the time it takes
* one timestamp to be reflected back to us (the RTT);
- * 2. received bytes per RTT is within seven eighth of the
+ * 3. received bytes per RTT is within seven eighths of the
* current socket buffer size;
- * 3. receive buffer size has not hit maximal automatic size;
+ * 4. receive buffer size has not hit maximal automatic size;
*
* This algorithm does one step per RTT at most and only if
* we receive a bulk stream w/o packet losses or reorderings.
@@ -1817,6 +1885,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* the buffer to better manage the socket buffer resources.
*/
if (V_tcp_do_autorcvbuf &&
+ (to.to_flags & TOF_TS) &&
to.to_tsecr &&
(so->so_rcv.sb_flags & SB_AUTOSIZE)) {
if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
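
A sketch of the once-per-RTT step-up rule the comment above describes, using its criteria numbering (assumed variable name rcvd_this_rtt; the real code keeps this count in tp->rfbuf_cnt):

/* Sketch: grow the receive buffer one notch per RTT (assumed names). */
if (rcvd_this_rtt > (so->so_rcv.sb_hiwat / 8) * 7 &&	/* criterion 3 */
    so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {	/* criterion 4 */
	newsize = min(so->so_rcv.sb_hiwat + V_tcp_autorcvbuf_inc,
	    V_tcp_autorcvbuf_max);
}
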
@@ -1851,7 +1920,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
newsize, so, NULL))
so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
m_adj(m, drop_hdrlen); /* delayed header drop */
- sbappendstream_locked(&so->so_rcv, m);
+ sbappendstream_locked(&so->so_rcv, m, 0);
}
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
@@ -1859,7 +1928,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->t_flags |= TF_DELACK;
} else {
tp->t_flags |= TF_ACKNOW;
- tcp_output(tp);
+ tp->t_fb->tfb_tcp_output(tp);
}
goto check_delack;
}
@@ -1893,6 +1962,28 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
}
+#ifdef TCP_RFC7413
+ if (tp->t_flags & TF_FASTOPEN) {
+ /*
+ * When a TFO connection is in SYN_RECEIVED, the
+ * only valid packets are the initial SYN, a
+ * retransmit/copy of the initial SYN (possibly with
+ * a subset of the original data), a valid ACK, a
+ * FIN, or a RST.
+ */
+ if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) {
+ rstreason = BANDLIM_RST_OPENPORT;
+ goto dropwithreset;
+ } else if (thflags & TH_SYN) {
+ /* non-initial SYN is ignored */
+ if ((tcp_timer_active(tp, TT_DELACK) ||
+ tcp_timer_active(tp, TT_REXMT)))
+ goto drop;
+ } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) {
+ goto drop;
+ }
+ }
+#endif
break;
/*
@@ -1916,8 +2007,11 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
rstreason = BANDLIM_UNLIMITED;
goto dropwithreset;
}
- if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST))
+ if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
+ TCP_PROBE5(connect__refused, NULL, tp,
+ mtod(m, const char *), tp, th);
tp = tcp_drop(tp, ECONNREFUSED);
+ }
if (thflags & TH_RST)
goto drop;
if (!(thflags & TH_SYN))
@@ -1962,11 +2056,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
tp->t_starttime = ticks;
if (tp->t_flags & TF_NEEDFIN) {
- tp->t_state = TCPS_FIN_WAIT_1;
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
tp->t_flags &= ~TF_NEEDFIN;
thflags &= ~TH_SYN;
} else {
- tp->t_state = TCPS_ESTABLISHED;
+ tcp_state_change(tp, TCPS_ESTABLISHED);
+ TCP_PROBE5(connect__established, NULL, tp,
+ mtod(m, const char *), tp, th);
cc_conn_init(tp);
tcp_timer_activate(tp, TT_KEEP,
TP_KEEPIDLE(tp));
@@ -1974,22 +2070,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
} else {
/*
* Received initial SYN in SYN-SENT[*] state =>
- * simultaneous open. If segment contains CC option
- * and there is a cached CC, apply TAO test.
+ * simultaneous open.
* If it succeeds, connection is half-synchronized.
* Otherwise, do 3-way handshake:
* SYN-SENT -> SYN-RECEIVED
* SYN-SENT* -> SYN-RECEIVED*
- * If there was no CC option, clear cached CC value.
*/
tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
tcp_timer_activate(tp, TT_REXMT, 0);
- tp->t_state = TCPS_SYN_RECEIVED;
+ tcp_state_change(tp, TCPS_SYN_RECEIVED);
}
- KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
"ti_locked %d", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
@@ -2045,98 +2139,84 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* Then check that at least some bytes of segment are within
* receive window. If segment begins before rcv_nxt,
* drop leading data (and SYN); if nothing left, just ack.
- *
- *
- * If the RST bit is set, check the sequence number to see
- * if this is a valid reset segment.
- * RFC 793 page 37:
- * In all states except SYN-SENT, all reset (RST) segments
- * are validated by checking their SEQ-fields. A reset is
- * valid if its sequence number is in the window.
- * Note: this does not take into account delayed ACKs, so
- * we should test against last_ack_sent instead of rcv_nxt.
- * The sequence number in the reset segment is normally an
- * echo of our outgoing acknowlegement numbers, but some hosts
- * send a reset with the sequence number at the rightmost edge
- * of our receive window, and we have to handle this case.
- * Note 2: Paul Watson's paper "Slipping in the Window" has shown
- * that brute force RST attacks are possible. To combat this,
- * we use a much stricter check while in the ESTABLISHED state,
- * only accepting RSTs where the sequence number is equal to
- * last_ack_sent. In all other states (the states in which a
- * RST is more likely), the more permissive check is used.
- * If we have multiple segments in flight, the initial reset
- * segment sequence numbers will be to the left of last_ack_sent,
- * but they will eventually catch up.
- * In any case, it never made sense to trim reset segments to
- * fit the receive window since RFC 1122 says:
- * 4.2.2.12 RST Segment: RFC-793 Section 3.4
- *
- * A TCP SHOULD allow a received RST segment to include data.
- *
- * DISCUSSION
- * It has been suggested that a RST segment could contain
- * ASCII text that encoded and explained the cause of the
- * RST. No standard has yet been established for such
- * data.
- *
- * If the reset segment passes the sequence number test examine
- * the state:
- * SYN_RECEIVED STATE:
- * If passive open, return to LISTEN state.
- * If active open, inform user that connection was refused.
- * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
- * Inform user that connection was reset, and close tcb.
- * CLOSING, LAST_ACK STATES:
- * Close the tcb.
- * TIME_WAIT STATE:
- * Drop the segment - see Stevens, vol. 2, p. 964 and
- * RFC 1337.
*/
if (thflags & TH_RST) {
- if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
- SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
- switch (tp->t_state) {
-
- case TCPS_SYN_RECEIVED:
- so->so_error = ECONNREFUSED;
- goto close;
-
- case TCPS_ESTABLISHED:
- if (V_tcp_insecure_rst == 0 &&
- !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) &&
- SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) &&
- !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
- SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) {
- TCPSTAT_INC(tcps_badrst);
- goto drop;
- }
- /* FALLTHROUGH */
- case TCPS_FIN_WAIT_1:
- case TCPS_FIN_WAIT_2:
- case TCPS_CLOSE_WAIT:
- so->so_error = ECONNRESET;
- close:
- KASSERT(ti_locked == TI_WLOCKED,
- ("tcp_do_segment: TH_RST 1 ti_locked %d",
- ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
-
- tp->t_state = TCPS_CLOSED;
+ /*
+ * RFC5961 Section 3.2
+ *
+ * - RST drops connection only if SEG.SEQ == RCV.NXT.
+ * - If RST is in window, we send challenge ACK.
+ *
+ * Note: to take into account delayed ACKs, we should
+ * test against last_ack_sent instead of rcv_nxt.
+ * Note 2: we handle special case of closed window, not
+ * covered by the RFC.
+ */
+ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+ (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
+
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED,
+ ("%s: TH_RST ti_locked %d, th %p tp %p",
+ __func__, ti_locked, th, tp));
+ KASSERT(tp->t_state != TCPS_SYN_SENT,
+ ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
+ __func__, th, tp));
+
+ if (V_tcp_insecure_rst ||
+ tp->last_ack_sent == th->th_seq) {
TCPSTAT_INC(tcps_drops);
- tp = tcp_close(tp);
- break;
+ /* Drop the connection. */
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ so->so_error = ECONNREFUSED;
+ goto close;
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ so->so_error = ECONNRESET;
+ close:
+ tcp_state_change(tp, TCPS_CLOSED);
+ /* FALLTHROUGH */
+ default:
+ tp = tcp_close(tp);
+ }
+ } else {
+ TCPSTAT_INC(tcps_badrst);
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m,
+ tp->rcv_nxt, tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL;
+ }
+ }
+ goto drop;
+ }
- case TCPS_CLOSING:
- case TCPS_LAST_ACK:
- KASSERT(ti_locked == TI_WLOCKED,
- ("tcp_do_segment: TH_RST 2 ti_locked %d",
- ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ /*
+ * RFC5961 Section 4.2
+ * Send challenge ACK for any SYN in synchronized state.
+ */
+ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT &&
+ tp->t_state != TCPS_SYN_RECEIVED) {
+ KASSERT(ti_locked == TI_RLOCKED,
+ ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
- tp = tcp_close(tp);
- break;
- }
+ TCPSTAT_INC(tcps_badsyn);
+ if (V_tcp_insecure_syn &&
+ SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ tp = tcp_drop(tp, ECONNRESET);
+ rstreason = BANDLIM_UNLIMITED;
+ } else {
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
+ tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL;
}
goto drop;
}
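
A numeric illustration of the RFC 5961 checks above, assuming last_ack_sent = 1000 and rcv_wnd = 5000:

/*
 * RST, seq == 1000             -> connection dropped (tcps_drops)
 * RST, 1000 < seq < 6000       -> challenge ACK sent (tcps_badrst)
 * RST, seq outside that window -> segment silently dropped
 * SYN while ESTABLISHED        -> challenge ACK sent (tcps_badsyn),
 *                                 unless insecure_syn accepts an
 *                                 in-window SYN as a reset
 */
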
@@ -2236,15 +2316,14 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if ((so->so_state & SS_NOFDREF) &&
tp->t_state > TCPS_CLOSE_WAIT && tlen) {
- char *s;
-
- KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && "
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && "
"CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
- if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
- log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket "
- "was closed, sending RST and removing tcpcb\n",
+ if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
+ log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
+ "after socket was closed, "
+ "sending RST and removing tcpcb\n",
s, __func__, tcpstates[tp->t_state], tlen);
free(s, M_TCPLOG);
}
@@ -2309,29 +2388,22 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
/*
- * If a SYN is in the window, then this is an
- * error and we send an RST and drop the connection.
- */
- if (thflags & TH_SYN) {
- KASSERT(ti_locked == TI_WLOCKED,
- ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
-
- tp = tcp_drop(tp, ECONNRESET);
- rstreason = BANDLIM_UNLIMITED;
- goto drop;
- }
-
- /*
* If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
* flag is on (half-synchronized state), then queue data for
* later processing; else drop segment and return.
*/
if ((thflags & TH_ACK) == 0) {
if (tp->t_state == TCPS_SYN_RECEIVED ||
- (tp->t_flags & TF_NEEDSYN))
+ (tp->t_flags & TF_NEEDSYN)) {
+#ifdef TCP_RFC7413
+ if (tp->t_state == TCPS_SYN_RECEIVED &&
+ tp->t_flags & TF_FASTOPEN) {
+ tp->snd_wnd = tiwin;
+ cc_conn_init(tp);
+ }
+#endif
goto step6;
- else if (tp->t_flags & TF_ACKNOW)
+ } else if (tp->t_flags & TF_ACKNOW)
goto dropafterack;
else
goto drop;
@@ -2364,11 +2436,33 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
tp->t_starttime = ticks;
if (tp->t_flags & TF_NEEDFIN) {
- tp->t_state = TCPS_FIN_WAIT_1;
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
tp->t_flags &= ~TF_NEEDFIN;
} else {
- tp->t_state = TCPS_ESTABLISHED;
- cc_conn_init(tp);
+ tcp_state_change(tp, TCPS_ESTABLISHED);
+ TCP_PROBE5(accept__established, NULL, tp,
+ mtod(m, const char *), tp, th);
+#ifdef TCP_RFC7413
+ if (tp->t_tfo_pending) {
+ tcp_fastopen_decrement_counter(tp->t_tfo_pending);
+ tp->t_tfo_pending = NULL;
+
+ /*
+ * Account for the ACK of our SYN prior to
+ * regular ACK processing below.
+ */
+ tp->snd_una++;
+ }
+ /*
+ * TFO connections call cc_conn_init() during SYN
+ * processing. Calling it again here for such
+ * connections is not harmless as it would undo the
+ * snd_cwnd reduction that occurs when a TFO SYN|ACK
+ * is retransmitted.
+ */
+ if (!(tp->t_flags & TF_FASTOPEN))
+#endif
+ cc_conn_init(tp);
tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
}
/*
@@ -2402,21 +2496,45 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
if ((tp->t_flags & TF_SACK_PERMIT) &&
((to.to_flags & TOF_SACK) ||
!TAILQ_EMPTY(&tp->snd_holes)))
- tcp_sack_doack(tp, &to, th->th_ack);
+ sack_changed = tcp_sack_doack(tp, &to, th->th_ack);
+ else
+ /*
+ * Reset the value so that previous (valid) value
+ * from the last ack with SACK doesn't get used.
+ */
+ tp->sackhint.sacked_bytes = 0;
/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
hhook_run_tcp_est_in(tp, th, &to);
if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
- if (tlen == 0 && tiwin == tp->snd_wnd) {
+ u_int maxseg;
+
+ maxseg = tcp_maxseg(tp);
+ if (tlen == 0 &&
+ (tiwin == tp->snd_wnd ||
+ (tp->t_flags & TF_SACK_PERMIT))) {
+ /*
+ * If this is the first time we've seen a
+ * FIN from the remote, this is not a
+ * duplicate and it needs to be processed
+ * normally. This happens during a
+ * simultaneous close.
+ */
+ if ((thflags & TH_FIN) &&
+ (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
+ tp->t_dupacks = 0;
+ break;
+ }
TCPSTAT_INC(tcps_rcvdupack);
/*
* If we have outstanding data (other than
* a window probe), this is a completely
* duplicate ack (ie, window info didn't
- * change), the ack is the biggest we've
+ * change and FIN isn't set),
+ * the ack is the biggest we've
* seen and we've seen exactly our rexmt
- * threshhold of them, assume a packet
+ * threshold of them, assume a packet
* has been dropped and retransmit it.
* Kludge snd_nxt & the congestion
* window so we send only this one
@@ -2437,8 +2555,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* When using TCP ECN, notify the peer that
* we reduced the cwnd.
*/
- if (!tcp_timer_active(tp, TT_REXMT) ||
- th->th_ack != tp->snd_una)
+ /*
+ * The following two kinds of ACKs should not affect
+ * dupack counting:
+ * 1) Old acks
+ * 2) Acks with SACK but without any new SACK
+ * information in them. These could result from
+ * any anomaly in the network like a switch
+ * duplicating packets or a possible DoS attack.
+ */
+ if (th->th_ack != tp->snd_una ||
+ ((tp->t_flags & TF_SACK_PERMIT) &&
+ !sack_changed))
+ break;
+ else if (!tcp_timer_active(tp, TT_REXMT))
tp->t_dupacks = 0;
else if (++tp->t_dupacks > tcprexmtthresh ||
IN_FASTRECOVERY(tp->t_flags)) {
@@ -2453,26 +2583,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* we have less than 1/2 the original window's
* worth of data in flight.
*/
- awnd = (tp->snd_nxt - tp->snd_fack) +
- tp->sackhint.sack_bytes_rexmit;
+ if (V_tcp_do_rfc6675_pipe)
+ awnd = tcp_compute_pipe(tp);
+ else
+ awnd = (tp->snd_nxt - tp->snd_fack) +
+ tp->sackhint.sack_bytes_rexmit;
+
if (awnd < tp->snd_ssthresh) {
- tp->snd_cwnd += tp->t_maxseg;
+ tp->snd_cwnd += maxseg;
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
}
} else
- tp->snd_cwnd += tp->t_maxseg;
- if ((thflags & TH_FIN) &&
- (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
- /*
- * If its a fin we need to process
- * it to avoid a race where both
- * sides enter FIN-WAIT and send FIN|ACK
- * at the same time.
- */
- break;
- }
- (void) tcp_output(tp);
+ tp->snd_cwnd += maxseg;
+ (void) tp->t_fb->tfb_tcp_output(tp);
goto drop;
} else if (tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
@@ -2505,33 +2629,33 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
TCPSTAT_INC(
tcps_sack_recovery_episode);
tp->sack_newdata = tp->snd_nxt;
- tp->snd_cwnd = tp->t_maxseg;
- (void) tcp_output(tp);
+ tp->snd_cwnd = maxseg;
+ (void) tp->t_fb->tfb_tcp_output(tp);
goto drop;
}
tp->snd_nxt = th->th_ack;
- tp->snd_cwnd = tp->t_maxseg;
- if ((thflags & TH_FIN) &&
- (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
- /*
- * If its a fin we need to process
- * it to avoid a race where both
- * sides enter FIN-WAIT and send FIN|ACK
- * at the same time.
- */
- break;
- }
- (void) tcp_output(tp);
+ tp->snd_cwnd = maxseg;
+ (void) tp->t_fb->tfb_tcp_output(tp);
KASSERT(tp->snd_limited <= 2,
("%s: tp->snd_limited too big",
__func__));
tp->snd_cwnd = tp->snd_ssthresh +
- tp->t_maxseg *
+ maxseg *
(tp->t_dupacks - tp->snd_limited);
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
goto drop;
} else if (V_tcp_do_rfc3042) {
+ /*
+ * Process first and second duplicate
+ * ACKs. Each indicates a segment
+ * leaving the network, creating room
+ * for more. Make sure we can send a
+ * packet on reception of each duplicate
+ * ACK by increasing snd_cwnd by one
+ * segment. Restore the original
+ * snd_cwnd after packet transmission.
+ */
cc_ack_received(tp, th, CC_DUPACK);
u_long oldcwnd = tp->snd_cwnd;
tcp_seq oldsndmax = tp->snd_max;
@@ -2547,33 +2671,23 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_cwnd =
(tp->snd_nxt - tp->snd_una) +
(tp->t_dupacks - tp->snd_limited) *
- tp->t_maxseg;
- if ((thflags & TH_FIN) &&
- (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
- /*
- * If its a fin we need to process
- * it to avoid a race where both
- * sides enter FIN-WAIT and send FIN|ACK
- * at the same time.
- */
- break;
- }
+ maxseg;
/*
* Only call tcp_output when there
* is new data available to be sent.
* Otherwise we would send pure ACKs.
*/
SOCKBUF_LOCK(&so->so_snd);
- avail = so->so_snd.sb_cc -
+ avail = sbavail(&so->so_snd) -
(tp->snd_nxt - tp->snd_una);
SOCKBUF_UNLOCK(&so->so_snd);
if (avail > 0)
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
sent = tp->snd_max - oldsndmax;
- if (sent > tp->t_maxseg) {
+ if (sent > maxseg) {
KASSERT((tp->t_dupacks == 2 &&
tp->snd_limited == 0) ||
- (sent == tp->t_maxseg + 1 &&
+ (sent == maxseg + 1 &&
tp->t_flags & TF_SENTFIN),
("%s: sent too much",
__func__));
@@ -2583,9 +2697,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_cwnd = oldcwnd;
goto drop;
}
- } else
- tp->t_dupacks = 0;
+ }
break;
+ } else {
+ /*
+ * This ack is advancing the left edge, reset the
+ * counter.
+ */
+ tp->t_dupacks = 0;
+ /*
+ * If this ack also has new SACK info, increment the
+ * counter as per rfc6675.
+ */
+ if ((tp->t_flags & TF_SACK_PERMIT) && sack_changed)
+ tp->t_dupacks++;
}
KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
@@ -2604,7 +2729,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
} else
cc_post_recovery(tp, th);
}
- tp->t_dupacks = 0;
/*
* If we reach this point, ACK is not a duplicate,
* i.e., it ACKs something we sent.
@@ -2631,6 +2755,9 @@ process_ACK:
INP_WLOCK_ASSERT(tp->t_inpcb);
acked = BYTES_THIS_ACK(tp, th);
+ KASSERT(acked >= 0, ("%s: acked unexpectedly negative "
+ "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__,
+ tp->snd_una, th->th_ack, tp, m));
TCPSTAT_INC(tcps_rcvackpack);
TCPSTAT_ADD(tcps_rcvackbyte, acked);
@@ -2699,17 +2826,25 @@ process_ACK:
cc_ack_received(tp, th, CC_ACK);
SOCKBUF_LOCK(&so->so_snd);
- if (acked > so->so_snd.sb_cc) {
- tp->snd_wnd -= so->so_snd.sb_cc;
- sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc);
+ if (acked > sbavail(&so->so_snd)) {
+ if (tp->snd_wnd >= sbavail(&so->so_snd))
+ tp->snd_wnd -= sbavail(&so->so_snd);
+ else
+ tp->snd_wnd = 0;
+ mfree = sbcut_locked(&so->so_snd,
+ (int)sbavail(&so->so_snd));
ourfinisacked = 1;
} else {
- sbdrop_locked(&so->so_snd, acked);
- tp->snd_wnd -= acked;
+ mfree = sbcut_locked(&so->so_snd, acked);
+ if (tp->snd_wnd >= (u_long) acked)
+ tp->snd_wnd -= acked;
+ else
+ tp->snd_wnd = 0;
ourfinisacked = 0;
}
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
+ m_freem(mfree);
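/*
 * Editor's note: the change above guards snd_wnd against unsigned
 * integer underflow when more bytes are ACKed than the window holds.
 * The same idea as a generic saturating subtraction, for illustration:
 */
static unsigned long
sat_sub(unsigned long w, unsigned long dec)
{
	/* Clamp at zero instead of wrapping around. */
	return ((w >= dec) ? w - dec : 0);
}
/* e.g. tp->snd_wnd = sat_sub(tp->snd_wnd, acked); */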
/* Detect una wraparound. */
if (!IN_RECOVERY(tp->t_flags) &&
SEQ_GT(tp->snd_una, tp->snd_recover) &&
@@ -2755,7 +2890,7 @@ process_ACK:
tcp_finwait2_timeout :
TP_MAXIDLE(tp)));
}
- tp->t_state = TCPS_FIN_WAIT_2;
+ tcp_state_change(tp, TCPS_FIN_WAIT_2);
}
break;
@@ -2767,9 +2902,9 @@ process_ACK:
*/
case TCPS_CLOSING:
if (ourfinisacked) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tcp_twstart(tp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
m_freem(m);
return;
}
@@ -2783,7 +2918,7 @@ process_ACK:
*/
case TCPS_LAST_ACK:
if (ourfinisacked) {
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
tp = tcp_close(tp);
goto drop;
}
@@ -2826,7 +2961,7 @@ step6:
* actually wanting to send this much urgent data.
*/
SOCKBUF_LOCK(&so->so_rcv);
- if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
+ if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
th->th_urp = 0; /* XXX */
thflags &= ~TH_URG; /* XXX */
SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
@@ -2848,7 +2983,7 @@ step6:
*/
if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
tp->rcv_up = th->th_seq + th->th_urp;
- so->so_oobmark = so->so_rcv.sb_cc +
+ so->so_oobmark = sbavail(&so->so_rcv) +
(tp->rcv_up - tp->rcv_nxt) - 1;
if (so->so_oobmark == 0)
so->so_rcv.sb_state |= SBS_RCVATMARK;
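/*
 * Editor's note, worked example for the oobmark arithmetic above: with
 * 100 bytes already buffered (sbavail), rcv_nxt = 1000, and an urgent
 * pointer advancing rcv_up to 1010, the mark is
 * 100 + (1010 - 1000) - 1 = 109, i.e. the urgent byte is the 110th
 * byte the application will read from the receive buffer.
 */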
@@ -2887,7 +3022,9 @@ dodata: /* XXX */
* case PRU_RCVD). If a FIN has already been received on this
* connection then we just ignore the text.
*/
- if ((tlen || (thflags & TH_FIN)) &&
+ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
+ (tp->t_flags & TF_FASTOPEN));
+ if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
tcp_seq save_start = th->th_seq;
m_adj(m, drop_hdrlen); /* delayed header drop */
@@ -2905,8 +3042,9 @@ dodata: /* XXX */
*/
if (th->th_seq == tp->rcv_nxt &&
LIST_EMPTY(&tp->t_segq) &&
- TCPS_HAVEESTABLISHED(tp->t_state)) {
- if (DELAY_ACK(tp, tlen))
+ (TCPS_HAVEESTABLISHED(tp->t_state) ||
+ tfo_syn)) {
+ if (DELAY_ACK(tp, tlen) || tfo_syn)
tp->t_flags |= TF_DELACK;
else
tp->t_flags |= TF_ACKNOW;
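/*
 * Editor's note: a sketch of the in-order fast-path test used above,
 * with hypothetical parameters standing in for the tcpcb fields.  Data
 * may bypass the reassembly queue only when it starts exactly at
 * rcv_nxt, nothing is already queued, and the connection is past the
 * handshake (or this is a TFO SYN carrying data).
 */
static int
can_append_directly(unsigned int th_seq, unsigned int rcv_nxt,
    int reass_queue_empty, int established, int tfo_syn)
{
	return (th_seq == rcv_nxt && reass_queue_empty &&
	    (established || tfo_syn));
}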
@@ -2914,12 +3052,11 @@ dodata: /* XXX */
thflags = th->th_flags & TH_FIN;
TCPSTAT_INC(tcps_rcvpack);
TCPSTAT_ADD(tcps_rcvbyte, tlen);
- ND6_HINT(tp);
SOCKBUF_LOCK(&so->so_rcv);
if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
m_freem(m);
else
- sbappendstream_locked(&so->so_rcv, m);
+ sbappendstream_locked(&so->so_rcv, m, 0);
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
} else {
@@ -2981,7 +3118,7 @@ dodata: /* XXX */
tp->t_starttime = ticks;
/* FALLTHROUGH */
case TCPS_ESTABLISHED:
- tp->t_state = TCPS_CLOSE_WAIT;
+ tcp_state_change(tp, TCPS_CLOSE_WAIT);
break;
/*
@@ -2989,7 +3126,7 @@ dodata: /* XXX */
* enter the CLOSING state.
*/
case TCPS_FIN_WAIT_1:
- tp->t_state = TCPS_CLOSING;
+ tcp_state_change(tp, TCPS_CLOSING);
break;
/*
@@ -2998,18 +3135,18 @@ dodata: /* XXX */
* standard timers.
*/
case TCPS_FIN_WAIT_2:
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata "
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata "
"TCP_FIN_WAIT_2 ti_locked: %d", __func__,
ti_locked));
tcp_twstart(tp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return;
}
}
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
#ifdef TCPDEBUG
@@ -3017,12 +3154,13 @@ dodata: /* XXX */
tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
/*
* Return any desired output.
*/
if (needoutput || (tp->t_flags & TF_ACKNOW))
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
check_delack:
KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
@@ -3064,19 +3202,20 @@ dropafterack:
tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
tp->t_flags |= TF_ACKNOW;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
INP_WUNLOCK(tp->t_inpcb);
m_freem(m);
return;
dropwithreset:
- if (ti_locked == TI_WLOCKED)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
if (tp != NULL) {
@@ -3087,8 +3226,8 @@ dropwithreset:
return;
drop:
- if (ti_locked == TI_WLOCKED) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
ti_locked = TI_UNLOCKED;
}
#ifdef INVARIANTS
@@ -3104,6 +3243,7 @@ drop:
tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
+ TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
if (tp != NULL)
INP_WUNLOCK(tp->t_inpcb);
m_freem(m);
@@ -3114,7 +3254,7 @@ drop:
* The mbuf must still include the original packet header.
* tp may be NULL.
*/
-static void
+void
tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
int tlen, int rstreason)
{
@@ -3177,7 +3317,7 @@ drop:
/*
* Parse TCP options and place in tcpopt.
*/
-static void
+void
tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
{
int opt, optlen;
@@ -3259,6 +3399,21 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
to->to_sacks = cp + 2;
TCPSTAT_INC(tcps_sack_rcv_blocks);
break;
+#ifdef TCP_RFC7413
+ case TCPOPT_FAST_OPEN:
+ if ((optlen != TCPOLEN_FAST_OPEN_EMPTY) &&
+     ((optlen < TCPOLEN_FAST_OPEN_MIN) ||
+     (optlen > TCPOLEN_FAST_OPEN_MAX)))
+ continue;
+ if (!(flags & TO_SYN))
+ continue;
+ if (!V_tcp_fastopen_enabled)
+ continue;
+ to->to_flags |= TOF_FASTOPEN;
+ to->to_tfo_len = optlen - 2;
+ to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL;
+ break;
+#endif
default:
continue;
}
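/*
 * Editor's note: wire format assumed by the TCPOPT_FAST_OPEN parsing
 * above (RFC 7413): kind (1 byte), length (1 byte), then an optional
 * cookie of 4..16 bytes; length == 2 means a cookie request.  The same
 * validation as a stand-alone sketch with the constants spelled out:
 */
static int
tfo_opt_ok(int optlen, int is_syn, int fastopen_enabled)
{
	if (optlen != 2 && (optlen < 6 || optlen > 18))
		return (0);	/* malformed cookie length */
	return (is_syn && fastopen_enabled);
}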
@@ -3271,7 +3426,7 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
* It is still reflected in the segment length for
* sequencing purposes.
*/
-static void
+void
tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
int off)
{
@@ -3304,7 +3459,7 @@ tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
* Collect new round-trip time estimate
* and update averages and current timeout.
*/
-static void
+void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
int delta;
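/*
 * Editor's note: tcp_xmit_timer() maintains the smoothed RTT and RTT
 * variance in scaled fixed point; the body is elided by this hunk.  A
 * simplified integer sketch of the classic estimator (RFC 6298,
 * alpha = 1/8, beta = 1/4), without FreeBSD's scaling shifts:
 */
static void
rtt_update_sketch(int *srtt, int *rttvar, int rtt_sample)
{
	int dev;

	if (*srtt == 0) {		/* first measurement */
		*srtt = rtt_sample;
		*rttvar = rtt_sample / 2;
		return;
	}
	dev = rtt_sample - *srtt;
	if (dev < 0)
		dev = -dev;
	*rttvar += (dev - *rttvar) / 4;
	*srtt += (rtt_sample - *srtt) / 8;
	/* RTO = srtt + max(clock granularity, 4 * rttvar), clamped. */
}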
@@ -3394,11 +3549,9 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt)
* While looking at the routing entry, we also initialize other path-dependent
* parameters from pre-set or cached values in the routing entry.
*
- * Also take into account the space needed for options that we
- * send regularly. Make maxseg shorter by that amount to assure
- * that we can send maxseg amount of data even when the options
- * are present. Store the upper limit of the length of options plus
- * data in maxopd.
+ * NOTE that the resulting t_maxseg doesn't include space for TCP or
+ * IP options, e.g. IPSEC data, since the length of this data may vary,
+ * and thus it is calculated separately for every segment in tcp_output().
*
* NOTE that this routine is only called when we process an incoming
* segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
@@ -3412,7 +3565,6 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
u_long maxmtu = 0;
struct inpcb *inp = tp->t_inpcb;
struct hc_metrics_lite metrics;
- int origoffer;
#ifdef INET6
int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
size_t min_protoh = isipv6 ?
@@ -3428,13 +3580,12 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
KASSERT(offer == -1, ("%s: conflict", __func__));
offer = mtuoffer - min_protoh;
}
- origoffer = offer;
/* Initialize. */
#ifdef INET6
if (isipv6) {
maxmtu = tcp_maxmtu6(&inp->inp_inc, cap);
- tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
+ tp->t_maxseg = V_tcp_v6mssdflt;
}
#endif
#if defined(INET) && defined(INET6)
@@ -3443,7 +3594,7 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
#ifdef INET
{
maxmtu = tcp_maxmtu(&inp->inp_inc, cap);
- tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
+ tp->t_maxseg = V_tcp_mssdflt;
}
#endif
@@ -3467,9 +3618,9 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
/*
* Offer == 0 means that there was no MSS on the SYN
* segment; in this case we use tcp_mssdflt as
- * already assigned to t_maxopd above.
+ * already assigned to t_maxseg above.
*/
- offer = tp->t_maxopd;
+ offer = tp->t_maxseg;
break;
case -1:
@@ -3494,8 +3645,8 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
/*
- * If there's a discovered mtu int tcp hostcache, use it
- * else, use the link mtu.
+ * If there's a discovered mtu in the tcp hostcache, use it.
+ * Otherwise, use the link mtu.
*/
if (metrics.rmx_mtu)
mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
@@ -3541,31 +3692,15 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
mss = min(mss, offer);
/*
- * Sanity check: make sure that maxopd will be large
+ * Sanity check: make sure that maxseg will be large
* enough to allow some data on segments even if all
* the option space is used (40 bytes). Otherwise
* funny things may happen in tcp_output().
+ *
+ * XXXGL: shouldn't we reserve space for IP/IPv6 options?
*/
mss = max(mss, 64);
- /*
- * maxopd stores the maximum length of data AND options
- * in a segment; maxseg is the amount of data in a normal
- * segment. We need to store this value (maxopd) apart
- * from maxseg, because now every segment carries options
- * and thus we normally have somewhat less data in segments.
- */
- tp->t_maxopd = mss;
-
- /*
- * origoffer==-1 indicates that no segments were received yet.
- * In this case we just guess.
- */
- if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
- (origoffer == -1 ||
- (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
- mss -= TCPOLEN_TSTAMP_APPA;
-
tp->t_maxseg = mss;
}
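/*
 * Editor's note: the net effect of the MSS selection above, as a
 * stand-alone sketch.  Inputs are the peer's offer (0 if absent), a
 * cached path MTU from the hostcache (0 if none), the link MTU, and
 * the protocol header size; the result is clamped to a 64-byte floor
 * so segments can still carry data when options consume 40 bytes.
 */
static unsigned long
mss_select_sketch(unsigned long offer, unsigned long hc_mtu,
    unsigned long link_mtu, unsigned long min_protoh)
{
	unsigned long mss;

	/* Prefer the discovered path MTU, bounded by the link MTU. */
	mss = (hc_mtu ? (hc_mtu < link_mtu ? hc_mtu : link_mtu)
	    : link_mtu) - min_protoh;
	if (offer)
		mss = (mss < offer) ? mss : offer;
	return ((mss > 64) ? mss : 64);
}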
@@ -3684,11 +3819,12 @@ tcp_mssopt(struct in_conninfo *inc)
* By setting snd_nxt to th_ack, this forces the retransmission timer
* to be started again.
*/
-static void
+void
tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
{
tcp_seq onxt = tp->snd_nxt;
- u_long ocwnd = tp->snd_cwnd;
+ u_long ocwnd = tp->snd_cwnd;
+ u_int maxseg = tcp_maxseg(tp);
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -3699,9 +3835,9 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
* Set snd_cwnd to one segment beyond acknowledged offset.
* (tp->snd_una has not yet been updated when this function is called.)
*/
- tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
+ tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
tp->t_flags |= TF_ACKNOW;
- (void) tcp_output(tp);
+ (void) tp->t_fb->tfb_tcp_output(tp);
tp->snd_cwnd = ocwnd;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
@@ -3713,5 +3849,13 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
else
tp->snd_cwnd = 0;
- tp->snd_cwnd += tp->t_maxseg;
+ tp->snd_cwnd += maxseg;
+}
+
+int
+tcp_compute_pipe(struct tcpcb *tp)
+{
+ return (tp->snd_max - tp->snd_una +
+ tp->sackhint.sack_bytes_rexmit -
+ tp->sackhint.sacked_bytes);
}
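/*
 * Editor's note, worked example for tcp_compute_pipe(): with
 * snd_max - snd_una = 10000 bytes outstanding, 2000 bytes already
 * retransmitted from the SACK scoreboard (sack_bytes_rexmit), and
 * 3000 bytes reported SACKed by the peer (sacked_bytes), the RFC 6675
 * "pipe" estimate is 10000 + 2000 - 3000 = 9000 bytes in flight.
 */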