diff options
Diffstat (limited to 'freebsd/sys/netinet/tcp_input.c')
-rw-r--r-- | freebsd/sys/netinet/tcp_input.c | 1134 |
1 files changed, 639 insertions, 495 deletions
diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c index f9512eb3..eaa3eb3d 100644 --- a/freebsd/sys/netinet/tcp_input.c +++ b/freebsd/sys/netinet/tcp_input.c @@ -52,7 +52,6 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include <rtems/bsd/local/opt_ipfw.h> /* for ipfw_fwd */ #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_ipsec.h> @@ -65,6 +64,7 @@ __FBSDID("$FreeBSD$"); #include <sys/mbuf.h> #include <sys/proc.h> /* for proc0 declaration */ #include <sys/protosw.h> +#include <sys/sdt.h> #include <sys/signalvar.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -77,16 +77,16 @@ __FBSDID("$FreeBSD$"); #include <vm/uma.h> #include <net/if.h> +#include <net/if_var.h> #include <net/route.h> #include <net/vnet.h> #define TCPSTATES /* for logging */ -#include <netinet/cc.h> #include <netinet/in.h> +#include <netinet/in_kdtrace.h> #include <netinet/in_pcb.h> #include <netinet/in_systm.h> -#include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/ip_icmp.h> /* required for icmp_var.h */ #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ @@ -95,14 +95,23 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip6.h> #include <netinet/icmp6.h> #include <netinet6/in6_pcb.h> +#include <netinet6/in6_var.h> #include <netinet6/ip6_var.h> #include <netinet6/nd6.h> +#ifdef TCP_RFC7413 +#include <netinet/tcp_fastopen.h> +#endif +#include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet6/tcp6_var.h> #include <netinet/tcpip.h> +#include <netinet/cc/cc.h> +#ifdef TCPPCAP +#include <netinet/tcp_pcap.h> +#endif #include <netinet/tcp_syncache.h> #ifdef TCPDEBUG #include <netinet/tcp_debug.h> @@ -122,11 +131,6 @@ __FBSDID("$FreeBSD$"); const int tcprexmtthresh = 3; -VNET_DEFINE(struct tcpstat, tcpstat); -SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, - &VNET_NAME(tcpstat), tcpstat, - "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); - int tcp_log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &tcp_log_in_vain, 0, @@ -134,88 +138,96 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, VNET_DEFINE(int, blackhole) = 0; #define V_blackhole VNET(blackhole) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(blackhole), 0, "Do not send RST on segments to closed ports"); VNET_DEFINE(int, tcp_delack_enabled) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_delack_enabled), 0, "Delay ACK to try and piggyback it onto a data packet"); VNET_DEFINE(int, drop_synfin) = 0; #define V_drop_synfin VNET(drop_synfin) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_synfin), 0, "Drop TCP packets with SYN+FIN set"); +VNET_DEFINE(int, tcp_do_rfc6675_pipe) = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc6675_pipe, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(tcp_do_rfc6675_pipe), 0, + "Use calculated pipe/in-flight bytes per RFC 6675"); + VNET_DEFINE(int, tcp_do_rfc3042) = 1; #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3042), 0, "Enable RFC 3042 (Limited Transmit)"); VNET_DEFINE(int, tcp_do_rfc3390) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, experimental, CTLFLAG_RW, 0, - "Experimental TCP extensions"); - -VNET_DEFINE(int, tcp_do_initcwnd10) = 1; -SYSCTL_VNET_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_RW, - &VNET_NAME(tcp_do_initcwnd10), 0, - "Enable RFC 6928 (Increasing initial CWND to 10)"); +VNET_DEFINE(int, tcp_initcwnd_segments) = 10; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0, + "Slow-start flight size (initial congestion window) in number of segments"); VNET_DEFINE(int, tcp_do_rfc3465) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, "Enable RFC 3465 (Appropriate Byte Counting)"); VNET_DEFINE(int, tcp_abc_l_var) = 2; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); -VNET_DEFINE(int, tcp_do_ecn) = 0; -SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, +VNET_DEFINE(int, tcp_do_ecn) = 2; +SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0, "TCP ECN support"); VNET_DEFINE(int, tcp_ecn_maxretries) = 1; -SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0, "Max retries before giving up on ECN"); +VNET_DEFINE(int, tcp_insecure_syn) = 0; +#define V_tcp_insecure_syn VNET(tcp_insecure_syn) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(tcp_insecure_syn), 0, + "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets"); + VNET_DEFINE(int, tcp_insecure_rst) = 0; #define V_tcp_insecure_rst VNET(tcp_insecure_rst) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_rst), 0, - "Follow the old (insecure) criteria for accepting RST packets"); + "Follow RFC793 instead of RFC5961 criteria for accepting RST packets"); VNET_DEFINE(int, tcp_recvspace) = 1024*64; #define V_tcp_recvspace VNET(tcp_recvspace) -SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size"); VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024; #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_inc), 0, "Incrementor step size of automatic receive buffer"); VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); @@ -223,47 +235,55 @@ VNET_DEFINE(struct inpcbhead, tcb); #define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); -static void tcp_dooptions(struct tcpopt *, u_char *, int, int); -static void tcp_do_segment(struct mbuf *, struct tcphdr *, - struct socket *, struct tcpcb *, int, int, uint8_t, - int); -static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, - struct tcpcb *, int, int); -static void tcp_pulloutofband(struct socket *, - struct tcphdr *, struct mbuf *, int); -static void tcp_xmit_timer(struct tcpcb *, int); -static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); -static void inline tcp_fields_to_host(struct tcphdr *); -#ifdef TCP_SIGNATURE -static void inline tcp_fields_to_net(struct tcphdr *); -static int inline tcp_signature_verify_input(struct mbuf *, int, int, - int, struct tcpopt *, struct tcphdr *, u_int); -#endif -static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, - uint16_t type); -static void inline cc_conn_init(struct tcpcb *tp); -static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); -static void inline hhook_run_tcp_est_in(struct tcpcb *tp, - struct tcphdr *th, struct tcpopt *to); +/* + * TCP statistics are stored in an array of counter(9)s, which size matches + * size of struct tcpstat. TCP running connection count is a regular array. + */ +VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat); +SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat, + tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); +VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]); +SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD | + CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES, + "TCP connection counts by TCP state"); + +static void +tcp_vnet_init(const void *unused) +{ + + COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK); + VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); +} +VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + tcp_vnet_init, NULL); + +#ifdef VIMAGE +static void +tcp_vnet_uninit(const void *unused) +{ + + COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES); + VNET_PCPUSTAT_FREE(tcpstat); +} +VNET_SYSUNINIT(tcp_vnet_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + tcp_vnet_uninit, NULL); +#endif /* VIMAGE */ /* * Kernel module interface for updating tcpstat. The argument is an index - * into tcpstat treated as an array of u_long. While this encodes the - * general layout of tcpstat into the caller, it doesn't encode its location, - * so that future changes to add, for example, per-CPU stats support won't - * cause binary compatibility problems for kernel modules. + * into tcpstat treated as an array. */ void kmod_tcpstat_inc(int statnum) { - (*((u_long *)&V_tcpstat + statnum))++; + counter_u64_add(VNET(tcpstat)[statnum], 1); } /* * Wrapper for the TCP established input helper hook. */ -static void inline +void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) { struct tcp_hhook_data hhook_data; @@ -281,7 +301,7 @@ hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) /* * CC wrapper hook functions */ -static void inline +void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) { INP_WLOCK_ASSERT(tp->t_inpcb); @@ -295,7 +315,7 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) if (type == CC_ACK) { if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, - V_tcp_abc_l_var * tp->t_maxseg); + V_tcp_abc_l_var * tcp_maxseg(tp)); if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; @@ -313,16 +333,18 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) } } -static void inline +void cc_conn_init(struct tcpcb *tp) { struct hc_metrics_lite metrics; struct inpcb *inp = tp->t_inpcb; + u_int maxseg; int rtt; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_hc_get(&inp->inp_inc, &metrics); + maxseg = tcp_maxseg(tp); if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; @@ -344,10 +366,10 @@ cc_conn_init(struct tcpcb *tp) /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set - * the slow start threshhold, but set the + * the slow start threshold, but set the * threshold to no less than 2*mss. */ - tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh); + tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } @@ -357,27 +379,27 @@ cc_conn_init(struct tcpcb *tp) * RFC5681 Section 3.1 specifies the default conservative values. * RFC3390 specifies slightly more aggressive values. * RFC6928 increases it to ten segments. + * Support for user specified value for initial flight size. * * If a SYN or SYN/ACK was lost and retransmitted, we have to * reduce the initial CWND to one segment as congestion is likely * requiring us to be cautious. */ if (tp->snd_cwnd == 1) - tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ - else if (V_tcp_do_initcwnd10) - tp->snd_cwnd = min(10 * tp->t_maxseg, - max(2 * tp->t_maxseg, 14600)); + tp->snd_cwnd = maxseg; /* SYN(-ACK) lost */ + else if (V_tcp_initcwnd_segments) + tp->snd_cwnd = min(V_tcp_initcwnd_segments * maxseg, + max(2 * maxseg, V_tcp_initcwnd_segments * 1460)); else if (V_tcp_do_rfc3390) - tp->snd_cwnd = min(4 * tp->t_maxseg, - max(2 * tp->t_maxseg, 4380)); + tp->snd_cwnd = min(4 * maxseg, max(2 * maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ - if (tp->t_maxseg > 2190) - tp->snd_cwnd = 2 * tp->t_maxseg; - else if (tp->t_maxseg > 1095) - tp->snd_cwnd = 3 * tp->t_maxseg; + if (maxseg > 2190) + tp->snd_cwnd = 2 * maxseg; + else if (maxseg > 1095) + tp->snd_cwnd = 3 * maxseg; else - tp->snd_cwnd = 4 * tp->t_maxseg; + tp->snd_cwnd = 4 * maxseg; } if (CC_ALGO(tp)->conn_init != NULL) @@ -387,6 +409,8 @@ cc_conn_init(struct tcpcb *tp) void inline cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { + u_int maxseg; + INP_WLOCK_ASSERT(tp->t_inpcb); switch(type) { @@ -406,12 +430,13 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) } break; case CC_RTO: + maxseg = tcp_maxseg(tp); tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / - tp->t_maxseg) * tp->t_maxseg; - tp->snd_cwnd = tp->t_maxseg; + maxseg) * maxseg; + tp->snd_cwnd = maxseg; break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); @@ -436,7 +461,7 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) } } -static void inline +void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) { INP_WLOCK_ASSERT(tp->t_inpcb); @@ -451,27 +476,7 @@ cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) tp->t_bytes_acked = 0; } -static inline void -tcp_fields_to_host(struct tcphdr *th) -{ - - th->th_seq = ntohl(th->th_seq); - th->th_ack = ntohl(th->th_ack); - th->th_win = ntohs(th->th_win); - th->th_urp = ntohs(th->th_urp); -} - #ifdef TCP_SIGNATURE -static inline void -tcp_fields_to_net(struct tcphdr *th) -{ - - th->th_seq = htonl(th->th_seq); - th->th_ack = htonl(th->th_ack); - th->th_win = htons(th->th_win); - th->th_urp = htons(th->th_urp); -} - static inline int tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen, struct tcpopt *to, struct tcphdr *th, u_int tcpbflag) @@ -485,34 +490,56 @@ tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen, } #endif -/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ -#ifdef INET6 -#define ND6_HINT(tp) \ -do { \ - if ((tp) && (tp)->t_inpcb && \ - ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ - nd6_nud_hint(NULL, NULL, 0); \ -} while (0) -#else -#define ND6_HINT(tp) -#endif - /* * Indicate whether this ack should be delayed. We can delay the ack if - * - there is no delayed ack timer in progress and - * - our last ack wasn't a 0-sized window. We never want to delay - * the ack that opens up a 0-sized window and - * - delayed acks are enabled or - * - this is a half-synchronized T/TCP connection. - * - the segment size is not larger than the MSS and LRO wasn't used - * for this segment. + * following conditions are met: + * - There is no delayed ack timer in progress. + * - Our last ack wasn't a 0-sized window. We never want to delay + * the ack that opens up a 0-sized window. + * - LRO wasn't used for this segment. We make sure by checking that the + * segment size is not larger than the MSS. */ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ - (tlen <= tp->t_maxopd) && \ + (tlen <= tp->t_maxseg) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) +static void inline +cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (CC_ALGO(tp)->ecnpkt_handler != NULL) { + switch (iptos & IPTOS_ECN_MASK) { + case IPTOS_ECN_CE: + tp->ccv->flags |= CCF_IPHDR_CE; + break; + case IPTOS_ECN_ECT0: + tp->ccv->flags &= ~CCF_IPHDR_CE; + break; + case IPTOS_ECN_ECT1: + tp->ccv->flags &= ~CCF_IPHDR_CE; + break; + } + + if (th->th_flags & TH_CWR) + tp->ccv->flags |= CCF_TCPHDR_CWR; + else + tp->ccv->flags &= ~CCF_TCPHDR_CWR; + + if (tp->t_flags & TF_DELACK) + tp->ccv->flags |= CCF_DELACK; + else + tp->ccv->flags &= ~CCF_DELACK; + + CC_ALGO(tp)->ecnpkt_handler(tp->ccv); + + if (tp->ccv->flags & CCF_ACKNOW) + tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); + } +} + /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended @@ -528,6 +555,7 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct in6_ifaddr *ia6; + struct ip6_hdr *ip6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); @@ -535,7 +563,8 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ - ia6 = ip6_getdstifaddr(m); + ip6 = mtod(m, struct ip6_hdr *); + ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; @@ -543,28 +572,26 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); - return IPPROTO_DONE; + return (IPPROTO_DONE); } if (ia6) ifa_free(&ia6->ia_ifa); - tcp_input(m, *offp); - return IPPROTO_DONE; + return (tcp_input(mp, offp, proto)); } #endif /* INET6 */ -void -tcp_input(struct mbuf *m, int off0) +int +tcp_input(struct mbuf **mp, int *offp, int proto) { + struct mbuf *m = *mp; struct tcphdr *th = NULL; struct ip *ip = NULL; -#ifdef INET - struct ipovly *ipov; -#endif struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; + int off0; int optlen = 0; #ifdef INET int len; @@ -587,9 +614,6 @@ tcp_input(struct mbuf *m, int off0) struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ int ti_locked; -#define TI_UNLOCKED 1 -#define TI_WLOCKED 2 - #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, @@ -604,6 +628,9 @@ tcp_input(struct mbuf *m, int off0) isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif + off0 = *offp; + m = *mp; + *mp = NULL; to.to_flags = 0; TCPSTAT_INC(tcps_rcvtotal); @@ -615,7 +642,7 @@ tcp_input(struct mbuf *m, int off0) m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); if (m == NULL) { TCPSTAT_INC(tcps_rcvshort); - return; + return (IPPROTO_DONE); } } @@ -660,45 +687,43 @@ tcp_input(struct mbuf *m, int off0) * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { - ip_stripoptions(m, (struct mbuf *)0); + ip_stripoptions(m); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { TCPSTAT_INC(tcps_rcvshort); - return; + return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); - ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); - tlen = ip->ip_len; + tlen = ntohs(ip->ip_len) - off0; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, - ip->ip_dst.s_addr, - htonl(m->m_pkthdr.csum_data + - ip->ip_len + - IPPROTO_TCP)); + ip->ip_dst.s_addr, + htonl(m->m_pkthdr.csum_data + tlen + + IPPROTO_TCP)); th->th_sum ^= 0xffff; -#ifdef TCPDEBUG - ipov->ih_len = (u_short)tlen; - ipov->ih_len = htons(ipov->ih_len); -#endif } else { + struct ipovly *ipov = (struct ipovly *)ip; + /* * Checksum extended TCP header and data. */ - len = sizeof (struct ip) + tlen; + len = off0 + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); - ipov->ih_len = (u_short)tlen; - ipov->ih_len = htons(ipov->ih_len); + ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); + /* Reset length for SDT probes. */ + ip->ip_len = htons(tlen + off0); } + if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; @@ -732,7 +757,7 @@ tcp_input(struct mbuf *m, int off0) if (off > sizeof (struct tcphdr)) { #ifdef INET6 if (isipv6) { - IP6_EXTHDR_CHECK(m, off0, off, ); + IP6_EXTHDR_CHECK(m, off0, off, IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } @@ -746,10 +771,9 @@ tcp_input(struct mbuf *m, int off0) if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { TCPSTAT_INC(tcps_rcvshort); - return; + return (IPPROTO_DONE); } ip = mtod(m, struct ip *); - ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } @@ -771,26 +795,17 @@ tcp_input(struct mbuf *m, int off0) /* * Locate pcb for segment; if we're likely to add or remove a - * connection then first acquire pcbinfo lock. There are two cases + * connection then first acquire pcbinfo lock. There are three cases * where we might discover later we need a write lock despite the - * flags: ACKs moving a connection out of the syncache, and ACKs for - * a connection in TIMEWAIT. + * flags: ACKs moving a connection out of the syncache, ACKs for a + * connection in TIMEWAIT and SYNs not targeting a listening socket. */ - if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) { - INP_INFO_WLOCK(&V_tcbinfo); - ti_locked = TI_WLOCKED; + if ((thflags & (TH_FIN | TH_RST)) != 0) { + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; } else ti_locked = TI_UNLOCKED; -findpcb: -#ifdef INVARIANTS - if (ti_locked == TI_WLOCKED) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - } else { - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - } -#endif - /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ @@ -807,6 +822,14 @@ findpcb: ) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); +findpcb: +#ifdef INVARIANTS + if (ti_locked == TI_RLOCKED) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + } else { + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } +#endif #ifdef INET6 if (isipv6 && fwd_tag != NULL) { struct sockaddr_in6 *next_hop6; @@ -831,10 +854,6 @@ findpcb: th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } - /* Remove the tag from the packet. We don't need it anymore. */ - m_tag_delete(m, fwd_tag); - m->m_flags &= ~M_IP6_NEXTHOP; - fwd_tag = NULL; } else if (isipv6) { inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, @@ -869,10 +888,6 @@ findpcb: th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } - /* Remove the tag from the packet. We don't need it anymore. */ - m_tag_delete(m, fwd_tag); - m->m_flags &= ~M_IP_NEXTHOP; - fwd_tag = NULL; } else inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, @@ -908,23 +923,20 @@ findpcb: goto dropwithreset; } INP_WLOCK_ASSERT(inp); - if (!(inp->inp_flags & INP_HW_FLOWID) - && (m->m_flags & M_FLOWID) - && ((inp->inp_socket == NULL) - || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) { - inp->inp_flags |= INP_HW_FLOWID; - inp->inp_flags &= ~INP_SW_FLOWID; + if ((inp->inp_flowtype == M_HASHTYPE_NONE) && + (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) && + ((inp->inp_socket == NULL) || + (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) { inp->inp_flowid = m->m_pkthdr.flowid; + inp->inp_flowtype = M_HASHTYPE_GET(m); } #ifdef IPSEC #ifdef INET6 if (isipv6 && ipsec6_in_reject(m, inp)) { - IPSEC6STAT_INC(in_polvio); goto dropunlock; } else #endif /* INET6 */ if (ipsec4_in_reject(m, inp) != 0) { - IPSECSTAT_INC(in_polvio); goto dropunlock; } #endif /* IPSEC */ @@ -934,9 +946,10 @@ findpcb: */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 - if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) - goto dropunlock; - else + if (isipv6) { + if (inp->inp_ip_minttl > ip6->ip6_hlim) + goto dropunlock; + } else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; @@ -945,7 +958,7 @@ findpcb: /* * A previous connection in TIMEWAIT state is supposed to catch stray * or duplicate segments arriving late. If this segment was a - * legitimate new connection attempt the old INPCB gets removed and + * legitimate new connection attempt, the old INPCB gets removed and * we can try again to find a listening socket. * * At this point, due to earlier optimism, we may hold only an inpcb @@ -961,20 +974,20 @@ findpcb: relocked: if (inp->inp_flags & INP_TIMEWAIT) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { + if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); - ti_locked = TI_WLOCKED; + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } } else - ti_locked = TI_WLOCKED; + ti_locked = TI_RLOCKED; } - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); @@ -983,8 +996,8 @@ relocked: */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; - INP_INFO_WUNLOCK(&V_tcbinfo); - return; + INP_INFO_RUNLOCK(&V_tcbinfo); + return (IPPROTO_DONE); } /* * The TCPCB may no longer exist if the connection is winding @@ -1013,16 +1026,18 @@ relocked: * now be in TIMEWAIT. */ #ifdef INVARIANTS - if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + if ((thflags & (TH_FIN | TH_RST)) != 0) + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); #endif - if (tp->t_state != TCPS_ESTABLISHED) { + if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) || + (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) && + !(tp->t_flags & TF_FASTOPEN)))) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { + if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); - ti_locked = TI_WLOCKED; + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { inp = NULL; @@ -1030,9 +1045,9 @@ relocked: } goto relocked; } else - ti_locked = TI_WLOCKED; + ti_locked = TI_RLOCKED; } - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } #ifdef MAC @@ -1057,17 +1072,13 @@ relocked: /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection - * attempt or the completion of a previous one. Because listen - * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be - * held in this case. + * attempt or the completion of a previous one. */ if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " "tp not listening", __func__)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - bzero(&inc, sizeof(inc)); #ifdef INET6 if (isipv6) { @@ -1090,6 +1101,8 @@ relocked: * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { + + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* * Parse the TCP options here because * syncookies need access to the reflected @@ -1110,6 +1123,9 @@ relocked: rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } +#ifdef TCP_RFC7413 +new_tfo_socket: +#endif if (so == NULL) { /* * We completed the 3-way handshake @@ -1141,7 +1157,11 @@ relocked: */ INP_WUNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); - INP_WLOCK(inp); /* new connection */ + /* + * New connection inpcb is already locked by + * syncache_expand(). + */ + INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); @@ -1170,10 +1190,10 @@ relocked: * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ - tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, + tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - return; + return (IPPROTO_DONE); } /* * Segment flag validation for new connection attempts: @@ -1277,7 +1297,7 @@ relocked: if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; - ia6 = ip6_getdstifaddr(m); + ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 != NULL && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { ifa_free(&ia6->ia_ifa); @@ -1366,14 +1386,24 @@ relocked: tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif + TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); tcp_dooptions(&to, optp, optlen, TO_SYN); - syncache_add(&inc, &to, th, inp, &so, m); +#ifdef TCP_RFC7413 + if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL)) + goto new_tfo_socket; +#else + syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL); +#endif /* * Entry added to syncache and mbuf consumed. - * Everything already unlocked by syncache_add(). + * Only the listen socket is unlocked by syncache_add(). */ + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); + ti_locked = TI_UNLOCKED; + } INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - return; + return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { /* * When a listen socket is torn down the SO_ACCEPTCONN @@ -1404,18 +1434,22 @@ relocked: } #endif + TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); + /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. */ - tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); + tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - return; + return (IPPROTO_DONE); dropwithreset: - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); + + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -1435,8 +1469,11 @@ dropwithreset: goto drop; dropunlock: - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + if (m != NULL) + TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); + + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -1456,18 +1493,23 @@ drop: free(s, M_TCPLOG); if (m != NULL) m_freem(m); + return (IPPROTO_DONE); } -static void +void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, int ti_locked) { - int thflags, acked, ourfinisacked, needoutput = 0; + int thflags, acked, ourfinisacked, needoutput = 0, sack_changed; int rstreason, todrop, win; u_long tiwin; + char *s; + struct in_conninfo *inc; + struct mbuf *mfree; struct tcpopt to; - + int tfo_syn; + #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, @@ -1478,30 +1520,25 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, short ostate = 0; #endif thflags = th->th_flags; + inc = &tp->t_inpcb->inp_inc; tp->sackhint.last_sack_ack = 0; + sack_changed = 0; /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we - * allow either a read lock or a write lock, as we may have acquired - * a write lock due to a race. - * - * Require a global write lock for SYN/FIN/RST segments or - * non-established connections; otherwise accept either a read or - * write lock, as we may have conservatively acquired a write lock in - * certain cases in tcp_input() (is this still true?). Currently we - * will never enter with no lock, so we try to drop it quickly in the - * common pure ack/pure data cases. + * allow the tcbinfo to be in either alocked or unlocked, as the + * caller may have unnecessarily acquired a write lock due to a race. */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { - KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for " + KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " "SYN/FIN/RST/!EST", __func__, ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS - if (ti_locked == TI_WLOCKED) - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " "ti_locked: %d", __func__, ti_locked)); @@ -1515,6 +1552,11 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); +#ifdef TCPPCAP + /* Save segment, if requested. */ + tcp_pcap_add(th, m, &(tp->t_inpkts)); +#endif + /* * Segment received on connection. * Reset idle time and keep-alive timer. @@ -1526,7 +1568,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); /* - * Unscale the window into a 32-bit value. + * Scale up the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; @@ -1549,6 +1591,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, TCPSTAT_INC(tcps_ecn_ect1); break; } + + /* Process a packet differently from RFC3168. */ + cc_ecnpkt_handler(tp, th, iptos); + /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); @@ -1573,6 +1619,24 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; } + /* + * If timestamps were negotiated during SYN/ACK they should + * appear on every segment during this session and vice versa. + */ + if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: Timestamp missing, " + "no action\n", s, __func__); + free(s, M_TCPLOG); + } + } + if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: Timestamp not expected, " + "no action\n", s, __func__); + free(s, M_TCPLOG); + } + } /* * Process options only when we get SYN/ACK back. The SYN case @@ -1652,8 +1716,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, /* * This is a pure ack for outstanding data. */ - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); @@ -1720,7 +1784,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); - ND6_HINT(tp); /* Some progress has been made. */ /* * If all outstanding data are acked, stop @@ -1737,14 +1800,16 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif + TCP_PROBE3(debug__input, tp, th, + mtod(m, const char *)); if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); sowwakeup(so); - if (so->so_snd.sb_cc) - (void) tcp_output(tp); + if (sbavail(&so->so_snd)) + (void) tp->t_fb->tfb_tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && @@ -1756,8 +1821,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * nothing on the reassembly queue and we have enough * buffer space to take it. */ - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; /* Clean receiver SACK report if present */ @@ -1777,12 +1842,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->rcv_up = tp->rcv_nxt; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); - ND6_HINT(tp); /* Some progress has been made */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif + TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); + /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network @@ -1802,11 +1868,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * reassembly queue. * * The criteria to step up the receive buffer one notch are: - * 1. the number of bytes received during the time it takes + * 1. Application has not set receive buffer size with + * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. + * 2. the number of bytes received during the time it takes * one timestamp to be reflected back to us (the RTT); - * 2. received bytes per RTT is within seven eighth of the + * 3. received bytes per RTT is within seven eighth of the * current socket buffer size; - * 3. receive buffer size has not hit maximal automatic size; + * 4. receive buffer size has not hit maximal automatic size; * * This algorithm does one step per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. @@ -1817,6 +1885,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * the buffer to better manage the socket buffer resources. */ if (V_tcp_do_autorcvbuf && + (to.to_flags & TOF_TS) && to.to_tsecr && (so->so_rcv.sb_flags & SB_AUTOSIZE)) { if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) && @@ -1851,7 +1920,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ - sbappendstream_locked(&so->so_rcv, m); + sbappendstream_locked(&so->so_rcv, m, 0); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); @@ -1859,7 +1928,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; - tcp_output(tp); + tp->t_fb->tfb_tcp_output(tp); } goto check_delack; } @@ -1893,6 +1962,28 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } +#ifdef TCP_RFC7413 + if (tp->t_flags & TF_FASTOPEN) { + /* + * When a TFO connection is in SYN_RECEIVED, the + * only valid packets are the initial SYN, a + * retransmit/copy of the initial SYN (possibly with + * a subset of the original data), a valid ACK, a + * FIN, or a RST. + */ + if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } else if (thflags & TH_SYN) { + /* non-initial SYN is ignored */ + if ((tcp_timer_active(tp, TT_DELACK) || + tcp_timer_active(tp, TT_REXMT))) + goto drop; + } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) { + goto drop; + } + } +#endif break; /* @@ -1916,8 +2007,11 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } - if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) + if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { + TCP_PROBE5(connect__refused, NULL, tp, + mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); + } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) @@ -1962,11 +2056,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { - tp->t_state = TCPS_FIN_WAIT_1; + tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { - tp->t_state = TCPS_ESTABLISHED; + tcp_state_change(tp, TCPS_ESTABLISHED); + TCP_PROBE5(connect__established, NULL, tp, + mtod(m, const char *), tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); @@ -1974,22 +2070,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, } else { /* * Received initial SYN in SYN-SENT[*] state => - * simultaneous open. If segment contains CC option - * and there is a cached CC, apply TAO test. + * simultaneous open. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* - * If there was no CC option, clear cached CC value. */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); - tp->t_state = TCPS_SYN_RECEIVED; + tcp_state_change(tp, TCPS_SYN_RECEIVED); } - KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: " + KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " "ti_locked %d", __func__, ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2045,98 +2139,84 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. - * - * - * If the RST bit is set, check the sequence number to see - * if this is a valid reset segment. - * RFC 793 page 37: - * In all states except SYN-SENT, all reset (RST) segments - * are validated by checking their SEQ-fields. A reset is - * valid if its sequence number is in the window. - * Note: this does not take into account delayed ACKs, so - * we should test against last_ack_sent instead of rcv_nxt. - * The sequence number in the reset segment is normally an - * echo of our outgoing acknowlegement numbers, but some hosts - * send a reset with the sequence number at the rightmost edge - * of our receive window, and we have to handle this case. - * Note 2: Paul Watson's paper "Slipping in the Window" has shown - * that brute force RST attacks are possible. To combat this, - * we use a much stricter check while in the ESTABLISHED state, - * only accepting RSTs where the sequence number is equal to - * last_ack_sent. In all other states (the states in which a - * RST is more likely), the more permissive check is used. - * If we have multiple segments in flight, the initial reset - * segment sequence numbers will be to the left of last_ack_sent, - * but they will eventually catch up. - * In any case, it never made sense to trim reset segments to - * fit the receive window since RFC 1122 says: - * 4.2.2.12 RST Segment: RFC-793 Section 3.4 - * - * A TCP SHOULD allow a received RST segment to include data. - * - * DISCUSSION - * It has been suggested that a RST segment could contain - * ASCII text that encoded and explained the cause of the - * RST. No standard has yet been established for such - * data. - * - * If the reset segment passes the sequence number test examine - * the state: - * SYN_RECEIVED STATE: - * If passive open, return to LISTEN state. - * If active open, inform user that connection was refused. - * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: - * Inform user that connection was reset, and close tcb. - * CLOSING, LAST_ACK STATES: - * Close the tcb. - * TIME_WAIT STATE: - * Drop the segment - see Stevens, vol. 2, p. 964 and - * RFC 1337. */ if (thflags & TH_RST) { - if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && - SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { - switch (tp->t_state) { - - case TCPS_SYN_RECEIVED: - so->so_error = ECONNREFUSED; - goto close; - - case TCPS_ESTABLISHED: - if (V_tcp_insecure_rst == 0 && - !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && - SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && - !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && - SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { - TCPSTAT_INC(tcps_badrst); - goto drop; - } - /* FALLTHROUGH */ - case TCPS_FIN_WAIT_1: - case TCPS_FIN_WAIT_2: - case TCPS_CLOSE_WAIT: - so->so_error = ECONNRESET; - close: - KASSERT(ti_locked == TI_WLOCKED, - ("tcp_do_segment: TH_RST 1 ti_locked %d", - ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - - tp->t_state = TCPS_CLOSED; + /* + * RFC5961 Section 3.2 + * + * - RST drops connection only if SEG.SEQ == RCV.NXT. + * - If RST is in window, we send challenge ACK. + * + * Note: to take into account delayed ACKs, we should + * test against last_ack_sent instead of rcv_nxt. + * Note 2: we handle special case of closed window, not + * covered by the RFC. + */ + if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || + (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { + + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED, + ("%s: TH_RST ti_locked %d, th %p tp %p", + __func__, ti_locked, th, tp)); + KASSERT(tp->t_state != TCPS_SYN_SENT, + ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", + __func__, th, tp)); + + if (V_tcp_insecure_rst || + tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); - tp = tcp_close(tp); - break; + /* Drop the connection. */ + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + tcp_state_change(tp, TCPS_CLOSED); + /* FALLTHROUGH */ + default: + tp = tcp_close(tp); + } + } else { + TCPSTAT_INC(tcps_badrst); + /* Send challenge ACK. */ + tcp_respond(tp, mtod(m, void *), th, m, + tp->rcv_nxt, tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + m = NULL; + } + } + goto drop; + } - case TCPS_CLOSING: - case TCPS_LAST_ACK: - KASSERT(ti_locked == TI_WLOCKED, - ("tcp_do_segment: TH_RST 2 ti_locked %d", - ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + /* + * RFC5961 Section 4.2 + * Send challenge ACK for any SYN in synchronized state. + */ + if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && + tp->t_state != TCPS_SYN_RECEIVED) { + KASSERT(ti_locked == TI_RLOCKED, + ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - tp = tcp_close(tp); - break; - } + TCPSTAT_INC(tcps_badsyn); + if (V_tcp_insecure_syn && + SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + tp = tcp_drop(tp, ECONNRESET); + rstreason = BANDLIM_UNLIMITED; + } else { + /* Send challenge ACK. */ + tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, + tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + m = NULL; } goto drop; } @@ -2236,15 +2316,14 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { - char *s; - - KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && " + KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { - log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " - "was closed, sending RST and removing tcpcb\n", + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " + "after socket was closed, " + "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } @@ -2309,29 +2388,22 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, } /* - * If a SYN is in the window, then this is an - * error and we send an RST and drop the connection. - */ - if (thflags & TH_SYN) { - KASSERT(ti_locked == TI_WLOCKED, - ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - - tp = tcp_drop(tp, ECONNRESET); - rstreason = BANDLIM_UNLIMITED; - goto drop; - } - - /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || - (tp->t_flags & TF_NEEDSYN)) + (tp->t_flags & TF_NEEDSYN)) { +#ifdef TCP_RFC7413 + if (tp->t_state == TCPS_SYN_RECEIVED && + tp->t_flags & TF_FASTOPEN) { + tp->snd_wnd = tiwin; + cc_conn_init(tp); + } +#endif goto step6; - else if (tp->t_flags & TF_ACKNOW) + } else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; @@ -2364,11 +2436,33 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { - tp->t_state = TCPS_FIN_WAIT_1; + tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { - tp->t_state = TCPS_ESTABLISHED; - cc_conn_init(tp); + tcp_state_change(tp, TCPS_ESTABLISHED); + TCP_PROBE5(accept__established, NULL, tp, + mtod(m, const char *), tp, th); +#ifdef TCP_RFC7413 + if (tp->t_tfo_pending) { + tcp_fastopen_decrement_counter(tp->t_tfo_pending); + tp->t_tfo_pending = NULL; + + /* + * Account for the ACK of our SYN prior to + * regular ACK processing below. + */ + tp->snd_una++; + } + /* + * TFO connections call cc_conn_init() during SYN + * processing. Calling it again here for such + * connections is not harmless as it would undo the + * snd_cwnd reduction that occurs when a TFO SYN|ACK + * is retransmitted. + */ + if (!(tp->t_flags & TF_FASTOPEN)) +#endif + cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* @@ -2402,21 +2496,45 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) - tcp_sack_doack(tp, &to, th->th_ack); + sack_changed = tcp_sack_doack(tp, &to, th->th_ack); + else + /* + * Reset the value so that previous (valid) value + * from the last ack with SACK doesn't get used. + */ + tp->sackhint.sacked_bytes = 0; /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { - if (tlen == 0 && tiwin == tp->snd_wnd) { + u_int maxseg; + + maxseg = tcp_maxseg(tp); + if (tlen == 0 && + (tiwin == tp->snd_wnd || + (tp->t_flags & TF_SACK_PERMIT))) { + /* + * If this is the first time we've seen a + * FIN from the remote, this is not a + * duplicate and it needs to be processed + * normally. This happens during a + * simultaneous close. + */ + if ((thflags & TH_FIN) && + (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { + tp->t_dupacks = 0; + break; + } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't - * change), the ack is the biggest we've + * change and FIN isn't set), + * the ack is the biggest we've * seen and we've seen exactly our rexmt - * threshhold of them, assume a packet + * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one @@ -2437,8 +2555,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * When using TCP ECN, notify the peer that * we reduced the cwnd. */ - if (!tcp_timer_active(tp, TT_REXMT) || - th->th_ack != tp->snd_una) + /* + * Following 2 kinds of acks should not affect + * dupack counting: + * 1) Old acks + * 2) Acks with SACK but without any new SACK + * information in them. These could result from + * any anomaly in the network like a switch + * duplicating packets or a possible DoS attack. + */ + if (th->th_ack != tp->snd_una || + ((tp->t_flags & TF_SACK_PERMIT) && + !sack_changed)) + break; + else if (!tcp_timer_active(tp, TT_REXMT)) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { @@ -2453,26 +2583,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * we have less than 1/2 the original window's * worth of data in flight. */ - awnd = (tp->snd_nxt - tp->snd_fack) + - tp->sackhint.sack_bytes_rexmit; + if (V_tcp_do_rfc6675_pipe) + awnd = tcp_compute_pipe(tp); + else + awnd = (tp->snd_nxt - tp->snd_fack) + + tp->sackhint.sack_bytes_rexmit; + if (awnd < tp->snd_ssthresh) { - tp->snd_cwnd += tp->t_maxseg; + tp->snd_cwnd += maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else - tp->snd_cwnd += tp->t_maxseg; - if ((thflags & TH_FIN) && - (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { - /* - * If its a fin we need to process - * it to avoid a race where both - * sides enter FIN-WAIT and send FIN|ACK - * at the same time. - */ - break; - } - (void) tcp_output(tp); + tp->snd_cwnd += maxseg; + (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; @@ -2505,33 +2629,33 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, TCPSTAT_INC( tcps_sack_recovery_episode); tp->sack_newdata = tp->snd_nxt; - tp->snd_cwnd = tp->t_maxseg; - (void) tcp_output(tp); + tp->snd_cwnd = maxseg; + (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; - tp->snd_cwnd = tp->t_maxseg; - if ((thflags & TH_FIN) && - (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { - /* - * If its a fin we need to process - * it to avoid a race where both - * sides enter FIN-WAIT and send FIN|ACK - * at the same time. - */ - break; - } - (void) tcp_output(tp); + tp->snd_cwnd = maxseg; + (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + - tp->t_maxseg * + maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { + /* + * Process first and second duplicate + * ACKs. Each indicates a segment + * leaving the network, creating room + * for more. Make sure we can send a + * packet on reception of each duplicate + * ACK by increasing snd_cwnd by one + * segment. Restore the original + * snd_cwnd after packet transmission. + */ cc_ack_received(tp, th, CC_DUPACK); u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; @@ -2547,33 +2671,23 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * - tp->t_maxseg; - if ((thflags & TH_FIN) && - (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { - /* - * If its a fin we need to process - * it to avoid a race where both - * sides enter FIN-WAIT and send FIN|ACK - * at the same time. - */ - break; - } + maxseg; /* * Only call tcp_output when there * is new data available to be sent. * Otherwise we would send pure ACKs. */ SOCKBUF_LOCK(&so->so_snd); - avail = so->so_snd.sb_cc - + avail = sbavail(&so->so_snd) - (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); sent = tp->snd_max - oldsndmax; - if (sent > tp->t_maxseg) { + if (sent > maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || - (sent == tp->t_maxseg + 1 && + (sent == maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); @@ -2583,9 +2697,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->snd_cwnd = oldcwnd; goto drop; } - } else - tp->t_dupacks = 0; + } break; + } else { + /* + * This ack is advancing the left edge, reset the + * counter. + */ + tp->t_dupacks = 0; + /* + * If this ack also has new SACK info, increment the + * counter as per rfc6675. + */ + if ((tp->t_flags & TF_SACK_PERMIT) && sack_changed) + tp->t_dupacks++; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), @@ -2604,7 +2729,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, } else cc_post_recovery(tp, th); } - tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. @@ -2631,6 +2755,9 @@ process_ACK: INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); + KASSERT(acked >= 0, ("%s: acked unexepectedly negative " + "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__, + tp->snd_una, th->th_ack, tp, m)); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); @@ -2699,17 +2826,25 @@ process_ACK: cc_ack_received(tp, th, CC_ACK); SOCKBUF_LOCK(&so->so_snd); - if (acked > so->so_snd.sb_cc) { - tp->snd_wnd -= so->so_snd.sb_cc; - sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); + if (acked > sbavail(&so->so_snd)) { + if (tp->snd_wnd >= sbavail(&so->so_snd)) + tp->snd_wnd -= sbavail(&so->so_snd); + else + tp->snd_wnd = 0; + mfree = sbcut_locked(&so->so_snd, + (int)sbavail(&so->so_snd)); ourfinisacked = 1; } else { - sbdrop_locked(&so->so_snd, acked); - tp->snd_wnd -= acked; + mfree = sbcut_locked(&so->so_snd, acked); + if (tp->snd_wnd >= (u_long) acked) + tp->snd_wnd -= acked; + else + tp->snd_wnd = 0; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); + m_freem(mfree); /* Detect una wraparound. */ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && @@ -2755,7 +2890,7 @@ process_ACK: tcp_finwait2_timeout : TP_MAXIDLE(tp))); } - tp->t_state = TCPS_FIN_WAIT_2; + tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; @@ -2767,9 +2902,9 @@ process_ACK: */ case TCPS_CLOSING: if (ourfinisacked) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return; } @@ -2783,7 +2918,7 @@ process_ACK: */ case TCPS_LAST_ACK: if (ourfinisacked) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); goto drop; } @@ -2826,7 +2961,7 @@ step6: * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); - if (th->th_urp + so->so_rcv.sb_cc > sb_max) { + if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ @@ -2848,7 +2983,7 @@ step6: */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; - so->so_oobmark = so->so_rcv.sb_cc + + so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; @@ -2887,7 +3022,9 @@ dodata: /* XXX */ * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ - if ((tlen || (thflags & TH_FIN)) && + tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && + (tp->t_flags & TF_FASTOPEN)); + if ((tlen || (thflags & TH_FIN) || tfo_syn) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ @@ -2905,8 +3042,9 @@ dodata: /* XXX */ */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && - TCPS_HAVEESTABLISHED(tp->t_state)) { - if (DELAY_ACK(tp, tlen)) + (TCPS_HAVEESTABLISHED(tp->t_state) || + tfo_syn)) { + if (DELAY_ACK(tp, tlen) || tfo_syn) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; @@ -2914,12 +3052,11 @@ dodata: /* XXX */ thflags = th->th_flags & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); - ND6_HINT(tp); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else - sbappendstream_locked(&so->so_rcv, m); + sbappendstream_locked(&so->so_rcv, m, 0); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { @@ -2981,7 +3118,7 @@ dodata: /* XXX */ tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: - tp->t_state = TCPS_CLOSE_WAIT; + tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* @@ -2989,7 +3126,7 @@ dodata: /* XXX */ * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: - tp->t_state = TCPS_CLOSING; + tcp_state_change(tp, TCPS_CLOSING); break; /* @@ -2998,18 +3135,18 @@ dodata: /* XXX */ * standard timers. */ case TCPS_FIN_WAIT_2: - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata " + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " "TCP_FIN_WAIT_2 ti_locked: %d", __func__, ti_locked)); tcp_twstart(tp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return; } } - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; #ifdef TCPDEBUG @@ -3017,12 +3154,13 @@ dodata: /* XXX */ tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif + TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); check_delack: KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", @@ -3064,19 +3202,20 @@ dropafterack: tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; if (tp != NULL) { @@ -3087,8 +3226,8 @@ dropwithreset: return; drop: - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -3104,6 +3243,7 @@ drop: tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif + TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); m_freem(m); @@ -3114,7 +3254,7 @@ drop: * The mbuf must still include the original packet header. * tp may be NULL. */ -static void +void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { @@ -3177,7 +3317,7 @@ drop: /* * Parse TCP options and place in tcpopt. */ -static void +void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { int opt, optlen; @@ -3259,6 +3399,21 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) to->to_sacks = cp + 2; TCPSTAT_INC(tcps_sack_rcv_blocks); break; +#ifdef TCP_RFC7413 + case TCPOPT_FAST_OPEN: + if ((optlen != TCPOLEN_FAST_OPEN_EMPTY) && + (optlen < TCPOLEN_FAST_OPEN_MIN) && + (optlen > TCPOLEN_FAST_OPEN_MAX)) + continue; + if (!(flags & TO_SYN)) + continue; + if (!V_tcp_fastopen_enabled) + continue; + to->to_flags |= TOF_FASTOPEN; + to->to_tfo_len = optlen - 2; + to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL; + break; +#endif default: continue; } @@ -3271,7 +3426,7 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) * It is still reflected in the segment length for * sequencing purposes. */ -static void +void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { @@ -3304,7 +3459,7 @@ tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, * Collect new round-trip time estimate * and update averages and current timeout. */ -static void +void tcp_xmit_timer(struct tcpcb *tp, int rtt) { int delta; @@ -3394,11 +3549,9 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt) * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * - * Also take into account the space needed for options that we - * send regularly. Make maxseg shorter by that amount to assure - * that we can send maxseg amount of data even when the options - * are present. Store the upper limit of the length of options plus - * data in maxopd. + * NOTE that resulting t_maxseg doesn't include space for TCP options or + * IP options, e.g. IPSEC data, since length of this data may vary, and + * thus it is calculated for every segment separately in tcp_output(). * * NOTE that this routine is only called when we process an incoming * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS @@ -3412,7 +3565,6 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, u_long maxmtu = 0; struct inpcb *inp = tp->t_inpcb; struct hc_metrics_lite metrics; - int origoffer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? @@ -3428,13 +3580,12 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, KASSERT(offer == -1, ("%s: conflict", __func__)); offer = mtuoffer - min_protoh; } - origoffer = offer; /* Initialize. */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, cap); - tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; + tp->t_maxseg = V_tcp_v6mssdflt; } #endif #if defined(INET) && defined(INET6) @@ -3443,7 +3594,7 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, #ifdef INET { maxmtu = tcp_maxmtu(&inp->inp_inc, cap); - tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; + tp->t_maxseg = V_tcp_mssdflt; } #endif @@ -3467,9 +3618,9 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt as - * already assigned to t_maxopd above. + * already assigned to t_maxseg above. */ - offer = tp->t_maxopd; + offer = tp->t_maxseg; break; case -1: @@ -3494,8 +3645,8 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); /* - * If there's a discovered mtu int tcp hostcache, use it - * else, use the link mtu. + * If there's a discovered mtu in tcp hostcache, use it. + * Else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; @@ -3541,31 +3692,15 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, mss = min(mss, offer); /* - * Sanity check: make sure that maxopd will be large + * Sanity check: make sure that maxseg will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. + * + * XXXGL: shouldn't we reserve space for IP/IPv6 options? */ mss = max(mss, 64); - /* - * maxopd stores the maximum length of data AND options - * in a segment; maxseg is the amount of data in a normal - * segment. We need to store this value (maxopd) apart - * from maxseg, because now every segment carries options - * and thus we normally have somewhat less data in segments. - */ - tp->t_maxopd = mss; - - /* - * origoffer==-1 indicates that no segments were received yet. - * In this case we just guess. - */ - if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && - (origoffer == -1 || - (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) - mss -= TCPOLEN_TSTAMP_APPA; - tp->t_maxseg = mss; } @@ -3684,11 +3819,12 @@ tcp_mssopt(struct in_conninfo *inc) * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ -static void +void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; - u_long ocwnd = tp->snd_cwnd; + u_long ocwnd = tp->snd_cwnd; + u_int maxseg = tcp_maxseg(tp); INP_WLOCK_ASSERT(tp->t_inpcb); @@ -3699,9 +3835,9 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ - tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); + tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; @@ -3713,5 +3849,13 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); else tp->snd_cwnd = 0; - tp->snd_cwnd += tp->t_maxseg; + tp->snd_cwnd += maxseg; +} + +int +tcp_compute_pipe(struct tcpcb *tp) +{ + return (tp->snd_max - tp->snd_una + + tp->sackhint.sack_bytes_rexmit - + tp->sackhint.sacked_bytes); } |