diff options
Diffstat (limited to 'freebsd/sys/netinet/tcp_output.c')
-rw-r--r-- | freebsd/sys/netinet/tcp_output.c | 445 |
1 files changed, 307 insertions, 138 deletions
diff --git a/freebsd/sys/netinet/tcp_output.c b/freebsd/sys/netinet/tcp_output.c index 550af64f..af11d805 100644 --- a/freebsd/sys/netinet/tcp_output.c +++ b/freebsd/sys/netinet/tcp_output.c @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$"); #include <sys/mbuf.h> #include <sys/mutex.h> #include <sys/protosw.h> +#include <sys/sdt.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sysctl.h> @@ -56,8 +57,8 @@ __FBSDID("$FreeBSD$"); #include <net/route.h> #include <net/vnet.h> -#include <netinet/cc.h> #include <netinet/in.h> +#include <netinet/in_kdtrace.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> @@ -68,12 +69,20 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #endif +#ifdef TCP_RFC7413 +#include <netinet/tcp_fastopen.h> +#endif +#include <netinet/tcp.h> #define TCPOUTFLAGS #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcpip.h> +#include <netinet/cc/cc.h> +#ifdef TCPPCAP +#include <netinet/tcp_pcap.h> +#endif #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif @@ -90,46 +99,56 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> VNET_DEFINE(int, path_mtu_discovery) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(path_mtu_discovery), 1, "Enable Path MTU Discovery"); VNET_DEFINE(int, tcp_do_tso) = 1; #define V_tcp_do_tso VNET(tcp_do_tso) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); VNET_DEFINE(int, tcp_sendspace) = 1024*32; #define V_tcp_sendspace VNET(tcp_sendspace) -SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); VNET_DEFINE(int, tcp_do_autosndbuf) = 1; #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autosndbuf), 0, "Enable automatic send buffer sizing"); VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024; #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); +/* + * Make sure that either retransmit or persist timer is set for SYN, FIN and + * non-ACK. + */ +#define TCP_XMIT_TIMER_ASSERT(tp, len, th_flags) \ + KASSERT(((len) == 0 && ((th_flags) & (TH_SYN | TH_FIN)) == 0) ||\ + tcp_timer_active((tp), TT_REXMT) || \ + tcp_timer_active((tp), TT_PERSIST), \ + ("neither rexmt nor persist timer is set")) + static void inline hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, long len, int tso); static void inline cc_after_idle(struct tcpcb *tp); /* - * Wrapper for the TCP established ouput helper hook. + * Wrapper for the TCP established output helper hook. */ static void inline hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, @@ -201,6 +220,17 @@ tcp_output(struct tcpcb *tp) return (tcp_offload_output(tp)); #endif +#ifdef TCP_RFC7413 + /* + * For TFO connections in SYN_RECEIVED, only allow the initial + * SYN|ACK and those sent by the retransmit timer. + */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED) && + SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ + (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ + return (0); +#endif /* * Determine length of data that should be transmitted, * and flags that will be used. @@ -322,7 +352,7 @@ after_sack_rexmit: * to send then the probe will be the FIN * itself. */ - if (off < so->so_snd.sb_cc) + if (off < sbused(&so->so_snd)) flags &= ~TH_FIN; sendwin = 1; } else { @@ -348,7 +378,8 @@ after_sack_rexmit: */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) - len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); + len = ((long)ulmin(sbavail(&so->so_snd), sendwin) - + off); else { long cwin; @@ -357,8 +388,8 @@ after_sack_rexmit: * sending new data, having retransmitted all the * data possible in the scoreboard. */ - len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) - - off); + len = ((long)ulmin(sbavail(&so->so_snd), tp->snd_wnd) - + off); /* * Don't remove this (len > 0) check ! * We explicitly check for len > 0 here (although it @@ -386,6 +417,15 @@ after_sack_rexmit: if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; +#ifdef TCP_RFC7413 + /* + * When sending additional segments following a TFO SYN|ACK, + * do not include the SYN bit. + */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED)) + flags &= ~TH_SYN; +#endif off--, len++; } @@ -399,7 +439,18 @@ after_sack_rexmit: flags &= ~TH_FIN; } - if (len < 0) { +#ifdef TCP_RFC7413 + /* + * When retransmitting SYN|ACK on a passively-created TFO socket, + * don't include data, as the presence of data may have caused the + * original SYN|ACK to have been dropped by a middlebox. + */ + if ((tp->t_flags & TF_FASTOPEN) && + (((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)) || + (flags & TH_RST))) + len = 0; +#endif + if (len <= 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, @@ -409,9 +460,16 @@ after_sack_rexmit: * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. + * + * We also do a general check here to ensure that + * we will set the persist timer when we have data + * to send, but a 0-byte window. This makes sure + * the persist timer is set even if the packet + * hits one of the "goto send" lines below. */ len = 0; - if (sendwin == 0) { + if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && + (off < (int) sbavail(&so->so_snd))) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; @@ -449,20 +507,23 @@ after_sack_rexmit: * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given - * delay*bandwith product. However testing has shown this not + * delay*bandwidth product. However testing has shown this not * to be much of an problem. At worst we are trading wasting - * of available bandwith (the non-use of it) for wasting some + * of available bandwidth (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. + * + * XXXGL: should there be used sbused() or sbavail()? */ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && - so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && - so->so_snd.sb_cc < V_tcp_autosndbuf_max && - sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { + sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) && + sbused(&so->so_snd) < V_tcp_autosndbuf_max && + sendwin >= (sbused(&so->so_snd) - + (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max), so, curthread)) @@ -499,10 +560,11 @@ after_sack_rexmit: tso = 1; if (sack_rxmit) { - if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) + if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } else { - if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + + sbused(&so->so_snd))) flags &= ~TH_FIN; } @@ -532,7 +594,7 @@ after_sack_rexmit: */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && - len + off >= so->so_snd.sb_cc && + len + off >= sbavail(&so->so_snd) && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } @@ -660,7 +722,7 @@ dontupdate: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ - if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) && + if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); @@ -675,6 +737,12 @@ just_return: send: SOCKBUF_LOCK_ASSERT(&so->so_snd); + if (len > 0) { + if (len >= tp->t_maxseg) + tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; + else + tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; + } /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. @@ -697,13 +765,29 @@ send: * segments. Options for SYN-ACK segments are handled in TCP * syncache. */ + to.to_flags = 0; if ((tp->t_flags & TF_NOOPT) == 0) { - to.to_flags = 0; /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); to.to_flags |= TOF_MSS; +#ifdef TCP_RFC7413 + /* + * Only include the TFO option on the first + * transmission of the SYN|ACK on a + * passively-created TFO socket, as the presence of + * the TFO option may have caused the original + * SYN|ACK to have been dropped by a middlebox. + */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED) && + (tp->t_rxtshift == 0)) { + to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; + to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; + to.to_flags |= TOF_FASTOPEN; + } +#endif } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { @@ -759,11 +843,11 @@ send: /* * Adjust data length if insertion of options will - * bump the packet length beyond the t_maxopd length. + * bump the packet length beyond the t_maxseg length. * Clear the FIN bit because we cut off the tail of * the segment. */ - if (len + optlen + ipoptlen > tp->t_maxopd) { + if (len + optlen + ipoptlen > tp->t_maxseg) { flags &= ~TH_FIN; if (tso) { @@ -793,7 +877,8 @@ send: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ - max_len = (if_hw_tsomax - hdrlen); + max_len = (if_hw_tsomax - hdrlen - + max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { @@ -808,6 +893,15 @@ send: */ if (if_hw_tsomaxsegcount != 0 && if_hw_tsomaxsegsize != 0) { + /* + * Subtract one segment for the LINK + * and TCP/IP headers mbuf that will + * be prepended to this mbuf chain + * after the code in this section + * limits the number of mbufs in the + * chain to if_hw_tsomaxsegcount. + */ + if_hw_tsomaxsegcount -= 1; max_len = 0; mb = sbsndmbuf(&so->so_snd, off, &moff); @@ -856,8 +950,8 @@ send: * fractional unless the send sockbuf can be * emptied: */ - max_len = (tp->t_maxopd - optlen); - if ((off + len) < so->so_snd.sb_cc) { + max_len = (tp->t_maxseg - optlen); + if ((off + len) < sbavail(&so->so_snd)) { moff = len % max_len; if (moff != 0) { len -= moff; @@ -886,7 +980,7 @@ send: sendalot = 1; } else { - len = tp->t_maxopd - optlen - ipoptlen; + len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; } } else @@ -929,23 +1023,20 @@ send: TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); } - MGETHDR(m, M_DONTWAIT, MT_DATA); +#ifdef INET6 + if (MHLEN < hdrlen + max_linkhdr) + m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); + else +#endif + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; + sack_rxmit = 0; goto out; } -#ifdef INET6 - if (MHLEN < hdrlen + max_linkhdr) { - MCLGET(m, M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { - SOCKBUF_UNLOCK(&so->so_snd); - m_freem(m); - error = ENOBUFS; - goto out; - } - } -#endif + m->m_data += max_linkhdr; m->m_len = hdrlen; @@ -965,6 +1056,7 @@ send: SOCKBUF_UNLOCK(&so->so_snd); (void) m_free(m); error = ENOBUFS; + sack_rxmit = 0; goto out; } } @@ -975,7 +1067,7 @@ send: * give data to the user when a buffer fills or * a PUSH comes in.) */ - if (off + len == so->so_snd.sb_cc) + if ((off + len == sbused(&so->so_snd)) && !(flags & TH_SYN)) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); } else { @@ -989,15 +1081,16 @@ send: else TCPSTAT_INC(tcps_sndwinup); - MGETHDR(m, M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; + sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { - MH_ALIGN(m, hdrlen); + M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; @@ -1036,7 +1129,7 @@ send: * resend those bits a number of times as per * RFC 3168. */ - if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { + if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE|TH_CWR; @@ -1153,7 +1246,7 @@ send: tp->snd_up = tp->snd_una; /* drag it along */ #ifdef TCP_SIGNATURE - if (tp->t_flags & TF_SIGNATURE) { + if (to.to_flags & TOF_SIGNATURE) { int sigoff = to.to_signature - opt; tcp_signature_compute(m, 0, len, optlen, (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); @@ -1195,13 +1288,12 @@ send: /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. - * XXX: Fixme: This is currently not the case for IPv6. */ if (tso) { - KASSERT(len > tp->t_maxopd - optlen, + KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; - m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; + m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } #ifdef IPSEC @@ -1214,75 +1306,6 @@ send: __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); #endif - /* - * In transmit state, time the transmission and arrange for - * the retransmit. In persist state, just set snd_max. - */ - if ((tp->t_flags & TF_FORCEDATA) == 0 || - !tcp_timer_active(tp, TT_PERSIST)) { - tcp_seq startseq = tp->snd_nxt; - - /* - * Advance snd_nxt over sequence space of this segment. - */ - if (flags & (TH_SYN|TH_FIN)) { - if (flags & TH_SYN) - tp->snd_nxt++; - if (flags & TH_FIN) { - tp->snd_nxt++; - tp->t_flags |= TF_SENTFIN; - } - } - if (sack_rxmit) - goto timer; - tp->snd_nxt += len; - if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { - tp->snd_max = tp->snd_nxt; - /* - * Time this transmission if not a retransmission and - * not currently timing anything. - */ - if (tp->t_rtttime == 0) { - tp->t_rtttime = ticks; - tp->t_rtseq = startseq; - TCPSTAT_INC(tcps_segstimed); - } - } - - /* - * Set retransmit timer if not currently set, - * and not doing a pure ack or a keep-alive probe. - * Initial value for retransmit timer is smoothed - * round-trip time + 2 * round-trip time variance. - * Initialize shift counter which is used for backoff - * of retransmit time. - */ -timer: - if (!tcp_timer_active(tp, TT_REXMT) && - ((sack_rxmit && tp->snd_nxt != tp->snd_max) || - (tp->snd_nxt != tp->snd_una))) { - if (tcp_timer_active(tp, TT_PERSIST)) { - tcp_timer_activate(tp, TT_PERSIST, 0); - tp->t_rxtshift = 0; - } - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); - } - } else { - /* - * Persist case, update snd_max but since we are in - * persist mode (no window) we do not update snd_nxt. - */ - int xlen = len; - if (flags & TH_SYN) - ++xlen; - if (flags & TH_FIN) { - ++xlen; - tp->t_flags |= TF_SENTFIN; - } - if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) - tp->snd_max = tp->snd_nxt + len; - } - /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); @@ -1306,6 +1329,7 @@ timer: ipov->ih_len = save; } #endif /* TCPDEBUG */ + TCP_PROBE3(debug__output, tp, th, mtod(m, const char *)); /* * Fill in IP length and desired time to live and @@ -1314,7 +1338,7 @@ timer: * the template, but need a way to checksum without them. */ /* - * m->m_pkthdr.len should have been set before cksum calcuration, + * m->m_pkthdr.len should have been set before checksum calculation, * because in6_cksum() need it. */ #ifdef INET6 @@ -1330,13 +1354,35 @@ timer: */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); + /* + * Set the packet size here for the benefit of DTrace probes. + * ip6_output() will set it properly; it's supposed to include + * the option header lengths as well. + */ + ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); + + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + else + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + + if (tp->t_state == TCPS_SYN_SENT) + TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); + + TCP_PROBE5(send, NULL, tp, ip6, tp, th); + +#ifdef TCPPCAP + /* Save packet, if requested. */ + tcp_pcap_add(th, m, &(tp->t_outpkts)); +#endif + /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); if (error == EMSGSIZE && ro.ro_rt != NULL) - mtu = ro.ro_rt->rt_rmx.rmx_mtu; + mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); } #endif /* INET6 */ @@ -1345,10 +1391,7 @@ timer: #endif #ifdef INET { - struct route ro; - - bzero(&ro, sizeof(ro)); - ip->ip_len = m->m_pkthdr.len; + ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); @@ -1361,18 +1404,126 @@ timer: * * NB: Don't set DF on small MTU/MSS to have a safe fallback. */ - if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) - ip->ip_off |= IP_DF; + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { + ip->ip_off |= htons(IP_DF); + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + } else { + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + } + + if (tp->t_state == TCPS_SYN_SENT) + TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); - error = ip_output(m, tp->t_inpcb->inp_options, &ro, + TCP_PROBE5(send, NULL, tp, ip, tp, th); + +#ifdef TCPPCAP + /* Save packet, if requested. */ + tcp_pcap_add(th, m, &(tp->t_outpkts)); +#endif + + error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); - if (error == EMSGSIZE && ro.ro_rt != NULL) - mtu = ro.ro_rt->rt_rmx.rmx_mtu; - RO_RTFREE(&ro); + if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL) + mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu; } #endif /* INET */ + +out: + /* + * In transmit state, time the transmission and arrange for + * the retransmit. In persist state, just set snd_max. + */ + if ((tp->t_flags & TF_FORCEDATA) == 0 || + !tcp_timer_active(tp, TT_PERSIST)) { + tcp_seq startseq = tp->snd_nxt; + + /* + * Advance snd_nxt over sequence space of this segment. + */ + if (flags & (TH_SYN|TH_FIN)) { + if (flags & TH_SYN) + tp->snd_nxt++; + if (flags & TH_FIN) { + tp->snd_nxt++; + tp->t_flags |= TF_SENTFIN; + } + } + if (sack_rxmit) + goto timer; + tp->snd_nxt += len; + if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { + tp->snd_max = tp->snd_nxt; + /* + * Time this transmission if not a retransmission and + * not currently timing anything. + */ + if (tp->t_rtttime == 0) { + tp->t_rtttime = ticks; + tp->t_rtseq = startseq; + TCPSTAT_INC(tcps_segstimed); + } + } + + /* + * Set retransmit timer if not currently set, + * and not doing a pure ack or a keep-alive probe. + * Initial value for retransmit timer is smoothed + * round-trip time + 2 * round-trip time variance. + * Initialize shift counter which is used for backoff + * of retransmit time. + */ +timer: + if (!tcp_timer_active(tp, TT_REXMT) && + ((sack_rxmit && tp->snd_nxt != tp->snd_max) || + (tp->snd_nxt != tp->snd_una))) { + if (tcp_timer_active(tp, TT_PERSIST)) { + tcp_timer_activate(tp, TT_PERSIST, 0); + tp->t_rxtshift = 0; + } + tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + } else if (len == 0 && sbavail(&so->so_snd) && + !tcp_timer_active(tp, TT_REXMT) && + !tcp_timer_active(tp, TT_PERSIST)) { + /* + * Avoid a situation where we do not set persist timer + * after a zero window condition. For example: + * 1) A -> B: packet with enough data to fill the window + * 2) B -> A: ACK for #1 + new data (0 window + * advertisement) + * 3) A -> B: ACK for #2, 0 len packet + * + * In this case, A will not activate the persist timer, + * because it chose to send a packet. Unless tcp_output + * is called for some other reason (delayed ack timer, + * another input packet from B, socket syscall), A will + * not send zero window probes. + * + * So, if you send a 0-length packet, but there is data + * in the socket buffer, and neither the rexmt or + * persist timer is already set, then activate the + * persist timer. + */ + tp->t_rxtshift = 0; + tcp_setpersist(tp); + } + } else { + /* + * Persist case, update snd_max but since we are in + * persist mode (no window) we do not update snd_nxt. + */ + int xlen = len; + if (flags & TH_SYN) + ++xlen; + if (flags & TH_FIN) { + ++xlen; + tp->t_flags |= TF_SENTFIN; + } + if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) + tp->snd_max = tp->snd_nxt + len; + } + if (error) { /* @@ -1400,16 +1551,13 @@ timer: } else tp->snd_nxt -= len; } -out: SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ switch (error) { case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: - if (!tcp_timer_active(tp, TT_REXMT) && - !tcp_timer_active(tp, TT_PERSIST)) - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + TCP_XMIT_TIMER_ASSERT(tp, len, flags); tp->snd_cwnd = tp->t_maxseg; return (0); case EMSGSIZE: @@ -1481,10 +1629,10 @@ tcp_setpersist(struct tcpcb *tp) if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* - * Start/restart persistance timer. + * Start/restart persistence timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], - TCPTV_PERSMIN, TCPTV_PERSMAX); + tcp_persmin, tcp_persmax); tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; @@ -1510,7 +1658,7 @@ tcp_setpersist(struct tcpcb *tp) int tcp_addoptions(struct tcpopt *to, u_char *optp) { - u_int mask, optlen = 0; + u_int32_t mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { if ((to->to_flags & mask) != mask) @@ -1572,6 +1720,7 @@ tcp_addoptions(struct tcpopt *to, u_char *optp) bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); optp += sizeof(to->to_tsecr); break; +#ifdef TCP_SIGNATURE case TOF_SIGNATURE: { int siglen = TCPOLEN_SIGNATURE - 2; @@ -1590,6 +1739,7 @@ tcp_addoptions(struct tcpopt *to, u_char *optp) *optp++ = 0; break; } +#endif case TOF_SACK: { int sackblks = 0; @@ -1620,6 +1770,25 @@ tcp_addoptions(struct tcpopt *to, u_char *optp) TCPSTAT_INC(tcps_sack_send_blocks); break; } +#ifdef TCP_RFC7413 + case TOF_FASTOPEN: + { + int total_len; + + /* XXX is there any point to aligning this option? */ + total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len; + if (TCP_MAXOLEN - optlen < total_len) + continue; + *optp++ = TCPOPT_FAST_OPEN; + *optp++ = total_len; + if (to->to_tfo_len > 0) { + bcopy(to->to_tfo_cookie, optp, to->to_tfo_len); + optp += to->to_tfo_len; + } + optlen += total_len; + break; + } +#endif default: panic("%s: unknown TCP option type", __func__); break; |