Diffstat (limited to 'freebsd/sys/netinet/tcp_usrreq.c')
-rw-r--r--  freebsd/sys/netinet/tcp_usrreq.c | 568
1 file changed, 419 insertions(+), 149 deletions(-)
diff --git a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c
index 61711a6e..d5fa680f 100644
--- a/freebsd/sys/netinet/tcp_usrreq.c
+++ b/freebsd/sys/netinet/tcp_usrreq.c
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/malloc.h>
+#include <sys/refcount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
@@ -66,11 +67,12 @@ __FBSDID("$FreeBSD$");
#endif
#include <net/if.h>
+#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>
-#include <netinet/cc.h>
#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
@@ -81,11 +83,19 @@ __FBSDID("$FreeBSD$");
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#endif
+#ifdef TCP_RFC7413
+#include <netinet/tcp_fastopen.h>
+#endif
+#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
+#include <netinet/cc/cc.h>
+#ifdef TCPPCAP
+#include <netinet/tcp_pcap.h>
+#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
@@ -147,6 +157,7 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td)
tp = intotcpcb(inp);
out:
TCPDEBUG2(PRU_ATTACH);
+ TCP_PROBE2(debug__user, tp, PRU_ATTACH);
return error;
}
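
The TCP_PROBE2(debug__user, tp, PRU_*) calls added in this and the following
hunks hang off the new <netinet/in_kdtrace.h> include. A rough sketch of the
plumbing, assuming the stock FreeBSD SDT macros (the in-tree definitions live
in in_kdtrace.[ch] and may use the _XLATE variants with translated types):

    #include <sys/sdt.h>

    /* Probe declaration: connection tcpcb (may be NULL) and PRU_* request. */
    SDT_PROBE_DEFINE2(tcp, , , debug__user, "struct tcpcb *", "int");

    /* Convenience wrapper used throughout the TCP code. */
    #define TCP_PROBE2(probe, arg0, arg1) \
            SDT_PROBE2(tcp, , , probe, arg0, arg1)

From dtrace(1) these show up as tcp:::debug-user probes, one firing per user
request on a socket.
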
@@ -164,7 +175,7 @@ tcp_detach(struct socket *so, struct inpcb *inp)
{
struct tcpcb *tp;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_LOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
@@ -184,6 +195,21 @@ tcp_detach(struct socket *so, struct inpcb *inp)
* present until timewait ends.
*
* XXXRW: Would it be cleaner to free the tcptw here?
+ *
+ * Astute question indeed, from twtcp perspective there are
+ * three cases to consider:
+ *
+ * #1 tcp_detach is called at tcptw creation time by
+ * tcp_twstart, then do not discard the newly created tcptw
+ * and leave inpcb present until timewait ends
+ * #2 tcp_detach is called at timewait end (or reuse) by
+ * tcp_twclose, then the tcptw has already been discarded
+ * (or reused) and inpcb is freed here
+ * #3 tcp_detach is called after timewait ends (or reuse)
+ * (e.g. by soclose), then tcptw has already been discarded
+ * (or reused) and inpcb is freed here
+ *
+ * In all three cases the tcptw should not be freed here.
*/
if (inp->inp_flags & INP_DROPPED) {
KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
@@ -227,15 +253,20 @@ static void
tcp_usr_detach(struct socket *so)
{
struct inpcb *inp;
+ int rlock = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
- INP_INFO_WLOCK(&V_tcbinfo);
+ if (!INP_INFO_WLOCKED(&V_tcbinfo)) {
+ INP_INFO_RLOCK(&V_tcbinfo);
+ rlock = 1;
+ }
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_detach: inp_socket == NULL"));
tcp_detach(so, inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (rlock)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
#ifdef INET
@@ -276,6 +307,7 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
INP_HASH_WUNLOCK(&V_tcbinfo);
out:
TCPDEBUG2(PRU_BIND);
+ TCP_PROBE2(debug__user, tp, PRU_BIND);
INP_WUNLOCK(inp);
return (error);
@@ -336,6 +368,7 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
INP_HASH_WUNLOCK(&V_tcbinfo);
out:
TCPDEBUG2(PRU_BIND);
+ TCP_PROBE2(debug__user, tp, PRU_BIND);
INP_WUNLOCK(inp);
return (error);
}
@@ -369,7 +402,7 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
INP_HASH_WUNLOCK(&V_tcbinfo);
if (error == 0) {
- tp->t_state = TCPS_LISTEN;
+ tcp_state_change(tp, TCPS_LISTEN);
solisten_proto(so, backlog);
#ifdef TCP_OFFLOAD
if ((so->so_options & SO_NO_OFFLOAD) == 0)
@@ -378,8 +411,13 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
}
SOCK_UNLOCK(so);
+#ifdef TCP_RFC7413
+ if (tp->t_flags & TF_FASTOPEN)
+ tp->t_tfo_pending = tcp_fastopen_alloc_counter();
+#endif
out:
TCPDEBUG2(PRU_LISTEN);
+ TCP_PROBE2(debug__user, tp, PRU_LISTEN);
INP_WUNLOCK(inp);
return (error);
}
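
The direct t_state assignments are replaced by tcp_state_change() here and in
the hunks below so that every transition is counted and traceable in one
place. A hedged sketch of what such a helper does (the real body lives in
tcp_subr.c; TCPSTATES_DEC and the state__change probe name are assumptions):

    void
    tcp_state_change(struct tcpcb *tp, int newstate)
    {
            int pstate = tp->t_state;

            /* Keep the per-state connection counters honest. */
            TCPSTATES_DEC(pstate);
            TCPSTATES_INC(newstate);
            tp->t_state = newstate;

            /* Fire the DTrace state-change probe. */
            TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate);
    }
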
@@ -414,7 +452,7 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
}
INP_HASH_WUNLOCK(&V_tcbinfo);
if (error == 0) {
- tp->t_state = TCPS_LISTEN;
+ tcp_state_change(tp, TCPS_LISTEN);
solisten_proto(so, backlog);
#ifdef TCP_OFFLOAD
if ((so->so_options & SO_NO_OFFLOAD) == 0)
@@ -423,8 +461,13 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
}
SOCK_UNLOCK(so);
+#ifdef TCP_RFC7413
+ if (tp->t_flags & TF_FASTOPEN)
+ tp->t_tfo_pending = tcp_fastopen_alloc_counter();
+#endif
out:
TCPDEBUG2(PRU_LISTEN);
+ TCP_PROBE2(debug__user, tp, PRU_LISTEN);
INP_WUNLOCK(inp);
return (error);
}
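
With TCP_RFC7413 compiled in, a listener opts into TCP Fast Open before
listen(2), and the hunks above then allocate the pending-request counter once
the socket reaches LISTEN. A minimal userland sketch (assumes the fastopen
sysctl is enabled, e.g. net.inet.tcp.fastopen.enabled=1):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <unistd.h>

    static int
    make_tfo_listener(int port)
    {
            struct sockaddr_in sin = { .sin_len = sizeof(sin),
                .sin_family = AF_INET, .sin_port = htons(port) };
            int s, on = 1;

            if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
                    return (-1);
            /* Sets TF_FASTOPEN on the tcpcb (EPERM if TFO is disabled). */
            if (setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN, &on, sizeof(on)) < 0 ||
                bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0 ||
                listen(s, 128) < 0) {
                    close(s);
                    return (-1);
            }
            return (s);
    }
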
@@ -462,8 +505,12 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
INP_WLOCK(inp);
- if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
- error = EINVAL;
+ if (inp->inp_flags & INP_TIMEWAIT) {
+ error = EADDRINUSE;
+ goto out;
+ }
+ if (inp->inp_flags & INP_DROPPED) {
+ error = ECONNREFUSED;
goto out;
}
tp = intotcpcb(inp);
@@ -477,9 +524,10 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
goto out;
#endif
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
out:
TCPDEBUG2(PRU_CONNECT);
+ TCP_PROBE2(debug__user, tp, PRU_CONNECT);
INP_WUNLOCK(inp);
return (error);
}
@@ -509,8 +557,12 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
INP_WLOCK(inp);
- if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
- error = EINVAL;
+ if (inp->inp_flags & INP_TIMEWAIT) {
+ error = EADDRINUSE;
+ goto out;
+ }
+ if (inp->inp_flags & INP_DROPPED) {
+ error = ECONNREFUSED;
goto out;
}
tp = intotcpcb(inp);
@@ -543,7 +595,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
(error = tcp_offload_connect(so, nam)) == 0)
goto out;
#endif
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
goto out;
}
#endif
@@ -561,10 +613,11 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
goto out;
#endif
tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
out:
TCPDEBUG2(PRU_CONNECT);
+ TCP_PROBE2(debug__user, tp, PRU_CONNECT);
INP_WUNLOCK(inp);
return (error);
}
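
Every direct tcp_output() call in this file now dispatches through tp->t_fb,
the per-connection TCP function block. Pieced together from the hooks this
diff actually uses, the block looks roughly like the sketch below (the exact
prototypes, additional members and TCP_FUNCTION_NAME_LEN_MAX belong to
tcp_var.h / tcp.h and are assumptions here):

    struct tcp_function_block {
            char    tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
            int     (*tfb_tcp_output)(struct tcpcb *);
            int     (*tfb_tcp_ctloutput)(struct socket *, struct sockopt *,
                        struct inpcb *, struct tcpcb *);
            int     (*tfb_tcp_handoff_ok)(struct tcpcb *);   /* may be NULL */
            void    (*tfb_tcp_fb_init)(struct tcpcb *);      /* may be NULL */
            void    (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* may be NULL */
            volatile uint32_t tfb_refcnt;
            uint32_t tfb_flags;             /* e.g. TCP_FUNC_BEING_REMOVED */
            /* ... input and timer hooks omitted ... */
    };

The default block points tfb_tcp_output at the traditional tcp_output(), so
behaviour is unchanged unless a socket is switched to another stack.
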
@@ -589,11 +642,13 @@ tcp_usr_disconnect(struct socket *so)
int error = 0;
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
INP_WLOCK(inp);
- if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ if (inp->inp_flags & INP_TIMEWAIT)
+ goto out;
+ if (inp->inp_flags & INP_DROPPED) {
error = ECONNRESET;
goto out;
}
@@ -602,8 +657,9 @@ tcp_usr_disconnect(struct socket *so)
tcp_disconnect(tp);
out:
TCPDEBUG2(PRU_DISCONNECT);
+ TCP_PROBE2(debug__user, tp, PRU_DISCONNECT);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
@@ -611,13 +667,6 @@ out:
/*
* Accept a connection. Essentially all the work is done at higher levels;
* just return the address of the peer, storing through addr.
- *
- * The rationale for acquiring the tcbinfo lock here is somewhat complicated,
- * and is described in detail in the commit log entry for r175612. Acquiring
- * it delays an accept(2) racing with sonewconn(), which inserts the socket
- * before the inpcb address/port fields are initialized. A better fix would
- * prevent the socket from being placed in the listen queue until all fields
- * are fully initialized.
*/
static int
tcp_usr_accept(struct socket *so, struct sockaddr **nam)
@@ -634,7 +683,6 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
- INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
error = ECONNABORTED;
@@ -653,8 +701,8 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam)
out:
TCPDEBUG2(PRU_ACCEPT);
+ TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
if (error == 0)
*nam = in_sockaddr(port, &addr);
return error;
@@ -704,6 +752,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
out:
TCPDEBUG2(PRU_ACCEPT);
+ TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
INP_WUNLOCK(inp);
INP_INFO_RUNLOCK(&V_tcbinfo);
if (error == 0) {
@@ -727,7 +776,7 @@ tcp_usr_shutdown(struct socket *so)
struct tcpcb *tp = NULL;
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("inp == NULL"));
INP_WLOCK(inp);
@@ -740,12 +789,13 @@ tcp_usr_shutdown(struct socket *so)
socantsendmore(so);
tcp_usrclosed(tp);
if (!(inp->inp_flags & INP_DROPPED))
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
out:
TCPDEBUG2(PRU_SHUTDOWN);
+ TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
@@ -770,15 +820,28 @@ tcp_usr_rcvd(struct socket *so, int flags)
}
tp = intotcpcb(inp);
TCPDEBUG1();
+#ifdef TCP_RFC7413
+ /*
+ * For passively-created TFO connections, don't attempt a window
+ * update while still in SYN_RECEIVED as this may trigger an early
+ * SYN|ACK. It is preferable to have the SYN|ACK be sent along with
+ * application response data, or failing that, when the DELACK timer
+ * expires.
+ */
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_state == TCPS_SYN_RECEIVED))
+ goto out;
+#endif
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
tcp_offload_rcvd(tp);
else
#endif
- tcp_output(tp);
+ tp->t_fb->tfb_tcp_output(tp);
out:
TCPDEBUG2(PRU_RCVD);
+ TCP_PROBE2(debug__user, tp, PRU_RCVD);
INP_WUNLOCK(inp);
return (error);
}
@@ -807,14 +870,18 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* this call.
*/
if (flags & PRUS_EOF)
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
if (control)
m_freem(control);
- if (m)
+ /*
+ * In case of PRUS_NOTREADY, tcp_usr_ready() is responsible
+ * for freeing memory.
+ */
+ if (m && (flags & PRUS_NOTREADY) == 0)
m_freem(m);
error = ECONNRESET;
goto out;
@@ -836,13 +903,12 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
m_freem(control); /* empty control, just free it */
}
if (!(flags & PRUS_OOB)) {
- sbappendstream(&so->so_snd, m);
+ sbappendstream(&so->so_snd, m, flags);
if (nam && tp->t_state < TCPS_SYN_SENT) {
/*
* Do implied connect if not yet connected,
* initialize window to default value, and
- * initialize maxseg/maxopd using peer's cached
- * MSS.
+ * initialize maxseg using peer's cached MSS.
*/
#ifdef INET6
if (isipv6)
@@ -864,14 +930,15 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* Close the send side of the connection after
* the data is sent.
*/
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
socantsendmore(so);
tcp_usrclosed(tp);
}
- if (!(inp->inp_flags & INP_DROPPED)) {
+ if (!(inp->inp_flags & INP_DROPPED) &&
+ !(flags & PRUS_NOTREADY)) {
if (flags & PRUS_MORETOCOME)
tp->t_flags |= TF_MORETOCOME;
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
if (flags & PRUS_MORETOCOME)
tp->t_flags &= ~TF_MORETOCOME;
}
@@ -894,14 +961,13 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* of data past the urgent section.
* Otherwise, snd_up should be one lower.
*/
- sbappendstream_locked(&so->so_snd, m);
+ sbappendstream_locked(&so->so_snd, m, flags);
SOCKBUF_UNLOCK(&so->so_snd);
if (nam && tp->t_state < TCPS_SYN_SENT) {
/*
* Do implied connect if not yet connected,
* initialize window to default value, and
- * initialize maxseg/maxopd using peer's cached
- * MSS.
+ * initialize maxseg using peer's cached MSS.
*/
#ifdef INET6
if (isipv6)
@@ -918,17 +984,48 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
tp->snd_wnd = TTCP_CLIENT_SND_WND;
tcp_mss(tp, -1);
}
- tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
- tp->t_flags |= TF_FORCEDATA;
- error = tcp_output(tp);
- tp->t_flags &= ~TF_FORCEDATA;
+ tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
+ if (!(flags & PRUS_NOTREADY)) {
+ tp->t_flags |= TF_FORCEDATA;
+ error = tp->t_fb->tfb_tcp_output(tp);
+ tp->t_flags &= ~TF_FORCEDATA;
+ }
}
out:
TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
+ TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
+ ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
INP_WUNLOCK(inp);
if (flags & PRUS_EOF)
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ return (error);
+}
+
+static int
+tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ int error;
+
+ inp = sotoinpcb(so);
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ INP_WUNLOCK(inp);
+ for (int i = 0; i < count; i++)
+ m = m_free(m);
+ return (ECONNRESET);
+ }
+ tp = intotcpcb(inp);
+
+ SOCKBUF_LOCK(&so->so_snd);
+ error = sbready(&so->so_snd, m, count);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (error == 0)
+ error = tp->t_fb->tfb_tcp_output(tp);
+ INP_WUNLOCK(inp);
+
return (error);
}
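
tcp_usr_ready() is the new pru_ready hook: mbufs appended with PRUS_NOTREADY
sit in the send buffer until sbready() marks them ready, and only then is the
stack's output routine kicked. The main in-kernel consumer of this two-phase
interface is the asynchronous sendfile(2) path; from userland an ordinary
call exercises it, for example:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* Send 'len' bytes of 'filefd' over the connected TCP socket 'sockfd'. */
    static int
    send_file_over_tcp(int sockfd, int filefd, off_t len)
    {
            off_t sbytes = 0;

            if (sendfile(filefd, sockfd, 0, (size_t)len, NULL, &sbytes, 0) < 0)
                    return (-1);
            return (sbytes == len ? 0 : 1);     /* 1 == short transfer */
    }
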
@@ -945,7 +1042,7 @@ tcp_usr_abort(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_abort: inp_socket == NULL"));
@@ -959,6 +1056,7 @@ tcp_usr_abort(struct socket *so)
TCPDEBUG1();
tcp_drop(tp, ECONNABORTED);
TCPDEBUG2(PRU_ABORT);
+ TCP_PROBE2(debug__user, tp, PRU_ABORT);
}
if (!(inp->inp_flags & INP_DROPPED)) {
SOCK_LOCK(so);
@@ -967,7 +1065,7 @@ tcp_usr_abort(struct socket *so)
inp->inp_flags |= INP_SOCKREF;
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
/*
@@ -983,7 +1081,7 @@ tcp_usr_close(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_close: inp_socket == NULL"));
@@ -998,6 +1096,7 @@ tcp_usr_close(struct socket *so)
TCPDEBUG1();
tcp_disconnect(tp);
TCPDEBUG2(PRU_CLOSE);
+ TCP_PROBE2(debug__user, tp, PRU_CLOSE);
}
if (!(inp->inp_flags & INP_DROPPED)) {
SOCK_LOCK(so);
@@ -1006,7 +1105,7 @@ tcp_usr_close(struct socket *so)
inp->inp_flags |= INP_SOCKREF;
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
/*
@@ -1047,6 +1146,7 @@ tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
out:
TCPDEBUG2(PRU_RCVOOB);
+ TCP_PROBE2(debug__user, tp, PRU_RCVOOB);
INP_WUNLOCK(inp);
return (error);
}
@@ -1066,6 +1166,7 @@ struct pr_usrreqs tcp_usrreqs = {
.pru_rcvd = tcp_usr_rcvd,
.pru_rcvoob = tcp_usr_rcvoob,
.pru_send = tcp_usr_send,
+ .pru_ready = tcp_usr_ready,
.pru_shutdown = tcp_usr_shutdown,
.pru_sockaddr = in_getsockaddr,
.pru_sosetlabel = in_pcbsosetlabel,
@@ -1088,6 +1189,7 @@ struct pr_usrreqs tcp6_usrreqs = {
.pru_rcvd = tcp_usr_rcvd,
.pru_rcvoob = tcp_usr_rcvoob,
.pru_send = tcp_usr_send,
+ .pru_ready = tcp_usr_ready,
.pru_shutdown = tcp_usr_shutdown,
.pru_sockaddr = in6_mapped_sockaddr,
.pru_sosetlabel = in_pcbsosetlabel,
@@ -1154,7 +1256,7 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
soisconnecting(so);
TCPSTAT_INC(tcps_connattempt);
- tp->t_state = TCPS_SYN_SENT;
+ tcp_state_change(tp, TCPS_SYN_SENT);
tp->iss = tcp_new_isn(tp);
tcp_sendseqinit(tp);
@@ -1170,10 +1272,7 @@ out:
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
- struct inpcb *inp = tp->t_inpcb, *oinp;
- struct socket *so = inp->inp_socket;
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
- struct in6_addr addr6;
+ struct inpcb *inp = tp->t_inpcb;
int error;
INP_WLOCK_ASSERT(inp);
@@ -1184,39 +1283,9 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
if (error)
goto out;
}
-
- /*
- * Cannot simply call in_pcbconnect, because there might be an
- * earlier incarnation of this same connection still in
- * TIME_WAIT state, creating an ADDRINUSE error.
- * in6_pcbladdr() also handles scope zone IDs.
- *
- * XXXRW: We wouldn't need to expose in6_pcblookup_hash_locked()
- * outside of in6_pcb.c if there were an in6_pcbconnect_setup().
- */
- error = in6_pcbladdr(inp, nam, &addr6);
- if (error)
- goto out;
- oinp = in6_pcblookup_hash_locked(inp->inp_pcbinfo,
- &sin6->sin6_addr, sin6->sin6_port,
- IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
- ? &addr6
- : &inp->in6p_laddr,
- inp->inp_lport, 0, NULL);
- if (oinp) {
- error = EADDRINUSE;
+ error = in6_pcbconnect(inp, nam, td->td_ucred);
+ if (error != 0)
goto out;
- }
- if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
- inp->in6p_laddr = addr6;
- inp->in6p_faddr = sin6->sin6_addr;
- inp->inp_fport = sin6->sin6_port;
- /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
- inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
- if (inp->inp_flags & IN6P_AUTOFLOWLABEL)
- inp->inp_flow |=
- (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
- in_pcbrehash(inp);
INP_HASH_WUNLOCK(&V_tcbinfo);
/* Compute window scaling to request. */
@@ -1224,9 +1293,9 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
(TCP_MAXWIN << tp->request_r_scale) < sb_max)
tp->request_r_scale++;
- soisconnecting(so);
+ soisconnecting(inp->inp_socket);
TCPSTAT_INC(tcps_connattempt);
- tp->t_state = TCPS_SYN_SENT;
+ tcp_state_change(tp, TCPS_SYN_SENT);
tp->iss = tcp_new_isn(tp);
tcp_sendseqinit(tp);
@@ -1294,25 +1363,25 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
* has to revalidate that the connection is still valid for the socket
* option.
*/
-#define INP_WLOCK_RECHECK(inp) do { \
+#define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do { \
INP_WLOCK(inp); \
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \
INP_WUNLOCK(inp); \
+ cleanup; \
return (ECONNRESET); \
} \
tp = intotcpcb(inp); \
} while(0)
+#define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */)
int
tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
- int error, opt, optval;
- u_int ui;
+ int error;
struct inpcb *inp;
struct tcpcb *tp;
- struct tcp_info ti;
- char buf[TCP_CA_NAME_MAX];
- struct cc_algo *algo;
+ struct tcp_function_block *blk;
+ struct tcp_function_set fsn;
error = 0;
inp = sotoinpcb(so);
@@ -1340,6 +1409,128 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
INP_WUNLOCK(inp);
return (ECONNRESET);
}
+ tp = intotcpcb(inp);
+ /*
+ * Protect the TCP option TCP_FUNCTION_BLK so
+ * that a sub-function can *never* overwrite this.
+ */
+ if ((sopt->sopt_dir == SOPT_SET) &&
+ (sopt->sopt_name == TCP_FUNCTION_BLK)) {
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &fsn, sizeof fsn,
+ sizeof fsn);
+ if (error)
+ return (error);
+ INP_WLOCK_RECHECK(inp);
+ blk = find_and_ref_tcp_functions(&fsn);
+ if (blk == NULL) {
+ INP_WUNLOCK(inp);
+ return (ENOENT);
+ }
+ if (tp->t_fb == blk) {
+ /* You already have this */
+ refcount_release(&blk->tfb_refcnt);
+ INP_WUNLOCK(inp);
+ return (0);
+ }
+ if (tp->t_state != TCPS_CLOSED) {
+			int error = EINVAL;
+ /*
+ * The user has advanced the state
+ * past the initial point, we may not
+ * be able to switch.
+ */
+ if (blk->tfb_tcp_handoff_ok != NULL) {
+ /*
+ * Does the stack provide a
+ * query mechanism, if so it may
+ * still be possible?
+ */
+ error = (*blk->tfb_tcp_handoff_ok)(tp);
+ }
+ if (error) {
+ refcount_release(&blk->tfb_refcnt);
+ INP_WUNLOCK(inp);
+ return(error);
+ }
+ }
+ if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
+ refcount_release(&blk->tfb_refcnt);
+ INP_WUNLOCK(inp);
+ return (ENOENT);
+ }
+ /*
+ * Release the old refcnt, the
+ * lookup acquired a ref on the
+ * new one already.
+ */
+ if (tp->t_fb->tfb_tcp_fb_fini) {
+ /*
+ * Tell the stack to cleanup with 0 i.e.
+ * the tcb is not going away.
+ */
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ }
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ tp->t_fb = blk;
+ if (tp->t_fb->tfb_tcp_fb_init) {
+ (*tp->t_fb->tfb_tcp_fb_init)(tp);
+ }
+#ifdef TCP_OFFLOAD
+ if (tp->t_flags & TF_TOE) {
+ tcp_offload_ctloutput(tp, sopt->sopt_dir,
+ sopt->sopt_name);
+ }
+#endif
+ INP_WUNLOCK(inp);
+ return (error);
+ } else if ((sopt->sopt_dir == SOPT_GET) &&
+ (sopt->sopt_name == TCP_FUNCTION_BLK)) {
+ strcpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name);
+ fsn.pcbcnt = tp->t_fb->tfb_refcnt;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &fsn, sizeof fsn);
+ return (error);
+ }
+	/* Pass in the INP locked, callee must unlock it */
+ return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp));
+}
+
+int
+tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
+{
+ int error, opt, optval;
+ u_int ui;
+ struct tcp_info ti;
+ struct cc_algo *algo;
+ char *pbuf, buf[TCP_CA_NAME_MAX];
+ size_t len;
+
+ /*
+ * For TCP_CCALGOOPT forward the control to CC module, for both
+ * SOPT_SET and SOPT_GET.
+ */
+ switch (sopt->sopt_name) {
+ case TCP_CCALGOOPT:
+ INP_WUNLOCK(inp);
+ pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO);
+ error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize,
+ sopt->sopt_valsize);
+ if (error) {
+ free(pbuf, M_TEMP);
+ return (error);
+ }
+ INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP));
+ if (CC_ALGO(tp)->ctl_output != NULL)
+ error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf);
+ else
+ error = ENOENT;
+ INP_WUNLOCK(inp);
+ if (error == 0 && sopt->sopt_dir == SOPT_GET)
+ error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize);
+ free(pbuf, M_TEMP);
+ return (error);
+ }
switch (sopt->sopt_dir) {
case SOPT_SET:
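
The TCP_FUNCTION_BLK handling above lets a process pin a socket to a specific
TCP stack before any connection state has been built, and query which stack
(and how many pcbs share it) afterwards. A userland sketch; struct
tcp_function_set and its function_set_name/pcbcnt fields follow the fsn usage
above, while the field size and the default stack name "freebsd" are
assumptions:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <string.h>

    static int
    pin_tcp_stack(int s, const char *name)
    {
            struct tcp_function_set fsn;
            socklen_t len = sizeof(fsn);

            memset(&fsn, 0, sizeof(fsn));
            strlcpy(fsn.function_set_name, name,
                sizeof(fsn.function_set_name));
            if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &fsn,
                sizeof(fsn)) < 0)
                    return (-1);    /* ENOENT: no such stack loaded */
            if (getsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &fsn, &len) < 0)
                    return (-1);
            return (strcmp(fsn.function_set_name, name) == 0 ? 0 : 1);
    }
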
@@ -1408,7 +1599,7 @@ unlock_and_done:
else if (tp->t_flags & TF_NOPUSH) {
tp->t_flags &= ~TF_NOPUSH;
if (TCPS_HAVEESTABLISHED(tp->t_state))
- error = tcp_output(tp);
+ error = tp->t_fb->tfb_tcp_output(tp);
}
goto unlock_and_done;
@@ -1434,50 +1625,45 @@ unlock_and_done:
case TCP_CONGESTION:
INP_WUNLOCK(inp);
- bzero(buf, sizeof(buf));
- error = sooptcopyin(sopt, &buf, sizeof(buf), 1);
+ error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
if (error)
break;
+ buf[sopt->sopt_valsize] = '\0';
INP_WLOCK_RECHECK(inp);
+ CC_LIST_RLOCK();
+ STAILQ_FOREACH(algo, &cc_list, entries)
+ if (strncmp(buf, algo->name,
+ TCP_CA_NAME_MAX) == 0)
+ break;
+ CC_LIST_RUNLOCK();
+ if (algo == NULL) {
+ INP_WUNLOCK(inp);
+ error = EINVAL;
+ break;
+ }
/*
- * Return EINVAL if we can't find the requested cc algo.
+ * We hold a write lock over the tcb so it's safe to
+ * do these things without ordering concerns.
*/
- error = EINVAL;
- CC_LIST_RLOCK();
- STAILQ_FOREACH(algo, &cc_list, entries) {
- if (strncmp(buf, algo->name, TCP_CA_NAME_MAX)
- == 0) {
- /* We've found the requested algo. */
- error = 0;
- /*
- * We hold a write lock over the tcb
- * so it's safe to do these things
- * without ordering concerns.
- */
- if (CC_ALGO(tp)->cb_destroy != NULL)
- CC_ALGO(tp)->cb_destroy(tp->ccv);
- CC_ALGO(tp) = algo;
- /*
- * If something goes pear shaped
- * initialising the new algo,
- * fall back to newreno (which
- * does not require initialisation).
- */
- if (algo->cb_init != NULL)
- if (algo->cb_init(tp->ccv) > 0) {
- CC_ALGO(tp) = &newreno_cc_algo;
- /*
- * The only reason init
- * should fail is
- * because of malloc.
- */
- error = ENOMEM;
- }
- break; /* Break the STAILQ_FOREACH. */
- }
+ if (CC_ALGO(tp)->cb_destroy != NULL)
+ CC_ALGO(tp)->cb_destroy(tp->ccv);
+ CC_ALGO(tp) = algo;
+ /*
+ * If something goes pear shaped initialising the new
+ * algo, fall back to newreno (which does not
+ * require initialisation).
+ */
+ if (algo->cb_init != NULL &&
+ algo->cb_init(tp->ccv) != 0) {
+ CC_ALGO(tp) = &newreno_cc_algo;
+ /*
+ * The only reason init should fail is
+ * because of malloc.
+ */
+ error = ENOMEM;
}
- CC_LIST_RUNLOCK();
- goto unlock_and_done;
+ INP_WUNLOCK(inp);
+ break;
case TCP_KEEPIDLE:
case TCP_KEEPINTVL:
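
The rewritten TCP_CONGESTION SET path above looks the algorithm up before
touching the tcb and NUL-terminates the user-supplied name explicitly; the
GET side further down now copies out only the bytes of the name actually
present. Typical userland use, assuming the requested module (for example
cc_cubic.ko) is loaded:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>
    #include <string.h>

    static int
    switch_cc(int s, const char *algo)
    {
            char buf[TCP_CA_NAME_MAX];
            socklen_t len = sizeof(buf);

            if (setsockopt(s, IPPROTO_TCP, TCP_CONGESTION, algo,
                strlen(algo) + 1) < 0)
                    return (-1);    /* EINVAL if the algorithm is unknown */
            if (getsockopt(s, IPPROTO_TCP, TCP_CONGESTION, buf, &len) < 0)
                    return (-1);
            printf("congestion control now: %s\n", buf);
            return (0);
    }
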
@@ -1535,8 +1721,49 @@ unlock_and_done:
(TP_MAXIDLE(tp) > 0))
tcp_timer_activate(tp, TT_2MSL,
TP_MAXIDLE(tp));
+ goto unlock_and_done;
+
+#ifdef TCPPCAP
+ case TCP_PCAP_OUT:
+ case TCP_PCAP_IN:
INP_WUNLOCK(inp);
- break;
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ INP_WLOCK_RECHECK(inp);
+ if (optval >= 0)
+				tcp_pcap_set_sock_max(
+				    (sopt->sopt_name == TCP_PCAP_OUT) ?
+ &(tp->t_outpkts) : &(tp->t_inpkts),
+ optval);
+ else
+ error = EINVAL;
+ goto unlock_and_done;
+#endif
+
+#ifdef TCP_RFC7413
+ case TCP_FASTOPEN:
+ INP_WUNLOCK(inp);
+ if (!V_tcp_fastopen_enabled)
+ return (EPERM);
+
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ INP_WLOCK_RECHECK(inp);
+ if (optval) {
+ tp->t_flags |= TF_FASTOPEN;
+ if ((tp->t_state == TCPS_LISTEN) &&
+ (tp->t_tfo_pending == NULL))
+ tp->t_tfo_pending =
+ tcp_fastopen_alloc_counter();
+ } else
+ tp->t_flags &= ~TF_FASTOPEN;
+ goto unlock_and_done;
+#endif
default:
INP_WUNLOCK(inp);
@@ -1582,11 +1809,48 @@ unlock_and_done:
error = sooptcopyout(sopt, &ti, sizeof ti);
break;
case TCP_CONGESTION:
- bzero(buf, sizeof(buf));
- strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
+ len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, buf, len + 1);
+ break;
+ case TCP_KEEPIDLE:
+ case TCP_KEEPINTVL:
+ case TCP_KEEPINIT:
+ case TCP_KEEPCNT:
+ switch (sopt->sopt_name) {
+ case TCP_KEEPIDLE:
+ ui = tp->t_keepidle / hz;
+ break;
+ case TCP_KEEPINTVL:
+ ui = tp->t_keepintvl / hz;
+ break;
+ case TCP_KEEPINIT:
+ ui = tp->t_keepinit / hz;
+ break;
+ case TCP_KEEPCNT:
+ ui = tp->t_keepcnt;
+ break;
+ }
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &ui, sizeof(ui));
+ break;
+#ifdef TCPPCAP
+ case TCP_PCAP_OUT:
+ case TCP_PCAP_IN:
+			optval = tcp_pcap_get_sock_max(
+			    (sopt->sopt_name == TCP_PCAP_OUT) ?
+ &(tp->t_outpkts) : &(tp->t_inpkts));
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+#endif
+
+#ifdef TCP_RFC7413
+ case TCP_FASTOPEN:
+ optval = tp->t_flags & TF_FASTOPEN;
INP_WUNLOCK(inp);
- error = sooptcopyout(sopt, buf, TCP_CA_NAME_MAX);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
break;
+#endif
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
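
The GET side now also mirrors the keepalive options, reporting the
per-connection values in seconds (ticks divided by hz) and TCP_KEEPCNT as a
plain probe count. For example, from userland:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>

    static void
    print_keepalive(int s)
    {
            u_int idle, intvl, cnt;
            socklen_t len = sizeof(u_int);

            if (getsockopt(s, IPPROTO_TCP, TCP_KEEPIDLE, &idle, &len) == 0 &&
                getsockopt(s, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, &len) == 0 &&
                getsockopt(s, IPPROTO_TCP, TCP_KEEPCNT, &cnt, &len) == 0)
                    printf("keepalive: idle %us, interval %us, %u probes\n",
                        idle, intvl, cnt);
    }
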
@@ -1597,6 +1861,7 @@ unlock_and_done:
return (error);
}
#undef INP_WLOCK_RECHECK
+#undef INP_WLOCK_RECHECK_CLEANUP
/*
* Attach TCP protocol to socket, allocating
@@ -1617,10 +1882,10 @@ tcp_attach(struct socket *so)
}
so->so_rcv.sb_flags |= SB_AUTOSIZE;
so->so_snd.sb_flags |= SB_AUTOSIZE;
- INP_INFO_WLOCK(&V_tcbinfo);
+ INP_INFO_RLOCK(&V_tcbinfo);
error = in_pcballoc(so, &V_tcbinfo);
if (error) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (error);
}
inp = sotoinpcb(so);
@@ -1636,12 +1901,13 @@ tcp_attach(struct socket *so)
if (tp == NULL) {
in_pcbdetach(inp);
in_pcbfree(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
return (ENOBUFS);
}
tp->t_state = TCPS_CLOSED;
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ TCPSTATES_INC(TCPS_CLOSED);
return (0);
}
@@ -1659,7 +1925,7 @@ tcp_disconnect(struct tcpcb *tp)
struct inpcb *inp = tp->t_inpcb;
struct socket *so = inp->inp_socket;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
/*
@@ -1679,7 +1945,7 @@ tcp_disconnect(struct tcpcb *tp)
sbflush(&so->so_rcv);
tcp_usrclosed(tp);
if (!(inp->inp_flags & INP_DROPPED))
- tcp_output(tp);
+ tp->t_fb->tfb_tcp_output(tp);
}
}
@@ -1697,7 +1963,7 @@ static void
tcp_usrclosed(struct tcpcb *tp)
{
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
switch (tp->t_state) {
@@ -1705,9 +1971,9 @@ tcp_usrclosed(struct tcpcb *tp)
#ifdef TCP_OFFLOAD
tcp_offload_listen_stop(tp);
#endif
+ tcp_state_change(tp, TCPS_CLOSED);
/* FALLTHROUGH */
case TCPS_CLOSED:
- tp->t_state = TCPS_CLOSED;
tp = tcp_close(tp);
/*
* tcp_close() should never return NULL here as the socket is
@@ -1723,11 +1989,11 @@ tcp_usrclosed(struct tcpcb *tp)
break;
case TCPS_ESTABLISHED:
- tp->t_state = TCPS_FIN_WAIT_1;
+ tcp_state_change(tp, TCPS_FIN_WAIT_1);
break;
case TCPS_CLOSE_WAIT:
- tp->t_state = TCPS_LAST_ACK;
+ tcp_state_change(tp, TCPS_LAST_ACK);
break;
}
if (tp->t_state >= TCPS_FIN_WAIT_2) {
@@ -1910,6 +2176,10 @@ db_print_tflags(u_int t_flags)
db_printf("%sTF_ECN_PERMIT", comma ? ", " : "");
comma = 1;
}
+ if (t_flags & TF_FASTOPEN) {
+ db_printf("%sTF_FASTOPEN", comma ? ", " : "");
+ comma = 1;
+ }
}
static void
@@ -1984,8 +2254,8 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
"0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
db_print_indent(indent);
- db_printf("t_maxopd: %u t_rcvtime: %u t_startime: %u\n",
- tp->t_maxopd, tp->t_rcvtime, tp->t_starttime);
+ db_printf("t_rcvtime: %u t_startime: %u\n",
+ tp->t_rcvtime, tp->t_starttime);
db_print_indent(indent);
db_printf("t_rttime: %u t_rtsq: 0x%08x\n",