diff options
Diffstat (limited to 'freebsd/sys/netinet/tcp_usrreq.c')
-rw-r--r-- | freebsd/sys/netinet/tcp_usrreq.c | 568 |
1 files changed, 419 insertions, 149 deletions
diff --git a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c index 61711a6e..d5fa680f 100644 --- a/freebsd/sys/netinet/tcp_usrreq.c +++ b/freebsd/sys/netinet/tcp_usrreq.c @@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$"); #include <sys/systm.h> #include <sys/limits.h> #include <sys/malloc.h> +#include <sys/refcount.h> #include <sys/kernel.h> #include <sys/sysctl.h> #include <sys/mbuf.h> @@ -66,11 +67,12 @@ __FBSDID("$FreeBSD$"); #endif #include <net/if.h> +#include <net/if_var.h> #include <net/route.h> #include <net/vnet.h> -#include <netinet/cc.h> #include <netinet/in.h> +#include <netinet/in_kdtrace.h> #include <netinet/in_pcb.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> @@ -81,11 +83,19 @@ __FBSDID("$FreeBSD$"); #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> #endif +#ifdef TCP_RFC7413 +#include <netinet/tcp_fastopen.h> +#endif +#include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcpip.h> +#include <netinet/cc/cc.h> +#ifdef TCPPCAP +#include <netinet/tcp_pcap.h> +#endif #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif @@ -147,6 +157,7 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td) tp = intotcpcb(inp); out: TCPDEBUG2(PRU_ATTACH); + TCP_PROBE2(debug__user, tp, PRU_ATTACH); return error; } @@ -164,7 +175,7 @@ tcp_detach(struct socket *so, struct inpcb *inp) { struct tcpcb *tp; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp")); @@ -184,6 +195,21 @@ tcp_detach(struct socket *so, struct inpcb *inp) * present until timewait ends. * * XXXRW: Would it be cleaner to free the tcptw here? + * + * Astute question indeed, from twtcp perspective there are + * three cases to consider: + * + * #1 tcp_detach is called at tcptw creation time by + * tcp_twstart, then do not discard the newly created tcptw + * and leave inpcb present until timewait ends + * #2 tcp_detach is called at timewait end (or reuse) by + * tcp_twclose, then the tcptw has already been discarded + * (or reused) and inpcb is freed here + * #3 tcp_detach is called() after timewait ends (or reuse) + * (e.g. by soclose), then tcptw has already been discarded + * (or reused) and inpcb is freed here + * + * In all three cases the tcptw should not be freed here. */ if (inp->inp_flags & INP_DROPPED) { KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && " @@ -227,15 +253,20 @@ static void tcp_usr_detach(struct socket *so) { struct inpcb *inp; + int rlock = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); - INP_INFO_WLOCK(&V_tcbinfo); + if (!INP_INFO_WLOCKED(&V_tcbinfo)) { + INP_INFO_RLOCK(&V_tcbinfo); + rlock = 1; + } INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_detach: inp_socket == NULL")); tcp_detach(so, inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + if (rlock) + INP_INFO_RUNLOCK(&V_tcbinfo); } #ifdef INET @@ -276,6 +307,7 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); + TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); @@ -336,6 +368,7 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); + TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); } @@ -369,7 +402,7 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { - tp->t_state = TCPS_LISTEN; + tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) @@ -378,8 +411,13 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) } SOCK_UNLOCK(so); +#ifdef TCP_RFC7413 + if (tp->t_flags & TF_FASTOPEN) + tp->t_tfo_pending = tcp_fastopen_alloc_counter(); +#endif out: TCPDEBUG2(PRU_LISTEN); + TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } @@ -414,7 +452,7 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) } INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { - tp->t_state = TCPS_LISTEN; + tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) @@ -423,8 +461,13 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) } SOCK_UNLOCK(so); +#ifdef TCP_RFC7413 + if (tp->t_flags & TF_FASTOPEN) + tp->t_tfo_pending = tcp_fastopen_alloc_counter(); +#endif out: TCPDEBUG2(PRU_LISTEN); + TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } @@ -462,8 +505,12 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; + if (inp->inp_flags & INP_TIMEWAIT) { + error = EADDRINUSE; + goto out; + } + if (inp->inp_flags & INP_DROPPED) { + error = ECONNREFUSED; goto out; } tp = intotcpcb(inp); @@ -477,9 +524,10 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); + TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } @@ -509,8 +557,12 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; + if (inp->inp_flags & INP_TIMEWAIT) { + error = EADDRINUSE; + goto out; + } + if (inp->inp_flags & INP_DROPPED) { + error = ECONNREFUSED; goto out; } tp = intotcpcb(inp); @@ -543,7 +595,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) (error = tcp_offload_connect(so, nam)) == 0) goto out; #endif - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); goto out; } #endif @@ -561,10 +613,11 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); + TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } @@ -589,11 +642,13 @@ tcp_usr_disconnect(struct socket *so) int error = 0; TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + if (inp->inp_flags & INP_TIMEWAIT) + goto out; + if (inp->inp_flags & INP_DROPPED) { error = ECONNRESET; goto out; } @@ -602,8 +657,9 @@ tcp_usr_disconnect(struct socket *so) tcp_disconnect(tp); out: TCPDEBUG2(PRU_DISCONNECT); + TCP_PROBE2(debug__user, tp, PRU_DISCONNECT); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } @@ -611,13 +667,6 @@ out: /* * Accept a connection. Essentially all the work is done at higher levels; * just return the address of the peer, storing through addr. - * - * The rationale for acquiring the tcbinfo lock here is somewhat complicated, - * and is described in detail in the commit log entry for r175612. Acquiring - * it delays an accept(2) racing with sonewconn(), which inserts the socket - * before the inpcb address/port fields are initialized. A better fix would - * prevent the socket from being placed in the listen queue until all fields - * are fully initialized. */ static int tcp_usr_accept(struct socket *so, struct sockaddr **nam) @@ -634,7 +683,6 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); - INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; @@ -653,8 +701,8 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam) out: TCPDEBUG2(PRU_ACCEPT); + TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) *nam = in_sockaddr(port, &addr); return error; @@ -704,6 +752,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) out: TCPDEBUG2(PRU_ACCEPT); + TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) { @@ -727,7 +776,7 @@ tcp_usr_shutdown(struct socket *so) struct tcpcb *tp = NULL; TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); INP_WLOCK(inp); @@ -740,12 +789,13 @@ tcp_usr_shutdown(struct socket *so) socantsendmore(so); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_SHUTDOWN); + TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } @@ -770,15 +820,28 @@ tcp_usr_rcvd(struct socket *so, int flags) } tp = intotcpcb(inp); TCPDEBUG1(); +#ifdef TCP_RFC7413 + /* + * For passively-created TFO connections, don't attempt a window + * update while still in SYN_RECEIVED as this may trigger an early + * SYN|ACK. It is preferable to have the SYN|ACK be sent along with + * application response data, or failing that, when the DELACK timer + * expires. + */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED)) + goto out; +#endif #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) tcp_offload_rcvd(tp); else #endif - tcp_output(tp); + tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_RCVD); + TCP_PROBE2(debug__user, tp, PRU_RCVD); INP_WUNLOCK(inp); return (error); } @@ -807,14 +870,18 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * this call. */ if (flags & PRUS_EOF) - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { if (control) m_freem(control); - if (m) + /* + * In case of PRUS_NOTREADY, tcp_usr_ready() is responsible + * for freeing memory. + */ + if (m && (flags & PRUS_NOTREADY) == 0) m_freem(m); error = ECONNRESET; goto out; @@ -836,13 +903,12 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, m_freem(control); /* empty control, just free it */ } if (!(flags & PRUS_OOB)) { - sbappendstream(&so->so_snd, m); + sbappendstream(&so->so_snd, m, flags); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and - * initialize maxseg/maxopd using peer's cached - * MSS. + * initialize maxseg using peer's cached MSS. */ #ifdef INET6 if (isipv6) @@ -864,14 +930,15 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * Close the send side of the connection after * the data is sent. */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); socantsendmore(so); tcp_usrclosed(tp); } - if (!(inp->inp_flags & INP_DROPPED)) { + if (!(inp->inp_flags & INP_DROPPED) && + !(flags & PRUS_NOTREADY)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } @@ -894,14 +961,13 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * of data past the urgent section. * Otherwise, snd_up should be one lower. */ - sbappendstream_locked(&so->so_snd, m); + sbappendstream_locked(&so->so_snd, m, flags); SOCKBUF_UNLOCK(&so->so_snd); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and - * initialize maxseg/maxopd using peer's cached - * MSS. + * initialize maxseg using peer's cached MSS. */ #ifdef INET6 if (isipv6) @@ -918,17 +984,48 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } - tp->snd_up = tp->snd_una + so->so_snd.sb_cc; - tp->t_flags |= TF_FORCEDATA; - error = tcp_output(tp); - tp->t_flags &= ~TF_FORCEDATA; + tp->snd_up = tp->snd_una + sbavail(&so->so_snd); + if (!(flags & PRUS_NOTREADY)) { + tp->t_flags |= TF_FORCEDATA; + error = tp->t_fb->tfb_tcp_output(tp); + tp->t_flags &= ~TF_FORCEDATA; + } } out: TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); + TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB : + ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); INP_WUNLOCK(inp); if (flags & PRUS_EOF) - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + return (error); +} + +static int +tcp_usr_ready(struct socket *so, struct mbuf *m, int count) +{ + struct inpcb *inp; + struct tcpcb *tp; + int error; + + inp = sotoinpcb(so); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + INP_WUNLOCK(inp); + for (int i = 0; i < count; i++) + m = m_free(m); + return (ECONNRESET); + } + tp = intotcpcb(inp); + + SOCKBUF_LOCK(&so->so_snd); + error = sbready(&so->so_snd, m, count); + SOCKBUF_UNLOCK(&so->so_snd); + if (error == 0) + error = tp->t_fb->tfb_tcp_output(tp); + INP_WUNLOCK(inp); + return (error); } @@ -945,7 +1042,7 @@ tcp_usr_abort(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_abort: inp_socket == NULL")); @@ -959,6 +1056,7 @@ tcp_usr_abort(struct socket *so) TCPDEBUG1(); tcp_drop(tp, ECONNABORTED); TCPDEBUG2(PRU_ABORT); + TCP_PROBE2(debug__user, tp, PRU_ABORT); } if (!(inp->inp_flags & INP_DROPPED)) { SOCK_LOCK(so); @@ -967,7 +1065,7 @@ tcp_usr_abort(struct socket *so) inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); } /* @@ -983,7 +1081,7 @@ tcp_usr_close(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_close: inp_socket == NULL")); @@ -998,6 +1096,7 @@ tcp_usr_close(struct socket *so) TCPDEBUG1(); tcp_disconnect(tp); TCPDEBUG2(PRU_CLOSE); + TCP_PROBE2(debug__user, tp, PRU_CLOSE); } if (!(inp->inp_flags & INP_DROPPED)) { SOCK_LOCK(so); @@ -1006,7 +1105,7 @@ tcp_usr_close(struct socket *so) inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); } /* @@ -1047,6 +1146,7 @@ tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) out: TCPDEBUG2(PRU_RCVOOB); + TCP_PROBE2(debug__user, tp, PRU_RCVOOB); INP_WUNLOCK(inp); return (error); } @@ -1066,6 +1166,7 @@ struct pr_usrreqs tcp_usrreqs = { .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, + .pru_ready = tcp_usr_ready, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, @@ -1088,6 +1189,7 @@ struct pr_usrreqs tcp6_usrreqs = { .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, + .pru_ready = tcp_usr_ready, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in6_mapped_sockaddr, .pru_sosetlabel = in_pcbsosetlabel, @@ -1154,7 +1256,7 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) soisconnecting(so); TCPSTAT_INC(tcps_connattempt); - tp->t_state = TCPS_SYN_SENT; + tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); @@ -1170,10 +1272,7 @@ out: static int tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { - struct inpcb *inp = tp->t_inpcb, *oinp; - struct socket *so = inp->inp_socket; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; - struct in6_addr addr6; + struct inpcb *inp = tp->t_inpcb; int error; INP_WLOCK_ASSERT(inp); @@ -1184,39 +1283,9 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) if (error) goto out; } - - /* - * Cannot simply call in_pcbconnect, because there might be an - * earlier incarnation of this same connection still in - * TIME_WAIT state, creating an ADDRINUSE error. - * in6_pcbladdr() also handles scope zone IDs. - * - * XXXRW: We wouldn't need to expose in6_pcblookup_hash_locked() - * outside of in6_pcb.c if there were an in6_pcbconnect_setup(). - */ - error = in6_pcbladdr(inp, nam, &addr6); - if (error) - goto out; - oinp = in6_pcblookup_hash_locked(inp->inp_pcbinfo, - &sin6->sin6_addr, sin6->sin6_port, - IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) - ? &addr6 - : &inp->in6p_laddr, - inp->inp_lport, 0, NULL); - if (oinp) { - error = EADDRINUSE; + error = in6_pcbconnect(inp, nam, td->td_ucred); + if (error != 0) goto out; - } - if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) - inp->in6p_laddr = addr6; - inp->in6p_faddr = sin6->sin6_addr; - inp->inp_fport = sin6->sin6_port; - /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ - inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; - if (inp->inp_flags & IN6P_AUTOFLOWLABEL) - inp->inp_flow |= - (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); - in_pcbrehash(inp); INP_HASH_WUNLOCK(&V_tcbinfo); /* Compute window scaling to request. */ @@ -1224,9 +1293,9 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; - soisconnecting(so); + soisconnecting(inp->inp_socket); TCPSTAT_INC(tcps_connattempt); - tp->t_state = TCPS_SYN_SENT; + tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); @@ -1294,25 +1363,25 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) * has to revalidate that the connection is still valid for the socket * option. */ -#define INP_WLOCK_RECHECK(inp) do { \ +#define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do { \ INP_WLOCK(inp); \ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ INP_WUNLOCK(inp); \ + cleanup; \ return (ECONNRESET); \ } \ tp = intotcpcb(inp); \ } while(0) +#define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */) int tcp_ctloutput(struct socket *so, struct sockopt *sopt) { - int error, opt, optval; - u_int ui; + int error; struct inpcb *inp; struct tcpcb *tp; - struct tcp_info ti; - char buf[TCP_CA_NAME_MAX]; - struct cc_algo *algo; + struct tcp_function_block *blk; + struct tcp_function_set fsn; error = 0; inp = sotoinpcb(so); @@ -1340,6 +1409,128 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) INP_WUNLOCK(inp); return (ECONNRESET); } + tp = intotcpcb(inp); + /* + * Protect the TCP option TCP_FUNCTION_BLK so + * that a sub-function can *never* overwrite this. + */ + if ((sopt->sopt_dir == SOPT_SET) && + (sopt->sopt_name == TCP_FUNCTION_BLK)) { + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &fsn, sizeof fsn, + sizeof fsn); + if (error) + return (error); + INP_WLOCK_RECHECK(inp); + blk = find_and_ref_tcp_functions(&fsn); + if (blk == NULL) { + INP_WUNLOCK(inp); + return (ENOENT); + } + if (tp->t_fb == blk) { + /* You already have this */ + refcount_release(&blk->tfb_refcnt); + INP_WUNLOCK(inp); + return (0); + } + if (tp->t_state != TCPS_CLOSED) { + int error=EINVAL; + /* + * The user has advanced the state + * past the initial point, we may not + * be able to switch. + */ + if (blk->tfb_tcp_handoff_ok != NULL) { + /* + * Does the stack provide a + * query mechanism, if so it may + * still be possible? + */ + error = (*blk->tfb_tcp_handoff_ok)(tp); + } + if (error) { + refcount_release(&blk->tfb_refcnt); + INP_WUNLOCK(inp); + return(error); + } + } + if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { + refcount_release(&blk->tfb_refcnt); + INP_WUNLOCK(inp); + return (ENOENT); + } + /* + * Release the old refcnt, the + * lookup acquired a ref on the + * new one already. + */ + if (tp->t_fb->tfb_tcp_fb_fini) { + /* + * Tell the stack to cleanup with 0 i.e. + * the tcb is not going away. + */ + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); + } + refcount_release(&tp->t_fb->tfb_refcnt); + tp->t_fb = blk; + if (tp->t_fb->tfb_tcp_fb_init) { + (*tp->t_fb->tfb_tcp_fb_init)(tp); + } +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) { + tcp_offload_ctloutput(tp, sopt->sopt_dir, + sopt->sopt_name); + } +#endif + INP_WUNLOCK(inp); + return (error); + } else if ((sopt->sopt_dir == SOPT_GET) && + (sopt->sopt_name == TCP_FUNCTION_BLK)) { + strcpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name); + fsn.pcbcnt = tp->t_fb->tfb_refcnt; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &fsn, sizeof fsn); + return (error); + } + /* Pass in the INP locked, called must unlock it */ + return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp)); +} + +int +tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) +{ + int error, opt, optval; + u_int ui; + struct tcp_info ti; + struct cc_algo *algo; + char *pbuf, buf[TCP_CA_NAME_MAX]; + size_t len; + + /* + * For TCP_CCALGOOPT forward the control to CC module, for both + * SOPT_SET and SOPT_GET. + */ + switch (sopt->sopt_name) { + case TCP_CCALGOOPT: + INP_WUNLOCK(inp); + pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO); + error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize, + sopt->sopt_valsize); + if (error) { + free(pbuf, M_TEMP); + return (error); + } + INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP)); + if (CC_ALGO(tp)->ctl_output != NULL) + error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf); + else + error = ENOENT; + INP_WUNLOCK(inp); + if (error == 0 && sopt->sopt_dir == SOPT_GET) + error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize); + free(pbuf, M_TEMP); + return (error); + } switch (sopt->sopt_dir) { case SOPT_SET: @@ -1408,7 +1599,7 @@ unlock_and_done: else if (tp->t_flags & TF_NOPUSH) { tp->t_flags &= ~TF_NOPUSH; if (TCPS_HAVEESTABLISHED(tp->t_state)) - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); } goto unlock_and_done; @@ -1434,50 +1625,45 @@ unlock_and_done: case TCP_CONGESTION: INP_WUNLOCK(inp); - bzero(buf, sizeof(buf)); - error = sooptcopyin(sopt, &buf, sizeof(buf), 1); + error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1); if (error) break; + buf[sopt->sopt_valsize] = '\0'; INP_WLOCK_RECHECK(inp); + CC_LIST_RLOCK(); + STAILQ_FOREACH(algo, &cc_list, entries) + if (strncmp(buf, algo->name, + TCP_CA_NAME_MAX) == 0) + break; + CC_LIST_RUNLOCK(); + if (algo == NULL) { + INP_WUNLOCK(inp); + error = EINVAL; + break; + } /* - * Return EINVAL if we can't find the requested cc algo. + * We hold a write lock over the tcb so it's safe to + * do these things without ordering concerns. */ - error = EINVAL; - CC_LIST_RLOCK(); - STAILQ_FOREACH(algo, &cc_list, entries) { - if (strncmp(buf, algo->name, TCP_CA_NAME_MAX) - == 0) { - /* We've found the requested algo. */ - error = 0; - /* - * We hold a write lock over the tcb - * so it's safe to do these things - * without ordering concerns. - */ - if (CC_ALGO(tp)->cb_destroy != NULL) - CC_ALGO(tp)->cb_destroy(tp->ccv); - CC_ALGO(tp) = algo; - /* - * If something goes pear shaped - * initialising the new algo, - * fall back to newreno (which - * does not require initialisation). - */ - if (algo->cb_init != NULL) - if (algo->cb_init(tp->ccv) > 0) { - CC_ALGO(tp) = &newreno_cc_algo; - /* - * The only reason init - * should fail is - * because of malloc. - */ - error = ENOMEM; - } - break; /* Break the STAILQ_FOREACH. */ - } + if (CC_ALGO(tp)->cb_destroy != NULL) + CC_ALGO(tp)->cb_destroy(tp->ccv); + CC_ALGO(tp) = algo; + /* + * If something goes pear shaped initialising the new + * algo, fall back to newreno (which does not + * require initialisation). + */ + if (algo->cb_init != NULL && + algo->cb_init(tp->ccv) != 0) { + CC_ALGO(tp) = &newreno_cc_algo; + /* + * The only reason init should fail is + * because of malloc. + */ + error = ENOMEM; } - CC_LIST_RUNLOCK(); - goto unlock_and_done; + INP_WUNLOCK(inp); + break; case TCP_KEEPIDLE: case TCP_KEEPINTVL: @@ -1535,8 +1721,49 @@ unlock_and_done: (TP_MAXIDLE(tp) > 0)) tcp_timer_activate(tp, TT_2MSL, TP_MAXIDLE(tp)); + goto unlock_and_done; + +#ifdef TCPPCAP + case TCP_PCAP_OUT: + case TCP_PCAP_IN: INP_WUNLOCK(inp); - break; + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + + INP_WLOCK_RECHECK(inp); + if (optval >= 0) + tcp_pcap_set_sock_max(TCP_PCAP_OUT ? + &(tp->t_outpkts) : &(tp->t_inpkts), + optval); + else + error = EINVAL; + goto unlock_and_done; +#endif + +#ifdef TCP_RFC7413 + case TCP_FASTOPEN: + INP_WUNLOCK(inp); + if (!V_tcp_fastopen_enabled) + return (EPERM); + + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + + INP_WLOCK_RECHECK(inp); + if (optval) { + tp->t_flags |= TF_FASTOPEN; + if ((tp->t_state == TCPS_LISTEN) && + (tp->t_tfo_pending == NULL)) + tp->t_tfo_pending = + tcp_fastopen_alloc_counter(); + } else + tp->t_flags &= ~TF_FASTOPEN; + goto unlock_and_done; +#endif default: INP_WUNLOCK(inp); @@ -1582,11 +1809,48 @@ unlock_and_done: error = sooptcopyout(sopt, &ti, sizeof ti); break; case TCP_CONGESTION: - bzero(buf, sizeof(buf)); - strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); + len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, buf, len + 1); + break; + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPINIT: + case TCP_KEEPCNT: + switch (sopt->sopt_name) { + case TCP_KEEPIDLE: + ui = tp->t_keepidle / hz; + break; + case TCP_KEEPINTVL: + ui = tp->t_keepintvl / hz; + break; + case TCP_KEEPINIT: + ui = tp->t_keepinit / hz; + break; + case TCP_KEEPCNT: + ui = tp->t_keepcnt; + break; + } + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &ui, sizeof(ui)); + break; +#ifdef TCPPCAP + case TCP_PCAP_OUT: + case TCP_PCAP_IN: + optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ? + &(tp->t_outpkts) : &(tp->t_inpkts)); + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; +#endif + +#ifdef TCP_RFC7413 + case TCP_FASTOPEN: + optval = tp->t_flags & TF_FASTOPEN; INP_WUNLOCK(inp); - error = sooptcopyout(sopt, buf, TCP_CA_NAME_MAX); + error = sooptcopyout(sopt, &optval, sizeof optval); break; +#endif default: INP_WUNLOCK(inp); error = ENOPROTOOPT; @@ -1597,6 +1861,7 @@ unlock_and_done: return (error); } #undef INP_WLOCK_RECHECK +#undef INP_WLOCK_RECHECK_CLEANUP /* * Attach TCP protocol to socket, allocating @@ -1617,10 +1882,10 @@ tcp_attach(struct socket *so) } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); error = in_pcballoc(so, &V_tcbinfo); if (error) { - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } inp = sotoinpcb(so); @@ -1636,12 +1901,13 @@ tcp_attach(struct socket *so) if (tp == NULL) { in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (ENOBUFS); } tp->t_state = TCPS_CLOSED; INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + TCPSTATES_INC(TCPS_CLOSED); return (0); } @@ -1659,7 +1925,7 @@ tcp_disconnect(struct tcpcb *tp) struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* @@ -1679,7 +1945,7 @@ tcp_disconnect(struct tcpcb *tp) sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - tcp_output(tp); + tp->t_fb->tfb_tcp_output(tp); } } @@ -1697,7 +1963,7 @@ static void tcp_usrclosed(struct tcpcb *tp) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); switch (tp->t_state) { @@ -1705,9 +1971,9 @@ tcp_usrclosed(struct tcpcb *tp) #ifdef TCP_OFFLOAD tcp_offload_listen_stop(tp); #endif + tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ case TCPS_CLOSED: - tp->t_state = TCPS_CLOSED; tp = tcp_close(tp); /* * tcp_close() should never return NULL here as the socket is @@ -1723,11 +1989,11 @@ tcp_usrclosed(struct tcpcb *tp) break; case TCPS_ESTABLISHED: - tp->t_state = TCPS_FIN_WAIT_1; + tcp_state_change(tp, TCPS_FIN_WAIT_1); break; case TCPS_CLOSE_WAIT: - tp->t_state = TCPS_LAST_ACK; + tcp_state_change(tp, TCPS_LAST_ACK); break; } if (tp->t_state >= TCPS_FIN_WAIT_2) { @@ -1910,6 +2176,10 @@ db_print_tflags(u_int t_flags) db_printf("%sTF_ECN_PERMIT", comma ? ", " : ""); comma = 1; } + if (t_flags & TF_FASTOPEN) { + db_printf("%sTF_FASTOPEN", comma ? ", " : ""); + comma = 1; + } } static void @@ -1984,8 +2254,8 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); - db_printf("t_maxopd: %u t_rcvtime: %u t_startime: %u\n", - tp->t_maxopd, tp->t_rcvtime, tp->t_starttime); + db_printf("t_rcvtime: %u t_startime: %u\n", + tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); db_printf("t_rttime: %u t_rtsq: 0x%08x\n", |