diff options
Diffstat (limited to 'freebsd/sys/netinet/tcp_subr.c')
-rw-r--r-- | freebsd/sys/netinet/tcp_subr.c | 1387 |
1 files changed, 1076 insertions, 311 deletions
diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c index b175c0c0..cff9bd7b 100644 --- a/freebsd/sys/netinet/tcp_subr.c +++ b/freebsd/sys/netinet/tcp_subr.c @@ -47,18 +47,21 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/sys/param.h> #include <sys/systm.h> #include <sys/callout.h> +#include <sys/eventhandler.h> #include <sys/hhook.h> #include <sys/kernel.h> #include <sys/khelp.h> #include <sys/sysctl.h> #include <sys/jail.h> #include <sys/malloc.h> +#include <sys/refcount.h> #include <sys/mbuf.h> #ifdef INET6 #include <sys/domain.h> #endif #include <sys/priv.h> #include <sys/proc.h> +#include <sys/sdt.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/protosw.h> @@ -68,10 +71,12 @@ __FBSDID("$FreeBSD$"); #include <net/route.h> #include <net/if.h> +#include <net/if_var.h> #include <net/vnet.h> -#include <netinet/cc.h> #include <netinet/in.h> +#include <netinet/in_fib.h> +#include <netinet/in_kdtrace.h> #include <netinet/in_pcb.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> @@ -79,22 +84,32 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip_icmp.h> #include <netinet/ip_var.h> #ifdef INET6 +#include <netinet/icmp6.h> #include <netinet/ip6.h> +#include <netinet6/in6_fib.h> #include <netinet6/in6_pcb.h> #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> #include <netinet6/nd6.h> #endif +#ifdef TCP_RFC7413 +#include <netinet/tcp_fastopen.h> +#endif +#include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_syncache.h> +#include <netinet/cc/cc.h> #ifdef INET6 #include <netinet6/tcp6_var.h> #endif #include <netinet/tcpip.h> +#ifdef TCPPCAP +#include <netinet/tcp_pcap.h> +#endif #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif @@ -125,6 +140,8 @@ VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif +struct rwlock tcp_function_lock; + static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { @@ -141,8 +158,8 @@ sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) return (error); } -SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); @@ -163,8 +180,8 @@ sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) return (error); } -SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ @@ -178,12 +195,12 @@ SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, * checking. This setting prevents us from sending too small packets. */ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); @@ -191,30 +208,30 @@ static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); -static int tcp_tcbhashsize = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, +static int tcp_tcbhashsize; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); -SYSCTL_VNET_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); static VNET_DEFINE(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); -static int tcp_soreceive_stream = 0; +static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); @@ -231,9 +248,193 @@ VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); +static void tcp_mtudisc(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); + +static struct tcp_function_block tcp_def_funcblk = { + "default", + tcp_output, + tcp_do_segment, + tcp_default_ctloutput, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + 0, + 0 +}; + +int t_functions_inited = 0; +struct tcp_funchead t_functions; +static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; + +static void +init_tcp_functions(void) +{ + if (t_functions_inited == 0) { + TAILQ_INIT(&t_functions); + rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0); + t_functions_inited = 1; + } +} + +static struct tcp_function_block * +find_tcp_functions_locked(struct tcp_function_set *fs) +{ + struct tcp_function *f; + struct tcp_function_block *blk=NULL; + + TAILQ_FOREACH(f, &t_functions, tf_next) { + if (strcmp(f->tf_fb->tfb_tcp_block_name, fs->function_set_name) == 0) { + blk = f->tf_fb; + break; + } + } + return(blk); +} + +static struct tcp_function_block * +find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) +{ + struct tcp_function_block *rblk=NULL; + struct tcp_function *f; + + TAILQ_FOREACH(f, &t_functions, tf_next) { + if (f->tf_fb == blk) { + rblk = blk; + if (s) { + *s = f; + } + break; + } + } + return (rblk); +} + +struct tcp_function_block * +find_and_ref_tcp_functions(struct tcp_function_set *fs) +{ + struct tcp_function_block *blk; + + rw_rlock(&tcp_function_lock); + blk = find_tcp_functions_locked(fs); + if (blk) + refcount_acquire(&blk->tfb_refcnt); + rw_runlock(&tcp_function_lock); + return(blk); +} + +struct tcp_function_block * +find_and_ref_tcp_fb(struct tcp_function_block *blk) +{ + struct tcp_function_block *rblk; + + rw_rlock(&tcp_function_lock); + rblk = find_tcp_fb_locked(blk, NULL); + if (rblk) + refcount_acquire(&rblk->tfb_refcnt); + rw_runlock(&tcp_function_lock); + return(rblk); +} + + +static int +sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) +{ + int error=ENOENT; + struct tcp_function_set fs; + struct tcp_function_block *blk; + + memset(&fs, 0, sizeof(fs)); + rw_rlock(&tcp_function_lock); + blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); + if (blk) { + /* Found him */ + strcpy(fs.function_set_name, blk->tfb_tcp_block_name); + fs.pcbcnt = blk->tfb_refcnt; + } + rw_runlock(&tcp_function_lock); + error = sysctl_handle_string(oidp, fs.function_set_name, + sizeof(fs.function_set_name), req); + + /* Check for error or no change */ + if (error != 0 || req->newptr == NULL) + return(error); + + rw_wlock(&tcp_function_lock); + blk = find_tcp_functions_locked(&fs); + if ((blk == NULL) || + (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { + error = ENOENT; + goto done; + } + tcp_func_set_ptr = blk; +done: + rw_wunlock(&tcp_function_lock); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, + CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_net_inet_default_tcp_functions, "A", + "Set/get the default TCP functions"); + +static int +sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) +{ + int error, cnt, linesz; + struct tcp_function *f; + char *buffer, *cp; + size_t bufsz, outsz; + + cnt = 0; + rw_rlock(&tcp_function_lock); + TAILQ_FOREACH(f, &t_functions, tf_next) { + cnt++; + } + rw_runlock(&tcp_function_lock); + + bufsz = (cnt+2) * (TCP_FUNCTION_NAME_LEN_MAX + 12) + 1; + buffer = malloc(bufsz, M_TEMP, M_WAITOK); + + error = 0; + cp = buffer; + + linesz = snprintf(cp, bufsz, "\n%-32s%c %s\n", "Stack", 'D', "PCB count"); + cp += linesz; + bufsz -= linesz; + outsz = linesz; + + rw_rlock(&tcp_function_lock); + TAILQ_FOREACH(f, &t_functions, tf_next) { + linesz = snprintf(cp, bufsz, "%-32s%c %u\n", + f->tf_fb->tfb_tcp_block_name, + (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ', + f->tf_fb->tfb_refcnt); + if (linesz >= bufsz) { + error = EOVERFLOW; + break; + } + cp += linesz; + bufsz -= linesz; + outsz += linesz; + } + rw_runlock(&tcp_function_lock); + if (error == 0) + error = sysctl_handle_string(oidp, buffer, outsz + 1, req); + free(buffer, M_TEMP); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, + CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, sysctl_net_inet_list_available, "A", + "list available TCP Function sets"); + /* * Target size of TCP PCB hash tables. Must be a power of two. * @@ -241,7 +442,7 @@ static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE -#define TCBHASHSIZE 512 +#define TCBHASHSIZE 0 #endif /* @@ -261,6 +462,8 @@ static VNET_DEFINE(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); +MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); + static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) @@ -288,48 +491,196 @@ tcp_inpcb_init(void *mem, int size, int flags) return (0); } +/* + * Take a value and get the next power of 2 that doesn't overflow. + * Used to size the tcp_inpcb hash buckets. + */ +static int +maketcp_hashsize(int size) +{ + int hashsize; + + /* + * auto tune. + * get the next power of 2 higher than maxsockets. + */ + hashsize = 1 << fls(size); + /* catch overflow, and just go one power of 2 smaller */ + if (hashsize < size) { + hashsize = 1 << (fls(size) - 1); + } + return (hashsize); +} + +int +register_tcp_functions(struct tcp_function_block *blk, int wait) +{ + struct tcp_function_block *lblk; + struct tcp_function *n; + struct tcp_function_set fs; + + if (t_functions_inited == 0) { + init_tcp_functions(); + } + if ((blk->tfb_tcp_output == NULL) || + (blk->tfb_tcp_do_segment == NULL) || + (blk->tfb_tcp_ctloutput == NULL) || + (strlen(blk->tfb_tcp_block_name) == 0)) { + /* + * These functions are required and you + * need a name. + */ + return (EINVAL); + } + if (blk->tfb_tcp_timer_stop_all || + blk->tfb_tcp_timer_activate || + blk->tfb_tcp_timer_active || + blk->tfb_tcp_timer_stop) { + /* + * If you define one timer function you + * must have them all. + */ + if ((blk->tfb_tcp_timer_stop_all == NULL) || + (blk->tfb_tcp_timer_activate == NULL) || + (blk->tfb_tcp_timer_active == NULL) || + (blk->tfb_tcp_timer_stop == NULL)) { + return (EINVAL); + } + } + n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); + if (n == NULL) { + return (ENOMEM); + } + n->tf_fb = blk; + strcpy(fs.function_set_name, blk->tfb_tcp_block_name); + rw_wlock(&tcp_function_lock); + lblk = find_tcp_functions_locked(&fs); + if (lblk) { + /* Duplicate name space not allowed */ + rw_wunlock(&tcp_function_lock); + free(n, M_TCPFUNCTIONS); + return (EALREADY); + } + refcount_init(&blk->tfb_refcnt, 0); + blk->tfb_flags = 0; + TAILQ_INSERT_TAIL(&t_functions, n, tf_next); + rw_wunlock(&tcp_function_lock); + return(0); +} + +int +deregister_tcp_functions(struct tcp_function_block *blk) +{ + struct tcp_function_block *lblk; + struct tcp_function *f; + int error=ENOENT; + + if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { + /* You can't un-register the default */ + return (EPERM); + } + rw_wlock(&tcp_function_lock); + if (blk == tcp_func_set_ptr) { + /* You can't free the current default */ + rw_wunlock(&tcp_function_lock); + return (EBUSY); + } + if (blk->tfb_refcnt) { + /* Still tcb attached, mark it. */ + blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; + rw_wunlock(&tcp_function_lock); + return (EBUSY); + } + lblk = find_tcp_fb_locked(blk, &f); + if (lblk) { + /* Found */ + TAILQ_REMOVE(&t_functions, f, tf_next); + f->tf_fb = NULL; + free(f, M_TCPFUNCTIONS); + error = 0; + } + rw_wunlock(&tcp_function_lock); + return (error); +} + void tcp_init(void) { + const char *tcbhash_tuneable; int hashsize; + tcbhash_tuneable = "net.inet.tcp.tcbhashsize"; + if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); - hashsize = TCBHASHSIZE; - TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); + TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); + if (hashsize == 0) { + /* + * Auto tune the hash size based on maxsockets. + * A perfect hash would have a 1:1 mapping + * (hashsize = maxsockets) however it's been + * suggested that O(2) average is better. + */ + hashsize = maketcp_hashsize(maxsockets / 4); + /* + * Our historical default is 512, + * do not autotune lower than this. + */ + if (hashsize < 512) + hashsize = 512; + if (bootverbose && IS_DEFAULT_VNET(curvnet)) + printf("%s: %s auto tuned to %d\n", __func__, + tcbhash_tuneable, hashsize); + } + /* + * We require a hashsize to be a power of two. + * Previously if it was not a power of two we would just reset it + * back to 512, which could be a nasty surprise if you did not notice + * the error message. + * Instead what we do is clip it to the closest power of two lower + * than the specified hash value. + */ if (!powerof2(hashsize)) { - printf("WARNING: TCB hash size not a power of 2\n"); - hashsize = 512; /* safe default */ + int oldhashsize = hashsize; + + hashsize = maketcp_hashsize(hashsize); + /* prevent absurdly low value */ + if (hashsize < 16) + hashsize = 16; + printf("%s: WARNING: TCB hash size not a power of 2, " + "clipped from %d to %d.\n", __func__, oldhashsize, + hashsize); } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, - "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE, - IPI_HASHFIELDS_4TUPLE); + "tcp_inpcb", tcp_inpcb_init, NULL, 0, IPI_HASHFIELDS_4TUPLE); /* * These have to be type stable for the benefit of the timers. */ V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_tcpcb_zone, maxsockets); + uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached"); tcp_tw_init(); syncache_init(); tcp_hc_init(); - tcp_reass_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; + tcp_reass_global_init(); + /* XXX virtualize those bellow? */ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; @@ -340,11 +691,15 @@ tcp_init(void) tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; + tcp_persmin = TCPTV_PERSMIN; + tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; tcp_tcbhashsize = hashsize; + /* Setup the tcp function block list */ + init_tcp_functions(); + register_tcp_functions(&tcp_def_funcblk, M_WAITOK); - TUNABLE_INT_FETCH("net.inet.tcp.soreceive_stream", &tcp_soreceive_stream); if (tcp_soreceive_stream) { #ifdef INET tcp_usrreqs.pru_soreceive = soreceive_stream; @@ -370,21 +725,64 @@ tcp_init(void) SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); +#ifdef TCPPCAP + tcp_pcap_init(); +#endif + +#ifdef TCP_RFC7413 + tcp_fastopen_init(); +#endif } #ifdef VIMAGE -void -tcp_destroy(void) +static void +tcp_destroy(void *unused __unused) { + int error, n; - tcp_reass_destroy(); + /* + * All our processes are gone, all our sockets should be cleaned + * up, which means, we should be past the tcp_discardcb() calls. + * Sleep to let all tcpcb timers really disappear and cleanup. + */ + for (;;) { + INP_LIST_RLOCK(&V_tcbinfo); + n = V_tcbinfo.ipi_count; + INP_LIST_RUNLOCK(&V_tcbinfo); + if (n == 0) + break; + pause("tcpdes", hz / 10); + } tcp_hc_destroy(); syncache_destroy(); tcp_tw_destroy(); in_pcbinfo_destroy(&V_tcbinfo); + /* tcp_discardcb() clears the sack_holes up. */ uma_zdestroy(V_sack_hole_zone); uma_zdestroy(V_tcpcb_zone); + +#ifdef TCP_RFC7413 + /* + * Cannot free the zone until all tcpcbs are released as we attach + * the allocations to them. + */ + tcp_fastopen_destroy(); +#endif + + error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); + if (error != 0) { + printf("%s: WARNING: unable to deregister helper hook " + "type=%d, id=%d: error %d returned\n", __func__, + HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); + } + error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); + if (error != 0) { + printf("%s: WARNING: unable to deregister helper hook " + "type=%d, id=%d: error %d returned\n", __func__, + HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); + } } +VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL); #endif void @@ -473,31 +871,33 @@ tcpip_maketemplate(struct inpcb *inp) /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy - * of the tcpiphdr at ti and send directly to the addressed host. + * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send - * a message back to the TCP which originated the * segment ti, + * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * - * NOTE: If m != NULL, then ti must point to *inside* the mbuf. + * NOTE: If m != NULL, then th must point to *inside* the mbuf. */ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { - int tlen; - int win = 0; + struct tcpopt to; + struct inpcb *inp; struct ip *ip; + struct mbuf *optm; struct tcphdr *nth; + u_char *optp; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ - int ipflags = 0; - struct inpcb *inp; + int optlen, tlen, win; + bool incl_opts; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); @@ -514,18 +914,21 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, } else inp = NULL; + incl_opts = false; + win = 0; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } + if ((tp->t_flags & TF_NOOPT) == 0) + incl_opts = true; } if (m == NULL) { - m = m_gethdr(M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; - tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { @@ -535,35 +938,71 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ - { - bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); - ip = mtod(m, struct ip *); - nth = (struct tcphdr *)(ip + 1); - } + { + bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + ip = mtod(m, struct ip *); + nth = (struct tcphdr *)(ip + 1); + } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; + } else if (!M_WRITABLE(m)) { + struct mbuf *n; + + /* Can't reuse 'm', allocate a new mbuf. */ + n = m_gethdr(M_NOWAIT, MT_DATA); + if (n == NULL) { + m_freem(m); + return; + } + + if (!m_dup_pkthdr(n, m, M_NOWAIT)) { + m_freem(m); + m_freem(n); + return; + } + + n->m_data += max_linkhdr; + /* m_len is set later */ +#define xchg(a,b,type) { type t; t=a; a=b; b=t; } +#ifdef INET6 + if (isipv6) { + bcopy((caddr_t)ip6, mtod(n, caddr_t), + sizeof(struct ip6_hdr)); + ip6 = mtod(n, struct ip6_hdr *); + xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip)); + ip = mtod(n, struct ip *); + xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); + nth = (struct tcphdr *)(ip + 1); + } + bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); + xchg(nth->th_dport, nth->th_sport, uint16_t); + th = nth; + m_freem(m); + m = n; } else { /* * reuse the mbuf. - * XXX MRT We inherrit the FIB, which is lucky. + * XXX MRT We inherit the FIB, which is lucky. */ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; - m_addr_changed(m); /* m_len is set later */ - tlen = 0; -#define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ - { - xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); - nth = (struct tcphdr *)(ip + 1); - } + { + xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); + nth = (struct tcphdr *)(ip + 1); + } if (th != nth) { /* * this is usually a case when an extension header @@ -576,13 +1015,65 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } + tlen = 0; +#ifdef INET6 + if (isipv6) + tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); +#endif +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + tlen = sizeof (struct tcpiphdr); +#endif +#ifdef INVARIANTS + m->m_len = 0; + KASSERT(M_TRAILINGSPACE(m) >= tlen, + ("Not enough trailing space for message (m=%p, need=%d, have=%ld)", + m, tlen, (long)M_TRAILINGSPACE(m))); +#endif + m->m_len = tlen; + to.to_flags = 0; + if (incl_opts) { + /* Make sure we have room. */ + if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { + m->m_next = m_get(M_NOWAIT, MT_DATA); + if (m->m_next) { + optp = mtod(m->m_next, u_char *); + optm = m->m_next; + } else + incl_opts = false; + } else { + optp = (u_char *) (nth + 1); + optm = m; + } + } + if (incl_opts) { + /* Timestamps. */ + if (tp->t_flags & TF_RCVD_TSTMP) { + to.to_tsval = tcp_ts_getticks() + tp->ts_offset; + to.to_tsecr = tp->ts_recent; + to.to_flags |= TOF_TS; + } +#ifdef TCP_SIGNATURE + /* TCP-MD5 (RFC2385). */ + if (tp->t_flags & TF_SIGNATURE) + to.to_flags |= TOF_SIGNATURE; +#endif + + /* Add the options. */ + tlen += optlen = tcp_addoptions(&to, optp); + + /* Update m_len in the correct mbuf. */ + optm->m_len += optlen; + } else + optlen = 0; #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; - ip6->ip6_plen = 0; /* Set in ip6_output(). */ - tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) @@ -590,14 +1081,12 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, #endif #ifdef INET { - tlen += sizeof (struct tcpiphdr); - ip->ip_len = tlen; + ip->ip_len = htons(tlen); ip->ip_ttl = V_ip_defttl; if (V_path_mtu_discovery) - ip->ip_off |= IP_DF; + ip->ip_off |= htons(IP_DF); } #endif - m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC @@ -619,7 +1108,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; - nth->th_off = sizeof (struct tcphdr) >> 2; + nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; nth->th_flags = flags; if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); @@ -627,6 +1116,13 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, nth->th_win = htons((u_short)win); nth->th_urp = 0; +#ifdef TCP_SIGNATURE + if (to.to_flags & TOF_SIGNATURE) { + tcp_signature_compute(m, 0, 0, optlen, to.to_signature, + IPSEC_DIR_OUTBOUND); + } +#endif + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { @@ -651,15 +1147,21 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif + TCP_PROBE3(debug__output, tp, th, mtod(m, const char *)); + if (flags & TH_RST) + TCP_PROBE5(accept__refused, NULL, NULL, mtod(m, const char *), + tp, nth); + + TCP_PROBE5(send, NULL, tp, mtod(m, const char *), tp, nth); #ifdef INET6 if (isipv6) - (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); + (void) ip6_output(m, NULL, NULL, 0, NULL, NULL, inp); #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET - (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); + (void) ip_output(m, NULL, NULL, 0, NULL, inp); #endif } @@ -687,7 +1189,10 @@ tcp_newtcpcb(struct inpcb *inp) tp->ccv = &tm->ccv; tp->ccv->type = IPPROTO_TCP; tp->ccv->ccvc.tcp = tp; - + rw_rlock(&tcp_function_lock); + tp->t_fb = tcp_func_set_ptr; + refcount_acquire(&tp->t_fb->tfb_refcnt); + rw_runlock(&tcp_function_lock); /* * Use the current system default CC algorithm. */ @@ -698,12 +1203,18 @@ tcp_newtcpcb(struct inpcb *inp) if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); + refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } tp->osd = &tm->osd; if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); + refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } @@ -713,25 +1224,31 @@ tcp_newtcpcb(struct inpcb *inp) #endif tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ - tp->t_maxseg = tp->t_maxopd = + tp->t_maxseg = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* Set up our timeouts. */ - callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE); - callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE); - callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE); - callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE); - callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE); + callout_init(&tp->t_timers->tt_rexmt, 1); + callout_init(&tp->t_timers->tt_persist, 1); + callout_init(&tp->t_timers->tt_keep, 1); + callout_init(&tp->t_timers->tt_2msl, 1); + callout_init(&tp->t_timers->tt_delack, 1); if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); - tp->t_inpcb = inp; /* XXX */ + /* + * The tcpcb will hold a reference on its inpcb until tcp_discardcb() + * is called. + */ + in_pcbref(inp); /* Reference for tcpcb */ + tp->t_inpcb = inp; + /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives @@ -751,6 +1268,15 @@ tcp_newtcpcb(struct inpcb *inp) */ inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; +#ifdef TCPPCAP + /* + * Init the TCP PCAP queues. + */ + tcp_pcap_tcpcb_init(tp); +#endif + if (tp->t_fb->tfb_tcp_fb_init) { + (*tp->t_fb->tfb_tcp_fb_init)(tp); + } return (tp); /* XXX */ } @@ -779,7 +1305,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo) VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); /* * New connections already part way through being initialised * with the CC algo we're removing will not race with this code @@ -809,7 +1335,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo) } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); @@ -827,12 +1353,12 @@ tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { - tp->t_state = TCPS_CLOSED; - (void) tcp_output(tp); + tcp_state_change(tp, TCPS_CLOSED); + (void) tp->t_fb->tfb_tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); @@ -850,6 +1376,7 @@ tcp_discardcb(struct tcpcb *tp) #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ + int released; INP_WLOCK_ASSERT(inp); @@ -857,22 +1384,27 @@ tcp_discardcb(struct tcpcb *tp) * Make sure that all of our timers are stopped before we delete the * PCB. * - * XXXRW: Really, we would like to use callout_drain() here in order - * to avoid races experienced in tcp_timer.c where a timer is already - * executing at this point. However, we can't, both because we're - * running in a context where we can't sleep, and also because we - * hold locks required by the timers. What we instead need to do is - * test to see if callout_drain() is required, and if so, defer some - * portion of the remainder of tcp_discardcb() to an asynchronous - * context that can callout_drain() and then continue. Some care - * will be required to ensure that no further processing takes place - * on the tcpcb, even though it hasn't been freed (a flag?). + * If stopping a timer fails, we schedule a discard function in same + * callout, and the last discard function called will take care of + * deleting the tcpcb. */ - callout_stop(&tp->t_timers->tt_rexmt); - callout_stop(&tp->t_timers->tt_persist); - callout_stop(&tp->t_timers->tt_keep); - callout_stop(&tp->t_timers->tt_2msl); - callout_stop(&tp->t_timers->tt_delack); + tp->t_timers->tt_draincnt = 0; + tcp_timer_stop(tp, TT_REXMT); + tcp_timer_stop(tp, TT_PERSIST); + tcp_timer_stop(tp, TT_KEEP); + tcp_timer_stop(tp, TT_2MSL); + tcp_timer_stop(tp, TT_DELACK); + if (tp->t_fb->tfb_tcp_timer_stop_all) { + /* + * Call the stop-all function of the methods, + * this function should call the tcp_timer_stop() + * method with each of the function specific timeouts. + * That stop will be called via the tfb_tcp_timer_stop() + * which should use the async drain function of the + * callout system (see tcp_var.h). + */ + tp->t_fb->tfb_tcp_timer_stop_all(tp); + } /* * If we got enough samples through the srtt filter, @@ -893,7 +1425,7 @@ tcp_discardcb(struct tcpcb *tp) * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. - * ssthresh is only set if packet loss occured on a session. + * ssthresh is only set if packet loss occurred on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. @@ -909,14 +1441,14 @@ tcp_discardcb(struct tcpcb *tp) ssthresh = 2; ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 - (isipv6 ? sizeof (struct ip6_hdr) + - sizeof (struct tcphdr) : + (isipv6 ? sizeof (struct ip6_hdr) + + sizeof (struct tcphdr) : #endif - sizeof (struct tcpiphdr) + sizeof (struct tcpiphdr) #ifdef INET6 - ) + ) #endif - ); + ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; @@ -941,6 +1473,12 @@ tcp_discardcb(struct tcpcb *tp) tcp_free_sackholes(tp); +#ifdef TCPPCAP + /* Free the TCP PCAP queues. */ + tcp_pcap_drain(&(tp->t_inpkts)); + tcp_pcap_drain(&(tp->t_outpkts)); +#endif + /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); @@ -949,8 +1487,51 @@ tcp_discardcb(struct tcpcb *tp) CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; - tp->t_inpcb = NULL; - uma_zfree(V_tcpcb_zone, tp); + if (tp->t_timers->tt_draincnt == 0) { + /* We own the last reference on tcpcb, let's free it. */ + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); + refcount_release(&tp->t_fb->tfb_refcnt); + tp->t_inpcb = NULL; + uma_zfree(V_tcpcb_zone, tp); + released = in_pcbrele_wlocked(inp); + KASSERT(!released, ("%s: inp %p should not have been released " + "here", __func__, inp)); + } +} + +void +tcp_timer_discard(void *ptp) +{ + struct inpcb *inp; + struct tcpcb *tp; + + tp = (struct tcpcb *)ptp; + CURVNET_SET(tp->t_vnet); + INP_INFO_RLOCK(&V_tcbinfo); + inp = tp->t_inpcb; + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", + __func__, tp)); + INP_WLOCK(inp); + KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0, + ("%s: tcpcb has to be stopped here", __func__)); + tp->t_timers->tt_draincnt--; + if (tp->t_timers->tt_draincnt == 0) { + /* We own the last reference on this tcpcb, let's free it. */ + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); + refcount_release(&tp->t_fb->tfb_refcnt); + tp->t_inpcb = NULL; + uma_zfree(V_tcpcb_zone, tp); + if (in_pcbrele_wlocked(inp)) { + INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } + } + INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); } /* @@ -963,15 +1544,27 @@ tcp_close(struct tcpcb *tp) struct inpcb *inp = tp->t_inpcb; struct socket *so; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); #endif +#ifdef TCP_RFC7413 + /* + * This releases the TFO pending counter resource for TFO listen + * sockets as well as passively-created TFO sockets that transition + * from SYN_RECEIVED to CLOSED. + */ + if (tp->t_tfo_pending) { + tcp_fastopen_decrement_counter(tp->t_tfo_pending); + tp->t_tfo_pending = NULL; + } +#endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); + TCPSTATES_DEC(tp->t_state); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); @@ -1009,9 +1602,9 @@ tcp_drain(void) * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially - * usefull. + * useful. */ - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_flags & INP_TIMEWAIT) continue; @@ -1019,10 +1612,17 @@ tcp_drain(void) if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); +#ifdef TCPPCAP + if (tcp_pcap_aggressive_free) { + /* Free the TCP PCAP queues. */ + tcp_pcap_drain(&(tcpb->t_inpkts)); + tcp_pcap_drain(&(tcpb->t_outpkts)); + } +#endif } INP_WUNLOCK(inpb); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); @@ -1041,7 +1641,7 @@ tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || @@ -1061,6 +1661,10 @@ tcp_notify(struct inpcb *inp, int error) if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { + if (inp->inp_route.ro_rt) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)NULL; + } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { @@ -1093,7 +1697,8 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { - n = V_tcbinfo.ipi_count + syncache_pcbcount(); + n = V_tcbinfo.ipi_count + + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); @@ -1105,12 +1710,12 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) /* * OK, now we're committed to doing something. */ - INP_INFO_RLOCK(&V_tcbinfo); + INP_LIST_RLOCK(&V_tcbinfo); gencnt = V_tcbinfo.ipi_gencnt; n = V_tcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_LIST_RUNLOCK(&V_tcbinfo); - m = syncache_pcbcount(); + m = counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + (n + m) * sizeof(struct xtcpcb)); @@ -1130,10 +1735,8 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == NULL) - return (ENOMEM); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); @@ -1158,7 +1761,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); n = i; error = 0; @@ -1196,14 +1799,14 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) } else INP_RUNLOCK(inp); } - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); if (!error) { /* @@ -1213,11 +1816,11 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) * while we were processing this request, and it * might be necessary to retry. */ - INP_INFO_RLOCK(&V_tcbinfo); + INP_LIST_RLOCK(&V_tcbinfo); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + pcb_count; - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_LIST_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); @@ -1354,16 +1957,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; - /* - * Redirects don't need to be handled up here. - */ - else if (PRC_IS_REDIRECT(cmd)) - return; - /* - * Source quench is depreciated. - */ - else if (cmd == PRC_QUENCH) - return; + /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an @@ -1373,75 +1967,79 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; - if (ip != NULL) { - icp = (struct icmp *)((caddr_t)ip - - offsetof(struct icmp, icmp_ip)); - th = (struct tcphdr *)((caddr_t)ip - + (ip->ip_hl << 2)); - INP_INFO_WLOCK(&V_tcbinfo); - inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, - ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); - if (inp != NULL) { - if (!(inp->inp_flags & INP_TIMEWAIT) && - !(inp->inp_flags & INP_DROPPED) && - !(inp->inp_socket == NULL)) { - icmp_tcp_seq = htonl(th->th_seq); - tp = intotcpcb(inp); - if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && - SEQ_LT(icmp_tcp_seq, tp->snd_max)) { - if (cmd == PRC_MSGSIZE) { - /* - * MTU discovery: - * If we got a needfrag set the MTU - * in the route to the suggested new - * value (if given) and then notify. - */ - bzero(&inc, sizeof(inc)); - inc.inc_faddr = faddr; - inc.inc_fibnum = - inp->inp_inc.inc_fibnum; - - mtu = ntohs(icp->icmp_nextmtu); - /* - * If no alternative MTU was - * proposed, try the next smaller - * one. ip->ip_len has already - * been swapped in icmp_input(). - */ - if (!mtu) - mtu = ip_next_mtu(ip->ip_len, - 1); - if (mtu < V_tcp_minmss - + sizeof(struct tcpiphdr)) - mtu = V_tcp_minmss - + sizeof(struct tcpiphdr); - /* - * Only cache the MTU if it - * is smaller than the interface - * or route MTU. tcp_mtudisc() - * will do right thing by itself. - */ - if (mtu <= tcp_maxmtu(&inc, NULL)) + + if (ip == NULL) { + in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); + return; + } + + icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); + th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); + INP_INFO_RLOCK(&V_tcbinfo); + inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, + th->th_sport, INPLOOKUP_WLOCKPCB, NULL); + if (inp != NULL && PRC_IS_REDIRECT(cmd)) { + /* signal EHOSTDOWN, as it flushes the cached route */ + inp = (*notify)(inp, EHOSTDOWN); + if (inp != NULL) + INP_WUNLOCK(inp); + } else if (inp != NULL) { + if (!(inp->inp_flags & INP_TIMEWAIT) && + !(inp->inp_flags & INP_DROPPED) && + !(inp->inp_socket == NULL)) { + icmp_tcp_seq = ntohl(th->th_seq); + tp = intotcpcb(inp); + if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && + SEQ_LT(icmp_tcp_seq, tp->snd_max)) { + if (cmd == PRC_MSGSIZE) { + /* + * MTU discovery: + * If we got a needfrag set the MTU + * in the route to the suggested new + * value (if given) and then notify. + */ + mtu = ntohs(icp->icmp_nextmtu); + /* + * If no alternative MTU was + * proposed, try the next smaller + * one. + */ + if (!mtu) + mtu = ip_next_mtu( + ntohs(ip->ip_len), 1); + if (mtu < V_tcp_minmss + + sizeof(struct tcpiphdr)) + mtu = V_tcp_minmss + + sizeof(struct tcpiphdr); + /* + * Only process the offered MTU if it + * is smaller than the current one. + */ + if (mtu < tp->t_maxseg + + sizeof(struct tcpiphdr)) { + bzero(&inc, sizeof(inc)); + inc.inc_faddr = faddr; + inc.inc_fibnum = + inp->inp_inc.inc_fibnum; tcp_hc_updatemtu(&inc, mtu); - tcp_mtudisc(inp, mtu); - } else - inp = (*notify)(inp, - inetctlerrmap[cmd]); - } + tcp_mtudisc(inp, mtu); + } + } else + inp = (*notify)(inp, + inetctlerrmap[cmd]); } - if (inp != NULL) - INP_WUNLOCK(inp); - } else { - bzero(&inc, sizeof(inc)); - inc.inc_fport = th->th_dport; - inc.inc_lport = th->th_sport; - inc.inc_faddr = faddr; - inc.inc_laddr = ip->ip_src; - syncache_unreach(&inc, th); } - INP_INFO_WUNLOCK(&V_tcbinfo); - } else - in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); + if (inp != NULL) + INP_WUNLOCK(inp); + } else { + bzero(&inc, sizeof(inc)); + inc.inc_fport = th->th_dport; + inc.inc_lport = th->th_sport; + inc.inc_faddr = faddr; + inc.inc_laddr = ip->ip_src; + syncache_unreach(&inc, th); + } + INP_INFO_RUNLOCK(&V_tcbinfo); } #endif /* INET */ @@ -1449,75 +2047,146 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { - struct tcphdr th; + struct in6_addr *dst; + struct tcphdr *th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; + struct inpcb *inp; + struct tcpcb *tp; + struct icmp6_hdr *icmp6; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; - int off; - struct tcp_portonly { - u_int16_t th_sport; - u_int16_t th_dport; - } *thp; + struct in_conninfo inc; + tcp_seq icmp_tcp_seq; + unsigned int mtu; + unsigned int off; + if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; - if (cmd == PRC_MSGSIZE) - notify = tcp_mtudisc_notify; - else if (!PRC_IS_REDIRECT(cmd) && - ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) - return; - /* Source quench is depreciated. */ - else if (cmd == PRC_QUENCH) - return; - /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; + icmp6 = ip6cp->ip6c_icmp6; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; + dst = ip6cp->ip6c_finaldst; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; + dst = NULL; } - if (ip6 != NULL) { - struct in_conninfo inc; - /* - * XXX: We assume that when IPV6 is non NULL, - * M and OFF are valid. - */ + if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc_notify; + else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || + cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && + ip6 != NULL) + notify = tcp_drop_syn_sent; - /* check if we can safely examine src and dst ports */ - if (m->m_pkthdr.len < off + sizeof(*thp)) - return; + /* + * Hostdead is ugly because it goes linearly through all PCBs. + * XXX: We never get this from ICMP, otherwise it makes an + * excellent DoS attack on machines with many connections. + */ + else if (cmd == PRC_HOSTDEAD) + ip6 = NULL; + else if ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0) + return; - bzero(&th, sizeof(th)); - m_copydata(m, off, sizeof(*thp), (caddr_t)&th); + if (ip6 == NULL) { + in6_pcbnotify(&V_tcbinfo, sa, 0, + (const struct sockaddr *)sa6_src, + 0, cmd, NULL, notify); + return; + } - in6_pcbnotify(&V_tcbinfo, sa, th.th_dport, - (struct sockaddr *)ip6cp->ip6c_src, - th.th_sport, cmd, NULL, notify); + /* Check if we can safely get the ports from the tcp hdr */ + if (m == NULL || + (m->m_pkthdr.len < + (int32_t) (off + offsetof(struct tcphdr, th_seq)))) { + return; + } + th = (struct tcphdr *) mtodo(ip6cp->ip6c_m, ip6cp->ip6c_off); + INP_INFO_RLOCK(&V_tcbinfo); + inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, th->th_dport, + &ip6->ip6_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); + if (inp != NULL && PRC_IS_REDIRECT(cmd)) { + /* signal EHOSTDOWN, as it flushes the cached route */ + inp = (*notify)(inp, EHOSTDOWN); + if (inp != NULL) + INP_WUNLOCK(inp); + } else if (inp != NULL) { + if (!(inp->inp_flags & INP_TIMEWAIT) && + !(inp->inp_flags & INP_DROPPED) && + !(inp->inp_socket == NULL)) { + icmp_tcp_seq = ntohl(th->th_seq); + tp = intotcpcb(inp); + if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && + SEQ_LT(icmp_tcp_seq, tp->snd_max)) { + if (cmd == PRC_MSGSIZE) { + /* + * MTU discovery: + * If we got a needfrag set the MTU + * in the route to the suggested new + * value (if given) and then notify. + */ + mtu = ntohl(icmp6->icmp6_mtu); + /* + * If no alternative MTU was + * proposed, or the proposed + * MTU was too small, set to + * the min. + */ + if (mtu < IPV6_MMTU) + mtu = IPV6_MMTU - 8; + + + bzero(&inc, sizeof(inc)); + inc.inc_fibnum = M_GETFIB(m); + inc.inc_flags |= INC_ISIPV6; + inc.inc6_faddr = *dst; + if (in6_setscope(&inc.inc6_faddr, + m->m_pkthdr.rcvif, NULL)) + goto unlock_inp; + + /* + * Only process the offered MTU if it + * is smaller than the current one. + */ + if (mtu < tp->t_maxseg + + (sizeof (*th) + sizeof (*ip6))) { + tcp_hc_updatemtu(&inc, mtu); + tcp_mtudisc(inp, mtu); + ICMP6STAT_INC(icp6s_pmtuchg); + } + } else + inp = (*notify)(inp, + inet6ctlerrmap[cmd]); + } + } +unlock_inp: + if (inp != NULL) + INP_WUNLOCK(inp); + } else { bzero(&inc, sizeof(inc)); - inc.inc_fport = th.th_dport; - inc.inc_lport = th.th_sport; - inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; - inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; + inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; - INP_INFO_WLOCK(&V_tcbinfo); - syncache_unreach(&inc, &th); - INP_INFO_WUNLOCK(&V_tcbinfo); - } else - in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, - 0, cmd, NULL, notify); + inc.inc_fport = th->th_dport; + inc.inc_lport = th->th_sport; + inc.inc6_faddr = *dst; + inc.inc6_laddr = ip6->ip6_src; + syncache_unreach(&inc, th); + } + INP_INFO_RUNLOCK(&V_tcbinfo); } #endif /* INET6 */ @@ -1647,7 +2316,7 @@ tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || @@ -1675,10 +2344,11 @@ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { - return (tcp_mtudisc(inp, -1)); + tcp_mtudisc(inp, -1); + return (inp); } -struct inpcb * +static void tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; @@ -1687,7 +2357,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) - return (inp); + return; tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); @@ -1708,8 +2378,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); - tcp_output(tp); - return (inp); + tp->t_fb->tfb_tcp_output(tp); } #ifdef INET @@ -1722,27 +2391,20 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) u_long tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { - struct route sro; - struct sockaddr_in *dst; + struct nhop4_extended nh4; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); - bzero(&sro, sizeof(sro)); if (inc->inc_faddr.s_addr != INADDR_ANY) { - dst = (struct sockaddr_in *)&sro.ro_dst; - dst->sin_family = AF_INET; - dst->sin_len = sizeof(*dst); - dst->sin_addr = inc->inc_faddr; - in_rtalloc_ign(&sro, 0, inc->inc_fibnum); - } - if (sro.ro_rt != NULL) { - ifp = sro.ro_rt->rt_ifp; - if (sro.ro_rt->rt_rmx.rmx_mtu == 0) - maxmtu = ifp->if_mtu; - else - maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); + + if (fib4_lookup_nh_ext(inc->inc_fibnum, inc->inc_faddr, + NHR_REF, 0, &nh4) != 0) + return (0); + + ifp = nh4.nh_ifp; + maxmtu = nh4.nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { @@ -1754,7 +2416,7 @@ tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } - RTFREE(sro.ro_rt); + fib4_free_nh_ext(inc->inc_fibnum, &nh4); } return (maxmtu); } @@ -1764,26 +2426,22 @@ tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) u_long tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { - struct route_in6 sro6; + struct nhop6_extended nh6; + struct in6_addr dst6; + uint32_t scopeid; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); - bzero(&sro6, sizeof(sro6)); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { - sro6.ro_dst.sin6_family = AF_INET6; - sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); - sro6.ro_dst.sin6_addr = inc->inc6_faddr; - in6_rtalloc_ign(&sro6, 0, inc->inc_fibnum); - } - if (sro6.ro_rt != NULL) { - ifp = sro6.ro_rt->rt_ifp; - if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) - maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); - else - maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, - IN6_LINKMTU(sro6.ro_rt->rt_ifp)); + in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); + if (fib6_lookup_nh_ext(inc->inc_fibnum, &dst6, scopeid, 0, + 0, &nh6) != 0) + return (0); + + ifp = nh6.nh_ifp; + maxmtu = nh6.nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { @@ -1795,13 +2453,66 @@ tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } - RTFREE(sro6.ro_rt); + fib6_free_nh_ext(inc->inc_fibnum, &nh6); } return (maxmtu); } #endif /* INET6 */ +/* + * Calculate effective SMSS per RFC5681 definition for a given TCP + * connection at its current state, taking into account SACK and etc. + */ +u_int +tcp_maxseg(const struct tcpcb *tp) +{ + u_int optlen; + + if (tp->t_flags & TF_NOOPT) + return (tp->t_maxseg); + + /* + * Here we have a simplified code from tcp_addoptions(), + * without a proper loop, and having most of paddings hardcoded. + * We might make mistakes with padding here in some edge cases, + * but this is harmless, since result of tcp_maxseg() is used + * only in cwnd and ssthresh estimations. + */ +#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) + if (TCPS_HAVEESTABLISHED(tp->t_state)) { + if (tp->t_flags & TF_RCVD_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = 0; +#ifdef TCP_SIGNATURE + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { + optlen += TCPOLEN_SACKHDR; + optlen += tp->rcv_numsacks * TCPOLEN_SACK; + optlen = PAD(optlen); + } + } else { + if (tp->t_flags & TF_REQ_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = PAD(TCPOLEN_MAXSEG); + if (tp->t_flags & TF_REQ_SCALE) + optlen += PAD(TCPOLEN_WINDOW); +#ifdef TCP_SIGNATURE + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + if (tp->t_flags & TF_SACK_PERMIT) + optlen += PAD(TCPOLEN_SACK_PERMITTED); + } +#undef PAD + optlen = min(optlen, TCP_MAXOLEN); + return (tp->t_maxseg - optlen); +} + #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t @@ -1816,9 +2527,10 @@ ipsec_hdrsiz_tcp(struct tcpcb *tp) #endif struct tcphdr *th; - if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) + if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL) || + (!key_havesp(IPSEC_DIR_OUTBOUND))) return (0); - MGETHDR(m, M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) return (0); @@ -1859,55 +2571,20 @@ tcp_signature_apply(void *fstate, void *data, u_int len) } /* - * Compute TCP-MD5 hash of a TCP segment. (RFC2385) - * - * Parameters: - * m pointer to head of mbuf chain - * _unused - * len length of TCP segment data, excluding options - * optlen length of TCP segment options - * buf pointer to storage for computed MD5 digest - * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) - * - * We do this over ip, tcphdr, segment data, and the key in the SADB. - * When called from tcp_input(), we can be sure that th_sum has been - * zeroed out and verified already. - * - * Return 0 if successful, otherwise return -1. - * * XXX The key is retrieved from the system's PF_KEY SADB, by keying a * search with the destination IP address, and a 'magic SPI' to be * determined by the application. This is hardcoded elsewhere to 1179 - * right now. Another branch of this code exists which uses the SPD to - * specify per-application flows but it is unstable. - */ -int -tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, - u_char *buf, u_int direction) +*/ +struct secasvar * +tcp_get_sav(struct mbuf *m, u_int direction) { union sockaddr_union dst; -#ifdef INET - struct ippseudo ippseudo; -#endif - MD5_CTX ctx; - int doff; - struct ip *ip; -#ifdef INET - struct ipovly *ipovly; -#endif struct secasvar *sav; - struct tcphdr *th; + struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; - struct in6_addr in6; char ip6buf[INET6_ADDRSTRLEN]; - uint32_t plen; - uint16_t nhdr; #endif - u_short savecsum; - - KASSERT(m != NULL, ("NULL mbuf chain")); - KASSERT(buf != NULL, ("NULL signature pointer")); /* Extract the destination from the IP header in the mbuf. */ bzero(&dst, sizeof(union sockaddr_union)); @@ -1934,7 +2611,7 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, break; #endif default: - return (EINVAL); + return (NULL); /* NOTREACHED */ break; } @@ -1949,9 +2626,61 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) : #endif "(unsupported)")); - return (EINVAL); } + return (sav); +} + +/* + * Compute TCP-MD5 hash of a TCP segment. (RFC2385) + * + * Parameters: + * m pointer to head of mbuf chain + * len length of TCP segment data, excluding options + * optlen length of TCP segment options + * buf pointer to storage for computed MD5 digest + * sav pointer to security assosiation + * + * We do this over ip, tcphdr, segment data, and the key in the SADB. + * When called from tcp_input(), we can be sure that th_sum has been + * zeroed out and verified already. + * + * Releases reference to SADB key before return. + * + * Return 0 if successful, otherwise return -1. + * + */ +int +tcp_signature_do_compute(struct mbuf *m, int len, int optlen, + u_char *buf, struct secasvar *sav) +{ +#ifdef INET + struct ippseudo ippseudo; +#endif + MD5_CTX ctx; + int doff; + struct ip *ip; +#ifdef INET + struct ipovly *ipovly; +#endif + struct tcphdr *th; +#ifdef INET6 + struct ip6_hdr *ip6; + struct in6_addr in6; + uint32_t plen; + uint16_t nhdr; +#endif + u_short savecsum; + + KASSERT(m != NULL, ("NULL mbuf chain")); + KASSERT(buf != NULL, ("NULL signature pointer")); + + /* Extract the destination from the IP header in the mbuf. */ + ip = mtod(m, struct ip *); +#ifdef INET6 + ip6 = NULL; /* Make the compiler happy. */ +#endif + MD5Init(&ctx); /* * Step 1: Update MD5 hash with IP(v6) pseudo-header. @@ -2008,7 +2737,8 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, break; #endif default: - return (EINVAL); + KEY_FREESAV(&sav); + return (-1); /* NOTREACHED */ break; } @@ -2042,6 +2772,23 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, } /* + * Compute TCP-MD5 hash of a TCP segment. (RFC2385) + * + * Return 0 if successful, otherwise return -1. + */ +int +tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, + u_char *buf, u_int direction) +{ + struct secasvar *sav; + + if ((sav = tcp_get_sav(m, direction)) == NULL) + return (-1); + + return (tcp_signature_do_compute(m, len, optlen, buf, sav)); +} + +/* * Verify the TCP-MD5 hash of a TCP segment. (RFC2385) * * Parameters: @@ -2170,7 +2917,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) default: return (EINVAL); } - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: @@ -2209,12 +2956,12 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) INP_WUNLOCK(inp); } else error = ESRCH; - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } -SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_DROP, drop, - CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, + CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); /* @@ -2332,3 +3079,21 @@ tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, panic("%s: string too long", __func__); return (s); } + +/* + * A subroutine which makes it easy to track TCP state changes with DTrace. + * This function shouldn't be called for t_state initializations that don't + * correspond to actual TCP state transitions. + */ +void +tcp_state_change(struct tcpcb *tp, int newstate) +{ +#if defined(KDTRACE_HOOKS) + int pstate = tp->t_state; +#endif + + TCPSTATES_DEC(tp->t_state); + TCPSTATES_INC(newstate); + tp->t_state = newstate; + TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); +} |