diff options
Diffstat (limited to 'freebsd/sys/netinet/tcp_subr.c')
-rw-r--r-- | freebsd/sys/netinet/tcp_subr.c | 488 |
1 files changed, 155 insertions, 333 deletions
diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c index e23a0997..4c6d14eb 100644 --- a/freebsd/sys/netinet/tcp_subr.c +++ b/freebsd/sys/netinet/tcp_subr.c @@ -72,29 +72,25 @@ __FBSDID("$FreeBSD$"); #include <netinet/cc.h> #include <netinet/in.h> +#include <netinet/in_pcb.h> #include <netinet/in_systm.h> +#include <netinet/in_var.h> #include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_var.h> #ifdef INET6 #include <netinet/ip6.h> -#endif -#include <netinet/in_pcb.h> -#ifdef INET6 #include <netinet6/in6_pcb.h> -#endif -#include <netinet/in_var.h> -#include <netinet/ip_var.h> -#ifdef INET6 #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> #include <netinet6/nd6.h> #endif -#include <netinet/ip_icmp.h> + #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_syncache.h> -#include <netinet/tcp_offload.h> #ifdef INET6 #include <netinet6/tcp6_var.h> #endif @@ -102,7 +98,12 @@ __FBSDID("$FreeBSD$"); #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif +#ifdef INET6 #include <netinet6/ip6protosw.h> +#endif +#ifdef TCP_OFFLOAD +#include <netinet/tcp_offload.h> +#endif #ifdef IPSEC #include <netipsec/ipsec.h> @@ -166,15 +167,7 @@ SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); -#endif - -static int -vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) -{ - - VNET_SYSCTL_ARG(req, arg1); - return (sysctl_msec_to_ticks(oidp, arg1, arg2, req)); -} +#endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where @@ -187,7 +180,7 @@ vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, - "Minmum TCP Maximum Segment Size"); + "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, @@ -221,49 +214,9 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); -/* - * TCP bandwidth limiting sysctls. Note that the default lower bound of - * 1024 exists only for debugging. A good production default would be - * something like 6100. - */ -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, - "TCP inflight data limiting"); - -static VNET_DEFINE(int, tcp_inflight_enable) = 0; -#define V_tcp_inflight_enable VNET(tcp_inflight_enable) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_enable), 0, - "Enable automatic TCP inflight data limiting"); - -static int tcp_inflight_debug = 0; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, - &tcp_inflight_debug, 0, - "Debug TCP inflight calculations"); - -static VNET_DEFINE(int, tcp_inflight_rttthresh); -#define V_tcp_inflight_rttthresh VNET(tcp_inflight_rttthresh) -SYSCTL_VNET_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_inflight_rttthresh), 0, - vnet_sysctl_msec_to_ticks, "I", - "RTT threshold below which inflight will deactivate itself"); - -static VNET_DEFINE(int, tcp_inflight_min) = 6144; -#define V_tcp_inflight_min VNET(tcp_inflight_min) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_min), 0, - "Lower-bound for TCP inflight window"); - -static VNET_DEFINE(int, tcp_inflight_max) = TCP_MAXWIN << TCP_MAX_WINSHIFT; -#define V_tcp_inflight_max VNET(tcp_inflight_max) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_max), 0, - "Upper-bound for TCP inflight window"); - -static VNET_DEFINE(int, tcp_inflight_stab) = 20; -#define V_tcp_inflight_stab VNET(tcp_inflight_stab) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_stab), 0, - "Inflight Algorithm Stabilization 20 = 2 packets"); +static int tcp_soreceive_stream = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, + &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); #ifdef TCP_SIGNATURE static int tcp_sig_checksigs = 1; @@ -278,7 +231,6 @@ VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); -static void tcp_isn_tick(void *); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); @@ -309,7 +261,6 @@ static VNET_DEFINE(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); -struct callout isn_callout; static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) @@ -342,13 +293,6 @@ tcp_init(void) { int hashsize; - INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp"); - LIST_INIT(&V_tcb); -#ifdef VIMAGE - V_tcbinfo.ipi_vnet = curvnet; -#endif - V_tcbinfo.ipi_listhead = &V_tcb; - if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); @@ -362,14 +306,9 @@ tcp_init(void) printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } - V_tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB, - &V_tcbinfo.ipi_hashmask); - V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, - &V_tcbinfo.ipi_porthashmask); - V_tcbinfo.ipi_zone = uma_zcreate("tcp_inpcb", sizeof(struct inpcb), - NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); - V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; + in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, + "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_4TUPLE); /* * These have to be type stable for the benefit of the timers. @@ -405,6 +344,16 @@ tcp_init(void) tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; tcp_tcbhashsize = hashsize; + TUNABLE_INT_FETCH("net.inet.tcp.soreceive_stream", &tcp_soreceive_stream); + if (tcp_soreceive_stream) { +#ifdef INET + tcp_usrreqs.pru_soreceive = soreceive_stream; +#endif +#ifdef INET6 + tcp6_usrreqs.pru_soreceive = soreceive_stream; +#endif /* INET6 */ + } + #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ @@ -417,8 +366,6 @@ tcp_init(void) #undef TCP_MINPROTOHDR ISN_LOCK_INIT(); - callout_init(&isn_callout, CALLOUT_MPSAFE); - callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, @@ -434,18 +381,9 @@ tcp_destroy(void) tcp_hc_destroy(); syncache_destroy(); tcp_tw_destroy(); - - /* XXX check that hashes are empty! */ - hashdestroy(V_tcbinfo.ipi_hashbase, M_PCB, - V_tcbinfo.ipi_hashmask); - hashdestroy(V_tcbinfo.ipi_porthashbase, M_PCB, - V_tcbinfo.ipi_porthashmask); - + in_pcbinfo_destroy(&V_tcbinfo); uma_zdestroy(V_sack_hole_zone); uma_zdestroy(V_tcpcb_zone); - uma_zdestroy(V_tcbinfo.ipi_zone); - - INP_INFO_LOCK_DESTROY(&V_tcbinfo); } #endif @@ -453,7 +391,6 @@ void tcp_fini(void *xtp) { - callout_stop(&isn_callout); } /* @@ -481,8 +418,12 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; - } else + } +#endif /* INET6 */ +#if defined(INET6) && defined(INET) + else #endif +#ifdef INET { struct ip *ip; @@ -499,6 +440,7 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } +#endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; @@ -560,7 +502,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 - isipv6 = ((struct ip *)ipgen)->ip_v == 6; + isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; @@ -608,6 +550,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; + m_addr_changed(m); /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } @@ -638,11 +581,14 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; - ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + - tlen)); + ip6->ip6_plen = 0; /* Set in ip6_output(). */ tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); - } else + } +#endif +#if defined(INET) && defined(INET6) + else #endif +#ifdef INET { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; @@ -650,6 +596,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, if (V_path_mtu_discovery) ip->ip_off |= IP_DF; } +#endif m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; @@ -679,22 +626,27 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, else nth->th_win = htons((u_short)win); nth->th_urp = 0; + + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { - nth->th_sum = 0; - nth->th_sum = in6_cksum(m, IPPROTO_TCP, - sizeof(struct ip6_hdr), - tlen - sizeof(struct ip6_hdr)); + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + nth->th_sum = in6_cksum_pseudo(ip6, + tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : NULL, NULL); - } else + } #endif /* INET6 */ +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET { + m->m_pkthdr.csum_flags = CSUM_TCP; nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); - m->m_pkthdr.csum_flags = CSUM_TCP; - m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); } +#endif /* INET */ #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); @@ -702,9 +654,13 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, #ifdef INET6 if (isipv6) (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); - else #endif /* INET6 */ - (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); +#endif } /* @@ -786,10 +742,8 @@ tcp_newtcpcb(struct inpcb *inp) tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; - tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, @@ -878,7 +832,7 @@ tcp_drop(struct tcpcb *tp, int errno) if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; - (void) tcp_output_reset(tp); + (void) tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); @@ -900,8 +854,19 @@ tcp_discardcb(struct tcpcb *tp) INP_WLOCK_ASSERT(inp); /* - * Make sure that all of our timers are stopped before we - * delete the PCB. + * Make sure that all of our timers are stopped before we delete the + * PCB. + * + * XXXRW: Really, we would like to use callout_drain() here in order + * to avoid races experienced in tcp_timer.c where a timer is already + * executing at this point. However, we can't, both because we're + * running in a context where we can't sleep, and also because we + * hold locks required by the timers. What we instead need to do is + * test to see if callout_drain() is required, and if so, defer some + * portion of the remainder of tcp_discardcb() to an asynchronous + * context that can callout_drain() and then continue. Some care + * will be required to ensure that no further processing takes place + * on the tcpcb, even though it hasn't been freed (a flag?). */ callout_stop(&tp->t_timers->tt_rexmt); callout_stop(&tp->t_timers->tt_persist); @@ -958,8 +923,6 @@ tcp_discardcb(struct tcpcb *tp) metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; - /* XXX: This wraps if the pipe is more than 4 Gbit per second */ - metrics.rmx_bandwidth = tp->snd_bandwidth; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; @@ -969,8 +932,12 @@ tcp_discardcb(struct tcpcb *tp) /* free the reassembly queue, if any */ tcp_reass_flush(tp); + +#ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ - tcp_offload_detach(tp); + if (tp->t_flags & TF_TOE) + tcp_offload_detach(tp); +#endif tcp_free_sackholes(tp); @@ -999,9 +966,10 @@ tcp_close(struct tcpcb *tp) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); - /* Notify any offload devices of listener close */ +#ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) - tcp_offload_listen_close(tp); + tcp_offload_listen_stop(tp); +#endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); @@ -1211,8 +1179,11 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) else if (inp->inp_flags & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); xt.xt_tp.t_state = TCPS_TIME_WAIT; - } else + } else { bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); + if (xt.xt_tp.t_timers) + tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer); + } if (inp->inp_socket != NULL) sotoxsocket(inp->inp_socket, &xt.xt_socket); else { @@ -1228,9 +1199,9 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) INP_INFO_WLOCK(&V_tcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); @@ -1257,6 +1228,7 @@ SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); +#ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { @@ -1271,12 +1243,9 @@ tcp_getcred(SYSCTL_HANDLER_ARGS) error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); - INP_INFO_RLOCK(&V_tcbinfo); - inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr, - addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) @@ -1284,10 +1253,8 @@ tcp_getcred(SYSCTL_HANDLER_ARGS) if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { - INP_INFO_RUNLOCK(&V_tcbinfo); + } else error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1296,6 +1263,7 @@ tcp_getcred(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); +#endif /* INET */ #ifdef INET6 static int @@ -1304,7 +1272,10 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS) struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; - int error, mapped = 0; + int error; +#ifdef INET + int mapped = 0; +#endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) @@ -1317,27 +1288,28 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS) return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { +#ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else +#endif return (EINVAL); } - INP_INFO_RLOCK(&V_tcbinfo); +#ifdef INET if (mapped == 1) - inp = in_pcblookup_hash(&V_tcbinfo, + inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], - addrs[0].sin6_port, - 0, NULL); + addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else - inp = in6_pcblookup_hash(&V_tcbinfo, +#endif + inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, - &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); + &addrs[0].sin6_addr, addrs[0].sin6_port, + INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) @@ -1345,10 +1317,8 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS) if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { - INP_INFO_RUNLOCK(&V_tcbinfo); + } else error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1357,9 +1327,10 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); -#endif +#endif /* INET6 */ +#ifdef INET void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { @@ -1408,10 +1379,9 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_WLOCK(&V_tcbinfo); - inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport, - ip->ip_src, th->th_sport, 0, NULL); + inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, + ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { - INP_WLOCK(inp); if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { @@ -1473,6 +1443,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) } else in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); } +#endif /* INET */ #ifdef INET6 void @@ -1600,11 +1571,13 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) #define ISN_RANDOM_INCREMENT (4096 - 1) static VNET_DEFINE(u_char, isn_secret[32]); +static VNET_DEFINE(int, isn_last); static VNET_DEFINE(int, isn_last_reseed); static VNET_DEFINE(u_int32_t, isn_offset); static VNET_DEFINE(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) +#define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) @@ -1615,6 +1588,7 @@ tcp_new_isn(struct tcpcb *tp) MD5_CTX isn_ctx; u_int32_t md5_buffer[4]; tcp_seq new_isn; + u_int32_t projected_offset; INP_WLOCK_ASSERT(tp->t_inpcb); @@ -1650,38 +1624,17 @@ tcp_new_isn(struct tcpcb *tp) new_isn = (tcp_seq) md5_buffer[0]; V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); - new_isn += V_isn_offset; - ISN_UNLOCK(); - return (new_isn); -} - -/* - * Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary - * to keep time flowing at a relatively constant rate. If the random - * increments have already pushed us past the projected offset, do nothing. - */ -static void -tcp_isn_tick(void *xtp) -{ - VNET_ITERATOR_DECL(vnet_iter); - u_int32_t projected_offset; - - VNET_LIST_RLOCK_NOSLEEP(); - ISN_LOCK(); - VNET_FOREACH(vnet_iter) { - CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */ - projected_offset = - V_isn_offset_old + ISN_BYTES_PER_SECOND / 100; - + if (ticks != V_isn_last) { + projected_offset = V_isn_offset_old + + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; - V_isn_offset_old = V_isn_offset; - CURVNET_RESTORE(); + V_isn_last = ticks; } + new_isn += V_isn_offset; ISN_UNLOCK(); - VNET_LIST_RUNLOCK_NOSLEEP(); - callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); + return (new_isn); } /* @@ -1755,10 +1708,11 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); - tcp_output_send(tp); + tcp_output(tp); return (inp); } +#ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine @@ -1766,7 +1720,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) * tcp_mss_update to get the peer/interface MTU. */ u_long -tcp_maxmtu(struct in_conninfo *inc, int *flags) +tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct route sro; struct sockaddr_in *dst; @@ -1791,19 +1745,21 @@ tcp_maxmtu(struct in_conninfo *inc, int *flags) maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); /* Report additional interface capabilities. */ - if (flags != NULL) { + if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) - *flags |= CSUM_TSO; + cap->ifcap |= CSUM_TSO; + cap->tsomax = ifp->if_hw_tsomax; } RTFREE(sro.ro_rt); } return (maxmtu); } +#endif /* INET */ #ifdef INET6 u_long -tcp_maxmtu6(struct in_conninfo *inc, int *flags) +tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct route_in6 sro6; struct ifnet *ifp; @@ -1827,10 +1783,11 @@ tcp_maxmtu6(struct in_conninfo *inc, int *flags) IN6_LINKMTU(sro6.ro_rt->rt_ifp)); /* Report additional interface capabilities. */ - if (flags != NULL) { + if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) - *flags |= CSUM_TSO; + cap->ifcap |= CSUM_TSO; + cap->tsomax = ifp->if_hw_tsomax; } RTFREE(sro6.ro_rt); } @@ -1882,154 +1839,6 @@ ipsec_hdrsiz_tcp(struct tcpcb *tp) } #endif /* IPSEC */ -/* - * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING - * - * This code attempts to calculate the bandwidth-delay product as a - * means of determining the optimal window size to maximize bandwidth, - * minimize RTT, and avoid the over-allocation of buffers on interfaces and - * routers. This code also does a fairly good job keeping RTTs in check - * across slow links like modems. We implement an algorithm which is very - * similar (but not meant to be) TCP/Vegas. The code operates on the - * transmitter side of a TCP connection and so only effects the transmit - * side of the connection. - * - * BACKGROUND: TCP makes no provision for the management of buffer space - * at the end points or at the intermediate routers and switches. A TCP - * stream, whether using NewReno or not, will eventually buffer as - * many packets as it is able and the only reason this typically works is - * due to the fairly small default buffers made available for a connection - * (typicaly 16K or 32K). As machines use larger windows and/or window - * scaling it is now fairly easy for even a single TCP connection to blow-out - * all available buffer space not only on the local interface, but on - * intermediate routers and switches as well. NewReno makes a misguided - * attempt to 'solve' this problem by waiting for an actual failure to occur, - * then backing off, then steadily increasing the window again until another - * failure occurs, ad-infinitum. This results in terrible oscillation that - * is only made worse as network loads increase and the idea of intentionally - * blowing out network buffers is, frankly, a terrible way to manage network - * resources. - * - * It is far better to limit the transmit window prior to the failure - * condition being achieved. There are two general ways to do this: First - * you can 'scan' through different transmit window sizes and locate the - * point where the RTT stops increasing, indicating that you have filled the - * pipe, then scan backwards until you note that RTT stops decreasing, then - * repeat ad-infinitum. This method works in principle but has severe - * implementation issues due to RTT variances, timer granularity, and - * instability in the algorithm which can lead to many false positives and - * create oscillations as well as interact badly with other TCP streams - * implementing the same algorithm. - * - * The second method is to limit the window to the bandwidth delay product - * of the link. This is the method we implement. RTT variances and our - * own manipulation of the congestion window, bwnd, can potentially - * destabilize the algorithm. For this reason we have to stabilize the - * elements used to calculate the window. We do this by using the minimum - * observed RTT, the long term average of the observed bandwidth, and - * by adding two segments worth of slop. It isn't perfect but it is able - * to react to changing conditions and gives us a very stable basis on - * which to extend the algorithm. - */ -void -tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) -{ - u_long bw; - u_long bwnd; - int save_ticks; - - INP_WLOCK_ASSERT(tp->t_inpcb); - - /* - * If inflight_enable is disabled in the middle of a tcp connection, - * make sure snd_bwnd is effectively disabled. - */ - if (V_tcp_inflight_enable == 0 || - tp->t_rttlow < V_tcp_inflight_rttthresh) { - tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->snd_bandwidth = 0; - return; - } - - /* - * Figure out the bandwidth. Due to the tick granularity this - * is a very rough number and it MUST be averaged over a fairly - * long period of time. XXX we need to take into account a link - * that is not using all available bandwidth, but for now our - * slop will ramp us up if this case occurs and the bandwidth later - * increases. - * - * Note: if ticks rollover 'bw' may wind up negative. We must - * effectively reset t_bw_rtttime for this case. - */ - save_ticks = ticks; - if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) - return; - - bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / - (save_ticks - tp->t_bw_rtttime); - tp->t_bw_rtttime = save_ticks; - tp->t_bw_rtseq = ack_seq; - if (tp->t_bw_rtttime == 0 || (int)bw < 0) - return; - bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; - - tp->snd_bandwidth = bw; - - /* - * Calculate the semi-static bandwidth delay product, plus two maximal - * segments. The additional slop puts us squarely in the sweet - * spot and also handles the bandwidth run-up case and stabilization. - * Without the slop we could be locking ourselves into a lower - * bandwidth. - * - * Situations Handled: - * (1) Prevents over-queueing of packets on LANs, especially on - * high speed LANs, allowing larger TCP buffers to be - * specified, and also does a good job preventing - * over-queueing of packets over choke points like modems - * (at least for the transmit side). - * - * (2) Is able to handle changing network loads (bandwidth - * drops so bwnd drops, bandwidth increases so bwnd - * increases). - * - * (3) Theoretically should stabilize in the face of multiple - * connections implementing the same algorithm (this may need - * a little work). - * - * (4) Stability value (defaults to 20 = 2 maximal packets) can - * be adjusted with a sysctl but typically only needs to be - * on very slow connections. A value no smaller then 5 - * should be used, but only reduce this default if you have - * no other choice. - */ -#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) - bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10; -#undef USERTT - - if (tcp_inflight_debug > 0) { - static int ltime; - if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { - ltime = ticks; - printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", - tp, - bw, - tp->t_rttbest, - tp->t_srtt, - bwnd - ); - } - } - if ((long)bwnd < V_tcp_inflight_min) - bwnd = V_tcp_inflight_min; - if (bwnd > V_tcp_inflight_max) - bwnd = V_tcp_inflight_max; - if ((long)bwnd < tp->t_maxseg * 2) - bwnd = tp->t_maxseg * 2; - tp->snd_bwnd = bwnd; -} - #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data @@ -2071,11 +1880,15 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, u_char *buf, u_int direction) { union sockaddr_union dst; +#ifdef INET struct ippseudo ippseudo; +#endif MD5_CTX ctx; int doff; struct ip *ip; +#ifdef INET struct ipovly *ipovly; +#endif struct secasvar *sav; struct tcphdr *th; #ifdef INET6 @@ -2097,12 +1910,14 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, ip6 = NULL; /* Make the compiler happy. */ #endif switch (ip->ip_v) { +#ifdef INET case IPVERSION: dst.sa.sa_len = sizeof(struct sockaddr_in); dst.sa.sa_family = AF_INET; dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? ip->ip_src : ip->ip_dst; break; +#endif #ifdef INET6 case (IPV6_VERSION >> 4): ip6 = mtod(m, struct ip6_hdr *); @@ -2142,6 +1957,7 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, * tcp_output(), the underlying ip_len member has not yet been set. */ switch (ip->ip_v) { +#ifdef INET case IPVERSION: ipovly = (struct ipovly *)ip; ippseudo.ippseudo_src = ipovly->ih_src; @@ -2155,6 +1971,7 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip)); doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen; break; +#endif #ifdef INET6 /* * RFC 2385, 2.0 Proposal @@ -2335,6 +2152,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) return (error); break; #endif +#ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; @@ -2342,6 +2160,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; +#endif default: return (EINVAL); } @@ -2349,18 +2168,19 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: - inp = in6_pcblookup_hash(&V_tcbinfo, &fin6->sin6_addr, - fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 0, - NULL); + inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, + fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, + INPLOOKUP_WLOCKPCB, NULL); break; #endif +#ifdef INET case AF_INET: - inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr, - fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); + inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, + lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; +#endif } if (inp != NULL) { - INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an @@ -2387,7 +2207,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) return (error); } -SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, +SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); @@ -2485,6 +2305,7 @@ tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ +#ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); @@ -2493,6 +2314,7 @@ tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); +#endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); |