/* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.226 2005/05/07 00:41:36 cperciva Exp $ */ /* * $Id$ */ #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define _IP_VHL #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif int tcp_mssdflt = TCP_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); static int tcp_do_rfc1323 = 1; #if !defined(__rtems__) static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, &tcp_rttdflt , 0, ""); SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, &tcp_do_rfc1323 , 0, ""); #endif static void tcp_notify(struct inpcb *, int); /* * Target size of TCP PCB hash table. Will be rounded down to a prime * number. */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 128 #endif /* * Tcp initialization */ void tcp_init() { tcp_iss = random(); /* wrong, but better than a constant */ tcp_ccgen = 1; LIST_INIT(&tcb); tcbinfo.listhead = &tcb; tcbinfo.hashbase = hashinit(TCBHASHSIZE, M_PCB, &tcbinfo.hashmask); if (max_protohdr < sizeof(struct tcpiphdr)) max_protohdr = sizeof(struct tcpiphdr); if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN) panic("tcp_init"); } /* * Create template to be used to send tcp packets on a connection. * Call after host entry created, allocates an mbuf and fills * in a skeletal tcp/ip header, minimizing the amount of work * necessary when the connection is used. */ struct tcpiphdr * tcp_template(tp) struct tcpcb *tp; { register struct inpcb *inp = tp->t_inpcb; register struct mbuf *m; register struct tcpiphdr *n; if ((n = tp->t_template) == 0) { m = m_get(M_DONTWAIT, MT_HEADER); if (m == NULL) return (0); m->m_len = sizeof (struct tcpiphdr); n = mtod(m, struct tcpiphdr *); } n->ti_next = n->ti_prev = 0; n->ti_x1 = 0; n->ti_pr = IPPROTO_TCP; n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip)); n->ti_src = inp->inp_laddr; n->ti_dst = inp->inp_faddr; n->ti_sport = inp->inp_lport; n->ti_dport = inp->inp_fport; n->ti_seq = 0; n->ti_ack = 0; n->ti_x2 = 0; n->ti_off = 5; n->ti_flags = 0; n->ti_win = 0; n->ti_sum = 0; n->ti_urp = 0; return (n); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == 0, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection tp->t_template. If flags are given * then we send a message back to the TCP which originated the * segment ti, and discard the mbuf containing it and any other * attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then ti must point to *inside* the mbuf. */ void tcp_respond(tp, ti, m, ack, seq, flags) struct tcpcb *tp; register struct tcpiphdr *ti; register struct mbuf *m; tcp_seq ack, seq; int flags; { register int tlen; int win = 0; struct route *ro = 0; struct route sro; if (tp) { win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); ro = &tp->t_inpcb->inp_route; } else { ro = &sro; bzero(ro, sizeof *ro); } if (m == NULL) { m = m_gethdr(M_DONTWAIT, MT_HEADER); if (m == NULL) return; #ifdef TCP_COMPAT_42 tlen = 1; #else tlen = 0; #endif m->m_data += max_linkhdr; *mtod(m, struct tcpiphdr *) = *ti; ti = mtod(m, struct tcpiphdr *); flags = TH_ACK; } else { m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ti; m->m_len = sizeof (struct tcpiphdr); tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, u_long); xchg(ti->ti_dport, ti->ti_sport, u_short); #undef xchg } ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen)); tlen += sizeof (struct tcpiphdr); m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = (struct ifnet *) 0; ti->ti_next = ti->ti_prev = 0; ti->ti_x1 = 0; ti->ti_seq = htonl(seq); ti->ti_ack = htonl(ack); ti->ti_x2 = 0; ti->ti_off = sizeof (struct tcphdr) >> 2; ti->ti_flags = flags; if (tp) ti->ti_win = htons((u_short) (win >> tp->rcv_scale)); else ti->ti_win = htons((u_short)win); ti->ti_urp = 0; ti->ti_sum = 0; ti->ti_sum = in_cksum(m, tlen); ((struct ip *)ti)->ip_len = tlen; ((struct ip *)ti)->ip_ttl = ip_defttl; #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, ti, 0); #endif (void) ip_output(m, NULL, ro, 0, NULL); if (ro == &sro && ro->ro_rt) { RTFREE(ro->ro_rt); } } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. */ struct tcpcb * tcp_newtcpcb(inp) struct inpcb *inp; { struct tcpcb *tp; tp = malloc(sizeof(*tp), M_PCB, M_NOWAIT); if (tp == NULL) return ((struct tcpcb *)0); bzero((char *) tp, sizeof(struct tcpcb)); tp->seg_next = tp->seg_prev = (struct tcpiphdr *)tp; tp->t_maxseg = tp->t_maxopd = tcp_mssdflt; if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); tp->t_inpcb = inp; /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = TCPTV_MIN; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; inp->inp_ip_ttl = ip_defttl; inp->inp_ppcb = (caddr_t)tp; return (tp); } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(tp, errnum) register struct tcpcb *tp; int errnum; { struct socket *so = tp->t_inpcb->inp_socket; if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; (void) tcp_output(tp); tcpstat.tcps_drops++; } else tcpstat.tcps_conndrops++; if (errnum == ETIMEDOUT && tp->t_softerror) errnum = tp->t_softerror; so->so_error = errnum; return (tcp_close(tp)); } /* * Close a TCP control block: * discard all space held by the tcp * discard internet protocol block * wake up any sleepers */ struct tcpcb * tcp_close(tp) struct tcpcb *tp; { register struct tcpiphdr *t; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; register struct mbuf *m; register struct rtentry *rt; /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as the 16 samples. * 16 samples is enough for the srtt filter to converge * to within 5% of the correct value; fewer samples and * we could save a very bogus rtt. * * Don't update the default route's characteristics and don't * update anything that the user "locked". */ if (tp->t_rttupdated >= 16 && (rt = inp->inp_route.ro_rt) && ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr != INADDR_ANY) { register u_long i = 0; if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { i = tp->t_srtt * (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE)); if (rt->rt_rmx.rmx_rtt && i) /* * filter this update to half the old & half * the new values, converting scale. * See route.h and tcp_var.h for a * description of the scaling constants. */ rt->rt_rmx.rmx_rtt = (rt->rt_rmx.rmx_rtt + i) / 2; else rt->rt_rmx.rmx_rtt = i; tcpstat.tcps_cachedrtt++; } if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { i = tp->t_rttvar * (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE)); if (rt->rt_rmx.rmx_rttvar && i) rt->rt_rmx.rmx_rttvar = (rt->rt_rmx.rmx_rttvar + i) / 2; else rt->rt_rmx.rmx_rttvar = i; tcpstat.tcps_cachedrttvar++; } /* * update the pipelimit (ssthresh) if it has been updated * already or if a pipesize was specified & the threshhold * got below half the pipesize. I.e., wait for bad news * before we start updating, then update on both good * and bad news. */ if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && ((i = tp->snd_ssthresh) != 0) && rt->rt_rmx.rmx_ssthresh) || i < (rt->rt_rmx.rmx_sendpipe / 2)) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ i = (i + tp->t_maxseg / 2) / tp->t_maxseg; if (i < 2) i = 2; i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr)); if (rt->rt_rmx.rmx_ssthresh) rt->rt_rmx.rmx_ssthresh = (rt->rt_rmx.rmx_ssthresh + i) / 2; else rt->rt_rmx.rmx_ssthresh = i; tcpstat.tcps_cachedssthresh++; } } /* free the reassembly queue, if any */ t = tp->seg_next; while (t != (struct tcpiphdr *)tp) { t = (struct tcpiphdr *)t->ti_next; #if (defined(__GNUC__) && (defined(__arm__) || defined(__mips__))) LD32_UNALGN((struct tcpiphdr *)t->ti_prev,m); #else m = REASS_MBUF((struct tcpiphdr *)t->ti_prev); #endif remque(t->ti_prev); m_freem(m); } if (tp->t_template) (void) m_free(dtom(tp->t_template)); free(tp, M_PCB); inp->inp_ppcb = 0; soisdisconnected(so); in_pcbdetach(inp); tcpstat.tcps_closed++; return ((struct tcpcb *)0); } void tcp_drain() { } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). */ static void tcp_notify(inp, error) struct inpcb *inp; int error; { struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; struct socket *so = inp->inp_socket; /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return; } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) so->so_error = error; else tp->t_softerror = error; soconnwakeup (so); sorwakeup(so); sowwakeup(so); } #ifdef __rtems__ #define INP_INFO_RLOCK(a) #define INP_INFO_RUNLOCK(a) #define INP_LOCK(a) #define INP_UNLOCK(a) #endif static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n, s; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = tcbinfo.ipi_count; req->oldidx = 2 * (sizeof xig) + (n + n/8) * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ s = splnet(); INP_INFO_RLOCK(&tcbinfo); gencnt = tcbinfo.ipi_gencnt; n = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); splx(s); sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xtcpcb)); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; /* xig.xig_sogen = so_gencnt; remove by ccj */ error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; /* ccj add exit if the count is 0 */ if (!n) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return ENOMEM; s = splnet(); INP_INFO_RLOCK(&tcbinfo); for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) #if 0 && cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) #endif inp_list[i++] = inp; INP_UNLOCK(inp); } INP_INFO_RUNLOCK(&tcbinfo); splx(s); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_LOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; caddr_t inp_ppcb; xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb != NULL) bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); else bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); #if 0 if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xt.xt_socket); #endif error = SYSCTL_OUT(req, &xt, sizeof xt); } INP_UNLOCK(inp); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ s = splnet(); INP_INFO_RLOCK(&tcbinfo); xig.xig_gen = tcbinfo.ipi_gencnt; #if 0 xig.xig_sogen = so_gencnt; #endif xig.xig_count = tcbinfo.ipi_count; INP_INFO_RUNLOCK(&tcbinfo); splx(s); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return error; } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); void tcp_ctlinput(cmd, sa, vip) int cmd; struct sockaddr *sa; void *vip; { struct ip *ip = vip; struct tcphdr *th; void (*notify)(struct inpcb *, int) = tcp_notify; if (cmd == PRC_QUENCH) notify = tcp_quench; #if 1 else if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; #endif else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)) return; if (ip != NULL) { #ifdef _IP_VHL th = (struct tcphdr *)((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2)); #else th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); #endif in_pcbnotify(&tcb, sa, th->th_dport, ip->ip_src, th->th_sport, cmd, notify); } else in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify); } /* * When a source quench is received, close congestion window * to one segment. We will gradually open it again as we proceed. */ void tcp_quench(inp, errnum) struct inpcb *inp; int errnum; { struct tcpcb *tp = intotcpcb(inp); if (tp) tp->snd_cwnd = tp->t_maxseg; } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value in the route. Also nudge TCP to send something, * since we know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ void tcp_mtudisc(inp, errnum) struct inpcb *inp; int errnum; { struct tcpcb *tp = intotcpcb(inp); struct rtentry *rt; struct rmxp_tao *taop; struct socket *so = inp->inp_socket; int offered; int mss; if (tp) { rt = tcp_rtlookup(inp); if (!rt || !rt->rt_rmx.rmx_mtu) { tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; return; } taop = rmx_taop(rt->rt_rmx); offered = taop->tao_mssopt; mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); if (offered) mss = min(mss, offered); /* * XXX - The above conditional probably violates the TCP * spec. The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ if (tp->t_maxopd <= mss) return; tp->t_maxopd = mss; if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) mss -= TCPOLEN_TSTAMP_APPA; if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) mss -= TCPOLEN_CC_APPA; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) mss &= ~(MCLBYTES-1); #else if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif if (so->so_snd.sb_hiwat < mss) mss = so->so_snd.sb_hiwat; tp->t_maxseg = mss; tcpstat.tcps_mturesent++; tp->t_rtt = 0; tp->snd_nxt = tp->snd_una; tcp_output(tp); } } /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return NULL. This routine * is called by TCP routines that access the rmx structure and by tcp_mss * to get the interface MTU. */ struct rtentry * tcp_rtlookup(inp) struct inpcb *inp; { struct route *ro; struct rtentry *rt; ro = &inp->inp_route; rt = ro->ro_rt; if (rt == NULL || !(rt->rt_flags & RTF_UP)) { /* No route yet, so try to acquire one */ if (inp->inp_faddr.s_addr != INADDR_ANY) { ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(ro->ro_dst); ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = inp->inp_faddr; rtalloc(ro); rt = ro->ro_rt; } } return rt; } /* * Return a pointer to the cached information about the remote host. * The cached information is stored in the protocol specific part of * the route metrics. */ struct rmxp_tao * tcp_gettaocache(inp) struct inpcb *inp; { struct rtentry *rt = tcp_rtlookup(inp); /* Make sure this is a host route and is up. */ if (rt == NULL || (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) return NULL; return rmx_taop(rt->rt_rmx); }