From fbc7459c2c3a86a4d94e19463417b759900ff51c Mon Sep 17 00:00:00 2001 From: Vijay Kumar Banerjee Date: Wed, 3 Feb 2021 19:46:50 -0700 Subject: Initial Commit: Add all files from RTEMS libnetworking directory --- netinet/icmp_var.h | 81 ++ netinet/if_ether.c | 644 +++++++++++++ netinet/if_ether.h | 176 ++++ netinet/igmp.c | 473 ++++++++++ netinet/igmp.h | 97 ++ netinet/igmp_var.h | 105 +++ netinet/in.c | 711 ++++++++++++++ netinet/in_cksum.c | 186 ++++ netinet/in_cksum_arm.h | 236 +++++ netinet/in_cksum_i386.h | 202 ++++ netinet/in_cksum_m68k.h | 221 +++++ netinet/in_cksum_nios2.h | 248 +++++ netinet/in_cksum_powerpc.h | 172 ++++ netinet/in_cksum_sparc.h | 269 ++++++ netinet/in_pcb.c | 733 +++++++++++++++ netinet/in_pcb.h | 152 +++ netinet/in_proto.c | 217 +++++ netinet/in_rmx.c | 386 ++++++++ netinet/in_systm.h | 58 ++ netinet/in_var.h | 240 +++++ netinet/ip.h | 230 +++++ netinet/ip_divert.c | 387 ++++++++ netinet/ip_fw.c | 1082 +++++++++++++++++++++ netinet/ip_fw.h | 183 ++++ netinet/ip_icmp.c | 771 +++++++++++++++ netinet/ip_icmp.h | 212 +++++ netinet/ip_input.c | 1510 ++++++++++++++++++++++++++++++ netinet/ip_mroute.c | 2232 ++++++++++++++++++++++++++++++++++++++++++++ netinet/ip_mroute.h | 271 ++++++ netinet/ip_output.c | 1336 ++++++++++++++++++++++++++ netinet/ip_var.h | 215 +++++ netinet/raw_ip.c | 492 ++++++++++ netinet/tcp_debug.c | 172 ++++ netinet/tcp_debug.h | 69 ++ netinet/tcp_fsm.h | 91 ++ netinet/tcp_input.c | 2151 ++++++++++++++++++++++++++++++++++++++++++ netinet/tcp_output.c | 757 +++++++++++++++ netinet/tcp_seq.h | 99 ++ netinet/tcp_subr.c | 729 +++++++++++++++ netinet/tcp_timer.c | 385 ++++++++ netinet/tcp_timer.h | 133 +++ netinet/tcp_usrreq.c | 845 +++++++++++++++++ netinet/tcp_var.h | 414 ++++++++ netinet/tcpip.h | 67 ++ netinet/udp.h | 50 + netinet/udp_usrreq.c | 736 +++++++++++++++ netinet/udp_var.h | 106 +++ 47 files changed, 21332 insertions(+) create mode 100644 netinet/icmp_var.h create mode 100644 netinet/if_ether.c create mode 100644 netinet/if_ether.h create mode 100644 netinet/igmp.c create mode 100644 netinet/igmp.h create mode 100644 netinet/igmp_var.h create mode 100644 netinet/in.c create mode 100644 netinet/in_cksum.c create mode 100644 netinet/in_cksum_arm.h create mode 100644 netinet/in_cksum_i386.h create mode 100644 netinet/in_cksum_m68k.h create mode 100644 netinet/in_cksum_nios2.h create mode 100644 netinet/in_cksum_powerpc.h create mode 100644 netinet/in_cksum_sparc.h create mode 100644 netinet/in_pcb.c create mode 100644 netinet/in_pcb.h create mode 100644 netinet/in_proto.c create mode 100644 netinet/in_rmx.c create mode 100644 netinet/in_systm.h create mode 100644 netinet/in_var.h create mode 100644 netinet/ip.h create mode 100644 netinet/ip_divert.c create mode 100644 netinet/ip_fw.c create mode 100644 netinet/ip_fw.h create mode 100644 netinet/ip_icmp.c create mode 100644 netinet/ip_icmp.h create mode 100644 netinet/ip_input.c create mode 100644 netinet/ip_mroute.c create mode 100644 netinet/ip_mroute.h create mode 100644 netinet/ip_output.c create mode 100644 netinet/ip_var.h create mode 100644 netinet/raw_ip.c create mode 100644 netinet/tcp_debug.c create mode 100644 netinet/tcp_debug.h create mode 100644 netinet/tcp_fsm.h create mode 100644 netinet/tcp_input.c create mode 100644 netinet/tcp_output.c create mode 100644 netinet/tcp_seq.h create mode 100644 netinet/tcp_subr.c create mode 100644 netinet/tcp_timer.c create mode 100644 netinet/tcp_timer.h create mode 100644 netinet/tcp_usrreq.c create mode 100644 netinet/tcp_var.h create mode 
100644 netinet/tcpip.h create mode 100644 netinet/udp.h create mode 100644 netinet/udp_usrreq.c create mode 100644 netinet/udp_var.h (limited to 'netinet') diff --git a/netinet/icmp_var.h b/netinet/icmp_var.h new file mode 100644 index 0000000..31eea3b --- /dev/null +++ b/netinet/icmp_var.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)icmp_var.h 8.1 (Berkeley) 6/10/93 + */ + +#ifndef _NETINET_ICMP_VAR_H_ +#define _NETINET_ICMP_VAR_H_ + +#include /* ICMP_MAXTYPE */ + +/* + * Variables related to this implementation + * of the internet control message protocol. 
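+ *
+ * The counters below are exported read-only to user space through the
+ * ICMPCTL_STATS sysctl object defined further down (net.inet.icmp.stats).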
+ */ +struct icmpstat { +/* statistics related to icmp packets generated */ + u_long icps_error; /* # of calls to icmp_error */ + u_long icps_oldshort; /* no error 'cuz old ip too short */ + u_long icps_oldicmp; /* no error 'cuz old was icmp */ + u_long icps_outhist[ICMP_MAXTYPE + 1]; +/* statistics related to input messages processed */ + u_long icps_badcode; /* icmp_code out of range */ + u_long icps_tooshort; /* packet < ICMP_MINLEN */ + u_long icps_checksum; /* bad checksum */ + u_long icps_badlen; /* calculated bound mismatch */ + u_long icps_reflect; /* number of responses */ + u_long icps_inhist[ICMP_MAXTYPE + 1]; + u_long icps_allecho; /* all echo requests dropped */ + u_long icps_bmcastecho; /* b/mcast echo requests dropped */ + u_long icps_bmcasttstamp; /* b/mcast tstamp requests dropped */ +}; + +/* + * Names for ICMP sysctl objects + */ +#define ICMPCTL_MASKREPL 1 /* allow replies to netmask requests */ +#define ICMPCTL_STATS 2 /* statistics (read-only) */ +#define ICMPCTL_MAXID 3 + +#define ICMPCTL_NAMES { \ + { 0, 0 }, \ + { "maskrepl", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ +} + +#ifdef _KERNEL +SYSCTL_DECL(_net_inet_icmp); +extern struct icmpstat icmpstat; +#endif + +#endif diff --git a/netinet/if_ether.c b/netinet/if_ether.c new file mode 100644 index 0000000..e068674 --- /dev/null +++ b/netinet/if_ether.c @@ -0,0 +1,644 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/netinet/if_ether.c,v 1.136 2005/03/13 11:23:22 glebius Exp $ + */ + + +/* + * Ethernet address resolution protocol. + * TODO: + * add "inuse/lock" bit (or ref. 
count) along with valid bit + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opt_inet.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define SIN(s) ((struct sockaddr_in *)s) +#define SDL(s) ((struct sockaddr_dl *)s) + +SYSCTL_DECL(_net_link_ether); +SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); + +/* timer values */ +static int arpt_prune = (5*60*1); /* walk list every 5 minutes */ +static int arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ +static int arpt_down = 20; /* once declared down, don't send for 20 sec */ + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, prune_intvl, CTLFLAG_RW, + &arpt_prune, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, + &arpt_keep, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, host_down_time, CTLFLAG_RW, + &arpt_down, 0, ""); + +#define rt_expire rt_rmx.rmx_expire + +struct llinfo_arp { + LIST_ENTRY(llinfo_arp) la_le; + struct rtentry *la_rt; + struct mbuf *la_hold; /* last packet until resolved/timeout */ + long la_asked; /* last time we QUERIED for this addr */ +#define la_timer la_rt->rt_rmx.rmx_expire /* deletion time in seconds */ +}; + +static LIST_HEAD(, llinfo_arp) llinfo_arp; + +struct ifqueue arpintrq = {0, 0, 0, 50, 0}; +static int arp_inuse, arp_allocated; + +static int arp_maxtries = 5; +static int useloopback = 1; /* use loopback interface for local traffic */ +static int arp_proxyall = 0; + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, + &arp_maxtries, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, + &useloopback, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, + &arp_proxyall, 0, ""); + +static void arp_rtrequest(int, struct rtentry *, struct sockaddr *); +static void arprequest(struct arpcom *, struct in_addr *, struct in_addr *, u_char *); +void arpintr(void); +static void arptfree(struct llinfo_arp *); +static void arptimer(void *); +static struct llinfo_arp + *arplookup(u_long, int, int); +#ifdef INET +static void in_arpinput(struct mbuf *); +#endif + +/* + * Timeout routine. Age arp_tab entries periodically. + */ +/* ARGSUSED */ +static void +arptimer(void *ignored_arg) +{ + int s = splnet(); + struct llinfo_arp *la, *ola; + + la = llinfo_arp.lh_first; + timeout(arptimer, (caddr_t)0, arpt_prune * hz); + while ((ola = la) != 0) { + register struct rtentry *rt = la->la_rt; + la = la->la_le.le_next; + if (rt->rt_expire && rt->rt_expire <= rtems_bsdnet_seconds_since_boot()) + arptfree(ola); /* timer has expired, clear */ + } + splx(s); +} + +/* + * Parallel to llc_rtrequest. + */ +static void +arp_rtrequest(int req, struct rtentry *rt, struct sockaddr *sa) +{ + struct sockaddr *gate; + struct llinfo_arp *la; + static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK, 0, 0, 0, 0, 0, { 0 } }; + static int arpinit_done; + + if (!arpinit_done) { + arpinit_done = 1; + LIST_INIT(&llinfo_arp); + timeout(arptimer, (caddr_t)0, hz); + } + if (rt->rt_flags & RTF_GATEWAY) + return; + gate = rt->rt_gateway; + la = (struct llinfo_arp *)rt->rt_llinfo; + switch (req) { + + case RTM_ADD: + /* + * XXX: If this is a manually added route to interface + * such as older version of routed or gated might provide, + * restore cloning bit. 
+ */ + if ((rt->rt_flags & RTF_HOST) == 0 && + rt_mask(rt) != NULL && + SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) + rt->rt_flags |= RTF_CLONING; + if (rt->rt_flags & RTF_CLONING) { + /* + * Case 1: This route should come from a route to iface. + */ + rt_setgate(rt, rt_key(rt), + (struct sockaddr *)&null_sdl); + gate = rt->rt_gateway; + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + rt->rt_expire = rtems_bsdnet_seconds_since_boot(); + break; + } + /* Announce a new entry if requested. */ + if (rt->rt_flags & RTF_ANNOUNCE) + arprequest((struct arpcom *)rt->rt_ifp, + &SIN(rt_key(rt))->sin_addr, + &SIN(rt_key(rt))->sin_addr, + (u_char *)LLADDR(SDL(gate))); + /*FALLTHROUGH*/ + case RTM_RESOLVE: + if (gate->sa_family != AF_LINK || + gate->sa_len < sizeof(null_sdl)) { + log(LOG_DEBUG, "arp_rtrequest: bad gateway value\n"); + break; + } + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + if (la != 0) + break; /* This happens on a route change */ + /* + * Case 2: This route may come from cloning, or a manual route + * add with a LL address. + */ + R_Malloc(la, struct llinfo_arp *, sizeof(*la)); + rt->rt_llinfo = (caddr_t)la; + if (la == 0) { + log(LOG_DEBUG, "arp_rtrequest: malloc failed\n"); + break; + } + arp_inuse++; + arp_allocated++; + Bzero(la, sizeof(*la)); + la->la_rt = rt; + rt->rt_flags |= RTF_LLINFO; + LIST_INSERT_HEAD(&llinfo_arp, la, la_le); + + /* + * This keeps the multicast addresses from showing up + * in `arp -a' listings as unresolved. It's not actually + * functional. Then the same for broadcast. + */ + if (IN_MULTICAST(ntohl(SIN(rt_key(rt))->sin_addr.s_addr))) { + ETHER_MAP_IP_MULTICAST(&SIN(rt_key(rt))->sin_addr, + LLADDR(SDL(gate))); + SDL(gate)->sdl_alen = 6; + rt->rt_expire = 0; + } + if (in_broadcast(SIN(rt_key(rt))->sin_addr, rt->rt_ifp)) { + memcpy(LLADDR(SDL(gate)), etherbroadcastaddr, 6); + SDL(gate)->sdl_alen = 6; + rt->rt_expire = 0; + } + + if (SIN(rt_key(rt))->sin_addr.s_addr == + (IA_SIN(rt->rt_ifa))->sin_addr.s_addr) { + /* + * This test used to be + * if (loif.if_flags & IFF_UP) + * It allowed local traffic to be forced + * through the hardware by configuring the loopback down. + * However, it causes problems during network configuration + * for boards that can't receive packets they send. + * It is now necessary to clear "useloopback" and remove + * the route to force traffic out to the hardware. + */ + rt->rt_expire = 0; + Bcopy(((struct arpcom *)rt->rt_ifp)->ac_enaddr, + LLADDR(SDL(gate)), SDL(gate)->sdl_alen = 6); + if (useloopback) + rt->rt_ifp = loif; + + } + break; + + case RTM_DELETE: + if (la == 0) + break; + arp_inuse--; + LIST_REMOVE(la, la_le); + rt->rt_llinfo = 0; + rt->rt_flags &= ~RTF_LLINFO; + if (la->la_hold) + m_freem(la->la_hold); + Free((caddr_t)la); + } +} + +/* + * Broadcast an ARP request. 
Caller specifies: + * - arp header source ip address + * - arp header target ip address + * - arp header source ethernet address + */ +static void +arprequest(struct arpcom *ac, struct in_addr *sip, struct in_addr *tip, u_char *enaddr) +{ + struct mbuf *m; + struct ether_header *eh; + struct ether_arp *ea; + struct sockaddr sa; + + if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) + return; + m->m_len = sizeof(*ea); + m->m_pkthdr.len = sizeof(*ea); + MH_ALIGN(m, sizeof(*ea)); + ea = mtod(m, struct ether_arp *); + eh = (struct ether_header *)sa.sa_data; + bzero((caddr_t)ea, sizeof (*ea)); + (void)memcpy(eh->ether_dhost, etherbroadcastaddr, sizeof(eh->ether_dhost)); + eh->ether_type = htons(ETHERTYPE_ARP); /* if_output will not swap */ + ea->arp_hrd = htons(ARPHRD_ETHER); + ea->arp_pro = htons(ETHERTYPE_IP); + ea->arp_hln = sizeof(ea->arp_sha); /* hardware address length */ + ea->arp_pln = sizeof(ea->arp_spa); /* protocol address length */ + ea->arp_op = htons(ARPOP_REQUEST); + (void)memcpy(ea->arp_sha, enaddr, sizeof(ea->arp_sha)); + (void)memcpy(ea->arp_spa, sip, sizeof(ea->arp_spa)); + (void)memcpy(ea->arp_tpa, tip, sizeof(ea->arp_tpa)); + sa.sa_family = AF_UNSPEC; + sa.sa_len = sizeof(sa); + (*ac->ac_if.if_output)(&ac->ac_if, m, &sa, (struct rtentry *)0); +} + +/* + * Resolve an IP address into an ethernet address. If success, + * desten is filled in. If there is no entry in arptab, + * set one up and broadcast a request for the IP address. + * Hold onto this mbuf and resend it once the address + * is finally resolved. A return value of 1 indicates + * that desten has been filled in and the packet should be sent + * normally; a 0 return indicates that the packet has been + * taken over here, either now or for later transmission. + */ +int +arpresolve( + struct arpcom *ac, + struct rtentry *rt, + struct mbuf *m, + struct sockaddr *dst, + u_char *desten, + struct rtentry *rt0) +{ + struct llinfo_arp *la = 0; + struct sockaddr_dl *sdl; + + if (m->m_flags & M_BCAST) { /* broadcast */ + (void)memcpy(desten, etherbroadcastaddr, sizeof(etherbroadcastaddr)); + return (1); + } + if (m->m_flags & M_MCAST) { /* multicast */ + ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); + return(1); + } + if (rt) + la = (struct llinfo_arp *)rt->rt_llinfo; + else { + la = arplookup(SIN(dst)->sin_addr.s_addr, 1, 0); + if (la) + rt = la->la_rt; + } + if (la == 0 || rt == 0) { + char addrbuf[INET_ADDRSTRLEN]; + log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s\n", + inet_ntoa_r(SIN(dst)->sin_addr, addrbuf)); + m_freem(m); + return (0); + } + sdl = SDL(rt->rt_gateway); + /* + * Check the address family and length is valid, the address + * is resolved; otherwise, try to resolve. + */ + if ((rt->rt_expire == 0 || rt->rt_expire > rtems_bsdnet_seconds_since_boot()) && + sdl->sdl_family == AF_LINK && sdl->sdl_alen != 0) { + bcopy(LLADDR(sdl), desten, sdl->sdl_alen); + return 1; + } + /* + * There is an arptab entry, but no ethernet address + * response yet. Replace the held mbuf with this + * latest one. 
+ */ + if (la->la_hold) + m_freem(la->la_hold); + la->la_hold = m; + if (rt->rt_expire) { + rt->rt_flags &= ~RTF_REJECT; + if (la->la_asked == 0 || rt->rt_expire != rtems_bsdnet_seconds_since_boot()) { + rt->rt_expire = rtems_bsdnet_seconds_since_boot(); + if (la->la_asked++ < arp_maxtries) + arprequest(ac, + &(SIN(rt->rt_ifa->ifa_addr)->sin_addr), + &(SIN(dst)->sin_addr), + ac->ac_enaddr); + else { + rt->rt_flags |= RTF_REJECT; + rt->rt_expire += arpt_down; + la->la_asked = 0; + } + + } + } + return (0); +} + +/* + * Common length and type checks are done here, + * then the protocol-specific routine is called. + */ +void +arpintr(void) +{ + struct mbuf *m; + struct arphdr *ar; + int s; + + while (arpintrq.ifq_head) { + s = splimp(); + IF_DEQUEUE(&arpintrq, m); + splx(s); + if (m == 0 || (m->m_flags & M_PKTHDR) == 0) + panic("arpintr"); + if (m->m_len >= sizeof(struct arphdr) && + (ar = mtod(m, struct arphdr *)) && + ntohs(ar->ar_hrd) == ARPHRD_ETHER && + m->m_len >= + sizeof(struct arphdr) + 2 * ar->ar_hln + 2 * ar->ar_pln) + + switch (ntohs(ar->ar_pro)) { + + case ETHERTYPE_IP: + in_arpinput(m); + continue; + } + m_freem(m); + } +} + +NETISR_SET(NETISR_ARP, arpintr); + +/* + * ARP for Internet protocols on 10 Mb/s Ethernet. + * Algorithm is that given in RFC 826. + * In addition, a sanity check is performed on the sender + * protocol address, to catch impersonators. + * We no longer handle negotiations for use of trailer protocol: + * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent + * along with IP replies if we wanted trailers sent to us, + * and also sent them in response to IP replies. + * This allowed either end to announce the desire to receive + * trailer packets. + * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, + * but formerly didn't normally send requests. + */ +static void +in_arpinput(struct mbuf *m) +{ + struct ether_arp *ea; + struct arpcom *ac = (struct arpcom *)m->m_pkthdr.rcvif; + struct ether_header *eh; + struct llinfo_arp *la = 0; + struct rtentry *rt; + struct in_ifaddr *ia; + struct in_ifaddr *maybe_ia = 0; + struct sockaddr_dl *sdl; + struct sockaddr sa; + struct in_addr isaddr, itaddr, myaddr; + int op; + char addrbuf[INET_ADDRSTRLEN]; + + ea = mtod(m, struct ether_arp *); + op = ntohs(ea->arp_op); + (void)memcpy(&isaddr, ea->arp_spa, sizeof (isaddr)); + (void)memcpy(&itaddr, ea->arp_tpa, sizeof (itaddr)); + for (ia = in_ifaddr; ia; ia = ia->ia_next) + if (ia->ia_ifp == &ac->ac_if) { + maybe_ia = ia; + if ((itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) || + (isaddr.s_addr == ia->ia_addr.sin_addr.s_addr)) + break; + } + if (maybe_ia == 0) { + m_freem(m); + return; + } + myaddr = ia ? ia->ia_addr.sin_addr : maybe_ia->ia_addr.sin_addr; + if (!bcmp((caddr_t)ea->arp_sha, (caddr_t)ac->ac_enaddr, + sizeof (ea->arp_sha))) { + m_freem(m); /* it's from me, ignore it. 
*/ + return; + } + if (!bcmp((caddr_t)ea->arp_sha, (caddr_t)etherbroadcastaddr, + sizeof (ea->arp_sha))) { + log(LOG_ERR, + "arp: ether address is broadcast for IP address %s!\n", + inet_ntoa_r(isaddr, addrbuf)); + m_freem(m); + return; + } + if (isaddr.s_addr == myaddr.s_addr) { + log(LOG_ERR, + "arp: %6D is using my IP address %s!\n", + ea->arp_sha, ":", inet_ntoa_r(isaddr, addrbuf)); + itaddr = myaddr; + goto reply; + } + la = arplookup(isaddr.s_addr, itaddr.s_addr == myaddr.s_addr, 0); + if (la && (rt = la->la_rt) && (sdl = SDL(rt->rt_gateway))) { + if (sdl->sdl_alen && + bcmp((caddr_t)ea->arp_sha, LLADDR(sdl), sdl->sdl_alen)) + log(LOG_INFO, "arp: %s moved from %6D to %6D\n", + inet_ntoa_r(isaddr, addrbuf), + (u_char *)LLADDR(sdl), ":", ea->arp_sha, ":"); + (void)memcpy(LLADDR(sdl), ea->arp_sha, sizeof(ea->arp_sha)); + sdl->sdl_alen = sizeof(ea->arp_sha); + if (rt->rt_expire) + rt->rt_expire = rtems_bsdnet_seconds_since_boot() + arpt_keep; + rt->rt_flags &= ~RTF_REJECT; + la->la_asked = 0; + if (la->la_hold) { + (*ac->ac_if.if_output)(&ac->ac_if, la->la_hold, + rt_key(rt), rt); + la->la_hold = 0; + } + } +reply: + if (op != ARPOP_REQUEST) { + m_freem(m); + return; + } + if (itaddr.s_addr == myaddr.s_addr) { + /* I am the target */ + (void)memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha)); + (void)memcpy(ea->arp_sha, ac->ac_enaddr, sizeof(ea->arp_sha)); + } else { + la = arplookup(itaddr.s_addr, 0, SIN_PROXY); + if (la == NULL) { + struct sockaddr_in sin; + + if (!arp_proxyall) { + m_freem(m); + return; + } + + bzero(&sin, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_len = sizeof sin; + sin.sin_addr = itaddr; + + rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); + if (!rt) { + m_freem(m); + return; + } + /* + * Don't send proxies for nodes on the same interface + * as this one came out of, or we'll get into a fight + * over who claims what Ether address. + */ + if (rt->rt_ifp == &ac->ac_if) { + rtfree(rt); + m_freem(m); + return; + } + (void)memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha)); + (void)memcpy(ea->arp_sha, ac->ac_enaddr, sizeof(ea->arp_sha)); + rtfree(rt); +#ifdef DEBUG_PROXY + printf("arp: proxying for %s\n", + inet_ntoa_r(itaddr, addrbuf)); +#endif + } else { + rt = la->la_rt; + (void)memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha)); + sdl = SDL(rt->rt_gateway); + (void)memcpy(ea->arp_sha, LLADDR(sdl), sizeof(ea->arp_sha)); + } + } + + (void)memcpy(ea->arp_tpa, ea->arp_spa, sizeof(ea->arp_spa)); + (void)memcpy(ea->arp_spa, &itaddr, sizeof(ea->arp_spa)); + ea->arp_op = htons(ARPOP_REPLY); + ea->arp_pro = htons(ETHERTYPE_IP); /* let's be sure! */ + eh = (struct ether_header *)sa.sa_data; + (void)memcpy(eh->ether_dhost, ea->arp_tha, sizeof(eh->ether_dhost)); + eh->ether_type = htons(ETHERTYPE_ARP); + sa.sa_family = AF_UNSPEC; + sa.sa_len = sizeof(sa); + (*ac->ac_if.if_output)(&ac->ac_if, m, &sa, (struct rtentry *)0); + return; +} + +/* + * Free an arp entry. + */ +static void +arptfree(struct llinfo_arp *la) +{ + struct rtentry *rt = la->la_rt; + struct sockaddr_dl *sdl; + if (rt == 0) + panic("arptfree"); + if (rt->rt_refcnt > 0 && (sdl = SDL(rt->rt_gateway)) && + sdl->sdl_family == AF_LINK) { + sdl->sdl_alen = 0; + la->la_asked = 0; + rt->rt_flags &= ~RTF_REJECT; + return; + } + rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, rt_mask(rt), + 0, (struct rtentry **)0); +} +/* + * Lookup or enter a new address in arptab. 
+ */ +static struct llinfo_arp * +arplookup(u_long addr, int create, int proxy) +{ + struct rtentry *rt; + static struct sockaddr_inarp sin = {sizeof(sin), AF_INET, 0, { 0 }, { 0 }, 0, 0 }; + const char *why = 0; + + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + sin.sin_other = proxy ? SIN_PROXY : 0; + rt = rtalloc1((struct sockaddr *)&sin, create, 0UL); + if (rt == 0) + return (0); + rt->rt_refcnt--; + + if (rt->rt_flags & RTF_GATEWAY) + why = "host is not on local network"; + else if ((rt->rt_flags & RTF_LLINFO) == 0) + why = "could not allocate llinfo"; + else if (rt->rt_gateway->sa_family != AF_LINK) + why = "gateway route is not ours"; + + if (why && create) { + char addrbuf[INET_ADDRSTRLEN]; + log(LOG_DEBUG, "arplookup %s failed: %s\n", + inet_ntoa_r(sin.sin_addr, addrbuf), why); + return 0; + } else if (why) { + return 0; + } + return ((struct llinfo_arp *)rt->rt_llinfo); +} + +void +arp_ifinit(struct arpcom *ac, struct ifaddr *ifa) +{ + if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) + arprequest(ac, &(IA_SIN(ifa)->sin_addr), + &(IA_SIN(ifa)->sin_addr), ac->ac_enaddr); + ifa->ifa_rtrequest = arp_rtrequest; + ifa->ifa_flags |= RTF_CLONING; +} diff --git a/netinet/if_ether.h b/netinet/if_ether.h new file mode 100644 index 0000000..f45c414 --- /dev/null +++ b/netinet/if_ether.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_ether.h 8.3 (Berkeley) 5/2/95 + * $FreeBSD: src/sys/netinet/if_ether.h,v 1.32 2005/02/22 13:04:03 glebius Exp $ + */ + + +#ifndef _NETINET_IF_ETHER_H_ +#define _NETINET_IF_ETHER_H_ + +#include /* struct in_addr */ +#include +#include + +#ifdef _KERNEL +/* + * Macro to map an IP multicast address to an Ethernet multicast address. + * The high-order 25 bits of the Ethernet address are statically assigned, + * and the low-order 23 bits are taken from the low end of the IP address. 
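+ *
+ * For example, the group address 224.1.2.3 (0xe0010203) maps to the
+ * Ethernet address 01:00:5e:01:02:03.  Because only 23 of the 28
+ * group-address bits are used, 32 distinct IP multicast groups share
+ * each mapped Ethernet address (e.g. 224.129.2.3 and 225.1.2.3 map to
+ * the same one).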
+ */ +#define ETHER_MAP_IP_MULTICAST(ipaddr, enaddr) \ + /* struct in_addr *ipaddr; */ \ + /* u_char enaddr[ETHER_ADDR_LEN]; */ \ +{ \ + (enaddr)[0] = 0x01; \ + (enaddr)[1] = 0x00; \ + (enaddr)[2] = 0x5e; \ + (enaddr)[3] = ((u_char *)ipaddr)[1] & 0x7f; \ + (enaddr)[4] = ((u_char *)ipaddr)[2]; \ + (enaddr)[5] = ((u_char *)ipaddr)[3]; \ +} +#endif + +/* + * Ethernet Address Resolution Protocol. + * + * See RFC 826 for protocol description. Structure below is adapted + * to resolving internet addresses. Field names used correspond to + * RFC 826. + */ +struct ether_arp { + struct arphdr ea_hdr; /* fixed-size header */ + u_char arp_sha[ETHER_ADDR_LEN]; /* sender hardware address */ + u_char arp_spa[4]; /* sender protocol address */ + u_char arp_tha[ETHER_ADDR_LEN]; /* target hardware address */ + u_char arp_tpa[4]; /* target protocol address */ +}; +#define arp_hrd ea_hdr.ar_hrd +#define arp_pro ea_hdr.ar_pro +#define arp_hln ea_hdr.ar_hln +#define arp_pln ea_hdr.ar_pln +#define arp_op ea_hdr.ar_op + +struct sockaddr_inarp { + u_char sin_len; + u_char sin_family; + u_short sin_port; + struct in_addr sin_addr; + struct in_addr sin_srcaddr; + u_short sin_tos; + u_short sin_other; +#define SIN_PROXY 1 +}; +/* + * IP and ethernet specific routing flags + */ +#define RTF_USETRAILERS RTF_PROTO1 /* use trailers */ +#define RTF_ANNOUNCE RTF_PROTO2 /* announce new arp entry */ + +#ifdef _KERNEL +extern u_char etherbroadcastaddr[ETHER_ADDR_LEN]; +extern u_char ether_ipmulticast_min[ETHER_ADDR_LEN]; +extern u_char ether_ipmulticast_max[ETHER_ADDR_LEN]; +extern struct ifqueue arpintrq; + +int arpresolve(struct arpcom *, struct rtentry *, struct mbuf *, + struct sockaddr *, u_char *, struct rtentry *); +void arp_ifinit(struct arpcom *, struct ifaddr *); +int ether_addmulti(struct ifreq *, struct arpcom *); +int ether_delmulti(struct ifreq *, struct arpcom *); + +/* + * Ethernet multicast address structure. There is one of these for each + * multicast address or range of multicast addresses that we are supposed + * to listen to on a particular interface. They are kept in a linked list, + * rooted in the interface's arpcom structure. (This really has nothing to + * do with ARP, or with the Internet address family, but this appears to be + * the minimally-disrupting place to put it.) + */ +struct ether_multi { + u_char enm_addrlo[ETHER_ADDR_LEN]; /* low or only address of range */ + u_char enm_addrhi[ETHER_ADDR_LEN]; /* high or only address of range */ + struct arpcom *enm_ac; /* back pointer to arpcom */ + u_int enm_refcount; /* no. claims to this addr/range */ + struct ether_multi *enm_next; /* ptr to next ether_multi */ +}; + +/* + * Structure used by macros below to remember position when stepping through + * all of the ether_multi records. + */ +struct ether_multistep { + struct ether_multi *e_enm; +}; + +/* + * Macro for looking up the ether_multi record for a given range of Ethernet + * multicast addresses connected to a given arpcom structure. If no matching + * record is found, "enm" returns NULL. + */ +#define ETHER_LOOKUP_MULTI(addrlo, addrhi, ac, enm) \ + /* u_char addrlo[ETHER_ADDR_LEN]; */ \ + /* u_char addrhi[ETHER_ADDR_LEN]; */ \ + /* struct arpcom *ac; */ \ + /* struct ether_multi *enm; */ \ +{ \ + for ((enm) = (ac)->ac_multiaddrs; \ + (enm) != NULL && \ + (bcmp((enm)->enm_addrlo, (addrlo), ETHER_ADDR_LEN) != 0 || \ + bcmp((enm)->enm_addrhi, (addrhi), ETHER_ADDR_LEN) != 0); \ + (enm) = (enm)->enm_next); \ +} + +/* + * Macro to step through all of the ether_multi records, one at a time. 
+ * The current position is remembered in "step", which the caller must + * provide. ETHER_FIRST_MULTI(), below, must be called to initialize "step" + * and get the first record. Both macros return a NULL "enm" when there + * are no remaining records. + */ +#define ETHER_NEXT_MULTI(step, enm) \ + /* struct ether_multistep step; */ \ + /* struct ether_multi *enm; */ \ +{ \ + if (((enm) = (step).e_enm) != NULL) \ + (step).e_enm = (enm)->enm_next; \ +} + +#define ETHER_FIRST_MULTI(step, ac, enm) \ + /* struct ether_multistep step; */ \ + /* struct arpcom *ac; */ \ + /* struct ether_multi *enm; */ \ +{ \ + (step).e_enm = (ac)->ac_multiaddrs; \ + ETHER_NEXT_MULTI((step), (enm)); \ +} + +#endif + +#endif diff --git a/netinet/igmp.c b/netinet/igmp.c new file mode 100644 index 0000000..7078914 --- /dev/null +++ b/netinet/igmp.c @@ -0,0 +1,473 @@ +#include + +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)igmp.c 8.1 (Berkeley) 7/19/93 + */ + +/* + * Internet Group Management Protocol (IGMP) routines. + * + * Written by Steve Deering, Stanford, May 1988. + * Modified by Rosen Sharma, Stanford, Aug 1994. + * Modified by Bill Fenner, Xerox PARC, Feb 1995. + * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. 
+ * + * MULTICAST Revision: 3.5.1.4 + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct router_info * + find_rti (struct ifnet *ifp); + +static struct igmpstat igmpstat; + +SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RD, + &igmpstat, igmpstat, ""); + +static int igmp_timers_are_running; +static u_long igmp_all_hosts_group; +static u_long igmp_all_rtrs_group; +static struct mbuf *router_alert; +static struct router_info *Head; + +static void igmp_sendpkt(struct in_multi *, int, unsigned long); + +void +igmp_init(void) +{ + struct ipoption *ra; + + /* + * To avoid byte-swapping the same value over and over again. + */ + igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP); + igmp_all_rtrs_group = htonl(INADDR_ALLRTRS_GROUP); + + igmp_timers_are_running = 0; + + /* + * Construct a Router Alert option to use in outgoing packets + */ + MGET(router_alert, M_DONTWAIT, MT_DATA); + ra = mtod(router_alert, struct ipoption *); + ra->ipopt_dst.s_addr = 0; + ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ + ra->ipopt_list[1] = 0x04; /* 4 bytes long */ + ra->ipopt_list[2] = 0x00; + ra->ipopt_list[3] = 0x00; + router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; + + Head = (struct router_info *) 0; +} + +static struct router_info * +find_rti(struct ifnet *ifp) +{ + register struct router_info *rti = Head; + +#ifdef IGMP_DEBUG + printf("[igmp.c, _find_rti] --> entering \n"); +#endif + while (rti) { + if (rti->rti_ifp == ifp) { +#ifdef IGMP_DEBUG + printf("[igmp.c, _find_rti] --> found old entry \n"); +#endif + return rti; + } + rti = rti->rti_next; + } + MALLOC(rti, struct router_info *, sizeof *rti, M_MRTABLE, M_NOWAIT); + rti->rti_ifp = ifp; + rti->rti_type = IGMP_V2_ROUTER; + rti->rti_time = 0; + rti->rti_next = Head; + Head = rti; +#ifdef IGMP_DEBUG + printf("[igmp.c, _find_rti] --> created an entry \n"); +#endif + return rti; +} + +void +igmp_input(struct mbuf *m, int iphlen) +{ + register struct igmp *igmp; + register struct ip *ip; + register int igmplen; + register struct ifnet *ifp = m->m_pkthdr.rcvif; + register int minlen; + register struct in_multi *inm; + register struct in_ifaddr *ia; + struct in_multistep step; + struct router_info *rti; + + int timer; /** timer value in the igmp query header **/ + + ++igmpstat.igps_rcv_total; + + ip = mtod(m, struct ip *); + igmplen = ip->ip_len; + + /* + * Validate lengths + */ + if (igmplen < IGMP_MINLEN) { + ++igmpstat.igps_rcv_tooshort; + m_freem(m); + return; + } + minlen = iphlen + IGMP_MINLEN; + if ((m->m_flags & M_EXT || m->m_len < minlen) && + (m = m_pullup(m, minlen)) == 0) { + ++igmpstat.igps_rcv_tooshort; + return; + } + + /* + * Validate checksum + */ + m->m_data += iphlen; + m->m_len -= iphlen; + igmp = mtod(m, struct igmp *); + if (in_cksum(m, igmplen)) { + ++igmpstat.igps_rcv_badsum; + m_freem(m); + return; + } + m->m_data -= iphlen; + m->m_len += iphlen; + + ip = mtod(m, struct ip *); + timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; + rti = find_rti(ifp); + + /* + * In the IGMPv2 specification, there are 3 states and a flag. + * + * In Non-Member state, we simply don't have a membership record. 
+ * In Delaying Member state, our timer is running (inm->inm_timer) + * In Idle Member state, our timer is not running (inm->inm_timer==0) + * + * The flag is inm->inm_state, it is set to IGMP_OTHERMEMBER if + * we have heard a report from another member, or IGMP_IREPORTEDLAST + * if I sent the last report. + */ + switch (igmp->igmp_type) { + + case IGMP_MEMBERSHIP_QUERY: + ++igmpstat.igps_rcv_queries; + + if (ifp->if_flags & IFF_LOOPBACK) + break; + + if (igmp->igmp_code == 0) { + /* + * Old router. Remember that the querier on this + * interface is old, and set the timer to the + * value in RFC 1112. + */ + + rti->rti_type = IGMP_V1_ROUTER; + rti->rti_time = 0; + + timer = IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ; + + if (ip->ip_dst.s_addr != igmp_all_hosts_group || + igmp->igmp_group.s_addr != 0) { + ++igmpstat.igps_rcv_badqueries; + m_freem(m); + return; + } + } else { + /* + * New router. Simply do the new validity check. + */ + + if (igmp->igmp_group.s_addr != 0 && + !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { + ++igmpstat.igps_rcv_badqueries; + m_freem(m); + return; + } + } + + /* + * - Start the timers in all of our membership records + * that the query applies to for the interface on + * which the query arrived excl. those that belong + * to the "all-hosts" group (224.0.0.1). + * - Restart any timer that is already running but has + * a value longer than the requested timeout. + * - Use the value specified in the query message as + * the maximum timeout. + */ + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + if (inm->inm_ifp == ifp && + inm->inm_addr.s_addr != igmp_all_hosts_group && + (igmp->igmp_group.s_addr == 0 || + igmp->igmp_group.s_addr == inm->inm_addr.s_addr)) { + if (inm->inm_timer == 0 || + inm->inm_timer > timer) { + inm->inm_timer = + IGMP_RANDOM_DELAY(timer); + igmp_timers_are_running = 1; + } + } + IN_NEXT_MULTI(step, inm); + } + + break; + + case IGMP_V1_MEMBERSHIP_REPORT: + case IGMP_V2_MEMBERSHIP_REPORT: + /* + * For fast leave to work, we have to know that we are the + * last person to send a report for this group. Reports + * can potentially get looped back if we are a multicast + * router, so discard reports sourced by me. + */ + IFP_TO_IA(ifp, ia); + if (ia && ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr) + break; + + ++igmpstat.igps_rcv_reports; + + if (ifp->if_flags & IFF_LOOPBACK) + break; + + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { + ++igmpstat.igps_rcv_badreports; + m_freem(m); + return; + } + + /* + * KLUDGE: if the IP source address of the report has an + * unspecified (i.e., zero) subnet number, as is allowed for + * a booting host, replace it with the correct subnet number + * so that a process-level multicast routing demon can + * determine which subnet it arrived from. This is necessary + * to compensate for the lack of any way for a process to + * determine the arrival interface of an incoming packet. + */ + if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0) + if (ia) ip->ip_src.s_addr = htonl(ia->ia_subnet); + + /* + * If we belong to the group being reported, stop + * our timer for that group. + */ + IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); + + if (inm != NULL) { + inm->inm_timer = 0; + ++igmpstat.igps_rcv_ourreports; + + inm->inm_state = IGMP_OTHERMEMBER; + } + + break; + } + + /* + * Pass all valid IGMP packets up to any process(es) listening + * on a raw IGMP socket. 
+ */ + rip_input(m, iphlen); +} + +void +igmp_joingroup(struct in_multi *inm) +{ + int s = splnet(); + + if (inm->inm_addr.s_addr == igmp_all_hosts_group + || inm->inm_ifp->if_flags & IFF_LOOPBACK) { + inm->inm_timer = 0; + inm->inm_state = IGMP_OTHERMEMBER; + } else { + inm->inm_rti = find_rti(inm->inm_ifp); + igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); + inm->inm_timer = IGMP_RANDOM_DELAY( + IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ); + inm->inm_state = IGMP_IREPORTEDLAST; + igmp_timers_are_running = 1; + } + splx(s); +} + +void +igmp_leavegroup(struct in_multi *inm) +{ + if (inm->inm_state == IGMP_IREPORTEDLAST && + inm->inm_addr.s_addr != igmp_all_hosts_group && + !(inm->inm_ifp->if_flags & IFF_LOOPBACK) && + inm->inm_rti->rti_type != IGMP_V1_ROUTER) + igmp_sendpkt(inm, IGMP_V2_LEAVE_GROUP, igmp_all_rtrs_group); +} + +void +igmp_fasttimo(void) +{ + register struct in_multi *inm; + struct in_multistep step; + int s; + + /* + * Quick check to see if any work needs to be done, in order + * to minimize the overhead of fasttimo processing. + */ + + if (!igmp_timers_are_running) + return; + + s = splnet(); + igmp_timers_are_running = 0; + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + if (inm->inm_timer == 0) { + /* do nothing */ + } else if (--inm->inm_timer == 0) { + igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); + inm->inm_state = IGMP_IREPORTEDLAST; + } else { + igmp_timers_are_running = 1; + } + IN_NEXT_MULTI(step, inm); + } + splx(s); +} + +void +igmp_slowtimo(void) +{ + int s = splnet(); + register struct router_info *rti = Head; + +#ifdef IGMP_DEBUG + printf("[igmp.c,_slowtimo] -- > entering \n"); +#endif + while (rti) { + if (rti->rti_type == IGMP_V1_ROUTER) { + rti->rti_time++; + if (rti->rti_time >= IGMP_AGE_THRESHOLD) { + rti->rti_type = IGMP_V2_ROUTER; + } + } + rti = rti->rti_next; + } +#ifdef IGMP_DEBUG + printf("[igmp.c,_slowtimo] -- > exiting \n"); +#endif + splx(s); +} + +static struct route igmprt; + +static void +igmp_sendpkt(struct in_multi *inm, int type, unsigned long addr) +{ + struct mbuf *m; + struct igmp *igmp; + struct ip *ip; + struct ip_moptions imo; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; + + m->m_pkthdr.rcvif = loif; + m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN; + MH_ALIGN(m, IGMP_MINLEN + sizeof(struct ip)); + m->m_data += sizeof(struct ip); + m->m_len = IGMP_MINLEN; + igmp = mtod(m, struct igmp *); + igmp->igmp_type = type; + igmp->igmp_code = 0; + igmp->igmp_group = inm->inm_addr; + igmp->igmp_cksum = 0; + igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN); + + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + ip = mtod(m, struct ip *); + ip->ip_tos = 0; + ip->ip_len = sizeof(struct ip) + IGMP_MINLEN; + ip->ip_off = 0; + ip->ip_p = IPPROTO_IGMP; + ip->ip_src.s_addr = INADDR_ANY; + ip->ip_dst.s_addr = addr ? addr : igmp->igmp_group.s_addr; + + imo.imo_multicast_ifp = inm->inm_ifp; + imo.imo_multicast_ttl = 1; + imo.imo_multicast_vif = -1; + /* + * Request loopback of the report if we are acting as a multicast + * router, so that the process-level routing demon can hear it. + */ + imo.imo_multicast_loop = (ip_mrouter != NULL); + + /* + * XXX + * Do we have to worry about reentrancy here? Don't think so. + */ + ip_output(m, router_alert, &igmprt, 0, &imo); + + ++igmpstat.igps_snd_reports; +} diff --git a/netinet/igmp.h b/netinet/igmp.h new file mode 100644 index 0000000..b397ccd --- /dev/null +++ b/netinet/igmp.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 1988 Stephen Deering. 
+ * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)igmp.h 8.1 (Berkeley) 6/10/93 + */ + +#ifndef _NETINET_IGMP_H_ +#define _NETINET_IGMP_H_ + +#include /* struct in_addr */ + +/* + * Internet Group Management Protocol (IGMP) definitions. + * + * Written by Steve Deering, Stanford, May 1988. + * + * MULTICAST Revision: 3.5.1.2 + */ + +/* + * IGMP packet format. + */ +struct igmp { + u_char igmp_type; /* version & type of IGMP message */ + u_char igmp_code; /* subtype for routing msgs */ + u_short igmp_cksum; /* IP-style checksum */ + struct in_addr igmp_group; /* group address being reported */ +}; /* (zero for queries) */ + +#define IGMP_MINLEN 8 + +/* + * Message types, including version number. + */ +#define IGMP_MEMBERSHIP_QUERY 0x11 /* membership query */ +#define IGMP_V1_MEMBERSHIP_REPORT 0x12 /* Ver. 1 membership report */ +#define IGMP_V2_MEMBERSHIP_REPORT 0x16 /* Ver. 2 membership report */ +#define IGMP_V2_LEAVE_GROUP 0x17 /* Leave-group message */ + +#define IGMP_DVMRP 0x13 /* DVMRP routing message */ +#define IGMP_PIM 0x14 /* PIM routing message */ + +#define IGMP_MTRACE_RESP 0x1e /* traceroute resp.(to sender)*/ +#define IGMP_MTRACE 0x1f /* mcast traceroute messages */ + +#define IGMP_MAX_HOST_REPORT_DELAY 10 /* max delay for response to */ + /* query (in seconds) according */ + /* to RFC1112 */ + + +#define IGMP_TIMER_SCALE 10 /* denotes that the igmp code field */ + /* specifies time in 10th of seconds*/ + +/* + * The following four defininitions are for backwards compatibility. + * They should be removed as soon as all applications are updated to + * use the new constant names. 
+ */ +#define IGMP_HOST_MEMBERSHIP_QUERY IGMP_MEMBERSHIP_QUERY +#define IGMP_HOST_MEMBERSHIP_REPORT IGMP_V1_MEMBERSHIP_REPORT +#define IGMP_HOST_NEW_MEMBERSHIP_REPORT IGMP_V2_MEMBERSHIP_REPORT +#define IGMP_HOST_LEAVE_MESSAGE IGMP_V2_LEAVE_GROUP + +#endif /* _NETINET_IGMP_H_ */ diff --git a/netinet/igmp_var.h b/netinet/igmp_var.h new file mode 100644 index 0000000..d1454e0 --- /dev/null +++ b/netinet/igmp_var.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)igmp_var.h 8.1 (Berkeley) 7/19/93 + * $FreeBSD: src/sys/netinet/igmp_var.h,v 1.20 2004/04/07 20:46:13 imp Exp $ + */ + +#ifndef _NETINET_IGMP_VAR_H_ +#define _NETINET_IGMP_VAR_H_ + +/* + * Internet Group Management Protocol (IGMP), + * implementation-specific definitions. + * + * Written by Steve Deering, Stanford, May 1988. + * + * MULTICAST Revision: 3.5.1.3 + */ + +struct igmpstat { + u_int igps_rcv_total; /* total IGMP messages received */ + u_int igps_rcv_tooshort; /* received with too few bytes */ + u_int igps_rcv_badsum; /* received with bad checksum */ + u_int igps_rcv_queries; /* received membership queries */ + u_int igps_rcv_badqueries; /* received invalid queries */ + u_int igps_rcv_reports; /* received membership reports */ + u_int igps_rcv_badreports; /* received invalid reports */ + u_int igps_rcv_ourreports; /* received reports for our groups */ + u_int igps_snd_reports; /* sent membership reports */ +}; + +#ifdef _KERNEL +#define IGMP_RANDOM_DELAY(X) (random() % (X) + 1) + +/* + * States for IGMPv2's leave processing + */ +#define IGMP_OTHERMEMBER 0 +#define IGMP_IREPORTEDLAST 1 + +/* + * We must remember what version the subnet's querier is. + * We conveniently use the IGMP message type for the proper + * membership report to keep this state. 
+ */ +#define IGMP_V1_ROUTER IGMP_V1_MEMBERSHIP_REPORT +#define IGMP_V2_ROUTER IGMP_V2_MEMBERSHIP_REPORT + +/* + * Revert to new router if we haven't heard from an old router in + * this amount of time. + */ +#define IGMP_AGE_THRESHOLD 540 + +void igmp_init(void); +void igmp_input(struct mbuf *, int); +void igmp_joingroup(struct in_multi *); +void igmp_leavegroup(struct in_multi *); +void igmp_fasttimo(void); +void igmp_slowtimo(void); + +SYSCTL_DECL(_net_inet_igmp); + +#endif + +/* + * Names for IGMP sysctl objects + */ +#define IGMPCTL_STATS 1 /* statistics (read-only) */ +#define IGMPCTL_MAXID 2 + +#define IGMPCTL_NAMES { \ + { 0, 0 }, \ + { "stats", CTLTYPE_STRUCT }, \ +} +#endif diff --git a/netinet/in.c b/netinet/in.c new file mode 100644 index 0000000..cb08232 --- /dev/null +++ b/netinet/in.c @@ -0,0 +1,711 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in.c 8.4 (Berkeley) 1/9/95 + * $FreeBSD: src/sys/netinet/in.c,v 1.75 2004/04/07 20:46:13 imp Exp $ + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +/* + * This structure is used to keep track of in_multi chains which belong to + * deleted interface addresses. + */ +static LIST_HEAD(, multi_kludge) in_mk; /* XXX BSS initialization */ + +struct multi_kludge { + LIST_ENTRY(multi_kludge) mk_entry; + struct ifnet *mk_ifp; + struct in_multihead mk_head; +}; + +static void in_socktrim(struct sockaddr_in *); +static int in_ifinit(struct ifnet *, + struct in_ifaddr *, struct sockaddr_in *, int); +static void in_ifscrub(struct ifnet *, struct in_ifaddr *); + +static int subnetsarelocal = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, + &subnetsarelocal, 0, ""); +/* + * Return 1 if an internet address is for a ``local'' host + * (one to which we have a connection). 
If subnetsarelocal + * is true, this includes other subnets of the local net. + * Otherwise, it includes only the directly-connected (sub)nets. + */ +int +in_localaddr(struct in_addr in) +{ + register u_long i = ntohl(in.s_addr); + register struct in_ifaddr *ia; + + if (subnetsarelocal) { + for (ia = in_ifaddr; ia; ia = ia->ia_next) + if ((i & ia->ia_netmask) == ia->ia_net) + return (1); + } else { + for (ia = in_ifaddr; ia; ia = ia->ia_next) + if ((i & ia->ia_subnetmask) == ia->ia_subnet) + return (1); + } + return (0); +} + +/* + * Determine whether an IP address is in a reserved set of addresses + * that may not be forwarded, or whether datagrams to that destination + * may be forwarded. + */ +int +in_canforward(struct in_addr in) +{ + register u_long i = ntohl(in.s_addr); + register u_long net; + + if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i)) + return (0); + if (IN_CLASSA(i)) { + net = i & IN_CLASSA_NET; + if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) + return (0); + } + return (1); +} + +/* + * Trim a mask in a sockaddr + */ +static void +in_socktrim(struct sockaddr_in *ap) +{ + register char *cplim = (char *) &ap->sin_addr; + register char *cp = (char *) (&ap->sin_addr + 1); + + ap->sin_len = 0; + while (--cp >= cplim) + if (*cp) { + (ap)->sin_len = cp - (char *) (ap) + 1; + break; + } +} + +static int in_interfaces; /* number of external internet interfaces */ + +/* + * Generic internet control operations (ioctl's). + * Ifp is 0 if not an interface-specific ioctl. + */ +int +in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp) +{ + register struct ifreq *ifr = (struct ifreq *)data; + register struct in_ifaddr *ia = 0, *iap; + register struct ifaddr *ifa; + struct in_ifaddr *oia; + struct in_aliasreq *ifra = (struct in_aliasreq *)data; + struct sockaddr_in oldaddr; + int error, hostIsNew, maskIsNew, s; + struct multi_kludge *mk; + + /* + * Find address for this interface, if it exists. + * + * If an alias address was specified, find that one instead of + * the first one on the interface. + */ + if (ifp) + for (iap = in_ifaddr; iap; iap = iap->ia_next) + if (iap->ia_ifp == ifp) { + if (((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr.s_addr == + iap->ia_addr.sin_addr.s_addr) { + ia = iap; + break; + } else if (ia == NULL) { + ia = iap; + if (ifr->ifr_addr.sa_family != AF_INET) + break; + } + } + + switch (cmd) { + + case SIOCAIFADDR: + case SIOCDIFADDR: + if (ifra->ifra_addr.sin_family == AF_INET) { + for (oia = ia; ia; ia = ia->ia_next) { + if (ia->ia_ifp == ifp && + ia->ia_addr.sin_addr.s_addr == + ifra->ifra_addr.sin_addr.s_addr) + break; + } + if ((ifp->if_flags & IFF_POINTOPOINT) + && (cmd == SIOCAIFADDR) + && (ifra->ifra_dstaddr.sin_addr.s_addr + == INADDR_ANY)) { + return EDESTADDRREQ; + } + } + if (cmd == SIOCDIFADDR && ia == 0) + return (EADDRNOTAVAIL); + /* FALLTHROUGH */ + case SIOCSIFADDR: + case SIOCSIFNETMASK: + case SIOCSIFDSTADDR: + if ((so->so_state & SS_PRIV) == 0) + return (EPERM); + + if (ifp == 0) + panic("in_control"); + if (ia == (struct in_ifaddr *)0) { + oia = (struct in_ifaddr *) + malloc(sizeof *oia, M_IFADDR, M_WAITOK); + if (oia == (struct in_ifaddr *)NULL) + return (ENOBUFS); + bzero((caddr_t)oia, sizeof *oia); + ia = in_ifaddr; + /* + * Protect from ipintr() traversing address list + * while we're modifying it. 
+ */ + s = splnet(); + + if (ia) { + for ( ; ia->ia_next; ia = ia->ia_next) + continue; + ia->ia_next = oia; + } else + in_ifaddr = oia; + ia = oia; + ifa = ifp->if_addrlist; + if (ifa) { + for ( ; ifa->ifa_next; ifa = ifa->ifa_next) + continue; + ifa->ifa_next = (struct ifaddr *) ia; + } else + ifp->if_addrlist = (struct ifaddr *) ia; + ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; + ia->ia_ifa.ifa_dstaddr + = (struct sockaddr *)&ia->ia_dstaddr; + ia->ia_ifa.ifa_netmask + = (struct sockaddr *)&ia->ia_sockmask; + ia->ia_sockmask.sin_len = 8; + if (ifp->if_flags & IFF_BROADCAST) { + ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); + ia->ia_broadaddr.sin_family = AF_INET; + } + ia->ia_ifp = ifp; + if (!(ifp->if_flags & IFF_LOOPBACK)) + in_interfaces++; + splx(s); + } + break; + + case SIOCSIFBRDADDR: + if ((so->so_state & SS_PRIV) == 0) + return (EPERM); + /* FALLTHROUGH */ + + case SIOCGIFADDR: + case SIOCGIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCGIFBRDADDR: + if (ia == (struct in_ifaddr *)0) + return (EADDRNOTAVAIL); + break; + } + switch (cmd) { + + case SIOCGIFADDR: + *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr; + break; + + case SIOCGIFBRDADDR: + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return (EINVAL); + *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr; + break; + + case SIOCGIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr; + break; + + case SIOCGIFNETMASK: + *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask; + break; + + case SIOCSIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + oldaddr = ia->ia_dstaddr; + ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr; + if (ifp->if_ioctl && (error = (*ifp->if_ioctl) + (ifp, SIOCSIFDSTADDR, (caddr_t)ia))) { + ia->ia_dstaddr = oldaddr; + return (error); + } + if (ia->ia_flags & IFA_ROUTE) { + ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr; + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + ia->ia_ifa.ifa_dstaddr = + (struct sockaddr *)&ia->ia_dstaddr; + rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP); + } + break; + + case SIOCSIFBRDADDR: + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return (EINVAL); + ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr; + break; + + case SIOCSIFADDR: + return (in_ifinit(ifp, ia, + (struct sockaddr_in *) &ifr->ifr_addr, 1)); + + case SIOCSIFNETMASK: + ia->ia_sockmask.sin_addr = ifra->ifra_addr.sin_addr; + ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); + break; + + case SIOCAIFADDR: + maskIsNew = 0; + hostIsNew = 1; + error = 0; + if (ia->ia_addr.sin_family == AF_INET) { + if (ifra->ifra_addr.sin_len == 0) { + ifra->ifra_addr = ia->ia_addr; + hostIsNew = 0; + } else if (ifra->ifra_addr.sin_addr.s_addr == + ia->ia_addr.sin_addr.s_addr) + hostIsNew = 0; + } + if (ifra->ifra_mask.sin_len) { + in_ifscrub(ifp, ia); + ia->ia_sockmask = ifra->ifra_mask; + ia->ia_subnetmask = + ntohl(ia->ia_sockmask.sin_addr.s_addr); + maskIsNew = 1; + } + if ((ifp->if_flags & IFF_POINTOPOINT) && + (ifra->ifra_dstaddr.sin_family == AF_INET)) { + in_ifscrub(ifp, ia); + ia->ia_dstaddr = ifra->ifra_dstaddr; + maskIsNew = 1; /* We lie; but the effect's the same */ + } + if (ifra->ifra_addr.sin_family == AF_INET && + (hostIsNew || maskIsNew)) + error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); + if ((ifp->if_flags & IFF_BROADCAST) && + (ifra->ifra_broadaddr.sin_family == AF_INET)) + ia->ia_broadaddr = ifra->ifra_broadaddr; + return (error); 
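	/*
	 * Editorial note, not part of the original sources: the SIOCAIFADDR
	 * case above is normally reached from application code through an
	 * ioctl() on a privileged AF_INET socket (the SS_PRIV check above).
	 * A minimal sketch, assuming the usual BSD struct in_aliasreq layout
	 * from <netinet/in_var.h>; the helper name add_alias() and the
	 * 192.168.10.2/24 address are illustrative only:
	 *
	 *	#include <sys/socket.h>
	 *	#include <sys/ioctl.h>
	 *	#include <string.h>
	 *	#include <unistd.h>
	 *	#include <netinet/in.h>
	 *	#include <netinet/in_var.h>
	 *
	 *	static int
	 *	add_alias(const char *ifname)
	 *	{
	 *		struct in_aliasreq ifra;
	 *		int r, s = socket(AF_INET, SOCK_DGRAM, 0);
	 *
	 *		if (s < 0)
	 *			return (-1);
	 *		memset(&ifra, 0, sizeof(ifra));
	 *		strncpy(ifra.ifra_name, ifname,
	 *		    sizeof(ifra.ifra_name) - 1);
	 *		ifra.ifra_addr.sin_len = sizeof(struct sockaddr_in);
	 *		ifra.ifra_addr.sin_family = AF_INET;
	 *		ifra.ifra_addr.sin_addr.s_addr = htonl(0xc0a80a02);
	 *		ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in);
	 *		ifra.ifra_mask.sin_family = AF_INET;
	 *		ifra.ifra_mask.sin_addr.s_addr = htonl(0xffffff00);
	 *		r = ioctl(s, SIOCAIFADDR, &ifra);
	 *		close(s);
	 *		return (r);
	 *	}
	 *
	 * When no broadcast address is supplied, in_ifinit() derives one from
	 * the subnet mask and installs the network route via rtinit(), as the
	 * SIOCAIFADDR handling above shows.
	 */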
+ + case SIOCDIFADDR: + mk = malloc(sizeof *mk, M_IPMADDR, M_WAITOK); + if (!mk) + return ENOBUFS; + + in_ifscrub(ifp, ia); + /* + * Protect from ipintr() traversing address list + * while we're modifying it. + */ + s = splnet(); + + if ((ifa = ifp->if_addrlist) == (struct ifaddr *)ia) + ifp->if_addrlist = ifa->ifa_next; + else { + while (ifa->ifa_next && + (ifa->ifa_next != (struct ifaddr *)ia)) + ifa = ifa->ifa_next; + if (ifa->ifa_next) + ifa->ifa_next = ((struct ifaddr *)ia)->ifa_next; + else + printf("Couldn't unlink inifaddr from ifp\n"); + } + oia = ia; + if (oia == (ia = in_ifaddr)) + in_ifaddr = ia->ia_next; + else { + while (ia->ia_next && (ia->ia_next != oia)) + ia = ia->ia_next; + if (ia->ia_next) + ia->ia_next = oia->ia_next; + else + printf("Didn't unlink inifadr from list\n"); + } + + if (!oia->ia_multiaddrs.lh_first) { + IFAFREE(&oia->ia_ifa); + FREE(mk, M_IPMADDR); + splx(s); + break; + } + + /* + * Multicast address kludge: + * If there were any multicast addresses attached to this + * interface address, either move them to another address + * on this interface, or save them until such time as this + * interface is reconfigured for IP. + */ + IFP_TO_IA(oia->ia_ifp, ia); + if (ia) { /* there is another address */ + struct in_multi *inm; + for(inm = oia->ia_multiaddrs.lh_first; inm; + inm = inm->inm_entry.le_next) { + IFAFREE(&inm->inm_ia->ia_ifa); + ia->ia_ifa.ifa_refcnt++; + inm->inm_ia = ia; + LIST_INSERT_HEAD(&ia->ia_multiaddrs, inm, + inm_entry); + } + FREE(mk, M_IPMADDR); + } else { /* last address on this if deleted, save */ + struct in_multi *inm; + + LIST_INIT(&mk->mk_head); + mk->mk_ifp = ifp; + + for(inm = oia->ia_multiaddrs.lh_first; inm; + inm = inm->inm_entry.le_next) { + LIST_INSERT_HEAD(&mk->mk_head, inm, inm_entry); + } + + if (mk->mk_head.lh_first) { + LIST_INSERT_HEAD(&in_mk, mk, mk_entry); + } else { + FREE(mk, M_IPMADDR); + } + } + + IFAFREE((&oia->ia_ifa)); + splx(s); + break; + + default: + if (ifp == 0 || ifp->if_ioctl == 0) + return (EOPNOTSUPP); + return ((*ifp->if_ioctl)(ifp, cmd, data)); + } + return (0); +} + +/* + * Delete any existing route for an interface. + */ +static void +in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia) +{ + + if ((ia->ia_flags & IFA_ROUTE) == 0) + return; + if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + else + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, 0); + ia->ia_flags &= ~IFA_ROUTE; +} + +/* + * Initialize an interface's internet address + * and routing table entry. + */ +static int +in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, + int scrub) +{ + register u_long i = ntohl(sin->sin_addr.s_addr); + struct sockaddr_in oldaddr; + int s = splimp(), flags = RTF_UP, error; + struct multi_kludge *mk; + + oldaddr = ia->ia_addr; + ia->ia_addr = *sin; + /* + * Give the interface a chance to initialize + * if this is its first address, + * and to validate the address if necessary. 
+ */ + if (ifp->if_ioctl && + (error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia))) { + splx(s); + ia->ia_addr = oldaddr; + return (error); + } + splx(s); + if (scrub) { + ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr; + in_ifscrub(ifp, ia); + ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; + } + if (IN_CLASSA(i)) + ia->ia_netmask = IN_CLASSA_NET; + else if (IN_CLASSB(i)) + ia->ia_netmask = IN_CLASSB_NET; + else + ia->ia_netmask = IN_CLASSC_NET; + /* + * The subnet mask usually includes at least the standard network part, + * but may may be smaller in the case of supernetting. + * If it is set, we believe it. + */ + if (ia->ia_subnetmask == 0) { + ia->ia_subnetmask = ia->ia_netmask; + ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); + } else + ia->ia_netmask &= ia->ia_subnetmask; + ia->ia_net = i & ia->ia_netmask; + ia->ia_subnet = i & ia->ia_subnetmask; + in_socktrim(&ia->ia_sockmask); + /* + * Add route for the network. + */ + ia->ia_ifa.ifa_metric = ifp->if_metric; + if (ifp->if_flags & IFF_BROADCAST) { + ia->ia_broadaddr.sin_addr.s_addr = + htonl(ia->ia_subnet | ~ia->ia_subnetmask); + ia->ia_netbroadcast.s_addr = + htonl(ia->ia_net | ~ ia->ia_netmask); + } else if (ifp->if_flags & IFF_LOOPBACK) { + ia->ia_ifa.ifa_dstaddr = ia->ia_ifa.ifa_addr; + flags |= RTF_HOST; + } else if (ifp->if_flags & IFF_POINTOPOINT) { + if (ia->ia_dstaddr.sin_family != AF_INET) + return (0); + flags |= RTF_HOST; + } + if ((error = rtinit(&(ia->ia_ifa), (int)RTM_ADD, flags)) == 0) + ia->ia_flags |= IFA_ROUTE; + + LIST_INIT(&ia->ia_multiaddrs); + /* + * If the interface supports multicast, join the "all hosts" + * multicast group on that interface. + */ + if (ifp->if_flags & IFF_MULTICAST) { + struct in_addr addr; + + /* + * Continuation of multicast address hack: + * If there was a multicast group list previously saved + * for this interface, then we re-attach it to the first + * address configured on the i/f. + */ + for(mk = in_mk.lh_first; mk; mk = mk->mk_entry.le_next) { + if(mk->mk_ifp == ifp) { + struct in_multi *inm; + + for(inm = mk->mk_head.lh_first; inm; + inm = inm->inm_entry.le_next) { + IFAFREE(&inm->inm_ia->ia_ifa); + ia->ia_ifa.ifa_refcnt++; + inm->inm_ia = ia; + LIST_INSERT_HEAD(&ia->ia_multiaddrs, + inm, inm_entry); + } + LIST_REMOVE(mk, mk_entry); + free(mk, M_IPMADDR); + break; + } + } + + addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); + in_addmulti(&addr, ifp); + } + return (error); +} + + +/* + * Return 1 if the address might be a local broadcast address. + */ +int +in_broadcast(struct in_addr in, struct ifnet *ifp) +{ + register struct ifaddr *ifa; + u_long t; + + if (in.s_addr == INADDR_BROADCAST || + in.s_addr == INADDR_ANY) + return 1; + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return 0; + t = ntohl(in.s_addr); + /* + * Look through the list of addresses for a match + * with a broadcast address. + */ +#define ia ((struct in_ifaddr *)ifa) + for (ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next) + if (ifa->ifa_addr->sa_family == AF_INET && + (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || + in.s_addr == ia->ia_netbroadcast.s_addr || + /* + * Check for old-style (host 0) broadcast. + */ + t == ia->ia_subnet || t == ia->ia_net) && + /* + * Check for an all one subnetmask. These + * only exist when an interface gets a secondary + * address. + */ + ia->ia_subnetmask != (u_long)0xffffffff) + return 1; + return (0); +#undef ia +} +/* + * Add an address to the list of IP multicast addresses for a given interface. 
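 *
 * Editorial note, not part of the original sources: in_addmulti() is
 * typically reached from application code through
 * setsockopt(IP_ADD_MEMBERSHIP), whose option handling looks up the
 * interface and then calls this function.  A minimal sketch on an
 * already-open AF_INET socket s, assuming the standard struct ip_mreq
 * from <netinet/in.h>; the 224.0.1.1 group is illustrative only:
 *
 *	struct ip_mreq mreq;
 *
 *	mreq.imr_multiaddr.s_addr = inet_addr("224.0.1.1");
 *	mreq.imr_interface.s_addr = htonl(INADDR_ANY);
 *	if (setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP,
 *	    &mreq, sizeof(mreq)) < 0)
 *		perror("IP_ADD_MEMBERSHIP");
 *
 * On success the membership either bumps the reference count of an
 * existing in_multi record or allocates a new one, asks the driver to
 * update its filter via SIOCADDMULTI, and notifies IGMP, as the code
 * below shows.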
+ */ +struct in_multi * +in_addmulti(struct in_addr *ap, struct ifnet *ifp) +{ + register struct in_multi *inm; + struct ifreq ifr; + struct in_ifaddr *ia; + int s = splnet(); + + /* + * See if address already in list. + */ + IN_LOOKUP_MULTI(*ap, ifp, inm); + if (inm != NULL) { + /* + * Found it; just increment the reference count. + */ + ++inm->inm_refcount; + } + else { + /* + * New address; allocate a new multicast record + * and link it into the interface's multicast list. + */ + inm = (struct in_multi *)malloc(sizeof(*inm), + M_IPMADDR, M_NOWAIT); + if (inm == NULL) { + splx(s); + return (NULL); + } + inm->inm_addr = *ap; + inm->inm_ifp = ifp; + inm->inm_refcount = 1; + IFP_TO_IA(ifp, ia); + if (ia == NULL) { + free(inm, M_IPMADDR); + splx(s); + return (NULL); + } + inm->inm_ia = ia; + ia->ia_ifa.ifa_refcnt++; /* gain a reference */ + LIST_INSERT_HEAD(&ia->ia_multiaddrs, inm, inm_entry); + + /* + * Ask the network driver to update its multicast reception + * filter appropriately for the new address. + */ + ((struct sockaddr_in *)&ifr.ifr_addr)->sin_family = AF_INET; + ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr = *ap; + if ((ifp->if_ioctl == NULL) || + (*ifp->if_ioctl)(ifp, SIOCADDMULTI,(caddr_t)&ifr) != 0) { + LIST_REMOVE(inm, inm_entry); + IFAFREE(&ia->ia_ifa); /* release reference */ + free(inm, M_IPMADDR); + splx(s); + return (NULL); + } + /* + * Let IGMP know that we have joined a new IP multicast group. + */ + igmp_joingroup(inm); + } + splx(s); + return (inm); +} + +/* + * Delete a multicast address record. + */ +void +in_delmulti(struct in_multi *inm) +{ + struct ifreq ifr; + int s = splnet(); + + if (--inm->inm_refcount == 0) { + /* + * No remaining claims to this record; let IGMP know that + * we are leaving the multicast group. + */ + igmp_leavegroup(inm); + /* + * Unlink from list. + */ + LIST_REMOVE(inm, inm_entry); + IFAFREE(&inm->inm_ia->ia_ifa); /* release reference */ + + /* + * Notify the network driver to update its multicast reception + * filter. + */ + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr = + inm->inm_addr; + (*inm->inm_ifp->if_ioctl)(inm->inm_ifp, SIOCDELMULTI, + (caddr_t)&ifr); + free(inm, M_IPMADDR); + } + splx(s); +} diff --git a/netinet/in_cksum.c b/netinet/in_cksum.c new file mode 100644 index 0000000..39dc9ac --- /dev/null +++ b/netinet/in_cksum.c @@ -0,0 +1,186 @@ +#include + +/* + * Copyright (c) 1988, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +/* + * Try to use a CPU specific version, then punt to the portable C one. + */ + + +#if (defined(__GNUC__) && (defined(__arm__) && !defined(__thumb__))) + +/* This currently does not support Thumb assembly */ +#include "in_cksum_arm.h" + +#elif (defined(__GNUC__) && defined(__i386__)) + +#include "in_cksum_i386.h" + +#elif (defined(__GNUC__) && (defined(__mc68000__) || defined(__m68k__))) + +#include "in_cksum_m68k.h" +#elif (defined(__GNUC__) && defined(__PPC__)) + +#include "in_cksum_powerpc.h" + +#elif (defined(__GNUC__) && defined(__nios2__)) + +#include "in_cksum_nios2.h" + +#elif (defined(__GNUC__) && defined(__sparc__)) + +#include "in_cksum_sparc.h" + +#else + +#include /* for puts */ + +/* + * Checksum routine for Internet Protocol family headers (Portable Version). + * + * This routine is very heavily used in the network + * code and should be modified for each CPU to be as fast as possible. + */ + +#define ADDCARRY(x) (x > 65535L ? x -= 65535L : x) +#define REDUCE \ + {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} + +int +in_cksum( + struct mbuf *m, + uint32_t len ) +{ + register u_short *w; + register int32_t sum = 0; + register int32_t mlen = 0; + int byte_swapped = 0; + + union { + char c[2]; + u_short s; + } s_util; + union { + u_short s[2]; + long l; + } l_util; + + for (;m && len; m = m->m_next) { + if (m->m_len == 0) + continue; + w = mtod(m, u_short *); + if (mlen == -1) { + /* + * The first byte of this mbuf is the continuation + * of a word spanning between this mbuf and the + * last mbuf. + * + * s_util.c[0] is already saved when scanning previous + * mbuf. + */ + s_util.c[1] = *(char *)w; + sum += s_util.s; + w = (u_short *)((char *)w + 1); + mlen = m->m_len - 1; + len--; + } else + mlen = m->m_len; + if (len < mlen) + mlen = len; + len -= mlen; + /* + * Force to even boundary. + */ + if ((1 & (intptr_t) w) && (mlen > 0)) { + REDUCE; + sum <<= 8; + s_util.c[0] = *(u_char *)w; + w = (u_short *)((char *)w + 1); + mlen--; + byte_swapped = 1; + } + /* + * Unroll the loop to make overhead from + * branches &c small. 
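 *
 * Editorial note, not part of the original sources: ADDCARRY and
 * REDUCE above implement the RFC 1071 end-around-carry fold that the
 * summation here relies on.  A small worked example: adding the
 * 16-bit words 0xffff and 0x0002 into the 32-bit accumulator gives
 * 0x10001; REDUCE adds the two 16-bit halves, 0x0001 + 0x0001 =
 * 0x0002, which is the correct ones-complement sum (the carry out of
 * bit 15 is folded back into bit 0).  Because a 32-bit accumulator
 * can absorb tens of thousands of 16-bit additions before it can
 * overflow, the loop below only needs to fold occasionally rather
 * than after every word.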
+ */ + while ((mlen -= 32) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; + sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; + sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; + w += 16; + } + mlen += 32; + while ((mlen -= 8) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + w += 4; + } + mlen += 8; + if (mlen == 0 && byte_swapped == 0) + continue; + REDUCE; + while ((mlen -= 2) >= 0) { + sum += *w++; + } + if (byte_swapped) { + REDUCE; + sum <<= 8; + byte_swapped = 0; + if (mlen == -1) { + s_util.c[1] = *(char *)w; + sum += s_util.s; + mlen = 0; + } else + mlen = -1; + } else if (mlen == -1) + s_util.c[0] = *(char *)w; + } + if (len) + puts("cksum: out of data"); + if (mlen == -1) { + /* The last mbuf has odd # of bytes. Follow the + standard (the odd byte may be shifted left by 8 bits + or not as determined by endian-ness of the machine) */ + s_util.c[1] = 0; + sum += s_util.s; + } + REDUCE; + return (~sum & 0xffff); +} +#endif diff --git a/netinet/in_cksum_arm.h b/netinet/in_cksum_arm.h new file mode 100644 index 0000000..d884a0f --- /dev/null +++ b/netinet/in_cksum_arm.h @@ -0,0 +1,236 @@ +/* $NetBSD: in_cksum_arm.c,v 1.3 2001/12/08 21:18:50 chris Exp $ */ + +/* + * ARM version: + * + * Copyright (c) 1997 Mark Brinicome + * Copyright (c) 1997 Causality Limited + * + * Based on the sparc version. + */ + +/* + * Sparc version: + * + * Copyright (c) 1995 Zubin Dittia. + * Copyright (c) 1995 Matthew R. Green. + * Copyright (c) 1994 Charles M. Hannum. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)in_cksum.c 8.1 (Berkeley) 6/11/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Checksum routine for Internet Protocol family headers. + * + * This routine is very heavily used in the network + * code and should be modified for each CPU to be as fast as possible. + * + * ARM version. + */ + +#define ADD64 __asm __volatile(" \n\ + ldmia %0!, {%2, %3, %4, %5} \n\ + adds %1,%7,%2; adcs %1,%1,%3 \n\ + adcs %1,%1,%4; adcs %1,%1,%5 \n\ + ldmia %0!, {%2, %3, %4, %5} \n\ + adcs %1,%1,%2; adcs %1,%1,%3 \n\ + adcs %1,%1,%4; adcs %1,%1,%5 \n\ + ldmia %0!, {%2, %3, %4, %5} \n\ + adcs %1,%1,%2; adcs %1,%1,%3 \n\ + adcs %1,%1,%4; adcs %1,%1,%5 \n\ + ldmia %0!, {%2, %3, %4, %5} \n\ + adcs %1,%1,%2; adcs %1,%1,%3 \n\ + adcs %1,%1,%4; adcs %1,%1,%5 \n\ + adcs %1,%1,#0\n" \ + : "=r" (w), "=r" (sum), "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4) \ + : "0" (w), "r" (sum) \ + : "cc") + +#define ADD32 __asm __volatile(" \n\ + ldmia %0!, {%2, %3, %4, %5} \n\ + adds %1,%7,%2; adcs %1,%1,%3 \n\ + adcs %1,%1,%4; adcs %1,%1,%5 \n\ + ldmia %0!, {%2, %3, %4, %5} \n\ + adcs %1,%1,%2; adcs %1,%1,%3 \n\ + adcs %1,%1,%4; adcs %1,%1,%5 \n\ + adcs %1,%1,#0\n" \ + : "=r" (w), "=r" (sum), "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4) \ + : "0" (w), "r" (sum) \ + : "cc") + +#define ADD16 __asm __volatile(" \n\ + ldmia %0!, {%2, %3, %4, %5} \n\ + adds %1,%7,%2; adcs %1,%1,%3 \n\ + adcs %1,%1,%4; adcs %1,%1,%5 \n\ + adcs %1,%1,#0\n" \ + : "=r" (w), "=r" (sum), "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4) \ + : "0" (w), "r" (sum) \ + : "cc") + +#define ADD8 __asm __volatile(" \n\ + ldmia %0!, {%2, %3} \n\ + adds %1,%5,%2; adcs %1,%1,%3 \n\ + adcs %1,%1,#0\n" \ + : "=r" (w), "=r" (sum), "=&r" (tmp1), "=&r" (tmp2) \ + : "0" (w), "r" (sum) \ + : "cc" ) + +#define ADD4 __asm __volatile(" \n\ + ldr %2,[%0],#4 \n\ + adds %1,%4,%2 \n\ + adcs %1,%1,#0\n" \ + : "=r" (w), "=r" (sum), "=&r" (tmp1) \ + : "0" (w), "r" (sum) \ + : "cc") + +/*#define REDUCE {sum = (sum & 0xffff) + (sum >> 16);}*/ +#define REDUCE __asm __volatile(" \n\ + mov %2, #0x00ff \n\ + orr %2, %2, #0xff00 \n\ + and %2, %0, %2 \n\ + add %0, %2, %0, lsr #16\n" \ + : "=r" (sum) \ + : "0" (sum), "r" (tmp1)) + +#define ADDCARRY {if (sum > 0xffff) sum -= 0xffff;} +#define ROL {sum = sum << 8;} /* depends on recent REDUCE */ +#define ADDBYTE {ROL; sum += (*w << 8); byte_swapped ^= 1;} +#define ADDSHORT {sum += *(u_short *)w;} +#define ADVANCE(n) {w += n; mlen -= n;} +#define ADVANCEML(n) {mlen -= n;} + +static __inline__ int +in_cksum_internal(struct mbuf *m, int off, int len, u_int sum) +{ + u_char *w; + int mlen = 0; + int byte_swapped = 0; + + /* + * Declare four temporary registers for use by the asm code. We + * allow the compiler to pick which specific machine registers to + * use, instead of hard-coding this in the asm code above. + */ + register u_int tmp1=0, tmp2, tmp3, tmp4; + + for (; m && len; m = m->m_next) { + if (m->m_len == 0) + continue; + w = mtod(m, u_char *) + off; + mlen = m->m_len - off; + off = 0; + if (len < mlen) + mlen = len; + len -= mlen; + + /* + * Ensure that we're aligned on a word boundary here so + * that we can do 32 bit operations below. + */ + if ((3 & (long)w) != 0) { + REDUCE; + if ((1 & (long)w) != 0 && mlen >= 1) { + ADDBYTE; + ADVANCE(1); + } + if ((2 & (long)w) != 0 && mlen >= 2) { + ADDSHORT; + ADVANCE(2); + } + } + + /* + * Do as many 32 bit operations as possible using the + * 64/32/16/8/4 macro's above, using as many as possible of + * these. 
+ */ + + while (mlen >= 64) { + ADD64; + ADVANCEML(64); + } + if (mlen >= 32) { + ADD32; + ADVANCEML(32); + } + if (mlen >= 16) { + ADD16; + ADVANCEML(16); + } + if (mlen >= 8) { + ADD8; + ADVANCEML(8); + } + if (mlen >= 4) { + ADD4; + ADVANCEML(4) + } + if (mlen == 0) + continue; + + REDUCE; + if (mlen >= 2) { + ADDSHORT; + ADVANCE(2); + } + if (mlen == 1) { + ADDBYTE; + } + } + if (byte_swapped) { + REDUCE; + ROL; + } + REDUCE; + ADDCARRY; + + return (0xffff ^ sum); +} + +int +in_cksum( + struct mbuf *m, + int len) +{ + int cksum; + cksum =in_cksum_internal(m, 0, len, 0); + return cksum; +} diff --git a/netinet/in_cksum_i386.h b/netinet/in_cksum_i386.h new file mode 100644 index 0000000..1174287 --- /dev/null +++ b/netinet/in_cksum_i386.h @@ -0,0 +1,202 @@ +/* + * Checksum routine for Internet Protocol family headers. + * + * This routine is very heavily used in the network + * code and should be modified for each CPU to be as fast as possible. + * + * This implementation is 386 version. + */ + +#include /* for puts */ + +#undef ADDCARRY +#define ADDCARRY(x) if ((x) > 0xffff) (x) -= 0xffff +#define REDUCE {sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);} + +/* + * Thanks to gcc we don't have to guess + * which registers contain sum & w. + */ +#define ADD(n) __asm__ volatile \ + ("addl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w)) +#define ADDC(n) __asm__ volatile \ + ("adcl " #n "(%2), %0" : "=r" (sum) : "0" (sum), "r" (w)) +#define LOAD(n) __asm__ volatile \ + ("movb " #n "(%1), %0" : "=q" (junk) : "r" (w)) +#define MOP __asm__ volatile \ + ("adcl $0, %0" : "=r" (sum) : "0" (sum)) + +int +in_cksum( + struct mbuf *m, + int len ) +{ + register u_short *w; + register unsigned sum = 0; + register int mlen = 0; + int byte_swapped = 0; + union { char c[2]; u_short s; } su; + + for (;m && len; m = m->m_next) { + if (m->m_len == 0) + continue; + w = mtod(m, u_short *); + if (mlen == -1) { + /* + * The first byte of this mbuf is the continuation + * of a word spanning between this mbuf and the + * last mbuf. + */ + + /* su.c[0] is already saved when scanning previous + * mbuf. sum was REDUCEd when we found mlen == -1 + */ + su.c[1] = *(u_char *)w; + sum += su.s; + w = (u_short *)((char *)w + 1); + mlen = m->m_len - 1; + len--; + } else + mlen = m->m_len; + if (len < mlen) + mlen = len; + len -= mlen; + /* + * Force to long boundary so we do longword aligned + * memory operations + */ + if (3 & (int) w) { + REDUCE; + if ((1 & (int) w) && (mlen > 0)) { + sum <<= 8; + su.c[0] = *(char *)w; + w = (u_short *)((char *)w + 1); + mlen--; + byte_swapped = 1; + } + if ((2 & (int) w) && (mlen >= 2)) { + sum += *w++; + mlen -= 2; + } + } + /* + * Advance to a 486 cache line boundary. + */ + if (4 & (int) w && mlen >= 4) { + ADD(0); + MOP; + w += 2; + mlen -= 4; + } + if (8 & (int) w && mlen >= 8) { + ADD(0); + ADDC(4); + MOP; + w += 4; + mlen -= 8; + } + /* + * Do as much of the checksum as possible 32 bits at at time. + * In fact, this loop is unrolled to make overhead from + * branches &c small. + */ + mlen -= 1; + while ((mlen -= 32) >= 0) { + u_char junk; + /* + * Add with carry 16 words and fold in the last + * carry by adding a 0 with carry. + * + * The early ADD(16) and the LOAD(32) are to load + * the next 2 cache lines in advance on 486's. The + * 486 has a penalty of 2 clock cycles for loading + * a cache line, plus whatever time the external + * memory takes to load the first word(s) addressed. + * These penalties are unavoidable. 
Subsequent + * accesses to a cache line being loaded (and to + * other external memory?) are delayed until the + * whole load finishes. These penalties are mostly + * avoided by not accessing external memory for + * 8 cycles after the ADD(16) and 12 cycles after + * the LOAD(32). The loop terminates when mlen + * is initially 33 (not 32) to guaranteed that + * the LOAD(32) is within bounds. + */ + ADD(16); + ADDC(0); + ADDC(4); + ADDC(8); + ADDC(12); + LOAD(32); + ADDC(20); + ADDC(24); + ADDC(28); + MOP; + w += 16; + } + mlen += 32 + 1; + if (mlen >= 32) { + ADD(16); + ADDC(0); + ADDC(4); + ADDC(8); + ADDC(12); + ADDC(20); + ADDC(24); + ADDC(28); + MOP; + w += 16; + mlen -= 32; + } + if (mlen >= 16) { + ADD(0); + ADDC(4); + ADDC(8); + ADDC(12); + MOP; + w += 8; + mlen -= 16; + } + if (mlen >= 8) { + ADD(0); + ADDC(4); + MOP; + w += 4; + mlen -= 8; + } + if (mlen == 0 && byte_swapped == 0) + continue; /* worth 1% maybe ?? */ + REDUCE; + while ((mlen -= 2) >= 0) { + sum += *w++; + } + if (byte_swapped) { + sum <<= 8; + byte_swapped = 0; + if (mlen == -1) { + su.c[1] = *(char *)w; + sum += su.s; + mlen = 0; + } else + mlen = -1; + } else if (mlen == -1) + /* + * This mbuf has odd number of bytes. + * There could be a word split betwen + * this mbuf and the next mbuf. + * Save the last byte (to prepend to next mbuf). + */ + su.c[0] = *(char *)w; + } + + if (len) + puts("cksum: out of data"); + if (mlen == -1) { + /* The last mbuf has odd # of bytes. Follow the + standard (the odd byte is shifted left by 8 bits) */ + su.c[1] = 0; + sum += su.s; + } + REDUCE; + return (~sum & 0xffff); +} diff --git a/netinet/in_cksum_m68k.h b/netinet/in_cksum_m68k.h new file mode 100644 index 0000000..32e6978 --- /dev/null +++ b/netinet/in_cksum_m68k.h @@ -0,0 +1,221 @@ +/* + * Copyright (c) 1988, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include + +#if defined(__mcoldfire__) +# define IS_COLDFIRE 1 +#else +# define IS_COLDFIRE 0 +#endif + +#define REDUCE { sum = (sum & 0xFFFF) + (sum >> 16); if (sum > 0xFFFF) sum -= 0xFFFF; } + +/* + * Motorola 68k version of Internet Protocol Checksum routine + * W. Eric Norum + * Saskatchewan Accelerator Laboratory + * August, 1998 + */ +int +in_cksum( struct mbuf *m, + int len ) +{ + unsigned short *w; + unsigned long sum = 0; + int mlen = 0; + int byte_swapped = 0; + union { + char c[2]; + u_short s; + } s_util; + + for ( ; m && len ; m = m->m_next) { + if (m->m_len == 0) + continue; + w = mtod(m, u_short *); + if (mlen == -1) { + /* + * The first byte of this mbuf is the continuation + * of a word spanning between this mbuf and the + * last mbuf. + * + * s_util.c[0] is already saved when scanning previous + * mbuf. + */ + s_util.c[1] = *(char *)w; + sum += s_util.s; + w = (u_short *)((char *)w + 1); + mlen = m->m_len - 1; + len--; + } else + mlen = m->m_len; + if (len < mlen) + mlen = len; + len -= mlen; + + /* + * Force to longword boundary. + */ + if (3 & (int)w) { + REDUCE; + if ((1 & (int) w) && (mlen > 0)) { + sum <<= 8; + s_util.c[0] = *(u_char *)w; + w = (u_short *)((char *)w + 1); + mlen--; + byte_swapped = 1; + } + if ((2 & (int) w) && (mlen >= 2)) { + sum += *w++; + mlen -= 2; + } + } + + /* + * Sum all the longwords in the buffer. + * See RFC 1071 -- Computing the Internet Checksum. + * It should work for all 68k family members. 
+ */ + { + unsigned long tcnt = mlen, t1; + __asm__ volatile ( + "movel %2,%3\n\t" + "lsrl #6,%2 | count/64 = # loop traversals\n\t" + "andl #0x3c,%3 | Then find fractions of a chunk\n\t" + "negl %3\n\t | Each long uses 4 instruction bytes\n\t" +#if IS_COLDFIRE + "addql #1,%2 | Clear X (extended carry flag)\n\t" + "subql #1,%2 | \n\t" +#else + "andi #0xf,%%cc | Clear X (extended carry flag)\n\t" +#endif + "jmp %%pc@(2f-.-2:b,%3) | Jump into loop\n" + "1: | Begin inner loop...\n\t" + "movel %1@+,%3 | 0: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | 1: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | 2: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | 3: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | 4: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | 5: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | 6: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | 7: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | 8: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | 9: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | A: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | B: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | C: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | D: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | E: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n\t" + "movel %1@+,%3 | F: Fetch 32-bit word\n\t" + "addxl %3,%0 | Add word + previous carry\n" + "2: | End of unrolled loop\n\t" +#if IS_COLDFIRE + "moveq #0,%3 | Add in last carry\n\t" + "addxl %3,%0 |\n\t" + "subql #1,%2 | Update loop count\n\t" + "bplb 1b | Loop (with X clear) if not done\n\t" + "movel #0xffff,%2 | Get word mask\n\t" + "movel %0,%3 | Fold 32 bit sum to 16 bits\n\t" + "swap %3 |\n\t" + "andl %2,%0 | Mask to 16-bit sum\n\t" + "andl %2,%3 | Mask to 16-bit sum\n\t" + "addl %3,%0 |\n\t" + "movel %0,%3 | Add in last carry\n\t" + "swap %3 |\n\t" + "addl %3,%0 |\n\t" + "andl %2,%0 | Mask to 16-bit sum\n\t" +#else + "dbf %2,1b | (NB- dbf doesn't affect X)\n\t" + "movel %0,%3 | Fold 32 bit sum to 16 bits\n\t" + "swap %3 | (NB- swap doesn't affect X)\n\t" + "addxw %3,%0 |\n\t" + "moveq #0,%3 | Add in last carry\n\t" + "addxw %3,%0 |\n\t" + "andl #0xffff,%0 | Mask to 16-bit sum\n" +#endif + : + "=d" (sum), "=a" (w), "=d" (tcnt) , "=d" (t1) : + "0" (sum), "1" (w), "2" (tcnt) : + "cc", "memory"); + } + mlen &= 3; + + /* + * Soak up the last 1, 2 or 3 bytes + */ + while ((mlen -= 2) >= 0) + sum += *w++; + if (byte_swapped) { + REDUCE; + sum <<= 8; + byte_swapped = 0; + if (mlen == -1) { + s_util.c[1] = *(char *)w; + sum += s_util.s; + mlen = 0; + } else + mlen = -1; + } else if (mlen == -1) + s_util.c[0] = *(char *)w; + } + if (len) + sum = 0xDEAD; + if (mlen == -1) { + /* The last mbuf has odd # of bytes. 
Follow the + standard (the odd byte may be shifted left by 8 bits + or not as determined by endian-ness of the machine) */ + s_util.c[1] = 0; + sum += s_util.s; + } + REDUCE; + return (~sum & 0xffff); +} diff --git a/netinet/in_cksum_nios2.h b/netinet/in_cksum_nios2.h new file mode 100644 index 0000000..1b34a28 --- /dev/null +++ b/netinet/in_cksum_nios2.h @@ -0,0 +1,248 @@ + +/* + * Altera Nios2 CRC checksum computation + * + * Author: Jeffrey O. Hill + * + * Copyright 2012. Los Alamos National Security, LLC. + * This material was produced under U.S. Government contract + * DE-AC52-06NA25396 for Los Alamos National Laboratory (LANL), + * which is operated by Los Alamos National Security, LLC for + * the U.S. Department of Energy. The U.S. Government has rights + * to use, reproduce, and distribute this software. NEITHER THE + * GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY + * WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR + * THE USE OF THIS SOFTWARE. + * + * COPYRIGHT (c) 1989-2012. + * On-Line Applications Research Corporation (OAR). + * + * Copyright (c) 1997 Mark Brinicome + * Copyright (c) 1997 Causality Limited + * + * Copyright (c) 1995 Zubin Dittia. + * Copyright (c) 1995 Matthew R. Green. + * Copyright (c) 1994 Charles M. Hannum. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Based on the arm / sparc version, but using instead + * mostly inline functions in place of naaasty macros. + * + * It would be a great idea to somehow detect at runtime + * that the Nios2 has a user defined instruction that + * computes the CRC and invoke it here (we could call a + * function in the BSP). 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Checksum routine for Internet Protocol family headers. + * + * This routine is very heavily used in the network + * code and should be modified for each CPU to be as fast as possible. + */ +static inline uint32_t _NIOS2_Add_ones_complement_64 +( uint32_t sum, const uint32_t * const pWd ) +{ + sum = _NIOS2_Add_ones_complement ( sum, pWd[0] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[1] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[2] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[3] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[4] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[5] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[6] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[7] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[8] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[9] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[10] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[11] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[12] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[13] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[14] ); + return _NIOS2_Add_ones_complement ( sum, pWd[15] ); +} + +static inline uint32_t _NIOS2_Add_ones_complement_32 +( uint32_t sum, const uint32_t * const pWd ) +{ + sum = _NIOS2_Add_ones_complement ( sum, pWd[0] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[1] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[2] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[3] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[4] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[5] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[6] ); + return _NIOS2_Add_ones_complement ( sum, pWd[7] ); +} + +static inline uint32_t _NIOS2_Add_ones_complement_16 +( uint32_t sum, const uint32_t * const pWd ) +{ + sum = _NIOS2_Add_ones_complement ( sum, pWd[0] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[1] ); + sum = _NIOS2_Add_ones_complement ( sum, pWd[2] ); + return _NIOS2_Add_ones_complement ( sum, pWd[3] ); +} + +static inline uint32_t _NIOS2_Add_ones_complement_8 +( uint32_t sum, const uint32_t * const pWd ) +{ + sum = _NIOS2_Add_ones_complement ( sum, pWd[0] ); + return _NIOS2_Add_ones_complement ( sum, pWd[1] ); +} + +static inline uint32_t _NIOS2_Add_ones_complement_4 +( uint32_t sum, const uint32_t * const pWd ) +{ + return _NIOS2_Add_ones_complement ( sum, pWd[0] ); +} + +static inline uint32_t _NIOS2_Reduce_checksum ( uint32_t a ) +{ + uint32_t tmp; + __asm__ __volatile__ ( + " srli %1, %0, 16 \n" /* tmp = a >> 16 */ + " andi %0, %0, 0xffff \n" /* a = a & 0xffff */ + " add %0, %0, %1 \n" /* a = a + tmp */ + : "+&r"(a), "=&r"(tmp) + ); + return a; +} + +#define combineTokens( A, B ) A ## B + +#define ADD_AND_ADVANCE( N ) \ +if ( mlen >= N ) { \ + sum = combineTokens ( _NIOS2_Add_ones_complement_, N ) \ + ( sum, ( uint32_t * ) w ); \ + mlen -= N; \ + w += N; \ +} + +static int +in_cksum_internal(struct mbuf *m, int off, int len, u_int sum) +{ + const uint8_t * w; + int mlen = 0; + int byte_swapped = 0; + + for (; m && len; m = m->m_next) + { + if (m->m_len == 0) + continue; + w = mtod(m, u_char *) + off; + mlen = m->m_len - off; + off = 0; + if (len < mlen) + mlen = len; + len -= mlen; + + /* + * Ensure that we're aligned on a word boundary here so + * that we can do 32 bit operations below. 
+ */ + if ((3 & (uint32_t)w) != 0) + { + sum = _NIOS2_Reduce_checksum ( sum ); + if ((1 & (uint32_t)w) != 0 && mlen >= 1) + { + sum <<= 8u; + sum += *w << 8u; + byte_swapped ^= 1; + w += 1; + mlen -= 1; + } + if ((2 & (uint32_t)w) != 0 && mlen >= 2) + { + sum += *(uint16_t *)w; + w += 2; + mlen -= 2; + } + } + + /* + * instead of using a loop, process in unrolled chunks + */ + while ( mlen >= 64 ) + { + sum = _NIOS2_Add_ones_complement_64 + ( sum, ( uint32_t * ) w ); + mlen -= 64; + w += 64; + } + ADD_AND_ADVANCE ( 32 ); + ADD_AND_ADVANCE ( 16 ); + ADD_AND_ADVANCE ( 8 ); + ADD_AND_ADVANCE ( 4 ); + + if ( mlen > 0 ) + { + sum = _NIOS2_Reduce_checksum ( sum ); + if ( mlen >= 2 ) + { + sum += *(uint16_t *)w; + w += 2; + mlen -= 2; + } + if ( mlen == 1 ) + { + sum <<= 8u; + sum += *w << 8u; + byte_swapped ^= 1; + } + } + } + if ( byte_swapped ) + { + sum = _NIOS2_Reduce_checksum ( sum ); + sum <<= 8u; + } + sum = _NIOS2_Add_ones_complement_word_halves ( sum ); + sum ^= 0xffff; + return sum; +} + +int +in_cksum ( + struct mbuf *m, + int len ) +{ + return in_cksum_internal ( m, 0, len, 0 ); +} diff --git a/netinet/in_cksum_powerpc.h b/netinet/in_cksum_powerpc.h new file mode 100644 index 0000000..ad5e9e1 --- /dev/null +++ b/netinet/in_cksum_powerpc.h @@ -0,0 +1,172 @@ +/* + * Checksum routine for Internet Protocol family headers. + * + * This routine is very heavily used in the network + * code and should be modified for each CPU to be as fast as possible. + * + * This implementation is the PowerPC version. + */ + +#include /* for puts */ + +#undef ADDCARRY +#define ADDCARRY(x) if ((x) > 0xffff) (x) -= 0xffff +#define REDUCE {sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);} + +/* + * Thanks to gcc we don't have to guess + * which registers contain sum & w. + */ + +#define LDTMP(n) tmp = *((u_int *)((u_char *)w + n)) + +#define ADD(n) \ + LDTMP(n); \ + __asm__ volatile("addc %0,%0,%2" : "=r" (sum) : "0" (sum), "r" (tmp)) + +#define ADDC(n) \ + LDTMP(n); \ + __asm__ volatile("adde %0,%0,%2" : "=r" (sum) : "0" (sum), "r" (tmp)) + +#define MOP \ + tmp = 0; \ + __asm__ volatile("adde %0,%0,%2" : "=r" (sum) : "0" (sum), "r" (tmp)) + +#define LOAD(n) junk = (u_char) *((volatile u_char *) w + n) + + +int +in_cksum( + struct mbuf *m, + int len +) +{ + u_char junk; + register u_short *w; + register unsigned sum = 0; + register unsigned tmp; + register int mlen = 0; + int byte_swapped = 0; + union { char c[2]; u_short s; } su; + + for (;m && len; m = m->m_next) { + if (m->m_len == 0) + continue; + w = mtod(m, u_short *); + if (mlen == -1) { + /* + * The first byte of this mbuf is the continuation + * of a word spanning between this mbuf and the + * last mbuf. + */ + + /* su.c[0] is already saved when scanning previous + * mbuf. sum was REDUCEd when we found mlen == -1 + */ + su.c[1] = *(u_char *)w; + sum += su.s; + w = (u_short *)((char *)w + 1); + mlen = m->m_len - 1; + len--; + } else + mlen = m->m_len; + if (len < mlen) + mlen = len; + len -= mlen; + /* + * Force to long boundary so we do longword aligned + * memory operations + */ + if (3 & (int) w) { + REDUCE; + if ((1 & (int) w) && (mlen > 0)) { + sum <<= 8; + su.c[0] = *(char *)w; + w = (u_short *)((char *)w + 1); + mlen--; + byte_swapped = 1; + } + if ((2 & (int) w) && (mlen >= 2)) { + sum += *w++; + mlen -= 2; + } + } + + /* + * Do as much of the checksum as possible 32 bits at at time. + * In fact, this loop is unrolled to keep overhead from + * branches small. 
+ */ + while (mlen >= 32) { + /* + * Add with carry 16 words and fold in the last + * carry by adding a 0 with carry. + * + * The early ADD(16) and the LOAD(32) are intended + * to help get the data into the cache. + */ + ADD(16); + ADDC(0); + ADDC(4); + ADDC(8); + ADDC(12); + LOAD(32); + ADDC(20); + ADDC(24); + ADDC(28); + MOP; + w += 16; + mlen -= 32; + } + if (mlen >= 16) { + ADD(0); + ADDC(4); + ADDC(8); + ADDC(12); + MOP; + w += 8; + mlen -= 16; + } + if (mlen >= 8) { + ADD(0); + ADDC(4); + MOP; + w += 4; + mlen -= 8; + } + if (mlen == 0 && byte_swapped == 0) + continue; /* worth 1% maybe ?? */ + REDUCE; + while ((mlen -= 2) >= 0) { + sum += *w++; + } + if (byte_swapped) { + sum <<= 8; + byte_swapped = 0; + if (mlen == -1) { + su.c[1] = *(char *)w; + sum += su.s; + mlen = 0; + } else + mlen = -1; + } else if (mlen == -1) + /* + * This mbuf has odd number of bytes. + * There could be a word split betwen + * this mbuf and the next mbuf. + * Save the last byte (to prepend to next mbuf). + */ + su.c[0] = *(char *)w; + } + + if (len) + puts("cksum: out of data"); + if (mlen == -1) { + /* The last mbuf has odd # of bytes. Follow the + standard (the odd byte is shifted left by 8 bits) */ + su.c[1] = 0; + sum += su.s; + } + REDUCE; + return (~sum & 0xffff); +} diff --git a/netinet/in_cksum_sparc.h b/netinet/in_cksum_sparc.h new file mode 100644 index 0000000..b88ce92 --- /dev/null +++ b/netinet/in_cksum_sparc.h @@ -0,0 +1,269 @@ +/* $NetBSD: in_cksum.c,v 1.12.10.4 2005/12/11 10:28:36 christos Exp $ */ + +/* + * Copyright (c) 1995 Matthew R. Green. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, and it's contributors. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/11/93 + */ + +/* + * Copyright (c) 1995 Zubin Dittia. + * Copyright (c) 1994, 1998 Charles M. Hannum. 
+ * + * All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, and it's contributors. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/11/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Checksum routine for Internet Protocol family headers. + * + * This routine is very heavily used in the network + * code and should be modified for each CPU to be as fast as possible. + * + * SPARC version. + */ + +/* + * The checksum computation code here is significantly faster than its + * vanilla C counterpart (by significantly, I mean 2-3 times faster if + * the data is in cache, and 1.5-2 times faster if the data is not in + * cache). + * We optimize on three fronts: + * 1. By using the add-with-carry (addxcc) instruction, we can use + * 32-bit operations instead of 16-bit operations. + * 2. By unrolling the main loop to reduce branch overheads. + * 3. By doing a sequence of load,load,add,add,load,load,add,add, + * we can avoid the extra stall cycle which is incurred if the + * instruction immediately following a load tries to use the + * target register of the load. + * Another possible optimization is to replace a pair of 32-bit loads + * with a single 64-bit load (ldd) instruction, but I found that although + * this improves performance somewhat on Sun4c machines, it actually + * reduces performance considerably on Sun4m machines (I don't know why). + * So I chose to leave it out. 
+ * + * Zubin Dittia (zubin@dworkin.wustl.edu) + */ + +#define Asm __asm __volatile +#define ADD64 Asm(" ld [%4+ 0],%1; ld [%4+ 4],%2; \ + addcc %0,%1,%0; addxcc %0,%2,%0; \ + ld [%4+ 8],%1; ld [%4+12],%2; \ + addxcc %0,%1,%0; addxcc %0,%2,%0; \ + ld [%4+16],%1; ld [%4+20],%2; \ + addxcc %0,%1,%0; addxcc %0,%2,%0; \ + ld [%4+24],%1; ld [%4+28],%2; \ + addxcc %0,%1,%0; addxcc %0,%2,%0; \ + ld [%4+32],%1; ld [%4+36],%2; \ + addxcc %0,%1,%0; addxcc %0,%2,%0; \ + ld [%4+40],%1; ld [%4+44],%2; \ + addxcc %0,%1,%0; addxcc %0,%2,%0; \ + ld [%4+48],%1; ld [%4+52],%2; \ + addxcc %0,%1,%0; addxcc %0,%2,%0; \ + ld [%4+56],%1; ld [%4+60],%2; \ + addxcc %0,%1,%0; addxcc %0,%2,%0; \ + addxcc %0,0,%0" \ + : "=r" (sum), "=&r" (tmp1), "=&r" (tmp2)\ + : "0" (sum), "r" (w)) +#define ADD32 Asm(" ld [%4+ 0],%1; ld [%4+ 4],%2; \ + addcc %0,%1,%0; addxcc %0,%2,%0; \ + ld [%4+ 8],%1; ld [%4+12],%2; \ + addxcc %0,%1,%0; addxcc %0,%2,%0; \ + ld [%4+16],%1; ld [%4+20],%2; \ + addxcc %0,%1,%0; addxcc %0,%2,%0; \ + ld [%4+24],%1; ld [%4+28],%2; \ + addxcc %0,%1,%0; addxcc %0,%2,%0; \ + addxcc %0,0,%0" \ + : "=r" (sum), "=&r" (tmp1), "=&r" (tmp2)\ + : "0" (sum), "r" (w)) +#define ADD16 Asm(" ld [%4+ 0],%1; ld [%4+ 4],%2; \ + addcc %0,%1,%0; addxcc %0,%2,%0; \ + ld [%4+ 8],%1; ld [%4+12],%2; \ + addxcc %0,%1,%0; addxcc %0,%2,%0; \ + addxcc %0,0,%0" \ + : "=r" (sum), "=&r" (tmp1), "=&r" (tmp2)\ + : "0" (sum), "r" (w)) +#define ADD8 Asm(" ld [%4+ 0],%1; ld [%4+ 4],%2; \ + addcc %0,%1,%0; addxcc %0,%2,%0; \ + addxcc %0,0,%0" \ + : "=r" (sum), "=&r" (tmp1), "=&r" (tmp2)\ + : "0" (sum), "r" (w)) +#define ADD4 Asm(" ld [%3+ 0],%1; \ + addcc %0,%1,%0; \ + addxcc %0,0,%0" \ + : "=r" (sum), "=&r" (tmp1) \ + : "0" (sum), "r" (w)) + +#define REDUCE {sum = (sum & 0xffff) + (sum >> 16);} +#define ADDCARRY {if (sum > 0xffff) sum -= 0xffff;} +#define ROL {sum = sum << 8;} /* depends on recent REDUCE */ +#define ADDBYTE {ROL; sum += *w; byte_swapped ^= 1;} +#define ADDSHORT {sum += *(u_short *)w;} +#define ADVANCE(n) {w += n; mlen -= n;} + +static __inline__ int +in_cksum_internal(struct mbuf *m, int off, int len, u_int sum) +{ + u_char *w; + int mlen = 0; + int byte_swapped = 0; + + /* + * Declare two temporary registers for use by the asm code. We + * allow the compiler to pick which specific machine registers to + * use, instead of hard-coding this in the asm code above. + */ + u_int tmp1, tmp2; + + for (; m && len; m = m->m_next) { + if (m->m_len == 0) + continue; + w = mtod(m, u_char *) + off; + mlen = m->m_len - off; + off = 0; + if (len < mlen) + mlen = len; + len -= mlen; + + /* + * Ensure that we're aligned on a word boundary here so + * that we can do 32 bit operations below. + */ + if ((3 & (long)w) != 0) { + REDUCE; + if ((1 & (long)w) != 0 && mlen >= 1) { + ADDBYTE; + ADVANCE(1); + } + if ((2 & (long)w) != 0 && mlen >= 2) { + ADDSHORT; + ADVANCE(2); + } + } + + /* + * Do as many 32 bit operations as possible using the + * 64/32/16/8/4 macro's above, using as many as possible of + * these. 
+ */ + while (mlen >= 64) { + ADD64; + ADVANCE(64); + } + if (mlen >= 32) { + ADD32; + ADVANCE(32); + } + if (mlen >= 16) { + ADD16; + ADVANCE(16); + } + if (mlen >= 8) { + ADD8; + ADVANCE(8); + } + if (mlen >= 4) { + ADD4; + ADVANCE(4) + } + if (mlen == 0) + continue; + + REDUCE; + if (mlen >= 2) { + ADDSHORT; + ADVANCE(2); + } + if (mlen == 1) { + ADDBYTE; + } + } + if (byte_swapped) { + REDUCE; + ROL; + } + REDUCE; + ADDCARRY; + + return (0xffff ^ sum); +} + +int +in_cksum(struct mbuf *m, int len) +{ + + return (in_cksum_internal(m, 0, len, 0)); +} diff --git a/netinet/in_pcb.c b/netinet/in_pcb.c new file mode 100644 index 0000000..d9d4417 --- /dev/null +++ b/netinet/in_pcb.c @@ -0,0 +1,733 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1991, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 + * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.158 2005/01/07 01:45:44 imp Exp $ + */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +struct in_addr zeroin_addr; + +static void in_pcbinshash(struct inpcb *); +static void in_rtchange(struct inpcb *, int); + +/* + * These configure the range of local port addresses assigned to + * "unspecified" outgoing connections/packets/whatever. 
+ */ +static int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ +static int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ +static int ipport_firstauto = IPPORT_RESERVED; /* 1024 */ +static int ipport_lastauto = IPPORT_USERRESERVED; /* 5000 */ +static int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 40000 */ +static int ipport_hilastauto = IPPORT_HILASTAUTO; /* 44999 */ + +#define RANGECHK(var, min, max) \ + if ((var) < (min)) { (var) = (min); } \ + else if ((var) > (max)) { (var) = (max); } + +static int +sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) +{ + int error = sysctl_handle_int(oidp, + oidp->oid_arg1, oidp->oid_arg2, req); + if (!error) { + RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); + RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1); + RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX); + RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX); + RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX); + RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX); + } + return error; +} + +#undef RANGECHK + +SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); + +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, + &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, + &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, + &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, + &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, + &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, + &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); + +int +in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) +{ + register struct inpcb *inp; + int s; + + MALLOC(inp, struct inpcb *, sizeof(*inp), M_PCB, M_NOWAIT); + if (inp == NULL) + return (ENOBUFS); + bzero((caddr_t)inp, sizeof(*inp)); + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + inp->inp_pcbinfo = pcbinfo; + inp->inp_socket = so; + s = splnet(); + LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list); + pcbinfo->ipi_count++; + in_pcbinshash(inp); + splx(s); + so->so_pcb = (caddr_t)inp; + return (0); +} + +int +in_pcbbind(struct inpcb *inp, struct mbuf *nam) +{ + register struct socket *so = inp->inp_socket; + unsigned short *lastport; + struct sockaddr_in *sin; + u_short lport = 0; + int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); + int error; + + if (in_ifaddr == 0) + return (EADDRNOTAVAIL); + if (inp->inp_lport || inp->inp_laddr.s_addr != INADDR_ANY) + return (EINVAL); + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 && + ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 || + (so->so_options & SO_ACCEPTCONN) == 0)) + wild = 1; + if (nam) { + sin = mtod(nam, struct sockaddr_in *); + if (nam->m_len != sizeof (*sin)) + return (EINVAL); +#ifdef notdef + /* + * We should check the family, but old programs + * incorrectly fail to initialize it. 
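The disabled check is a reminder that callers are expected to initialize the whole sockaddr themselves. A hedged helper sketch (the function name is illustrative) showing the initialization this code assumes, including the BSD-specific sin_len field that in_setsockaddr() below also fills in:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>

static void
init_sin(struct sockaddr_in *sin, in_addr_t addr, unsigned short port)
{
	memset(sin, 0, sizeof(*sin));		/* no stale bytes anywhere */
	sin->sin_len = sizeof(*sin);		/* BSD-ism; absent on some other stacks */
	sin->sin_family = AF_INET;		/* the check #ifdef'd out above wants this set */
	sin->sin_port = htons(port);
	sin->sin_addr.s_addr = htonl(addr);
}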
+ */ + if (sin->sin_family != AF_INET) + return (EAFNOSUPPORT); +#endif + lport = sin->sin_port; + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { + /* + * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; + * allow complete duplication of binding if + * SO_REUSEPORT is set, or if SO_REUSEADDR is set + * and a multicast address is bound on both + * new and duplicated sockets. + */ + if (so->so_options & SO_REUSEADDR) + reuseport = SO_REUSEADDR|SO_REUSEPORT; + } else if (sin->sin_addr.s_addr != INADDR_ANY) { + sin->sin_port = 0; /* yech... */ + if (ifa_ifwithaddr((struct sockaddr *)sin) == 0) + return (EADDRNOTAVAIL); + } + if (lport) { + struct inpcb *t; + + /* GROSS */ + if (ntohs(lport) < IPPORT_RESERVED && + (error = suser(p->p_ucred, &p->p_acflag))) + return (EACCES); + t = in_pcblookup(inp->inp_pcbinfo, zeroin_addr, 0, + sin->sin_addr, lport, wild); + if (t && (reuseport & t->inp_socket->so_options) == 0) + return (EADDRINUSE); + } + inp->inp_laddr = sin->sin_addr; + } + if (lport == 0) { + unsigned short first, last; + int count; + + inp->inp_flags |= INP_ANONPORT; + + if (inp->inp_flags & INP_HIGHPORT) { + first = ipport_hifirstauto; /* sysctl */ + last = ipport_hilastauto; + lastport = &inp->inp_pcbinfo->lasthi; + } else if (inp->inp_flags & INP_LOWPORT) { + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (EACCES); + first = ipport_lowfirstauto; /* 1023 */ + last = ipport_lowlastauto; /* 600 */ + lastport = &inp->inp_pcbinfo->lastlow; + } else { + first = ipport_firstauto; /* sysctl */ + last = ipport_lastauto; + lastport = &inp->inp_pcbinfo->lastport; + } + /* + * Simple check to ensure all ports are not used up causing + * a deadlock here. + * + * We split the two cases (up and down) so that the direction + * is not being tested on each round of the loop. + */ + if (first > last) { + /* + * counting down + */ + count = first - last; + + do { + if (count-- <= 0) /* completely used? */ + return (EADDRNOTAVAIL); + --*lastport; + if (*lastport > first || *lastport < last) + *lastport = first; + lport = htons(*lastport); + } while (in_pcblookup(inp->inp_pcbinfo, + zeroin_addr, 0, inp->inp_laddr, lport, wild)); + } else { + /* + * counting up + */ + count = last - first; + + do { + if (count-- <= 0) /* completely used? */ + return (EADDRNOTAVAIL); + ++*lastport; + if (*lastport < first || *lastport > last) + *lastport = first; + lport = htons(*lastport); + } while (in_pcblookup(inp->inp_pcbinfo, + zeroin_addr, 0, inp->inp_laddr, lport, wild)); + } + } + inp->inp_lport = lport; + in_pcbrehash(inp); + return (0); +} + +/* + * Transform old in_pcbconnect() into an inner subroutine for new + * in_pcbconnect(): Do some validity-checking on the remote + * address (in mbuf 'nam') and then determine local host address + * (i.e., which interface) to use to access that remote host. + * + * This preserves definition of in_pcbconnect(), while supporting a + * slightly different version for T/TCP. (This is more than + * a bit of a kludge, but cleaning up the internal interfaces would + * have forced minor changes in every protocol). + */ + +int +in_pcbladdr(struct inpcb *inp, struct mbuf *nam, struct sockaddr_in **plocal_sin) +{ + struct in_ifaddr *ia; + register struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *); + + if (nam->m_len != sizeof (*sin)) + return (EINVAL); + if (sin->sin_family != AF_INET) + return (EAFNOSUPPORT); + if (sin->sin_port == 0) + return (EADDRNOTAVAIL); + if (in_ifaddr) { + /* + * If the destination address is INADDR_ANY, + * use the primary local address. 
+ * If the supplied address is INADDR_BROADCAST, + * and the primary interface supports broadcast, + * choose the broadcast address for that interface. + */ +#define satosin(sa) ((struct sockaddr_in *)(sa)) +#define sintosa(sin) ((struct sockaddr *)(sin)) +#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) + if (sin->sin_addr.s_addr == INADDR_ANY) + sin->sin_addr = IA_SIN(in_ifaddr)->sin_addr; + else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST && + (in_ifaddr->ia_ifp->if_flags & IFF_BROADCAST)) + sin->sin_addr = satosin(&in_ifaddr->ia_broadaddr)->sin_addr; + } + if (inp->inp_laddr.s_addr == INADDR_ANY) { + register struct route *ro; + + ia = (struct in_ifaddr *)0; + /* + * If route is known or can be allocated now, + * our src addr is taken from the i/f, else punt. + */ + ro = &inp->inp_route; + if (ro->ro_rt && + (satosin(&ro->ro_dst)->sin_addr.s_addr != + sin->sin_addr.s_addr || + inp->inp_socket->so_options & SO_DONTROUTE)) { + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + } + if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/ + (ro->ro_rt == (struct rtentry *)0 || + ro->ro_rt->rt_ifp == (struct ifnet *)0)) { + /* No route yet, so try to acquire one */ + ro->ro_dst.sa_family = AF_INET; + ro->ro_dst.sa_len = sizeof(struct sockaddr_in); + ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + sin->sin_addr; + rtalloc(ro); + } + /* + * If we found a route, use the address + * corresponding to the outgoing interface + * unless it is the loopback (in case a route + * to our address on another net goes to loopback). + */ + if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) + ia = ifatoia(ro->ro_rt->rt_ifa); + if (ia == 0) { + u_short fport = sin->sin_port; + + sin->sin_port = 0; + ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin))); + if (ia == 0) + ia = ifatoia(ifa_ifwithnet(sintosa(sin))); + sin->sin_port = fport; + if (ia == 0) + ia = in_ifaddr; + if (ia == 0) + return (EADDRNOTAVAIL); + } + /* + * If the destination address is multicast and an outgoing + * interface has been set as a multicast option, use the + * address of that interface as our source address. + */ + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && + inp->inp_moptions != NULL) { + struct ip_moptions *imo; + struct ifnet *ifp; + + imo = inp->inp_moptions; + if (imo->imo_multicast_ifp != NULL) { + ifp = imo->imo_multicast_ifp; + for (ia = in_ifaddr; ia; ia = ia->ia_next) + if (ia->ia_ifp == ifp) + break; + if (ia == 0) + return (EADDRNOTAVAIL); + } + } + /* + * Don't do pcblookup call here; return interface in plocal_sin + * and exit to caller, that will do the lookup. + */ + *plocal_sin = &ia->ia_addr; + + } + return(0); +} + +/* + * Outer subroutine: + * Connect from a socket to a specified address. + * Both address and port must be specified in argument sin. + * If don't have a local address for this socket yet, + * then pick one. + */ +int +in_pcbconnect(struct inpcb *inp, struct mbuf *nam) +{ + struct sockaddr_in *ifaddr; + register struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *); + int error; + + /* + * Call inner routine, to assign local interface address. + */ + if ((error = in_pcbladdr(inp, nam, &ifaddr))) + return(error); + + if (in_pcblookuphash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, + inp->inp_laddr.s_addr ? 
inp->inp_laddr : ifaddr->sin_addr, + inp->inp_lport, 0) != NULL) + return (EADDRINUSE); + if (inp->inp_laddr.s_addr == INADDR_ANY) { + if (inp->inp_lport == 0) + (void)in_pcbbind(inp, (struct mbuf *)0); + inp->inp_laddr = ifaddr->sin_addr; + } + inp->inp_faddr = sin->sin_addr; + inp->inp_fport = sin->sin_port; + in_pcbrehash(inp); + return (0); +} + +void +in_pcbdisconnect(struct inpcb *inp) +{ + + inp->inp_faddr.s_addr = INADDR_ANY; + inp->inp_fport = 0; + in_pcbrehash(inp); + if (inp->inp_socket->so_state & SS_NOFDREF) + in_pcbdetach(inp); +} + +void +in_pcbdetach(struct inpcb *inp) +{ + struct socket *so = inp->inp_socket; + struct inpcbinfo *ipi = inp->inp_pcbinfo; + int s; + + inp->inp_gencnt = ++ipi->ipi_gencnt; + so->so_pcb = 0; + sofree(so); + if (inp->inp_options) + (void)m_free(inp->inp_options); + if (inp->inp_route.ro_rt) + rtfree(inp->inp_route.ro_rt); + ip_freemoptions(inp->inp_moptions); + s = splnet(); + LIST_REMOVE(inp, inp_hash); + LIST_REMOVE(inp, inp_list); + splx(s); + FREE(inp, M_PCB); +} + +void +in_setsockaddr(struct inpcb *inp, struct mbuf *nam) +{ + register struct sockaddr_in *sin; + + nam->m_len = sizeof (*sin); + sin = mtod(nam, struct sockaddr_in *); + bzero((caddr_t)sin, sizeof (*sin)); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_port = inp->inp_lport; + sin->sin_addr = inp->inp_laddr; +} + +void +in_setpeeraddr(struct inpcb *inp, struct mbuf *nam) +{ + register struct sockaddr_in *sin; + + nam->m_len = sizeof (*sin); + sin = mtod(nam, struct sockaddr_in *); + bzero((caddr_t)sin, sizeof (*sin)); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_port = inp->inp_fport; + sin->sin_addr = inp->inp_faddr; +} + +/* + * Pass some notification to all connections of a protocol + * associated with address dst. The local address and/or port numbers + * may be specified to limit the search. The "usual action" will be + * taken, depending on the ctlinput cmd. The caller must filter any + * cmds that are uninteresting (e.g., no error in the map). + * Call the protocol specific routine (if any) to report + * any errors for each matching socket. + * + * Must be called at splnet. + */ +void +in_pcbnotify(struct inpcbhead *head, struct sockaddr *dst, u_int fport_arg, + struct in_addr laddr, u_int lport_arg, int cmd, + void (*notify)(struct inpcb *, int)) +{ + register struct inpcb *inp, *oinp; + struct in_addr faddr; + u_short fport = fport_arg, lport = lport_arg; + int errnum, s; + + if ((unsigned)cmd > PRC_NCMDS || dst->sa_family != AF_INET) + return; + faddr = ((struct sockaddr_in *)dst)->sin_addr; + if (faddr.s_addr == INADDR_ANY) + return; + + /* + * Redirects go to all references to the destination, + * and use in_rtchange to invalidate the route cache. + * Dead host indications: notify all references to the destination. + * Otherwise, if we have knowledge of the local port and address, + * deliver only to that socket. 
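The notify argument taken here is a plain function of type void (*)(struct inpcb *, int); in_rtchange() below is one such callback, and TCP passes its own tcp_notify() from tcp_subr.c. Purely as a sketch of the shape (kernel context assumed; the name example_notify is hypothetical and not part of this file):

#include <sys/param.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>

/* Hypothetical in_pcbnotify() callback -- illustrative only. */
static void
example_notify(struct inpcb *inp, int errnum)
{
	struct socket *so = inp->inp_socket;

	if (so == NULL)
		return;
	so->so_error = errnum;		/* surface the error on the owning socket */
	sorwakeup(so);			/* wake blocked readers ... */
	sowwakeup(so);			/* ... and writers */
}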
+ */ + if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { + fport = 0; + lport = 0; + laddr.s_addr = 0; + if (cmd != PRC_HOSTDEAD) + notify = in_rtchange; + } + errnum = inetctlerrmap[cmd]; + s = splnet(); + for (inp = head->lh_first; inp != NULL;) { + if (inp->inp_faddr.s_addr != faddr.s_addr || + inp->inp_socket == 0 || + (lport && inp->inp_lport != lport) || + (laddr.s_addr && inp->inp_laddr.s_addr != laddr.s_addr) || + (fport && inp->inp_fport != fport)) { + inp = inp->inp_list.le_next; + continue; + } + oinp = inp; + inp = inp->inp_list.le_next; + if (notify) + (*notify)(oinp, errnum); + } + splx(s); +} + +/* + * Check for alternatives when higher level complains + * about service problems. For now, invalidate cached + * routing information. If the route was created dynamically + * (by a redirect), time to try a default gateway again. + */ +void +in_losing(struct inpcb *inp) +{ + register struct rtentry *rt; + struct rt_addrinfo info; + + if ((rt = inp->inp_route.ro_rt)) { + inp->inp_route.ro_rt = 0; + bzero((caddr_t)&info, sizeof(info)); + info.rti_info[RTAX_DST] = + (struct sockaddr *)&inp->inp_route.ro_dst; + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0); + if (rt->rt_flags & RTF_DYNAMIC) + (void) rtrequest(RTM_DELETE, rt_key(rt), + rt->rt_gateway, rt_mask(rt), rt->rt_flags, + (struct rtentry **)0); + else + /* + * A new route can be allocated + * the next time output is attempted. + */ + rtfree(rt); + } +} + +/* + * After a routing change, flush old routing + * and allocate a (hopefully) better one. + */ +static void +in_rtchange(struct inpcb *inp, int errnum) +{ + if (inp->inp_route.ro_rt) { + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = 0; + /* + * A new route can be allocated the next time + * output is attempted. + */ + } +} + +struct inpcb * +in_pcblookup(struct inpcbinfo *pcbinfo, + struct in_addr faddr, u_int fport_arg, + struct in_addr laddr, u_int lport_arg, + int wild_okay) +{ + register struct inpcb *inp, *match = NULL; + int matchwild = 3, wildcard; + u_short fport = fport_arg, lport = lport_arg; + int s; + + s = splnet(); + + for (inp = pcbinfo->listhead->lh_first; inp != NULL; inp = inp->inp_list.le_next) { + if (inp->inp_lport != lport) + continue; + wildcard = 0; + if (inp->inp_faddr.s_addr != INADDR_ANY) { + if (faddr.s_addr == INADDR_ANY) + wildcard++; + else if (inp->inp_faddr.s_addr != faddr.s_addr || + inp->inp_fport != fport) + continue; + } else { + if (faddr.s_addr != INADDR_ANY) + wildcard++; + } + if (inp->inp_laddr.s_addr != INADDR_ANY) { + if (laddr.s_addr == INADDR_ANY) + wildcard++; + else if (inp->inp_laddr.s_addr != laddr.s_addr) + continue; + } else { + if (laddr.s_addr != INADDR_ANY) + wildcard++; + } + if (wildcard && wild_okay == 0) + continue; + if (wildcard < matchwild) { + match = inp; + matchwild = wildcard; + if (matchwild == 0) { + break; + } + } + } + splx(s); + return (match); +} + +/* + * Lookup PCB in hash list. + */ +struct inpcb * +in_pcblookuphash(struct inpcbinfo *pcbinfo, + struct in_addr faddr, u_int fport_arg, + struct in_addr laddr, u_int lport_arg, + int wildcard) +{ + struct inpcbhead *head; + register struct inpcb *inp; + u_short fport = fport_arg, lport = lport_arg; + int s; + + s = splnet(); + /* + * First look for an exact match. 
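in_pcblookuphash() below picks a chain with INP_PCBHASH(), defined in in_pcb.h as (((faddr) ^ ((faddr) >> 16) ^ (lport) ^ (fport)) & (mask)), where the mask is the power-of-two-minus-one value produced by hashinit(). A standalone illustration of how a connection 4-tuple folds into a bucket index (the values are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Same folding as INP_PCBHASH() in in_pcb.h. */
#define PCBHASH(faddr, lport, fport, mask) \
	(((faddr) ^ ((faddr) >> 16) ^ (lport) ^ (fport)) & (mask))

int
main(void)
{
	uint32_t faddr = 0xc0000201;	/* some foreign address, as a 32-bit value */
	uint16_t lport = 0x1234;	/* local port bits as stored in the PCB */
	uint16_t fport = 0x0050;	/* foreign port bits */
	uint32_t mask = 127;		/* a 128-bucket table, hashinit()-style */

	printf("bucket %u of %u\n",
	    (unsigned)PCBHASH(faddr, lport, fport, mask), (unsigned)mask + 1u);
	return 0;
}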
+ */ + head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)]; + for (inp = head->lh_first; inp != NULL; inp = inp->inp_hash.le_next) { + if (inp->inp_faddr.s_addr == faddr.s_addr && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_fport == fport && + inp->inp_lport == lport) + goto found; + } + if (wildcard) { + struct inpcb *local_wild = NULL; + + head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)]; + for (inp = head->lh_first; inp != NULL; inp = inp->inp_hash.le_next) { + if (inp->inp_faddr.s_addr == INADDR_ANY && + inp->inp_fport == 0 && inp->inp_lport == lport) { + if (inp->inp_laddr.s_addr == laddr.s_addr) + goto found; + else if (inp->inp_laddr.s_addr == INADDR_ANY) + local_wild = inp; + } + } + if (local_wild != NULL) { + inp = local_wild; + goto found; + } + } + splx(s); + return (NULL); + +found: + /* + * Move PCB to head of this hash chain so that it can be + * found more quickly in the future. + * XXX - this is a pessimization on machines with few + * concurrent connections. + */ + if (inp != head->lh_first) { + LIST_REMOVE(inp, inp_hash); + LIST_INSERT_HEAD(head, inp, inp_hash); + } + splx(s); + return (inp); +} + +/* + * Insert PCB into hash chain. Must be called at splnet. + */ +static void +in_pcbinshash(struct inpcb *inp) +{ + struct inpcbhead *head; + + head = &inp->inp_pcbinfo->hashbase[INP_PCBHASH(inp->inp_faddr.s_addr, + inp->inp_lport, inp->inp_fport, inp->inp_pcbinfo->hashmask)]; + + LIST_INSERT_HEAD(head, inp, inp_hash); +} + +void +in_pcbrehash(struct inpcb *inp) +{ + struct inpcbhead *head; + int s; + + s = splnet(); + LIST_REMOVE(inp, inp_hash); + + head = &inp->inp_pcbinfo->hashbase[INP_PCBHASH(inp->inp_faddr.s_addr, + inp->inp_lport, inp->inp_fport, inp->inp_pcbinfo->hashmask)]; + + LIST_INSERT_HEAD(head, inp, inp_hash); + inp->inp_pcbinfo->ipi_count--; + splx(s); +} diff --git a/netinet/in_pcb.h b/netinet/in_pcb.h new file mode 100644 index 0000000..ad8b99c --- /dev/null +++ b/netinet/in_pcb.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/netinet/in_pcb.h,v 1.89 2006/07/18 22:34:27 ups Exp $ + */ + + +#ifndef _NETINET_IN_PCB_H_ +#define _NETINET_IN_PCB_H_ + +#include +#include /* struct in_addr */ +#include /* struct route */ + +/* + * Common structure pcb for internet protocol implementation. + * Here are stored pointers to local and foreign host table + * entries, local and foreign socket numbers, and pointers + * up (to a socket structure) and down (to a protocol-specific) + * control block. + */ +LIST_HEAD(inpcbhead, inpcb); + +typedef u_int64_t inp_gen_t; + +struct inpcb { + LIST_ENTRY(inpcb) inp_hash; /* hash list */ + LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */ + struct inpcbinfo *inp_pcbinfo; /* PCB list info */ + struct in_addr inp_faddr; /* foreign host table entry */ + struct in_addr inp_laddr; /* local host table entry */ + u_short inp_fport; /* foreign port */ + u_short inp_lport; /* local port */ + caddr_t inp_ppcb; /* pointer to per-protocol pcb */ + struct socket *inp_socket; /* back pointer to socket */ + struct route inp_route; /* placeholder for routing entry */ + int inp_flags; /* generic IP/datagram flags */ + u_char inp_vflag; /* IP version flag (v4/v6) */ + u_char inp_ip_ttl; /* time to live proto */ + u_char inp_ip_p; /* protocol proto */ + u_char inp_ip_minttl; /* minimum TTL or drop */ + + /* protocol dependent part; options */ + struct { + u_char inp4_ip_tos; /* type of service proto */ + struct mbuf *inp4_options; /* IP options */ + struct ip_moptions *inp4_moptions; /* IP multicast options */ + } inp_depend4; +#define inp_ip_tos inp_depend4.inp4_ip_tos +#define inp_options inp_depend4.inp4_options +#define inp_moptions inp_depend4.inp4_moptions + inp_gen_t inp_gencnt; /* generation count of this instance */ +}; + +/* + * Interface exported to userland by various protocols which use + * inpcbs. Hack alert -- only define if struct xsocket is in scope. 
+ */ +#ifdef _SYS_SOCKETVAR_H_ +struct xinpcb { + size_t xi_len; /* length of this structure */ + struct inpcb xi_inp; +/* struct xsocket xi_socket; ccj removed */ + u_int64_t xi_alignment_hack; +}; + +struct xinpgen { + size_t xig_len; /* length of this structure */ + u_int xig_count; /* number of PCBs at this time */ + inp_gen_t xig_gen; /* generation count at this time */ + so_gen_t xig_sogen; /* socket generation count at this time */ +}; +#endif /* _SYS_SOCKETVAR_H_ */ + +struct inpcbinfo { /* XXX documentation, prefixes */ + struct inpcbhead *listhead; + struct inpcbhead *hashbase; + unsigned long hashmask; + unsigned short lastport; + unsigned short lastlow; + unsigned short lasthi; + u_int ipi_count; /* number of pcbs in this list */ + u_int64_t ipi_gencnt; /* current generation count */ +}; + +#define INP_PCBHASH(faddr, lport, fport, mask) \ + (((faddr) ^ ((faddr) >> 16) ^ (lport) ^ (fport)) & (mask)) + +/* flags in inp_flags: */ +#define INP_RECVOPTS 0x01 /* receive incoming IP options */ +#define INP_RECVRETOPTS 0x02 /* receive IP options for reply */ +#define INP_RECVDSTADDR 0x04 /* receive IP dst address */ +#define INP_HDRINCL 0x08 /* user supplies entire IP header */ +#define INP_HIGHPORT 0x10 /* user wants "high" port binding */ +#define INP_LOWPORT 0x20 /* user wants "low" port binding */ +#define INP_ANONPORT 0x40 /* port chosen for user */ +#define INP_RECVIF 0x80 /* receive incoming interface */ +#define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ + INP_RECVIF) + +#define INPLOOKUP_WILDCARD 1 +#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) + +#ifdef _KERNEL +void in_losing(struct inpcb *); +int in_pcballoc(struct socket *, struct inpcbinfo *); +int in_pcbbind(struct inpcb *, struct mbuf *); +int in_pcbconnect(struct inpcb *, struct mbuf *); +void in_pcbdetach(struct inpcb *); +void in_pcbdisconnect(struct inpcb *); +int in_pcbladdr(struct inpcb *, struct mbuf *, + struct sockaddr_in **); +struct inpcb * + in_pcblookup(struct inpcbinfo *, + struct in_addr, u_int, struct in_addr, u_int, int); +struct inpcb * + in_pcblookuphash(struct inpcbinfo *, + struct in_addr, u_int, struct in_addr, u_int, int); +void in_pcbnotify(struct inpcbhead *, struct sockaddr *, + u_int, struct in_addr, u_int, int, void (*)(struct inpcb *, int)); +void in_pcbrehash(struct inpcb *); +void in_setpeeraddr(struct inpcb *, struct mbuf *); +void in_setsockaddr(struct inpcb *, struct mbuf *); +#endif /* _KERNEL */ + +#endif /* !_NETINET_IN_PCB_H_ */ diff --git a/netinet/in_proto.c b/netinet/in_proto.c new file mode 100644 index 0000000..558c3e2 --- /dev/null +++ b/netinet/in_proto.c @@ -0,0 +1,217 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_proto.c 8.2 (Berkeley) 2/9/95 + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opt_tcpdebug.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif +#include +#include +/* + * TCP/IP protocol family: IP, ICMP, UDP, TCP. + */ + +#ifdef IPXIP +#include +#include +#endif + +#ifdef NSIP +#include +#include +#endif + +#ifdef TPIP +void tpip_input(), tpip_ctlinput(), tp_init(), tp_slowtimo(), tp_drain(); +int tp_ctloutput(), tp_usrreq(); +#endif + +#ifdef EON +void eoninput(), eonctlinput(), eonprotoinit(); +#endif /* EON */ + +extern struct domain inetdomain; + +struct protosw inetsw[] = { +{ 0, &inetdomain, 0, 0, + 0, 0, 0, 0, + 0, + ip_init, 0, ip_slowtimo, ip_drain, + NULL +}, +{ SOCK_DGRAM, &inetdomain, IPPROTO_UDP, PR_ATOMIC|PR_ADDR, + udp_input, 0, udp_ctlinput, ip_ctloutput, + udp_usrreq, + udp_init, 0, 0, 0, + NULL +}, +{ SOCK_STREAM, &inetdomain, IPPROTO_TCP, + PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, + tcp_input, 0, tcp_ctlinput, tcp_ctloutput, + 0, + tcp_init, tcp_fasttimo, tcp_slowtimo, tcp_drain, + &tcp_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_RAW, PR_ATOMIC|PR_ADDR, + rip_input, 0, 0, rip_ctloutput, + rip_usrreq, + 0, 0, 0, 0, + NULL +}, +{ SOCK_RAW, &inetdomain, IPPROTO_ICMP, PR_ATOMIC|PR_ADDR, + icmp_input, 0, 0, rip_ctloutput, + rip_usrreq, + 0, 0, 0, 0, + NULL +}, +{ SOCK_RAW, &inetdomain, IPPROTO_IGMP, PR_ATOMIC|PR_ADDR, + igmp_input, 0, 0, rip_ctloutput, + rip_usrreq, + igmp_init, igmp_fasttimo, igmp_slowtimo, 0, + NULL +}, +{ SOCK_RAW, &inetdomain, IPPROTO_RSVP, PR_ATOMIC|PR_ADDR, + rsvp_input, 0, 0, rip_ctloutput, + rip_usrreq, + 0, 0, 0, 0, + NULL +}, +{ SOCK_RAW, &inetdomain, IPPROTO_IPIP, PR_ATOMIC|PR_ADDR, + ipip_input, 0, 0, rip_ctloutput, + rip_usrreq, + 0, 0, 0, 0, + NULL +}, +#ifdef IPDIVERT +{ SOCK_RAW, &inetdomain, IPPROTO_DIVERT, PR_ATOMIC|PR_ADDR, + div_input, 0, 0, ip_ctloutput, + div_usrreq, + div_init, 0, 0, 0, + NULL +}, +#endif +#ifdef TPIP +{ SOCK_SEQPACKET,&inetdomain, IPPROTO_TP, PR_CONNREQUIRED|PR_WANTRCVD, + tpip_input, 0, tpip_ctlinput, tp_ctloutput, + tp_usrreq, + tp_init, 0, tp_slowtimo, tp_drain, + NULL +}, +#endif +/* EON (ISO CLNL over IP) */ +#ifdef EON +{ SOCK_RAW, &inetdomain, IPPROTO_EON, 0, + eoninput, 0, eonctlinput, 0, + 0, + eonprotoinit, 0, 0, 0, + NULL +}, +#endif +#ifdef IPXIP +{ SOCK_RAW, &inetdomain, IPPROTO_IDP, PR_ATOMIC|PR_ADDR, + ipxip_input, 0, ipxip_ctlinput, 0, + rip_usrreq, + 0, 0, 0, 0, + NULL +}, +#endif +#ifdef NSIP +{ SOCK_RAW, &inetdomain, IPPROTO_IDP, PR_ATOMIC|PR_ADDR, + 
idpip_input, 0, nsip_ctlinput, 0, + rip_usrreq, + 0, 0, 0, 0, + NULL +}, +#endif + /* raw wildcard */ +{ SOCK_RAW, &inetdomain, 0, PR_ATOMIC|PR_ADDR, + rip_input, 0, 0, rip_ctloutput, + rip_usrreq, + rip_init, 0, 0, 0, + NULL +}, +}; + +extern int in_inithead(void **, int); + +struct domain inetdomain = + { AF_INET, "internet", 0, 0, 0, + inetsw, &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], 0, + in_inithead, 32, sizeof(struct sockaddr_in) + }; + +DOMAIN_SET(inet); + +SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW, 0, + "Internet Family"); + +SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW, 0, "IP"); +SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW, 0, "ICMP"); +SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW, 0, "UDP"); +SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW, 0, "TCP"); +SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW, 0, "IGMP"); +#ifdef IPDIVERT +SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, div, CTLFLAG_RW, 0, "DIVERT"); +#endif diff --git a/netinet/in_rmx.c b/netinet/in_rmx.c new file mode 100644 index 0000000..5dd276a --- /dev/null +++ b/netinet/in_rmx.c @@ -0,0 +1,386 @@ +#include + +/* + * Copyright 1994, 1995 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This code does two things necessary for the enhanced TCP metrics to + * function in a useful manner: + * 1) It marks all non-host routes as `cloning', thus ensuring that + * every actual reference to such a route actually gets turned + * into a reference to a host route to the specific destination + * requested. + * 2) When such routes lose all their references, it arranges for them + * to be deleted in some random collection of circumstances, so that + * a large quantity of stale routing data is not kept in kernel memory + * indefinitely. See in_rtqtimo() below for the exact mechanism. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +extern int in_inithead(void **head, int off); + +#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ + +/* + * Do what we need to do when inserting a route. + */ +static struct radix_node * +in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, + struct radix_node *treenodes) +{ + struct rtentry *rt = (struct rtentry *)treenodes; + struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); + struct radix_node *ret; + + /* + * For IP, all unicast non-host routes are automatically cloning. + */ + if(IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + rt->rt_flags |= RTF_MULTICAST; + + if(!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) { + rt->rt_flags |= RTF_PRCLONING; + } + + /* + * A little bit of help for both IP output and input: + * For host routes, we make sure that RTF_BROADCAST + * is set for anything that looks like a broadcast address. + * This way, we can avoid an expensive call to in_broadcast() + * in ip_output() most of the time (because the route passed + * to ip_output() is almost always a host route). + * + * We also do the same for local addresses, with the thought + * that this might one day be used to speed up ip_input(). + * + * We also mark routes to multicast addresses as such, because + * it's easy to do and might be useful (but this is much more + * dubious since it's so easy to inspect the address). (This + * is done above.) + */ + if (rt->rt_flags & RTF_HOST) { + if (in_broadcast(sin->sin_addr, rt->rt_ifp)) { + rt->rt_flags |= RTF_BROADCAST; + } else { + if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr + == sin->sin_addr.s_addr) + rt->rt_flags |= RTF_LOCAL; + } + } + + /* + * We also specify a send and receive pipe size for every + * route added, to help TCP a bit. TCP doesn't actually + * want a true pipe size, which would be prohibitive in memory + * costs and is hard to compute anyway; it simply uses these + * values to size its buffers. So, we fill them in with the + * same values that TCP would have used anyway, and allow the + * installing program or the link layer to override these values + * as it sees fit. This will hopefully allow TCP more + * opportunities to save its ssthresh value. + */ + if (!rt->rt_rmx.rmx_sendpipe && !(rt->rt_rmx.rmx_locks & RTV_SPIPE)) + rt->rt_rmx.rmx_sendpipe = tcp_sendspace; + + if (!rt->rt_rmx.rmx_recvpipe && !(rt->rt_rmx.rmx_locks & RTV_RPIPE)) + rt->rt_rmx.rmx_recvpipe = tcp_recvspace; + + if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) + && rt->rt_ifp) + rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; + + ret = rn_addroute(v_arg, n_arg, head, treenodes); + if (ret == NULL && rt->rt_flags & RTF_HOST) { + struct rtentry *rt2; + /* + * We are trying to add a host route, but can't. + * Find out if it is because of an + * ARP entry and delete it if so. 
+ */ + rt2 = rtalloc1((struct sockaddr *)sin, 0, + RTF_CLONING | RTF_PRCLONING); + if (rt2) { + if (rt2->rt_flags & RTF_LLINFO && + rt2->rt_flags & RTF_HOST && + rt2->rt_gateway && + rt2->rt_gateway->sa_family == AF_LINK) { + rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt2), + rt2->rt_gateway, + rt_mask(rt2), rt2->rt_flags, 0); + ret = rn_addroute(v_arg, n_arg, head, + treenodes); + } + RTFREE(rt2); + } + } + return ret; +} + +/* + * This code is the inverse of in_clsroute: on first reference, if we + * were managing the route, stop doing so and set the expiration timer + * back off again. + */ +static struct radix_node * +in_matroute(void *v_arg, struct radix_node_head *head) +{ + struct radix_node *rn = rn_match(v_arg, head); + struct rtentry *rt = (struct rtentry *)rn; + + if(rt && rt->rt_refcnt == 0) { /* this is first reference */ + if(rt->rt_flags & RTPRF_OURS) { + rt->rt_flags &= ~RTPRF_OURS; + rt->rt_rmx.rmx_expire = 0; + } + } + return rn; +} + +static int rtq_reallyold = 60*60; + /* one hour is ``really old'' */ +SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, + CTLFLAG_RW, &rtq_reallyold , 0, ""); + +static int rtq_minreallyold = 10; + /* never automatically crank down to less */ +SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, + CTLFLAG_RW, &rtq_minreallyold , 0, ""); + +static int rtq_toomany = 128; + /* 128 cached routes is ``too many'' */ +SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, + CTLFLAG_RW, &rtq_toomany , 0, ""); + + +/* + * On last reference drop, mark the route as belong to us so that it can be + * timed out. + */ +static void +in_clsroute(struct radix_node *rn, struct radix_node_head *head) +{ + struct rtentry *rt = (struct rtentry *)rn; + + if(!(rt->rt_flags & RTF_UP)) + return; /* prophylactic measures */ + + if((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST) + return; + + if((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) + != RTF_WASCLONED) + return; + + /* + * As requested by David Greenman: + * If rtq_reallyold is 0, just delete the route without + * waiting for a timeout cycle to kill it. + */ + if(rtq_reallyold != 0) { + rt->rt_flags |= RTPRF_OURS; + rt->rt_rmx.rmx_expire = rtems_bsdnet_seconds_since_boot() + rtq_reallyold; + } else { + rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt), + rt->rt_gateway, rt_mask(rt), + rt->rt_flags, 0); + } +} + +struct rtqk_arg { + struct radix_node_head *rnh; + int draining; + int killed; + int found; + int updating; + time_t nextstop; +}; + +/* + * Get rid of old routes. When draining, this deletes everything, even when + * the timeout is not expired yet. When updating, this makes sure that + * nothing has a timeout longer than the current value of rtq_reallyold. 
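in_rtqkill() below implements this policy while walking the radix tree. Isolated from the routing code, the bookkeeping looks roughly like the following sketch (an array stands in for the tree and a plain time_t `now` for rtems_bsdnet_seconds_since_boot(); all names are illustrative):

#include <stddef.h>
#include <time.h>

struct aged_entry {
	time_t	expire;			/* absolute expiry time; 0 = not managed */
	int	in_use;
};

/*
 * One expiry pass: delete managed entries whose time is up (or all of
 * them when draining), clamp the rest to the current maximum lifetime
 * when updating, and return the earliest remaining expiry so the caller
 * knows when to schedule the next pass.
 */
static time_t
expire_pass(struct aged_entry *tab, size_t n, time_t now,
    time_t max_life, int draining, int updating)
{
	time_t nextstop = now + max_life;
	size_t i;

	for (i = 0; i < n; i++) {
		if (!tab[i].in_use || tab[i].expire == 0)
			continue;
		if (draining || tab[i].expire <= now) {
			tab[i].in_use = 0;	/* the real code issues RTM_DELETE here */
		} else {
			if (updating && tab[i].expire - now > max_life)
				tab[i].expire = now + max_life;
			if (tab[i].expire < nextstop)
				nextstop = tab[i].expire;
		}
	}
	return nextstop;
}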
+ */ +static int +in_rtqkill(struct radix_node *rn, void *rock) +{ + struct rtqk_arg *ap = rock; + struct rtentry *rt = (struct rtentry *)rn; + int err; + + if(rt->rt_flags & RTPRF_OURS) { + ap->found++; + + if(ap->draining || rt->rt_rmx.rmx_expire <= rtems_bsdnet_seconds_since_boot()) { + if(rt->rt_refcnt > 0) + panic("rtqkill route really not free"); + + err = rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt), + rt->rt_gateway, rt_mask(rt), + rt->rt_flags, 0); + if(err) { + log(LOG_WARNING, "in_rtqkill: error %d\n", err); + } else { + ap->killed++; + } + } else { + if(ap->updating + && (rt->rt_rmx.rmx_expire - rtems_bsdnet_seconds_since_boot() + > rtq_reallyold)) { + rt->rt_rmx.rmx_expire = rtems_bsdnet_seconds_since_boot() + + rtq_reallyold; + } + ap->nextstop = lmin(ap->nextstop, + rt->rt_rmx.rmx_expire); + } + } + + return 0; +} + +#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ +static int rtq_timeout = RTQ_TIMEOUT; + +static void +in_rtqtimo(void *rock) +{ + struct radix_node_head *rnh = rock; + struct rtqk_arg arg; + struct timeval atv; + static time_t last_adjusted_timeout = 0; + int s; + + arg.found = arg.killed = 0; + arg.rnh = rnh; + arg.nextstop = rtems_bsdnet_seconds_since_boot() + rtq_timeout; + arg.draining = arg.updating = 0; + s = splnet(); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + splx(s); + + /* + * Attempt to be somewhat dynamic about this: + * If there are ``too many'' routes sitting around taking up space, + * then crank down the timeout, and see if we can't make some more + * go away. However, we make sure that we will never adjust more + * than once in rtq_timeout seconds, to keep from cranking down too + * hard. + */ + if((arg.found - arg.killed > rtq_toomany) + && (rtems_bsdnet_seconds_since_boot() - last_adjusted_timeout >= rtq_timeout) + && rtq_reallyold > rtq_minreallyold) { + rtq_reallyold = 2*rtq_reallyold / 3; + if(rtq_reallyold < rtq_minreallyold) { + rtq_reallyold = rtq_minreallyold; + } + + last_adjusted_timeout = rtems_bsdnet_seconds_since_boot(); +#ifdef DIAGNOSTIC + log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", + rtq_reallyold); +#endif + arg.found = arg.killed = 0; + arg.updating = 1; + s = splnet(); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + splx(s); + } + + atv.tv_usec = 0; + atv.tv_sec = arg.nextstop; + timeout(in_rtqtimo, rock, hzto(&atv)); +} + +void +in_rtqdrain(void) +{ + struct radix_node_head *rnh = rt_tables[AF_INET]; + struct rtqk_arg arg; + int s; + arg.found = arg.killed = 0; + arg.rnh = rnh; + arg.nextstop = 0; + arg.draining = 1; + arg.updating = 0; + s = splnet(); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + splx(s); +} + +/* + * Initialize our routing tree. + */ +int +in_inithead(void **head, int off) +{ + struct radix_node_head *rnh; + + if(!rn_inithead(head, off)) + return 0; + + if(head != (void **)&rt_tables[AF_INET]) /* BOGUS! */ + return 1; /* only do this for the real routing table */ + + rnh = *head; + rnh->rnh_addaddr = in_addroute; + rnh->rnh_matchaddr = in_matroute; + rnh->rnh_close = in_clsroute; + in_rtqtimo(rnh); /* kick off timeout first time */ + return 1; +} diff --git a/netinet/in_systm.h b/netinet/in_systm.h new file mode 100644 index 0000000..f0f3fa9 --- /dev/null +++ b/netinet/in_systm.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_systm.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/netinet/in_systm.h,v 1.13 2009/02/13 15:14:43 luigi Exp $ + */ + +#ifndef _NETINET_IN_SYSTM_H_ +#define _NETINET_IN_SYSTM_H_ + +/* + * Miscellaneous internetwork + * definitions for kernel. + */ + +/* + * Network types. + * + * Internally the system keeps counters in the headers with the bytes + * swapped so that VAX instructions will work on them. It reverses + * the bytes before transmission at each protocol level. The n_ types + * represent the types with the bytes in ``high-ender'' order. + */ +typedef u_int16_t n_short; /* short as received from the net */ +typedef u_int32_t n_long; /* long as received from the net */ + +typedef u_int32_t n_time; /* ms since 00:00 GMT, byte rev */ + +#ifdef _KERNEL +uint32_t iptime(void); +#endif + +#endif diff --git a/netinet/in_var.h b/netinet/in_var.h new file mode 100644 index 0000000..9ade9c4 --- /dev/null +++ b/netinet/in_var.h @@ -0,0 +1,240 @@ +/* + * Copyright (c) 1985, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_var.h 8.2 (Berkeley) 1/9/95 + * $FreeBSD: src/sys/netinet/in_var.h,v 1.52 2004/10/19 21:06:14 andre Exp $ + */ + +#ifndef _NETINET_IN_VAR_H_ +#define _NETINET_IN_VAR_H_ + +#include +#include /* struct ifaddr */ +#include /* struct in_addr */ + +#if !defined(__rtems__) +#include +#endif + +/* + * Interface address, Internet version. One of these structures + * is allocated for each Internet address on an interface. + * The ifaddr structure contains the protocol-independent part + * of the structure and is assumed to be first. + */ +struct in_ifaddr { + struct ifaddr ia_ifa; /* protocol-independent info */ +#define ia_ifp ia_ifa.ifa_ifp +#define ia_flags ia_ifa.ifa_flags + /* ia_{,sub}net{,mask} in host order */ + u_long ia_net; /* network number of interface */ + u_long ia_netmask; /* mask of net part */ + u_long ia_subnet; /* subnet number, including net */ + u_long ia_subnetmask; /* mask of subnet part */ + struct in_addr ia_netbroadcast; /* to recognize net broadcasts */ + struct in_ifaddr *ia_next; /* next in list of internet addresses */ + struct sockaddr_in ia_addr; /* reserve space for interface name */ + struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ +#define ia_broadaddr ia_dstaddr + struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ + LIST_HEAD(in_multihead, in_multi) ia_multiaddrs; + /* list of multicast addresses */ +}; + +struct in_aliasreq { + char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ + struct sockaddr_in ifra_addr; + struct sockaddr_in ifra_broadaddr; +#define ifra_dstaddr ifra_broadaddr + struct sockaddr_in ifra_mask; +}; +/* + * Given a pointer to an in_ifaddr (ifaddr), + * return a pointer to the addr as a sockaddr_in. + */ +#define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) +#define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) + +#define IN_LNAOF(in, ifa) \ + ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) + + +#ifdef _KERNEL +extern struct in_ifaddr *in_ifaddr; +extern struct ifqueue ipintrq; /* ip packet input queue */ +extern struct in_addr zeroin_addr; +extern u_char inetctlerrmap[]; + +/* + * Macro for finding the interface (ifnet structure) corresponding to one + * of our IP addresses. + */ +#define INADDR_TO_IFP(addr, ifp) \ + /* struct in_addr addr; */ \ + /* struct ifnet *ifp; */ \ +{ \ + register struct in_ifaddr *ia; \ +\ + for (ia = in_ifaddr; \ + ia != NULL && ((ia->ia_ifp->if_flags & IFF_POINTOPOINT)? \ + IA_DSTSIN(ia):IA_SIN(ia))->sin_addr.s_addr != (addr).s_addr; \ + ia = ia->ia_next) \ + continue; \ + if (ia == NULL) \ + for (ia = in_ifaddr; \ + ia != NULL; \ + ia = ia->ia_next) \ + if (ia->ia_ifp->if_flags & IFF_POINTOPOINT && \ + IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ + break; \ + (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ +} + +/* + * Macro for finding the internet address structure (in_ifaddr) corresponding + * to a given interface (ifnet structure). 
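Both lookups are linear walks of the global in_ifaddr list, so they stay cheap only while few addresses are configured. A hedged usage sketch (kernel context assumed; the wrapper function is illustrative and not part of this file):

#include <sys/param.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>

/* Return the in_ifaddr owning a local address, or NULL if it is not ours. */
static struct in_ifaddr *
find_local_ia(struct in_addr dst)
{
	struct ifnet *ifp;
	struct in_ifaddr *ia = NULL;

	INADDR_TO_IFP(dst, ifp);	/* ifp is NULL unless dst is one of our addresses */
	if (ifp != NULL)
		IFP_TO_IA(ifp, ia);	/* first in_ifaddr configured on that interface */
	return ia;
}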
+ */ +#define IFP_TO_IA(ifp, ia) \ + /* struct ifnet *ifp; */ \ + /* struct in_ifaddr *ia; */ \ +{ \ + for ((ia) = in_ifaddr; \ + (ia) != NULL && (ia)->ia_ifp != (ifp); \ + (ia) = (ia)->ia_next) \ + continue; \ +} +#endif + +/* + * This information should be part of the ifnet structure but we don't wish + * to change that - as it might break a number of things + */ + +struct router_info { + struct ifnet *rti_ifp; + int rti_type; /* type of router which is querier on this interface */ + int rti_time; /* # of slow timeouts since last old query */ + struct router_info *rti_next; +}; + +/* + * Internet multicast address structure. There is one of these for each IP + * multicast group to which this host belongs on a given network interface. + * They are kept in a linked list, rooted in the interface's in_ifaddr + * structure. + */ +struct in_multi { + LIST_ENTRY(in_multi) inm_entry; /* list glue */ + struct in_addr inm_addr; /* IP multicast address */ + struct ifnet *inm_ifp; /* back pointer to ifnet */ + struct in_ifaddr *inm_ia; /* back pointer to in_ifaddr */ + u_int inm_refcount; /* no. membership claims by sockets */ + u_int inm_timer; /* IGMP membership report timer */ + u_int inm_state; /* state of the membership */ + struct router_info *inm_rti; /* router info*/ +}; + +#ifdef _KERNEL + +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_raw); +#endif + +/* + * Structure used by macros below to remember position when stepping through + * all of the in_multi records. + */ +struct in_multistep { + struct in_ifaddr *i_ia; + struct in_multi *i_inm; +}; + +/* + * Macro for looking up the in_multi record for a given IP multicast address + * on a given interface. If no matching record is found, "inm" returns NULL. + */ +#define IN_LOOKUP_MULTI(addr, ifp, inm) \ + /* struct in_addr addr; */ \ + /* struct ifnet *ifp; */ \ + /* struct in_multi *inm; */ \ +{ \ + register struct in_ifaddr *ia; \ +\ + IFP_TO_IA((ifp), ia); \ + if (ia == NULL) \ + (inm) = NULL; \ + else \ + for ((inm) = ia->ia_multiaddrs.lh_first; \ + (inm) != NULL && (inm)->inm_addr.s_addr != (addr).s_addr; \ + (inm) = inm->inm_entry.le_next) \ + continue; \ +} + +/* + * Macro to step through all of the in_multi records, one at a time. + * The current position is remembered in "step", which the caller must + * provide. IN_FIRST_MULTI(), below, must be called to initialize "step" + * and get the first record. Both macros return a NULL "inm" when there + * are no remaining records. 
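A usage sketch of the stepping macros defined just below; igmp_fasttimo() in igmp.c walks every membership with exactly this pattern (kernel context assumed, the wrapper function is illustrative):

static void
for_each_group(void)
{
	struct in_multistep step;
	struct in_multi *inm;

	IN_FIRST_MULTI(step, inm);		/* prime the walk */
	while (inm != NULL) {
		/* inm->inm_addr is the group, inm->inm_ifp the interface */
		IN_NEXT_MULTI(step, inm);	/* yields NULL when no records remain */
	}
}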
+ */ +#define IN_NEXT_MULTI(step, inm) \ + /* struct in_multistep step; */ \ + /* struct in_multi *inm; */ \ +{ \ + if (((inm) = (step).i_inm) != NULL) \ + (step).i_inm = (inm)->inm_entry.le_next; \ + else \ + while ((step).i_ia != NULL) { \ + (inm) = (step).i_ia->ia_multiaddrs.lh_first; \ + (step).i_ia = (step).i_ia->ia_next; \ + if ((inm) != NULL) { \ + (step).i_inm = (inm)->inm_entry.le_next; \ + break; \ + } \ + } \ +} + +#define IN_FIRST_MULTI(step, inm) \ + /* struct in_multistep step; */ \ + /* struct in_multi *inm; */ \ +{ \ + (step).i_ia = in_ifaddr; \ + (step).i_inm = NULL; \ + IN_NEXT_MULTI((step), (inm)); \ +} + +struct in_multi *in_addmulti(struct in_addr *, struct ifnet *); +void in_delmulti(struct in_multi *); +int in_control(struct socket *, u_long, caddr_t, struct ifnet *); +void in_rtqdrain(void); +void ip_input(struct mbuf *); + +#endif /* _KERNEL */ + +#endif /* _NETINET_IN_VAR_H_ */ diff --git a/netinet/ip.h b/netinet/ip.h new file mode 100644 index 0000000..6a2dcad --- /dev/null +++ b/netinet/ip.h @@ -0,0 +1,230 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip.h 8.2 (Berkeley) 6/1/94 + * $FreeBSD: src/sys/netinet/ip.h,v 1.29 2005/01/07 01:45:44 imp Exp $ + */ + + +#ifndef _NETINET_IP_H_ +#define _NETINET_IP_H_ + +#include +#include /* struct in_addr */ +#include /* n_long */ + +/* + * Definitions for internet protocol version 4. + * Per RFC 791, September 1981. + */ +#define IPVERSION 4 + +#ifndef __packed +#if defined(__GNUC__) +#define __packed __attribute__((packed)) +#define __aligned(x) __attribute__((aligned(x))) +#else +#define __packed +#define __aligned(x) +#endif +#endif + +/* + * Structure of an internet header, naked of options. 
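Since ip_hl counts 32-bit words, the payload of a datagram starts ip_hl << 2 bytes into the header (IP_VHL_HL() recovers the same value when _IP_VHL is in use). A minimal sketch of validating and locating the payload in a received, contiguous packet (assumes _IP_VHL is not defined; the function name is illustrative):

#include <stddef.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>

/* Return the payload offset in bytes, or -1 if the header is malformed. */
static int
ip_payload_offset(const struct ip *ip, size_t caplen)
{
	size_t hlen;

	if (caplen < sizeof(struct ip) || ip->ip_v != IPVERSION)
		return -1;
	hlen = (size_t)ip->ip_hl << 2;		/* ip_hl is in 32-bit words */
	if (hlen < sizeof(struct ip) || hlen > caplen)
		return -1;
	return (int)hlen;			/* options, if any, occupy hlen - 20 bytes */
}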
+ */ +struct ip { +#ifdef _IP_VHL + u_char ip_vhl; /* version << 4 | header length >> 2 */ +#else +#if BYTE_ORDER == LITTLE_ENDIAN + u_int ip_hl:4, /* header length */ + ip_v:4; /* version */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int ip_v:4, /* version */ + ip_hl:4; /* header length */ +#endif +#endif /* not _IP_VHL */ + u_char ip_tos; /* type of service */ + u_short ip_len; /* total length */ + u_short ip_id; /* identification */ + u_short ip_off; /* fragment offset field */ +#define IP_RF 0x8000 /* reserved fragment flag */ +#define IP_DF 0x4000 /* dont fragment flag */ +#define IP_MF 0x2000 /* more fragments flag */ +#define IP_OFFMASK 0x1fff /* mask for fragmenting bits */ + u_char ip_ttl; /* time to live */ + u_char ip_p; /* protocol */ + u_short ip_sum; /* checksum */ + struct in_addr ip_src,ip_dst; /* source and dest address */ +} __packed __aligned(4); + +#ifdef _IP_VHL +#define IP_MAKE_VHL(v, hl) ((v) << 4 | (hl)) +#define IP_VHL_HL(vhl) ((vhl) & 0x0f) +#define IP_VHL_V(vhl) ((vhl) >> 4) +#define IP_VHL_BORING 0x45 +#endif + +#ifdef CTASSERT +CTASSERT(sizeof (struct ip) == 20); +#endif + +#define IP_MAXPACKET 65535L /* maximum packet size */ + +/* + * Definitions for IP type of service (ip_tos) + */ +#define IPTOS_LOWDELAY 0x10 +#define IPTOS_THROUGHPUT 0x08 +#define IPTOS_RELIABILITY 0x04 +#define IPTOS_MINCOST 0x02 +#if 1 +/* ECN RFC3168 obsoletes RFC2481, and these will be deprecated soon. */ +#define IPTOS_CE 0x01 +#define IPTOS_ECT 0x02 +#endif + +/* + * Definitions for IP precedence (also in ip_tos) (hopefully unused) + */ +#define IPTOS_PREC_NETCONTROL 0xe0 +#define IPTOS_PREC_INTERNETCONTROL 0xc0 +#define IPTOS_PREC_CRITIC_ECP 0xa0 +#define IPTOS_PREC_FLASHOVERRIDE 0x80 +#define IPTOS_PREC_FLASH 0x60 +#define IPTOS_PREC_IMMEDIATE 0x40 +#define IPTOS_PREC_PRIORITY 0x20 +#define IPTOS_PREC_ROUTINE 0x00 + +/* + * ECN (Explicit Congestion Notification) codepoints in RFC3168 + * mapped to the lower 2 bits of the TOS field. + */ +#define IPTOS_ECN_NOTECT 0x00 /* not-ECT */ +#define IPTOS_ECN_ECT1 0x01 /* ECN-capable transport (1) */ +#define IPTOS_ECN_ECT0 0x02 /* ECN-capable transport (0) */ +#define IPTOS_ECN_CE 0x03 /* congestion experienced */ +#define IPTOS_ECN_MASK 0x03 /* ECN field mask */ + +/* + * Definitions for options. + */ +#define IPOPT_COPIED(o) ((o)&0x80) +#define IPOPT_CLASS(o) ((o)&0x60) +#define IPOPT_NUMBER(o) ((o)&0x1f) + +#define IPOPT_CONTROL 0x00 +#define IPOPT_RESERVED1 0x20 +#define IPOPT_DEBMEAS 0x40 +#define IPOPT_RESERVED2 0x60 + +#define IPOPT_EOL 0 /* end of option list */ +#define IPOPT_NOP 1 /* no operation */ + +#define IPOPT_RR 7 /* record packet route */ +#define IPOPT_TS 68 /* timestamp */ +#define IPOPT_SECURITY 130 /* provide s,c,h,tcc */ +#define IPOPT_LSRR 131 /* loose source route */ +#define IPOPT_ESO 133 /* extended security */ +#define IPOPT_CIPSO 134 /* commerical security */ +#define IPOPT_SATID 136 /* satnet id */ +#define IPOPT_SSRR 137 /* strict source route */ +#define IPOPT_RA 148 /* router alert */ + +/* + * Offsets to fields in options other than EOL and NOP. + */ +#define IPOPT_OPTVAL 0 /* option ID */ +#define IPOPT_OLEN 1 /* option length */ +#define IPOPT_OFFSET 2 /* offset within option */ +#define IPOPT_MINOFF 4 /* min value of above */ + +/* + * Time stamp option structure. 
+ */ +struct ip_timestamp { + u_char ipt_code; /* IPOPT_TS */ + u_char ipt_len; /* size of structure (variable) */ + u_char ipt_ptr; /* index of current entry */ +#if BYTE_ORDER == LITTLE_ENDIAN + u_int ipt_flg:4, /* flags, see below */ + ipt_oflw:4; /* overflow counter */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int ipt_oflw:4, /* overflow counter */ + ipt_flg:4; /* flags, see below */ +#endif + union ipt_timestamp { + n_long ipt_time[1]; + struct ipt_ta { + struct in_addr ipt_addr; + n_long ipt_time; + } ipt_ta[1]; + } ipt_timestamp; +}; + +#include + +/* flag bits for ipt_flg */ +#define IPOPT_TS_TSONLY 0 /* timestamps only */ +#define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */ +#define IPOPT_TS_PRESPEC 3 /* specified modules only */ + +/* bits for security (not byte swapped) */ +#define IPOPT_SECUR_UNCLASS 0x0000 +#define IPOPT_SECUR_CONFID 0xf135 +#define IPOPT_SECUR_EFTO 0x789a +#define IPOPT_SECUR_MMMM 0xbc4d +#define IPOPT_SECUR_RESTR 0xaf13 +#define IPOPT_SECUR_SECRET 0xd788 +#define IPOPT_SECUR_TOPSECRET 0x6bc5 + +/* + * Internet implementation parameters. + */ +#define MAXTTL 255 /* maximum time to live (seconds) */ +#define IPDEFTTL 64 /* default ttl, from RFC 1340 */ +#define IPFRAGTTL 60 /* time to live for frags, slowhz */ +#define IPTTLDEC 1 /* subtracted when forwarding */ + +#define IP_MSS 576 /* default maximum segment size */ + +/* + * This is the real IPv4 pseudo header, used for computing the TCP and UDP + * checksums. For the Internet checksum, struct ipovly can be used instead. + * For stronger checksums, the real thing must be used. + */ +struct ippseudo { + struct in_addr ippseudo_src; /* source internet address */ + struct in_addr ippseudo_dst; /* destination internet address */ + u_char ippseudo_pad; /* pad, must be zero */ + u_char ippseudo_p; /* protocol */ + u_short ippseudo_len; /* protocol length */ +}; +#endif diff --git a/netinet/ip_divert.c b/netinet/ip_divert.c new file mode 100644 index 0000000..c34da82 --- /dev/null +++ b/netinet/ip_divert.c @@ -0,0 +1,387 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
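The struct ippseudo declared at the end of ip.h above is the block a TCP or UDP checksum is accumulated over before the transport header itself; a hedged sketch of how its fields would be filled from an IP header (the helper name and the network-order length argument are assumptions, the field meanings come from the header):

static void
ip_fill_pseudo(struct ippseudo *ph, const struct ip *ip, u_short transport_len)
{
	ph->ippseudo_src = ip->ip_src;		/* source internet address */
	ph->ippseudo_dst = ip->ip_dst;		/* destination internet address */
	ph->ippseudo_pad = 0;			/* pad, must be zero */
	ph->ippseudo_p   = ip->ip_p;		/* IPPROTO_TCP or IPPROTO_UDP */
	ph->ippseudo_len = htons(transport_len);	/* transport header + data */
}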
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/netinet/ip_divert.c,v 1.113 2005/05/13 11:44:37 glebius Exp $ + */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef IPDIVERT + +/* + * Divert sockets + */ + +/* + * Allocate enough space to hold a full IP packet + */ +#define DIVSNDQ (65536 + 100) +#define DIVRCVQ (65536 + 100) + +/* Global variables */ + +/* + * ip_input() and ip_output() set this secret value before calling us to + * let us know which divert port to divert a packet to; this is done so + * we can use the existing prototype for struct protosw's pr_input(). + * This is stored in host order. + */ +u_short ip_divert_port; + +/* + * We set this value to a non-zero port number when we want the call to + * ip_fw_chk() in ip_input() or ip_output() to ignore ``divert '' + * chain entries. This is stored in host order. + */ +u_short ip_divert_ignore; + +/* Internal variables. */ +static struct inpcbhead divcb; +static struct inpcbinfo divcbinfo; + +static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ +static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ + +/* Optimization: have this preinitialized */ +static struct sockaddr_in divsrc = { sizeof(divsrc), AF_INET, 0, { 0 }, { 0 } }; + +/* Internal functions */ + +static int div_output(struct socket *so, + struct mbuf *m, struct mbuf *addr, struct mbuf *control); + +/* + * Initialize divert connection block queue. + */ +void +div_init(void) +{ + LIST_INIT(&divcb); + divcbinfo.listhead = &divcb; + /* + * XXX We don't use the hash list for divert IP, but it's easier + * to allocate a one entry hash list than it is to check all + * over the place for hashbase == NULL. + */ + divcbinfo.hashbase = hashinit(1, M_PCB, &divcbinfo.hashmask); +} + +/* + * Setup generic address and protocol structures + * for div_input routine, then pass them along with + * mbuf chain. ip->ip_len is assumed to have had + * the header length (hlen) subtracted out already. + * We tell whether the packet was incoming or outgoing + * by seeing if hlen == 0, which is a hack. 
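ip_divert_port above carries the divert port of the matching firewall rule into the divert input path; on the application side, diverted packets are read and re-injected through a divert socket bound to that same port. The following user-level sketch is illustrative only and assumes the stack exposes IPPROTO_DIVERT the way FreeBSD does:

#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>

int
open_divert_socket(unsigned short port)
{
	struct sockaddr_in sin;
	int fd;

	fd = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT);
	if (fd < 0)
		return (-1);

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);		/* must match the rule's divert port */
	sin.sin_addr.s_addr = INADDR_ANY;

	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
		close(fd);
		return (-1);
	}
	return (fd);	/* recvfrom() now yields diverted packets; sendto() re-injects them */
}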
+ */ +void +div_input(struct mbuf *m, int hlen) +{ + struct ip *ip; + struct inpcb *inp; + struct socket *sa; + + /* Sanity check */ + if (ip_divert_port == 0) + panic("div_input: port is 0"); + + /* Assure header */ + if (m->m_len < sizeof(struct ip) && + (m = m_pullup(m, sizeof(struct ip))) == 0) { + return; + } + ip = mtod(m, struct ip *); + + /* Record divert port */ + divsrc.sin_port = htons(ip_divert_port); + + /* Restore packet header fields */ + ip->ip_len += hlen; + HTONS(ip->ip_len); + HTONS(ip->ip_off); + + /* Record receive interface address, if any */ + divsrc.sin_addr.s_addr = 0; + if (hlen) { + struct ifaddr *ifa; + +#ifdef DIAGNOSTIC + /* Sanity check */ + if (!(m->m_flags & M_PKTHDR)) + panic("div_input: no pkt hdr"); +#endif + + /* More fields affected by ip_input() */ + HTONS(ip->ip_id); + + /* Find IP address for recieve interface */ + for (ifa = m->m_pkthdr.rcvif->if_addrlist; + ifa != NULL; ifa = ifa->ifa_next) { + if (ifa->ifa_addr == NULL) + continue; + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + divsrc.sin_addr = + ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; + break; + } + } + + /* Put packet on socket queue, if any */ + sa = NULL; + for (inp = divcb.lh_first; inp != NULL; inp = inp->inp_list.le_next) { + if (inp->inp_lport == htons(ip_divert_port)) + sa = inp->inp_socket; + } + if (sa) { + if (sbappendaddr(&sa->so_rcv, (struct sockaddr *)&divsrc, + m, (struct mbuf *)0) == 0) + m_freem(m); + else + sorwakeup(sa); + } else { + m_freem(m); + ipstat.ips_noproto++; + ipstat.ips_delivered--; + } +} + +/* + * Deliver packet back into the IP processing machinery. + * + * If no address specified, or address is 0.0.0.0, send to ip_output(); + * otherwise, send to ip_input() and mark as having been received on + * the interface with that address. + * + * If no address specified, or dest port is 0, allow packet to divert + * back to this socket; otherwise, don't. 
+ */ +static int +div_output(struct socket *so, struct mbuf *m, + struct mbuf *addr, struct mbuf *control) +{ + register struct inpcb *const inp = sotoinpcb(so); + register struct ip *const ip = mtod(m, struct ip *); + struct sockaddr_in *sin = NULL; + int error = 0; + + if (control) + m_freem(control); /* XXX */ + if (addr) + sin = mtod(addr, struct sockaddr_in *); + + /* Loopback avoidance option */ + ip_divert_ignore = ntohs(inp->inp_lport); + + /* Reinject packet into the system as incoming or outgoing */ + if (!sin || sin->sin_addr.s_addr == 0) { + /* Don't allow both user specified and setsockopt options, + and don't allow packet length sizes that will crash */ + if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || + ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { + error = EINVAL; + goto cantsend; + } + + /* Convert fields to host order for ip_output() */ + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); + + /* Send packet to output processing */ + ipstat.ips_rawout++; /* XXX */ + error = ip_output(m, inp->inp_options, &inp->inp_route, + (so->so_options & SO_DONTROUTE) | + IP_ALLOWBROADCAST | IP_RAWOUTPUT, inp->inp_moptions); + } else { + struct ifaddr *ifa; + + /* Find receive interface with the given IP address */ + sin->sin_port = 0; + if ((ifa = ifa_ifwithaddr((struct sockaddr *) sin)) == 0) { + error = EADDRNOTAVAIL; + goto cantsend; + } + m->m_pkthdr.rcvif = ifa->ifa_ifp; + + /* Send packet to input processing */ + ip_input(m); + } + + /* Reset for next time (and other packets) */ + ip_divert_ignore = 0; + return error; + +cantsend: + ip_divert_ignore = 0; + m_freem(m); + return error; +} + +/*ARGSUSED*/ +int +div_usrreq( + struct socket *so, + int req, + struct mbuf *m, + struct mbuf *nam, + struct mbuf *control ) +{ + register int error = 0; + register struct inpcb *inp = sotoinpcb(so); + int s; + + if (inp == NULL && req != PRU_ATTACH) { + error = EINVAL; + goto release; + } + switch (req) { + + case PRU_ATTACH: + if (inp) + panic("div_attach"); + if ((so->so_state & SS_PRIV) == 0) { + error = EACCES; + break; + } + s = splnet(); + error = in_pcballoc(so, &divcbinfo); + splx(s); + if (error) + break; + error = soreserve(so, div_sendspace, div_recvspace); + if (error) + break; + inp = (struct inpcb *)so->so_pcb; + inp->inp_ip_p = (intptr_t)nam; /* XXX */ + inp->inp_flags |= INP_HDRINCL; + /* The socket is always "connected" because + we always know "where" to send the packet */ + so->so_state |= SS_ISCONNECTED; + break; + + case PRU_DISCONNECT: + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + break; + } + /* FALLTHROUGH */ + case PRU_ABORT: + soisdisconnected(so); + /* FALLTHROUGH */ + case PRU_DETACH: + if (inp == 0) + panic("div_detach"); + in_pcbdetach(inp); + break; + + case PRU_BIND: + s = splnet(); + error = in_pcbbind(inp, nam); + splx(s); + break; + + /* + * Mark the connection as being incapable of further input. + */ + case PRU_SHUTDOWN: + socantsendmore(so); + break; + + case PRU_SEND: + /* Packet must have a header (but that's about it) */ + if (m->m_len < sizeof (struct ip) || + (m = m_pullup(m, sizeof (struct ip))) == 0) { + ipstat.ips_toosmall++; + error = EINVAL; + break; + } + + /* Send packet */ + error = div_output(so, m, nam, control); + m = NULL; + break; + + case PRU_SOCKADDR: + in_setsockaddr(inp, nam); + break; + + case PRU_SENSE: + /* + * stat: don't bother with a blocksize. + */ + return (0); + + /* + * Not supported. 
+ */ + case PRU_CONNECT: + case PRU_CONNECT2: + case PRU_CONTROL: + case PRU_RCVOOB: + case PRU_RCVD: + case PRU_LISTEN: + case PRU_ACCEPT: + case PRU_SENDOOB: + case PRU_PEERADDR: + error = EOPNOTSUPP; + break; + + default: + panic("div_usrreq"); + } +release: + if (m) + m_freem(m); + return (error); +} +#endif /* IPDIVERT */ diff --git a/netinet/ip_fw.c b/netinet/ip_fw.c new file mode 100644 index 0000000..e5f8ab2 --- /dev/null +++ b/netinet/ip_fw.c @@ -0,0 +1,1082 @@ +#include + +/* + * Copyright (c) 1996 Alex Nash + * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + */ + +/* + * Implement IP packet firewall + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifndef IPFIREWALL_MODULE +#include "opt_ipfw.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fw_debug = 1; +#ifdef IPFIREWALL_VERBOSE +static int fw_verbose = 1; +#else +static int fw_verbose = 0; +#endif +#ifdef IPFIREWALL_VERBOSE_LIMIT +static int fw_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; +#else +static int fw_verbose_limit = 0; +#endif + +LIST_HEAD (ip_fw_head, ip_fw_chain) ip_fw_chain; + +/* + * ccj - No current need for firewall so have provided the MIB. 
+ */ +#if 0 +#ifdef SYSCTL_NODE +SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, &fw_debug, 0, ""); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW, &fw_verbose, 0, ""); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &fw_verbose_limit, 0, ""); +#endif +#endif + +#define dprintf(a) if (!fw_debug); else printf a + +#define print_ip(a) printf("%"PRId32".%"PRId32".%"PRId32".%"PRId32,\ + (ntohl(a.s_addr)>>24)&0xFF,\ + (ntohl(a.s_addr)>>16)&0xFF,\ + (ntohl(a.s_addr)>>8)&0xFF,\ + (ntohl(a.s_addr))&0xFF); + +#define dprint_ip(a) if (!fw_debug); else print_ip(a) + +static int add_entry(struct ip_fw_head *chainptr, struct ip_fw *frwl); +static int del_entry(struct ip_fw_head *chainptr, u_short number); +static int zero_entry(struct mbuf *m); +static struct ip_fw *check_ipfw_struct(struct ip_fw *m); +static struct ip_fw *check_ipfw_mbuf(struct mbuf *fw); +static int ipopts_match(struct ip *ip, struct ip_fw *f); +static int port_match(u_short *portptr, int nports, u_short port, + int range_flag); +static int tcpflg_match(struct tcphdr *tcp, struct ip_fw *f); +static int icmptype_match(struct icmp * icmp, struct ip_fw * f); +static void ipfw_report(struct ip_fw *f, struct ip *ip, + struct ifnet *rif, struct ifnet *oif); + +#ifdef IPFIREWALL_MODULE +static ip_fw_chk_t *old_chk_ptr; +static ip_fw_ctl_t *old_ctl_ptr; +#endif + +static int ip_fw_chk(struct ip **pip, int hlen, + struct ifnet *oif, int ignport, struct mbuf **m); +static int ip_fw_ctl(int stage, struct mbuf **mm); + +static char err_prefix[] = "ip_fw_ctl:"; + +/* + * Returns 1 if the port is matched by the vector, 0 otherwise + */ +static inline int +port_match(u_short *portptr, int nports, u_short port, int range_flag) +{ + if (!nports) + return 1; + if (range_flag) { + if (portptr[0] <= port && port <= portptr[1]) { + return 1; + } + nports -= 2; + portptr += 2; + } + while (nports-- > 0) { + if (*portptr++ == port) { + return 1; + } + } + return 0; +} + +static int +tcpflg_match(struct tcphdr *tcp, struct ip_fw *f) +{ + u_char flg_set, flg_clr; + + if ((f->fw_tcpf & IP_FW_TCPF_ESTAB) && + (tcp->th_flags & (IP_FW_TCPF_RST | IP_FW_TCPF_ACK))) + return 1; + + flg_set = tcp->th_flags & f->fw_tcpf; + flg_clr = tcp->th_flags & f->fw_tcpnf; + + if (flg_set != f->fw_tcpf) + return 0; + if (flg_clr) + return 0; + + return 1; +} + +static int +icmptype_match(struct icmp *icmp, struct ip_fw *f) +{ + int type; + + if (!(f->fw_flg & IP_FW_F_ICMPBIT)) + return(1); + + type = icmp->icmp_type; + + /* check for matching type in the bitmap */ + if (type < IP_FW_ICMPTYPES_DIM * sizeof(unsigned) * 8 && + (f->fw_icmptypes[type / (sizeof(unsigned) * 8)] & + (1U << (type % (8 * sizeof(unsigned)))))) + return(1); + + return(0); /* no match */ +} + +static int +ipopts_match(struct ip *ip, struct ip_fw *f) +{ + register u_char *cp; + int opt, optlen, cnt; + u_char opts, nopts, nopts_sve; + + cp = (u_char *)(ip + 1); + cnt = (ip->ip_hl << 2) - sizeof (struct ip); + opts = f->fw_ipopt; + nopts = nopts_sve = f->fw_ipnopt; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= 0 || optlen > cnt) { + return 0; /*XXX*/ + } + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + opts &= ~IP_FW_IPOPT_LSRR; + nopts &= ~IP_FW_IPOPT_LSRR; + break; + + case IPOPT_SSRR: + opts &= ~IP_FW_IPOPT_SSRR; + nopts &= 
~IP_FW_IPOPT_SSRR; + break; + + case IPOPT_RR: + opts &= ~IP_FW_IPOPT_RR; + nopts &= ~IP_FW_IPOPT_RR; + break; + case IPOPT_TS: + opts &= ~IP_FW_IPOPT_TS; + nopts &= ~IP_FW_IPOPT_TS; + break; + } + if (opts == nopts) + break; + } + if (opts == 0 && nopts == nopts_sve) + return 1; + else + return 0; +} + +static inline int +iface_match(struct ifnet *ifp, union ip_fw_if *ifu, int byname) +{ + /* Check by name or by IP address */ + if (byname) { + /* Check unit number (-1 is wildcard) */ + if (ifu->fu_via_if.unit != -1 + && ifp->if_unit != ifu->fu_via_if.unit) + return(0); + /* Check name */ + if (strncmp(ifp->if_name, ifu->fu_via_if.name, FW_IFNLEN)) + return(0); + return(1); + } else if (ifu->fu_via_ip.s_addr != 0) { /* Zero == wildcard */ + struct ifaddr *ia; + + for (ia = ifp->if_addrlist; ia; ia = ia->ifa_next) { + if (ia->ifa_addr == NULL) + continue; + if (ia->ifa_addr->sa_family != AF_INET) + continue; + if (ifu->fu_via_ip.s_addr != ((struct sockaddr_in *) + (ia->ifa_addr))->sin_addr.s_addr) + continue; + return(1); + } + return(0); + } + return(1); +} + +static void +ipfw_report(struct ip_fw *f, struct ip *ip, + struct ifnet *rif, struct ifnet *oif) +{ + static int counter; + struct tcphdr *const tcp = (struct tcphdr *) ((u_long *) ip+ ip->ip_hl); + struct udphdr *const udp = (struct udphdr *) ((u_long *) ip+ ip->ip_hl); + struct icmp *const icmp = (struct icmp *) ((u_long *) ip + ip->ip_hl); + int count; + + count = f ? f->fw_pcnt : ++counter; + if (fw_verbose_limit != 0 && count > fw_verbose_limit) + return; + + /* Print command name */ + printf("ipfw: %d ", f ? f->fw_number : -1); + if (!f) + printf("Refuse"); + else + switch (f->fw_flg & IP_FW_F_COMMAND) { + case IP_FW_F_DENY: + printf("Deny"); + break; + case IP_FW_F_REJECT: + if (f->fw_reject_code == IP_FW_REJECT_RST) + printf("Reset"); + else + printf("Unreach"); + break; + case IP_FW_F_ACCEPT: + printf("Accept"); + break; + case IP_FW_F_COUNT: + printf("Count"); + break; + case IP_FW_F_DIVERT: + printf("Divert %d", f->fw_divert_port); + break; + case IP_FW_F_TEE: + printf("Tee %d", f->fw_divert_port); + break; + case IP_FW_F_SKIPTO: + printf("SkipTo %d", f->fw_skipto_rule); + break; + default: + printf("UNKNOWN"); + break; + } + printf(" "); + + switch (ip->ip_p) { + case IPPROTO_TCP: + printf("TCP "); + print_ip(ip->ip_src); + if ((ip->ip_off & IP_OFFMASK) == 0) + printf(":%d ", ntohs(tcp->th_sport)); + else + printf(" "); + print_ip(ip->ip_dst); + if ((ip->ip_off & IP_OFFMASK) == 0) + printf(":%d", ntohs(tcp->th_dport)); + break; + case IPPROTO_UDP: + printf("UDP "); + print_ip(ip->ip_src); + if ((ip->ip_off & IP_OFFMASK) == 0) + printf(":%d ", ntohs(udp->uh_sport)); + else + printf(" "); + print_ip(ip->ip_dst); + if ((ip->ip_off & IP_OFFMASK) == 0) + printf(":%d", ntohs(udp->uh_dport)); + break; + case IPPROTO_ICMP: + printf("ICMP:%u.%u ", icmp->icmp_type, icmp->icmp_code); + print_ip(ip->ip_src); + printf(" "); + print_ip(ip->ip_dst); + break; + default: + printf("P:%d ", ip->ip_p); + print_ip(ip->ip_src); + printf(" "); + print_ip(ip->ip_dst); + break; + } + if (oif) + printf(" out via %s%d", oif->if_name, oif->if_unit); + else if (rif) + printf(" in via %s%d", rif->if_name, rif->if_unit); + if ((ip->ip_off & IP_OFFMASK)) + printf(" Fragment = %d",ip->ip_off & IP_OFFMASK); + printf("\n"); + if (fw_verbose_limit != 0 && count == fw_verbose_limit) + printf("ipfw: limit reached on rule #%d\n", + f ? 
f->fw_number : -1); +} + +/* + * Parameters: + * + * ip Pointer to packet header (struct ip *) + * hlen Packet header length + * oif Outgoing interface, or NULL if packet is incoming + * ignport Ignore all divert/tee rules to this port (if non-zero) + * *m The packet; we set to NULL when/if we nuke it. + * + * Return value: + * + * 0 The packet is to be accepted and routed normally OR + * the packet was denied/rejected and has been dropped; + * in the latter case, *m is equal to NULL upon return. + * port Divert the packet to port. + */ + +static int +ip_fw_chk(struct ip **pip, int hlen, + struct ifnet *oif, int ignport, struct mbuf **m) +{ + struct ip_fw_chain *chain; + struct ip_fw *rule = NULL; + struct ip *ip = *pip; + struct ifnet *const rif = (*m)->m_pkthdr.rcvif; + u_short offset = (ip->ip_off & IP_OFFMASK); + u_short src_port, dst_port; + + /* + * Go down the chain, looking for enlightment + */ + for (chain=ip_fw_chain.lh_first; chain; chain = chain->chain.le_next) { + register struct ip_fw *const f = chain->rule; + + /* Check direction inbound */ + if (!oif && !(f->fw_flg & IP_FW_F_IN)) + continue; + + /* Check direction outbound */ + if (oif && !(f->fw_flg & IP_FW_F_OUT)) + continue; + + /* Fragments */ + if ((f->fw_flg & IP_FW_F_FRAG) && !(ip->ip_off & IP_OFFMASK)) + continue; + + /* If src-addr doesn't match, not this rule. */ + if (((f->fw_flg & IP_FW_F_INVSRC) != 0) ^ ((ip->ip_src.s_addr + & f->fw_smsk.s_addr) != f->fw_src.s_addr)) + continue; + + /* If dest-addr doesn't match, not this rule. */ + if (((f->fw_flg & IP_FW_F_INVDST) != 0) ^ ((ip->ip_dst.s_addr + & f->fw_dmsk.s_addr) != f->fw_dst.s_addr)) + continue; + + /* Interface check */ + if ((f->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) { + struct ifnet *const iface = oif ? oif : rif; + + /* Backwards compatibility hack for "via" */ + if (!iface || !iface_match(iface, + &f->fw_in_if, f->fw_flg & IP_FW_F_OIFNAME)) + continue; + } else { + /* Check receive interface */ + if ((f->fw_flg & IP_FW_F_IIFACE) + && (!rif || !iface_match(rif, + &f->fw_in_if, f->fw_flg & IP_FW_F_IIFNAME))) + continue; + /* Check outgoing interface */ + if ((f->fw_flg & IP_FW_F_OIFACE) + && (!oif || !iface_match(oif, + &f->fw_out_if, f->fw_flg & IP_FW_F_OIFNAME))) + continue; + } + + /* Check IP options */ + if (f->fw_ipopt != f->fw_ipnopt && !ipopts_match(ip, f)) + continue; + + /* Check protocol; if wildcard, match */ + if (f->fw_prot == IPPROTO_IP) + goto got_match; + + /* If different, don't match */ + if (ip->ip_p != f->fw_prot) + continue; + +#define PULLUP_TO(len) do { \ + if ((*m)->m_len < (len) \ + && (*m = m_pullup(*m, (len))) == 0) { \ + goto bogusfrag; \ + } \ + *pip = ip = mtod(*m, struct ip *); \ + offset = (ip->ip_off & IP_OFFMASK); \ + } while (0) + + /* Protocol specific checks */ + switch (ip->ip_p) { + case IPPROTO_TCP: + { + struct tcphdr *tcp; + + if (offset == 1) /* cf. RFC 1858 */ + goto bogusfrag; + if (offset != 0) { + /* + * TCP flags and ports aren't available in this + * packet -- if this rule specified either one, + * we consider the rule a non-match. 
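The inverted source/destination address tests near the top of the rule loop above are compact; this illustrative helper (not in the original) restates the source case: the rule is skipped when exactly one of "inversion requested" and "masked address differs" is true.

static int
fw_src_check_skips_rule(const struct ip_fw *f, struct in_addr src)
{
	int invert   = (f->fw_flg & IP_FW_F_INVSRC) != 0;
	int mismatch = (src.s_addr & f->fw_smsk.s_addr) != f->fw_src.s_addr;

	/*
	 * Plain rule: skip on mismatch.
	 * IP_FW_F_INVSRC rule: skip on match, i.e. the rule applies only
	 * to sources that do NOT fall inside fw_src/fw_smsk.
	 */
	return (invert ^ mismatch);
}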
+ */ + if (f->fw_nports != 0 || + f->fw_tcpf != f->fw_tcpnf) + continue; + + break; + } + PULLUP_TO(hlen + 14); + tcp = (struct tcphdr *) ((u_long *)ip + ip->ip_hl); + if (f->fw_tcpf != f->fw_tcpnf && !tcpflg_match(tcp, f)) + continue; + src_port = ntohs(tcp->th_sport); + dst_port = ntohs(tcp->th_dport); + goto check_ports; + } + + case IPPROTO_UDP: + { + struct udphdr *udp; + + if (offset != 0) { + /* + * Port specification is unavailable -- if this + * rule specifies a port, we consider the rule + * a non-match. + */ + if (f->fw_nports != 0) + continue; + + break; + } + PULLUP_TO(hlen + 4); + udp = (struct udphdr *) ((u_long *)ip + ip->ip_hl); + src_port = ntohs(udp->uh_sport); + dst_port = ntohs(udp->uh_dport); +check_ports: + if (!port_match(&f->fw_pts[0], + IP_FW_GETNSRCP(f), src_port, + f->fw_flg & IP_FW_F_SRNG)) + continue; + if (!port_match(&f->fw_pts[IP_FW_GETNSRCP(f)], + IP_FW_GETNDSTP(f), dst_port, + f->fw_flg & IP_FW_F_DRNG)) + continue; + break; + } + + case IPPROTO_ICMP: + { + struct icmp *icmp; + + if (offset != 0) /* Type isn't valid */ + break; + PULLUP_TO(hlen + 2); + icmp = (struct icmp *) ((u_long *)ip + ip->ip_hl); + if (!icmptype_match(icmp, f)) + continue; + break; + } +#undef PULLUP_TO + +bogusfrag: + if (fw_verbose) + ipfw_report(NULL, ip, rif, oif); + goto dropit; + } + +got_match: + /* Ignore divert/tee rule if socket port is "ignport" */ + switch (f->fw_flg & IP_FW_F_COMMAND) { + case IP_FW_F_DIVERT: + case IP_FW_F_TEE: + if (f->fw_divert_port == ignport) + continue; /* ignore this rule */ + break; + } + + /* Update statistics */ + f->fw_pcnt += 1; + f->fw_bcnt += ip->ip_len; + f->timestamp = rtems_bsdnet_seconds_since_boot(); + + /* Log to console if desired */ + if ((f->fw_flg & IP_FW_F_PRN) && fw_verbose) + ipfw_report(f, ip, rif, oif); + + /* Take appropriate action */ + switch (f->fw_flg & IP_FW_F_COMMAND) { + case IP_FW_F_ACCEPT: + return(0); + case IP_FW_F_COUNT: + continue; + case IP_FW_F_DIVERT: + return(f->fw_divert_port); + case IP_FW_F_TEE: + /* + * XXX someday tee packet here, but beware that you + * can't use m_copym() or m_copypacket() because + * the divert input routine modifies the mbuf + * (and these routines only increment reference + * counts in the case of mbuf clusters), so need + * to write custom routine. + */ + continue; + case IP_FW_F_SKIPTO: +#ifdef DIAGNOSTIC + while (chain->chain.le_next + && chain->chain.le_next->rule->fw_number + < f->fw_skipto_rule) +#else + while (chain->chain.le_next->rule->fw_number + < f->fw_skipto_rule) +#endif + chain = chain->chain.le_next; + continue; + } + + /* Deny/reject this packet using this rule */ + rule = f; + break; + } + +#ifdef DIAGNOSTIC + /* Rule 65535 should always be there and should always match */ + if (!chain) + panic("ip_fw: chain"); +#endif + + /* + * At this point, we're going to drop the packet. 
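check_ports above hands port_match() the shared fw_pts[] vector; the sketch below (host-side, assert() used only for exposition, not part of the original) shows how a source range plus one extra port is encoded and what it matches:

#include <assert.h>

static void
port_match_example(void)
{
	/* With the range flag set, the first two entries are a min/max pair. */
	u_short pts[3] = { 1024, 2000, 8080 };

	assert(port_match(pts, 3, 1500, 1) == 1);	/* inside 1024-2000 */
	assert(port_match(pts, 3, 8080, 1) == 1);	/* listed single port */
	assert(port_match(pts, 3, 21,   1) == 0);	/* matches nothing */
	assert(port_match(pts, 0, 21,   0) == 1);	/* empty vector matches every port */
}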
+ * Send a reject notice if all of the following are true: + * + * - The packet matched a reject rule + * - The packet is not an ICMP packet + * - The packet is not a multicast or broadcast packet + */ + if ((rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_REJECT + && ip->ip_p != IPPROTO_ICMP + && !((*m)->m_flags & (M_BCAST|M_MCAST)) + && !IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + switch (rule->fw_reject_code) { + case IP_FW_REJECT_RST: + { + struct tcphdr *const tcp = + (struct tcphdr *) ((u_long *)ip + ip->ip_hl); + struct tcpiphdr ti, *const tip = (struct tcpiphdr *) ip; + + if (offset != 0 || (tcp->th_flags & TH_RST)) + break; + ti.ti_i = *((struct ipovly *) ip); + ti.ti_t = *tcp; + bcopy(&ti, ip, sizeof(ti)); + NTOHL(tip->ti_seq); + NTOHL(tip->ti_ack); + tip->ti_len = ip->ip_len - hlen - (tip->ti_off << 2); + if (tcp->th_flags & TH_ACK) { + tcp_respond(NULL, tip, *m, + (tcp_seq)0, ntohl(tcp->th_ack), TH_RST); + } else { + if (tcp->th_flags & TH_SYN) + tip->ti_len++; + tcp_respond(NULL, tip, *m, tip->ti_seq + + tip->ti_len, (tcp_seq)0, TH_RST|TH_ACK); + } + *m = NULL; + break; + } + default: /* Send an ICMP unreachable using code */ + icmp_error(*m, ICMP_UNREACH, + rule->fw_reject_code, 0L, 0); + *m = NULL; + break; + } + } + +dropit: + /* + * Finally, drop the packet. + */ + if (*m) { + m_freem(*m); + *m = NULL; + } + return(0); +} + +static int +add_entry(struct ip_fw_head *chainptr, struct ip_fw *frwl) +{ + struct ip_fw *ftmp = 0; + struct ip_fw_chain *fwc = 0, *fcp, *fcpl = 0; + u_short nbr = 0; + int s; + + fwc = malloc(sizeof *fwc, M_IPFW, M_DONTWAIT); + ftmp = malloc(sizeof *ftmp, M_IPFW, M_DONTWAIT); + if (!fwc || !ftmp) { + dprintf(("%s malloc said no\n", err_prefix)); + if (fwc) free(fwc, M_IPFW); + if (ftmp) free(ftmp, M_IPFW); + return (ENOSPC); + } + + bcopy(frwl, ftmp, sizeof(struct ip_fw)); + ftmp->fw_in_if.fu_via_if.name[FW_IFNLEN - 1] = '\0'; + ftmp->fw_pcnt = 0L; + ftmp->fw_bcnt = 0L; + fwc->rule = ftmp; + + s = splnet(); + + if (!chainptr->lh_first) { + LIST_INSERT_HEAD(chainptr, fwc, chain); + splx(s); + return(0); + } else if (ftmp->fw_number == (u_short)-1) { + if (fwc) free(fwc, M_IPFW); + if (ftmp) free(ftmp, M_IPFW); + splx(s); + dprintf(("%s bad rule number\n", err_prefix)); + return (EINVAL); + } + + /* If entry number is 0, find highest numbered rule and add 100 */ + if (ftmp->fw_number == 0) { + for (fcp = chainptr->lh_first; fcp; fcp = fcp->chain.le_next) { + if (fcp->rule->fw_number != (u_short)-1) + nbr = fcp->rule->fw_number; + else + break; + } + if (nbr < (u_short)-1 - 100) + nbr += 100; + ftmp->fw_number = nbr; + } + + /* Got a valid number; now insert it, keeping the list ordered */ + for (fcp = chainptr->lh_first; fcp; fcp = fcp->chain.le_next) { + if (fcp->rule->fw_number > ftmp->fw_number) { + if (fcpl) { + LIST_INSERT_AFTER(fcpl, fwc, chain); + } else { + LIST_INSERT_HEAD(chainptr, fwc, chain); + } + break; + } else { + fcpl = fcp; + } + } + + splx(s); + return (0); +} + +static int +del_entry(struct ip_fw_head *chainptr, u_short number) +{ + struct ip_fw_chain *fcp; + int s; + + s = splnet(); + + fcp = chainptr->lh_first; + if (number != (u_short)-1) { + for (; fcp; fcp = fcp->chain.le_next) { + if (fcp->rule->fw_number == number) { + LIST_REMOVE(fcp, chain); + splx(s); + free(fcp->rule, M_IPFW); + free(fcp, M_IPFW); + return 0; + } + } + } + + splx(s); + return (EINVAL); +} + +static int +zero_entry(struct mbuf *m) +{ + struct ip_fw *frwl; + struct ip_fw_chain *fcp; + int s; + + if (m) { + if (m->m_len != sizeof(struct ip_fw)) + return(EINVAL); + 
frwl = mtod(m, struct ip_fw *); + } + else + frwl = NULL; + + /* + * It's possible to insert multiple chain entries with the + * same number, so we don't stop after finding the first + * match if zeroing a specific entry. + */ + s = splnet(); + for (fcp = ip_fw_chain.lh_first; fcp; fcp = fcp->chain.le_next) + if (!frwl || frwl->fw_number == fcp->rule->fw_number) { + fcp->rule->fw_bcnt = fcp->rule->fw_pcnt = 0; + fcp->rule->timestamp = 0; + } + splx(s); + + if (fw_verbose) { + if (frwl) + printf("ipfw: Entry %d cleared.\n", frwl->fw_number); + else + printf("ipfw: Accounting cleared.\n"); + } + + return(0); +} + +static struct ip_fw * +check_ipfw_mbuf(struct mbuf *m) +{ + /* Check length */ + if (m->m_len != sizeof(struct ip_fw)) { + dprintf(("%s len=%d, want %d\n", err_prefix, m->m_len, + (int)sizeof(struct ip_fw))); + return (NULL); + } + return(check_ipfw_struct(mtod(m, struct ip_fw *))); +} + +static struct ip_fw * +check_ipfw_struct(struct ip_fw *frwl) +{ + /* Check for invalid flag bits */ + if ((frwl->fw_flg & ~IP_FW_F_MASK) != 0) { + dprintf(("%s undefined flag bits set (flags=%x)\n", + err_prefix, frwl->fw_flg)); + return (NULL); + } + /* Must apply to incoming or outgoing (or both) */ + if (!(frwl->fw_flg & (IP_FW_F_IN | IP_FW_F_OUT))) { + dprintf(("%s neither in nor out\n", err_prefix)); + return (NULL); + } + /* Empty interface name is no good */ + if (((frwl->fw_flg & IP_FW_F_IIFNAME) + && !*frwl->fw_in_if.fu_via_if.name) + || ((frwl->fw_flg & IP_FW_F_OIFNAME) + && !*frwl->fw_out_if.fu_via_if.name)) { + dprintf(("%s empty interface name\n", err_prefix)); + return (NULL); + } + /* Sanity check interface matching */ + if ((frwl->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) { + ; /* allow "via" backwards compatibility */ + } else if ((frwl->fw_flg & IP_FW_F_IN) + && (frwl->fw_flg & IP_FW_F_OIFACE)) { + dprintf(("%s outgoing interface check on incoming\n", + err_prefix)); + return (NULL); + } + /* Sanity check port ranges */ + if ((frwl->fw_flg & IP_FW_F_SRNG) && IP_FW_GETNSRCP(frwl) < 2) { + dprintf(("%s src range set but n_src_p=%d\n", + err_prefix, IP_FW_GETNSRCP(frwl))); + return (NULL); + } + if ((frwl->fw_flg & IP_FW_F_DRNG) && IP_FW_GETNDSTP(frwl) < 2) { + dprintf(("%s dst range set but n_dst_p=%d\n", + err_prefix, IP_FW_GETNDSTP(frwl))); + return (NULL); + } + if (IP_FW_GETNSRCP(frwl) + IP_FW_GETNDSTP(frwl) > IP_FW_MAX_PORTS) { + dprintf(("%s too many ports (%d+%d)\n", + err_prefix, IP_FW_GETNSRCP(frwl), IP_FW_GETNDSTP(frwl))); + return (NULL); + } + /* + * Protocols other than TCP/UDP don't use port range + */ + if ((frwl->fw_prot != IPPROTO_TCP) && + (frwl->fw_prot != IPPROTO_UDP) && + (IP_FW_GETNSRCP(frwl) || IP_FW_GETNDSTP(frwl))) { + dprintf(("%s port(s) specified for non TCP/UDP rule\n", + err_prefix)); + return(NULL); + } + + /* + * Rather than modify the entry to make such entries work, + * we reject this rule and require user level utilities + * to enforce whatever policy they deem appropriate. 
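Putting check_ipfw_struct() above to work, a minimal rule that passes every sanity check looks like the sketch below; it is patterned on the default rule that ip_fw_init() installs later in this file (the helper itself is illustrative):

static int
install_accept_all_rule(void)
{
	struct ip_fw rule;

	bzero(&rule, sizeof(rule));
	rule.fw_prot = IPPROTO_IP;		/* wildcard: match any protocol */
	rule.fw_flg = IP_FW_F_ACCEPT | IP_FW_F_IN | IP_FW_F_OUT;
	rule.fw_number = 0;			/* 0 lets add_entry() number it (highest + 100) */

	if (check_ipfw_struct(&rule) == NULL)
		return (EINVAL);
	return (add_entry(&ip_fw_chain, &rule));
}

Note that add_entry() copies the rule into its own allocation, so passing a stack variable is safe.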
+ */ + if ((frwl->fw_src.s_addr & (~frwl->fw_smsk.s_addr)) || + (frwl->fw_dst.s_addr & (~frwl->fw_dmsk.s_addr))) { + dprintf(("%s rule never matches\n", err_prefix)); + return(NULL); + } + + if ((frwl->fw_flg & IP_FW_F_FRAG) && + (frwl->fw_prot == IPPROTO_UDP || frwl->fw_prot == IPPROTO_TCP)) { + if (frwl->fw_nports) { + dprintf(("%s cannot mix 'frag' and ports\n", err_prefix)); + return(NULL); + } + if (frwl->fw_prot == IPPROTO_TCP && + frwl->fw_tcpf != frwl->fw_tcpnf) { + dprintf(("%s cannot mix 'frag' with TCP flags\n", err_prefix)); + return(NULL); + } + } + + /* Check command specific stuff */ + switch (frwl->fw_flg & IP_FW_F_COMMAND) + { + case IP_FW_F_REJECT: + if (frwl->fw_reject_code >= 0x100 + && !(frwl->fw_prot == IPPROTO_TCP + && frwl->fw_reject_code == IP_FW_REJECT_RST)) { + dprintf(("%s unknown reject code\n", err_prefix)); + return(NULL); + } + break; + case IP_FW_F_DIVERT: /* Diverting to port zero is invalid */ + case IP_FW_F_TEE: + if (frwl->fw_divert_port == 0) { + dprintf(("%s can't divert to port 0\n", err_prefix)); + return (NULL); + } + break; + case IP_FW_F_DENY: + case IP_FW_F_ACCEPT: + case IP_FW_F_COUNT: + case IP_FW_F_SKIPTO: + break; + default: + dprintf(("%s invalid command\n", err_prefix)); + return(NULL); + } + + return frwl; +} + +static int +ip_fw_ctl(int stage, struct mbuf **mm) +{ + int error; + struct mbuf *m; + + if (stage == IP_FW_GET) { + struct ip_fw_chain *fcp = ip_fw_chain.lh_first; + *mm = m = m_get(M_WAIT, MT_SOOPTS); + for (; fcp; fcp = fcp->chain.le_next) { + memcpy(m->m_data, fcp->rule, sizeof *(fcp->rule)); + m->m_len = sizeof *(fcp->rule); + m->m_next = m_get(M_WAIT, MT_SOOPTS); + m = m->m_next; + m->m_len = 0; + } + return (0); + } + m = *mm; + /* only allow get calls if secure mode > 2 */ + if (securelevel > 2) { + if (m) (void)m_free(m); + return(EPERM); + } + if (stage == IP_FW_FLUSH) { + while (ip_fw_chain.lh_first != NULL && + ip_fw_chain.lh_first->rule->fw_number != (u_short)-1) { + struct ip_fw_chain *fcp = ip_fw_chain.lh_first; + int s = splnet(); + LIST_REMOVE(ip_fw_chain.lh_first, chain); + splx(s); + free(fcp->rule, M_IPFW); + free(fcp, M_IPFW); + } + if (m) (void)m_free(m); + return (0); + } + if (stage == IP_FW_ZERO) { + error = zero_entry(m); + if (m) (void)m_free(m); + return (error); + } + if (m == NULL) { + printf("%s NULL mbuf ptr\n", err_prefix); + return (EINVAL); + } + + if (stage == IP_FW_ADD) { + struct ip_fw *frwl = check_ipfw_mbuf(m); + + if (!frwl) + error = EINVAL; + else + error = add_entry(&ip_fw_chain, frwl); + if (m) (void)m_free(m); + return error; + } + if (stage == IP_FW_DEL) { + if (m->m_len != sizeof(struct ip_fw)) { + dprintf(("%s len=%d, want %d\n", err_prefix, m->m_len, + (int)sizeof(struct ip_fw))); + error = EINVAL; + } else if (mtod(m, struct ip_fw *)->fw_number == (u_short)-1) { + dprintf(("%s can't delete rule 65535\n", err_prefix)); + error = EINVAL; + } else + error = del_entry(&ip_fw_chain, + mtod(m, struct ip_fw *)->fw_number); + if (m) (void)m_free(m); + return error; + } + + dprintf(("%s unknown request %d\n", err_prefix, stage)); + if (m) (void)m_free(m); + return (EINVAL); +} + +void +ip_fw_init(void) +{ + struct ip_fw default_rule; + + ip_fw_chk_ptr = ip_fw_chk; + ip_fw_ctl_ptr = ip_fw_ctl; + LIST_INIT(&ip_fw_chain); + + bzero(&default_rule, sizeof default_rule); + default_rule.fw_prot = IPPROTO_IP; + default_rule.fw_number = (u_short)-1; +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + default_rule.fw_flg |= IP_FW_F_ACCEPT; +#else + default_rule.fw_flg |= IP_FW_F_DENY; +#endif + 
default_rule.fw_flg |= IP_FW_F_IN | IP_FW_F_OUT; + if (check_ipfw_struct(&default_rule) == NULL || + add_entry(&ip_fw_chain, &default_rule)) + panic(__FUNCTION__); + + printf("IP packet filtering initialized, " +#ifdef IPDIVERT + "divert enabled, "); +#else + "divert disabled, "); +#endif +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + printf("default to accept, "); +#endif +#ifndef IPFIREWALL_VERBOSE + printf("logging disabled\n"); +#else + if (fw_verbose_limit == 0) + printf("unlimited logging\n"); + else + printf("logging limited to %d packets/entry\n", + fw_verbose_limit); +#endif +} + +#ifdef IPFIREWALL_MODULE + +#include +#include +#include + +MOD_MISC(ipfw); + +static int +ipfw_load(struct lkm_table *lkmtp, int cmd) +{ + int s=splnet(); + + old_chk_ptr = ip_fw_chk_ptr; + old_ctl_ptr = ip_fw_ctl_ptr; + + ip_fw_init(); + splx(s); + return 0; +} + +static int +ipfw_unload(struct lkm_table *lkmtp, int cmd) +{ + int s=splnet(); + + ip_fw_chk_ptr = old_chk_ptr; + ip_fw_ctl_ptr = old_ctl_ptr; + + while (ip_fw_chain.lh_first != NULL) { + struct ip_fw_chain *fcp = ip_fw_chain.lh_first; + LIST_REMOVE(ip_fw_chain.lh_first, chain); + free(fcp->rule, M_IPFW); + free(fcp, M_IPFW); + } + + splx(s); + printf("IP firewall unloaded\n"); + return 0; +} + +int +ipfw_mod(struct lkm_table *lkmtp, int cmd, int ver) +{ + DISPATCH(lkmtp, cmd, ver, ipfw_load, ipfw_unload, lkm_nullcmd); +} +#endif diff --git a/netinet/ip_fw.h b/netinet/ip_fw.h new file mode 100644 index 0000000..c511154 --- /dev/null +++ b/netinet/ip_fw.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + */ + +#ifndef _IP_FW_H +#define _IP_FW_H + +#include /* LIST_ENTRY */ +#include +#include /* struct in_addr */ + +/* + * This union structure identifies an interface, either explicitly + * by name or implicitly by IP address. The flags IP_FW_F_IIFNAME + * and IP_FW_F_OIFNAME say how to interpret this structure. An + * interface unit number of -1 matches any unit number, while an + * IP address of 0.0.0.0 indicates matches any interface. + * + * The receive and transmit interfaces are only compared against the + * the packet if the corresponding bit (IP_FW_F_IIFACE or IP_FW_F_OIFACE) + * is set. Note some packets lack a receive or transmit interface + * (in which case the missing "interface" never matches). + */ + +union ip_fw_if { + struct in_addr fu_via_ip; /* Specified by IP address */ + struct { /* Specified by interface name */ +#define FW_IFNLEN IFNAMSIZ + char name[FW_IFNLEN]; + short unit; /* -1 means match any unit */ + } fu_via_if; +}; + +/* + * Format of an IP firewall descriptor + * + * fw_src, fw_dst, fw_smsk, fw_dmsk are always stored in network byte order. + * fw_flg and fw_n*p are stored in host byte order (of course). + * Port numbers are stored in HOST byte order. 
+ * Warning: setsockopt() will fail if sizeof(struct ip_fw) > MLEN (108) + */ + +struct ip_fw { + u_long fw_pcnt,fw_bcnt; /* Packet and byte counters */ + struct in_addr fw_src, fw_dst; /* Source and destination IP addr */ + struct in_addr fw_smsk, fw_dmsk; /* Mask for src and dest IP addr */ + u_short fw_number; /* Rule number */ + u_short fw_flg; /* Flags word */ +#define IP_FW_MAX_PORTS 10 /* A reasonable maximum */ + u_short fw_pts[IP_FW_MAX_PORTS]; /* Array of port numbers to match */ + u_char fw_ipopt,fw_ipnopt; /* IP options set/unset */ + u_char fw_tcpf,fw_tcpnf; /* TCP flags set/unset */ +#define IP_FW_ICMPTYPES_DIM (32 / (sizeof(unsigned) * 8)) + unsigned fw_icmptypes[IP_FW_ICMPTYPES_DIM]; /* ICMP types bitmap */ + long timestamp; /* timestamp (tv_sec) of last match */ + union ip_fw_if fw_in_if, fw_out_if; /* Incoming and outgoing interfaces */ + union { + u_short fu_divert_port; /* Divert/tee port (options IPDIVERT) */ + u_short fu_skipto_rule; /* SKIPTO command rule number */ + u_short fu_reject_code; /* REJECT response code */ + } fw_un; + u_char fw_prot; /* IP protocol */ + u_char fw_nports; /* N'of src ports and # of dst ports */ + /* in ports array (dst ports follow */ + /* src ports; max of 10 ports in all; */ + /* count of 0 means match all ports) */ +}; + +#define IP_FW_GETNSRCP(rule) ((rule)->fw_nports & 0x0f) +#define IP_FW_SETNSRCP(rule, n) do { \ + (rule)->fw_nports &= ~0x0f; \ + (rule)->fw_nports |= (n); \ + } while (0) +#define IP_FW_GETNDSTP(rule) ((rule)->fw_nports >> 4) +#define IP_FW_SETNDSTP(rule, n) do { \ + (rule)->fw_nports &= ~0xf0; \ + (rule)->fw_nports |= (n) << 4;\ + } while (0) + +#define fw_divert_port fw_un.fu_divert_port +#define fw_skipto_rule fw_un.fu_skipto_rule +#define fw_reject_code fw_un.fu_reject_code + +struct ip_fw_chain { + LIST_ENTRY(ip_fw_chain) chain; + struct ip_fw *rule; +}; + +/* + * Values for "flags" field . + */ +#define IP_FW_F_IN 0x0001 /* Check inbound packets */ +#define IP_FW_F_OUT 0x0002 /* Check outbound packets */ +#define IP_FW_F_IIFACE 0x0004 /* Apply inbound interface test */ +#define IP_FW_F_OIFACE 0x0008 /* Apply outbound interface test */ + +#define IP_FW_F_COMMAND 0x0070 /* Mask for type of chain entry: */ +#define IP_FW_F_DENY 0x0000 /* This is a deny rule */ +#define IP_FW_F_REJECT 0x0010 /* Deny and send a response packet */ +#define IP_FW_F_ACCEPT 0x0020 /* This is an accept rule */ +#define IP_FW_F_COUNT 0x0030 /* This is a count rule */ +#define IP_FW_F_DIVERT 0x0040 /* This is a divert rule */ +#define IP_FW_F_TEE 0x0050 /* This is a tee rule */ +#define IP_FW_F_SKIPTO 0x0060 /* This is a skipto rule */ + +#define IP_FW_F_PRN 0x0080 /* Print if this rule matches */ + +#define IP_FW_F_SRNG 0x0100 /* The first two src ports are a min * + * and max range (stored in host byte * + * order). */ + +#define IP_FW_F_DRNG 0x0200 /* The first two dst ports are a min * + * and max range (stored in host byte * + * order). 
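The accessor macros above pack both port counts into the single fw_nports byte; an illustrative fragment (not from the original header) fills a rule that matches source ports 1024-2000 and destination port 53. The rule's fw_prot must be IPPROTO_TCP or IPPROTO_UDP for ports to be accepted at all:

static void
fill_rule_ports(struct ip_fw *rule)
{
	rule->fw_pts[0] = 1024;		/* source range: minimum */
	rule->fw_pts[1] = 2000;		/* source range: maximum */
	rule->fw_pts[2] = 53;		/* destination ports follow the source ports */

	rule->fw_nports = 0;
	IP_FW_SETNSRCP(rule, 2);	/* low nibble: two source entries */
	IP_FW_SETNDSTP(rule, 1);	/* high nibble: one destination entry */
	rule->fw_flg |= IP_FW_F_SRNG;	/* the two source entries form a min/max range */
}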
*/ + +#define IP_FW_F_IIFNAME 0x0400 /* In interface by name/unit (not IP) */ +#define IP_FW_F_OIFNAME 0x0800 /* Out interface by name/unit (not IP) */ + +#define IP_FW_F_INVSRC 0x1000 /* Invert sense of src check */ +#define IP_FW_F_INVDST 0x2000 /* Invert sense of dst check */ + +#define IP_FW_F_FRAG 0x4000 /* Fragment */ + +#define IP_FW_F_ICMPBIT 0x8000 /* ICMP type bitmap is valid */ + +#define IP_FW_F_MASK 0xFFFF /* All possible flag bits mask */ + +/* + * For backwards compatibility with rules specifying "via iface" but + * not restricted to only "in" or "out" packets, we define this combination + * of bits to represent this configuration. + */ + +#define IF_FW_F_VIAHACK (IP_FW_F_IN|IP_FW_F_OUT|IP_FW_F_IIFACE|IP_FW_F_OIFACE) + +/* + * Definitions for REJECT response codes. + * Values less than 256 correspond to ICMP unreachable codes. + */ +#define IP_FW_REJECT_RST 0x0100 /* TCP packets: send RST */ + +/* + * Definitions for IP option names. + */ +#define IP_FW_IPOPT_LSRR 0x01 +#define IP_FW_IPOPT_SSRR 0x02 +#define IP_FW_IPOPT_RR 0x04 +#define IP_FW_IPOPT_TS 0x08 + +/* + * Definitions for TCP flags. + */ +#define IP_FW_TCPF_FIN TH_FIN +#define IP_FW_TCPF_SYN TH_SYN +#define IP_FW_TCPF_RST TH_RST +#define IP_FW_TCPF_PSH TH_PUSH +#define IP_FW_TCPF_ACK TH_ACK +#define IP_FW_TCPF_URG TH_URG +#define IP_FW_TCPF_ESTAB 0x40 + +/* + * Main firewall chains definitions and global var's definitions. + */ +#ifdef _KERNEL + +/* + * Function definitions. + */ +void ip_fw_init(void); + +#endif /* _KERNEL */ + +#endif /* _IP_FW_H */ diff --git a/netinet/ip_icmp.c b/netinet/ip_icmp.c new file mode 100644 index 0000000..e1c7518 --- /dev/null +++ b/netinet/ip_icmp.c @@ -0,0 +1,771 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 + * $FreeBSD: src/sys/netinet/ip_icmp.c,v 1.101 2005/05/04 13:23:54 andre Exp $ + */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define _IP_VHL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef IPSEC +#include +#include +#endif + +#ifdef FAST_IPSEC +#include +#include +#define IPSEC +#endif + +#include + +/* + * ICMP routines: error generation, receive packet processing, and + * routines to turnaround packets back to the originator, and + * host table maintenance routines. + */ + + struct icmpstat icmpstat; +SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RD, + &icmpstat, icmpstat, ""); + +static int icmpmaskrepl = 0; +SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, + &icmpmaskrepl, 0, ""); + +static int icmpbmcastecho = 1; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, &icmpbmcastecho, + 0, ""); + +static int icmpallecho = 1; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, allecho, CTLFLAG_RW, &icmpallecho, + 0, ""); + +#ifdef ICMPPRINTFS +int icmpprintfs = 0; +#endif + +static void icmp_reflect(struct mbuf *); +static void icmp_send(struct mbuf *, struct mbuf *); + +extern struct protosw inetsw[]; +unsigned int icmplenPanicAvoided; + +/* + * Generate an error packet of type error + * in response to bad packet ip. + */ +void +icmp_error(struct mbuf *n, int type, int code, n_long dest, + struct ifnet *destifp) +{ + register struct ip *oip = mtod(n, struct ip *), *nip; +#ifdef _IP_VHL + register unsigned oiplen = IP_VHL_HL(oip->ip_vhl) << 2; +#else + register unsigned oiplen = oip->ip_hl << 2; +#endif + register struct icmp *icp; + register struct mbuf *m; + unsigned icmplen; + +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_error(%p, %x, %d)\n", oip, type, code); +#endif + if (type != ICMP_REDIRECT) + icmpstat.icps_error++; + /* + * Don't send error if not the first fragment of message. + * Don't error if the old packet protocol was ICMP + * error message, only known informational types. + */ + if (oip->ip_off &~ (IP_MF|IP_DF)) + goto freeit; + if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && + n->m_len >= oiplen + ICMP_MINLEN && + !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiplen))->icmp_type)) { + icmpstat.icps_oldicmp++; + goto freeit; + } + /* Don't send error in response to a multicast or broadcast packet */ + if (n->m_flags & (M_BCAST|M_MCAST)) + goto freeit; + /* Don't send error in response to malicious packet */ + icmplen = min(oiplen + 8, oip->ip_len); + if (icmplen < sizeof(struct ip)) { + icmplenPanicAvoided++; + goto freeit; + } + + /* + * First, formulate icmp message + */ + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + goto freeit; +#ifdef MAC + mac_create_mbuf_netlayer(n, m); +#endif + m->m_len = icmplen + ICMP_MINLEN; + MH_ALIGN(m, m->m_len); + icp = mtod(m, struct icmp *); + if ((u_int)type > ICMP_MAXTYPE) + panic("icmp_error"); + icmpstat.icps_outhist[type]++; + icp->icmp_type = type; + if (type == ICMP_REDIRECT) + icp->icmp_gwaddr.s_addr = dest; + else { + icp->icmp_void = 0; + /* + * The following assignments assume an overlay with the + * zeroed icmp_void field. 
+ */ + if (type == ICMP_PARAMPROB) { + icp->icmp_pptr = code; + code = 0; + } else if (type == ICMP_UNREACH && + code == ICMP_UNREACH_NEEDFRAG && destifp) { + icp->icmp_nextmtu = htons(destifp->if_mtu); + } + } + + icp->icmp_code = code; + bcopy((caddr_t)oip, (caddr_t)&icp->icmp_ip, icmplen); + nip = &icp->icmp_ip; + nip->ip_len = htons((u_short)(nip->ip_len + oiplen)); + + /* + * Now, copy old ip header (without options) + * in front of icmp message. + */ + if (m->m_data - sizeof(struct ip) < m->m_pktdat) + panic("icmp len"); + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + m->m_pkthdr.len = m->m_len; + m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; + nip = mtod(m, struct ip *); + bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); + nip->ip_len = m->m_len; +#ifdef _IP_VHL + nip->ip_vhl = IP_VHL_BORING; +#else + nip->ip_v = IPVERSION; + nip->ip_hl = 5; +#endif + nip->ip_p = IPPROTO_ICMP; + nip->ip_tos = 0; + icmp_reflect(m); + +freeit: + m_freem(n); +} + +static struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET, 0, {0}, {0} }; +static struct sockaddr_in icmpdst = { sizeof (struct sockaddr_in), AF_INET, 0, {0}, {0} }; +static struct sockaddr_in icmpgw = { sizeof (struct sockaddr_in), AF_INET, 0, {0}, {0} }; + +/* + * Process a received ICMP message. + */ +void +icmp_input(struct mbuf *m, int off) +{ + struct icmp *icp; + struct in_ifaddr *ia; + struct ip *ip = mtod(m, struct ip *); + int hlen = off; + int icmplen = ip->ip_len; + int i, code; + void (*ctlfunc)(int, struct sockaddr *, void *); + + /* + * Locate icmp structure in mbuf, and check + * that not corrupted and of at least minimum length. + */ +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(ip->ip_src)); + printf("icmp_input from %s to %s, len %d\n", + buf, inet_ntoa(ip->ip_dst), icmplen); + } +#endif + if (icmplen < ICMP_MINLEN) { + icmpstat.icps_tooshort++; + goto freeit; + } + i = hlen + min(icmplen, ICMP_ADVLENMIN); + if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { + icmpstat.icps_tooshort++; + return; + } + ip = mtod(m, struct ip *); + m->m_len -= hlen; + m->m_data += hlen; + icp = mtod(m, struct icmp *); + if (in_cksum(m, icmplen)) { + icmpstat.icps_checksum++; + goto freeit; + } + m->m_len += hlen; + m->m_data -= hlen; + +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_input, type %d code %d\n", icp->icmp_type, + icp->icmp_code); +#endif + + /* + * Message type specific processing. 
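icmp_error() above is the hook the rest of the stack uses to report problems back to a sender; a typical call site looks like this sketch (the surrounding UDP scenario is illustrative, not quoted from this tree):

/*
 * A datagram arrived for a UDP port nobody is bound to: hand the offending
 * mbuf to icmp_error(), which reflects an ICMP "port unreachable" back to
 * the source and frees the original packet.
 */
static void
report_port_unreachable(struct mbuf *m)
{
	icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, NULL);
	/* m must not be touched again: icmp_error() consumed it. */
}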
+ */ + if (icp->icmp_type > ICMP_MAXTYPE) + goto raw; + icmpstat.icps_inhist[icp->icmp_type]++; + code = icp->icmp_code; + switch (icp->icmp_type) { + + case ICMP_UNREACH: + switch (code) { + case ICMP_UNREACH_NET: + case ICMP_UNREACH_HOST: + case ICMP_UNREACH_PROTOCOL: + case ICMP_UNREACH_PORT: + case ICMP_UNREACH_SRCFAIL: + code = PRC_UNREACH_NET; + break; + + case ICMP_UNREACH_NEEDFRAG: + code = PRC_MSGSIZE; + break; + + case ICMP_UNREACH_NET_UNKNOWN: + case ICMP_UNREACH_NET_PROHIB: + case ICMP_UNREACH_TOSNET: + code = PRC_UNREACH_NET; + break; + + case ICMP_UNREACH_HOST_UNKNOWN: + case ICMP_UNREACH_ISOLATED: + case ICMP_UNREACH_HOST_PROHIB: + case ICMP_UNREACH_TOSHOST: + code = PRC_UNREACH_HOST; + break; + + case ICMP_UNREACH_FILTER_PROHIB: + case ICMP_UNREACH_HOST_PRECEDENCE: + case ICMP_UNREACH_PRECEDENCE_CUTOFF: + code = PRC_UNREACH_PORT; + break; + + default: + goto badcode; + } + goto deliver; + + case ICMP_TIMXCEED: + if (code > 1) + goto badcode; + code += PRC_TIMXCEED_INTRANS; + goto deliver; + + case ICMP_PARAMPROB: + if (code > 1) + goto badcode; + code = PRC_PARAMPROB; + goto deliver; + + case ICMP_SOURCEQUENCH: + if (code) + goto badcode; + code = PRC_QUENCH; + deliver: + /* + * Problem with datagram; advise higher level routines. + */ + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || +#ifdef _IP_VHL + IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { +#else + icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { +#endif + icmpstat.icps_badlen++; + goto freeit; + } + NTOHS(icp->icmp_ip.ip_len); + /* Discard ICMP's in response to multicast packets */ + if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) + goto badcode; +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); +#endif + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; +#if 1 + /* + * MTU discovery: + * If we got a needfrag and there is a host route to the + * original destination, and the MTU is not locked, then + * set the MTU in the route to the suggested new value + * (if given) and then notify as usual. The ULPs will + * notice that the MTU has changed and adapt accordingly. + * If no new MTU was suggested, then we guess a new one + * less than the current value. If the new MTU is + * unreasonably small (arbitrarily set at 296), then + * we reset the MTU to the interface value and enable the + * lock bit, indicating that we are no longer doing MTU + * discovery. 
+ */ + if (code == PRC_MSGSIZE) { + struct rtentry *rt; + int mtu; + + rt = rtalloc1((struct sockaddr *)&icmpsrc, 0, + RTF_CLONING | RTF_PRCLONING); + if (rt && (rt->rt_flags & RTF_HOST) + && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { + mtu = ntohs(icp->icmp_nextmtu); + if (!mtu) + mtu = ip_next_mtu(rt->rt_rmx.rmx_mtu, + 1); +#ifdef DEBUG_MTUDISC + printf("MTU for %s reduced to %d\n", + inet_ntoa(icmpsrc.sin_addr), mtu); +#endif + if (mtu < 296) { + /* rt->rt_rmx.rmx_mtu = + rt->rt_ifp->if_mtu; */ + rt->rt_rmx.rmx_locks |= RTV_MTU; + } else if (rt->rt_rmx.rmx_mtu > mtu) { + rt->rt_rmx.rmx_mtu = mtu; + } + } + if (rt) + RTFREE(rt); + } + +#endif + ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; + if (ctlfunc) + (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, + (void *)&icp->icmp_ip); + break; + + badcode: + icmpstat.icps_badcode++; + break; + + case ICMP_ECHO: + if (!icmpallecho) { + icmpstat.icps_allecho++; + break; + } + if (!icmpbmcastecho + && (m->m_flags & (M_MCAST | M_BCAST)) != 0 + && IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + icmpstat.icps_bmcastecho++; + break; + } + icp->icmp_type = ICMP_ECHOREPLY; + goto reflect; + + case ICMP_TSTAMP: + if (!icmpbmcastecho + && (m->m_flags & (M_MCAST | M_BCAST)) != 0 + && IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + icmpstat.icps_bmcasttstamp++; + break; + } + if (icmplen < ICMP_TSLEN) { + icmpstat.icps_badlen++; + break; + } + icp->icmp_type = ICMP_TSTAMPREPLY; + icp->icmp_rtime = iptime(); + icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ + goto reflect; + + case ICMP_MASKREQ: +#define satosin(sa) ((struct sockaddr_in *)(sa)) + if (icmpmaskrepl == 0) + break; + /* + * We are not able to respond with all ones broadcast + * unless we receive it over a point-to-point interface. + */ + if (icmplen < ICMP_MASKLEN) + break; + switch (ip->ip_dst.s_addr) { + + case INADDR_BROADCAST: + case INADDR_ANY: + icmpdst.sin_addr = ip->ip_src; + break; + + default: + icmpdst.sin_addr = ip->ip_dst; + } + ia = (struct in_ifaddr *)ifaof_ifpforaddr( + (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); + if (ia == NULL) + break; + if (ia->ia_ifp == NULL) { + break; + } + icp->icmp_type = ICMP_MASKREPLY; + icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; + if (ip->ip_src.s_addr == 0) { + if (ia->ia_ifp->if_flags & IFF_BROADCAST) + ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; + else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) + ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; + } +reflect: + ip->ip_len += hlen; /* since ip_input deducts this */ + icmpstat.icps_reflect++; + icmpstat.icps_outhist[icp->icmp_type]++; + icmp_reflect(m); + return; + + case ICMP_REDIRECT: + if (code > 3) + goto badcode; + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || +#ifdef _IP_VHL + IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { +#else + icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { +#endif + icmpstat.icps_badlen++; + break; + } + /* + * Short circuit routing redirects to force + * immediate change in the kernel's routing + * tables. The message is also handed to anyone + * listening on a raw socket (e.g. the routing + * daemon for use in updating its tables). 
+ */ + icmpgw.sin_addr = ip->ip_src; + icmpdst.sin_addr = icp->icmp_gwaddr; +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst)); + + printf("redirect dst %s to %s\n", + buf, inet_ntoa(icp->icmp_gwaddr)); + } +#endif + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; + rtredirect((struct sockaddr *)&icmpsrc, + (struct sockaddr *)&icmpdst, + (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, + (struct sockaddr *)&icmpgw, (struct rtentry **)0); + pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); + break; + + /* + * No kernel processing for the following; + * just fall through to send to raw listener. + */ + case ICMP_ECHOREPLY: + case ICMP_ROUTERADVERT: + case ICMP_ROUTERSOLICIT: + case ICMP_TSTAMPREPLY: + case ICMP_IREQREPLY: + case ICMP_MASKREPLY: + default: + break; + } + +raw: + rip_input(m, hlen); + return; + +freeit: + m_freem(m); +} + +/* + * Reflect the ip packet back to the source + */ +static void +icmp_reflect(struct mbuf *m) +{ + struct ip *ip = mtod(m, struct ip *); + struct in_ifaddr *ia; + struct in_addr t; + struct mbuf *opts = 0; +#ifdef _IP_VHL + int optlen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip); +#else + int optlen = (ip->ip_hl << 2) - sizeof(struct ip); +#endif + if (!in_canforward(ip->ip_src) && + ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) != + (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) { + m_freem(m); /* Bad return address */ + goto done; /* Ip_output() will check for broadcast */ + } + t = ip->ip_dst; + ip->ip_dst = ip->ip_src; + /* + * If the incoming packet was addressed directly to us, + * use dst as the src for the reply. Otherwise (broadcast + * or anonymous), use the address which corresponds + * to the incoming interface. + */ + for (ia = in_ifaddr; ia; ia = ia->ia_next) { + if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) + break; + if (ia->ia_ifp && (ia->ia_ifp->if_flags & IFF_BROADCAST) && + t.s_addr == satosin(&ia->ia_broadaddr)->sin_addr.s_addr) + break; + } + icmpdst.sin_addr = t; + if ((ia == (struct in_ifaddr *)0) && m->m_pkthdr.rcvif) + ia = (struct in_ifaddr *)ifaof_ifpforaddr( + (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); + /* + * The following happens if the packet was not addressed to us, + * and was received on an interface with no IP address. + */ + if (ia == (struct in_ifaddr *)0) + ia = in_ifaddr; + t = IA_SIN(ia)->sin_addr; + ip->ip_src = t; + ip->ip_ttl = MAXTTL; + + if (optlen > 0) { + register u_char *cp; + int opt, cnt; + u_int len; + + /* + * Retrieve any source routing from the incoming packet; + * add on any record-route or timestamp options. 
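+ *
+ * Sketch of what the option-copying loop below produces: the reply's
+ * option mbuf starts with the reversed source route obtained from
+ * ip_srcroute() (or a single zero in_addr placeholder), followed only
+ * by IPOPT_RR, IPOPT_TS and IPOPT_SECURITY options copied verbatim
+ * from the incoming header, and is then padded with IPOPT_EOL up to
+ * the next 4-byte boundary so the rebuilt header length stays valid.
+ *
+ *	reply options:  [reversed LSRR][copied RR, TS, SEC][EOL padding]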
+ */ + cp = (u_char *) (ip + 1); + if ((opts = ip_srcroute()) == 0 && + (opts = m_gethdr(M_DONTWAIT, MT_HEADER))) { + opts->m_len = sizeof(struct in_addr); + mtod(opts, struct in_addr *)->s_addr = 0; + } + if (opts) { +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_reflect optlen %d rt %d => ", + optlen, opts->m_len); +#endif + for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + len = 1; + else { + if (cnt < IPOPT_OLEN + sizeof(*cp)) + break; + len = cp[IPOPT_OLEN]; + if (len < IPOPT_OLEN + sizeof(*cp) || + len > cnt) + break; + } + /* + * Should check for overflow, but it "can't happen" + */ + if (opt == IPOPT_RR || opt == IPOPT_TS || + opt == IPOPT_SECURITY) { + bcopy((caddr_t)cp, + mtod(opts, caddr_t) + opts->m_len, len); + opts->m_len += len; + } + } + /* Terminate & pad, if necessary */ + cnt = opts->m_len % 4; + if (cnt) { + for (; cnt < 4; cnt++) { + *(mtod(opts, caddr_t) + opts->m_len) = + IPOPT_EOL; + opts->m_len++; + } + } +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("%d\n", opts->m_len); +#endif + } + /* + * Now strip out original options by copying rest of first + * mbuf's data back, and adjust the IP length. + */ + ip->ip_len -= optlen; +#ifdef _IP_VHL + ip->ip_vhl = IP_VHL_BORING; +#else + ip->ip_v = IPVERSION; + ip->ip_hl = 5; +#endif + m->m_len -= optlen; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len -= optlen; + optlen += sizeof(struct ip); + bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1), + (unsigned)(m->m_len - sizeof(struct ip))); + } + m->m_flags &= ~(M_BCAST|M_MCAST); + icmp_send(m, opts); +done: + if (opts) + (void)m_free(opts); +} + +/* + * Send an icmp packet back to the ip level, + * after supplying a checksum. + */ +static void +icmp_send(struct mbuf *m, struct mbuf *opts) +{ + register struct ip *ip = mtod(m, struct ip *); + register int hlen; + register struct icmp *icp; + struct route ro; + +#ifdef _IP_VHL + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#else + hlen = ip->ip_hl << 2; +#endif + m->m_data += hlen; + m->m_len -= hlen; + icp = mtod(m, struct icmp *); + icp->icmp_cksum = 0; + icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen); + m->m_data -= hlen; + m->m_len += hlen; +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(ip->ip_dst)); + printf("icmp_send dst %s src %s\n", + buf, inet_ntoa(ip->ip_src)); + } +#endif + bzero(&ro, sizeof ro); + (void) ip_output(m, opts, &ro, 0, NULL); + if (ro.ro_rt) + RTFREE(ro.ro_rt); +} + +/* + * Return milliseconds since 00:00 GMT in network format. + */ +uint32_t +iptime(void) +{ + struct timeval atv; + u_long t; + + microtime(&atv); + t = (atv.tv_sec % (24L*60L*60L)) * 1000L + atv.tv_usec / 1000L; + return (htonl(t)); +} + +/* + * Return the next larger or smaller MTU plateau (table from RFC 1191) + * given current value MTU. If DIR is less than zero, a larger plateau + * is returned; otherwise, a smaller value is returned. 
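+ *
+ * Illustrative values, worked from the plateau table in the function
+ * below:
+ *
+ *	ip_next_mtu(1500, 1)   returns 1492   first plateau below 1500
+ *	ip_next_mtu(1492, 1)   returns 1280   already on a plateau
+ *	ip_next_mtu(1006, -1)  returns 1280   next larger plateau
+ *	ip_next_mtu(68, 1)     returns 0      nothing smaller is usable
+ *
+ * A return value of 0 tells the caller that no further plateau exists
+ * in the requested direction.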
+ */ +int +ip_next_mtu(int mtu, int dir) +{ + static int mtutab[] = { + 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508, + 296, 68, 0 + }; + int i; + + for (i = 0; i < (sizeof mtutab) / (sizeof mtutab[0]); i++) { + if (mtu >= mtutab[i]) + break; + } + + if (dir < 0) { + if (i == 0) { + return 0; + } else { + return mtutab[i - 1]; + } + } else { + if (mtutab[i] == 0) { + return 0; + } else if(mtu > mtutab[i]) { + return mtutab[i]; + } else { + return mtutab[i + 1]; + } + } +} diff --git a/netinet/ip_icmp.h b/netinet/ip_icmp.h new file mode 100644 index 0000000..2e66a5f --- /dev/null +++ b/netinet/ip_icmp.h @@ -0,0 +1,212 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_icmp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/netinet/ip_icmp.h,v 1.26 2005/05/04 13:09:19 andre Exp $ + */ + + +#ifndef _NETINET_IP_ICMP_H_ +#define _NETINET_IP_ICMP_H_ + +#include /* struct in_addr */ +#include /* struct ip */ + +/* + * Interface Control Message Protocol Definitions. + * Per RFC 792, September 1981. + */ + +/* + * Internal of an ICMP Router Advertisement + */ +struct icmp_ra_addr { + u_int32_t ira_addr; + u_int32_t ira_preference; +}; + +/* + * Structure of an icmp header. 
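+ *
+ * The structure below mirrors the wire layout: the first four bytes
+ * are common to every message, the next four are the type-specific
+ * word, and the remainder depends on the type.
+ *
+ *	bytes 0      1      2-3      4-7           8-
+ *	      type   code   cksum    icmp_hun      icmp_dun
+ *	                             id/seq, gw,   timestamps, copied IP
+ *	                             nextmtu, ...  header, mask, data
+ *
+ * The icmp_id, icmp_gwaddr, icmp_nextmtu and similar accessors are
+ * defined as macros over the two unions.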
+ */ +struct icmp { + u_char icmp_type; /* type of message, see below */ + u_char icmp_code; /* type sub code */ + u_short icmp_cksum; /* ones complement cksum of struct */ + union { + u_char ih_pptr; /* ICMP_PARAMPROB */ + struct in_addr ih_gwaddr; /* ICMP_REDIRECT */ + struct ih_idseq { + uint16_t icd_id; /* network format */ + uint16_t icd_seq; /* network format */ + } ih_idseq; + int ih_void; + + /* ICMP_UNREACH_NEEDFRAG -- Path MTU Discovery (RFC1191) */ + struct ih_pmtu { + uint16_t ipm_void; /* network format */ + uint16_t ipm_nextmtu; /* network format */ + } ih_pmtu; + + struct ih_rtradv { + u_char irt_num_addrs; + u_char irt_wpa; + u_int16_t irt_lifetime; + } ih_rtradv; + } icmp_hun; +#define icmp_pptr icmp_hun.ih_pptr +#define icmp_gwaddr icmp_hun.ih_gwaddr +#define icmp_id icmp_hun.ih_idseq.icd_id +#define icmp_seq icmp_hun.ih_idseq.icd_seq +#define icmp_void icmp_hun.ih_void +#define icmp_pmvoid icmp_hun.ih_pmtu.ipm_void +#define icmp_nextmtu icmp_hun.ih_pmtu.ipm_nextmtu +#define icmp_num_addrs icmp_hun.ih_rtradv.irt_num_addrs +#define icmp_wpa icmp_hun.ih_rtradv.irt_wpa +#define icmp_lifetime icmp_hun.ih_rtradv.irt_lifetime + union { + struct id_ts { /* ICMP Timestamp */ + /* + * The next 3 fields are in network format, + * milliseconds since 00:00 GMT + */ + uint32_t its_otime; /* Originate */ + uint32_t its_rtime; /* Receive */ + uint32_t its_ttime; /* Transmit */ + } id_ts; + struct id_ip { + struct ip idi_ip; + /* options and then 64 bits of data */ + } id_ip; + struct icmp_ra_addr id_radv; + u_int32_t id_mask; + char id_data[1]; + } icmp_dun; +#define icmp_otime icmp_dun.id_ts.its_otime +#define icmp_rtime icmp_dun.id_ts.its_rtime +#define icmp_ttime icmp_dun.id_ts.its_ttime +#define icmp_ip icmp_dun.id_ip.idi_ip +#define icmp_radv icmp_dun.id_radv +#define icmp_mask icmp_dun.id_mask +#define icmp_data icmp_dun.id_data +}; + +/* + * Lower bounds on packet lengths for various types. + * For the error advice packets must first insure that the + * packet is large enough to contain the returned ip header. + * Only then can we do the check to see if 64 bits of packet + * data have been returned, since we need to check the returned + * ip header length. + */ +#define ICMP_MINLEN 8 /* abs minimum */ +#define ICMP_TSLEN (8 + 3 * sizeof (uint32_t)) /* timestamp */ +#define ICMP_MASKLEN 12 /* address mask */ +#define ICMP_ADVLENMIN (8 + sizeof (struct ip) + 8) /* min */ +#ifndef _IP_VHL +#define ICMP_ADVLEN(p) (8 + ((p)->icmp_ip.ip_hl << 2) + 8) + /* N.B.: must separately check that ip_hl >= 5 */ +#else +#define ICMP_ADVLEN(p) (8 + (IP_VHL_HL((p)->icmp_ip.ip_vhl) << 2) + 8) + /* N.B.: must separately check that header length >= 5 */ +#endif + +/* + * Definition of type and code field values. 
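+ *
+ * For illustration, an in-kernel caller pairs these constants with
+ * icmp_error(), declared at the end of this header, e.g.
+ *
+ *	icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, ifp);
+ *
+ * to report that a datagram required fragmentation but had IP_DF set;
+ * ip_forward() in ip_input.c uses this exact type and code pairing
+ * when ip_output() fails with EMSGSIZE.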
+ */ +#define ICMP_ECHOREPLY 0 /* echo reply */ +#define ICMP_UNREACH 3 /* dest unreachable, codes: */ +#define ICMP_UNREACH_NET 0 /* bad net */ +#define ICMP_UNREACH_HOST 1 /* bad host */ +#define ICMP_UNREACH_PROTOCOL 2 /* bad protocol */ +#define ICMP_UNREACH_PORT 3 /* bad port */ +#define ICMP_UNREACH_NEEDFRAG 4 /* IP_DF caused drop */ +#define ICMP_UNREACH_SRCFAIL 5 /* src route failed */ +#define ICMP_UNREACH_NET_UNKNOWN 6 /* unknown net */ +#define ICMP_UNREACH_HOST_UNKNOWN 7 /* unknown host */ +#define ICMP_UNREACH_ISOLATED 8 /* src host isolated */ +#define ICMP_UNREACH_NET_PROHIB 9 /* prohibited access */ +#define ICMP_UNREACH_HOST_PROHIB 10 /* ditto */ +#define ICMP_UNREACH_TOSNET 11 /* bad tos for net */ +#define ICMP_UNREACH_TOSHOST 12 /* bad tos for host */ +#define ICMP_UNREACH_FILTER_PROHIB 13 /* admin prohib */ +#define ICMP_UNREACH_HOST_PRECEDENCE 14 /* host prec vio. */ +#define ICMP_UNREACH_PRECEDENCE_CUTOFF 15 /* prec cutoff */ +#define ICMP_SOURCEQUENCH 4 /* packet lost, slow down */ +#define ICMP_REDIRECT 5 /* shorter route, codes: */ +#define ICMP_REDIRECT_NET 0 /* for network */ +#define ICMP_REDIRECT_HOST 1 /* for host */ +#define ICMP_REDIRECT_TOSNET 2 /* for tos and net */ +#define ICMP_REDIRECT_TOSHOST 3 /* for tos and host */ +#define ICMP_ALTHOSTADDR 6 /* alternate host address */ +#define ICMP_ECHO 8 /* echo service */ +#define ICMP_ROUTERADVERT 9 /* router advertisement */ +#define ICMP_ROUTERADVERT_NORMAL 0 /* normal advertisement */ +#define ICMP_ROUTERADVERT_NOROUTE_COMMON 16 /* selective routing */ +#define ICMP_ROUTERSOLICIT 10 /* router solicitation */ +#define ICMP_TIMXCEED 11 /* time exceeded, code: */ +#define ICMP_TIMXCEED_INTRANS 0 /* ttl==0 in transit */ +#define ICMP_TIMXCEED_REASS 1 /* ttl==0 in reass */ +#define ICMP_PARAMPROB 12 /* ip header bad */ +#define ICMP_PARAMPROB_ERRATPTR 0 /* error at param ptr */ +#define ICMP_PARAMPROB_OPTABSENT 1 /* req. opt. 
absent */ +#define ICMP_PARAMPROB_LENGTH 2 /* bad length */ +#define ICMP_TSTAMP 13 /* timestamp request */ +#define ICMP_TSTAMPREPLY 14 /* timestamp reply */ +#define ICMP_IREQ 15 /* information request */ +#define ICMP_IREQREPLY 16 /* information reply */ +#define ICMP_MASKREQ 17 /* address mask request */ +#define ICMP_MASKREPLY 18 /* address mask reply */ +#define ICMP_TRACEROUTE 30 /* traceroute */ +#define ICMP_DATACONVERR 31 /* data conversion error */ +#define ICMP_MOBILE_REDIRECT 32 /* mobile host redirect */ +#define ICMP_IPV6_WHEREAREYOU 33 /* IPv6 where-are-you */ +#define ICMP_IPV6_IAMHERE 34 /* IPv6 i-am-here */ +#define ICMP_MOBILE_REGREQUEST 35 /* mobile registration req */ +#define ICMP_MOBILE_REGREPLY 36 /* mobile registration reply */ +#define ICMP_SKIP 39 /* SKIP */ +#define ICMP_PHOTURIS 40 /* Photuris */ +#define ICMP_PHOTURIS_UNKNOWN_INDEX 1 /* unknown sec index */ +#define ICMP_PHOTURIS_AUTH_FAILED 2 /* auth failed */ +#define ICMP_PHOTURIS_DECRYPT_FAILED 3 /* decrypt failed */ + +#define ICMP_MAXTYPE 40 + +#define ICMP_INFOTYPE(type) \ + ((type) == ICMP_ECHOREPLY || (type) == ICMP_ECHO || \ + (type) == ICMP_ROUTERADVERT || (type) == ICMP_ROUTERSOLICIT || \ + (type) == ICMP_TSTAMP || (type) == ICMP_TSTAMPREPLY || \ + (type) == ICMP_IREQ || (type) == ICMP_IREQREPLY || \ + (type) == ICMP_MASKREQ || (type) == ICMP_MASKREPLY) + +#ifdef _KERNEL +void icmp_error(struct mbuf *, int, int, n_long, struct ifnet *); +void icmp_input(struct mbuf *, int); +int ip_next_mtu(int, int); +#endif + +#endif diff --git a/netinet/ip_input.c b/netinet/ip_input.c new file mode 100644 index 0000000..927537f --- /dev/null +++ b/netinet/ip_input.c @@ -0,0 +1,1510 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 + * $ANA: ip_input.c,v 1.5 1996/09/18 14:34:59 wollman Exp $ + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define _IP_VHL + +#include "opt_ipfw.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef IPFIREWALL +#include +#endif + +int rsvp_on = 0; +static int ip_rsvp_on; +struct socket *ip_rsvpd; + +int ipforwarding = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, + &ipforwarding, 0, "Enable IP forwarding between interfaces"); + +static int ipsendredirects = 1; /* XXX */ +SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, + &ipsendredirects, 0, "Enable sending IP redirects"); + +int ip_defttl = IPDEFTTL; +SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, + &ip_defttl, 0, "Maximum TTL on IP packets"); + +static int ip_dosourceroute = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, + &ip_dosourceroute, 0, ""); + +static int ip_acceptsourceroute = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, + CTLFLAG_RW, &ip_acceptsourceroute, 0, ""); +#ifdef DIAGNOSTIC +static int ipprintfs = 0; +#endif + +extern struct domain inetdomain; +extern struct protosw inetsw[]; +u_char ip_protox[IPPROTO_MAX]; +static int ipqmaxlen = IFQ_MAXLEN; +struct in_ifaddr *in_ifaddr; /* first inet address */ +struct ifqueue ipintrq; +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RD, + &ipintrq.ifq_maxlen, 0, ""); +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD, + &ipintrq.ifq_drops, 0, + "Number of packets dropped from the IP input queue"); + +struct ipstat ipstat; + +/* Packet reassembly stuff */ +#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) +#define IPREASS_HMASK (IPREASS_NHASH - 1) +#define IPREASS_HASH(x,y) \ + (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) + +static struct ipq ipq[IPREASS_NHASH]; +static int nipq = 0; /* total # of reass queues */ +static int maxnipq; + +#ifdef IPCTL_DEFMTU +SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, + &ip_mtu, 0, "Default MTU"); +#endif + +#if !defined(COMPAT_IPFW) || COMPAT_IPFW == 1 +#undef COMPAT_IPFW +#define COMPAT_IPFW 1 +#else +#undef COMPAT_IPFW +#endif + +#ifdef COMPAT_IPFW +/* Firewall hooks */ +ip_fw_chk_t *ip_fw_chk_ptr; +ip_fw_ctl_t *ip_fw_ctl_ptr; + +/* IP Network Address Translation (NAT) hooks */ +ip_nat_t *ip_nat_ptr; +ip_nat_ctl_t *ip_nat_ctl_ptr; +#endif + +/* + * We need to save the IP options in case a protocol wants to respond + * to an incoming packet over the same route if the packet got here + * using IP source routing. This allows connection establishment and + * maintenance when the remote end is on a network that is not known + * to us. + */ +static int ip_nhops = 0; +static struct ip_srcrt { + struct in_addr dst; /* final destination */ + char nop; /* one NOP to align */ + char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */ + struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)]; +} ip_srcrt; + +#ifdef IPDIVERT +/* + * Shared variable between ip_input() and ip_reass() to communicate + * about which packets, once assembled from fragments, get diverted, + * and to which port. 
+ */ +static u_short frag_divert_port; +#endif + +static void save_rte(u_char *, struct in_addr); +static void ip_deq(struct ipasfrag *); +static int ip_dooptions(struct mbuf *); +static void ip_enq(struct ipasfrag *, struct ipasfrag *); +static void ip_forward(struct mbuf *, int); +static void ip_freef(struct ipq *); +static struct ip * + ip_reass(struct ipasfrag *, struct ipq *, struct ipq *); +static struct in_ifaddr * + ip_rtaddr(struct in_addr); +void ipintr(void); +/* + * IP initialization: fill in IP protocol switch table. + * All protocols not implemented in kernel go to raw IP protocol handler. + */ +void +ip_init(void) +{ + register struct protosw *pr; + register int i; + + pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); + if (pr == NULL) + panic("ip_init: PF_INET not found"); + + /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ + for (i = 0; i < IPPROTO_MAX; i++) + ip_protox[i] = pr - inetsw; + /* + * Cycle through IP protocols and put them into the appropriate place + * in ip_protox[]. + */ + for (pr = inetdomain.dom_protosw; + pr < inetdomain.dom_protoswNPROTOSW; pr++) + if (pr->pr_domain->dom_family == PF_INET && + pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { + ip_protox[pr->pr_protocol] = pr - inetsw; + } + + for (i = 0; i < IPREASS_NHASH; i++) + ipq[i].next = ipq[i].prev = &ipq[i]; + + maxnipq = nmbclusters/4; + + /* Initialize various other remaining things. */ + ip_id = rtems_bsdnet_seconds_since_boot() & 0xffff; + ipintrq.ifq_maxlen = ipqmaxlen; +#ifdef IPFIREWALL + ip_fw_init(); +#endif +#ifdef IPNAT + ip_nat_init(); +#endif + +} + +static struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET, 0, {0}, {0} }; +static struct route ipforward_rt; + +/* + * Ip input routine. Checksum and byte swap header. If fragmented + * try to reassemble. Process options. Pass to next level. + */ +void +ip_input(struct mbuf *m) +{ + struct ip *ip = NULL; + struct ipq *fp; + struct in_ifaddr *ia = NULL; + int i, hlen; + u_short sum; + +#ifdef DIAGNOSTIC + if ((m->m_flags & M_PKTHDR) == 0) + panic("ip_input no HDR"); +#endif + /* + * If no IP addresses have been set yet but the interfaces + * are receiving, can't do anything with incoming packets yet. + */ + if (in_ifaddr == NULL) + goto bad; + ipstat.ips_total++; + + if (m->m_pkthdr.len < sizeof(struct ip)) + goto tooshort; + +#if defined(DIAGNOSTIC) && defined(ORIGINAL_FREEBSD_CODE) + if (m->m_len < sizeof(struct ip)) + panic("ipintr mbuf too short"); +#endif + + if (m->m_len < sizeof (struct ip) && + (m = m_pullup(m, sizeof (struct ip))) == NULL) { + ipstat.ips_toosmall++; + return; + } + ip = mtod(m, struct ip *); + +#ifdef _IP_VHL + if (IP_VHL_V(ip->ip_vhl) != IPVERSION) { +#else + if (ip->ip_v != IPVERSION) { +#endif + ipstat.ips_badvers++; + goto bad; + } + +#ifdef _IP_VHL + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#else + hlen = ip->ip_hl << 2; +#endif + if (hlen < sizeof(struct ip)) { /* minimum header length */ + ipstat.ips_badhlen++; + goto bad; + } + if (hlen > m->m_len) { + if ((m = m_pullup(m, hlen)) == NULL) { + ipstat.ips_badhlen++; + return; + } + ip = mtod(m, struct ip *); + } + if (hlen == sizeof(struct ip)) { + sum = in_cksum_hdr(ip); + } else { + sum = in_cksum(m, hlen); + } + if (sum) { + ipstat.ips_badsum++; + goto bad; + } + + /* + * Convert fields to host representation. 
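+ *
+ * ip_len, ip_id and ip_off are flipped to host byte order here so the
+ * length checks, reassembly arithmetic and IPREASS_HASH below can
+ * treat them as ordinary integers; the output path restores network
+ * order just before the header leaves this layer.  For example, a
+ * wire ip_len of 0x05 0xdc reads back as 1500 after the swap on a
+ * little-endian target.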
+ */ + ip->ip_len = ntohs(ip->ip_len); + if (ip->ip_len < hlen) { + ipstat.ips_badlen++; + goto bad; + } + NTOHS(ip->ip_id); + ip->ip_off = ntohs(ip->ip_off); + + /* + * Check that the amount of data in the buffers + * is as at least much as the IP header would have us expect. + * Trim mbufs if longer than we expect. + * Drop packet if shorter than we expect. + */ + if (m->m_pkthdr.len < ip->ip_len) { +tooshort: + ipstat.ips_tooshort++; + goto bad; + } + if (m->m_pkthdr.len > ip->ip_len) { + if (m->m_len == m->m_pkthdr.len) { + m->m_len = ip->ip_len; + m->m_pkthdr.len = ip->ip_len; + } else + m_adj(m, ip->ip_len - m->m_pkthdr.len); + } + /* + * IpHack's section. + * Right now when no processing on packet has done + * and it is still fresh out of network we do our black + * deals with it. + * - Firewall: deny/allow/divert + * - Xlate: translate packet's addr/port (NAT). + * - Wrap: fake packet's addr/port + * - Encapsulate: put it in another IP and send out. + */ + +#ifdef COMPAT_IPFW + if (ip_fw_chk_ptr) { +#ifdef IPDIVERT + u_short port; + + port = (*ip_fw_chk_ptr)(&ip, hlen, NULL, ip_divert_ignore, &m); + ip_divert_ignore = 0; + if (port) { /* Divert packet */ + frag_divert_port = port; + goto ours; + } +#else + /* If ipfw says divert, we have to just drop packet */ + if ((*ip_fw_chk_ptr)(&ip, hlen, NULL, 0, &m)) { + m_freem(m); + m = NULL; + } +#endif + if (!m) + return; + } + + if (ip_nat_ptr && !(*ip_nat_ptr)(&ip, &m, m->m_pkthdr.rcvif, IP_NAT_IN)) + return; +#endif + + /* + * Process options and, if not destined for us, + * ship it on. ip_dooptions returns 1 when an + * error was detected (causing an icmp message + * to be sent and the original packet to be freed). + */ + ip_nhops = 0; /* for source routed packets */ + if (hlen > sizeof (struct ip) && ip_dooptions(m)) + return; + + /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no + * matter if it is destined to another node, or whether it is + * a multicast one, RSVP wants it! and prevents it from being forwarded + * anywhere else. Also checks if the rsvp daemon is running before + * grabbing the packet. + */ + if (rsvp_on && ip->ip_p==IPPROTO_RSVP) + goto ours; + + /* + * Check our list of addresses, to see if the packet is for us. + */ + for (ia = in_ifaddr; ia; ia = ia->ia_next) { +#define satosin(sa) ((struct sockaddr_in *)(sa)) + + if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr) + goto ours; +#ifdef BOOTP_COMPAT + if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) + goto ours; +#endif + if (ia->ia_ifp && ia->ia_ifp->if_flags & IFF_BROADCAST) { + if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == + ip->ip_dst.s_addr) + goto ours; + if (ip->ip_dst.s_addr == ia->ia_netbroadcast.s_addr) + goto ours; + } + } + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct in_multi *inm; + if (ip_mrouter) { + /* + * If we are acting as a multicast router, all + * incoming multicast packets are passed to the + * kernel-level multicast forwarding function. + * The packet is returned (relatively) intact; if + * ip_mforward() returns a non-zero value, the packet + * must be discarded, else it may be accepted below. + * + * (The IP ident field is put in the same byte order + * as expected when ip_mforward() is called from + * ip_output().) 
+ */ + ip->ip_id = htons(ip->ip_id); + if (ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { + ipstat.ips_cantforward++; + m_freem(m); + return; + } + ip->ip_id = ntohs(ip->ip_id); + + /* + * The process-level routing daemon needs to receive + * all multicast IGMP packets, whether or not this + * host belongs to their destination groups. + */ + if (ip->ip_p == IPPROTO_IGMP) + goto ours; + ipstat.ips_forward++; + } + /* + * See if we belong to the destination multicast group on the + * arrival interface. + */ + IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); + if (inm == NULL) { + ipstat.ips_cantforward++; + m_freem(m); + return; + } + goto ours; + } + if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) + goto ours; + if (ip->ip_dst.s_addr == INADDR_ANY) + goto ours; + + /* + * Not for us; forward if possible and desirable. + */ + if (ipforwarding == 0) { + ipstat.ips_cantforward++; + m_freem(m); + } else { + ip_forward(m, 0); + } + return; + +ours: + + /* + * If offset or IP_MF are set, must reassemble. + * Otherwise, nothing need be done. + * (We could look in the reassembly queue to see + * if the packet was previously fragmented, + * but it's not worth the time; just let them time out.) + */ + if (ip->ip_off &~ (IP_DF | IP_RF)) { + if (m->m_flags & M_EXT) { /* XXX */ + if ((m = m_pullup(m, sizeof (struct ip))) == 0) { + ipstat.ips_toosmall++; +#ifdef IPDIVERT + frag_divert_port = 0; +#endif + return; + } + ip = mtod(m, struct ip *); + } + sum = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); + /* + * Look for queue of fragments + * of this datagram. + */ + for (fp = ipq[sum].next; fp != &ipq[sum]; fp = fp->next) + if (ip->ip_id == fp->ipq_id && + ip->ip_src.s_addr == fp->ipq_src.s_addr && + ip->ip_dst.s_addr == fp->ipq_dst.s_addr && + ip->ip_p == fp->ipq_p) + goto found; + + fp = 0; + + /* check if there's a place for the new queue */ + if (nipq > maxnipq) { + /* + * drop something from the tail of the current queue + * before proceeding further + */ + if (ipq[sum].prev == &ipq[sum]) { /* gak */ + for (i = 0; i < IPREASS_NHASH; i++) { + if (ipq[i].prev != &ipq[i]) { + ip_freef(ipq[i].prev); + break; + } + } + } else + ip_freef(ipq[sum].prev); + } +found: + /* + * Adjust ip_len to not reflect header, + * set ip_mff if more fragments are expected, + * convert offset of this to bytes. + */ + ip->ip_len -= hlen; + ((struct ipasfrag *)ip)->ipf_mff &= ~1; + if (ip->ip_off & IP_MF) + ((struct ipasfrag *)ip)->ipf_mff |= 1; + ip->ip_off <<= 3; + + /* + * If datagram marked as having more fragments + * or if this is not the first fragment, + * attempt reassembly; if it succeeds, proceed. 
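+ *
+ * A worked example of the bookkeeping set up just above: an IP
+ * datagram carrying 4000 bytes of payload over a 1500-byte MTU link
+ * arrives as three fragments, and after the header length is
+ * subtracted and ip_off is shifted from 8-byte units to bytes they
+ * look like
+ *
+ *	fragment 1: ip_off =    0, ip_len = 1480, ipf_mff = 1
+ *	fragment 2: ip_off = 1480, ip_len = 1480, ipf_mff = 1
+ *	fragment 3: ip_off = 2960, ip_len = 1040, ipf_mff = 0
+ *
+ * ip_reass() completes the datagram once the offsets are contiguous
+ * and the final fragment's more-fragments bit is clear.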
+ */ + if (((struct ipasfrag *)ip)->ipf_mff & 1 || ip->ip_off) { + ipstat.ips_fragments++; + ip = ip_reass((struct ipasfrag *)ip, fp, &ipq[sum]); + if (ip == 0) + return; + ipstat.ips_reassembled++; + m = dtom(ip); +#ifdef IPDIVERT + if (frag_divert_port) { + ip->ip_len += hlen; + HTONS(ip->ip_len); + HTONS(ip->ip_off); + HTONS(ip->ip_id); + ip->ip_sum = 0; + ip->ip_sum = in_cksum_hdr(ip); + NTOHS(ip->ip_id); + NTOHS(ip->ip_off); + NTOHS(ip->ip_len); + ip->ip_len -= hlen; + } +#endif + } else + if (fp) + ip_freef(fp); + } else + ip->ip_len -= hlen; + +#ifdef IPDIVERT + /* + * Divert reassembled packets to the divert protocol if required + */ + if (frag_divert_port) { + ipstat.ips_delivered++; + ip_divert_port = frag_divert_port; + frag_divert_port = 0; + (*inetsw[ip_protox[IPPROTO_DIVERT]].pr_input)(m, hlen); + return; + } + + /* Don't let packets divert themselves */ + if (ip->ip_p == IPPROTO_DIVERT) { + ipstat.ips_noproto++; + goto bad; + } +#endif + + /* + * Switch out to protocol's input routine. + */ + ipstat.ips_delivered++; + + (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); + return; +bad: + m_freem(m); +} + +/* + * IP software interrupt routine - to go away sometime soon + */ +void +ipintr(void) +{ + int s; + struct mbuf *m; + + while(1) { + s = splimp(); + IF_DEQUEUE(&ipintrq, m); + splx(s); + if (m == 0) + return; + ip_input(m); + } +} + +NETISR_SET(NETISR_IP, ipintr); + +/* + * Take incoming datagram fragment and try to + * reassemble it into whole datagram. If a chain for + * reassembly of this datagram already exists, then it + * is given as fp; otherwise have to make a chain. + */ +static struct ip * +ip_reass(struct ipasfrag *ip, struct ipq *fp, struct ipq *where) +{ + register struct mbuf *m = dtom(ip); + register struct ipasfrag *q; + struct mbuf *t; + int hlen = ip->ip_hl << 2; + int i; + int32_t next; + + /* + * Presence of header sizes in mbufs + * would confuse code below. + */ + m->m_data += hlen; + m->m_len -= hlen; + + /* + * If first fragment to arrive, create a reassembly queue. + */ + if (fp == NULL) { + if ((t = m_get(M_DONTWAIT, MT_FTABLE)) == NULL) + goto dropfrag; + fp = mtod(t, struct ipq *); + insque(fp, where); + nipq++; + fp->ipq_ttl = IPFRAGTTL; + fp->ipq_p = ip->ip_p; + fp->ipq_id = ip->ip_id; + fp->ipq_next = fp->ipq_prev = (struct ipasfrag *)fp; + fp->ipq_src = ((struct ip *)ip)->ip_src; + fp->ipq_dst = ((struct ip *)ip)->ip_dst; +#ifdef IPDIVERT + fp->ipq_divert = 0; +#endif + q = (struct ipasfrag *)fp; + goto insert; + } + + /* + * Find a segment which begins after this one does. + */ + for (q = fp->ipq_next; q != (struct ipasfrag *)fp; q = q->ipf_next) + if (q->ip_off > ip->ip_off) + break; + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us. + */ + if (q->ipf_prev != (struct ipasfrag *)fp) { + i = q->ipf_prev->ip_off + q->ipf_prev->ip_len - ip->ip_off; + if (i > 0) { + if (i >= ip->ip_len) + goto dropfrag; + m_adj(dtom(ip), i); + ip->ip_off += i; + ip->ip_len -= i; + } + } + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. 
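+ *
+ * An illustrative overlap case for the loop below: if the arriving
+ * fragment covers bytes 1000-2479 and a queued fragment starts at
+ * offset 2000 with 1480 bytes of data, then i = (1000 + 1480) - 2000
+ * = 480, so the queued fragment is trimmed to start at 2480 with 1000
+ * bytes remaining; a queued fragment that lies entirely inside the
+ * new one is unlinked and freed instead.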
+ */ + while (q != (struct ipasfrag *)fp && ip->ip_off + ip->ip_len > q->ip_off) { + struct mbuf *m0; + + i = (ip->ip_off + ip->ip_len) - q->ip_off; + if (i < q->ip_len) { + q->ip_len -= i; + q->ip_off += i; + m_adj(dtom(q), i); + break; + } + m0 = dtom(q); + q = q->ipf_next; + ip_deq(q->ipf_prev); + m_freem(m0); + } + +insert: + +#ifdef IPDIVERT + /* + * Any fragment diverting causes the whole packet to divert + */ + if (frag_divert_port != 0) + fp->ipq_divert = frag_divert_port; + frag_divert_port = 0; +#endif + + /* + * Stick new segment in its place; + * check for complete reassembly. + */ + ip_enq(ip, q->ipf_prev); + next = 0; + for (q = fp->ipq_next; q != (struct ipasfrag *)fp; q = q->ipf_next) { + if (q->ip_off != next) + return (0); + next += q->ip_len; + } + if (q->ipf_prev->ipf_mff & 1) + return (0); + + /* + * Reassembly is complete. Make sure the packet is a sane size. + */ +#ifdef _IP_VHL + if (next + (IP_VHL_HL(((struct ip *)fp->ipq_next)->ip_vhl) << 2) +#else + if (next + ((((struct ip *)fp->ipq_next)->ip_hl) << 2) +#endif + > IP_MAXPACKET) { + ipstat.ips_toolong++; + ip_freef(fp); + return (0); + } + + /* + * Concatenate fragments. + */ + q = fp->ipq_next; + m = dtom(q); + t = m->m_next; + m->m_next = NULL; + m_cat(m, t); + q = q->ipf_next; + while (q != (struct ipasfrag *)fp) { + t = dtom(q); + q = q->ipf_next; + m_cat(m, t); + } + +#ifdef IPDIVERT + /* + * Record divert port for packet, if any + */ + frag_divert_port = fp->ipq_divert; +#endif + + /* + * Create header for new ip packet by modifying header of first + * packet; dequeue and discard fragment reassembly header. + * Make header visible. + */ + ip = fp->ipq_next; + ip->ip_len = next; + ip->ipf_mff &= ~1; + ((struct ip *)ip)->ip_src = fp->ipq_src; + ((struct ip *)ip)->ip_dst = fp->ipq_dst; + remque(fp); + nipq--; + (void) m_free(dtom(fp)); + m = dtom(ip); + m->m_len += (ip->ip_hl << 2); + m->m_data -= (ip->ip_hl << 2); + /* some debugging cruft by sklower, below, will go away soon */ + if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ + register int plen = 0; + for (t = m; m; m = m->m_next) + plen += m->m_len; + t->m_pkthdr.len = plen; + } + return ((struct ip *)ip); + +dropfrag: + ipstat.ips_fragdropped++; + m_freem(m); + return (0); +} + +/* + * Free a fragment reassembly header and all + * associated datagrams. + */ +static void +ip_freef(struct ipq *fp) +{ + register struct ipasfrag *q, *p; + + for (q = fp->ipq_next; q != (struct ipasfrag *)fp; q = p) { + p = q->ipf_next; + ip_deq(q); + m_freem(dtom(q)); + } + remque(fp); + (void) m_free(dtom(fp)); + nipq--; +} + +/* + * Put an ip fragment on a reassembly chain. + * Like insque, but pointers in middle of structure. + */ +static void +ip_enq(struct ipasfrag *p, struct ipasfrag *prev) +{ + + p->ipf_prev = prev; + p->ipf_next = prev->ipf_next; + prev->ipf_next->ipf_prev = p; + prev->ipf_next = p; +} + +/* + * To ip_enq as remque is to insque. + */ +static void +ip_deq(struct ipasfrag *p) +{ + + p->ipf_prev->ipf_next = p->ipf_next; + p->ipf_next->ipf_prev = p->ipf_prev; +} + +/* + * IP timer processing; + * if a timer expires on a reassembly + * queue, discard it. 
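+ *
+ * ip_slowtimo() is driven from the protocol slow timeout, which in
+ * BSD-derived stacks traditionally fires about twice a second.  Each
+ * pass decrements ipq_ttl on every pending reassembly queue and frees
+ * a queue once the count reaches zero, so a fragment train that never
+ * completes is discarded after at most IPFRAGTTL ticks and counted in
+ * ips_fragtimeout.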
+ */ +void +ip_slowtimo(void) +{ + register struct ipq *fp; + int s = splnet(); + int i; + + for (i = 0; i < IPREASS_NHASH; i++) { + fp = ipq[i].next; + if (fp == 0) + continue; + while (fp != &ipq[i]) { + --fp->ipq_ttl; + fp = fp->next; + if (fp->prev->ipq_ttl == 0) { + ipstat.ips_fragtimeout++; + ip_freef(fp->prev); + } + } + } + splx(s); +} + +/* + * Drain off all datagram fragments. + */ +void +ip_drain(void) +{ + int i; + + for (i = 0; i < IPREASS_NHASH; i++) { + while (ipq[i].next != &ipq[i]) { + ipstat.ips_fragdropped++; + ip_freef(ipq[i].next); + } + } + in_rtqdrain(); +} + +/* + * Do option processing on a datagram, + * possibly discarding it if bad options are encountered, + * or forwarding it if source-routed. + * Returns 1 if packet has been forwarded/freed, + * 0 if the packet should be processed further. + */ +static int +ip_dooptions(struct mbuf *m) +{ + register struct ip *ip = mtod(m, struct ip *); + register u_char *cp; + register struct ip_timestamp *ipt; + register struct in_ifaddr *ia; + int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; + struct in_addr *sin, dst; + n_time ntime; + + dst = ip->ip_dst; + cp = (u_char *)(ip + 1); +#ifdef _IP_VHL + cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); +#else + cnt = (ip->ip_hl << 2) - sizeof (struct ip); +#endif + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= 0 || optlen > cnt) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + } + switch (opt) { + + default: + break; + + /* + * Source routing with record. + * Find interface with current destination address. + * If none on this machine then drop if strictly routed, + * or do nothing if loosely routed. + * Record interface address and bring up next address + * component. If strictly routed make sure next + * address is on directly accessible net. + */ + case IPOPT_LSRR: + case IPOPT_SSRR: + if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + ipaddr.sin_addr = ip->ip_dst; + ia = (struct in_ifaddr *) + ifa_ifwithaddr((struct sockaddr *)&ipaddr); + if (ia == 0) { + if (opt == IPOPT_SSRR) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + if (!ip_dosourceroute) + goto nosourcerouting; + /* + * Loose routing, and not at next destination + * yet; nothing to do except forward. + */ + break; + } + off--; /* 0 origin */ + if (off > optlen - sizeof(struct in_addr)) { + /* + * End of source route. Should be for us. 
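+ *
+ * For reference, a loose or strict source route option is laid out as
+ *
+ *	cp[0]  option type (IPOPT_LSRR or IPOPT_SSRR)
+ *	cp[1]  total option length
+ *	cp[2]  1-based pointer to the next address slot; IPOPT_MINOFF
+ *	       (4) points at the first address
+ *	cp[3]  onwards: the list of 4-byte hop addresses
+ *
+ * so once the pointer, made 0-based above, runs past the last 4-byte
+ * slot the datagram has walked its entire source route and should be
+ * for this host.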
+ */ + if (!ip_acceptsourceroute) + goto nosourcerouting; + save_rte(cp, ip->ip_src); + break; + } + + if (!ip_dosourceroute) { + char buf0[INET_ADDRSTRLEN]; + char buf1[INET_ADDRSTRLEN]; + +nosourcerouting: + log(LOG_WARNING, + "attempted source route from %s to %s\n", + inet_ntoa_r(ip->ip_dst, buf0), + inet_ntoa_r(ip->ip_src, buf1)); + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + + /* + * locate outgoing interface + */ + (void)memcpy(&ipaddr.sin_addr, cp + off, + sizeof(ipaddr.sin_addr)); + + if (opt == IPOPT_SSRR) { +#define INA struct in_ifaddr * +#define SA struct sockaddr * + if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0) + ia = (INA)ifa_ifwithnet((SA)&ipaddr); + } else + ia = ip_rtaddr(ipaddr.sin_addr); + if (ia == 0) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + ip->ip_dst = ipaddr.sin_addr; + (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), + sizeof(struct in_addr)); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + /* + * Let ip_intr's mcast routing check handle mcast pkts + */ + forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); + break; + + case IPOPT_RR: + if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + /* + * If no space remains, ignore. + */ + off--; /* 0 origin */ + if (off > optlen - sizeof(struct in_addr)) + break; + (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, + sizeof(ipaddr.sin_addr)); + /* + * locate outgoing interface; if we're the destination, + * use the incoming interface (should be same). + */ + if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 && + (ia = ip_rtaddr(ipaddr.sin_addr)) == 0) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_HOST; + goto bad; + } + (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), + sizeof(struct in_addr)); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + break; + + case IPOPT_TS: + code = cp - (u_char *)ip; + ipt = (struct ip_timestamp *)cp; + if (ipt->ipt_len < 5) + goto bad; + if (ipt->ipt_ptr > ipt->ipt_len - sizeof (long)) { + if (++ipt->ipt_oflw == 0) + goto bad; + break; + } + sin = (struct in_addr *)(cp + ipt->ipt_ptr - 1); + switch (ipt->ipt_flg) { + + case IPOPT_TS_TSONLY: + break; + + case IPOPT_TS_TSANDADDR: + if (ipt->ipt_ptr + sizeof(n_time) + + sizeof(struct in_addr) > ipt->ipt_len) + goto bad; + ipaddr.sin_addr = dst; + ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, + m->m_pkthdr.rcvif); + if (ia == 0) + continue; + (void)memcpy(sin, &IA_SIN(ia)->sin_addr, + sizeof(struct in_addr)); + ipt->ipt_ptr += sizeof(struct in_addr); + break; + + case IPOPT_TS_PRESPEC: + if (ipt->ipt_ptr + sizeof(n_time) + + sizeof(struct in_addr) > ipt->ipt_len) + goto bad; + (void)memcpy(&ipaddr.sin_addr, sin, + sizeof(struct in_addr)); + if (ifa_ifwithaddr((SA)&ipaddr) == 0) + continue; + ipt->ipt_ptr += sizeof(struct in_addr); + break; + + default: + goto bad; + } + ntime = iptime(); + (void)memcpy(cp + ipt->ipt_ptr - 1, &ntime, + sizeof(n_time)); + ipt->ipt_ptr += sizeof(n_time); + } + } + if (forward && ipforwarding) { + ip_forward(m, 1); + return (1); + } + return (0); +bad: +#ifdef _IP_VHL + ip->ip_len -= IP_VHL_HL(ip->ip_vhl) << 2; /* XXX icmp_error adds in hdr length */ +#else + ip->ip_len -= ip->ip_hl << 2; /* XXX icmp_error adds in hdr length */ +#endif + icmp_error(m, type, code, 0, 0); + ipstat.ips_badoptions++; + return (1); +} + +/* + * Given address of next destination (final or next hop), + * return internet address info of interface to be used to get there. 
+ */ +static struct in_ifaddr * +ip_rtaddr(struct in_addr dst) +{ + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *) &ipforward_rt.ro_dst; + + if (ipforward_rt.ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr) { + if (ipforward_rt.ro_rt) { + RTFREE(ipforward_rt.ro_rt); + ipforward_rt.ro_rt = 0; + } + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = dst; + + rtalloc_ign(&ipforward_rt, RTF_PRCLONING); + } + if (ipforward_rt.ro_rt == 0) + return ((struct in_ifaddr *)0); + return ((struct in_ifaddr *) ipforward_rt.ro_rt->rt_ifa); +} + +/* + * Save incoming source route for use in replies, + * to be picked up later by ip_srcroute if the receiver is interested. + */ +void +save_rte(u_char *option, struct in_addr dst) +{ + unsigned olen; + + olen = option[IPOPT_OLEN]; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("save_rte: olen %d\n", olen); +#endif + if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst))) + return; + bcopy(option, ip_srcrt.srcopt, olen); + ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); + ip_srcrt.dst = dst; +} + +/* + * Retrieve incoming source route for use in replies, + * in the same form used by setsockopt. + * The first hop is placed before the options, will be removed later. + */ +struct mbuf * +ip_srcroute(void) +{ + register struct in_addr *p, *q; + register struct mbuf *m; + + if (ip_nhops == 0) + return ((struct mbuf *)0); + m = m_get(M_DONTWAIT, MT_SOOPTS); + if (m == 0) + return ((struct mbuf *)0); + +#define OPTSIZ (sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt)) + + /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ + m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + + OPTSIZ; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len); +#endif + + /* + * First save first hop for return route + */ + p = &ip_srcrt.route[ip_nhops - 1]; + *(mtod(m, struct in_addr *)) = *p--; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf(" hops %"PRIx32, ntohl(mtod(m, struct in_addr *)->s_addr)); +#endif + + /* + * Copy option fields and padding (nop) to mbuf. + */ + ip_srcrt.nop = IPOPT_NOP; + ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; + (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), + &ip_srcrt.nop, OPTSIZ); + q = (struct in_addr *)(mtod(m, caddr_t) + + sizeof(struct in_addr) + OPTSIZ); +#undef OPTSIZ + /* + * Record return path as an IP source route, + * reversing the path (pointers are now aligned). + */ + while (p >= ip_srcrt.route) { +#ifdef DIAGNOSTIC + if (ipprintfs) + printf(" %"PRIx32, ntohl(q->s_addr)); +#endif + *q++ = *p--; + } + /* + * Last hop goes to final destination. + */ + *q = ip_srcrt.dst; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf(" %"PRIx32"\n", ntohl(q->s_addr)); +#endif + return (m); +} + +/* + * Strip out IP options, at higher + * level protocol in the kernel. + * Second argument is buffer to which options + * will be moved, and return value is their length. + * XXX should be deleted; last arg currently ignored. 
+ */ +void +ip_stripoptions(struct mbuf *m, struct mbuf *mopt) +{ + register int i; + struct ip *ip = mtod(m, struct ip *); + register caddr_t opts; + int olen; + +#ifdef _IP_VHL + olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); +#else + olen = (ip->ip_hl << 2) - sizeof (struct ip); +#endif + opts = (caddr_t)(ip + 1); + i = m->m_len - (sizeof (struct ip) + olen); + bcopy(opts + olen, opts, (unsigned)i); + m->m_len -= olen; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len -= olen; +#ifdef _IP_VHL + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2); +#else + ip->ip_v = IPVERSION; + ip->ip_hl = (sizeof(struct ip) >> 2); +#endif +} + +u_char inetctlerrmap[PRC_NCMDS] = { + 0, 0, 0, 0, + 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, + EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, + EMSGSIZE, EHOSTUNREACH, 0, 0, + 0, 0, 0, 0, + ENOPROTOOPT +}; + +/* + * Forward a packet. If some error occurs return the sender + * an icmp packet. Note we can't always generate a meaningful + * icmp message because icmp doesn't have a large enough repertoire + * of codes and types. + * + * If not forwarding, just drop the packet. This could be confusing + * if ipforwarding was zero but some routing protocol was advancing + * us as a gateway to somewhere. However, we must let the routing + * protocol deal with that. + * + * The srcrt parameter indicates whether the packet is being forwarded + * via a source route. + */ +static void +ip_forward(struct mbuf *m, int srcrt) +{ + struct ip *ip = mtod(m, struct ip *); + register struct sockaddr_in *sin; + register struct rtentry *rt; + int error, type = 0, code = 0; + struct mbuf *mcopy; + n_long dest; + struct ifnet *destifp; + + dest = 0; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("forward: src %"PRIx32" dst %"PRIx32" ttl %x\n", + ip->ip_src.s_addr, ip->ip_dst.s_addr, ip->ip_ttl); +#endif + + + if (m->m_flags & M_BCAST || in_canforward(ip->ip_dst) == 0) { + ipstat.ips_cantforward++; + m_freem(m); + return; + } + HTONS(ip->ip_id); + if (ip->ip_ttl <= IPTTLDEC) { + icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0); + return; + } + ip->ip_ttl -= IPTTLDEC; + + sin = (struct sockaddr_in *)&ipforward_rt.ro_dst; + if ((rt = ipforward_rt.ro_rt) == 0 || + ip->ip_dst.s_addr != sin->sin_addr.s_addr) { + if (ipforward_rt.ro_rt) { + RTFREE(ipforward_rt.ro_rt); + ipforward_rt.ro_rt = 0; + } + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ip->ip_dst; + + rtalloc_ign(&ipforward_rt, RTF_PRCLONING); + if (ipforward_rt.ro_rt == 0) { + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); + return; + } + rt = ipforward_rt.ro_rt; + } + + /* + * Save at most 64 bytes of the packet in case + * we need to generate an ICMP message to the src. + */ + mcopy = m_copy(m, 0, imin((int)ip->ip_len, 64)); + + /* + * If forwarding packet using same interface that it came in on, + * perhaps should send a redirect to sender to shortcut a hop. + * Only send redirect if source is sending directly to us, + * and if packet was not source routed (or has any options). + * Also, don't send redirect if forwarding using a default route + * or a route modified by a redirect. 
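+ *
+ * Spelled out, the test below generates a redirect only when all of
+ * the following hold:
+ *
+ *	- the chosen route leaves via the interface the packet arrived on
+ *	- the route is neither the default route (key 0.0.0.0) nor one
+ *	  already created or modified by a redirect (RTF_DYNAMIC and
+ *	  RTF_MODIFIED clear)
+ *	- sending redirects is enabled (the net.inet.ip.redirect sysctl)
+ *	- the packet was not source routed
+ *	- the sender lies on the subnet of the forwarding interface
+ *
+ * and, per the router requirements note in the code, only host
+ * redirects (ICMP_REDIRECT_HOST) are ever generated.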
+ */ +#define satosin(sa) ((struct sockaddr_in *)(sa)) + if (rt->rt_ifp == m->m_pkthdr.rcvif && + (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && + satosin(rt_key(rt))->sin_addr.s_addr != 0 && + ipsendredirects && !srcrt) { +#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) + u_long src = ntohl(ip->ip_src.s_addr); + + if (RTA(rt) && + (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { + if (rt->rt_flags & RTF_GATEWAY) + dest = satosin(rt->rt_gateway)->sin_addr.s_addr; + else + dest = ip->ip_dst.s_addr; + /* Router requirements says to only send host redirects */ + type = ICMP_REDIRECT; + code = ICMP_REDIRECT_HOST; + } + } + + error = ip_output(m, (struct mbuf *)0, &ipforward_rt, + IP_FORWARDING, 0); + if (error) + ipstat.ips_cantforward++; + else { + ipstat.ips_forward++; + if (type) + ipstat.ips_redirectsent++; + else { + if (mcopy) + m_freem(mcopy); + return; + } + } + if (mcopy == NULL) + return; + destifp = NULL; + + switch (error) { + + case 0: /* forwarded, but need redirect */ + /* type, code set above */ + break; + + case ENETUNREACH: /* shouldn't happen, checked above */ + case EHOSTUNREACH: + case ENETDOWN: + case EHOSTDOWN: + default: + type = ICMP_UNREACH; + code = ICMP_UNREACH_HOST; + break; + + case EMSGSIZE: + type = ICMP_UNREACH; + code = ICMP_UNREACH_NEEDFRAG; + if (ipforward_rt.ro_rt) + destifp = ipforward_rt.ro_rt->rt_ifp; + ipstat.ips_cantfrag++; + break; + + case ENOBUFS: + type = ICMP_SOURCEQUENCH; + code = 0; + break; + } + icmp_error(mcopy, type, code, dest, destifp); +} + +void +ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, + struct mbuf *m) +{ + if (inp->inp_socket->so_options & SO_TIMESTAMP) { + struct timeval tv; + + microtime(&tv); + *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), + SCM_TIMESTAMP, SOL_SOCKET); + if (*mp) + mp = &(*mp)->m_next; + } + if (inp->inp_flags & INP_RECVDSTADDR) { + *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, + sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +#ifdef notyet + /* XXX + * Moving these out of udp_input() made them even more broken + * than they already were. + */ + /* options were tossed already */ + if (inp->inp_flags & INP_RECVOPTS) { + *mp = sbcreatecontrol((caddr_t) opts_deleted_above, + sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } + /* ip_srcroute doesn't do what we want here, need to fix */ + if (inp->inp_flags & INP_RECVRETOPTS) { + *mp = sbcreatecontrol((caddr_t) ip_srcroute(m), + sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +#endif + if (inp->inp_flags & INP_RECVIF) { + struct sockaddr_dl sdl; + + sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); + sdl.sdl_family = AF_LINK; + sdl.sdl_index = m->m_pkthdr.rcvif ? + m->m_pkthdr.rcvif->if_index : 0; + sdl.sdl_nlen = sdl.sdl_alen = sdl.sdl_slen = 0; + *mp = sbcreatecontrol((caddr_t) &sdl, sdl.sdl_len, + IP_RECVIF, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +} + +int +ip_rsvp_init(struct socket *so) +{ + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + if (ip_rsvpd != NULL) + return EADDRINUSE; + + ip_rsvpd = so; + /* + * This may seem silly, but we need to be sure we don't over-increment + * the RSVP counter, in case something slips up. 
+ */ + if (!ip_rsvp_on) { + ip_rsvp_on = 1; + rsvp_on++; + } + + return 0; +} + +int +ip_rsvp_done(void) +{ + ip_rsvpd = NULL; + /* + * This may seem silly, but we need to be sure we don't over-decrement + * the RSVP counter, in case something slips up. + */ + if (ip_rsvp_on) { + ip_rsvp_on = 0; + rsvp_on--; + } + return 0; +} diff --git a/netinet/ip_mroute.c b/netinet/ip_mroute.c new file mode 100644 index 0000000..3a89497 --- /dev/null +++ b/netinet/ip_mroute.c @@ -0,0 +1,2232 @@ +#include + +/* + * IP multicast forwarding procedures + * + * Written by David Waitzman, BBN Labs, August 1988. + * Modified by Steve Deering, Stanford, February 1989. + * Modified by Mark J. Steiglitz, Stanford, May, 1991 + * Modified by Van Jacobson, LBL, January 1993 + * Modified by Ajit Thyagarajan, PARC, August 1993 + * Modified by Bill Fenner, PARC, April 1995 + * + * MROUTING Revision: 3.5 + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opt_mrouting.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef NTOHL +#if BYTE_ORDER != BIG_ENDIAN +#define NTOHL(d) ((d) = ntohl((d))) +#define NTOHS(d) ((d) = ntohs((u_short)(d))) +#define HTONL(d) ((d) = htonl((d))) +#define HTONS(d) ((d) = htons((u_short)(d))) +#else +#define NTOHL(d) +#define NTOHS(d) +#define HTONL(d) +#define HTONS(d) +#endif +#endif + +#ifndef MROUTING +extern u_long _ip_mcast_src(int vifi); +extern int _ip_mforward(struct ip *ip, struct ifnet *ifp, + struct mbuf *m, struct ip_moptions *imo); +extern int _ip_mrouter_done(void); +extern int _ip_mrouter_get(int cmd, struct socket *so, + struct mbuf **m); +extern int _ip_mrouter_set(int cmd, struct socket *so, + struct mbuf *m); +extern int _mrt_ioctl(int req, caddr_t data, struct proc *p); + +/* + * Dummy routines and globals used when multicast routing is not compiled in. + */ + +struct socket *ip_mrouter = NULL; +/* static u_int ip_mrtproto = 0; */ +/* static struct mrtstat mrtstat; */ +u_int rsvpdebug = 0; + +int +_ip_mrouter_set(int cmd, struct socket *so, struct mbuf *m) +{ + return(EOPNOTSUPP); +} + +int (*ip_mrouter_set)(int, struct socket *, struct mbuf *) = _ip_mrouter_set; + + +int +_ip_mrouter_get(int cmd, struct socket *so, struct mbuf **m) +{ + return(EOPNOTSUPP); +} + +int (*ip_mrouter_get)(int, struct socket *, struct mbuf **) = _ip_mrouter_get; + +int +_ip_mrouter_done(void) +{ + return(0); +} + +int (*ip_mrouter_done)(void) = _ip_mrouter_done; + +int +_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, + struct ip_moptions *imo) +{ + return(0); +} + +int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *) = _ip_mforward; + +int +_mrt_ioctl(int req, caddr_t data, struct proc *p) +{ + return EOPNOTSUPP; +} + +int (*mrt_ioctl)(int, caddr_t, struct proc *) = _mrt_ioctl; + +void +rsvp_input(struct mbuf *m, int iphlen) +{ + /* Can still get packets with rsvp_on = 0 if there is a local member + * of the group to which the RSVP packet is addressed. But in this + * case we want to throw the packet away. 
+ */ + if (!rsvp_on) { + m_freem(m); + return; + } + + if (ip_rsvpd != NULL) { + if (rsvpdebug) + printf("rsvp_input: Sending packet up old-style socket\n"); + rip_input(m, iphlen); + return; + } + /* Drop the packet */ + m_freem(m); +} + +void ipip_input(struct mbuf *m, int iphlen) +{ /* XXX must fixup manually */ + rip_input(m, iphlen); +} + +int (*legal_vif_num)(int) = 0; + +/* + * This should never be called, since IP_MULTICAST_VIF should fail, but + * just in case it does get called, the code a little lower in ip_output + * will assign the packet a local address. + */ +u_long +_ip_mcast_src(int vifi) { return INADDR_ANY; } +u_long (*ip_mcast_src)(int) = _ip_mcast_src; + +int +ip_rsvp_vif_init(struct socket *so, struct mbuf *m) +{ + return(EINVAL); +} + +int +ip_rsvp_vif_done(struct socket *so, struct mbuf *m) +{ + return(EINVAL); +} + +void +ip_rsvp_force_done(struct socket *so) +{ + return; +} + +#else /* MROUTING */ + +#define M_HASCL(m) ((m)->m_flags & M_EXT) + +#define INSIZ sizeof(struct in_addr) +#define same(a1, a2) \ + (bcmp((caddr_t)(a1), (caddr_t)(a2), INSIZ) == 0) + +#define MT_MRTABLE MT_RTABLE /* since nothing else uses it */ + +/* + * Globals. All but ip_mrouter and ip_mrtproto could be static, + * except for netstat or debugging purposes. + */ +#ifndef MROUTE_LKM +struct socket *ip_mrouter = NULL; +struct mrtstat mrtstat; + +int ip_mrtproto = IGMP_DVMRP; /* for netstat only */ +#else /* MROUTE_LKM */ +extern void X_ipip_input(struct mbuf *m, int iphlen); +extern struct mrtstat mrtstat; +static int ip_mrtproto; +#endif + +#define NO_RTE_FOUND 0x1 +#define RTE_FOUND 0x2 + +static struct mbuf *mfctable[MFCTBLSIZ]; +static u_char nexpire[MFCTBLSIZ]; +static struct vif viftable[MAXVIFS]; +static u_int mrtdebug = 0; /* debug level */ +#define DEBUG_MFC 0x02 +#define DEBUG_FORWARD 0x04 +#define DEBUG_EXPIRE 0x08 +#define DEBUG_XMIT 0x10 +static u_int tbfdebug = 0; /* tbf debug level */ +static u_int rsvpdebug = 0; /* rsvp debug level */ + +#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ +#define UPCALL_EXPIRE 6 /* number of timeouts */ + +/* + * Define the token bucket filter structures + * tbftable -> each vif has one of these for storing info + */ + +static struct tbf tbftable[MAXVIFS]; +#define TBF_REPROCESS (hz / 100) /* 100x / second */ + +/* + * 'Interfaces' associated with decapsulator (so we can tell + * packets that went through it from ones that get reflected + * by a broken gateway). These interfaces are never linked into + * the system ifnet list & no routes point to them. I.e., packets + * can't be sent this way. They only exist as a placeholder for + * multicast source verification. + */ +static struct ifnet multicast_decap_if[MAXVIFS]; + +#define ENCAP_TTL 64 +#define ENCAP_PROTO IPPROTO_IPIP /* 4 */ + +/* prototype IP hdr for encapsulated packets */ +static struct ip multicast_encap_iphdr = { +#if BYTE_ORDER == LITTLE_ENDIAN + sizeof(struct ip) >> 2, IPVERSION, +#else + IPVERSION, sizeof(struct ip) >> 2, +#endif + 0, /* tos */ + sizeof(struct ip), /* total length */ + 0, /* id */ + 0, /* frag offset */ + ENCAP_TTL, ENCAP_PROTO, + 0, /* checksum */ +}; + +/* + * Private variables. + */ +static vifi_t numvifs = 0; +static int have_encap_tunnel = 0; + +/* + * one-back cache used by ipip_input to locate a tunnel's vif + * given a datagram's src ip address. 
+ */ +static u_long last_encap_src; +static struct vif *last_encap_vif; + +static u_long X_ip_mcast_src(int vifi); +static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo); +static int X_ip_mrouter_done(void); +static int X_ip_mrouter_get(int cmd, struct socket *so, struct mbuf **m); +static int X_ip_mrouter_set(int cmd, struct socket *so, struct mbuf *m); +static int X_legal_vif_num(int vif); +static int X_mrt_ioctl(int cmd, caddr_t data); + +static int get_sg_cnt(struct sioc_sg_req *); +static int get_vif_cnt(struct sioc_vif_req *); +static int ip_mrouter_init(struct socket *, struct mbuf *); +static int add_vif(struct vifctl *); +static int del_vif(vifi_t *); +static int add_mfc(struct mfcctl *); +static int del_mfc(struct mfcctl *); +static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); +static int get_version(struct mbuf *); +static int get_assert(struct mbuf *); +static int set_assert(int *); +static void expire_upcalls(void *); +static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, + vifi_t); +static void phyint_send(struct ip *, struct vif *, struct mbuf *); +static void encap_send(struct ip *, struct vif *, struct mbuf *); +static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long); +static void tbf_queue(struct vif *, struct mbuf *); +static void tbf_process_q(struct vif *); +static void tbf_reprocess_q(void *); +static int tbf_dq_sel(struct vif *, struct ip *); +static void tbf_send_packet(struct vif *, struct mbuf *); +static void tbf_update_tokens(struct vif *); +static int priority(struct vif *, struct ip *); +void multiencap_decap(struct mbuf *); + +/* + * whether or not special PIM assert processing is enabled. + */ +static int pim_assert; +/* + * Rate limit for assert notification messages, in usec + */ +#define ASSERT_MSG_TIME 3000000 + +/* + * Hash function for a source, group entry + */ +#define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ + ((g) >> 20) ^ ((g) >> 10) ^ (g)) + +/* + * Find a route for a given origin IP address and Multicast group address + * Type of service parameter to be added in the future!!! + */ + +#define MFCFIND(o, g, rt) { \ + register struct mbuf *_mb_rt = mfctable[MFCHASH(o,g)]; \ + register struct mfc *_rt = NULL; \ + rt = NULL; \ + ++mrtstat.mrts_mfc_lookups; \ + while (_mb_rt) { \ + _rt = mtod(_mb_rt, struct mfc *); \ + if ((_rt->mfc_origin.s_addr == o) && \ + (_rt->mfc_mcastgrp.s_addr == g) && \ + (_mb_rt->m_act == NULL)) { \ + rt = _rt; \ + break; \ + } \ + _mb_rt = _mb_rt->m_next; \ + } \ + if (rt == NULL) { \ + ++mrtstat.mrts_mfc_misses; \ + } \ +} + + +/* + * Macros to compute elapsed time efficiently + * Borrowed from Van Jacobson's scheduling code + */ +#define TV_DELTA(a, b, delta) { \ + register int xxs; \ + \ + delta = (a).tv_usec - (b).tv_usec; \ + if ((xxs = (a).tv_sec - (b).tv_sec)) { \ + switch (xxs) { \ + case 2: \ + delta += 1000000; \ + /* fall through */ \ + case 1: \ + delta += 1000000; \ + break; \ + default: \ + delta += (1000000 * xxs); \ + } \ + } \ +} + +#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ + (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) + +#ifdef UPCALL_TIMING +u_long upcall_data[51]; +static void collate(struct timeval *); +#endif /* UPCALL_TIMING */ + + +/* + * Handle MRT setsockopt commands to modify the multicast routing tables. 
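+ *
+ * These arrive as IPPROTO_IP-level options on the routing daemon's raw IGMP
+ * socket; the raw-IP socket-option code hands the option mbuf to the
+ * ip_mrouter_set hook. A daemon would start forwarding roughly like this
+ * (illustrative user-space sketch only, not part of this file):
+ *
+ *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
+ *	int v = 1;
+ *	setsockopt(s, IPPROTO_IP, MRT_INIT, &v, sizeof(v));
+ *	... then MRT_ADD_VIF / MRT_ADD_MFC as interfaces and routes appear,
+ *	and MRT_DONE at shutdown ...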
+ */ +static int +X_ip_mrouter_set(int cmd, struct socket *so, struct mbuf *m) +{ + if (cmd != MRT_INIT && so != ip_mrouter) return EACCES; + + switch (cmd) { + case MRT_INIT: return ip_mrouter_init(so, m); + case MRT_DONE: return ip_mrouter_done(); + case MRT_ADD_VIF: return add_vif (mtod(m, struct vifctl *)); + case MRT_DEL_VIF: return del_vif (mtod(m, vifi_t *)); + case MRT_ADD_MFC: return add_mfc (mtod(m, struct mfcctl *)); + case MRT_DEL_MFC: return del_mfc (mtod(m, struct mfcctl *)); + case MRT_ASSERT: return set_assert(mtod(m, int *)); + default: return EOPNOTSUPP; + } +} + +#ifndef MROUTE_LKM +int (*ip_mrouter_set)(int, struct socket *, struct mbuf *) = X_ip_mrouter_set; +#endif + +/* + * Handle MRT getsockopt commands + */ +static int +X_ip_mrouter_get(int cmd, struct socket *so, struct mbuf **m) +{ + struct mbuf *mb; + + if (so != ip_mrouter) return EACCES; + + *m = mb = m_get(M_WAIT, MT_SOOPTS); + + switch (cmd) { + case MRT_VERSION: return get_version(mb); + case MRT_ASSERT: return get_assert(mb); + default: return EOPNOTSUPP; + } +} + +#ifndef MROUTE_LKM +int (*ip_mrouter_get)(int, struct socket *, struct mbuf **) = X_ip_mrouter_get; +#endif + +/* + * Handle ioctl commands to obtain information from the cache + */ +static int +X_mrt_ioctl(int cmd, caddr_t data) +{ + int error = 0; + + switch (cmd) { + case (SIOCGETVIFCNT): + return (get_vif_cnt((struct sioc_vif_req *)data)); + break; + case (SIOCGETSGCNT): + return (get_sg_cnt((struct sioc_sg_req *)data)); + break; + default: + return (EINVAL); + break; + } + return error; +} + +#ifndef MROUTE_LKM +int (*mrt_ioctl)(int, caddr_t) = X_mrt_ioctl; +#endif + +/* + * returns the packet, byte, rpf-failure count for the source group provided + */ +static int +get_sg_cnt(struct sioc_sg_req *req) +{ + register struct mfc *rt; + int s; + + s = splnet(); + MFCFIND(req->src.s_addr, req->grp.s_addr, rt); + splx(s); + if (rt != NULL) { + req->pktcnt = rt->mfc_pkt_cnt; + req->bytecnt = rt->mfc_byte_cnt; + req->wrong_if = rt->mfc_wrong_if; + } else + req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; + + return 0; +} + +/* + * returns the input and output packet and byte counts on the vif provided + */ +static int +get_vif_cnt(struct sioc_vif_req *req) +{ + register vifi_t vifi = req->vifi; + + if (vifi >= numvifs) return EINVAL; + + req->icount = viftable[vifi].v_pkt_in; + req->ocount = viftable[vifi].v_pkt_out; + req->ibytes = viftable[vifi].v_bytes_in; + req->obytes = viftable[vifi].v_bytes_out; + + return 0; +} + +/* + * Enable multicast routing + */ +static int +ip_mrouter_init(struct socket *so, struct mbuf *m) +{ + int *v; + + if (mrtdebug) + log(LOG_DEBUG,"ip_mrouter_init: so_type = %d, pr_protocol = %d\n", + so->so_type, so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; + + if (!m || (m->m_len != sizeof(int *))) + return ENOPROTOOPT; + + v = mtod(m, int *); + if (*v != 1) + return ENOPROTOOPT; + + if (ip_mrouter != NULL) return EADDRINUSE; + + ip_mrouter = so; + + bzero((caddr_t)mfctable, sizeof(mfctable)); + bzero((caddr_t)nexpire, sizeof(nexpire)); + + pim_assert = 0; + + timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); + + if (mrtdebug) + log(LOG_DEBUG, "ip_mrouter_init\n"); + + return 0; +} + +/* + * Disable multicast routing + */ +static int +X_ip_mrouter_done(void) +{ + vifi_t vifi; + int i; + struct ifnet *ifp; + struct ifreq ifr; + struct mbuf *mb_rt; + struct mbuf *m; + struct rtdetq *rte; + int s; + + s = splnet(); + + /* + * For each 
phyint in use, disable promiscuous reception of all IP + * multicasts. + */ + for (vifi = 0; vifi < numvifs; vifi++) { + if (viftable[vifi].v_lcl_addr.s_addr != 0 && + !(viftable[vifi].v_flags & VIFF_TUNNEL)) { + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr + = INADDR_ANY; + ifp = viftable[vifi].v_ifp; + (*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr); + } + } + bzero((caddr_t)tbftable, sizeof(tbftable)); + bzero((caddr_t)viftable, sizeof(viftable)); + numvifs = 0; + pim_assert = 0; + + untimeout(expire_upcalls, (caddr_t)NULL); + + /* + * Free all multicast forwarding cache entries. + */ + for (i = 0; i < MFCTBLSIZ; i++) { + mb_rt = mfctable[i]; + while (mb_rt) { + if (mb_rt->m_act != NULL) { + while (mb_rt->m_act) { + m = mb_rt->m_act; + mb_rt->m_act = m->m_act; + rte = mtod(m, struct rtdetq *); + m_freem(rte->m); + m_free(m); + } + } + mb_rt = m_free(mb_rt); + } + } + + bzero((caddr_t)mfctable, sizeof(mfctable)); + + /* + * Reset de-encapsulation cache + */ + last_encap_src = 0; + last_encap_vif = NULL; + have_encap_tunnel = 0; + + ip_mrouter = NULL; + + splx(s); + + if (mrtdebug) + log(LOG_DEBUG, "ip_mrouter_done\n"); + + return 0; +} + +#ifndef MROUTE_LKM +int (*ip_mrouter_done)(void) = X_ip_mrouter_done; +#endif + +static int +get_version(struct mbuf *mb) +{ + int *v; + + v = mtod(mb, int *); + + *v = 0x0305; /* XXX !!!! */ + mb->m_len = sizeof(int); + + return 0; +} + +/* + * Set PIM assert processing global + */ +static int +set_assert(int *i) +{ + if ((*i != 1) && (*i != 0)) + return EINVAL; + + pim_assert = *i; + + return 0; +} + +/* + * Get PIM assert processing global + */ +static int +get_assert(struct mbuf *m) +{ + int *i; + + i = mtod(m, int *); + + *i = pim_assert; + + return 0; +} + +/* + * Add a vif to the vif table + */ +static int +add_vif(struct vifctl *vifcp) +{ + register struct vif *vifp = viftable + vifcp->vifc_vifi; + static struct sockaddr_in sin = {sizeof sin, AF_INET}; + struct ifaddr *ifa; + struct ifnet *ifp; + struct ifreq ifr; + int error, s; + struct tbf *v_tbf = tbftable + vifcp->vifc_vifi; + + if (vifcp->vifc_vifi >= MAXVIFS) return EINVAL; + if (vifp->v_lcl_addr.s_addr != 0) return EADDRINUSE; + + /* Find the interface with an address in AF_INET family */ + sin.sin_addr = vifcp->vifc_lcl_addr; + ifa = ifa_ifwithaddr((struct sockaddr *)&sin); + if (ifa == 0) return EADDRNOTAVAIL; + ifp = ifa->ifa_ifp; + + if (vifcp->vifc_flags & VIFF_TUNNEL) { + if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) { + /* + * An encapsulating tunnel is wanted. Tell ipip_input() to + * start paying attention to encapsulated packets. 
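+ * (have_encap_tunnel also gates the early return at the top of ipip_input():
+ * until the first tunnel vif is configured, protocol-4 packets are simply
+ * handed to rip_input().)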
+ */ + if (have_encap_tunnel == 0) { + have_encap_tunnel = 1; + for (s = 0; s < MAXVIFS; ++s) { + multicast_decap_if[s].if_name = "mdecap"; + multicast_decap_if[s].if_unit = s; + } + } + /* + * Set interface to fake encapsulator interface + */ + ifp = &multicast_decap_if[vifcp->vifc_vifi]; + /* + * Prepare cached route entry + */ + bzero(&vifp->v_route, sizeof(vifp->v_route)); + } else { + log(LOG_ERR, "source routed tunnels not supported\n"); + return EOPNOTSUPP; + } + } else { + /* Make sure the interface supports multicast */ + if ((ifp->if_flags & IFF_MULTICAST) == 0) + return EOPNOTSUPP; + + /* Enable promiscuous reception of all IP multicasts from the if */ + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr = INADDR_ANY; + s = splnet(); + error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr); + splx(s); + if (error) + return error; + } + + s = splnet(); + /* define parameters for the tbf structure */ + vifp->v_tbf = v_tbf; + GET_TIME(vifp->v_tbf->tbf_last_pkt_t); + vifp->v_tbf->tbf_n_tok = 0; + vifp->v_tbf->tbf_q_len = 0; + vifp->v_tbf->tbf_max_q_len = MAXQSIZE; + vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; + + vifp->v_flags = vifcp->vifc_flags; + vifp->v_threshold = vifcp->vifc_threshold; + vifp->v_lcl_addr = vifcp->vifc_lcl_addr; + vifp->v_rmt_addr = vifcp->vifc_rmt_addr; + vifp->v_ifp = ifp; + /* scaling up here allows division by 1024 in critical code */ + vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000; + vifp->v_rsvp_on = 0; + vifp->v_rsvpd = NULL; + /* initialize per vif pkt counters */ + vifp->v_pkt_in = 0; + vifp->v_pkt_out = 0; + vifp->v_bytes_in = 0; + vifp->v_bytes_out = 0; + splx(s); + + /* Adjust numvifs up if the vifi is higher than numvifs */ + if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1; + + if (mrtdebug) + log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d\n", + vifcp->vifc_vifi, + ntohl(vifcp->vifc_lcl_addr.s_addr), + (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", + ntohl(vifcp->vifc_rmt_addr.s_addr), + vifcp->vifc_threshold, + vifcp->vifc_rate_limit); + + return 0; +} + +/* + * Delete a vif from the vif table + */ +static int +del_vif(vifi_t *vifip) +{ + register struct vif *vifp = viftable + *vifip; + register vifi_t vifi; + register struct mbuf *m; + struct ifnet *ifp; + struct ifreq ifr; + int s; + + if (*vifip >= numvifs) return EINVAL; + if (vifp->v_lcl_addr.s_addr == 0) return EADDRNOTAVAIL; + + s = splnet(); + + if (!(vifp->v_flags & VIFF_TUNNEL)) { + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr = INADDR_ANY; + ifp = vifp->v_ifp; + (*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr); + } + + if (vifp == last_encap_vif) { + last_encap_vif = 0; + last_encap_src = 0; + } + + /* + * Free packets queued at the interface + */ + while (vifp->v_tbf->tbf_q) { + m = vifp->v_tbf->tbf_q; + vifp->v_tbf->tbf_q = m->m_act; + m_freem(m); + } + + bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf))); + bzero((caddr_t)vifp, sizeof (*vifp)); + + /* Adjust numvifs down */ + for (vifi = numvifs; vifi > 0; vifi--) + if (viftable[vifi-1].v_lcl_addr.s_addr != 0) break; + numvifs = vifi; + + splx(s); + + if (mrtdebug) + log(LOG_DEBUG, "del_vif %d, numvifs %d\n", *vifip, numvifs); + + return 0; +} + +/* + * Add an mfc entry + */ +static int +add_mfc(struct mfcctl *mfccp) +{ + struct mfc *rt; + register struct mbuf *mb_rt; + u_long hash; + struct mbuf *mb_ntry; + struct rtdetq *rte; + register u_short nstl; + int s; + int i; + + MFCFIND(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr, rt); + + /* If an entry already exists, just update the fields */ + if (rt) { + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"add_mfc update o %x g %x p %x\n", + ntohl(mfccp->mfcc_origin.s_addr), + ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent); + + s = splnet(); + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + splx(s); + return 0; + } + + /* + * Find the entry for which the upcall was made and update + */ + s = splnet(); + hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); + for (mb_rt = mfctable[hash], nstl = 0; mb_rt; mb_rt = mb_rt->m_next) { + + rt = mtod(mb_rt, struct mfc *); + if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && + (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && + (mb_rt->m_act != NULL)) { + + if (nstl++) + log(LOG_ERR, "add_mfc %s o %x g %x p %x dbx %x\n", + "multiple kernel entries", + ntohl(mfccp->mfcc_origin.s_addr), + ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent, mb_rt->m_act); + + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"add_mfc o %x g %x p %x dbg %x\n", + ntohl(mfccp->mfcc_origin.s_addr), + ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent, mb_rt->m_act); + + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; + + rt->mfc_expire = 0; /* Don't clean this guy up */ + nexpire[hash]--; + + /* free packets Qed at the end of this entry */ + while (mb_rt->m_act) { + mb_ntry = mb_rt->m_act; + rte = mtod(mb_ntry, struct rtdetq *); +/* #ifdef RSVP_ISI */ + ip_mdq(rte->m, rte->ifp, rt, -1); +/* #endif */ + mb_rt->m_act = 
mb_ntry->m_act; + m_freem(rte->m); +#ifdef UPCALL_TIMING + collate(&(rte->t)); +#endif /* UPCALL_TIMING */ + m_free(mb_ntry); + } + } + } + + /* + * It is possible that an entry is being inserted without an upcall + */ + if (nstl == 0) { + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"add_mfc no upcall h %d o %x g %x p %x\n", + hash, ntohl(mfccp->mfcc_origin.s_addr), + ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent); + + for (mb_rt = mfctable[hash]; mb_rt; mb_rt = mb_rt->m_next) { + + rt = mtod(mb_rt, struct mfc *); + if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && + (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) { + + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; + if (rt->mfc_expire) + nexpire[hash]--; + rt->mfc_expire = 0; + } + } + if (mb_rt == NULL) { + /* no upcall, so make a new entry */ + MGET(mb_rt, M_DONTWAIT, MT_MRTABLE); + if (mb_rt == NULL) { + splx(s); + return ENOBUFS; + } + + rt = mtod(mb_rt, struct mfc *); + + /* insert new entry at head of hash chain */ + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; + rt->mfc_expire = 0; + + /* link into table */ + mb_rt->m_next = mfctable[hash]; + mfctable[hash] = mb_rt; + mb_rt->m_act = NULL; + } + } + splx(s); + return 0; +} + +#ifdef UPCALL_TIMING +/* + * collect delay statistics on the upcalls + */ +static void collate(struct timeval *t) +{ + register u_long d; + register struct timeval tp; + register u_long delta; + + GET_TIME(tp); + + if (TV_LT(*t, tp)) + { + TV_DELTA(tp, *t, delta); + + d = delta >> 10; + if (d > 50) + d = 50; + + ++upcall_data[d]; + } +} +#endif /* UPCALL_TIMING */ + +/* + * Delete an mfc entry + */ +static int +del_mfc(struct mfcctl *mfccp) +{ + struct in_addr origin; + struct in_addr mcastgrp; + struct mfc *rt; + struct mbuf *mb_rt; + struct mbuf **nptr; + u_long hash; + int s; + + origin = mfccp->mfcc_origin; + mcastgrp = mfccp->mfcc_mcastgrp; + hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); + + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"del_mfc orig %x mcastgrp %x\n", + ntohl(origin.s_addr), ntohl(mcastgrp.s_addr)); + + s = splnet(); + + nptr = &mfctable[hash]; + while ((mb_rt = *nptr) != NULL) { + rt = mtod(mb_rt, struct mfc *); + if (origin.s_addr == rt->mfc_origin.s_addr && + mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && + mb_rt->m_act == NULL) + break; + + nptr = &mb_rt->m_next; + } + if (mb_rt == NULL) { + splx(s); + return EADDRNOTAVAIL; + } + + MFREE(mb_rt, *nptr); + + splx(s); + + return 0; +} + +/* + * Send a message to mrouted on the multicast routing socket + */ +static int +socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) +{ + if (s) { + if (sbappendaddr(&s->so_rcv, + (struct sockaddr *)src, + mm, (struct mbuf *)0) != 0) { + sorwakeup(s); + return 0; + } + } + m_freem(mm); + return -1; +} + +/* + * IP multicast forwarding function. 
This function assumes that the packet + * pointed to by "ip" has arrived on (or is about to be sent to) the interface + * pointed to by "ifp", and the packet is to be relayed to other networks + * that have members of the packet's destination IP multicast group. + * + * The packet is returned unscathed to the caller, unless it is + * erroneous, in which case a non-zero return value tells the caller to + * discard it. + */ + +#define IP_HDR_LEN 20 /* # bytes of fixed IP header (excluding options) */ +#define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ + +static int +X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, + struct ip_moptions *imo) +{ + register struct mfc *rt; + register u_char *ipoptions; + static struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; + static int srctun = 0; + register struct mbuf *mm; + int s; + vifi_t vifi; + struct vif *vifp; + + if (mrtdebug & DEBUG_FORWARD) + log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %x\n", + ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp); + + if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 || + (ipoptions = (u_char *)(ip + 1))[1] != IPOPT_LSRR ) { + /* + * Packet arrived via a physical interface or + * an encapsulated tunnel. + */ + } else { + /* + * Packet arrived through a source-route tunnel. + * Source-route tunnels are no longer supported. + */ + if ((srctun++ % 1000) == 0) + log(LOG_ERR, "ip_mforward: received source-routed packet from %x\n", + ntohl(ip->ip_src.s_addr)); + + return 1; + } + + if ((imo) && ((vifi = imo->imo_multicast_vif) < numvifs)) { + if (ip->ip_ttl < 255) + ip->ip_ttl++; /* compensate for -1 in *_send routines */ + if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { + vifp = viftable + vifi; + printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s%d)\n", + ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), vifi, + (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "", + vifp->v_ifp->if_name, vifp->v_ifp->if_unit); + } + return (ip_mdq(m, ifp, NULL, vifi)); + } + if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { + printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n", + ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr)); + if(!imo) + printf("In fact, no options were specified at all\n"); + } + + /* + * Don't forward a packet with time-to-live of zero or one, + * or a packet destined to a local-only group. + */ + if (ip->ip_ttl <= 1 || + ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP) + return 0; + + /* + * Determine forwarding vifs from the forwarding cache table + */ + s = splnet(); + MFCFIND(ip->ip_src.s_addr, ip->ip_dst.s_addr, rt); + + /* Entry exists, so forward if necessary */ + if (rt != NULL) { + splx(s); + return (ip_mdq(m, ifp, rt, -1)); + } else { + /* + * If we don't have a route for packet's origin, + * Make a copy of the packet & + * send message to routing daemon + */ + + register struct mbuf *mb_rt; + register struct mbuf *mb_ntry; + register struct mbuf *mb0; + register struct rtdetq *rte; + register struct mbuf *rte_m; + register u_long hash; + register int npkts; + int hlen = ip->ip_hl << 2; +#ifdef UPCALL_TIMING + struct timeval tp; + + GET_TIME(tp); +#endif + + mrtstat.mrts_no_route++; + if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC)) + log(LOG_DEBUG, "ip_mforward: no rte s %x g %x\n", + ntohl(ip->ip_src.s_addr), + ntohl(ip->ip_dst.s_addr)); + + /* + * Allocate mbufs early so that we don't do extra work if we are + * just going to fail anyway. Make sure to pullup the header so + * that other people can't step on it. 
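+ * (mb0 below is the copy queued on an rtdetq entry; mm is a header-only copy
+ * delivered to mrouted as an IGMPMSG_NOCACHE upcall. Queued copies are later
+ * drained by add_mfc() when the daemon installs the route, or discarded by
+ * expire_upcalls() if it never answers.)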
+ */ + MGET(mb_ntry, M_DONTWAIT, MT_DATA); + if (mb_ntry == NULL) { + splx(s); + return ENOBUFS; + } + mb0 = m_copy(m, 0, M_COPYALL); + if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen)) + mb0 = m_pullup(mb0, hlen); + if (mb0 == NULL) { + m_free(mb_ntry); + splx(s); + return ENOBUFS; + } + + /* is there an upcall waiting for this packet? */ + hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr); + for (mb_rt = mfctable[hash]; mb_rt; mb_rt = mb_rt->m_next) { + rt = mtod(mb_rt, struct mfc *); + if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) && + (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) && + (mb_rt->m_act != NULL)) + break; + } + + if (mb_rt == NULL) { + int i; + struct igmpmsg *im; + + /* no upcall, so make a new entry */ + MGET(mb_rt, M_DONTWAIT, MT_MRTABLE); + if (mb_rt == NULL) { + m_free(mb_ntry); + m_freem(mb0); + splx(s); + return ENOBUFS; + } + /* Make a copy of the header to send to the user level process */ + mm = m_copy(mb0, 0, hlen); + if (mm == NULL) { + m_free(mb_ntry); + m_freem(mb0); + m_free(mb_rt); + splx(s); + return ENOBUFS; + } + + /* + * Send message to routing daemon to install + * a route into the kernel table + */ + k_igmpsrc.sin_addr = ip->ip_src; + + im = mtod(mm, struct igmpmsg *); + im->im_msgtype = IGMPMSG_NOCACHE; + im->im_mbz = 0; + + mrtstat.mrts_upcalls++; + + if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { + log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); + ++mrtstat.mrts_upq_sockfull; + m_free(mb_ntry); + m_freem(mb0); + m_free(mb_rt); + splx(s); + return ENOBUFS; + } + + rt = mtod(mb_rt, struct mfc *); + + /* insert new entry at head of hash chain */ + rt->mfc_origin.s_addr = ip->ip_src.s_addr; + rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; + rt->mfc_expire = UPCALL_EXPIRE; + nexpire[hash]++; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = 0; + rt->mfc_parent = -1; + + /* link into table */ + mb_rt->m_next = mfctable[hash]; + mfctable[hash] = mb_rt; + mb_rt->m_act = NULL; + + rte_m = mb_rt; + } else { + /* determine if q has overflowed */ + for (rte_m = mb_rt, npkts = 0; rte_m->m_act; rte_m = rte_m->m_act) + npkts++; + + if (npkts > MAX_UPQ) { + mrtstat.mrts_upq_ovflw++; + m_free(mb_ntry); + m_freem(mb0); + splx(s); + return 0; + } + } + + mb_ntry->m_act = NULL; + rte = mtod(mb_ntry, struct rtdetq *); + + rte->m = mb0; + rte->ifp = ifp; +#ifdef UPCALL_TIMING + rte->t = tp; +#endif + + /* Add this entry to the end of the queue */ + rte_m->m_act = mb_ntry; + + splx(s); + + return 0; + } +} + +#ifndef MROUTE_LKM +int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *) = X_ip_mforward; +#endif + +/* + * Clean up the cache entry if upcall is not serviced + */ +static void +expire_upcalls(void *unused) +{ + struct mbuf *mb_rt, *m, **nptr; + struct rtdetq *rte; + struct mfc *mfc; + int i; + int s; + + s = splnet(); + for (i = 0; i < MFCTBLSIZ; i++) { + if (nexpire[i] == 0) + continue; + nptr = &mfctable[i]; + for (mb_rt = *nptr; mb_rt != NULL; mb_rt = *nptr) { + mfc = mtod(mb_rt, struct mfc *); + + /* + * Skip real cache entries + * Make sure it wasn't marked to not expire (shouldn't happen) + * If it expires now + */ + if (mb_rt->m_act != NULL && + mfc->mfc_expire != 0 && + --mfc->mfc_expire == 0) { + if (mrtdebug & DEBUG_EXPIRE) + log(LOG_DEBUG, "expire_upcalls: expiring (%x %x)\n", + ntohl(mfc->mfc_origin.s_addr), + ntohl(mfc->mfc_mcastgrp.s_addr)); + /* + * drop all the packets + * free the mbuf with the pkt, if, timing info + */ + while (mb_rt->m_act) { + m = mb_rt->m_act; + mb_rt->m_act = m->m_act; 
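+		    /* m is now unlinked from the upcall queue: free the
+		     * stored packet it describes, then the rtdetq mbuf itself. */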
+ + rte = mtod(m, struct rtdetq *); + m_freem(rte->m); + m_free(m); + } + ++mrtstat.mrts_cache_cleanups; + nexpire[i]--; + + MFREE(mb_rt, *nptr); + } else { + nptr = &mb_rt->m_next; + } + } + } + splx(s); + timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); +} + +/* + * Packet forwarding routine once entry in the cache is made + */ +static int +ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, + vifi_t xmt_vif) +{ + register struct ip *ip = mtod(m, struct ip *); + register vifi_t vifi; + register struct vif *vifp; + register int plen = ntohs(ip->ip_len); + +/* + * Macro to send packet on vif. Since RSVP packets don't get counted on + * input, they shouldn't get counted on output, so statistics keeping is + * seperate. + */ +#define MC_SEND(ip,vifp,m) { \ + if ((vifp)->v_flags & VIFF_TUNNEL) \ + encap_send((ip), (vifp), (m)); \ + else \ + phyint_send((ip), (vifp), (m)); \ +} + + /* + * If xmt_vif is not -1, send on only the requested vif. + * + * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) + */ + if (xmt_vif < numvifs) { + MC_SEND(ip, viftable + xmt_vif, m); + return 1; + } + + /* + * Don't forward if it didn't arrive from the parent vif for its origin. + */ + vifi = rt->mfc_parent; + if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { + /* came in the wrong interface */ + if (mrtdebug & DEBUG_FORWARD) + log(LOG_DEBUG, "wrong if: ifp %x vifi %d vififp %x\n", + ifp, vifi, viftable[vifi].v_ifp); + ++mrtstat.mrts_wrong_if; + ++rt->mfc_wrong_if; + /* + * If we are doing PIM assert processing, and we are forwarding + * packets on this interface, and it is a broadcast medium + * interface (and not a tunnel), send a message to the routing daemon. + */ + if (pim_assert && rt->mfc_ttls[vifi] && + (ifp->if_flags & IFF_BROADCAST) && + !(viftable[vifi].v_flags & VIFF_TUNNEL)) { + struct sockaddr_in k_igmpsrc; + struct mbuf *mm; + struct igmpmsg *im; + int hlen = ip->ip_hl << 2; + struct timeval now; + register u_long delta; + + GET_TIME(now); + + TV_DELTA(rt->mfc_last_assert, now, delta); + + if (delta > ASSERT_MSG_TIME) { + mm = m_copy(m, 0, hlen); + if (mm && (M_HASCL(mm) || mm->m_len < hlen)) + mm = m_pullup(mm, hlen); + if (mm == NULL) { + return ENOBUFS; + } + + rt->mfc_last_assert = now; + + im = mtod(mm, struct igmpmsg *); + im->im_msgtype = IGMPMSG_WRONGVIF; + im->im_mbz = 0; + im->im_vif = vifi; + + k_igmpsrc.sin_addr = im->im_src; + + socket_send(ip_mrouter, mm, &k_igmpsrc); + } + } + return 0; + } + + /* If I sourced this packet, it counts as output, else it was input. */ + if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) { + viftable[vifi].v_pkt_out++; + viftable[vifi].v_bytes_out += plen; + } else { + viftable[vifi].v_pkt_in++; + viftable[vifi].v_bytes_in += plen; + } + rt->mfc_pkt_cnt++; + rt->mfc_byte_cnt += plen; + + /* + * For each vif, decide if a copy of the packet should be forwarded. + * Forward if: + * - the ttl exceeds the vif's threshold + * - there are group members downstream on interface + */ + for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) + if ((rt->mfc_ttls[vifi] > 0) && + (ip->ip_ttl > rt->mfc_ttls[vifi])) { + vifp->v_pkt_out++; + vifp->v_bytes_out += plen; + MC_SEND(ip, vifp, m); + } + + return 0; +} + +/* + * check if a vif number is legal/ok. 
This is used by ip_output, to export + * numvifs there, + */ +static int +X_legal_vif_num(int vif) +{ + if (vif >= 0 && vif < numvifs) + return(1); + else + return(0); +} + +#ifndef MROUTE_LKM +int (*legal_vif_num)(int) = X_legal_vif_num; +#endif + +/* + * Return the local address used by this vif + */ +static u_long +X_ip_mcast_src(int vifi) +{ + if (vifi >= 0 && vifi < numvifs) + return viftable[vifi].v_lcl_addr.s_addr; + else + return INADDR_ANY; +} + +#ifndef MROUTE_LKM +u_long (*ip_mcast_src)(int) = X_ip_mcast_src; +#endif + +static void +phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) +{ + register struct mbuf *mb_copy; + register int hlen = ip->ip_hl << 2; + + /* + * Make a new reference to the packet; make sure that + * the IP header is actually copied, not just referenced, + * so that ip_output() only scribbles on the copy. + */ + mb_copy = m_copy(m, 0, M_COPYALL); + if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) + mb_copy = m_pullup(mb_copy, hlen); + if (mb_copy == NULL) + return; + + if (vifp->v_rate_limit <= 0) + tbf_send_packet(vifp, mb_copy); + else + tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len); +} + +static void +encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m) +{ + register struct mbuf *mb_copy; + register struct ip *ip_copy; + register int i, len = ip->ip_len; + + /* + * copy the old packet & pullup it's IP header into the + * new mbuf so we can modify it. Try to fill the new + * mbuf since if we don't the ethernet driver will. + */ + MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER); + if (mb_copy == NULL) + return; + mb_copy->m_data += max_linkhdr; + mb_copy->m_len = sizeof(multicast_encap_iphdr); + + if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { + m_freem(mb_copy); + return; + } + i = MHLEN - M_LEADINGSPACE(mb_copy); + if (i > len) + i = len; + mb_copy = m_pullup(mb_copy, i); + if (mb_copy == NULL) + return; + mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr); + + /* + * fill in the encapsulating IP header. + */ + ip_copy = mtod(mb_copy, struct ip *); + *ip_copy = multicast_encap_iphdr; + ip_copy->ip_id = htons(ip_id++); + ip_copy->ip_len += len; + ip_copy->ip_src = vifp->v_lcl_addr; + ip_copy->ip_dst = vifp->v_rmt_addr; + + /* + * turn the encapsulated IP header back into a valid one. + */ + ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr)); + --ip->ip_ttl; + HTONS(ip->ip_len); + HTONS(ip->ip_off); + ip->ip_sum = 0; + mb_copy->m_data += sizeof(multicast_encap_iphdr); + ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); + mb_copy->m_data -= sizeof(multicast_encap_iphdr); + + if (vifp->v_rate_limit <= 0) + tbf_send_packet(vifp, mb_copy); + else + tbf_control(vifp, mb_copy, ip, ip_copy->ip_len); +} + +/* + * De-encapsulate a packet and feed it back through ip input (this + * routine is called whenever IP gets a packet with proto type + * ENCAP_PROTO and a local destination address). + */ +void +#ifdef MROUTE_LKM +X_ipip_input(struct mbuf *m, int iphlen) +#else +ipip_input(struct mbuf *m, int iphlen) +#endif +{ + struct ifnet *ifp = m->m_pkthdr.rcvif; + register struct ip *ip = mtod(m, struct ip *); + register int hlen = ip->ip_hl << 2; + register int s; + register struct ifqueue *ifq; + register struct vif *vifp; + + if (!have_encap_tunnel) { + rip_input(m, iphlen); + return; + } + /* + * dump the packet if it's not to a multicast destination or if + * we don't have an encapsulating tunnel with the source. 
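+ * (The lookup below tries the one-back cache first and otherwise scans
+ * viftable for a tunnel vif whose remote address matches the outer source.)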
+ * Note: This code assumes that the remote site IP address + * uniquely identifies the tunnel (i.e., that this site has + * at most one tunnel with the remote site). + */ + if (! IN_MULTICAST(ntohl(((struct ip *)((char *)ip + hlen))->ip_dst.s_addr))) { + ++mrtstat.mrts_bad_tunnel; + m_freem(m); + return; + } + if (ip->ip_src.s_addr != last_encap_src) { + register struct vif *vife; + + vifp = viftable; + vife = vifp + numvifs; + last_encap_src = ip->ip_src.s_addr; + last_encap_vif = 0; + for ( ; vifp < vife; ++vifp) + if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) { + if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT)) + == VIFF_TUNNEL) + last_encap_vif = vifp; + break; + } + } + if ((vifp = last_encap_vif) == 0) { + last_encap_src = 0; + mrtstat.mrts_cant_tunnel++; /*XXX*/ + m_freem(m); + if (mrtdebug) + log(LOG_DEBUG, "ip_mforward: no tunnel with %x\n", + ntohl(ip->ip_src.s_addr)); + return; + } + ifp = vifp->v_ifp; + + if (hlen > IP_HDR_LEN) + ip_stripoptions(m, (struct mbuf *) 0); + m->m_data += IP_HDR_LEN; + m->m_len -= IP_HDR_LEN; + m->m_pkthdr.len -= IP_HDR_LEN; + m->m_pkthdr.rcvif = ifp; + + ifq = &ipintrq; + s = splimp(); + if (IF_QFULL(ifq)) { + IF_DROP(ifq); + m_freem(m); + } else { + IF_ENQUEUE(ifq, m); + /* + * normally we would need a "schednetisr(NETISR_IP)" + * here, but we were called by ip_input and it is going + * to loop back & try to dequeue the packet we just + * queued as soon as we return, so we avoid the + * unnecessary software interrupt. + */ + } + splx(s); +}
+ +/* + * Token bucket filter module + */ + +static void +tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_long p_len) +{ + register struct tbf *t = vifp->v_tbf; + + if (p_len > MAX_BKT_SIZE) { + /* drop if packet is too large */ + mrtstat.mrts_pkt2large++; + m_freem(m); + return; + } + + tbf_update_tokens(vifp); + + /* if there are enough tokens, + * and the queue is empty, + * send this packet out + */ + + if (t->tbf_q_len == 0) { + /* queue empty, send packet if enough tokens */ + if (p_len <= t->tbf_n_tok) { + t->tbf_n_tok -= p_len; + tbf_send_packet(vifp, m); + } else { + /* queue packet and timeout till later */ + tbf_queue(vifp, m); + timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); + } + } else if (t->tbf_q_len < t->tbf_max_q_len) { + /* finite queue length, so queue pkts and process queue */ + tbf_queue(vifp, m); + tbf_process_q(vifp); + } else { + /* queue length too much, try to dq and queue and process */ + if (!tbf_dq_sel(vifp, ip)) { + mrtstat.mrts_q_overflow++; + m_freem(m); + return; + } else { + tbf_queue(vifp, m); + tbf_process_q(vifp); + } + } + return; +}
+ +/* + * adds a packet to the queue at the interface + */ +static void +tbf_queue(struct vif *vifp, struct mbuf *m) +{ + register int s = splnet(); + register struct tbf *t = vifp->v_tbf; + + if (t->tbf_t == NULL) { + /* Queue was empty */ + t->tbf_q = m; + } else { + /* Insert at tail */ + t->tbf_t->m_act = m; + } + + /* Set new tail pointer */ + t->tbf_t = m; + +#ifdef DIAGNOSTIC + /* Make sure we didn't get fed a bogus mbuf */ + if (m->m_act) + panic("tbf_queue: m_act"); +#endif + m->m_act = NULL; + + t->tbf_q_len++; + + splx(s); +}
+ + +/* + * processes the queue at the interface + */ +static void +tbf_process_q(struct vif *vifp) +{ + register struct mbuf *m; + register int len; + register int s = splnet(); + register struct tbf *t = vifp->v_tbf; + + /* loop through the queue at the interface and send as many packets + * as possible + */ + while (t->tbf_q_len > 0) { + m = t->tbf_q; + + len = mtod(m, struct ip 
*)->ip_len; + + /* determine if the packet can be sent */ + if (len <= t->tbf_n_tok) { + /* if so, + * reduce no of tokens, dequeue the packet, + * send the packet. + */ + t->tbf_n_tok -= len; + + t->tbf_q = m->m_act; + if (--t->tbf_q_len == 0) + t->tbf_t = NULL; + + m->m_act = NULL; + tbf_send_packet(vifp, m); + + } else break; + } + splx(s); +} + +static void +tbf_reprocess_q(void *xvifp) +{ + register struct vif *vifp = xvifp; + if (ip_mrouter == NULL) + return; + + tbf_update_tokens(vifp); + + tbf_process_q(vifp); + + if (vifp->v_tbf->tbf_q_len) + timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); +} + +/* function that will selectively discard a member of the queue + * based on the precedence value and the priority + */ +static int +tbf_dq_sel(struct vif *vifp, struct ip *ip) +{ + register int s = splnet(); + register u_int p; + register struct mbuf *m, *last; + register struct mbuf **np; + register struct tbf *t = vifp->v_tbf; + + p = priority(vifp, ip); + + np = &t->tbf_q; + last = NULL; + while ((m = *np) != NULL) { + if (p > priority(vifp, mtod(m, struct ip *))) { + *np = m->m_act; + /* If we're removing the last packet, fix the tail pointer */ + if (m == t->tbf_t) + t->tbf_t = last; + m_freem(m); + /* it's impossible for the queue to be empty, but + * we check anyway. */ + if (--t->tbf_q_len == 0) + t->tbf_t = NULL; + splx(s); + mrtstat.mrts_drop_sel++; + return(1); + } + np = &m->m_act; + last = m; + } + splx(s); + return(0); +} + +static void +tbf_send_packet(struct vif *vifp, struct mbuf *m) +{ + struct ip_moptions imo; + int error; + static struct route ro; + int s = splnet(); + + if (vifp->v_flags & VIFF_TUNNEL) { + /* If tunnel options */ + ip_output(m, (struct mbuf *)0, &vifp->v_route, + IP_FORWARDING, (struct ip_moptions *)0); + } else { + imo.imo_multicast_ifp = vifp->v_ifp; + imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; + imo.imo_multicast_loop = 1; + imo.imo_multicast_vif = -1; + + /* + * Re-entrancy should not be a problem here, because + * the packets that we send out and are looped back at us + * should get rejected because they appear to come from + * the loopback interface, thus preventing looping. + */ + error = ip_output(m, (struct mbuf *)0, &ro, + IP_FORWARDING, &imo); + + if (mrtdebug & DEBUG_XMIT) + log(LOG_DEBUG, "phyint_send on vif %d err %d\n", + vifp - viftable, error); + } + splx(s); +} + +/* determine the current time and then + * the elapsed time (between the last time and time now) + * in milliseconds & update the no. of tokens in the bucket + */ +static void +tbf_update_tokens(struct vif *vifp) +{ + struct timeval tp; + register u_long tm; + register int s = splnet(); + register struct tbf *t = vifp->v_tbf; + + GET_TIME(tp); + + TV_DELTA(tp, t->tbf_last_pkt_t, tm); + + /* + * This formula is actually + * "time in seconds" * "bytes/second". + * + * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) + * + * The (1000/1024) was introduced in add_vif to optimize + * this divide into a shift. 
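+ *
+ * A worked example (illustrative): with vifc_rate_limit = 500 (kbits/sec),
+ * add_vif stores v_rate_limit = 500 * 1024 / 1000 = 512. After tm = 20000
+ * usec the credit below is 20000 * 512 / 1024 / 8 = 1250 bytes, which is
+ * exactly 500 kbits/sec over 20 msec (10000 bits).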
+ */ + t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8; + t->tbf_last_pkt_t = tp; + + if (t->tbf_n_tok > MAX_BKT_SIZE) + t->tbf_n_tok = MAX_BKT_SIZE; + + splx(s); +} + +static int +priority(struct vif *vifp, struct ip *ip) +{ + register int prio; + + /* temporary hack; may add general packet classifier some day */ + + /* + * The UDP port space is divided up into four priority ranges: + * [0, 16384) : unclassified - lowest priority + * [16384, 32768) : audio - highest priority + * [32768, 49152) : whiteboard - medium priority + * [49152, 65536) : video - low priority + */ + if (ip->ip_p == IPPROTO_UDP) { + struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); + switch (ntohs(udp->uh_dport) & 0xc000) { + case 0x4000: + prio = 70; + break; + case 0x8000: + prio = 60; + break; + case 0xc000: + prio = 55; + break; + default: + prio = 50; + break; + } + if (tbfdebug > 1) + log(LOG_DEBUG, "port %x prio%d\n", ntohs(udp->uh_dport), prio); + } else { + prio = 50; + } + return prio; +} + +/* + * End of token bucket filter modifications + */ + +int +ip_rsvp_vif_init(struct socket *so, struct mbuf *m) +{ + int i; + register int s; + + if (rsvpdebug) + printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n", + so->so_type, so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + /* Check mbuf. */ + if (m == NULL || m->m_len != sizeof(int)) { + return EINVAL; + } + i = *(mtod(m, int *)); + + if (rsvpdebug) + printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n",i,rsvp_on); + + s = splnet(); + + /* Check vif. */ + if (!legal_vif_num(i)) { + splx(s); + return EADDRNOTAVAIL; + } + + /* Check if socket is available. */ + if (viftable[i].v_rsvpd != NULL) { + splx(s); + return EADDRINUSE; + } + + viftable[i].v_rsvpd = so; + /* This may seem silly, but we need to be sure we don't over-increment + * the RSVP counter, in case something slips up. + */ + if (!viftable[i].v_rsvp_on) { + viftable[i].v_rsvp_on = 1; + rsvp_on++; + } + + splx(s); + return 0; +} + +int +ip_rsvp_vif_done(struct socket *so, struct mbuf *m) +{ + int i; + register int s; + + if (rsvpdebug) + printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n", + so->so_type, so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + /* Check mbuf. */ + if (m == NULL || m->m_len != sizeof(int)) { + return EINVAL; + } + i = *(mtod(m, int *)); + + s = splnet(); + + /* Check vif. */ + if (!legal_vif_num(i)) { + splx(s); + return EADDRNOTAVAIL; + } + + if (rsvpdebug) + printf("ip_rsvp_vif_done: v_rsvpd = %p so = %p\n", + viftable[i].v_rsvpd, so); + + viftable[i].v_rsvpd = NULL; + /* This may seem silly, but we need to be sure we don't over-decrement + * the RSVP counter, in case something slips up. + */ + if (viftable[i].v_rsvp_on) { + viftable[i].v_rsvp_on = 0; + rsvp_on--; + } + + splx(s); + return 0; +} + +void +ip_rsvp_force_done(struct socket *so) +{ + int vifi; + register int s; + + /* Don't bother if it is not the right type of socket. */ + if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) + return; + + s = splnet(); + + /* The socket may be attached to more than one vif...this + * is perfectly legal. + */ + for (vifi = 0; vifi < numvifs; vifi++) { + if (viftable[vifi].v_rsvpd == so) { + viftable[vifi].v_rsvpd = NULL; + /* This may seem silly, but we need to be sure we don't + * over-decrement the RSVP counter, in case something slips up. 
+ */ + if (viftable[vifi].v_rsvp_on) { + viftable[vifi].v_rsvp_on = 0; + rsvp_on--; + } + } + } + + splx(s); + return; +} + +void +rsvp_input(struct mbuf *m, int iphlen) +{ + int vifi; + register struct ip *ip = mtod(m, struct ip *); + static struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET }; + register int s; + struct ifnet *ifp; + + if (rsvpdebug) + printf("rsvp_input: rsvp_on %d\n",rsvp_on); + + /* Can still get packets with rsvp_on = 0 if there is a local member + * of the group to which the RSVP packet is addressed. But in this + * case we want to throw the packet away. + */ + if (!rsvp_on) { + m_freem(m); + return; + } + + /* If the old-style non-vif-associated socket is set, then use + * it and ignore the new ones. + */ + if (ip_rsvpd != NULL) { + if (rsvpdebug) + printf("rsvp_input: Sending packet up old-style socket\n"); + rip_input(m, iphlen); + return; + } + + s = splnet(); + + if (rsvpdebug) + printf("rsvp_input: check vifs\n"); + +#ifdef DIAGNOSTIC + if (!(m->m_flags & M_PKTHDR)) + panic("rsvp_input no hdr"); +#endif + + ifp = m->m_pkthdr.rcvif; + /* Find which vif the packet arrived on. */ + for (vifi = 0; vifi < numvifs; vifi++) { + if (viftable[vifi].v_ifp == ifp) + break; + } + + if (vifi == numvifs) { + /* Can't find vif packet arrived on. Drop packet. */ + if (rsvpdebug) + printf("rsvp_input: Can't find vif for packet...dropping it.\n"); + m_freem(m); + splx(s); + return; + } + + if (rsvpdebug) + printf("rsvp_input: check socket\n"); + + if (viftable[vifi].v_rsvpd == NULL) { + /* drop packet, since there is no specific socket for this + * interface */ + if (rsvpdebug) + printf("rsvp_input: No socket defined for vif %d\n",vifi); + m_freem(m); + splx(s); + return; + } + rsvp_src.sin_addr = ip->ip_src; + + if (rsvpdebug && m) + printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n", + m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv))); + + if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) + if (rsvpdebug) + printf("rsvp_input: Failed to append to socket\n"); + else + if (rsvpdebug) + printf("rsvp_input: send packet up\n"); + + splx(s); +} + +#ifdef MROUTE_LKM +#include +#include +#include +#include + +MOD_MISC("ip_mroute_mod") + +static int +ip_mroute_mod_handle(struct lkm_table *lkmtp, int cmd) +{ + int i; + struct lkm_misc *args = lkmtp->private.lkm_misc; + int err = 0; + + switch(cmd) { + static int (*old_ip_mrouter_cmd)(); + static int (*old_ip_mrouter_done)(); + static int (*old_ip_mforward)(); + static int (*old_mrt_ioctl)(); + static void (*old_proto4_input)(); + static int (*old_legal_vif_num)(); + extern struct protosw inetsw[]; + + case LKM_E_LOAD: + if(lkmexists(lkmtp) || ip_mrtproto) + return(EEXIST); + old_ip_mrouter_cmd = ip_mrouter_cmd; + ip_mrouter_cmd = X_ip_mrouter_cmd; + old_ip_mrouter_done = ip_mrouter_done; + ip_mrouter_done = X_ip_mrouter_done; + old_ip_mforward = ip_mforward; + ip_mforward = X_ip_mforward; + old_mrt_ioctl = mrt_ioctl; + mrt_ioctl = X_mrt_ioctl; + old_proto4_input = inetsw[ip_protox[ENCAP_PROTO]].pr_input; + inetsw[ip_protox[ENCAP_PROTO]].pr_input = X_ipip_input; + old_legal_vif_num = legal_vif_num; + legal_vif_num = X_legal_vif_num; + ip_mrtproto = IGMP_DVMRP; + + printf("\nIP multicast routing loaded\n"); + break; + + case LKM_E_UNLOAD: + if (ip_mrouter) + return EINVAL; + + ip_mrouter_cmd = old_ip_mrouter_cmd; + ip_mrouter_done = old_ip_mrouter_done; + ip_mforward = old_ip_mforward; + mrt_ioctl = old_mrt_ioctl; + inetsw[ip_protox[ENCAP_PROTO]].pr_input = old_proto4_input; + legal_vif_num = old_legal_vif_num; + 
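+	/* All hooks now point back at the pre-load stubs; clearing
+	 * ip_mrtproto below lets the EEXIST check in LKM_E_LOAD pass again. */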
ip_mrtproto = 0; + break; + + default: + err = EINVAL; + break; + } + + return(err); +} + +int +ip_mroute_mod(struct lkm_table *lkmtp, int cmd, int ver) { + DISPATCH(lkmtp, cmd, ver, ip_mroute_mod_handle, ip_mroute_mod_handle, + nosys); +} + +#endif /* MROUTE_LKM */ +#endif /* MROUTING */ diff --git a/netinet/ip_mroute.h b/netinet/ip_mroute.h new file mode 100644 index 0000000..9a28574 --- /dev/null +++ b/netinet/ip_mroute.h @@ -0,0 +1,271 @@ +/* + * Copyright (c) 1989 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_mroute.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/netinet/ip_mroute.h,v 1.31 2007/02/08 23:05:08 bms Exp $ + */ + + +#ifndef _NETINET_IP_MROUTE_H_ +#define _NETINET_IP_MROUTE_H_ + +#include /* struct in_addr */ +#include /* struct route */ + +/* + * Definitions for IP multicast forwarding. + * + * Written by David Waitzman, BBN Labs, August 1988. + * Modified by Steve Deering, Stanford, February 1989. + * Modified by Ajit Thyagarajan, PARC, August 1993. + * Modified by Ajit Thyagarajan, PARC, August 1994. + * Modified by Ahmed Helmy, SGI, June 1996. + * Modified by Pavlin Radoslavov, ICSI, October 2002. + * + * MROUTING Revision: 3.3.1.3 + * and PIM-SMv2 and PIM-DM support, advanced API support, + * bandwidth metering and signaling. + */ + + +/* + * Multicast Routing set/getsockopt commands. 
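+ *
+ * MRT_ADD_VIF takes a struct vifctl and MRT_ADD_MFC/MRT_DEL_MFC a struct
+ * mfcctl (both defined below); MRT_ASSERT takes an int flag. MRT_VERSION and
+ * MRT_ASSERT can also be read back with getsockopt(). Only MRT_INIT through
+ * MRT_ASSERT are handled by the switch in ip_mroute.c; the later PIM/API/
+ * bandwidth-upcall codes defined below are answered with EOPNOTSUPP.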
+ */ +#define MRT_INIT 100 /* initialize forwarder */ +#define MRT_DONE 101 /* shut down forwarder */ +#define MRT_ADD_VIF 102 /* create virtual interface */ +#define MRT_DEL_VIF 103 /* delete virtual interface */ +#define MRT_ADD_MFC 104 /* insert forwarding cache entry */ +#define MRT_DEL_MFC 105 /* delete forwarding cache entry */ +#define MRT_VERSION 106 /* get kernel version number */ +#define MRT_ASSERT 107 /* enable assert processing */ +#define MRT_PIM MRT_ASSERT /* enable PIM processing */ +#define MRT_API_SUPPORT 109 /* supported MRT API */ +#define MRT_API_CONFIG 110 /* config MRT API */ +#define MRT_ADD_BW_UPCALL 111 /* create bandwidth monitor */ +#define MRT_DEL_BW_UPCALL 112 /* delete bandwidth monitor */ + + +#define GET_TIME(t) microtime(&t) + +/* + * Types and macros for handling bitmaps with one bit per virtual interface. + */ +#define MAXVIFS 32 +typedef u_long vifbitmap_t; +typedef u_short vifi_t; /* type of a vif index */ +#define ALL_VIFS (vifi_t)-1 + +#define VIFM_SET(n, m) ((m) |= (1 << (n))) +#define VIFM_CLR(n, m) ((m) &= ~(1 << (n))) +#define VIFM_ISSET(n, m) ((m) & (1 << (n))) +#define VIFM_CLRALL(m) ((m) = 0x00000000) +#define VIFM_COPY(mfrom, mto) ((mto) = (mfrom)) +#define VIFM_SAME(m1, m2) ((m1) == (m2)) + + +/* + * Argument structure for MRT_ADD_VIF. + * (MRT_DEL_VIF takes a single vifi_t argument.) + */ +struct vifctl { + vifi_t vifc_vifi; /* the index of the vif to be added */ + u_char vifc_flags; /* VIFF_ flags defined below */ + u_char vifc_threshold; /* min ttl required to forward on vif */ + u_int vifc_rate_limit; /* max rate */ + struct in_addr vifc_lcl_addr; /* local interface address */ + struct in_addr vifc_rmt_addr; /* remote address (tunnels only) */ +}; + +#define VIFF_TUNNEL 0x1 /* vif represents a tunnel end-point */ +#define VIFF_SRCRT 0x2 /* tunnel uses IP source routing */ + +/* + * Argument structure for MRT_ADD_MFC and MRT_DEL_MFC + * (mfcc_tos to be added at a future point) + */ +struct mfcctl { + struct in_addr mfcc_origin; /* ip origin of mcasts */ + struct in_addr mfcc_mcastgrp; /* multicast group associated*/ + vifi_t mfcc_parent; /* incoming vif */ + u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */ +}; + +/* + * The kernel's multicast routing statistics. + */ +struct mrtstat { + u_long mrts_mfc_lookups; /* # forw. cache hash table hits */ + u_long mrts_mfc_misses; /* # forw. 
cache hash table misses */ + u_long mrts_upcalls; /* # calls to mrouted */ + u_long mrts_no_route; /* no route for packet's origin */ + u_long mrts_bad_tunnel; /* malformed tunnel options */ + u_long mrts_cant_tunnel; /* no room for tunnel options */ + u_long mrts_wrong_if; /* arrived on wrong interface */ + u_long mrts_upq_ovflw; /* upcall Q overflow */ + u_long mrts_cache_cleanups; /* # entries with no upcalls */ + u_long mrts_drop_sel; /* pkts dropped selectively */ + u_long mrts_q_overflow; /* pkts dropped - Q overflow */ + u_long mrts_pkt2large; /* pkts dropped - size > BKT SIZE */ + u_long mrts_upq_sockfull; /* upcalls dropped - socket full */ +}; + +/* + * Argument structure used by mrouted to get src-grp pkt counts + */ +struct sioc_sg_req { + struct in_addr src; + struct in_addr grp; + u_long pktcnt; + u_long bytecnt; + u_long wrong_if; +}; + +/* + * Argument structure used by mrouted to get vif pkt counts + */ +struct sioc_vif_req { + vifi_t vifi; /* vif number */ + u_long icount; /* Input packet count on vif */ + u_long ocount; /* Output packet count on vif */ + u_long ibytes; /* Input byte count on vif */ + u_long obytes; /* Output byte count on vif */ +}; + + +/* + * The kernel's virtual-interface structure. + */ +struct vif { + u_char v_flags; /* VIFF_ flags defined above */ + u_char v_threshold; /* min ttl required to forward on vif*/ + u_int v_rate_limit; /* max rate */ + struct tbf *v_tbf; /* token bucket structure at intf. */ + struct in_addr v_lcl_addr; /* local interface address */ + struct in_addr v_rmt_addr; /* remote address (tunnels only) */ + struct ifnet *v_ifp; /* pointer to interface */ + u_long v_pkt_in; /* # pkts in on interface */ + u_long v_pkt_out; /* # pkts out on interface */ + u_long v_bytes_in; /* # bytes in on interface */ + u_long v_bytes_out; /* # bytes out on interface */ + struct route v_route; /* cached route if this is a tunnel */ + u_int v_rsvp_on; /* RSVP listening on this vif */ + struct socket *v_rsvpd; /* RSVP daemon socket */ +}; + +/* + * The kernel's multicast forwarding cache entry structure + * (A field for the type of service (mfc_tos) is to be added + * at a future point) + */ +struct mfc { + struct in_addr mfc_origin; /* IP origin of mcasts */ + struct in_addr mfc_mcastgrp; /* multicast group associated*/ + vifi_t mfc_parent; /* incoming vif */ + u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */ + u_long mfc_pkt_cnt; /* pkt count for src-grp */ + u_long mfc_byte_cnt; /* byte count for src-grp */ + u_long mfc_wrong_if; /* wrong if for src-grp */ + int mfc_expire; /* time to clean entry up */ + struct timeval mfc_last_assert; /* last time I sent an assert*/ +}; + +/* + * Struct used to communicate from kernel to multicast router + * note the convenient similarity to an IP packet + */ +struct igmpmsg { + u_long unused1; + u_long unused2; + u_char im_msgtype; /* what type of message */ +#define IGMPMSG_NOCACHE 1 +#define IGMPMSG_WRONGVIF 2 + u_char im_mbz; /* must be zero */ + u_char im_vif; /* vif rec'd on */ + u_char unused3; + struct in_addr im_src, im_dst; +}; + +/* + * Argument structure used for pkt info. 
while upcall is made + */ +struct rtdetq { + struct mbuf *m; /* A copy of the packet */ + struct ifnet *ifp; /* Interface pkt came in on */ + vifi_t xmt_vif; /* Saved copy of imo_multicast_vif */ +#ifdef UPCALL_TIMING + struct timeval t; /* Timestamp */ +#endif /* UPCALL_TIMING */ +}; + +#define MFCTBLSIZ 256 +#if (MFCTBLSIZ & (MFCTBLSIZ - 1)) == 0 /* from sys:route.h */ +#define MFCHASHMOD(h) ((h) & (MFCTBLSIZ - 1)) +#else +#define MFCHASHMOD(h) ((h) % MFCTBLSIZ) +#endif + +#define MAX_UPQ 4 /* max. no of pkts in upcall Q */ + +/* + * Token Bucket filter code + */ +#define MAX_BKT_SIZE 10000 /* 10K bytes size */ +#define MAXQSIZE 10 /* max # of pkts in queue */ + +/* + * the token bucket filter at each vif + */ +struct tbf +{ + struct timeval tbf_last_pkt_t; /* arr. time of last pkt */ + u_long tbf_n_tok; /* no of tokens in bucket */ + u_long tbf_q_len; /* length of queue at this vif */ + u_long tbf_max_q_len; /* max. queue length */ + struct mbuf *tbf_q; /* Packet queue */ + struct mbuf *tbf_t; /* tail-insertion pointer */ +}; + +#ifdef _KERNEL + +extern int (*ip_mrouter_set)(int, struct socket *, struct mbuf *); +extern int (*ip_mrouter_get)(int, struct socket *, struct mbuf **); +extern int (*ip_mrouter_done)(void); +#ifdef MROUTING +extern int (*mrt_ioctl)(int, caddr_t); +#else +extern int (*mrt_ioctl)(int, caddr_t, struct proc *); +#endif + +#endif /* _KERNEL */ + +#endif /* _NETINET_IP_MROUTE_H_ */ diff --git a/netinet/ip_output.c b/netinet/ip_output.c new file mode 100644 index 0000000..5b1c497 --- /dev/null +++ b/netinet/ip_output.c @@ -0,0 +1,1336 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 + * $FreeBSD: src/sys/netinet/ip_output.c,v 1.271 2007/03/23 09:43:36 bms Exp $ + */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#define _IP_VHL + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#if !defined(COMPAT_IPFW) || COMPAT_IPFW == 1 +#undef COMPAT_IPFW +#define COMPAT_IPFW 1 +#else +#undef COMPAT_IPFW +#endif + +u_short ip_id; + +static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); +static void ip_mloopback + (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); +static int ip_getmoptions + (int, struct ip_moptions *, struct mbuf **); +static int ip_optcopy(struct ip *, struct ip *); +static int ip_pcbopts(struct mbuf **, struct mbuf *); +static int ip_setmoptions + (int, struct ip_moptions **, struct mbuf *); + +extern struct protosw inetsw[]; + +/* + * IP output. The packet in mbuf chain m contains a skeletal IP + * header (with len, off, ttl, proto, tos, src, dst). + * The mbuf chain containing the packet will be freed. + * The mbuf opt, if present, will not be freed. + */ +int +ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags, + struct ip_moptions *imo) +{ + struct ip *ip, *mhip; + struct ifnet *ifp; + struct mbuf *m = m0; + int hlen = sizeof (struct ip); + int len = 0, off, error = 0; + struct sockaddr_in *dst; + struct in_ifaddr *ia; + int isbroadcast; + +#ifdef DIAGNOSTIC + if ((m->m_flags & M_PKTHDR) == 0) + panic("ip_output no HDR"); + if (!ro) + panic("ip_output no route, proto = %d", + mtod(m, struct ip *)->ip_p); +#endif + if (opt) { + m = ip_insertoptions(m, opt, &len); + hlen = len; + } + ip = mtod(m, struct ip *); + /* + * Fill in IP header. + */ + if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { +#ifdef _IP_VHL + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2); +#else + ip->ip_v = IPVERSION; + ip->ip_hl = hlen >> 2; +#endif + ip->ip_off &= IP_DF; + ip->ip_id = htons(ip_id++); + ipstat.ips_localout++; + } else { +#ifdef _IP_VHL + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#else + hlen = ip->ip_hl << 2; +#endif + } + + dst = (struct sockaddr_in *)&ro->ro_dst; + /* + * If there is a cached route, + * check that it is to the same destination + * and is still up. If not, free it and try again. + */ + if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + } + if (ro->ro_rt == NULL) { + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = ip->ip_dst; + } + /* + * If routing to interface only, + * short circuit routing lookup. + */ +#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) +#define sintosa(sin) ((struct sockaddr *)(sin)) + if (flags & IP_ROUTETOIF) { + if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 && + (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { + ipstat.ips_noroute++; + error = ENETUNREACH; + goto bad; + } + ifp = ia->ia_ifp; + ip->ip_ttl = 1; + isbroadcast = in_broadcast(dst->sin_addr, ifp); + } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && + imo != NULL && imo->imo_multicast_ifp != NULL) { + /* + * Bypass the normal routing lookup for multicast + * packets if the interface is specified. 
+ */ + ifp = imo->imo_multicast_ifp; + IFP_TO_IA(ifp, ia); + isbroadcast = 0; /* fool gcc */ + } else { + /* + * If this is the case, we probably don't want to allocate + * a protocol-cloned route since we didn't get one from the + * ULP. This lets TCP do its thing, while not burdening + * forwarding or ICMP with the overhead of cloning a route. + * Of course, we still want to do any cloning requested by + * the link layer, as this is probably required in all cases + * for correct operation (as it is for ARP). + */ + if (ro->ro_rt == 0) + rtalloc_ign(ro, RTF_PRCLONING); + if (ro->ro_rt == 0) { + ipstat.ips_noroute++; + error = EHOSTUNREACH; + goto bad; + } + ia = ifatoia(ro->ro_rt->rt_ifa); + ifp = ro->ro_rt->rt_ifp; + ro->ro_rt->rt_use++; + if (ro->ro_rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; + if (ro->ro_rt->rt_flags & RTF_HOST) + isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); + else + isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct in_multi *inm; + + m->m_flags |= M_MCAST; + /* + * IP destination address is multicast. Make sure "dst" + * still points to the address in "ro". (It may have been + * changed to point to a gateway address, above.) + */ + dst = (struct sockaddr_in *)&ro->ro_dst; + /* + * See if the caller provided any multicast options + */ + if (imo != NULL) { + ip->ip_ttl = imo->imo_multicast_ttl; + if (imo->imo_multicast_ifp != NULL) + ifp = imo->imo_multicast_ifp; + if (imo->imo_multicast_vif != -1) + ip->ip_src.s_addr = + ip_mcast_src(imo->imo_multicast_vif); + } else + ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + /* + * Confirm that the outgoing interface supports multicast. + */ + if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { + if ((ifp->if_flags & IFF_MULTICAST) == 0) { + ipstat.ips_noroute++; + error = ENETUNREACH; + goto bad; + } + } + /* + * If source address not specified yet, use address + * of outgoing interface. + */ + if (ip->ip_src.s_addr == INADDR_ANY) { + register struct in_ifaddr *ia; + + for (ia = in_ifaddr; ia; ia = ia->ia_next) + if (ia->ia_ifp == ifp) { + ip->ip_src = IA_SIN(ia)->sin_addr; + break; + } + } + + IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); + if (inm != NULL && + (imo == NULL || imo->imo_multicast_loop)) { + /* + * If we belong to the destination multicast group + * on the outgoing interface, and the caller did not + * forbid loopback, loop back a copy. + */ + ip_mloopback(ifp, m, dst, hlen); + } + else { + /* + * If we are acting as a multicast router, perform + * multicast forwarding as if the packet had just + * arrived on the interface to which we are about + * to send. The multicast forwarding function + * recursively calls this function, using the + * IP_FORWARDING flag to prevent infinite recursion. + * + * Multicasts that are looped back by ip_mloopback(), + * above, will be forwarded by the ip_input() routine, + * if necessary. + */ + if (ip_mrouter && (flags & IP_FORWARDING) == 0) { + /* + * Check if rsvp daemon is running. If not, don't + * set ip_moptions. This ensures that the packet + * is multicast and not just sent down one link + * as prescribed by rsvpd. + */ + if (!rsvp_on) + imo = NULL; + if (ip_mforward(ip, ifp, m, imo) != 0) { + m_freem(m); + goto done; + } + } + } + + /* + * Multicasts with a time-to-live of zero may be looped- + * back, above, but must not be transmitted on a network. 
+ * Also, multicasts addressed to the loopback interface + * are not sent -- the above call to ip_mloopback() will + * loop back a copy if this host actually belongs to the + * destination group on the loopback interface. + */ + if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { + m_freem(m); + goto done; + } + + goto sendit; + } +#ifndef notdef + /* + * If source address not specified yet, use address + * of outgoing interface. + */ + if (ip->ip_src.s_addr == INADDR_ANY) + ip->ip_src = IA_SIN(ia)->sin_addr; +#endif + /* + * Verify that we have any chance at all of being able to queue + * the packet or packet fragments + */ + if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= + ifp->if_snd.ifq_maxlen) { + error = ENOBUFS; + goto bad; + } + + /* + * Look for broadcast address and + * verify user is allowed to send + * such a packet. + */ + if (isbroadcast) { + if ((ifp->if_flags & IFF_BROADCAST) == 0) { + error = EADDRNOTAVAIL; + goto bad; + } + if ((flags & IP_ALLOWBROADCAST) == 0) { + error = EACCES; + goto bad; + } + /* don't allow broadcast messages to be fragmented */ + if (ip->ip_len > ifp->if_mtu) { + error = EMSGSIZE; + goto bad; + } + m->m_flags |= M_BCAST; + } else { + m->m_flags &= ~M_BCAST; + } + +sendit: + /* + * IpHack's section. + * - Xlate: translate packet's addr/port (NAT). + * - Firewall: deny/allow/etc. + * - Wrap: fake packet's addr/port + * - Encapsulate: put it in another IP and send out. + */ + +#ifdef COMPAT_IPFW + if (ip_nat_ptr && !(*ip_nat_ptr)(&ip, &m, ifp, IP_NAT_OUT)) { + error = EACCES; + goto done; + } + + /* + * Check with the firewall... + */ + if (ip_fw_chk_ptr) { +#ifdef IPDIVERT + ip_divert_port = (*ip_fw_chk_ptr)(&ip, + hlen, ifp, ip_divert_ignore, &m); + ip_divert_ignore = 0; + if (ip_divert_port) { /* Divert packet */ + (*inetsw[ip_protox[IPPROTO_DIVERT]].pr_input)(m, 0); + goto done; + } +#else + /* If ipfw says divert, we have to just drop packet */ + if ((*ip_fw_chk_ptr)(&ip, hlen, ifp, 0, &m)) { + m_freem(m); + goto done; + } +#endif + if (!m) { + error = EACCES; + goto done; + } + } +#endif /* COMPAT_IPFW */ + + /* + * If small enough for interface, or the interface will take + * care of the fragmentation for us, we can just send directly. + */ + if ((u_short)ip->ip_len <= ifp->if_mtu) { + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; +#ifdef _IP_VHL + if (ip->ip_vhl == IP_VHL_BORING) { +#else + if ((ip->ip_hl == 5) && (ip->ip_v = IPVERSION)) { +#endif + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(m, hlen); + } + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro->ro_rt); + goto done; + } + /* + * Too large for interface; fragment if possible. + * Must be able to put at least 8 bytes per fragment. + */ + if (ip->ip_off & IP_DF) { + error = EMSGSIZE; + /* + * This case can happen if the user changed the MTU + * of an interface after enabling IP on it. Because + * most netifs don't keep track of routes pointing to + * them, there is no way for one to update all its + * routes when the MTU is changed. 
+ */ + if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) + && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) + && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { + ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + } + ipstat.ips_cantfrag++; + goto bad; + } + len = (ifp->if_mtu - hlen) &~ 7; + if (len < 8) { + error = EMSGSIZE; + goto bad; + } + + { + int mhlen, firstlen = len; + struct mbuf **mnext = &m->m_nextpkt; + + /* + * Loop through length of segment after first fragment, + * make new header and copy data of each part and link onto chain. + */ + m0 = m; + mhlen = sizeof (struct ip); + for (off = hlen + len; off < (u_short)ip->ip_len; off += len) { + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == 0) { + error = ENOBUFS; + ipstat.ips_odropped++; + goto sendorfree; + } + m->m_flags |= (m0->m_flags & M_MCAST); + m->m_data += max_linkhdr; + mhip = mtod(m, struct ip *); + *mhip = *ip; + if (hlen > sizeof (struct ip)) { + mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); +#ifdef _IP_VHL + mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2); +#else + mhip->ip_v = IPVERSION; + mhip->ip_hl = mhlen >> 2; +#endif + } + m->m_len = mhlen; + mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF); + if (ip->ip_off & IP_MF) + mhip->ip_off |= IP_MF; + if (off + len >= (u_short)ip->ip_len) + len = (u_short)ip->ip_len - off; + else + mhip->ip_off |= IP_MF; + mhip->ip_len = htons((u_short)(len + mhlen)); + m->m_next = m_copy(m0, off, len); + if (m->m_next == 0) { + (void) m_free(m); + error = ENOBUFS; /* ??? */ + ipstat.ips_odropped++; + goto sendorfree; + } + m->m_pkthdr.len = mhlen + len; + m->m_pkthdr.rcvif = NULL; + mhip->ip_off = htons(mhip->ip_off); + mhip->ip_sum = 0; +#ifdef _IP_VHL + if (mhip->ip_vhl == IP_VHL_BORING) { +#else + if ((mhip->ip_hl == 5) && (mhip->ip_v == IPVERSION) ) { +#endif + mhip->ip_sum = in_cksum_hdr(mhip); + } else { + mhip->ip_sum = in_cksum(m, mhlen); + } + *mnext = m; + mnext = &m->m_nextpkt; + ipstat.ips_ofragments++; + } + /* + * Update first fragment by trimming what's been copied out + * and updating header, then send each fragment (in order). + */ + m = m0; + m_adj(m, hlen + firstlen - (u_short)ip->ip_len); + m->m_pkthdr.len = hlen + firstlen; + ip->ip_len = htons((u_short)m->m_pkthdr.len); + ip->ip_off |= IP_MF; + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; +#ifdef _IP_VHL + if (ip->ip_vhl == IP_VHL_BORING) { +#else + if ((ip->ip_hl == 5) && (ip->ip_v == IPVERSION) ) { +#endif + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(m, hlen); + } +sendorfree: + for (m = m0; m; m = m0) { + m0 = m->m_nextpkt; + m->m_nextpkt = 0; + if (error == 0) + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro->ro_rt); + else + m_freem(m); + } + + if (error == 0) + ipstat.ips_fragmented++; + } +done: + return (error); +bad: + m_freem(m0); + goto done; +} + +/* + * Insert IP options into preformed packet. + * Adjust IP destination as required for IP source routing, + * as indicated by a non-zero in_addr at the start of the options. + * + * XXX This routine assumes that the packet has no options in place. 
+ */ +static struct mbuf * +ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) +{ + register struct ipoption *p = mtod(opt, struct ipoption *); + struct mbuf *n; + register struct ip *ip = mtod(m, struct ip *); + uint32_t optlen; + + optlen = opt->m_len - sizeof(p->ipopt_dst); + if (optlen + ip->ip_len > IP_MAXPACKET) + return (m); /* XXX should fail */ + if (p->ipopt_dst.s_addr) + ip->ip_dst = p->ipopt_dst; + if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { + MGETHDR(n, M_DONTWAIT, MT_HEADER); + if (n == 0) + return (m); + n->m_pkthdr.len = m->m_pkthdr.len + optlen; + m->m_len -= sizeof(struct ip); + m->m_data += sizeof(struct ip); + n->m_next = m; + m = n; + m->m_len = optlen + sizeof(struct ip); + m->m_data += max_linkhdr; + (void)memcpy(mtod(m, void *), ip, sizeof(struct ip)); + } else { + m->m_data -= optlen; + m->m_len += optlen; + m->m_pkthdr.len += optlen; + ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + } + ip = mtod(m, struct ip *); + bcopy(p->ipopt_list, ip + 1, optlen); + *phlen = sizeof(struct ip) + optlen; +#ifdef _IP_VHL + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2); +#else + ip->ip_v = IPVERSION; + ip->ip_hl = *phlen >> 2; +#endif + ip->ip_len += optlen; + return (m); +} + +/* + * Copy options from ip to jp, + * omitting those not copied during fragmentation. + */ +static int +ip_optcopy(struct ip *ip, struct ip *jp) +{ + register u_char *cp, *dp; + int opt, optlen, cnt; + + cp = (u_char *)(ip + 1); + dp = (u_char *)(jp + 1); +#ifdef _IP_VHL + cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); +#else + cnt = (ip->ip_hl << 2) - sizeof (struct ip); +#endif + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) { + /* Preserve for IP mcast tunnel's LSRR alignment. */ + *dp++ = IPOPT_NOP; + optlen = 1; + continue; + } else + optlen = cp[IPOPT_OLEN]; + /* bogus lengths should have been caught by ip_dooptions */ + if (optlen > cnt) + optlen = cnt; + if (IPOPT_COPIED(opt)) { + bcopy(cp, dp, optlen); + dp += optlen; + } + } + for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) + *dp++ = IPOPT_EOL; + return (optlen); +} + +/* + * IP socket option processing. 
+ */ +int +ip_ctloutput(int op, struct socket *so, int level, int optname, + struct mbuf **mp) +{ + struct inpcb *inp = sotoinpcb(so); + register struct mbuf *m = *mp; + register int optval = 0; + int error = 0; + + if (level != IPPROTO_IP) { + error = EINVAL; + if (op == PRCO_SETOPT && *mp) + (void) m_free(*mp); + } else switch (op) { + + case PRCO_SETOPT: + switch (optname) { + case IP_OPTIONS: +#ifdef notyet + case IP_RETOPTS: + return (ip_pcbopts(optname, &inp->inp_options, m)); +#else + return (ip_pcbopts(&inp->inp_options, m)); +#endif + + case IP_TOS: + case IP_TTL: + case IP_RECVOPTS: + case IP_RECVRETOPTS: + case IP_RECVDSTADDR: + case IP_RECVIF: + if (m == 0 || m->m_len != sizeof(int)) + error = EINVAL; + else { + optval = *mtod(m, int *); + switch (optname) { + + case IP_TOS: + inp->inp_ip_tos = optval; + break; + + case IP_TTL: + inp->inp_ip_ttl = optval; + break; +#define OPTSET(bit) \ + if (optval) \ + inp->inp_flags |= bit; \ + else \ + inp->inp_flags &= ~bit; + + case IP_RECVOPTS: + OPTSET(INP_RECVOPTS); + break; + + case IP_RECVRETOPTS: + OPTSET(INP_RECVRETOPTS); + break; + + case IP_RECVDSTADDR: + OPTSET(INP_RECVDSTADDR); + break; + + case IP_RECVIF: + OPTSET(INP_RECVIF); + break; + } + } + break; +#undef OPTSET + + case IP_MULTICAST_IF: + case IP_MULTICAST_VIF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + error = ip_setmoptions(optname, &inp->inp_moptions, m); + break; + + case IP_PORTRANGE: + if (m == 0 || m->m_len != sizeof(int)) + error = EINVAL; + else { + optval = *mtod(m, int *); + + switch (optval) { + + case IP_PORTRANGE_DEFAULT: + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags &= ~(INP_HIGHPORT); + break; + + case IP_PORTRANGE_HIGH: + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags |= INP_HIGHPORT; + break; + + case IP_PORTRANGE_LOW: + inp->inp_flags &= ~(INP_HIGHPORT); + inp->inp_flags |= INP_LOWPORT; + break; + + default: + error = EINVAL; + break; + } + } + break; + + default: + error = ENOPROTOOPT; + break; + } + if (m) + (void)m_free(m); + break; + + case PRCO_GETOPT: + switch (optname) { + case IP_OPTIONS: + case IP_RETOPTS: + *mp = m = m_get(M_WAIT, MT_SOOPTS); + if (inp->inp_options) { + m->m_len = inp->inp_options->m_len; + bcopy(mtod(inp->inp_options, void *), + mtod(m, void *), m->m_len); + } else + m->m_len = 0; + break; + + case IP_TOS: + case IP_TTL: + case IP_RECVOPTS: + case IP_RECVRETOPTS: + case IP_RECVDSTADDR: + case IP_RECVIF: + *mp = m = m_get(M_WAIT, MT_SOOPTS); + m->m_len = sizeof(int); + switch (optname) { + + case IP_TOS: + optval = inp->inp_ip_tos; + break; + + case IP_TTL: + optval = inp->inp_ip_ttl; + break; + +#define OPTBIT(bit) (inp->inp_flags & bit ? 
1 : 0) + + case IP_RECVOPTS: + optval = OPTBIT(INP_RECVOPTS); + break; + + case IP_RECVRETOPTS: + optval = OPTBIT(INP_RECVRETOPTS); + break; + + case IP_RECVDSTADDR: + optval = OPTBIT(INP_RECVDSTADDR); + break; + + case IP_RECVIF: + optval = OPTBIT(INP_RECVIF); + break; + } + *mtod(m, int *) = optval; + break; + + case IP_MULTICAST_IF: + case IP_MULTICAST_VIF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + error = ip_getmoptions(optname, inp->inp_moptions, mp); + break; + + case IP_PORTRANGE: + *mp = m = m_get(M_WAIT, MT_SOOPTS); + m->m_len = sizeof(int); + + if (inp->inp_flags & INP_HIGHPORT) + optval = IP_PORTRANGE_HIGH; + else if (inp->inp_flags & INP_LOWPORT) + optval = IP_PORTRANGE_LOW; + else + optval = 0; + + *mtod(m, int *) = optval; + break; + + default: + error = ENOPROTOOPT; + break; + } + break; + } + return (error); +} + +/* + * Set up IP options in pcb for insertion in output packets. + * Store in mbuf with pointer in pcbopt, adding pseudo-option + * with destination address if source routed. + */ +static int +#ifdef notyet +ip_pcbopts(int optname, struct mbuf **pcbopt, struct mbuf *m) +#else +ip_pcbopts(struct mbuf **pcbopt, struct mbuf *m) +#endif +{ + register int cnt, optlen; + register u_char *cp; + u_char opt; + + /* turn off any old options */ + if (*pcbopt) + (void)m_free(*pcbopt); + *pcbopt = 0; + if (m == (struct mbuf *)0 || m->m_len == 0) { + /* + * Only turning off any previous options. + */ + if (m) + (void)m_free(m); + return (0); + } + +#ifndef vax + if (m->m_len % sizeof(long)) + goto bad; +#endif + /* + * IP first-hop destination address will be stored before + * actual options; move other options back + * and clear it when none present. + */ + if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) + goto bad; + cnt = m->m_len; + m->m_len += sizeof(struct in_addr); + cp = mtod(m, u_char *) + sizeof(struct in_addr); + ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt); + bzero(mtod(m, caddr_t), sizeof(struct in_addr)); + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= IPOPT_OLEN || optlen > cnt) + goto bad; + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + case IPOPT_SSRR: + /* + * user process specifies route as: + * ->A->B->C->D + * D must be our final destination (but we can't + * check that since we may not have connected yet). + * A is first hop destination, which doesn't appear in + * actual IP option, but is stored before the options. + */ + if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) + goto bad; + m->m_len -= sizeof(struct in_addr); + cnt -= sizeof(struct in_addr); + optlen -= sizeof(struct in_addr); + cp[IPOPT_OLEN] = optlen; + /* + * Move first hop before start of options. + */ + bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), + sizeof(struct in_addr)); + /* + * Then copy rest of options back + * to close up the deleted entry. + */ + ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] + + sizeof(struct in_addr)), + (caddr_t)&cp[IPOPT_OFFSET+1], + (unsigned)cnt + sizeof(struct in_addr)); + break; + } + } + if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) + goto bad; + *pcbopt = m; + return (0); + +bad: + (void)m_free(m); + return (EINVAL); +} + +/* + * Set the IP multicast options in response to user setsockopt(). 
+ */ +static int +ip_setmoptions(int optname, struct ip_moptions **imop, struct mbuf *m) +{ + int error = 0; + u_char loop; + int i; + struct in_addr addr; + struct ip_mreq *mreq; + struct ifnet *ifp; + struct ip_moptions *imo = *imop; + struct route ro; + register struct sockaddr_in *dst; + int s; + + if (imo == NULL) { + /* + * No multicast option buffer attached to the pcb; + * allocate one and initialize to default values. + */ + imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, + M_WAITOK); + + if (imo == NULL) + return (ENOBUFS); + *imop = imo; + imo->imo_multicast_ifp = NULL; + imo->imo_multicast_vif = -1; + imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + imo->imo_num_memberships = 0; + } + + switch (optname) { + /* store an index number for the vif you wanna use in the send */ + case IP_MULTICAST_VIF: + if (!legal_vif_num) { + error = EOPNOTSUPP; + break; + } + if (m == NULL || m->m_len != sizeof(int)) { + error = EINVAL; + break; + } + i = *(mtod(m, int *)); + if (!legal_vif_num(i) && (i != -1)) { + error = EINVAL; + break; + } + imo->imo_multicast_vif = i; + break; + + case IP_MULTICAST_IF: + /* + * Select the interface for outgoing multicast packets. + */ + if (m == NULL || m->m_len != sizeof(struct in_addr)) { + error = EINVAL; + break; + } + addr = *(mtod(m, struct in_addr *)); + /* + * INADDR_ANY is used to remove a previous selection. + * When no interface is selected, a default one is + * chosen every time a multicast packet is sent. + */ + if (addr.s_addr == INADDR_ANY) { + imo->imo_multicast_ifp = NULL; + break; + } + /* + * The selected interface is identified by its local + * IP address. Find the interface and confirm that + * it supports multicasting. + */ + s = splimp(); + INADDR_TO_IFP(addr, ifp); + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { + splx(s); + error = EADDRNOTAVAIL; + break; + } + imo->imo_multicast_ifp = ifp; + splx(s); + break; + + case IP_MULTICAST_TTL: + /* + * Set the IP time-to-live for outgoing multicast packets. + */ + if (m == NULL || m->m_len != 1) { + error = EINVAL; + break; + } + imo->imo_multicast_ttl = *(mtod(m, u_char *)); + break; + + case IP_MULTICAST_LOOP: + /* + * Set the loopback flag for outgoing multicast packets. + * Must be zero or one. + */ + if (m == NULL || m->m_len != 1 || + (loop = *(mtod(m, u_char *))) > 1) { + error = EINVAL; + break; + } + imo->imo_multicast_loop = loop; + break; + + case IP_ADD_MEMBERSHIP: + /* + * Add a multicast group membership. + * Group must be a valid IP multicast address. + */ + if (m == NULL || m->m_len != sizeof(struct ip_mreq)) { + error = EINVAL; + break; + } + mreq = mtod(m, struct ip_mreq *); + if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { + error = EINVAL; + break; + } + s = splimp(); + /* + * If no interface address was provided, use the interface of + * the route to the given multicast address. + */ + if (mreq->imr_interface.s_addr == INADDR_ANY) { + bzero((caddr_t)&ro, sizeof(ro)); + dst = (struct sockaddr_in *)&ro.ro_dst; + dst->sin_len = sizeof(*dst); + dst->sin_family = AF_INET; + dst->sin_addr = mreq->imr_multiaddr; + rtalloc(&ro); + if (ro.ro_rt == NULL) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + ifp = ro.ro_rt->rt_ifp; + rtfree(ro.ro_rt); + } + else { + INADDR_TO_IFP(mreq->imr_interface, ifp); + } + + /* + * See if we found an interface, and confirm that it + * supports multicast. 
+ */ + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + /* + * See if the membership already exists or if all the + * membership slots are full. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if (imo->imo_membership[i]->inm_ifp == ifp && + imo->imo_membership[i]->inm_addr.s_addr + == mreq->imr_multiaddr.s_addr) + break; + } + if (i < imo->imo_num_memberships) { + error = EADDRINUSE; + splx(s); + break; + } + if (i == IP_MAX_MEMBERSHIPS) { + error = ETOOMANYREFS; + splx(s); + break; + } + /* + * Everything looks good; add a new record to the multicast + * address list for the given interface. + */ + if ((imo->imo_membership[i] = + in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { + error = ENOBUFS; + splx(s); + break; + } + ++imo->imo_num_memberships; + splx(s); + break; + + case IP_DROP_MEMBERSHIP: + /* + * Drop a multicast group membership. + * Group must be a valid IP multicast address. + */ + if (m == NULL || m->m_len != sizeof(struct ip_mreq)) { + error = EINVAL; + break; + } + mreq = mtod(m, struct ip_mreq *); + if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { + error = EINVAL; + break; + } + + s = splimp(); + /* + * If an interface address was specified, get a pointer + * to its ifnet structure. + */ + if (mreq->imr_interface.s_addr == INADDR_ANY) + ifp = NULL; + else { + INADDR_TO_IFP(mreq->imr_interface, ifp); + if (ifp == NULL) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + } + /* + * Find the membership in the membership array. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if ((ifp == NULL || + imo->imo_membership[i]->inm_ifp == ifp) && + imo->imo_membership[i]->inm_addr.s_addr == + mreq->imr_multiaddr.s_addr) + break; + } + if (i == imo->imo_num_memberships) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + /* + * Give up the multicast address record to which the + * membership points. + */ + in_delmulti(imo->imo_membership[i]); + /* + * Remove the gap in the membership array. + */ + for (++i; i < imo->imo_num_memberships; ++i) + imo->imo_membership[i-1] = imo->imo_membership[i]; + --imo->imo_num_memberships; + splx(s); + break; + + default: + error = EOPNOTSUPP; + break; + } + + /* + * If all options have default values, no need to keep the mbuf. + */ + if (imo->imo_multicast_ifp == NULL && + imo->imo_multicast_vif == -1 && + imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && + imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && + imo->imo_num_memberships == 0) { + free(*imop, M_IPMOPTS); + *imop = NULL; + } + + return (error); +} + +/* + * Return the IP multicast options in response to user getsockopt(). + */ +static int +ip_getmoptions(int optname, struct ip_moptions *imo, + struct mbuf **mp) +{ + u_char *ttl; + u_char *loop; + struct in_addr *addr; + struct in_ifaddr *ia; + + *mp = m_get(M_WAIT, MT_SOOPTS); + + switch (optname) { + + case IP_MULTICAST_VIF: + if (imo != NULL) + *(mtod(*mp, int *)) = imo->imo_multicast_vif; + else + *(mtod(*mp, int *)) = -1; + (*mp)->m_len = sizeof(int); + return(0); + + case IP_MULTICAST_IF: + addr = mtod(*mp, struct in_addr *); + (*mp)->m_len = sizeof(struct in_addr); + if (imo == NULL || imo->imo_multicast_ifp == NULL) + addr->s_addr = INADDR_ANY; + else { + IFP_TO_IA(imo->imo_multicast_ifp, ia); + addr->s_addr = (ia == NULL) ? INADDR_ANY + : IA_SIN(ia)->sin_addr.s_addr; + } + return (0); + + case IP_MULTICAST_TTL: + ttl = mtod(*mp, u_char *); + (*mp)->m_len = 1; + *ttl = (imo == NULL) ? 
IP_DEFAULT_MULTICAST_TTL + : imo->imo_multicast_ttl; + return (0); + + case IP_MULTICAST_LOOP: + loop = mtod(*mp, u_char *); + (*mp)->m_len = 1; + *loop = (imo == NULL) ? IP_DEFAULT_MULTICAST_LOOP + : imo->imo_multicast_loop; + return (0); + + default: + return (EOPNOTSUPP); + } +} + +/* + * Discard the IP multicast options. + */ +void +ip_freemoptions(struct ip_moptions *imo) +{ + register int i; + + if (imo != NULL) { + for (i = 0; i < imo->imo_num_memberships; ++i) + in_delmulti(imo->imo_membership[i]); + free(imo, M_IPMOPTS); + } +} + +/* + * Routine called from ip_output() to loop back a copy of an IP multicast + * packet to the input queue of a specified interface. Note that this + * calls the output routine of the loopback "driver", but with an interface + * pointer that might NOT be a loopback interface -- evil, but easier than + * replicating that code here. + */ +static void +ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, + int hlen) +{ + register struct ip *ip; + struct mbuf *copym; + + copym = m_copy(m, 0, M_COPYALL); + if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) + copym = m_pullup(copym, hlen); + if (copym != NULL) { + /* + * We don't bother to fragment if the IP length is greater + * than the interface's MTU. Can this possibly matter? + */ + ip = mtod(copym, struct ip *); + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; +#ifdef _IP_VHL + if (ip->ip_vhl == IP_VHL_BORING) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(copym, hlen); + } +#else + ip->ip_sum = in_cksum(copym, hlen); +#endif + /* + * NB: + * It's not clear whether there are any lingering + * reentrancy problems in other areas which might + * be exposed by using ip_input directly (in + * particular, everything which modifies the packet + * in-place). Yet another option is using the + * protosw directly to deliver the looped back + * packet. For the moment, we'll err on the side + * of safety by continuing to abuse looutput(). + */ +#ifdef notdef + copym->m_pkthdr.rcvif = ifp; + ip_input(copym); +#else + (void) looutput(ifp, copym, (struct sockaddr *)dst, NULL); +#endif + } +} diff --git a/netinet/ip_var.h b/netinet/ip_var.h new file mode 100644 index 0000000..a6e0bc8 --- /dev/null +++ b/netinet/ip_var.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_var.h 8.2 (Berkeley) 1/9/95 + * $FreeBSD: src/sys/netinet/ip_var.h,v 1.94 2005/01/07 01:45:44 imp Exp $ + */ + + +#ifndef _NETINET_IP_VAR_H_ +#define _NETINET_IP_VAR_H_ + +#include /* struct in_addr */ + +/* + * Overlay for ip header used by other protocols (tcp, udp). + */ +struct ipovly { + caddr_t ih_next; + caddr_t ih_prev; /* for protocol sequence q's */ + u_char ih_x1; /* (unused) */ + u_char ih_pr; /* protocol */ + u_short ih_len; /* protocol length */ + struct in_addr ih_src; /* source internet address */ + struct in_addr ih_dst; /* destination internet address */ +}; + +/* + * Ip reassembly queue structure. Each fragment + * being reassembled is attached to one of these structures. + * They are timed out after ipq_ttl drops to 0, and may also + * be reclaimed if memory becomes tight. + */ +struct ipq { + struct ipq *next,*prev; /* to other reass headers */ + u_char ipq_ttl; /* time for reass q to live */ + u_char ipq_p; /* protocol of this fragment */ + u_short ipq_id; /* sequence id for reassembly */ + struct ipasfrag *ipq_next,*ipq_prev; + /* to ip headers of fragments */ + struct in_addr ipq_src,ipq_dst; +#ifdef IPDIVERT + u_short ipq_divert; /* divert protocol port */ +#endif +}; + +/* + * Ip header, when holding a fragment. + * + * Note: ipf_next must be at same offset as ipq_next above + */ +struct ipasfrag { +#if BYTE_ORDER == LITTLE_ENDIAN + u_char ip_hl:4, + ip_v:4; +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_char ip_v:4, + ip_hl:4; +#endif + u_char ipf_mff; /* XXX overlays ip_tos: use low bit + * to avoid destroying tos; + * copied from (ip_off&IP_MF) */ + u_short ip_len; + u_short ip_id; + u_short ip_off; + u_char ip_ttl; + u_char ip_p; + u_short ip_sum; + struct ipasfrag *ipf_next; /* next fragment */ + struct ipasfrag *ipf_prev; /* previous fragment */ +}; + +/* + * Structure stored in mbuf in inpcb.ip_options + * and passed to ip_output when ip options are in use. + * The actual length of the options (including ipopt_dst) + * is in m_len. + */ +#define MAX_IPOPTLEN 40 + +struct ipoption { + struct in_addr ipopt_dst; /* first-hop dst if source routed */ + char ipopt_list[MAX_IPOPTLEN]; /* options proper */ +}; + +/* + * Structure attached to inpcb.ip_moptions and + * passed to ip_output when IP multicast options are in use. + */ +struct ip_moptions { + struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */ + u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ + u_char imo_multicast_loop; /* 1 => hear sends if a member */ + u_short imo_num_memberships; /* no. 
memberships this socket */ + struct in_multi *imo_membership[IP_MAX_MEMBERSHIPS]; + u_long imo_multicast_vif; /* vif num outgoing multicasts */ +}; + +struct ipstat { + u_long ips_total; /* total packets received */ + u_long ips_badsum; /* checksum bad */ + u_long ips_tooshort; /* packet too short */ + u_long ips_toosmall; /* not enough data */ + u_long ips_badhlen; /* ip header length < data size */ + u_long ips_badlen; /* ip length < ip header length */ + u_long ips_fragments; /* fragments received */ + u_long ips_fragdropped; /* frags dropped (dups, out of space) */ + u_long ips_fragtimeout; /* fragments timed out */ + u_long ips_forward; /* packets forwarded */ + u_long ips_cantforward; /* packets rcvd for unreachable dest */ + u_long ips_redirectsent; /* packets forwarded on same net */ + u_long ips_noproto; /* unknown or unsupported protocol */ + u_long ips_delivered; /* datagrams delivered to upper level*/ + u_long ips_localout; /* total ip packets generated here */ + u_long ips_odropped; /* lost packets due to nobufs, etc. */ + u_long ips_reassembled; /* total packets reassembled ok */ + u_long ips_fragmented; /* datagrams successfully fragmented */ + u_long ips_ofragments; /* output fragments created */ + u_long ips_cantfrag; /* don't fragment flag was set, etc. */ + u_long ips_badoptions; /* error in option processing */ + u_long ips_noroute; /* packets discarded due to no route */ + u_long ips_badvers; /* ip version != 4 */ + u_long ips_rawout; /* total raw ip packets generated */ + u_long ips_toolong; /* ip length > max ip packet size */ +}; + +#ifdef _KERNEL +/* flags passed to ip_output as last parameter */ +#define IP_FORWARDING 0x1 /* most of ip header exists */ +#define IP_RAWOUTPUT 0x2 /* raw ip header exists */ +#define IP_SENDONES 0x4 /* send all-ones broadcast */ +#define IP_ROUTETOIF SO_DONTROUTE /* bypass routing tables */ +#define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast packets */ + +struct ip; +struct inpcb; +struct route; +struct sockopt; +struct mbuf; + +extern struct ipstat ipstat; +extern u_short ip_id; /* ip packet ctr, for ids */ +extern int ip_defttl; /* default IP ttl */ +extern u_char ip_protox[]; +extern struct socket *ip_rsvpd; /* reservation protocol daemon */ +extern struct socket *ip_mrouter; /* multicast routing daemon */ +extern int (*legal_vif_num)(int); +extern u_long (*ip_mcast_src)(int); +extern int rsvp_on; + +int ip_ctloutput(int, struct socket *, int, int, struct mbuf **); +void ip_drain(void); +void ip_freemoptions(struct ip_moptions *); +void ip_init(void); +extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *); +int ip_output(struct mbuf *, + struct mbuf *, struct route *, int, struct ip_moptions *); +void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, + struct mbuf *); +void ip_slowtimo(void); +struct mbuf * + ip_srcroute(void); +void ip_stripoptions(struct mbuf *, struct mbuf *); +int rip_ctloutput(int, struct socket *, int, int, struct mbuf **); +void rip_init(void); +void rip_input(struct mbuf *, int); +int rip_output(struct mbuf *, struct socket *, u_long); +int rip_usrreq(struct socket *, + int, struct mbuf *, struct mbuf *, struct mbuf *); +void ipip_input(struct mbuf *, int); +void rsvp_input(struct mbuf *, int); +int ip_rsvp_init(struct socket *); +int ip_rsvp_done(void); +int ip_rsvp_vif_init(struct socket *, struct mbuf *); +int ip_rsvp_vif_done(struct socket *, struct mbuf *); +void ip_rsvp_force_done(struct socket *); + +#ifdef IPDIVERT +void div_init(void); 
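/*
 * [Editor's note -- illustrative sketch, not part of this patch or of
 * ip_var.h.]  The ip_output() prototype declared above is the entry point
 * described at the top of netinet/ip_output.c: the caller hands over an
 * mbuf chain holding a "skeletal" IP header (len, off, ttl, proto, tos,
 * src, dst, in host order), and ip_output() fills in the version/header
 * length, id and checksum, performs the route lookup, and frees the chain.
 * The minimal caller below is hypothetical: send_raw_probe() and its
 * argument names do not exist in this tree; only the types, macros and
 * functions it uses (MGETHDR, mtod, ip_output, IP_ALLOWBROADCAST, MAXTTL,
 * IPPROTO_RAW) come from the files added by this commit.
 */
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>

static int
send_raw_probe(struct in_addr src, struct in_addr dst, struct route *ro)
{
	struct mbuf *m;
	struct ip *ip;

	/* A header-only packet fits in a single packet-header mbuf. */
	MGETHDR(m, M_DONTWAIT, MT_HEADER);
	if (m == NULL)
		return (ENOBUFS);
	m->m_len = m->m_pkthdr.len = sizeof(struct ip);

	ip = mtod(m, struct ip *);
	bzero((caddr_t)ip, sizeof(*ip));
	/* Skeletal header: ip_output() supplies vhl, id and checksum. */
	ip->ip_len = sizeof(struct ip);	/* host order; htons() done later */
	ip->ip_off = 0;
	ip->ip_ttl = MAXTTL;
	ip->ip_p = IPPROTO_RAW;
	ip->ip_tos = 0;
	ip->ip_src = src;
	ip->ip_dst = dst;

	/*
	 * "ro" must point to a caller-owned (possibly zeroed) struct route;
	 * ip_output() consumes the mbuf chain on both success and failure.
	 */
	return (ip_output(m, NULL, ro, IP_ALLOWBROADCAST, NULL));
}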
+void div_input(struct mbuf *, int); +int div_usrreq(struct socket *, + int, struct mbuf *, struct mbuf *, struct mbuf *); +extern u_short ip_divert_port; +extern u_short ip_divert_ignore; +#endif /* IPDIVERT */ + +#endif /* _KERNEL */ + +#endif /* !_NETINET_IP_VAR_H_ */ diff --git a/netinet/raw_ip.c b/netinet/raw_ip.c new file mode 100644 index 0000000..58b85a9 --- /dev/null +++ b/netinet/raw_ip.c @@ -0,0 +1,492 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 + * $FreeBSD: src/sys/netinet/raw_ip.c,v 1.147 2005/01/07 01:45:45 imp Exp $ + */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_mac.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define _IP_VHL +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if !defined(COMPAT_IPFW) || COMPAT_IPFW == 1 +#undef COMPAT_IPFW +#define COMPAT_IPFW 1 +#else +#undef COMPAT_IPFW +#endif + +static struct inpcbhead ripcb; +static struct inpcbinfo ripcbinfo; + +/* + * Nominal space allocated to a raw ip socket. + */ +#define RIPSNDQ 8192 +#define RIPRCVQ 8192 + +/* + * Raw interface to IP protocol. + */ + +/* + * Initialize raw connection block q. + */ +void +rip_init(void) +{ + LIST_INIT(&ripcb); + ripcbinfo.listhead = &ripcb; + /* + * XXX We don't use the hash list for raw IP, but it's easier + * to allocate a one entry hash list than it is to check all + * over the place for hashbase == NULL. + */ + ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask); +} + +static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET, 0, {0}, {0} }; +/* + * Setup generic address and protocol structures + * for raw_input routine, then pass them along with + * mbuf chain. 
+ */ +void +rip_input(struct mbuf *m, int iphlen) +{ + struct ip *ip = mtod(m, struct ip *); + register struct inpcb *inp; + struct inpcb *last = 0; + struct mbuf *opts = 0; + + ripsrc.sin_addr = ip->ip_src; + for (inp = ripcb.lh_first; inp != NULL; inp = inp->inp_list.le_next) { + if (inp->inp_ip_p && inp->inp_ip_p != ip->ip_p) + continue; + if (inp->inp_laddr.s_addr && + inp->inp_laddr.s_addr != ip->ip_dst.s_addr) + continue; + if (inp->inp_faddr.s_addr && + inp->inp_faddr.s_addr != ip->ip_src.s_addr) + continue; + if (last) { + struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); + if (n) { + if (last->inp_flags & INP_CONTROLOPTS || + last->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(last, &opts, ip, n); + if (sbappendaddr(&last->inp_socket->so_rcv, + (struct sockaddr *)&ripsrc, n, + opts) == 0) { + /* should notify about lost packet */ + m_freem(n); + if (opts) + m_freem(opts); + } else + sorwakeup(last->inp_socket); + opts = 0; + } + } + last = inp; + } + if (last) { + if (last->inp_flags & INP_CONTROLOPTS || + last->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(last, &opts, ip, m); + if (sbappendaddr(&last->inp_socket->so_rcv, + (struct sockaddr *)&ripsrc, m, opts) == 0) { + m_freem(m); + if (opts) + m_freem(opts); + } else + sorwakeup(last->inp_socket); + } else { + m_freem(m); + ipstat.ips_noproto++; + ipstat.ips_delivered--; + } +} + +/* + * Generate IP header and pass packet to ip_output. + * Tack on options user may have setup with control call. + */ +int +rip_output(struct mbuf *m, struct socket *so, u_long dst) +{ + struct ip *ip; + struct inpcb *inp = sotoinpcb(so); + int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | + IP_ALLOWBROADCAST; + + /* + * If the user handed us a complete IP packet, use it. + * Otherwise, allocate an mbuf for a header and fill it in. + */ + if ((inp->inp_flags & INP_HDRINCL) == 0) { + if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { + m_freem(m); + return(EMSGSIZE); + } + M_PREPEND(m, sizeof(struct ip), M_WAIT); + ip = mtod(m, struct ip *); + ip->ip_tos = 0; + ip->ip_off = 0; + ip->ip_p = inp->inp_ip_p; + ip->ip_len = m->m_pkthdr.len; + ip->ip_src = inp->inp_laddr; + ip->ip_dst.s_addr = dst; + ip->ip_ttl = MAXTTL; + } else { + if (m->m_pkthdr.len > IP_MAXPACKET) { + m_freem(m); + return(EMSGSIZE); + } + ip = mtod(m, struct ip *); + /* don't allow both user specified and setsockopt options, + and don't allow packet length sizes that will crash */ +#ifdef _IP_VHL + if (((IP_VHL_HL(ip->ip_vhl) != (sizeof (*ip) >> 2)) + && inp->inp_options) + || (ip->ip_len > m->m_pkthdr.len) + || (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) { +#else + if (((ip->ip_hl != (sizeof (*ip) >> 2)) + && inp->inp_options) + || (ip->ip_len > m->m_pkthdr.len) + || (ip->ip_len < (ip->ip_hl << 2))) { +#endif + m_freem(m); + return EINVAL; + } + if (ip->ip_id == 0) + ip->ip_id = htons(ip_id++); + /* XXX prevent ip_output from overwriting header fields */ + flags |= IP_RAWOUTPUT; + ipstat.ips_rawout++; + } + return (ip_output(m, inp->inp_options, &inp->inp_route, flags, + inp->inp_moptions)); +} + +/* + * Raw IP socket option processing. 
+ */ +int +rip_ctloutput(int op, struct socket *so, int level, int optname, + struct mbuf **m) +{ + struct inpcb *inp = sotoinpcb(so); + int error; + + if (level != IPPROTO_IP) { + if (op == PRCO_SETOPT && *m) + (void)m_free(*m); + return (EINVAL); + } + + switch (optname) { + + case IP_HDRINCL: + error = 0; + if (op == PRCO_SETOPT) { + if (m == 0 || *m == 0 || (*m)->m_len < sizeof (int)) + error = EINVAL; + else if (*mtod(*m, int *)) + inp->inp_flags |= INP_HDRINCL; + else + inp->inp_flags &= ~INP_HDRINCL; + if (*m) + (void)m_free(*m); + } else { + *m = m_get(M_WAIT, MT_SOOPTS); + (*m)->m_len = sizeof (int); + *mtod(*m, int *) = inp->inp_flags & INP_HDRINCL; + } + return (error); + +#ifdef COMPAT_IPFW + case IP_FW_GET: + if (ip_fw_ctl_ptr == NULL || op == PRCO_SETOPT) { + if (*m) (void)m_free(*m); + return(EINVAL); + } + return (*ip_fw_ctl_ptr)(optname, m); + + case IP_FW_ADD: + case IP_FW_DEL: + case IP_FW_FLUSH: + case IP_FW_ZERO: + if (ip_fw_ctl_ptr == NULL || op != PRCO_SETOPT) { + if (*m) (void)m_free(*m); + return(EINVAL); + } + return (*ip_fw_ctl_ptr)(optname, m); + + case IP_NAT: + if (ip_nat_ctl_ptr == NULL) { + if (*m) (void)m_free(*m); + return(EINVAL); + } + return (*ip_nat_ctl_ptr)(op, m); + +#endif + case IP_RSVP_ON: + return ip_rsvp_init(so); + break; + + case IP_RSVP_OFF: + return ip_rsvp_done(); + break; + + case IP_RSVP_VIF_ON: + return ip_rsvp_vif_init(so, *m); + + case IP_RSVP_VIF_OFF: + return ip_rsvp_vif_done(so, *m); + + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + case MRT_VERSION: + case MRT_ASSERT: + if (op == PRCO_SETOPT) { + error = ip_mrouter_set(optname, so, *m); + if (*m) + (void)m_free(*m); + } else if (op == PRCO_GETOPT) { + error = ip_mrouter_get(optname, so, m); + } else + error = EINVAL; + return (error); + } + return (ip_ctloutput(op, so, level, optname, m)); +} + +static u_long rip_sendspace = RIPSNDQ; /* XXX sysctl ? */ +static u_long rip_recvspace = RIPRCVQ; /* XXX sysctl ? 
*/ + +/*ARGSUSED*/ +int +rip_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam, + struct mbuf *control) +{ + register int error = 0; + register struct inpcb *inp = sotoinpcb(so); + int s; + + if (req == PRU_CONTROL) + return (in_control(so, (uintptr_t)m, (caddr_t)nam, + (struct ifnet *)control)); + + switch (req) { + + case PRU_ATTACH: + if (inp) + panic("rip_attach"); + if ((so->so_state & SS_PRIV) == 0) { + error = EACCES; + break; + } + s = splnet(); + error = in_pcballoc(so, &ripcbinfo); + splx(s); + if (error) + break; + error = soreserve(so, rip_sendspace, rip_recvspace); + if (error) + break; + inp = (struct inpcb *)so->so_pcb; + inp->inp_ip_p = (uintptr_t)nam; + break; + + case PRU_DISCONNECT: + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + break; + } + /* FALLTHROUGH */ + case PRU_ABORT: + soisdisconnected(so); + /* FALLTHROUGH */ + case PRU_DETACH: + if (inp == 0) + panic("rip_detach"); + if (so == ip_mrouter) + ip_mrouter_done(); + ip_rsvp_force_done(so); + if (so == ip_rsvpd) + ip_rsvp_done(); + in_pcbdetach(inp); + break; + + case PRU_BIND: + { + struct sockaddr_in *addr = mtod(nam, struct sockaddr_in *); + + if (nam->m_len != sizeof(*addr)) { + error = EINVAL; + break; + } + if ((ifnet == 0) || + ((addr->sin_family != AF_INET) && + (addr->sin_family != AF_IMPLINK)) || + (addr->sin_addr.s_addr && + ifa_ifwithaddr((struct sockaddr *)addr) == 0)) { + error = EADDRNOTAVAIL; + break; + } + inp->inp_laddr = addr->sin_addr; + break; + } + case PRU_CONNECT: + { + struct sockaddr_in *addr = mtod(nam, struct sockaddr_in *); + + if (nam->m_len != sizeof(*addr)) { + error = EINVAL; + break; + } + if (ifnet == 0) { + error = EADDRNOTAVAIL; + break; + } + if ((addr->sin_family != AF_INET) && + (addr->sin_family != AF_IMPLINK)) { + error = EAFNOSUPPORT; + break; + } + inp->inp_faddr = addr->sin_addr; + soisconnected(so); + break; + } + + case PRU_CONNECT2: + error = EOPNOTSUPP; + break; + + /* + * Mark the connection as being incapable of further input. + */ + case PRU_SHUTDOWN: + socantsendmore(so); + break; + + /* + * Ship a packet out. The appropriate raw output + * routine handles any massaging necessary. + */ + case PRU_SEND: + { + register u_long dst; + + if (so->so_state & SS_ISCONNECTED) { + if (nam) { + error = EISCONN; + break; + } + dst = inp->inp_faddr.s_addr; + } else { + if (nam == NULL) { + error = ENOTCONN; + break; + } + dst = mtod(nam, struct sockaddr_in *)->sin_addr.s_addr; + } + error = rip_output(m, so, dst); + m = NULL; + break; + } + + case PRU_SENSE: + /* + * stat: don't bother with a blocksize. + */ + return (0); + + /* + * Not supported. + */ + case PRU_RCVOOB: + case PRU_RCVD: + case PRU_LISTEN: + case PRU_ACCEPT: + case PRU_SENDOOB: + error = EOPNOTSUPP; + break; + + case PRU_SOCKADDR: + in_setsockaddr(inp, nam); + break; + + case PRU_PEERADDR: + in_setpeeraddr(inp, nam); + break; + + default: + panic("rip_usrreq"); + } + if (m != NULL) + m_freem(m); + return (error); +} diff --git a/netinet/tcp_debug.c b/netinet/tcp_debug.c new file mode 100644 index 0000000..a014158 --- /dev/null +++ b/netinet/tcp_debug.c @@ -0,0 +1,172 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_debug.c 8.1 (Berkeley) 6/10/93 + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include "opt_inet.h" +#include "opt_tcpdebug.h" + +#ifdef TCPDEBUG +/* load symbolic names */ +#define PRUREQUESTS +#define TCPSTATES +#define TCPTIMERS +#define TANAMES +#endif + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef TCPDEBUG +static int tcpconsdebug = 0; +#endif + +/* + * Global ring buffer of TCP debugging state. Each entry captures a snapshot + * of TCP connection state at any given moment. tcp_debx addresses at the + * next available slot. There is no explicit export of this data structure; + * it will be read via /dev/kmem by debugging tools. + */ +static struct tcp_debug tcp_debug[TCP_NDEBUG]; +static int tcp_debx; + +/* + * Save TCP state at a given moment; optionally, both tcpcb and TCP packet + * header state will be saved. + */ +void +tcp_trace(short act, short ostate, struct tcpcb *tp, struct tcpiphdr *ti, + int req) +{ +#ifdef TCPDEBUG + tcp_seq seq, ack; + int len, flags; +#endif + struct tcp_debug *td; + + td = &tcp_debug[tcp_debx++]; + if (tcp_debx == TCP_NDEBUG) + tcp_debx = 0; + td->td_time = iptime(); + td->td_act = act; + td->td_ostate = ostate; + td->td_tcb = (caddr_t)tp; + if (tp != NULL) + td->td_cb = *tp; + else + bzero((caddr_t)&td->td_cb, sizeof (*tp)); + if (ti) + td->td_ti = *ti; + else + bzero((caddr_t)&td->td_ti, sizeof (*ti)); + td->td_req = req; +#ifdef TCPDEBUG + if (tcpconsdebug == 0) + return; + if (tp != NULL) + printf("%p %s:", tp, tcpstates[ostate]); + else + printf("???????? 
"); + printf("%s ", tanames[act]); + switch (act) { + case TA_INPUT: + case TA_OUTPUT: + case TA_DROP: + if (ti == 0) + break; + seq = ti->ti_seq; + ack = ti->ti_ack; + len = ti->ti_len; + if (act == TA_OUTPUT) { + seq = ntohl(seq); + ack = ntohl(ack); + len = ntohs((u_short)len); + } + if (act == TA_OUTPUT) + len -= sizeof (struct tcphdr); + if (len) + printf("[%x..%x)", seq, seq+len); + else + printf("%x", seq); + printf("@%x, urp=%x", ack, ti->ti_urp); + flags = ti->ti_flags; + if (flags) { + char *cp = "<"; +#define pf(f) { \ + if (ti->ti_flags & TH_##f) { \ + printf("%s%s", cp, #f); \ + cp = ","; \ + } \ +} + pf(SYN); pf(ACK); pf(FIN); pf(RST); pf(PUSH); pf(URG); + printf(">"); + } + break; + + case TA_USER: + printf("%s", prurequests[req&0xff]); + if ((req & 0xff) == PRU_SLOWTIMO) + printf("<%s>", tcptimers[req>>8]); + break; + } + if (tp != NULL) + printf(" -> %s", tcpstates[tp->t_state]); + /* print out internal state of tp !?! */ + printf("\n"); + if (tp == NULL) + return; + printf("\trcv_(nxt,wnd,up) (%x,%x,%x) snd_(una,nxt,max) (%x,%x,%x)\n", + tp->rcv_nxt, tp->rcv_wnd, tp->rcv_up, tp->snd_una, tp->snd_nxt, + tp->snd_max); + printf("\tsnd_(wl1,wl2,wnd) (%x,%x,%x)\n", + tp->snd_wl1, tp->snd_wl2, tp->snd_wnd); +#endif /* TCPDEBUG */ +} diff --git a/netinet/tcp_debug.h b/netinet/tcp_debug.h new file mode 100644 index 0000000..55004f3 --- /dev/null +++ b/netinet/tcp_debug.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_debug.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/netinet/tcp_debug.h,v 1.17 2009/02/13 15:14:43 luigi Exp $ + */ + + +#ifndef _NETINET_TCP_DEBUG_H_ +#define _NETINET_TCP_DEBUG_H_ + +#include /* struct tcpiphdr */ +#include /* struct tcpcb */ + +struct tcp_debug { + uint32_t td_time; /* network format */ + short td_act; + short td_ostate; + caddr_t td_tcb; + struct tcpiphdr td_ti; + short td_req; + struct tcpcb td_cb; +}; + +#define TA_INPUT 0 +#define TA_OUTPUT 1 +#define TA_USER 2 +#define TA_RESPOND 3 +#define TA_DROP 4 + +#ifdef TANAMES +static const char *tanames[] = + { "input", "output", "user", "respond", "drop" }; +#endif + +#define TCP_NDEBUG 100 + +#ifndef _KERNEL +/* XXX common variables for broken applications. */ +struct tcp_debug tcp_debug[TCP_NDEBUG]; +int tcp_debx; +#endif + +#endif /* !_NETINET_TCP_DEBUG_H_ */ diff --git a/netinet/tcp_fsm.h b/netinet/tcp_fsm.h new file mode 100644 index 0000000..ceb053e --- /dev/null +++ b/netinet/tcp_fsm.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_fsm.h 8.1 (Berkeley) 6/10/93 + */ + +#ifndef _NETINET_TCP_FSM_H_ +#define _NETINET_TCP_FSM_H_ + +/* + * TCP FSM state definitions. + * Per RFC793, September, 1981. 
+ */ + +#define TCP_NSTATES 11 + +#define TCPS_CLOSED 0 /* closed */ +#define TCPS_LISTEN 1 /* listening for connection */ +#define TCPS_SYN_SENT 2 /* active, have sent syn */ +#define TCPS_SYN_RECEIVED 3 /* have send and received syn */ +/* states < TCPS_ESTABLISHED are those where connections not established */ +#define TCPS_ESTABLISHED 4 /* established */ +#define TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */ +/* states > TCPS_CLOSE_WAIT are those where user has closed */ +#define TCPS_FIN_WAIT_1 6 /* have closed, sent fin */ +#define TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */ +#define TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */ +/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */ +#define TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */ +#define TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */ + +#define TCPS_HAVERCVDSYN(s) ((s) >= TCPS_SYN_RECEIVED) +#define TCPS_HAVEESTABLISHED(s) ((s) >= TCPS_ESTABLISHED) +#define TCPS_HAVERCVDFIN(s) ((s) >= TCPS_TIME_WAIT) + +#ifdef TCPOUTFLAGS +/* + * Flags used when sending segments in tcp_output. + * Basic flags (TH_RST,TH_ACK,TH_SYN,TH_FIN) are totally + * determined by state, with the proviso that TH_FIN is sent only + * if all data queued for output is included in the segment. + */ +static u_char tcp_outflags[TCP_NSTATES] = { + TH_RST|TH_ACK, 0, TH_SYN, TH_SYN|TH_ACK, + TH_ACK, TH_ACK, + TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_ACK, TH_ACK, +}; +#endif + +#ifdef KPROF +int tcp_acounts[TCP_NSTATES][PRU_NREQ]; +#endif + +#ifdef TCPSTATES +char *tcpstates[] = { + "CLOSED", "LISTEN", "SYN_SENT", "SYN_RCVD", + "ESTABLISHED", "CLOSE_WAIT", "FIN_WAIT_1", "CLOSING", + "LAST_ACK", "FIN_WAIT_2", "TIME_WAIT", +}; +#endif + +#endif diff --git a/netinet/tcp_input.c b/netinet/tcp_input.c new file mode 100644 index 0000000..e62f1eb --- /dev/null +++ b/netinet/tcp_input.c @@ -0,0 +1,2151 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opt_tcpdebug.h" + +#ifndef TUBA_INCLUDE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* before tcp_seq.h, for tcp_random18() */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +static struct tcpiphdr tcp_saveti; +#endif + +static int tcprexmtthresh = 3; +tcp_seq tcp_iss; +tcp_cc tcp_ccgen; + +struct tcpstat tcpstat; +SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, + CTLFLAG_RD, &tcpstat , tcpstat, ""); + +static int log_in_vain = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, + &log_in_vain, 0, ""); + +u_long tcp_now; +struct inpcbhead tcb; +struct inpcbinfo tcbinfo; + +static void tcp_dooptions(struct tcpcb *, + u_char *, int, struct tcpiphdr *, struct tcpopt *); +static void tcp_pulloutofband(struct socket *, + struct tcpiphdr *, struct mbuf *); +static int tcp_reass(struct tcpcb *, struct tcpiphdr *, struct mbuf *); +static void tcp_xmit_timer(struct tcpcb *, int); + +#endif /* TUBA_INCLUDE */ + +/* + * Insert segment ti into reassembly queue of tcp with + * control block tp. Return TH_FIN if reassembly now includes + * a segment with FIN. The macro form does the common case inline + * (segment is the next to be received on an established connection, + * and the queue is empty), avoiding linkage into and removal + * from the queue and repetition of various conversions. + * Set DELACK for segments received in order, but ack immediately + * when segments are out of order (so fast retransmit can work). + */ +#ifdef TCP_ACK_HACK +#define TCP_REASS(tp, ti, m, so, flags) { \ + if ((ti)->ti_seq == (tp)->rcv_nxt && \ + (tp)->seg_next == (struct tcpiphdr *)(tp) && \ + (tp)->t_state == TCPS_ESTABLISHED) { \ + if (ti->ti_flags & TH_PUSH) \ + tp->t_flags |= TF_ACKNOW; \ + else \ + tp->t_flags |= TF_DELACK; \ + (tp)->rcv_nxt += (ti)->ti_len; \ + flags = (ti)->ti_flags & TH_FIN; \ + tcpstat.tcps_rcvpack++;\ + tcpstat.tcps_rcvbyte += (ti)->ti_len;\ + sbappend(&(so)->so_rcv, (m)); \ + sorwakeup(so); \ + } else { \ + (flags) = tcp_reass((tp), (ti), (m)); \ + tp->t_flags |= TF_ACKNOW; \ + } \ +} +#else +#define TCP_REASS(tp, ti, m, so, flags) { \ + if ((ti)->ti_seq == (tp)->rcv_nxt && \ + (tp)->seg_next == (struct tcpiphdr *)(tp) && \ + (tp)->t_state == TCPS_ESTABLISHED) { \ + tp->t_flags |= TF_DELACK; \ + (tp)->rcv_nxt += (ti)->ti_len; \ + flags = (ti)->ti_flags & TH_FIN; \ + tcpstat.tcps_rcvpack++;\ + tcpstat.tcps_rcvbyte += (ti)->ti_len;\ + sbappend(&(so)->so_rcv, (m)); \ + sorwakeup(so); \ + } else { \ + (flags) = tcp_reass((tp), (ti), (m)); \ + tp->t_flags |= TF_ACKNOW; \ + } \ +} +#endif +#ifndef TUBA_INCLUDE + +static int +tcp_reass(struct tcpcb *tp, struct tcpiphdr *ti, struct mbuf *m) +{ + register struct tcpiphdr *q; + struct socket *so = tp->t_inpcb->inp_socket; + int flags; + /* + * Call with ti==0 after become established to + * force pre-ESTABLISHED data up to user socket. + */ + if (ti == 0) + goto present; + + /* + * Find a segment which begins after this one does. + */ + for (q = tp->seg_next; q != (struct tcpiphdr *)tp; + q = (struct tcpiphdr *)q->ti_next) + if (SEQ_GT(q->ti_seq, ti->ti_seq)) + break; + + /* + * If there is a preceding segment, it may provide some of + * our data already. 
If so, drop the data from the incoming + * segment. If it provides all of our data, drop us. + */ + if ((struct tcpiphdr *)q->ti_prev != (struct tcpiphdr *)tp) { + register int i; + q = (struct tcpiphdr *)q->ti_prev; + /* conversion to int (in i) handles seq wraparound */ + i = q->ti_seq + q->ti_len - ti->ti_seq; + if (i > 0) { + if (i >= ti->ti_len) { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += ti->ti_len; + m_freem(m); + /* + * Try to present any queued data + * at the left window edge to the user. + * This is needed after the 3-WHS + * completes. + */ + goto present; /* ??? */ + } + m_adj(m, i); + ti->ti_len -= i; + ti->ti_seq += i; + } + q = (struct tcpiphdr *)(q->ti_next); + } + tcpstat.tcps_rcvoopack++; + tcpstat.tcps_rcvoobyte += ti->ti_len; +#if (defined(__GNUC__) && (defined(__arm__) || defined(__mips__))) + STR32_UNALGN(ti,m); +#else + REASS_MBUF(ti) = m; /* XXX */ +#endif + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. + */ + while (q != (struct tcpiphdr *)tp) { + register int i = (ti->ti_seq + ti->ti_len) - q->ti_seq; + if (i <= 0) + break; + if (i < q->ti_len) { + q->ti_seq += i; + q->ti_len -= i; +#if (defined(__GNUC__) && (defined(__arm__) || defined(__mips__))) + LD32_UNALGN(q,m); + m_adj(m, i); +#else + m_adj(REASS_MBUF(q), i); +#endif + break; + } + q = (struct tcpiphdr *)q->ti_next; +#if (defined(__GNUC__) && (defined(__arm__) || defined(__mips__))) + LD32_UNALGN((struct tcpiphdr *)q->ti_prev,m); +#else + m = REASS_MBUF((struct tcpiphdr *)q->ti_prev); +#endif + remque(q->ti_prev); + m_freem(m); + } + + /* + * Stick new segment in its place. + */ + insque(ti, q->ti_prev); + +present: + /* + * Present data to user, advancing rcv_nxt through + * completed sequence space. + */ + if (!TCPS_HAVEESTABLISHED(tp->t_state)) + return (0); + ti = tp->seg_next; + if (ti == (struct tcpiphdr *)tp || ti->ti_seq != tp->rcv_nxt) + return (0); + do { + tp->rcv_nxt += ti->ti_len; + flags = ti->ti_flags & TH_FIN; + remque(ti); +#if (defined(__GNUC__) && (defined(__arm__) || defined(__mips__))) + LD32_UNALGN(ti,m); +#else + m = REASS_MBUF(ti); +#endif + ti = (struct tcpiphdr *)ti->ti_next; + if (so->so_state & SS_CANTRCVMORE) + m_freem(m); + else + sbappend(&so->so_rcv, m); + } while (ti != (struct tcpiphdr *)tp && ti->ti_seq == tp->rcv_nxt); + sorwakeup(so); + return (flags); +} + +/* + * TCP input routine, follows pages 65-76 of the + * protocol specification dated September, 1981 very closely. + */ +void +tcp_input(struct mbuf *m, int iphlen) +{ + register struct tcpiphdr *ti; + register struct inpcb *inp; + u_char *optp = NULL; + int optlen = 0; + int len, tlen, off; + register struct tcpcb *tp = 0; + register int tiflags; + struct socket *so = 0; + int todrop, acked, ourfinisacked, needoutput = 0; + struct in_addr laddr; + int dropsocket = 0; + int iss = 0; + u_long tiwin; + struct tcpopt to; /* options in this segment */ + struct rmxp_tao *taop; /* pointer to our TAO cache entry */ + struct rmxp_tao tao_noncached; /* in case there's no cached entry */ +#ifdef TCPDEBUG + short ostate = 0; +#endif + + bzero((char *)&to, sizeof(to)); + + tcpstat.tcps_rcvtotal++; + /* + * Get IP and TCP header together in first mbuf. + * Note: IP leaves IP header in first mbuf. 
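The overlap arithmetic in tcp_reass() above is easiest to see with concrete numbers. The following standalone sketch (plain C, simplified types, a made-up trim_overlap() helper rather than the kernel code) reproduces the computation i = q->ti_seq + q->ti_len - ti->ti_seq that drops bytes a preceding queued segment already holds:

#include <stdio.h>
#include <stdint.h>

/*
 * Illustration only: a queued segment covers [q_seq, q_seq + q_len);
 * the arriving segment starts at *seq with *len bytes.  The signed
 * difference handles sequence wraparound, as the kernel comment notes.
 */
static void
trim_overlap(uint32_t q_seq, int q_len, uint32_t *seq, int *len)
{
	int i = (int)(q_seq + q_len - *seq);	/* bytes already queued */

	if (i <= 0)
		return;				/* no overlap */
	if (i >= *len) {
		*len = 0;			/* complete duplicate; would be dropped */
		return;
	}
	*seq += i;				/* trim the leading i bytes */
	*len -= i;
}

int
main(void)
{
	uint32_t seq = 1000;
	int len = 500;

	/* the queue already holds [800, 1200): 200 bytes of the new data repeat */
	trim_overlap(800, 400, &seq, &len);
	printf("kept [%u, %u)\n", seq, seq + len);	/* kept [1200, 1500) */
	return 0;
}

The same signed-difference idiom covers the trailing-overlap loop as well, where succeeding queued segments are trimmed or dequeued.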
+ */ + ti = mtod(m, struct tcpiphdr *); + if (iphlen > sizeof (struct ip)) + ip_stripoptions(m, (struct mbuf *)0); + if (m->m_len < sizeof (struct tcpiphdr)) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ti = mtod(m, struct tcpiphdr *); + } + + /* + * Checksum extended TCP header and data. + */ + tlen = ((struct ip *)ti)->ip_len; + len = sizeof (struct ip) + tlen; + ti->ti_next = ti->ti_prev = 0; + ti->ti_x1 = 0; + ti->ti_len = (u_short)tlen; + HTONS(ti->ti_len); + ti->ti_sum = in_cksum(m, len); + if (ti->ti_sum) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } +#endif /* TUBA_INCLUDE */ + + /* + * Check that TCP offset makes sense, + * pull out TCP options and adjust length. XXX + */ + off = ti->ti_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + tcpstat.tcps_rcvbadoff++; + goto drop; + } + tlen -= off; + ti->ti_len = tlen; + if (off > sizeof (struct tcphdr)) { + if (m->m_len < sizeof(struct ip) + off) { + if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ti = mtod(m, struct tcpiphdr *); + } + optlen = off - sizeof (struct tcphdr); + optp = mtod(m, u_char *) + sizeof (struct tcpiphdr); + } + tiflags = ti->ti_flags; + + /* + * Convert TCP protocol specific fields to host format. + */ + NTOHL(ti->ti_seq); + NTOHL(ti->ti_ack); + NTOHS(ti->ti_win); + NTOHS(ti->ti_urp); + + /* + * Drop TCP, IP headers and TCP options. + */ + m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + + /* + * Locate pcb for segment. + */ +findpcb: + inp = in_pcblookuphash(&tcbinfo, ti->ti_src, ti->ti_sport, + ti->ti_dst, ti->ti_dport, 1); + + /* + * If the state is CLOSED (i.e., TCB does not exist) then + * all data in the incoming segment is discarded. + * If the TCB exists but is in CLOSED state, it is embryonic, + * but should either do a listen or a connect soon. + */ + if (inp == NULL) { + if (log_in_vain && tiflags & TH_SYN) { + char buf0[INET_ADDRSTRLEN]; + char buf1[INET_ADDRSTRLEN]; + + log(LOG_INFO, "Connection attempt to TCP %s:%d" + " from %s:%d\n", + inet_ntoa_r(ti->ti_dst, buf0), ntohs(ti->ti_dport), + inet_ntoa_r(ti->ti_src, buf1), ntohs(ti->ti_sport)); + } + goto dropwithreset; + } + tp = intotcpcb(inp); + if (tp == 0) + goto dropwithreset; + if (tp->t_state == TCPS_CLOSED) + goto drop; + + /* Unscale the window into a 32-bit value. */ + if ((tiflags & TH_SYN) == 0) + tiwin = ti->ti_win << tp->snd_scale; + else + tiwin = ti->ti_win; + + so = inp->inp_socket; + if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) { + ostate = tp->t_state; + tcp_saveti = *ti; + } +#endif + if (so->so_options & SO_ACCEPTCONN) { + register struct tcpcb *tp0 = tp; + struct socket *so2; + if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { + /* + * Note: dropwithreset makes sure we don't + * send a RST in response to a RST. + */ + if (tiflags & TH_ACK) { + tcpstat.tcps_badsyn++; + goto dropwithreset; + } + goto drop; + } + so2 = sonewconn(so, 0); + if (so2 == 0) { + tcpstat.tcps_listendrop++; + so2 = sodropablereq(so); + if (so2) { + tcp_drop(sototcpcb(so2), ETIMEDOUT); + so2 = sonewconn(so, 0); + } + if (!so2) + goto drop; + } + so = so2; + /* + * This is ugly, but .... + * + * Mark socket as temporary until we're + * committed to keeping it. 
The code at + * ``drop'' and ``dropwithreset'' check the + * flag dropsocket to see if the temporary + * socket created here should be discarded. + * We mark the socket as discardable until + * we're committed to it below in TCPS_LISTEN. + */ + dropsocket++; + inp = (struct inpcb *)so->so_pcb; + inp->inp_laddr = ti->ti_dst; + inp->inp_lport = ti->ti_dport; + in_pcbrehash(inp); +#if BSD>=43 + inp->inp_options = ip_srcroute(); +#endif + tp = intotcpcb(inp); + tp->t_state = TCPS_LISTEN; + tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT); + + /* Compute proper scaling value from buffer space */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + TCP_MAXWIN << tp->request_r_scale < so->so_rcv.sb_hiwat) + tp->request_r_scale++; + } + } + + /* + * Segment received on connection. + * Reset idle time and keep-alive timer. + */ + tp->t_idle = 0; + if (TCPS_HAVEESTABLISHED(tp->t_state)) + tp->t_timer[TCPT_KEEP] = tcp_keepidle; + + /* + * Process options if not in LISTEN state, + * else do it below (after getting remote address). + */ + if (tp->t_state != TCPS_LISTEN) + tcp_dooptions(tp, optp, optlen, ti, &to); + + /* + * Header prediction: check for the two common cases + * of a uni-directional data xfer. If the packet has + * no control flags, is in-sequence, the window didn't + * change and we're not retransmitting, it's a + * candidate. If the length is zero and the ack moved + * forward, we're the sender side of the xfer. Just + * free the data acked & wake any higher level process + * that was blocked waiting for space. If the length + * is non-zero and the ack didn't move, we're the + * receiver side. If we're getting packets in-order + * (the reassembly queue is empty), add the data to + * the socket buffer and note that we need a delayed ack. + * Make sure that the hidden state-flags are also off. + * Since we check for TCPS_ESTABLISHED above, it can only + * be TH_NEEDSYN. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && + ((to.to_flags & TOF_TS) == 0 || + TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && + /* + * Using the CC option is compulsory if once started: + * the segment is OK if no T/TCP was negotiated or + * if the segment has a CC option equal to CCrecv + */ + ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || + ((to.to_flags & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && + ti->ti_seq == tp->rcv_nxt && + tiwin && tiwin == tp->snd_wnd && + tp->snd_nxt == tp->snd_max) { + + /* + * If last ACK falls within this segment's sequence numbers, + * record the timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flags & TOF_TS) != 0 && + SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { + tp->ts_recent_age = tcp_now; + tp->ts_recent = to.to_tsval; + } + + if (ti->ti_len == 0) { + if (SEQ_GT(ti->ti_ack, tp->snd_una) && + SEQ_LEQ(ti->ti_ack, tp->snd_max) && + tp->snd_cwnd >= tp->snd_wnd && + tp->t_dupacks < tcprexmtthresh) { + /* + * this is a pure ack for outstanding data. 
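One prerequisite for the fast path above is that the advertised window has not changed (tiwin == tp->snd_wnd), and tiwin was produced a few lines earlier by unscaling the 16-bit header field: tiwin = ti->ti_win << tp->snd_scale (no shift is applied on SYN segments, where scaling is not yet in effect). A tiny standalone illustration of that arithmetic, with made-up values:

#include <stdio.h>

int
main(void)
{
	unsigned short ti_win = 65535;	/* 16-bit window field from the TCP header */
	unsigned char snd_scale = 3;	/* shift agreed via the window-scale option */
	unsigned long tiwin = (unsigned long)ti_win << snd_scale;

	/* 65535 << 3 = 524280 bytes of usable receive window */
	printf("header field %u -> %lu-byte window\n", ti_win, tiwin);
	return 0;
}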
+ */ + ++tcpstat.tcps_predack; + if ((to.to_flags & TOF_TS) != 0) + tcp_xmit_timer(tp, + tcp_now - to.to_tsecr + 1); + else if (tp->t_rtt && + SEQ_GT(ti->ti_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, tp->t_rtt); + acked = ti->ti_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + sbdrop(&so->so_snd, acked); + tp->snd_una = ti->ti_ack; + m_freem(m); + + /* + * If all outstanding data are acked, stop + * retransmit timer, otherwise restart timer + * using current (possibly backed-off) value. + * If process is waiting for space, + * wakeup/selwakeup/signal. If data + * are ready to send, let tcp_output + * decide between more output or persist. + */ + if (tp->snd_una == tp->snd_max) + tp->t_timer[TCPT_REXMT] = 0; + else if (tp->t_timer[TCPT_PERSIST] == 0) + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + + if (so->so_snd.sb_flags & SB_NOTIFY) + sowwakeup(so); + if (so->so_snd.sb_cc) + (void) tcp_output(tp); + return; + } + } else if (ti->ti_ack == tp->snd_una && + tp->seg_next == (struct tcpiphdr *)tp && + ti->ti_len <= sbspace(&so->so_rcv)) { + /* + * this is a pure, in-sequence data packet + * with nothing on the reassembly queue and + * we have enough buffer space to take it. + */ + ++tcpstat.tcps_preddat; + tp->rcv_nxt += ti->ti_len; + tcpstat.tcps_rcvpack++; + tcpstat.tcps_rcvbyte += ti->ti_len; + /* + * Add data to socket buffer. + */ + sbappend(&so->so_rcv, m); + sorwakeup(so); +#ifdef TCP_ACK_HACK + /* + * If this is a short packet, then ACK now - with Nagel + * congestion avoidance sender won't send more until + * he gets an ACK. + */ + if (tiflags & TH_PUSH) { + tp->t_flags |= TF_ACKNOW; + tcp_output(tp); + } else { + tp->t_flags |= TF_DELACK; + } +#else + tp->t_flags |= TF_DELACK; +#endif + return; + } + } + + /* + * Calculate amount of space in receive window, + * and then do TCP input processing. + * Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + { int win; + + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + } + + switch (tp->t_state) { + + /* + * If the state is LISTEN then ignore segment if it contains an RST. + * If the segment contains an ACK then it is bad and send a RST. + * If it does not contain a SYN then it is not interesting; drop it. + * If it is from this socket, drop it, it must be forged. + * Don't bother responding if the destination was a broadcast. + * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial + * tp->iss, and send a segment: + * + * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. + * Fill in remote peer address fields if not previously specified. + * Enter SYN_RECEIVED state, and process any other fields of this + * segment in this state. + */ + case TCPS_LISTEN: { + struct mbuf *am; + register struct sockaddr_in *sin; + + if (tiflags & TH_RST) + goto drop; + if (tiflags & TH_ACK) + goto dropwithreset; + if ((tiflags & TH_SYN) == 0) + goto drop; + if ((ti->ti_dport == ti->ti_sport) && + (ti->ti_dst.s_addr == ti->ti_src.s_addr)) + goto drop; + /* + * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN + * in_broadcast() should never return true on a received + * packet with M_BCAST not set. 
+ */ + if (m->m_flags & (M_BCAST|M_MCAST) || + IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) + goto drop; + am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ + if (am == NULL) + goto drop; + am->m_len = sizeof (struct sockaddr_in); + sin = mtod(am, struct sockaddr_in *); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ti->ti_src; + sin->sin_port = ti->ti_sport; + bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); + laddr = inp->inp_laddr; + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = ti->ti_dst; + if (in_pcbconnect(inp, am)) { + inp->inp_laddr = laddr; + (void) m_free(am); + goto drop; + } + (void) m_free(am); + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + tp = tcp_drop(tp, ENOBUFS); + dropsocket = 0; /* socket is already gone */ + goto drop; + } + if ((taop = tcp_gettaocache(inp)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + tcp_dooptions(tp, optp, optlen, ti, &to); + if (iss) + tp->iss = iss; + else + tp->iss = tcp_iss; + tcp_iss += TCP_ISSINCR/4; + tp->irs = ti->ti_seq; + tcp_sendseqinit(tp); + tcp_rcvseqinit(tp); + /* + * Initialization of the tcpcb for transaction; + * set SND.WND = SEG.WND, + * initialize CCsend and CCrecv. + */ + tp->snd_wnd = tiwin; /* initial send-window */ + tp->cc_send = CC_INC(tcp_ccgen); + tp->cc_recv = to.to_cc; + /* + * Perform TAO test on incoming CC (SEG.CC) option, if any. + * - compare SEG.CC against cached CC from the same host, + * if any. + * - if SEG.CC > chached value, SYN must be new and is accepted + * immediately: save new CC in the cache, mark the socket + * connected, enter ESTABLISHED state, turn on flag to + * send a SYN in the next segment. + * A virtual advertised window is set in rcv_adv to + * initialize SWS prevention. Then enter normal segment + * processing: drop SYN, process data and FIN. + * - otherwise do a normal 3-way handshake. + */ + if ((to.to_flags & TOF_CC) != 0) { + if (taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { + taop->tao_cc = to.to_cc; + tp->t_state = TCPS_ESTABLISHED; + + /* + * If there is a FIN, or if there is data and the + * connection is local, then delay SYN,ACK(SYN) in + * the hope of piggy-backing it on a response + * segment. Otherwise must send ACK now in case + * the other side is slow starting. + */ + if ((tiflags & TH_FIN) || (ti->ti_len != 0 && + in_localaddr(inp->inp_faddr))) + tp->t_flags |= (TF_DELACK | TF_NEEDSYN); + else + tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); + + /* + * Limit the `virtual advertised window' to TCP_MAXWIN + * here. Even if we requested window scaling, it will + * become effective only later when our SYN is acked. + */ + tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN); + tcpstat.tcps_connects++; + soisconnected(so); + tp->t_timer[TCPT_KEEP] = tcp_keepinit; + dropsocket = 0; /* committed to socket */ + tcpstat.tcps_accepts++; + goto trimthenstep6; + } + /* else do standard 3-way handshake */ + } else { + /* + * No CC option, but maybe CC.NEW: + * invalidate cached value. + */ + taop->tao_cc = 0; + } + /* + * TAO test failed or there was no CC option, + * do a standard 3-way handshake. + */ + tp->t_flags |= TF_ACKNOW; + tp->t_state = TCPS_SYN_RECEIVED; + tp->t_timer[TCPT_KEEP] = tcp_keepinit; + dropsocket = 0; /* committed to socket */ + tcpstat.tcps_accepts++; + goto trimthenstep6; + } + + /* + * If the state is SYN_RECEIVED: + * if seg contains SYN/ACK, send a RST. + * if seg contains an ACK, but not for our SYN/ACK, send a RST. 
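The ACK-validation test used in the SYN_RECEIVED case that follows, SEQ_LEQ(ti->ti_ack, tp->snd_una) || SEQ_GT(ti->ti_ack, tp->snd_max), and every other sequence comparison in this file rely on signed 32-bit differences so that wraparound of the sequence space is handled transparently. A self-contained sketch of the idiom (the macros here are local copies for illustration; the real ones live in tcp_seq.h):

#include <stdio.h>
#include <stdint.h>

typedef uint32_t tcp_seq;

/* "less than" / "greater than" in circular 32-bit sequence space */
#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)
#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

int
main(void)
{
	tcp_seq snd_una = 0xfffffff0u;	/* oldest unacknowledged byte, near the wrap */
	tcp_seq snd_max = 0x00000010u;	/* highest byte sent, after wrapping */
	tcp_seq ack = 0x00000008u;

	/* acceptable ACKs lie in (snd_una, snd_max], even across the wrap */
	if (SEQ_LEQ(ack, snd_una) || SEQ_GT(ack, snd_max))
		printf("ack rejected\n");
	else
		printf("ack accepted\n");	/* printed: 0x8 is inside the window */
	return 0;
}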
+ */ + case TCPS_SYN_RECEIVED: + if (tiflags & TH_ACK) { + if (tiflags & TH_SYN) { + tcpstat.tcps_badsyn++; + goto dropwithreset; + } + if (SEQ_LEQ(ti->ti_ack, tp->snd_una) || + SEQ_GT(ti->ti_ack, tp->snd_max)) + goto dropwithreset; + } + break; + + /* + * If the state is SYN_SENT: + * if seg contains an ACK, but not for our SYN, drop the input. + * if seg contains a RST, then drop the connection. + * if seg does not contain SYN, then drop it. + * Otherwise this is an acceptable SYN segment + * initialize tp->rcv_nxt and tp->irs + * if seg contains ack then advance tp->snd_una + * if SYN has been acked change to ESTABLISHED else SYN_RCVD state + * arrange for segment to be acked (eventually) + * continue processing rest of data/controls, beginning with URG + */ + case TCPS_SYN_SENT: + if ((taop = tcp_gettaocache(inp)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + if ((tiflags & TH_ACK) && + (SEQ_LEQ(ti->ti_ack, tp->iss) || + SEQ_GT(ti->ti_ack, tp->snd_max))) { + /* + * If we have a cached CCsent for the remote host, + * hence we haven't just crashed and restarted, + * do not send a RST. This may be a retransmission + * from the other side after our earlier ACK was lost. + * Our new SYN, when it arrives, will serve as the + * needed ACK. + */ + if (taop->tao_ccsent != 0) + goto drop; + else + goto dropwithreset; + } + if (tiflags & TH_RST) { + if (tiflags & TH_ACK) + tp = tcp_drop(tp, ECONNREFUSED); + goto drop; + } + if ((tiflags & TH_SYN) == 0) + goto drop; + tp->snd_wnd = ti->ti_win; /* initial send window */ + tp->cc_recv = to.to_cc; /* foreign CC */ + + tp->irs = ti->ti_seq; + tcp_rcvseqinit(tp); + if (tiflags & TH_ACK) { + /* + * Our SYN was acked. If segment contains CC.ECHO + * option, check it to make sure this segment really + * matches our SYN. If not, just drop it as old + * duplicate, but send an RST if we're still playing + * by the old rules. If no CC.ECHO option, make sure + * we don't get fooled into using T/TCP. + */ + if (to.to_flags & TOF_CCECHO) { + if (tp->cc_send != to.to_ccecho) { + if (taop->tao_ccsent != 0) + goto drop; + else + goto dropwithreset; + } + } else + tp->t_flags &= ~TF_RCVD_CC; + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling on this connection? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* Segment is acceptable, update cache if undefined. */ + if (taop->tao_ccsent == 0) + taop->tao_ccsent = to.to_ccecho; + + tp->rcv_adv += tp->rcv_wnd; + tp->snd_una++; /* SYN is acked */ + /* + * If there's data, delay ACK; if there's also a FIN + * ACKNOW will be turned on later. + */ + if (ti->ti_len != 0) + tp->t_flags |= TF_DELACK; + else + tp->t_flags |= TF_ACKNOW; + /* + * Received in SYN_SENT[*] state. + * Transitions: + * SYN_SENT --> ESTABLISHED + * SYN_SENT* --> FIN_WAIT_1 + */ + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + tiflags &= ~TH_SYN; + } else { + tp->t_state = TCPS_ESTABLISHED; + tp->t_timer[TCPT_KEEP] = tcp_keepidle; + } + } else { + /* + * Received initial SYN in SYN-SENT[*] state => simul- + * taneous open. If segment contains CC option and there is + * a cached CC, apply TAO test; if it succeeds, connection is + * half-synchronized. Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED + * SYN-SENT* -> SYN-RECEIVED* + * If there was no CC option, clear cached CC value. 
+ */ + tp->t_flags |= TF_ACKNOW; + tp->t_timer[TCPT_REXMT] = 0; + if (to.to_flags & TOF_CC) { + if (taop->tao_cc != 0 && + CC_GT(to.to_cc, taop->tao_cc)) { + /* + * update cache and make transition: + * SYN-SENT -> ESTABLISHED* + * SYN-SENT* -> FIN-WAIT-1* + */ + taop->tao_cc = to.to_cc; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + tp->t_timer[TCPT_KEEP] = tcp_keepidle; + } + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_state = TCPS_SYN_RECEIVED; + } else { + /* CC.NEW or no option => invalidate cache */ + taop->tao_cc = 0; + tp->t_state = TCPS_SYN_RECEIVED; + } + } + +trimthenstep6: + /* + * Advance ti->ti_seq to correspond to first data byte. + * If data, trim to stay within window, + * dropping FIN if necessary. + */ + ti->ti_seq++; + if (ti->ti_len > tp->rcv_wnd) { + todrop = ti->ti_len - tp->rcv_wnd; + m_adj(m, -todrop); + ti->ti_len = tp->rcv_wnd; + tiflags &= ~TH_FIN; + tcpstat.tcps_rcvpackafterwin++; + tcpstat.tcps_rcvbyteafterwin += todrop; + } + tp->snd_wl1 = ti->ti_seq - 1; + tp->rcv_up = ti->ti_seq; + /* + * Client side of transaction: already sent SYN and data. + * If the remote host used T/TCP to validate the SYN, + * our data will be ACK'd; if so, enter normal data segment + * processing in the middle of step 5, ack processing. + * Otherwise, goto step 6. + */ + if (tiflags & TH_ACK) + goto process_ACK; + goto step6; + /* + * If the state is LAST_ACK or CLOSING or TIME_WAIT: + * if segment contains a SYN and CC [not CC.NEW] option: + * if state == TIME_WAIT and connection duration > MSL, + * drop packet and send RST; + * + * if SEG.CC > CCrecv then is new SYN, and can implicitly + * ack the FIN (and data) in retransmission queue. + * Complete close and delete TCPCB. Then reprocess + * segment, hoping to find new TCPCB in LISTEN state; + * + * else must be old SYN; drop it. + * else do normal processing. + */ + case TCPS_LAST_ACK: + case TCPS_CLOSING: + case TCPS_TIME_WAIT: + if ((tiflags & TH_SYN) && + (to.to_flags & TOF_CC) && tp->cc_recv != 0) { + if (tp->t_state == TCPS_TIME_WAIT && + tp->t_duration > TCPTV_MSL) + goto dropwithreset; + if (CC_GT(to.to_cc, tp->cc_recv)) { + tp = tcp_close(tp); + goto findpcb; + } + else + goto drop; + } + break; /* continue normal processing */ + } + + /* + * States other than LISTEN or SYN_SENT. + * First check timestamp, if present. + * Then check the connection count, if present. + * Then check that at least some bytes of segment are within + * receive window. If segment begins before rcv_nxt, + * drop leading data (and SYN); if nothing left, just ack. + * + * RFC 1323 PAWS: If we have a timestamp reply on this segment + * and it's less than ts_recent, drop it. + */ + if ((to.to_flags & TOF_TS) != 0 && (tiflags & TH_RST) == 0 && + tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { + + /* Check to see if ts_recent is over 24 days old. */ + if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates + * ts_recent, the age will be reset later and ts_recent + * will get a valid value. If it does not, setting + * ts_recent to zero will at least satisfy the + * requirement that zero be placed in the timestamp + * echo reply when ts_recent isn't valid. The + * age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be + * dropped when ts_recent is old. 
+ */ + tp->ts_recent = 0; + } else { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += ti->ti_len; + tcpstat.tcps_pawsdrop++; + goto dropafterack; + } + } + + /* + * T/TCP mechanism + * If T/TCP was negotiated and the segment doesn't have CC, + * or if it's CC is wrong then drop the segment. + * RST segments do not have to comply with this. + */ + if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && + ((to.to_flags & TOF_CC) == 0 || tp->cc_recv != to.to_cc) && + (tiflags & TH_RST) == 0) + goto dropafterack; + + todrop = tp->rcv_nxt - ti->ti_seq; + if (todrop > 0) { + if (tiflags & TH_SYN) { + tiflags &= ~TH_SYN; + ti->ti_seq++; + if (ti->ti_urp > 1) + ti->ti_urp--; + else + tiflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > ti->ti_len + || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + tiflags &= ~TH_FIN; + + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. + */ + tp->t_flags |= TF_ACKNOW; + todrop = ti->ti_len; + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += todrop; + } else { + tcpstat.tcps_rcvpartduppack++; + tcpstat.tcps_rcvpartdupbyte += todrop; + } + m_adj(m, todrop); + ti->ti_seq += todrop; + ti->ti_len -= todrop; + if (ti->ti_urp > todrop) + ti->ti_urp -= todrop; + else { + tiflags &= ~TH_URG; + ti->ti_urp = 0; + } + } + + /* + * If new data are received on a connection after the + * user processes are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && + tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { + tp = tcp_close(tp); + tcpstat.tcps_rcvafterclose++; + goto dropwithreset; + } + + /* + * If segment ends after window, drop trailing data + * (and PUSH and FIN); if nothing left, just ACK. + */ + todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); + if (todrop > 0) { + tcpstat.tcps_rcvpackafterwin++; + if (todrop >= ti->ti_len) { + tcpstat.tcps_rcvbyteafterwin += ti->ti_len; + /* + * If a new connection request is received + * while in TIME_WAIT, drop the old connection + * and start over if the sequence numbers + * are above the previous ones. + */ + if (tiflags & TH_SYN && + tp->t_state == TCPS_TIME_WAIT && + SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { + iss = tp->rcv_nxt + TCP_ISSINCR; + tp = tcp_close(tp); + goto findpcb; + } + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment + * and ack. + */ + if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_rcvwinprobe++; + } else + goto dropafterack; + } else + tcpstat.tcps_rcvbyteafterwin += todrop; + m_adj(m, -todrop); + ti->ti_len -= todrop; + tiflags &= ~(TH_PUSH|TH_FIN); + } + + /* + * If last ACK falls within this segment's sequence numbers, + * record its timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flags & TOF_TS) != 0 && + SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { + tp->ts_recent_age = tcp_now; + tp->ts_recent = to.to_tsval; + } + + /* + * If the RST bit is set examine the state: + * SYN_RECEIVED STATE: + * If passive open, return to LISTEN state. + * If active open, inform user that connection was refused. 
+ * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: + * Inform user that connection was reset, and close tcb. + * CLOSING, LAST_ACK, TIME_WAIT STATES + * Close the tcb. + */ + if (tiflags&TH_RST) switch (tp->t_state) { + + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + tp->t_state = TCPS_CLOSED; + tcpstat.tcps_drops++; + tp = tcp_close(tp); + goto drop; + + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + tp = tcp_close(tp); + goto drop; + } + + /* + * If a SYN is in the window, then this is an + * error and we send an RST and drop the connection. + */ + if (tiflags & TH_SYN) { + tp = tcp_drop(tp, ECONNRESET); + goto dropwithreset; + } + + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN + * flag is on (half-synchronized state), then queue data for + * later processing; else drop segment and return. + */ + if ((tiflags & TH_ACK) == 0) { + if (tp->t_state == TCPS_SYN_RECEIVED || + (tp->t_flags & TF_NEEDSYN)) + goto step6; + else + goto drop; + } + + /* + * Ack processing. + */ + switch (tp->t_state) { + + /* + * In SYN_RECEIVED state, the ack ACKs our SYN, so enter + * ESTABLISHED state and continue processing. + * The ACK was checked above. + */ + case TCPS_SYN_RECEIVED: + + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* + * Upon successful completion of 3-way handshake, + * update cache.CC if it was undefined, pass any queued + * data to the user, and advance state appropriately. + */ + if ((taop = tcp_gettaocache(inp)) != NULL && + taop->tao_cc == 0) + taop->tao_cc = tp->cc_recv; + + /* + * Make transitions: + * SYN-RECEIVED -> ESTABLISHED + * SYN-RECEIVED* -> FIN-WAIT-1 + */ + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + tp->t_timer[TCPT_KEEP] = tcp_keepidle; + } + /* + * If segment contains data or ACK, will call tcp_reass() + * later; if not, do so now to pass queued data to user. + */ + if (ti->ti_len == 0 && (tiflags & TH_FIN) == 0) + (void) tcp_reass(tp, (struct tcpiphdr *)0, + (struct mbuf *)0); + tp->snd_wl1 = ti->ti_seq - 1; + /* fall into ... */ + + /* + * In ESTABLISHED state: drop duplicate ACKs; ACK out of range + * ACKs. If the ack is in the range + * tp->snd_una < ti->ti_ack <= tp->snd_max + * then advance tp->snd_una to ti->ti_ack and drop + * data from the retransmission queue. If this ACK reflects + * more up to date window information we update our window information. + */ + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + + if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { + if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { + tcpstat.tcps_rcvdupack++; + /* + * If we have outstanding data (other than + * a window probe), this is a completely + * duplicate ack (ie, window info didn't + * change), the ack is the biggest we've + * seen and we've seen exactly our rexmt + * threshhold of them, assume a packet + * has been dropped and retransmit it. + * Kludge snd_nxt & the congestion + * window so we send only this one + * packet. 
+ * + * We know we're losing at the current + * window size so do congestion avoidance + * (set ssthresh to half the current window + * and pull our congestion window back to + * the new ssthresh). + * + * Dup acks mean that packets have left the + * network (they're now cached at the receiver) + * so bump cwnd by the amount in the receiver + * to keep a constant cwnd packets in the + * network. + */ + if (tp->t_timer[TCPT_REXMT] == 0 || + ti->ti_ack != tp->snd_una) + tp->t_dupacks = 0; + else if (++tp->t_dupacks == tcprexmtthresh) { + tcp_seq onxt = tp->snd_nxt; + u_int win = + min(tp->snd_wnd, tp->snd_cwnd) / 2 / + tp->t_maxseg; + + if (win < 2) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_timer[TCPT_REXMT] = 0; + tp->t_rtt = 0; + tp->snd_nxt = ti->ti_ack; + tp->snd_cwnd = tp->t_maxseg; + (void) tcp_output(tp); + tp->snd_cwnd = tp->snd_ssthresh + + tp->t_maxseg * tp->t_dupacks; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + goto drop; + } else if (tp->t_dupacks > tcprexmtthresh) { + tp->snd_cwnd += tp->t_maxseg; + (void) tcp_output(tp); + goto drop; + } + } else + tp->t_dupacks = 0; + break; + } + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + if (tp->t_dupacks >= tcprexmtthresh && + tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + if (SEQ_GT(ti->ti_ack, tp->snd_max)) { + tcpstat.tcps_rcvacktoomuch++; + goto dropafterack; + } + /* + * If we reach this point, ACK is not a duplicate, + * i.e., it ACKs something we sent. + */ + if (tp->t_flags & TF_NEEDSYN) { + /* + * T/TCP: Connection was half-synchronized, and our + * SYN has been ACK'd (so connection is now fully + * synchronized). Go to non-starred state, + * increment snd_una for ACK of SYN, and check if + * we can do window scaling. + */ + tp->t_flags &= ~TF_NEEDSYN; + tp->snd_una++; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + } + +process_ACK: + acked = ti->ti_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + + /* + * If we have a timestamp reply, update smoothed + * round trip time. If no timestamp is present but + * transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. + * Since we now have an rtt measurement, cancel the + * timer backoff (cf., Phil Karn's retransmit alg.). + * Recompute the initial retransmit timer. + */ + if (to.to_flags & TOF_TS) + tcp_xmit_timer(tp, tcp_now - to.to_tsecr + 1); + else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) + tcp_xmit_timer(tp,tp->t_rtt); + + /* + * If all outstanding data is acked, stop retransmit + * timer and remember to restart (more output or persist). + * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value. + */ + if (ti->ti_ack == tp->snd_max) { + tp->t_timer[TCPT_REXMT] = 0; + needoutput = 1; + } else if (tp->t_timer[TCPT_PERSIST] == 0) + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + + /* + * If no data (only SYN) was ACK'd, + * skip rest of ACK processing. + */ + if (acked == 0) + goto step6; + + /* + * When new data is acked, open the congestion window. + * If the window gives us less than ssthresh packets + * in flight, open exponentially (maxseg per packet). + * Otherwise open linearly: maxseg per window + * (maxseg^2 / cwnd per packet). 
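With concrete numbers, the duplicate-ACK cut described above and the per-ACK window opening that follows look like this (made-up values, an illustration of the formulas only, not kernel code):

#include <stdio.h>

int
main(void)
{
	unsigned int t_maxseg = 1460;			/* segment size */
	unsigned int snd_wnd = 65535, snd_cwnd = 65535;	/* fully opened window */
	unsigned int snd_ssthresh, win, cw, incr;

	/* third duplicate ACK: halve the flight size (in segments) for ssthresh */
	win = (snd_wnd < snd_cwnd ? snd_wnd : snd_cwnd) / 2 / t_maxseg;
	if (win < 2)
		win = 2;
	snd_ssthresh = win * t_maxseg;		/* 22 segments -> 32120 bytes */
	snd_cwnd = t_maxseg;			/* retransmit exactly one segment */
	printf("ssthresh=%u cwnd=%u\n", snd_ssthresh, snd_cwnd);

	/* later, each arriving ACK opens the window again */
	cw = snd_cwnd;
	incr = t_maxseg;
	if (cw > snd_ssthresh)			/* congestion avoidance: linear */
		incr = incr * incr / cw;	/* maxseg^2 / cwnd per ACK */
	snd_cwnd = cw + incr;			/* below ssthresh: exponential, 1460 -> 2920 */
	printf("cwnd after one ACK=%u\n", snd_cwnd);
	return 0;
}

(The kernel additionally re-inflates cwnd by one maxseg per further duplicate ACK and caps the opened window at TCP_MAXWIN shifted by the send scale.)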
+ */ + { + register u_int cw = tp->snd_cwnd; + register u_int incr = tp->t_maxseg; + + if (cw > tp->snd_ssthresh) + incr = incr * incr / cw; + tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<snd_scale); + } + if (acked > so->so_snd.sb_cc) { + tp->snd_wnd -= so->so_snd.sb_cc; + sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); + ourfinisacked = 1; + } else { + sbdrop(&so->so_snd, acked); + tp->snd_wnd -= acked; + ourfinisacked = 0; + } + if (so->so_snd.sb_flags & SB_NOTIFY) + sowwakeup(so); + tp->snd_una = ti->ti_ack; + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + + switch (tp->t_state) { + + /* + * In FIN_WAIT_1 STATE in addition to the processing + * for the ESTABLISHED state if our FIN is now acknowledged + * then enter FIN_WAIT_2. + */ + case TCPS_FIN_WAIT_1: + if (ourfinisacked) { + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + */ + if (so->so_state & SS_CANTRCVMORE) { + soisdisconnected(so); + tp->t_timer[TCPT_2MSL] = tcp_maxidle; + } + tp->t_state = TCPS_FIN_WAIT_2; + } + break; + + /* + * In CLOSING STATE in addition to the processing for + * the ESTABLISHED state if the ACK acknowledges our FIN + * then enter the TIME-WAIT state, otherwise ignore + * the segment. + */ + case TCPS_CLOSING: + if (ourfinisacked) { + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + tp->t_duration < TCPTV_MSL) + tp->t_timer[TCPT_2MSL] = + tp->t_rxtcur * TCPTV_TWTRUNC; + else + tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + soisdisconnected(so); + } + break; + + /* + * In LAST_ACK, we may still be waiting for data to drain + * and/or to be acked, as well as for the ack of our FIN. + * If our FIN is now acknowledged, delete the TCB, + * enter the closed state and return. + */ + case TCPS_LAST_ACK: + if (ourfinisacked) { + tp = tcp_close(tp); + goto drop; + } + break; + + /* + * In TIME_WAIT state the only thing that should arrive + * is a retransmission of the remote FIN. Acknowledge + * it and restart the finack timer. + */ + case TCPS_TIME_WAIT: + tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + goto dropafterack; + } + } + +step6: + /* + * Update window information. + * Don't look at window if no ACK: TAC's send garbage on first SYN. + */ + if ((tiflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, ti->ti_seq) || + (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) || + (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (ti->ti_len == 0 && + tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) + tcpstat.tcps_rcvwinupd++; + tp->snd_wnd = tiwin; + tp->snd_wl1 = ti->ti_seq; + tp->snd_wl2 = ti->ti_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + needoutput = 1; + } + + /* + * Process segments with URG. + */ + if ((tiflags & TH_URG) && ti->ti_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept + * random urgent pointers, we'll crash in + * soreceive. It's hard to imagine someone + * actually wanting to send this much urgent data. + */ + if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) { + ti->ti_urp = 0; /* XXX */ + tiflags &= ~TH_URG; /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, + * then mark the data stream. 
This should not happen + * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since + * a FIN has been received from the remote side. + * In these states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { + tp->rcv_up = ti->ti_seq + ti->ti_urp; + so->so_oobmark = so->so_rcv.sb_cc + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_state |= SS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. + */ + if (ti->ti_urp <= (u_long)ti->ti_len +#ifdef SO_OOBINLINE + && (so->so_options & SO_OOBINLINE) == 0 +#endif + ) + tcp_pulloutofband(so, ti, m); + } else + /* + * If no out of band data is expected, + * pull receive urgent pointer along + * with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; +dodata: /* XXX */ + + /* + * Process the segment text, merging it into the TCP sequencing queue, + * and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data + * is presented to the user (this happens in tcp_usrreq.c, + * case PRU_RCVD). If a FIN has already been received on this + * connection then we just ignore the text. + */ + if ((ti->ti_len || (tiflags&TH_FIN)) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + TCP_REASS(tp, ti, m, so, tiflags); + /* + * Note the amount of data that peer has sent into + * our window, in order to estimate the sender's + * buffer size. + */ + len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); + } else { + m_freem(m); + tiflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know + * that the connection is closing. + */ + if (tiflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. + * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (tp->t_flags & TF_NEEDSYN) + tp->t_flags |= TF_DELACK; + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES + * enter the CLOSE_WAIT state. + */ + case TCPS_SYN_RECEIVED: + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been acked so + * enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the other + * standard timers. + */ + case TCPS_FIN_WAIT_2: + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + tp->t_duration < TCPTV_MSL) { + tp->t_timer[TCPT_2MSL] = + tp->t_rxtcur * TCPTV_TWTRUNC; + /* For transaction client, force ACK now. 
*/ + tp->t_flags |= TF_ACKNOW; + } + else + tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + soisdisconnected(so); + break; + + /* + * In TIME_WAIT state restart the 2 MSL time_wait timer. + */ + case TCPS_TIME_WAIT: + tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + break; + } + } +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0); +#endif + + /* + * Return any desired output. + */ + if (needoutput || (tp->t_flags & TF_ACKNOW)) + (void) tcp_output(tp); + return; + +dropafterack: + /* + * Generate an ACK dropping incoming segment if it occupies + * sequence space, where the ACK reflects our state. + */ + if (tiflags & TH_RST) + goto drop; +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); +#endif + m_freem(m); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + return; + +dropwithreset: + /* + * Generate a RST, dropping incoming segment. + * Make ACK acceptable to originator of segment. + * Don't bother to respond if destination was broadcast/multicast. + */ + if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) || + IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) + goto drop; +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); +#endif + if (tiflags & TH_ACK) + tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); + else { + if (tiflags & TH_SYN) + ti->ti_len++; + tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, + TH_RST|TH_ACK); + } + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; + +drop: + /* + * Drop space held by incoming segment and return. + */ +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); +#endif + m_freem(m); + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; +#ifndef TUBA_INCLUDE +} + +static void +tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcpiphdr *ti, + struct tcpopt *to) +{ + u_short mss = 0; + int opt, optlen; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + optlen = cp[1]; + if (optlen <= 0) + break; + } + switch (opt) { + + default: + continue; + + case TCPOPT_MAXSEG: + if (optlen != TCPOLEN_MAXSEG) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); + NTOHS(mss); + break; + + case TCPOPT_WINDOW: + if (optlen != TCPOLEN_WINDOW) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + tp->t_flags |= TF_RCVD_SCALE; + tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); + break; + + case TCPOPT_TIMESTAMP: + if (optlen != TCPOLEN_TIMESTAMP) + continue; + to->to_flags |= TOF_TS; + bcopy((char *)cp + 2, + (char *)&to->to_tsval, sizeof(to->to_tsval)); + NTOHL(to->to_tsval); + bcopy((char *)cp + 6, + (char *)&to->to_tsecr, sizeof(to->to_tsecr)); + NTOHL(to->to_tsecr); + + /* + * A timestamp received in a SYN makes + * it ok to send timestamp requests and replies. 
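The parsing loop above walks the option block one kind/length/value triple at a time; EOL ends the scan, NOP is a single pad byte, and any non-positive length aborts. A minimal standalone walk over a made-up option block (MSS 1460, a NOP pad, window scale 7, then EOL):

#include <stdio.h>

int
main(void)
{
	/* kind/len/value triples, network byte order for the MSS value */
	unsigned char opts[] = { 2, 4, 0x05, 0xb4, 1, 3, 3, 7, 0 };
	int cnt = sizeof(opts);
	unsigned char *cp = opts;
	int opt, optlen;

	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == 0)			/* TCPOPT_EOL */
			break;
		if (opt == 1) {			/* TCPOPT_NOP: one pad byte */
			optlen = 1;
			continue;
		}
		optlen = cp[1];
		if (optlen <= 0)
			break;
		if (opt == 2 && optlen == 4)	/* TCPOPT_MAXSEG */
			printf("MSS %d\n", (cp[2] << 8) | cp[3]);	/* 1460 */
		else if (opt == 3 && optlen == 3)	/* TCPOPT_WINDOW */
			printf("window scale %d\n", cp[2]);		/* 7 */
	}
	return 0;
}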
+ */ + if (ti->ti_flags & TH_SYN) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = to->to_tsval; + tp->ts_recent_age = tcp_now; + } + break; + case TCPOPT_CC: + if (optlen != TCPOLEN_CC) + continue; + to->to_flags |= TOF_CC; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + NTOHL(to->to_cc); + /* + * A CC or CC.new option received in a SYN makes + * it ok to send CC in subsequent segments. + */ + if (ti->ti_flags & TH_SYN) + tp->t_flags |= TF_RCVD_CC; + break; + case TCPOPT_CCNEW: + if (optlen != TCPOLEN_CC) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + to->to_flags |= TOF_CCNEW; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + NTOHL(to->to_cc); + /* + * A CC or CC.new option received in a SYN makes + * it ok to send CC in subsequent segments. + */ + tp->t_flags |= TF_RCVD_CC; + break; + case TCPOPT_CCECHO: + if (optlen != TCPOLEN_CC) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + to->to_flags |= TOF_CCECHO; + bcopy((char *)cp + 2, + (char *)&to->to_ccecho, sizeof(to->to_ccecho)); + NTOHL(to->to_ccecho); + break; + } + } + if (ti->ti_flags & TH_SYN) + tcp_mss(tp, mss); /* sets t_maxseg */ +} + +/* + * Pull out of band byte out of a segment so + * it doesn't appear in the user's data queue. + * It is still reflected in the segment length for + * sequencing purposes. + */ +static void +tcp_pulloutofband(struct socket *so, struct tcpiphdr *ti, struct mbuf *m) +{ + int cnt = ti->ti_urp - 1; + + while (cnt >= 0) { + if (m->m_len > cnt) { + char *cp = mtod(m, caddr_t) + cnt; + struct tcpcb *tp = sototcpcb(so); + + tp->t_iobc = *cp; + tp->t_oobflags |= TCPOOB_HAVEDATA; + bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); + m->m_len--; + return; + } + cnt -= m->m_len; + m = m->m_next; + if (m == 0) + break; + } + panic("tcp_pulloutofband"); +} + +/* + * Collect new round-trip time estimate + * and update averages and current timeout. + */ +static void +tcp_xmit_timer(struct tcpcb *tp, int rtt) +{ + int delta; + + tcpstat.tcps_rttupdated++; + tp->t_rttupdated++; + if (tp->t_srtt != 0) { + /* + * srtt is stored as fixed point with 5 bits after the + * binary point (i.e., scaled by 8). The following magic + * is equivalent to the smoothing algorithm in rfc793 with + * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed + * point). Adjust rtt to origin 0. + */ + delta = ((rtt - 1) << TCP_DELTA_SHIFT) + - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); + + if ((tp->t_srtt += delta) <= 0) + tp->t_srtt = 1; + + /* + * We accumulate a smoothed rtt variance (actually, a + * smoothed mean difference), then set the retransmit + * timer to smoothed rtt + 4 times the smoothed variance. + * rttvar is stored as fixed point with 4 bits after the + * binary point (scaled by 16). The following is + * equivalent to rfc793 smoothing with an alpha of .75 + * (rttvar = rttvar*3/4 + |delta| / 4). This replaces + * rfc793's wired-in beta. + */ + if (delta < 0) + delta = -delta; + delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); + if ((tp->t_rttvar += delta) <= 0) + tp->t_rttvar = 1; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. + * Set the variance to half the rtt (so our first + * retransmit happens at 3*rtt). + */ + tp->t_srtt = rtt << TCP_RTT_SHIFT; + tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + } + tp->t_rtt = 0; + tp->t_rxtshift = 0; + + /* + * the retransmit should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. 
When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + */ + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + + /* + * We received an ack for a packet that wasn't retransmitted; + * it is probably safe to discard any error indications we've + * received recently. This isn't quite right, but close enough + * for now (a route might have failed after we sent a segment, + * and the return path might not be symmetrical). + */ + tp->t_softerror = 0; +} + +/* + * Determine a reasonable value for maxseg size. + * If the route is known, check route for mtu. + * If none, use an mss that can be handled on the outgoing + * interface without forcing IP to fragment; if bigger than + * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES + * to utilize large mbufs. If no route is found, route has no mtu, + * or the destination isn't local, use a default, hopefully conservative + * size (usually 512 or the default IP max size, but no more than the mtu + * of the interface), as we can't discover anything about intervening + * gateways or networks. We also initialize the congestion/slow start + * window to be a single segment if the destination isn't local. + * While looking at the routing entry, we also initialize other path-dependent + * parameters from pre-set or cached values in the routing entry. + * + * Also take into account the space needed for options that we + * send regularly. Make maxseg shorter by that amount to assure + * that we can send maxseg amount of data even when the options + * are present. Store the upper limit of the length of options plus + * data in maxopd. + * + * NOTE that this routine is only called when we process an incoming + * segment, for outgoing segments only tcp_mssopt is called. + * + * In case of T/TCP, we call this routine during implicit connection + * setup as well (offer = -1), to initialize maxseg from the cached + * MSS of our peer. + */ +void +tcp_mss(struct tcpcb *tp, int offer) +{ + register struct rtentry *rt; + struct ifnet *ifp; + register int rtt, mss; + u_long bufsize; + struct inpcb *inp; + struct socket *so; + struct rmxp_tao *taop; + int origoffer = offer; + + inp = tp->t_inpcb; + if ((rt = tcp_rtlookup(inp)) == NULL) { + tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; + return; + } + ifp = rt->rt_ifp; + so = inp->inp_socket; + + taop = rmx_taop(rt->rt_rmx); + /* + * Offer == -1 means that we didn't receive SYN yet, + * use cached value in that case; + */ + if (offer == -1) + offer = taop->tao_mssopt; + /* + * Offer == 0 means that there was no MSS on the SYN segment, + * in this case we use tcp_mssdflt. + */ + if (offer == 0) + offer = tcp_mssdflt; + else + /* + * Sanity check: make sure that maxopd will be large + * enough to allow some data on segments even is the + * all the option space is used (40bytes). Otherwise + * funny things may happen in tcp_output. + */ + offer = max(offer, 64); + taop->tao_mssopt = offer; + + /* + * While we're here, check if there's an initial rtt + * or rttvar. Convert from the route-table units + * to scaled multiples of the slow timeout timer. 
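The fixed-point smoothing in tcp_xmit_timer() just above can be exercised on its own. In this sketch the shift widths follow the comments in that function (srtt keeps more fractional bits than rttvar); the real constants live in the TCP headers, so the numbers here are illustrative only:

#include <stdio.h>

#define RTT_SHIFT	5	/* srtt fractional bits (sketch values) */
#define RTTVAR_SHIFT	4	/* rttvar fractional bits */
#define DELTA_SHIFT	2

static int t_srtt, t_rttvar;

static void
xmit_timer(int rtt)		/* rtt measured in slow-timeout ticks */
{
	int delta;

	if (t_srtt != 0) {
		/* srtt += (rtt - srtt)/8, in fixed point, origin shifted to 0 */
		delta = ((rtt - 1) << DELTA_SHIFT)
		    - (t_srtt >> (RTT_SHIFT - DELTA_SHIFT));
		if ((t_srtt += delta) <= 0)
			t_srtt = 1;
		/* rttvar += (|delta| - rttvar)/4, also in fixed point */
		if (delta < 0)
			delta = -delta;
		delta -= t_rttvar >> (RTTVAR_SHIFT - DELTA_SHIFT);
		if ((t_rttvar += delta) <= 0)
			t_rttvar = 1;
	} else {
		/* first sample: seed srtt, set the variance to rtt/2 */
		t_srtt = rtt << RTT_SHIFT;
		t_rttvar = rtt << (RTTVAR_SHIFT - 1);
	}
}

int
main(void)
{
	xmit_timer(4);		/* first measurement: 4 ticks */
	xmit_timer(6);		/* a slower sample nudges the average up */
	printf("srtt=%d (~%d ticks) rttvar=%d\n",
	    t_srtt, t_srtt >> RTT_SHIFT, t_rttvar);
	return 0;
}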
+ */ + if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { + /* + * XXX the lock bit for RTT indicates that the value + * is also a minimum value; this is subject to time. + */ + if (rt->rt_rmx.rmx_locks & RTV_RTT) + tp->t_rttmin = rtt / (RTM_RTTUNIT / PR_SLOWHZ); + tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE)); + tcpstat.tcps_usedrtt++; + if (rt->rt_rmx.rmx_rttvar) { + tp->t_rttvar = rt->rt_rmx.rmx_rttvar / + (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE)); + tcpstat.tcps_usedrttvar++; + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + /* + * if there's an mtu associated with the route, use it + */ + if (rt->rt_rmx.rmx_mtu) + mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); + else + { + mss = ifp->if_mtu - sizeof(struct tcpiphdr); + if (!in_localaddr(inp->inp_faddr)) + mss = min(mss, tcp_mssdflt); + } + mss = min(mss, offer); + /* + * maxopd stores the maximum length of data AND options + * in a segment; maxseg is the amount of data in a normal + * segment. We need to store this value (maxopd) apart + * from maxseg, because now every segment carries options + * and thus we normally have somewhat less data in segments. + */ + tp->t_maxopd = mss; + + /* + * In case of T/TCP, origoffer==-1 indicates, that no segments + * were received yet. In this case we just guess, otherwise + * we do the same as before T/TCP. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) + mss -= TCPOLEN_CC_APPA; + +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + /* + * If there's a pipesize, change the socket buffer + * to that size. Make the socket buffers an integral + * number of mss units; if the mss is larger than + * the socket buffer, decrease the mss. + */ +#ifdef RTV_SPIPE + if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) +#endif + bufsize = so->so_snd.sb_hiwat; + if (bufsize < mss) + mss = bufsize; + else { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_snd, bufsize); + } + tp->t_maxseg = mss; + +#ifdef RTV_RPIPE + if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) +#endif + bufsize = so->so_rcv.sb_hiwat; + if (bufsize > mss) { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_rcv, bufsize); + } + /* + * Don't force slow-start on local network. + */ + if (!in_localaddr(inp->inp_faddr)) + tp->snd_cwnd = mss; + + if (rt->rt_rmx.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshhold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); + tcpstat.tcps_usedssthresh++; + } +} + +/* + * Determine the MSS option to send on an outgoing SYN. 
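[Editor's note] One detail of the tcp_mss() computation above that is easy to miss is the rounding of large MSS values down to a multiple of the mbuf cluster size, so that full clusters are used. A standalone sketch, assuming MCLBYTES is 2048:

```c
#include <stdio.h>

#define MCLBYTES 2048			/* assumed mbuf cluster size */

/* Round an MSS down to a multiple of the cluster size, as tcp_mss() does. */
static int
round_mss(int mss)
{
#if (MCLBYTES & (MCLBYTES - 1)) == 0
	if (mss > MCLBYTES)
		mss &= ~(MCLBYTES - 1);	/* power-of-two cluster size */
#else
	if (mss > MCLBYTES)
		mss = mss / MCLBYTES * MCLBYTES;
#endif
	return mss;
}

int
main(void)
{
	printf("%d -> %d\n", 1460, round_mss(1460));	/* below a cluster: unchanged */
	printf("%d -> %d\n", 8960, round_mss(8960));	/* jumbo-sized: 8960 -> 8192 */
	return 0;
}
```

The power-of-two branch is the one normally taken, since cluster sizes are powers of two.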
+ */ +int +tcp_mssopt(struct tcpcb *tp) +{ + struct rtentry *rt; + + rt = tcp_rtlookup(tp->t_inpcb); + if (rt == NULL) + return tcp_mssdflt; + + return rt->rt_ifp->if_mtu - sizeof(struct tcpiphdr); +} +#endif /* TUBA_INCLUDE */ diff --git a/netinet/tcp_output.c b/netinet/tcp_output.c new file mode 100644 index 0000000..0e50329 --- /dev/null +++ b/netinet/tcp_output.c @@ -0,0 +1,757 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opt_tcpdebug.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#define TCPOUTFLAGS +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif + +#ifdef notyet +extern struct mbuf *m_copypack(); +#endif + + +/* + * Tcp output routine: figure out what should be sent and send it. + */ +int +tcp_output( + register struct tcpcb *tp) +{ + register struct socket *so = tp->t_inpcb->inp_socket; + register long len, win; + int off, flags, error; + register struct mbuf *m; + register struct tcpiphdr *ti; + u_char opt[TCP_MAXOLEN]; + unsigned optlen, hdrlen; + int idle, sendalot; + struct rmxp_tao *taop; + struct rmxp_tao tao_noncached; + + /* + * Determine length of data that should be transmitted, + * and flags that will be used. + * If there is some data or critical controls (SYN, RST) + * to send, then transmit; otherwise, investigate further. + */ + idle = (tp->snd_max == tp->snd_una); + if (idle && tp->t_idle >= tp->t_rxtcur) + /* + * We have been idle for "a while" and no acks are + * expected to clock out any data we send -- + * slow start to get ack "clock" running again. 
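[Editor's note] tcp_mssopt() above advertises the outgoing interface MTU minus the combined IP and TCP header, falling back to the default when no route is known. A hypothetical worked case, with a 1500-byte Ethernet MTU and a 40-byte struct tcpiphdr (both assumed values):

```c
#include <stdio.h>

/* Hypothetical figures: Ethernet MTU and the 20+20 byte IP+TCP header. */
#define IF_MTU		1500
#define TCPIPHDR_LEN	40
#define TCP_MSSDFLT	512	/* assumed conservative fallback */

static int
mss_to_advertise(int have_route)
{
	if (!have_route)
		return TCP_MSSDFLT;	/* no route: same fallback as tcp_mssopt() */
	return IF_MTU - TCPIPHDR_LEN;	/* 1500 - 40 = 1460 */
}

int
main(void)
{
	printf("with route: %d, without: %d\n",
	    mss_to_advertise(1), mss_to_advertise(0));
	return 0;
}
```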
+ */ + tp->snd_cwnd = tp->t_maxseg; +again: + sendalot = 0; + off = tp->snd_nxt - tp->snd_una; + win = min(tp->snd_wnd, tp->snd_cwnd); + + flags = tcp_outflags[tp->t_state]; + /* + * Get standard flags, and add SYN or FIN if requested by 'hidden' + * state flags. + */ + if (tp->t_flags & TF_NEEDFIN) + flags |= TH_FIN; + if (tp->t_flags & TF_NEEDSYN) + flags |= TH_SYN; + + /* + * If in persist timeout with window of 0, send 1 byte. + * Otherwise, if window is small but nonzero + * and timer expired, we will send what we can + * and go to transmit state. + */ + if (tp->t_force) { + if (win == 0) { + /* + * If we still have some data to send, then + * clear the FIN bit. Usually this would + * happen below when it realizes that we + * aren't sending all the data. However, + * if we have exactly 1 byte of unset data, + * then it won't clear the FIN bit below, + * and if we are in persist state, we wind + * up sending the packet without recording + * that we sent the FIN bit. + * + * We can't just blindly clear the FIN bit, + * because if we don't have any more data + * to send then the probe will be the FIN + * itself. + */ + if (off < so->so_snd.sb_cc) + flags &= ~TH_FIN; + win = 1; + } else { + tp->t_timer[TCPT_PERSIST] = 0; + tp->t_rxtshift = 0; + } + } + + len = min(so->so_snd.sb_cc, win) - off; + + if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + /* + * Lop off SYN bit if it has already been sent. However, if this + * is SYN-SENT state and if segment contains data and if we don't + * know that foreign host supports TAO, suppress sending segment. + */ + if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { + flags &= ~TH_SYN; + off--, len++; + if (len > 0 && tp->t_state == TCPS_SYN_SENT && + taop->tao_ccsent == 0) + return 0; + } + + /* + * Be careful not to send data and/or FIN on SYN segments + * in cases when no CC option will be sent. + * This measure is needed to prevent interoperability problems + * with not fully conformant TCP implementations. + */ + if ((flags & TH_SYN) && + ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) || + ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) { + len = 0; + flags &= ~TH_FIN; + } + + if (len < 0) { + /* + * If FIN has been sent but not acked, + * but we haven't been called to retransmit, + * len will be -1. Otherwise, window shrank + * after we sent into it. If window shrank to 0, + * cancel pending retransmit, pull snd_nxt back + * to (closed) window, and set the persist timer + * if it isn't already going. If the window didn't + * close completely, just wait for an ACK. + */ + len = 0; + if (win == 0) { + tp->t_timer[TCPT_REXMT] = 0; + tp->t_rxtshift = 0; + tp->snd_nxt = tp->snd_una; + if (tp->t_timer[TCPT_PERSIST] == 0) + tcp_setpersist(tp); + } + } + if (len > tp->t_maxseg) { + len = tp->t_maxseg; + sendalot = 1; + } + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) + flags &= ~TH_FIN; + + win = sbspace(&so->so_rcv); + + /* + * Sender silly window avoidance. If connection is idle + * and can send all data, a maximum segment, + * at least a maximum default-size segment do it, + * or are forced, do it; otherwise don't bother. + * If peer's buffer is tiny, then send + * when window is at least half open. + * If retransmitting (possibly after persist timer forced us + * to send into a small window), then must resend. 
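[Editor's note] The sender-side silly window avoidance rule just described is implemented by the chain of tests that follows; restated as a standalone predicate it reads as below. Parameter names are illustrative and the TF_NOPUSH refinement is omitted.

```c
#include <stdbool.h>
#include <stdio.h>

/*
 * Send only if: we can send a full segment; or we can send everything we
 * have and the connection is idle (or Nagle is off); or we are being
 * forced (persist probe); or the amount is at least half the largest
 * window the peer ever offered; or we are retransmitting.
 */
static bool
should_send(long len, long maxseg, long off, long sb_cc,
    bool idle, bool nodelay, bool forced,
    long max_sndwnd, bool retransmitting)
{
	if (len == 0)
		return false;
	if (len == maxseg)
		return true;
	if ((idle || nodelay) && len + off >= sb_cc)
		return true;
	if (forced)
		return true;
	if (max_sndwnd > 0 && len >= max_sndwnd / 2)
		return true;
	if (retransmitting)
		return true;
	return false;
}

int
main(void)
{
	/* 100 bytes pending, which is everything, idle connection: send. */
	printf("%d\n", should_send(100, 1460, 0, 100, true, false, false, 8192, false));
	/* 100 bytes out of 4000 pending, busy, not forced: hold back. */
	printf("%d\n", should_send(100, 1460, 0, 4000, false, false, false, 8192, false));
	return 0;
}
```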
+ */ + if (len) { + if (len == tp->t_maxseg) + goto send; + if ((idle || tp->t_flags & TF_NODELAY) && + (tp->t_flags & TF_NOPUSH) == 0 && + len + off >= so->so_snd.sb_cc) + goto send; + if (tp->t_force) + goto send; + if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) + goto send; + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + goto send; + } + + /* + * Compare available window to amount of window + * known to peer (as advertised window less + * next expected input). If the difference is at least two + * max size segments, or at least 50% of the maximum possible + * window, then want to send a window update to peer. + */ + if (win > 0) { + /* + * "adv" is the amount we can increase the window, + * taking into account that we are limited by + * TCP_MAXWIN << tp->rcv_scale. + */ + long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - + (tp->rcv_adv - tp->rcv_nxt); + + if (adv >= (long) (2 * tp->t_maxseg)) + goto send; + if (2 * adv >= (long) so->so_rcv.sb_hiwat) + goto send; + } + + /* + * Send if we owe peer an ACK. + */ + if (tp->t_flags & TF_ACKNOW) + goto send; + if ((flags & TH_RST) || + ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) + goto send; + if (SEQ_GT(tp->snd_up, tp->snd_una)) + goto send; + /* + * If our state indicates that FIN should be sent + * and we have not yet done so, or we're retransmitting the FIN, + * then we need to send. + */ + if (flags & TH_FIN && + ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) + goto send; + + /* + * TCP window updates are not reliable, rather a polling protocol + * using ``persist'' packets is used to insure receipt of window + * updates. The three ``states'' for the output side are: + * idle not doing retransmits or persists + * persisting to move a small or zero window + * (re)transmitting and thereby not persisting + * + * tp->t_timer[TCPT_PERSIST] + * is set when we are in persist state. + * tp->t_force + * is set when we are called to send a persist packet. + * tp->t_timer[TCPT_REXMT] + * is set when we are retransmitting + * The output side is idle when both timers are zero. + * + * If send window is too small, there is data to transmit, and no + * retransmit or persist is pending, then go to persist state. + * If nothing happens soon, send when timer expires: + * if window is nonzero, transmit what we can, + * otherwise force out a byte. + */ + if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && + tp->t_timer[TCPT_PERSIST] == 0) { + tp->t_rxtshift = 0; + tcp_setpersist(tp); + } + + /* + * No reason to send a segment, just return. + */ + return (0); + +send: + /* + * Before ESTABLISHED, force sending of initial options + * unless TCP set not to do any options. + * NOTE: we assume that the IP/TCP header plus TCP options + * always fit in a single mbuf, leaving room for a maximum + * link header, i.e. 
+ * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN + */ + optlen = 0; + hdrlen = sizeof (struct tcpiphdr); + if (flags & TH_SYN) { + tp->snd_nxt = tp->iss; + if ((tp->t_flags & TF_NOOPT) == 0) { + u_short mss; + + opt[0] = TCPOPT_MAXSEG; + opt[1] = TCPOLEN_MAXSEG; + mss = htons((u_short) tcp_mssopt(tp)); + (void)memcpy(opt + 2, &mss, sizeof(mss)); + optlen = TCPOLEN_MAXSEG; + + if ((tp->t_flags & TF_REQ_SCALE) && + ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_RCVD_SCALE))) { + *((u_long *) (opt + optlen)) = htonl( + TCPOPT_NOP << 24 | + TCPOPT_WINDOW << 16 | + TCPOLEN_WINDOW << 8 | + tp->request_r_scale); + optlen += 4; + } + } + } + + /* + * Send a timestamp and echo-reply if this is a SYN and our side + * wants to use timestamps (TF_REQ_TSTMP is set) or both our side + * and our peer have sent timestamps in our SYN's. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (flags & TH_RST) == 0 && + ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_RCVD_TSTMP))) { + u_long *lp = (u_long *)(opt + optlen); + + /* Form timestamp option as shown in appendix A of RFC 1323. */ + *lp++ = htonl(TCPOPT_TSTAMP_HDR); + *lp++ = htonl(tcp_now); + *lp = htonl(tp->ts_recent); + optlen += TCPOLEN_TSTAMP_APPA; + } + + /* + * Send `CC-family' options if our side wants to use them (TF_REQ_CC), + * options are allowed (!TF_NOOPT) and it's not a RST. + */ + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (flags & TH_RST) == 0) { + switch (flags & (TH_SYN|TH_ACK)) { + /* + * This is a normal ACK, send CC if we received CC before + * from our peer. + */ + case TH_ACK: + if (!(tp->t_flags & TF_RCVD_CC)) + break; + /*FALLTHROUGH*/ + + /* + * We can only get here in T/TCP's SYN_SENT* state, when + * we're a sending a non-SYN segment without waiting for + * the ACK of our SYN. A check above assures that we only + * do this if our peer understands T/TCP. + */ + case 0: + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_CC; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); + + optlen += 4; + break; + + /* + * This is our initial SYN, check whether we have to use + * CC or CC.new. + */ + case TH_SYN: + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = tp->t_flags & TF_SENDCCNEW ? + TCPOPT_CCNEW : TCPOPT_CC; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); + optlen += 4; + break; + + /* + * This is a SYN,ACK; send CC and CC.echo if we received + * CC from our peer. + */ + case (TH_SYN|TH_ACK): + if (tp->t_flags & TF_RCVD_CC) { + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_CC; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = + htonl(tp->cc_send); + optlen += 4; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_CCECHO; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = + htonl(tp->cc_recv); + optlen += 4; + } + break; + } + } + + hdrlen += optlen; + + /* + * Adjust data length if insertion of options will + * bump the packet length beyond the t_maxopd length. + * Clear the FIN bit because we cut off the tail of + * the segment. + */ + if (len + optlen > tp->t_maxopd) { + /* + * If there is still more to send, don't close the connection. 
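[Editor's note] For reference, the option bytes assembled above for a SYN requesting MSS, window scaling and timestamps look like the following on the wire. The MSS and scale values here are hypothetical; the kind and length codes are the standard TCP option numbers from RFC 793 and RFC 1323.

```c
#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Standard TCP option kinds (RFC 793 / RFC 1323). */
#define OPT_NOP		1
#define OPT_MAXSEG	2	/* total length 4 */
#define OPT_WINDOW	3	/* total length 3 */
#define OPT_TIMESTAMP	8	/* total length 10 */

int
main(void)
{
	uint8_t opt[40];
	unsigned optlen = 0;
	uint16_t mss = 1460;	/* hypothetical */
	uint8_t wscale = 3;	/* hypothetical */

	/* MSS option: kind, length, 16-bit value in network byte order. */
	opt[optlen++] = OPT_MAXSEG;
	opt[optlen++] = 4;
	opt[optlen++] = mss >> 8;
	opt[optlen++] = mss & 0xff;

	/* Window scale, padded with a leading NOP to keep 32-bit alignment. */
	opt[optlen++] = OPT_NOP;
	opt[optlen++] = OPT_WINDOW;
	opt[optlen++] = 3;
	opt[optlen++] = wscale;

	/* Timestamp option as in RFC 1323 appendix A: NOP, NOP, kind, length. */
	opt[optlen++] = OPT_NOP;
	opt[optlen++] = OPT_NOP;
	opt[optlen++] = OPT_TIMESTAMP;
	opt[optlen++] = 10;
	memset(opt + optlen, 0, 8);	/* TSval/TSecr would be filled in here */
	optlen += 8;

	printf("optlen = %u bytes\n", optlen);	/* 20, a multiple of 4 */
	for (unsigned i = 0; i < optlen; i++)
		printf("%02x ", opt[i]);
	printf("\n");
	return 0;
}
```

Every group is padded with NOPs so the option list stays a multiple of four bytes, which is what lets ti_off express the header length in 32-bit words.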
+ */ + flags &= ~TH_FIN; + len = tp->t_maxopd - optlen; + sendalot = 1; + } + +/*#ifdef DIAGNOSTIC*/ + if (max_linkhdr + hdrlen > MHLEN) + panic("tcphdr too big"); +/*#endif*/ + + /* + * Grab a header mbuf, attaching a copy of data to + * be transmitted, and initialize the header from + * the template for sends on this connection. + */ + if (len) { + if (tp->t_force && len == 1) + tcpstat.tcps_sndprobe++; + else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { + tcpstat.tcps_sndrexmitpack++; + tcpstat.tcps_sndrexmitbyte += len; + } else { + tcpstat.tcps_sndpack++; + tcpstat.tcps_sndbyte += len; + } +#ifdef notyet + if ((m = m_copypack(so->so_snd.sb_mb, off, + (int)len, max_linkhdr + hdrlen)) == 0) { + error = ENOBUFS; + goto out; + } + /* + * m_copypack left space for our hdr; use it. + */ + m->m_len += hdrlen; + m->m_data -= hdrlen; +#else + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto out; + } + m->m_data += max_linkhdr; + m->m_len = hdrlen; + if (len <= MHLEN - hdrlen - max_linkhdr) { + m_copydata(so->so_snd.sb_mb, off, (int) len, + mtod(m, caddr_t) + hdrlen); + m->m_len += len; + } else { + m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); + if (m->m_next == 0) { + (void) m_free(m); + error = ENOBUFS; + goto out; + } + } +#endif + /* + * If we're sending everything we've got, set PUSH. + * (This will keep happy those implementations which only + * give data to the user when a buffer fills or + * a PUSH comes in.) + */ + if (off + len == so->so_snd.sb_cc) + flags |= TH_PUSH; + } else { + if (tp->t_flags & TF_ACKNOW) + tcpstat.tcps_sndacks++; + else if (flags & (TH_SYN|TH_FIN|TH_RST)) + tcpstat.tcps_sndctrl++; + else if (SEQ_GT(tp->snd_up, tp->snd_una)) + tcpstat.tcps_sndurg++; + else + tcpstat.tcps_sndwinup++; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto out; + } + m->m_data += max_linkhdr; + m->m_len = hdrlen; + } + m->m_pkthdr.rcvif = (struct ifnet *)0; + ti = mtod(m, struct tcpiphdr *); + if (tp->t_template == 0) + panic("tcp_output"); + (void)memcpy(ti, tp->t_template, sizeof (struct tcpiphdr)); + + /* + * Fill in fields, remembering maximum advertised + * window for use in delaying messages about window sizes. + * If resending a FIN, be sure not to use a new sequence number. + */ + if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && + tp->snd_nxt == tp->snd_max) + tp->snd_nxt--; + /* + * If we are doing retransmissions, then snd_nxt will + * not reflect the first unsent octet. For ACK only + * packets, we do not want the sequence number of the + * retransmitted packet, we want the sequence number + * of the next unsent octet. So, if there is no data + * (and no SYN or FIN), use snd_max instead of snd_nxt + * when filling in ti_seq. But if we are in persist + * state, snd_max might reflect one byte beyond the + * right edge of the window, so use snd_nxt in that + * case, since we know we aren't doing a retransmission. + * (retransmit and persist are mutually exclusive...) + */ + if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST]) + ti->ti_seq = htonl(tp->snd_nxt); + else + ti->ti_seq = htonl(tp->snd_max); + ti->ti_ack = htonl(tp->rcv_nxt); + if (optlen) { + bcopy(opt, ti + 1, optlen); + ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2; + } + ti->ti_flags = flags; + /* + * Calculate receive window. Don't shrink window, + * but avoid silly window syndrome. 
+ */ + if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) + win = 0; + if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) + win = (long)(tp->rcv_adv - tp->rcv_nxt); + if (win > (long)TCP_MAXWIN << tp->rcv_scale) + win = (long)TCP_MAXWIN << tp->rcv_scale; + ti->ti_win = htons((u_short) (win>>tp->rcv_scale)); + if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { + ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); + ti->ti_flags |= TH_URG; + } else + /* + * If no urgent pointer to send, then we pull + * the urgent pointer to the left edge of the send window + * so that it doesn't drift into the send window on sequence + * number wraparound. + */ + tp->snd_up = tp->snd_una; /* drag it along */ + + /* + * Put TCP length in extended header, and then + * checksum extended header and data. + */ + if (len + optlen) + ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + + optlen + len)); + ti->ti_sum = in_cksum(m, (int)(hdrlen + len)); + + /* + * In transmit state, time the transmission and arrange for + * the retransmit. In persist state, just set snd_max. + */ + if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { + tcp_seq startseq = tp->snd_nxt; + + /* + * Advance snd_nxt over sequence space of this segment. + */ + if (flags & (TH_SYN|TH_FIN)) { + if (flags & TH_SYN) + tp->snd_nxt++; + if (flags & TH_FIN) { + tp->snd_nxt++; + tp->t_flags |= TF_SENTFIN; + } + } + tp->snd_nxt += len; + if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { + tp->snd_max = tp->snd_nxt; + /* + * Time this transmission if not a retransmission and + * not currently timing anything. + */ + if (tp->t_rtt == 0) { + tp->t_rtt = 1; + tp->t_rtseq = startseq; + tcpstat.tcps_segstimed++; + } + } + + /* + * Set retransmit timer if not currently set, + * and not doing an ack or a keep-alive probe. + * Initial value for retransmit timer is smoothed + * round-trip time + 2 * round-trip time variance. + * Initialize shift counter which is used for backoff + * of retransmit time. + */ + if (tp->t_timer[TCPT_REXMT] == 0 && + tp->snd_nxt != tp->snd_una) { + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + if (tp->t_timer[TCPT_PERSIST]) { + tp->t_timer[TCPT_PERSIST] = 0; + tp->t_rxtshift = 0; + } + } + } else + if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) + tp->snd_max = tp->snd_nxt + len; + +#ifdef TCPDEBUG + /* + * Trace. + */ + if (so->so_options & SO_DEBUG) + tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0); +#endif + + /* + * Fill in IP length and desired time to live and + * send to IP level. There should be a better way + * to handle ttl and tos; we could keep them in + * the template, but need a way to checksum without them. + */ + m->m_pkthdr.len = hdrlen + len; +#ifdef TUBA + if (tp->t_tuba_pcb) + error = tuba_output(m, tp); + else +#endif + { +#if 1 + struct rtentry *rt; +#endif + ((struct ip *)ti)->ip_len = m->m_pkthdr.len; + ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ + ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */ +#if 1 + /* + * See if we should do MTU discovery. 
We do it only if the following + * are true: + * 1) we have a valid route to the destination + * 2) the MTU is not locked (if it is, then discovery has been + * disabled) + */ + if ((rt = tp->t_inpcb->inp_route.ro_rt) + && rt->rt_flags & RTF_UP + && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { + ((struct ip *)ti)->ip_off |= IP_DF; + } +#endif + error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + so->so_options & SO_DONTROUTE, 0); + } + if (error) { +out: + if (error == ENOBUFS) { + tcp_quench(tp->t_inpcb, 0); + return (0); + } +#if 1 + if (error == EMSGSIZE) { + /* + * ip_output() will have already fixed the route + * for us. tcp_mtudisc() will, as its last action, + * initiate retransmission, so it is important to + * not do so here. + */ + tcp_mtudisc(tp->t_inpcb, 0); + return 0; + } +#endif + if ((error == EHOSTUNREACH || error == ENETDOWN) + && TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_softerror = error; + return (0); + } + return (error); + } + tcpstat.tcps_sndtotal++; + + /* + * Data sent (as far as we can tell). + * If this advertises a larger window than any other segment, + * then remember the size of the advertised window. + * Any pending ACK has now been sent. + */ + if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) + tp->rcv_adv = tp->rcv_nxt + win; + tp->last_ack_sent = tp->rcv_nxt; + tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); + if (sendalot) + goto again; + return (0); +} + +void +tcp_setpersist( + register struct tcpcb *tp) +{ + register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; + + if (tp->t_timer[TCPT_REXMT]) + panic("tcp_output REXMT"); + /* + * Start/restart persistance timer. + */ + TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], + t * tcp_backoff[tp->t_rxtshift], + TCPTV_PERSMIN, TCPTV_PERSMAX); + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) + tp->t_rxtshift++; +} diff --git a/netinet/tcp_seq.h b/netinet/tcp_seq.h new file mode 100644 index 0000000..c24b294 --- /dev/null +++ b/netinet/tcp_seq.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 1982, 1986, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_seq.h 8.3 (Berkeley) 6/21/95 + * $FreeBSD: src/sys/netinet/tcp_seq.h,v 1.26 2006/06/18 14:24:12 andre Exp $ + */ + + +#ifndef _NETINET_TCP_SEQ_H_ +#define _NETINET_TCP_SEQ_H_ +/* + * TCP sequence numbers are 32 bit integers operated + * on with modular arithmetic. These macros can be + * used to compare such integers. + */ +#define SEQ_LT(a,b) ((int)((a)-(b)) < 0) +#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define SEQ_GT(a,b) ((int)((a)-(b)) > 0) +#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* for modulo comparisons of timestamps */ +#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) +#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* + * TCP connection counts are 32 bit integers operated + * on with modular arithmetic. These macros can be + * used to compare such integers. + */ +#define CC_LT(a,b) ((int)((a)-(b)) < 0) +#define CC_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define CC_GT(a,b) ((int)((a)-(b)) > 0) +#define CC_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* Macro to increment a CC: skip 0 which has a special meaning */ +#define CC_INC(c) (++(c) == 0 ? ++(c) : (c)) + +/* + * Macros to initialize tcp sequence numbers for + * send and receive from initial send and receive + * sequence numbers. + */ +#define tcp_rcvseqinit(tp) \ + (tp)->rcv_adv = (tp)->rcv_nxt = (tp)->irs + 1 + +#define tcp_sendseqinit(tp) \ + (tp)->snd_una = (tp)->snd_nxt = (tp)->snd_max = (tp)->snd_up = \ + (tp)->iss + +#define TCP_PAWS_IDLE (uint32_t)(24L * 24L * 60L * 60L * PR_SLOWHZ) + /* timestamp wrap-around time */ + +#ifdef _KERNEL +extern tcp_cc tcp_ccgen; /* global connection count */ + +/* + * Increment for tcp_iss each second. + * This is designed to increment at the standard 250 KB/s, + * but with a random component averaging 128 KB. + * We also increment tcp_iss by a quarter of this amount + * each time we use the value for a new connection. + * If defined, the tcp_random18() macro should produce a + * number in the range [0-0x3ffff] that is hard to predict. + */ +#ifndef tcp_random18 +#define tcp_random18() (((uint32_t)random() >> 14) & 0x3ffffL) +#endif +#define TCP_ISSINCR (uint32_t)(122L*1024L + tcp_random18()) + +extern tcp_seq tcp_iss; /* tcp initial send seq # */ +#else +#define TCP_ISSINCR (250*1024) /* increment for tcp_iss each second */ +#endif /* _KERNEL */ +#endif /* _NETINET_TCP_SEQ_H_ */ diff --git a/netinet/tcp_subr.c b/netinet/tcp_subr.c new file mode 100644 index 0000000..69012cc --- /dev/null +++ b/netinet/tcp_subr.c @@ -0,0 +1,729 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 + * $FreeBSD: src/sys/netinet/tcp_subr.c,v 1.226 2005/05/07 00:41:36 cperciva Exp $ + */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opt_tcpdebug.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define _IP_VHL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif + +int tcp_mssdflt = TCP_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, + &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); + +static int tcp_do_rfc1323 = 1; +#if !defined(__rtems__) +static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; +SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, + CTLFLAG_RW, &tcp_rttdflt , 0, ""); + +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, + CTLFLAG_RW, &tcp_do_rfc1323 , 0, ""); +#endif + +static void tcp_notify(struct inpcb *, int); + +/* + * Target size of TCP PCB hash table. Will be rounded down to a prime + * number. + */ +#ifndef TCBHASHSIZE +#define TCBHASHSIZE 128 +#endif + +/* + * Tcp initialization + */ +void +tcp_init(void) +{ + + tcp_iss = random(); /* wrong, but better than a constant */ + tcp_ccgen = 1; + LIST_INIT(&tcb); + tcbinfo.listhead = &tcb; + tcbinfo.hashbase = hashinit(TCBHASHSIZE, M_PCB, &tcbinfo.hashmask); + if (max_protohdr < sizeof(struct tcpiphdr)) + max_protohdr = sizeof(struct tcpiphdr); + if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN) + panic("tcp_init"); +} + +/* + * Create template to be used to send tcp packets on a connection. + * Call after host entry created, allocates an mbuf and fills + * in a skeletal tcp/ip header, minimizing the amount of work + * necessary when the connection is used. 
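[Editor's note] Looking back at the comparison macros defined in tcp_seq.h above, the cast-to-signed-difference trick is what keeps them correct across 32-bit wraparound, where a plain unsigned comparison gives the wrong answer. A short demonstration using local copies of the macros:

```c
#include <stdio.h>
#include <stdint.h>

/* Local copies of the comparison macros from tcp_seq.h above. */
#define SEQ_LT(a,b)	((int32_t)((a)-(b)) < 0)
#define SEQ_GT(a,b)	((int32_t)((a)-(b)) > 0)

int
main(void)
{
	uint32_t a = 0xfffffff0u;	/* just before the counter wraps */
	uint32_t b = 0x00000010u;	/* just after the counter wraps */

	/* Plain comparison gets the ordering wrong once the counter wraps... */
	printf("a < b (plain)  : %d\n", a < b);		/* prints 0 */
	/* ...while the modular macros still see b as "later" than a. */
	printf("SEQ_LT(a, b)   : %d\n", SEQ_LT(a, b));	/* prints 1 */
	printf("SEQ_GT(b, a)   : %d\n", SEQ_GT(b, a));	/* prints 1 */
	return 0;
}
```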
+ */ +struct tcpiphdr * +tcp_template(struct tcpcb *tp) +{ + register struct inpcb *inp = tp->t_inpcb; + register struct mbuf *m; + register struct tcpiphdr *n; + + if ((n = tp->t_template) == 0) { + m = m_get(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return (0); + m->m_len = sizeof (struct tcpiphdr); + n = mtod(m, struct tcpiphdr *); + } + n->ti_next = n->ti_prev = 0; + n->ti_x1 = 0; + n->ti_pr = IPPROTO_TCP; + n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip)); + n->ti_src = inp->inp_laddr; + n->ti_dst = inp->inp_faddr; + n->ti_sport = inp->inp_lport; + n->ti_dport = inp->inp_fport; + n->ti_seq = 0; + n->ti_ack = 0; + n->ti_x2 = 0; + n->ti_off = 5; + n->ti_flags = 0; + n->ti_win = 0; + n->ti_sum = 0; + n->ti_urp = 0; + return (n); +} + +/* + * Send a single message to the TCP at address specified by + * the given TCP/IP header. If m == 0, then we make a copy + * of the tcpiphdr at ti and send directly to the addressed host. + * This is used to force keep alive messages out using the TCP + * template for a connection tp->t_template. If flags are given + * then we send a message back to the TCP which originated the + * segment ti, and discard the mbuf containing it and any other + * attached mbufs. + * + * In any case the ack and sequence number of the transmitted + * segment are as specified by the parameters. + * + * NOTE: If m != NULL, then ti must point to *inside* the mbuf. + */ +void +tcp_respond(struct tcpcb *tp, struct tcpiphdr *ti, struct mbuf *m, + tcp_seq ack, tcp_seq seq, int flags) +{ + register int tlen; + int win = 0; + struct route *ro = 0; + struct route sro; + + if (tp) { + win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); + ro = &tp->t_inpcb->inp_route; + } else { + ro = &sro; + bzero(ro, sizeof *ro); + } + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; +#ifdef TCP_COMPAT_42 + tlen = 1; +#else + tlen = 0; +#endif + m->m_data += max_linkhdr; + *mtod(m, struct tcpiphdr *) = *ti; + ti = mtod(m, struct tcpiphdr *); + flags = TH_ACK; + } else { + m_freem(m->m_next); + m->m_next = NULL; + m->m_data = (caddr_t)ti; + m->m_len = sizeof (struct tcpiphdr); + tlen = 0; +#define xchg(a,b,type) { type t; t=a; a=b; b=t; } + xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, u_long); + xchg(ti->ti_dport, ti->ti_sport, u_short); +#undef xchg + } + ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen)); + tlen += sizeof (struct tcpiphdr); + m->m_len = tlen; + m->m_pkthdr.len = tlen; + m->m_pkthdr.rcvif = (struct ifnet *) 0; + ti->ti_next = ti->ti_prev = 0; + ti->ti_x1 = 0; + ti->ti_seq = htonl(seq); + ti->ti_ack = htonl(ack); + ti->ti_x2 = 0; + ti->ti_off = sizeof (struct tcphdr) >> 2; + ti->ti_flags = flags; + if (tp) + ti->ti_win = htons((u_short) (win >> tp->rcv_scale)); + else + ti->ti_win = htons((u_short)win); + ti->ti_urp = 0; + ti->ti_sum = 0; + ti->ti_sum = in_cksum(m, tlen); + ((struct ip *)ti)->ip_len = tlen; + ((struct ip *)ti)->ip_ttl = ip_defttl; +#ifdef TCPDEBUG + if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_OUTPUT, 0, tp, ti, 0); +#endif + (void) ip_output(m, NULL, ro, 0, NULL); + if (ro == &sro && ro->ro_rt) { + RTFREE(ro->ro_rt); + } +} + +/* + * Create a new TCP control block, making an + * empty reassembly queue and hooking it to the argument + * protocol control block. 
+ */ +struct tcpcb * +tcp_newtcpcb(struct inpcb *inp) +{ + struct tcpcb *tp; + + tp = malloc(sizeof(*tp), M_PCB, M_NOWAIT); + if (tp == NULL) + return ((struct tcpcb *)0); + bzero((char *) tp, sizeof(struct tcpcb)); + tp->seg_next = tp->seg_prev = (struct tcpiphdr *)tp; + tp->t_maxseg = tp->t_maxopd = tcp_mssdflt; + + if (tcp_do_rfc1323) + tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); + tp->t_inpcb = inp; + /* + * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no + * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives + * reasonable initial retransmit time. + */ + tp->t_srtt = TCPTV_SRTTBASE; + tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; + tp->t_rttmin = TCPTV_MIN; + tp->t_rxtcur = TCPTV_RTOBASE; + tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; + inp->inp_ip_ttl = ip_defttl; + inp->inp_ppcb = (caddr_t)tp; + return (tp); +} + +/* + * Drop a TCP connection, reporting + * the specified error. If connection is synchronized, + * then send a RST to peer. + */ +struct tcpcb * +tcp_drop(struct tcpcb *tp, int errnum) +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_state = TCPS_CLOSED; + (void) tcp_output(tp); + tcpstat.tcps_drops++; + } else + tcpstat.tcps_conndrops++; + if (errnum == ETIMEDOUT && tp->t_softerror) + errnum = tp->t_softerror; + so->so_error = errnum; + return (tcp_close(tp)); +} + +/* + * Close a TCP control block: + * discard all space held by the tcp + * discard internet protocol block + * wake up any sleepers + */ +struct tcpcb * +tcp_close(struct tcpcb *tp) +{ + register struct tcpiphdr *t; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + register struct mbuf *m; + register struct rtentry *rt; + + /* + * If we got enough samples through the srtt filter, + * save the rtt and rttvar in the routing entry. + * 'Enough' is arbitrarily defined as the 16 samples. + * 16 samples is enough for the srtt filter to converge + * to within 5% of the correct value; fewer samples and + * we could save a very bogus rtt. + * + * Don't update the default route's characteristics and don't + * update anything that the user "locked". + */ + if (tp->t_rttupdated >= 16 && + (rt = inp->inp_route.ro_rt) && + ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr != INADDR_ANY) { + register u_long i = 0; + + if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { + i = tp->t_srtt * + (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE)); + if (rt->rt_rmx.rmx_rtt && i) + /* + * filter this update to half the old & half + * the new values, converting scale. + * See route.h and tcp_var.h for a + * description of the scaling constants. + */ + rt->rt_rmx.rmx_rtt = + (rt->rt_rmx.rmx_rtt + i) / 2; + else + rt->rt_rmx.rmx_rtt = i; + tcpstat.tcps_cachedrtt++; + } + if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { + i = tp->t_rttvar * + (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE)); + if (rt->rt_rmx.rmx_rttvar && i) + rt->rt_rmx.rmx_rttvar = + (rt->rt_rmx.rmx_rttvar + i) / 2; + else + rt->rt_rmx.rmx_rttvar = i; + tcpstat.tcps_cachedrttvar++; + } + /* + * update the pipelimit (ssthresh) if it has been updated + * already or if a pipesize was specified & the threshhold + * got below half the pipesize. I.e., wait for bad news + * before we start updating, then update on both good + * and bad news. 
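[Editor's note] The "half old, half new" filter used when tcp_close() caches srtt, rttvar and ssthresh into the route metrics is simple enough to show on its own. The sketch below ignores the unit scaling between t_srtt and rmx_rtt and just shows the averaging behaviour:

```c
#include <stdio.h>

/*
 * Update a cached route metric with a new measurement: keep half of the old
 * cached value and half of the new one, unless nothing was cached yet.
 * This mirrors the rmx_rtt/rmx_rttvar/rmx_ssthresh updates in tcp_close().
 */
static unsigned long
cache_metric(unsigned long cached, unsigned long measured)
{
	if (cached != 0 && measured != 0)
		return (cached + measured) / 2;
	return measured;
}

int
main(void)
{
	unsigned long rmx_rtt = 0;

	/* Successive connections pull the cache toward recent measurements. */
	rmx_rtt = cache_metric(rmx_rtt, 200);	/* first sample: 200 */
	rmx_rtt = cache_metric(rmx_rtt, 400);	/* (200 + 400) / 2 = 300 */
	rmx_rtt = cache_metric(rmx_rtt, 400);	/* (300 + 400) / 2 = 350 */
	printf("cached rtt metric: %lu\n", rmx_rtt);
	return 0;
}
```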
+ */ + if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && + ((i = tp->snd_ssthresh) != 0) && rt->rt_rmx.rmx_ssthresh) || + i < (rt->rt_rmx.rmx_sendpipe / 2)) { + /* + * convert the limit from user data bytes to + * packets then to packet data bytes. + */ + i = (i + tp->t_maxseg / 2) / tp->t_maxseg; + if (i < 2) + i = 2; + i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr)); + if (rt->rt_rmx.rmx_ssthresh) + rt->rt_rmx.rmx_ssthresh = + (rt->rt_rmx.rmx_ssthresh + i) / 2; + else + rt->rt_rmx.rmx_ssthresh = i; + tcpstat.tcps_cachedssthresh++; + } + } + /* free the reassembly queue, if any */ + t = tp->seg_next; + while (t != (struct tcpiphdr *)tp) { + t = (struct tcpiphdr *)t->ti_next; +#if (defined(__GNUC__) && (defined(__arm__) || defined(__mips__))) + LD32_UNALGN((struct tcpiphdr *)t->ti_prev,m); +#else + m = REASS_MBUF((struct tcpiphdr *)t->ti_prev); +#endif + remque(t->ti_prev); + m_freem(m); + } + if (tp->t_template) + (void) m_free(dtom(tp->t_template)); + free(tp, M_PCB); + inp->inp_ppcb = 0; + soisdisconnected(so); + in_pcbdetach(inp); + tcpstat.tcps_closed++; + return ((struct tcpcb *)0); +} + +void +tcp_drain(void) +{ + +} + +/* + * Notify a tcp user of an asynchronous error; + * store error as soft error, but wake up user + * (for now, won't do anything until can select for soft error). + */ +static void +tcp_notify(struct inpcb *inp, int error) +{ + struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; + struct socket *so = inp->inp_socket; + + /* + * Ignore some errors if we are hooked up. + * If connection hasn't completed, has retransmitted several times, + * and receives a second error, give up now. This is better + * than waiting a long time to establish a connection that + * can never complete. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) { + return; + } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && + tp->t_softerror) + so->so_error = error; + else + tp->t_softerror = error; + soconnwakeup (so); + sorwakeup(so); + sowwakeup(so); +} + +#ifdef __rtems__ +#define INP_INFO_RLOCK(a) +#define INP_INFO_RUNLOCK(a) +#define INP_LOCK(a) +#define INP_UNLOCK(a) +#endif + +static int +tcp_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == NULL) { + n = tcbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xtcpcb); + return (0); + } + + if (req->newptr != NULL) + return (EPERM); + + /* + * OK, now we're committed to doing something. 
+ */ + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + gencnt = tcbinfo.ipi_gencnt; + n = tcbinfo.ipi_count; + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + + sysctl_wire_old_buffer(req, 2 * (sizeof xig) + + n * sizeof(struct xtcpcb)); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; +/* xig.xig_sogen = so_gencnt; remove by ccj */ + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + /* ccj add exit if the count is 0 */ + if (!n) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + INP_LOCK(inp); + if (inp->inp_gencnt <= gencnt) +#if 0 + && + cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) +#endif + inp_list[i++] = inp; + INP_UNLOCK(inp); + } + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_LOCK(inp); + if (inp->inp_gencnt <= gencnt) { + struct xtcpcb xt; + caddr_t inp_ppcb; + xt.xt_len = sizeof xt; + /* XXX should avoid extra copy */ + bcopy(inp, &xt.xt_inp, sizeof *inp); + inp_ppcb = inp->inp_ppcb; + if (inp_ppcb != NULL) + bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); + else + bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); +#if 0 + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xt.xt_socket); +#endif + error = SYSCTL_OUT(req, &xt, sizeof xt); + } + INP_UNLOCK(inp); + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + INP_INFO_RLOCK(&tcbinfo); + xig.xig_gen = tcbinfo.ipi_gencnt; +#if 0 + xig.xig_sogen = so_gencnt; +#endif + xig.xig_count = tcbinfo.ipi_count; + INP_INFO_RUNLOCK(&tcbinfo); + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); + +void +tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) +{ + struct ip *ip = vip; + struct tcphdr *th; + void (*notify)(struct inpcb *, int) = tcp_notify; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; +#if 1 + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; +#endif + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)) + return; + if (ip != NULL) { +#ifdef _IP_VHL + th = (struct tcphdr *)((caddr_t)ip + + (IP_VHL_HL(ip->ip_vhl) << 2)); +#else + th = (struct tcphdr *)((caddr_t)ip + + (ip->ip_hl << 2)); +#endif + in_pcbnotify(&tcb, sa, th->th_dport, ip->ip_src, th->th_sport, + cmd, notify); + } else + in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify); +} + +/* + * When a source quench is received, close congestion window + * to one segment. We will gradually open it again as we proceed. + */ +void +tcp_quench( struct inpcb *inp, int errnum) +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp) + tp->snd_cwnd = tp->t_maxseg; +} + +/* + * When `need fragmentation' ICMP is received, update our idea of the MSS + * based on the new value in the route. Also nudge TCP to send something, + * since we know the packet we just sent was dropped. + * This duplicates some code in the tcp_mss() function in tcp_input.c. 
+ */ +void +tcp_mtudisc(struct inpcb *inp, int errnum) +{ + struct tcpcb *tp = intotcpcb(inp); + struct rtentry *rt; + struct rmxp_tao *taop; + struct socket *so = inp->inp_socket; + int offered; + int mss; + + if (tp) { + rt = tcp_rtlookup(inp); + if (!rt || !rt->rt_rmx.rmx_mtu) { + tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; + return; + } + taop = rmx_taop(rt->rt_rmx); + offered = taop->tao_mssopt; + mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); + if (offered) + mss = min(mss, offered); + /* + * XXX - The above conditional probably violates the TCP + * spec. The problem is that, since we don't know the + * other end's MSS, we are supposed to use a conservative + * default. But, if we do that, then MTU discovery will + * never actually take place, because the conservative + * default is much less than the MTUs typically seen + * on the Internet today. For the moment, we'll sweep + * this under the carpet. + * + * The conservative default might not actually be a problem + * if the only case this occurs is when sending an initial + * SYN with options and data to a host we've never talked + * to before. Then, they will reply with an MSS value which + * will get recorded and the new parameters should get + * recomputed. For Further Study. + */ + if (tp->t_maxopd <= mss) + return; + tp->t_maxopd = mss; + + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) + mss -= TCPOLEN_CC_APPA; +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + if (so->so_snd.sb_hiwat < mss) + mss = so->so_snd.sb_hiwat; + + tp->t_maxseg = mss; + + tcpstat.tcps_mturesent++; + tp->t_rtt = 0; + tp->snd_nxt = tp->snd_una; + tcp_output(tp); + } +} + +/* + * Look-up the routing entry to the peer of this inpcb. If no route + * is found and it cannot be allocated, then return NULL. This routine + * is called by TCP routines that access the rmx structure and by tcp_mss + * to get the interface MTU. + */ +struct rtentry * +tcp_rtlookup(struct inpcb *inp) +{ + struct route *ro; + struct rtentry *rt; + + ro = &inp->inp_route; + rt = ro->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (inp->inp_faddr.s_addr != INADDR_ANY) { + ro->ro_dst.sa_family = AF_INET; + ro->ro_dst.sa_len = sizeof(ro->ro_dst); + ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + inp->inp_faddr; + rtalloc(ro); + rt = ro->ro_rt; + } + } + return rt; +} + +/* + * Return a pointer to the cached information about the remote host. + * The cached information is stored in the protocol specific part of + * the route metrics. + */ +struct rmxp_tao * +tcp_gettaocache(struct inpcb *inp) +{ + struct rtentry *rt = tcp_rtlookup(inp); + + /* Make sure this is a host route and is up. */ + if (rt == NULL || + (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) + return NULL; + + return rmx_taop(rt->rt_rmx); +} diff --git a/netinet/tcp_timer.c b/netinet/tcp_timer.c new file mode 100644 index 0000000..a6df2b2 --- /dev/null +++ b/netinet/tcp_timer.c @@ -0,0 +1,385 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opt_tcpdebug.h" + +#ifndef TUBA_INCLUDE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* before tcp_seq.h, for tcp_random18() */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif + +int tcp_keepinit = TCPTV_KEEP_INIT; +SYSCTL_INT(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, + CTLFLAG_RW, &tcp_keepinit , 0, ""); + +int tcp_keepidle = TCPTV_KEEP_IDLE; +SYSCTL_INT(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, + CTLFLAG_RW, &tcp_keepidle , 0, ""); + +static int tcp_keepintvl = TCPTV_KEEPINTVL; +SYSCTL_INT(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, + CTLFLAG_RW, &tcp_keepintvl , 0, ""); + +static int always_keepalive = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, + CTLFLAG_RW, &always_keepalive , 0, ""); + +static int tcp_keepcnt = TCPTV_KEEPCNT; + /* max idle probes */ +static int tcp_maxpersistidle = TCPTV_KEEP_IDLE; + /* max idle time in persist */ +int tcp_maxidle; +#else /* TUBA_INCLUDE */ + +static int tcp_maxpersistidle; +#endif /* TUBA_INCLUDE */ + +/* + * Fast timeout routine for processing delayed acks + */ +void +tcp_fasttimo(void) +{ + register struct inpcb *inp; + register struct tcpcb *tp; + int s; + + s = splnet(); + + for (inp = tcb.lh_first; inp != NULL; inp = inp->inp_list.le_next) { + if ((tp = (struct tcpcb *)inp->inp_ppcb) && + (tp->t_flags & TF_DELACK)) { + tp->t_flags &= ~TF_DELACK; + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_delack++; + (void) tcp_output(tp); + } + } + splx(s); +} + +/* + * Tcp protocol timeout routine called every 500 ms. + * Updates the timers in all active tcb's and + * causes finite state machine actions if timers expire. 
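[Editor's note] As a bare-bones illustration of the counting scheme tcp_slowtimo() implements: every TCP timer is an integer number of 500 ms ticks, decremented once per call, with the associated action run when it reaches zero. PR_SLOWHZ = 2 and the starting values below are assumptions made for the sketch only.

```c
#include <stdio.h>

#define PR_SLOWHZ	2	/* assumed slow ticks per second */
#define NTIMERS		4	/* REXMT, PERSIST, KEEP, 2MSL */

static const char *names[NTIMERS] = { "REXMT", "PERSIST", "KEEP", "2MSL" };

int
main(void)
{
	/* Timers in ticks, e.g. a 3 s RTO is 6 ticks at PR_SLOWHZ = 2. */
	int timer[NTIMERS] = { 3 * PR_SLOWHZ, 0, 10 * PR_SLOWHZ, 0 };

	for (int tick = 1; tick <= 8; tick++) {
		for (int i = 0; i < NTIMERS; i++) {
			/* Same shape as the inner loop of tcp_slowtimo(). */
			if (timer[i] && --timer[i] == 0)
				printf("tick %d (%.1f s): %s expired\n",
				    tick, (double)tick / PR_SLOWHZ, names[i]);
		}
	}
	return 0;
}
```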
+ */ +void +tcp_slowtimo(void) +{ + register struct inpcb *ip, *ipnxt; + register struct tcpcb *tp; + register int i; + int s; +#ifdef TCPDEBUG + int ostate; +#endif + + s = splnet(); + + tcp_maxidle = tcp_keepcnt * tcp_keepintvl; + + ip = tcb.lh_first; + if (ip == NULL) { + splx(s); + return; + } + /* + * Search through tcb's and update active timers. + */ + for (; ip != NULL; ip = ipnxt) { + ipnxt = ip->inp_list.le_next; + tp = intotcpcb(ip); + if (tp == 0 || tp->t_state == TCPS_LISTEN) + continue; + for (i = 0; i < TCPT_NTIMERS; i++) { + if (tp->t_timer[i] && --tp->t_timer[i] == 0) { +#ifdef TCPDEBUG + ostate = tp->t_state; +#endif + tp = tcp_timers(tp, i); + if (tp == NULL) + goto tpgone; +#ifdef TCPDEBUG + if (tp->t_inpcb->inp_socket->so_options + & SO_DEBUG) + tcp_trace(TA_USER, ostate, tp, + (struct tcpiphdr *)0, + PRU_SLOWTIMO); +#endif + } + } + tp->t_idle++; + tp->t_duration++; + if (tp->t_rtt) + tp->t_rtt++; +tpgone: + ; + } + tcp_iss += TCP_ISSINCR/PR_SLOWHZ; /* increment iss */ +#ifdef TCP_COMPAT_42 + if ((int)tcp_iss < 0) + tcp_iss = TCP_ISSINCR; /* XXX */ +#endif + tcp_now++; /* for timestamps */ + splx(s); +} +#ifndef TUBA_INCLUDE + +/* + * Cancel all timers for TCP tp. + */ +void +tcp_canceltimers(struct tcpcb *tp) +{ + register int i; + + for (i = 0; i < TCPT_NTIMERS; i++) + tp->t_timer[i] = 0; +} + +int tcp_backoff[TCP_MAXRXTSHIFT + 1] = + { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; + +static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ + +/* + * TCP timer processing. + */ +struct tcpcb * +tcp_timers(struct tcpcb *tp, int timer) +{ + register int rexmt; + + switch (timer) { + + /* + * 2 MSL timeout in shutdown went off. If we're closed but + * still waiting for peer to close and connection has been idle + * too long, or if 2MSL time is up from TIME_WAIT, delete connection + * control block. Otherwise, check again in a bit. + */ + case TCPT_2MSL: + if (tp->t_state != TCPS_TIME_WAIT && + tp->t_idle <= tcp_maxidle) + tp->t_timer[TCPT_2MSL] = tcp_keepintvl; + else + tp = tcp_close(tp); + break; + + /* + * Retransmission timer went off. Message has not + * been acked within retransmit interval. Back off + * to a longer retransmit interval and retransmit one segment. + */ + case TCPT_REXMT: + if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + tp->t_rxtshift = TCP_MAXRXTSHIFT; + tcpstat.tcps_timeoutdrop++; + tp = tcp_drop(tp, tp->t_softerror ? + tp->t_softerror : ETIMEDOUT); + break; + } + tcpstat.tcps_rexmttimeo++; + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + TCPT_RANGESET(tp->t_rxtcur, rexmt, + tp->t_rttmin, TCPTV_REXMTMAX); + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + /* + * If losing, let the lower level know and try for + * a better route. Also, if we backed off this far, + * our srtt estimate is probably bogus. Clobber it + * so we'll take the next rtt measurement as our srtt; + * move the current srtt into rttvar to keep the current + * retransmit times until then. + */ + if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { + in_losing(tp->t_inpcb); + tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); + tp->t_srtt = 0; + } + tp->snd_nxt = tp->snd_una; + /* + * Force a segment to be sent. + */ + tp->t_flags |= TF_ACKNOW; + /* + * If timing a segment in this window, stop the timer. + */ + tp->t_rtt = 0; + /* + * Close the congestion window down to one segment + * (we'll open it by one segment for each ack we get). 
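[Editor's note] The exponential backoff described above boils down to one line of tcp_timers(): the next retransmit timeout is the base RTO multiplied by tcp_backoff[t_rxtshift] and clamped to the range [t_rttmin, TCPTV_REXMTMAX]. A standalone sketch using the same table and an assumed 1 second base RTO:

```c
#include <stdio.h>

#define PR_SLOWHZ	2			/* assumed slow ticks per second */
#define TCPTV_REXMTMAX	(64 * PR_SLOWHZ)	/* max RTO: 64 s, in ticks */
#define TCP_MAXRXTSHIFT	12

/* Same table as tcp_timer.c above: doubling, then capped at 64x. */
static const int backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

static int
clamp(int v, int lo, int hi)
{
	/* Equivalent to the TCPT_RANGESET() macro. */
	if (v < lo)
		return lo;
	if (v > hi)
		return hi;
	return v;
}

int
main(void)
{
	int base_rto = 1 * PR_SLOWHZ;	/* assume a 1 s smoothed RTO */
	int rttmin = 1 * PR_SLOWHZ;

	for (int shift = 0; shift <= TCP_MAXRXTSHIFT; shift++) {
		int rexmt = clamp(base_rto * backoff[shift], rttmin, TCPTV_REXMTMAX);
		printf("rxtshift=%2d -> rto=%3d ticks (%g s)\n",
		    shift, rexmt, (double)rexmt / PR_SLOWHZ);
	}
	return 0;
}
```

With these numbers the timeout doubles until it hits the 64x cap, which together with TCP_MAXRXTSHIFT bounds how long a dead connection keeps being retried.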
+ * Since we probably have a window's worth of unacked + * data accumulated, this "slow start" keeps us from + * dumping all that data as back-to-back packets (which + * might overwhelm an intermediate gateway). + * + * There are two phases to the opening: Initially we + * open by one mss on each ack. This makes the window + * size increase exponentially with time. If the + * window is larger than the path can handle, this + * exponential growth results in dropped packet(s) + * almost immediately. To get more time between + * drops but still "push" the network to take advantage + * of improving conditions, we switch from exponential + * to linear window opening at some threshhold size. + * For a threshhold, we use half the current window + * size, truncated to a multiple of the mss. + * + * (the minimum cwnd that will give us exponential + * growth is 2 mss. We don't allow the threshhold + * to go below this.) + */ + { + u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + if (win < 2) + win = 2; + tp->snd_cwnd = tp->t_maxseg; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_dupacks = 0; + } + (void) tcp_output(tp); + break; + + /* + * Persistance timer into zero window. + * Force a byte to be output, if possible. + */ + case TCPT_PERSIST: + tcpstat.tcps_persisttimeo++; + /* + * Hack: if the peer is dead/unreachable, we do not + * time out if the window is closed. After a full + * backoff, drop the connection if the idle time + * (no responses to probes) reaches the maximum + * backoff that we would use if retransmitting. + */ + if (tp->t_rxtshift == TCP_MAXRXTSHIFT) { + u_long maxidle = TCP_REXMTVAL(tp); + if (maxidle < tp->t_rttmin) + maxidle = tp->t_rttmin; + maxidle *= tcp_totbackoff; + if (tp->t_idle >= tcp_maxpersistidle || + tp->t_idle >= maxidle) { + tcpstat.tcps_persistdrop++; + tp = tcp_drop(tp, ETIMEDOUT); + break; + } + } + tcp_setpersist(tp); + tp->t_force = 1; + (void) tcp_output(tp); + tp->t_force = 0; + break; + + /* + * Keep-alive timer went off; send something + * or drop connection if idle for too long. + */ + case TCPT_KEEP: + tcpstat.tcps_keeptimeo++; + if (tp->t_state < TCPS_ESTABLISHED) + goto dropit; + if ((always_keepalive || + tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && + tp->t_state <= TCPS_CLOSING) { + if (tp->t_idle >= tcp_keepidle + tcp_maxidle) + goto dropit; + /* + * Send a packet designed to force a response + * if the peer is up and reachable: + * either an ACK if the connection is still alive, + * or an RST if the peer has closed the connection + * due to timeout or reboot. + * Using sequence number tp->snd_una-1 + * causes the transmitted zero-length segment + * to lie outside the receive window; + * by the protocol spec, this requires the + * correspondent TCP to respond. + */ + tcpstat.tcps_keepprobe++; +#ifdef TCP_COMPAT_42 + /* + * The keepalive packet must have nonzero length + * to get a 4.2 host to respond. 
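[Editor's note] Using the constants from tcp_timer.h (added later in this patch), the keepalive lifetime enforced by the code above can be worked out directly: probing starts after tcp_keepidle of idleness, and the connection is dropped after a further tcp_maxidle = tcp_keepcnt * tcp_keepintvl with no response. A small sketch with the default values, all expressed in 500 ms ticks (PR_SLOWHZ = 2 assumed):

```c
#include <stdio.h>

/* Default keepalive constants, matching the tcp_timer.h definitions. */
#define PR_SLOWHZ	2			/* assumed slow ticks per second */
#define TCPTV_KEEP_IDLE	(120 * 60 * PR_SLOWHZ)	/* 2 hours before probing */
#define TCPTV_KEEPINTVL	(75 * PR_SLOWHZ)	/* 75 s between probes */
#define TCPTV_KEEPCNT	8			/* probes before giving up */

int
main(void)
{
	int maxidle = TCPTV_KEEPCNT * TCPTV_KEEPINTVL;	/* tcp_maxidle */
	int drop_after = TCPTV_KEEP_IDLE + maxidle;	/* t_idle drop threshold */

	printf("probing starts after %d s of idleness\n",
	    TCPTV_KEEP_IDLE / PR_SLOWHZ);
	printf("connection dropped after %d s (%d probes, one every %d s)\n",
	    drop_after / PR_SLOWHZ, TCPTV_KEEPCNT, TCPTV_KEEPINTVL / PR_SLOWHZ);
	return 0;
}
```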
+ */ + tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, + tp->rcv_nxt - 1, tp->snd_una - 1, 0); +#else + tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, + tp->rcv_nxt, tp->snd_una - 1, 0); +#endif + tp->t_timer[TCPT_KEEP] = tcp_keepintvl; + } else + tp->t_timer[TCPT_KEEP] = tcp_keepidle; + break; + dropit: + tcpstat.tcps_keepdrops++; + tp = tcp_drop(tp, ETIMEDOUT); + break; + } + return (tp); +} +#endif /* TUBA_INCLUDE */ diff --git a/netinet/tcp_timer.h b/netinet/tcp_timer.h new file mode 100644 index 0000000..4bbbb71 --- /dev/null +++ b/netinet/tcp_timer.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/netinet/tcp_timer.h,v 1.26 2004/08/16 18:32:07 rwatson Exp $ + */ + +#ifndef _NETINET_TCP_TIMER_H_ +#define _NETINET_TCP_TIMER_H_ + +/* + * Definitions of the TCP timers. These timers are counted + * down PR_SLOWHZ times a second. + */ +#define TCPT_NTIMERS 4 + +#define TCPT_REXMT 0 /* retransmit */ +#define TCPT_PERSIST 1 /* retransmit persistence */ +#define TCPT_KEEP 2 /* keep alive */ +#define TCPT_2MSL 3 /* 2*msl quiet time timer */ + +/* + * The TCPT_REXMT timer is used to force retransmissions. + * The TCP has the TCPT_REXMT timer set whenever segments + * have been sent for which ACKs are expected but not yet + * received. If an ACK is received which advances tp->snd_una, + * then the retransmit timer is cleared (if there are no more + * outstanding segments) or reset to the base value (if there + * are more ACKs expected). Whenever the retransmit timer goes off, + * we retransmit one unacknowledged segment, and do a backoff + * on the retransmit timer. + * + * The TCPT_PERSIST timer is used to keep window size information + * flowing even if the window goes shut. 
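/*
 * The timer slots above are not in seconds: tcp_slowtimo() decrements them
 * PR_SLOWHZ times a second, so the TCPTV_* constants below are tick counts.
 * A small worked conversion, assuming PR_SLOWHZ == 2 as in the classic BSD
 * stack (the real value comes from the protosw header); it also reproduces
 * the tcp_maxidle = tcp_keepcnt * tcp_keepintvl computation in tcp_slowtimo().
 */
#include <stdio.h>

#define EX_PR_SLOWHZ  2                         /* assumed slow-timeout rate */
#define EX_KEEP_IDLE  (120*60*EX_PR_SLOWHZ)     /* cf. TCPTV_KEEP_IDLE */
#define EX_KEEPINTVL  ( 75*EX_PR_SLOWHZ)        /* cf. TCPTV_KEEPINTVL */
#define EX_KEEPCNT    8                         /* cf. TCPTV_KEEPCNT */

int
main(void)
{
    int maxidle = EX_KEEPCNT * EX_KEEPINTVL;    /* total probing window */

    printf("probing starts after %d s of idle time\n",
           EX_KEEP_IDLE / EX_PR_SLOWHZ);        /* 7200 s = 2 hours */
    printf("then one probe every %d s for at most %d s before the drop\n",
           EX_KEEPINTVL / EX_PR_SLOWHZ,         /* 75 s */
           maxidle / EX_PR_SLOWHZ);             /* 600 s = 10 minutes */
    return 0;
}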
If all previous transmissions + * have been acknowledged (so that there are no retransmissions in progress), + * and the window is too small to bother sending anything, then we start + * the TCPT_PERSIST timer. When it expires, if the window is nonzero, + * we go to transmit state. Otherwise, at intervals send a single byte + * into the peer's window to force him to update our window information. + * We do this at most as often as TCPT_PERSMIN time intervals, + * but no more frequently than the current estimate of round-trip + * packet time. The TCPT_PERSIST timer is cleared whenever we receive + * a window update from the peer. + * + * The TCPT_KEEP timer is used to keep connections alive. If an + * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time, + * but not yet established, then we drop the connection. Once the connection + * is established, if the connection is idle for TCPTV_KEEP_IDLE time + * (and keepalives have been enabled on the socket), we begin to probe + * the connection. We force the peer to send us a segment by sending: + * + * This segment is (deliberately) outside the window, and should elicit + * an ack segment in response from the peer. If, despite the TCPT_KEEP + * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE + * amount of time probing, then we drop the connection. + */ + +/* + * Time constants. + */ +#define TCPTV_MSL ( 30*PR_SLOWHZ) /* max seg lifetime (hah!) */ +#define TCPTV_SRTTBASE 0 /* base roundtrip time; + if 0, no idea yet */ +#define TCPTV_RTOBASE ( 3*PR_SLOWHZ) /* assumed RTO if no info */ +#define TCPTV_SRTTDFLT ( 3*PR_SLOWHZ) /* assumed RTT if no info */ + +#define TCPTV_PERSMIN ( 5*PR_SLOWHZ) /* retransmit persistence */ +#define TCPTV_PERSMAX ( 60*PR_SLOWHZ) /* maximum persist interval */ + +#define TCPTV_KEEP_INIT ( 75*PR_SLOWHZ) /* initial connect keep alive */ +#define TCPTV_KEEP_IDLE (120*60*PR_SLOWHZ) /* dflt time before probing */ +#define TCPTV_KEEPINTVL ( 75*PR_SLOWHZ) /* default probe interval */ +#define TCPTV_KEEPCNT 8 /* max probes before drop */ + +#define TCPTV_MIN ( 1*PR_SLOWHZ) /* minimum allowable value */ +#define TCPTV_REXMTMAX ( 64*PR_SLOWHZ) /* max allowable REXMT value */ + +#define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ + +#define TCP_LINGERTIME 120 /* linger at most 2 minutes */ + +#define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ + +#ifdef TCPTIMERS +static char *tcptimers[] = + { "REXMT", "PERSIST", "KEEP", "2MSL" }; +#endif + +/* + * Force a time value to be in a certain range. + */ +#define TCPT_RANGESET(tv, value, tvmin, tvmax) { \ + (tv) = (value); \ + if ((u_long)(tv) < (u_long)(tvmin)) \ + (tv) = (tvmin); \ + else if ((u_long)(tv) > (u_long)(tvmax)) \ + (tv) = (tvmax); \ +} + +#ifdef _KERNEL +extern int tcp_keepinit; /* time to establish connection */ +extern int tcp_keepidle; /* time before keepalive probes begin */ +extern int tcp_maxidle; /* time to drop after starting probes */ +extern int tcp_ttl; /* time to live for TCP segs */ +extern int tcp_backoff[]; +#endif + +#endif diff --git a/netinet/tcp_usrreq.c b/netinet/tcp_usrreq.c new file mode 100644 index 0000000..7f4c110 --- /dev/null +++ b/netinet/tcp_usrreq.c @@ -0,0 +1,845 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 + * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.120 2005/05/01 14:01:38 rwatson Exp $ + */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "opt_tcpdebug.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif + +/* + * TCP protocol interface to socket abstraction. + */ +extern char *tcpstates[]; + +static int tcp_attach(struct socket *); +static int tcp_connect(struct tcpcb *, struct mbuf *); +static struct tcpcb * + tcp_disconnect(struct tcpcb *); +static struct tcpcb * + tcp_usrclosed(struct tcpcb *); + +#ifdef TCPDEBUG +#define TCPDEBUG0 int ostate +#define TCPDEBUG1() ostate = tp ? tp->t_state : 0 +#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ + tcp_trace(TA_USER, ostate, tp, 0, req) +#else +#define TCPDEBUG0 +#define TCPDEBUG1() +#define TCPDEBUG2(req) +#endif + +/* + * TCP attaches to socket via pru_attach(), reserving space, + * and an internet control block. + */ +static int +tcp_usr_attach(struct socket *so, intptr_t proto) +{ + int s = splnet(); + int error; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = 0; + TCPDEBUG0; + + TCPDEBUG1(); + if (inp) { + error = EISCONN; + goto out; + } + + error = tcp_attach(so); + if (error) + goto out; + + if ((so->so_options & SO_LINGER) && so->so_linger == 0) + so->so_linger = TCP_LINGERTIME * hz; + tp = sototcpcb(so); +out: + TCPDEBUG2(PRU_ATTACH); + splx(s); + return error; +} + +/* + * pru_detach() detaches the TCP protocol from the socket. + * If the protocol state is non-embryonic, then can't + * do this directly: have to initiate a pru_disconnect(), + * which may finish later; embryonic TCB's can just + * be discarded here. 
+ */ +static int +tcp_usr_detach(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + TCPDEBUG0; + + if (inp == 0) { + splx(s); + return EINVAL; /* XXX */ + } + tp = intotcpcb(inp); + TCPDEBUG1(); + if (tp->t_state > TCPS_LISTEN) + tp = tcp_disconnect(tp); + else + tp = tcp_close(tp); + + TCPDEBUG2(PRU_DETACH); + splx(s); + return error; +} + +#define COMMON_START() TCPDEBUG0; \ + do { \ + if (inp == 0) { \ + splx(s); \ + return EINVAL; \ + } \ + tp = intotcpcb(inp); \ + TCPDEBUG1(); \ + } while(0) + +#define COMMON_END(req) out: TCPDEBUG2(req); splx(s); return error; goto out + + +/* + * Give the socket an address. + */ +static int +tcp_usr_bind(struct socket *so, struct mbuf *nam) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct sockaddr_in *sinp; + + COMMON_START(); + + /* + * Must check for multicast addresses and disallow binding + * to them. + */ + sinp = mtod(nam, struct sockaddr_in *); + if (sinp->sin_family == AF_INET && + IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { + error = EAFNOSUPPORT; + goto out; + } + error = in_pcbbind(inp, nam); + if (error) + goto out; + COMMON_END(PRU_BIND); + +} + +/* + * Prepare to accept connections. + */ +static int +tcp_usr_listen(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + if (inp->inp_lport == 0) + error = in_pcbbind(inp, NULL); + if (error == 0) + tp->t_state = TCPS_LISTEN; + COMMON_END(PRU_LISTEN); +} + +/* + * Initiate connection to peer. + * Create a template for use in transmissions on this connection. + * Enter SYN_SENT state, and mark socket as connecting. + * Start keep-alive timer, and seed output sequence space. + * Send initial segment on connection. + */ +static int +tcp_usr_connect(struct socket *so, struct mbuf *nam) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct sockaddr_in *sinp; + + COMMON_START(); + + /* + * Must disallow TCP ``connections'' to multicast addresses. + */ + sinp = mtod(nam, struct sockaddr_in *); + if (sinp->sin_family == AF_INET + && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { + error = EAFNOSUPPORT; + goto out; + } + + if ((error = tcp_connect(tp, nam)) != 0) + goto out; + error = tcp_output(tp); + COMMON_END(PRU_CONNECT); +} + +/* + * Initiate disconnect from peer. + * If connection never passed embryonic stage, just drop; + * else if don't need to let data drain, then can just drop anyways, + * else have to begin TCP shutdown process: mark socket disconnecting, + * drain unread data, state switch to reflect user close, and + * send segment (e.g. FIN) to peer. Socket will be really disconnected + * when peer sends FIN and acks ours. + * + * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. + */ +static int +tcp_usr_disconnect(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + tp = tcp_disconnect(tp); + COMMON_END(PRU_DISCONNECT); +} + +/* + * Accept a connection. Essentially all the work is + * done at higher levels; just return the address + * of the peer, storing through addr. 
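/*
 * A user-level view of these handlers: each libc socket call lands in one of
 * the pru_* entry points collected in tcp_usrreqs later in this file.  The
 * mapping in the comments follows those handlers; the port and address are
 * made-up documentation values and error handling is abbreviated.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
ex_tcp_client(void)
{
    struct sockaddr_in sin;
    const char msg[] = "hello";
    int fd;

    fd = socket(AF_INET, SOCK_STREAM, 0);         /* -> tcp_usr_attach() */
    if (fd < 0)
        return -1;

    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    sin.sin_port = htons(7);                      /* example: echo port */
    sin.sin_addr.s_addr = inet_addr("192.0.2.1"); /* documentation address */

    if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
        close(fd);                                /* -> tcp_usr_detach() */
        return -1;
    }                                             /* -> tcp_usr_connect() */

    send(fd, msg, sizeof(msg) - 1, 0);            /* -> tcp_usr_send() */
    shutdown(fd, SHUT_WR);                        /* -> tcp_usr_shutdown() */
    close(fd);                                    /* -> disconnect/detach */
    return 0;
}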
+ */ +static int +tcp_usr_accept(struct socket *so, struct mbuf *nam) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + in_setpeeraddr(inp, nam); + COMMON_END(PRU_ACCEPT); +} + +/* + * Mark the connection as being incapable of further output. + */ +static int +tcp_usr_shutdown(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + socantsendmore(so); + tp = tcp_usrclosed(tp); + if (tp) + error = tcp_output(tp); + COMMON_END(PRU_SHUTDOWN); +} + +/* + * After a receive, possibly send window update to peer. + */ +static int +tcp_usr_rcvd(struct socket *so, intptr_t flags) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + tcp_output(tp); + COMMON_END(PRU_RCVD); +} + +/* + * Do a send by putting data in output queue and updating urgent + * marker if URG set. Possibly send more data. + */ +static int +tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct mbuf *nam, + struct mbuf *control) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + if (control && control->m_len) { + m_freem(control); /* XXX shouldn't caller do this??? */ + if (m) + m_freem(m); + error = EINVAL; + goto out; + } + + if(!(flags & PRUS_OOB)) { + sbappend(&so->so_snd, m); + if (nam && tp->t_state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected, + * initialize window to default value, and + * initialize maxseg/maxopd using peer's cached + * MSS. + */ + error = tcp_connect(tp, nam); + if (error) + goto out; + tp->snd_wnd = TTCP_CLIENT_SND_WND; + tcp_mss(tp, -1); + } + + if (flags & PRUS_EOF) { + /* + * Close the send side of the connection after + * the data is sent. + */ + socantsendmore(so); + tp = tcp_usrclosed(tp); + } + if (tp != NULL) + error = tcp_output(tp); + } else { + if (sbspace(&so->so_snd) < -512) { + m_freem(m); + error = ENOBUFS; + goto out; + } + /* + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section. + * Otherwise, snd_up should be one lower. + */ + sbappend(&so->so_snd, m); + if (nam && tp->t_state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected, + * initialize window to default value, and + * initialize maxseg/maxopd using peer's cached + * MSS. + */ + error = tcp_connect(tp, nam); + if (error) + goto out; + tp->snd_wnd = TTCP_CLIENT_SND_WND; + tcp_mss(tp, -1); + } + tp->snd_up = tp->snd_una + so->so_snd.sb_cc; + tp->t_force = 1; + error = tcp_output(tp); + tp->t_force = 0; + } + COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB : + ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); +} + +/* + * Abort the TCP. + */ +static int +tcp_usr_abort(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + tp = tcp_drop(tp, ECONNABORTED); + COMMON_END(PRU_ABORT); +} + +/* + * Fill in st_bklsize for fstat() operations on a socket. + */ +static int +tcp_usr_sense(struct socket *so, struct stat *sb) +{ + int s = splnet(); + + sb->st_blksize = so->so_snd.sb_hiwat; + splx(s); + return 0; +} + +/* + * Receive out-of-band data. 
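/*
 * How the urgent-data paths look from user level: MSG_OOB on send marks the
 * last byte urgent (the PRUS_OOB branch of tcp_usr_send() above), and MSG_OOB
 * on recv pulls the single out-of-band byte handled by the receive path
 * below.  With SO_OOBINLINE set, that path returns EINVAL and the byte is
 * read from the normal data stream instead.  Illustrative sketch only.
 */
#include <sys/socket.h>

void
ex_urgent_byte(int fd)
{
    char mark = '!';
    char oob;

    /* sender: the byte travels with the URG pointer set */
    send(fd, &mark, 1, MSG_OOB);

    /* receiver: fetch it out of band; MSG_PEEK would leave
     * TCPOOB_HAVEDATA set so the byte could be read again */
    recv(fd, &oob, 1, MSG_OOB);
}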
+ */ +static int +tcp_usr_rcvoob(struct socket *so, struct mbuf *m, intptr_t flags) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + if ((so->so_oobmark == 0 && + (so->so_state & SS_RCVATMARK) == 0) || + so->so_options & SO_OOBINLINE || + tp->t_oobflags & TCPOOB_HADDATA) { + error = EINVAL; + goto out; + } + if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { + error = EWOULDBLOCK; + goto out; + } + m->m_len = 1; + *mtod(m, caddr_t) = tp->t_iobc; + if ((flags & MSG_PEEK) == 0) + tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); + COMMON_END(PRU_RCVOOB); +} + +static int +tcp_usr_sockaddr(struct socket *so, struct mbuf *nam) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + in_setsockaddr(inp, nam); + COMMON_END(PRU_SOCKADDR); +} + +static int +tcp_usr_peeraddr(struct socket *so, struct mbuf *nam) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + in_setpeeraddr(inp, nam); + COMMON_END(PRU_PEERADDR); +} + +/* + * XXX - this should just be a call to in_control, but we need to get + * the types worked out. + */ +static int +tcp_usr_control(struct socket *so, intptr_t cmd, caddr_t arg, struct ifnet *ifp) +{ + return in_control(so, cmd, arg, ifp); +} + +/* xxx - should be const */ +struct pr_usrreqs tcp_usrreqs = { + tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind, + tcp_usr_connect, pru_connect2_notsupp, tcp_usr_control, tcp_usr_detach, + tcp_usr_disconnect, tcp_usr_listen, tcp_usr_peeraddr, tcp_usr_rcvd, + tcp_usr_rcvoob, tcp_usr_send, tcp_usr_sense, tcp_usr_shutdown, + tcp_usr_sockaddr +}; + +/* + * Common subroutine to open a TCP connection to remote host specified + * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local + * port number if needed. Call in_pcbladdr to do the routing and to choose + * a local host address (interface). If there is an existing incarnation + * of the same connection in TIME-WAIT state and if the remote host was + * sending CC options and if the connection duration was < MSL, then + * truncate the previous TIME-WAIT state and proceed. + * Initialize connection parameters and enter SYN-SENT state. + */ +static int +tcp_connect(struct tcpcb *tp, struct mbuf *nam) +{ + struct inpcb *inp = tp->t_inpcb, *oinp; + struct socket *so = inp->inp_socket; + struct tcpcb *otp; + struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *); + struct sockaddr_in *ifaddr; + int error; + struct rmxp_tao *taop; + struct rmxp_tao tao_noncached; + + if (inp->inp_lport == 0) { + error = in_pcbbind(inp, NULL); + if (error) + return error; + } + + /* + * Cannot simply call in_pcbconnect, because there might be an + * earlier incarnation of this same connection still in + * TIME_WAIT state, creating an ADDRINUSE error. + */ + error = in_pcbladdr(inp, nam, &ifaddr); + if (error) + return error; + oinp = in_pcblookuphash(inp->inp_pcbinfo, + sin->sin_addr, sin->sin_port, + inp->inp_laddr.s_addr != INADDR_ANY ? 
inp->inp_laddr + : ifaddr->sin_addr, + inp->inp_lport, 0); + if (oinp) { + if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && + otp->t_state == TCPS_TIME_WAIT && + otp->t_duration < TCPTV_MSL && + (otp->t_flags & TF_RCVD_CC)) + otp = tcp_close(otp); + else + return EADDRINUSE; + } + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = ifaddr->sin_addr; + inp->inp_faddr = sin->sin_addr; + inp->inp_fport = sin->sin_port; + in_pcbrehash(inp); + + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + in_pcbdisconnect(inp); + return ENOBUFS; + } + + /* Compute window scaling to request. */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) + tp->request_r_scale++; + + soisconnecting(so); + tcpstat.tcps_connattempt++; + tp->t_state = TCPS_SYN_SENT; + tp->t_timer[TCPT_KEEP] = tcp_keepinit; + tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2; + tcp_sendseqinit(tp); + + /* + * Generate a CC value for this connection and + * check whether CC or CCnew should be used. + */ + if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + tp->cc_send = CC_INC(tcp_ccgen); + if (taop->tao_ccsent != 0 && + CC_GEQ(tp->cc_send, taop->tao_ccsent)) { + taop->tao_ccsent = tp->cc_send; + } else { + taop->tao_ccsent = 0; + tp->t_flags |= TF_SENDCCNEW; + } + + return 0; +} + +int +tcp_ctloutput(int op, struct socket *so, int level, int optname, + struct mbuf **mp) +{ + int error = 0, s; + struct inpcb *inp; + register struct tcpcb *tp; + register struct mbuf *m; + register int i; + + s = splnet(); + inp = sotoinpcb(so); + if (inp == NULL) { + splx(s); + if (op == PRCO_SETOPT && *mp) + (void) m_free(*mp); + return (ECONNRESET); + } + if (level != IPPROTO_TCP) { + error = ip_ctloutput(op, so, level, optname, mp); + splx(s); + return (error); + } + tp = intotcpcb(inp); + + switch (op) { + + case PRCO_SETOPT: + m = *mp; + switch (optname) { + + case TCP_NODELAY: + if (m == NULL || m->m_len < sizeof (int)) + error = EINVAL; + else if (*mtod(m, int *)) + tp->t_flags |= TF_NODELAY; + else + tp->t_flags &= ~TF_NODELAY; + break; + + case TCP_MAXSEG: + if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg) + tp->t_maxseg = i; + else + error = EINVAL; + break; + + case TCP_NOOPT: + if (m == NULL || m->m_len < sizeof (int)) + error = EINVAL; + else if (*mtod(m, int *)) + tp->t_flags |= TF_NOOPT; + else + tp->t_flags &= ~TF_NOOPT; + break; + + case TCP_NOPUSH: + if (m == NULL || m->m_len < sizeof (int)) + error = EINVAL; + else if (*mtod(m, int *)) + tp->t_flags |= TF_NOPUSH; + else + tp->t_flags &= ~TF_NOPUSH; + break; + + default: + error = ENOPROTOOPT; + break; + } + if (m) + (void) m_free(m); + break; + + case PRCO_GETOPT: + *mp = m = m_get(M_WAIT, MT_SOOPTS); + m->m_len = sizeof(int); + + switch (optname) { + case TCP_NODELAY: + *mtod(m, int *) = tp->t_flags & TF_NODELAY; + break; + case TCP_MAXSEG: + *mtod(m, int *) = tp->t_maxseg; + break; + case TCP_NOOPT: + *mtod(m, int *) = tp->t_flags & TF_NOOPT; + break; + case TCP_NOPUSH: + *mtod(m, int *) = tp->t_flags & TF_NOPUSH; + break; + default: + error = ENOPROTOOPT; + break; + } + break; + } + splx(s); + return (error); +} + +/* + * tcp_sendspace and tcp_recvspace are the default send and receive window + * sizes, respectively. These are obsolescent (this information should + * be set by the route). 
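/*
 * The PRCO_SETOPT/PRCO_GETOPT cases above are what a setsockopt()/getsockopt()
 * call at level IPPROTO_TCP turns into.  A small usage sketch; note that
 * TCP_MAXSEG can only be lowered, never raised, per the check in
 * tcp_ctloutput() above.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int
ex_tcp_options(int fd)
{
    int on = 1;
    int mss;
    socklen_t len = sizeof(mss);

    /* disable the Nagle algorithm: sets TF_NODELAY in tp->t_flags */
    if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)) < 0)
        return -1;

    /* read back the current maximum segment size (tp->t_maxseg) */
    if (getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, &len) < 0)
        return -1;
    return mss;
}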
+ */ +u_long tcp_sendspace = 1024*16; +SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, + CTLFLAG_RW, &tcp_sendspace , 0, ""); +u_long tcp_recvspace = 1024*16; +SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, + CTLFLAG_RW, &tcp_recvspace , 0, ""); + +#if defined(__rtems__) +void rtems_set_tcp_buffer_sizes(u_long sendspace, u_long recvspace) +{ + if ( sendspace != 0 ) + tcp_sendspace = sendspace; + if ( recvspace != 0 ) + tcp_recvspace = recvspace; +} +#endif + +/* + * Attach TCP protocol to socket, allocating + * internet protocol control block, tcp control block, + * bufer space, and entering LISTEN state if to accept connections. + */ +static int +tcp_attach(struct socket *so) +{ + register struct tcpcb *tp; + struct inpcb *inp; + int error; + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + error = soreserve(so, tcp_sendspace, tcp_recvspace); + if (error) + return (error); + } + error = in_pcballoc(so, &tcbinfo); + if (error) + return (error); + inp = sotoinpcb(so); + tp = tcp_newtcpcb(inp); + if (tp == 0) { + int nofd = so->so_state & SS_NOFDREF; /* XXX */ + + so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ + in_pcbdetach(inp); + so->so_state |= nofd; + return (ENOBUFS); + } + tp->t_state = TCPS_CLOSED; + return (0); +} + +/* + * Initiate (or continue) disconnect. + * If embryonic state, just send reset (once). + * If in ``let data drain'' option and linger null, just drop. + * Otherwise (hard), mark socket disconnecting and drop + * current input data; switch states based on user close, and + * send segment to peer (with FIN). + */ +static struct tcpcb * +tcp_disconnect(struct tcpcb *tp) +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (tp->t_state < TCPS_ESTABLISHED) + tp = tcp_close(tp); + else if ((so->so_options & SO_LINGER) && so->so_linger == 0) + tp = tcp_drop(tp, 0); + else { + soisdisconnecting(so); + sbflush(&so->so_rcv); + tp = tcp_usrclosed(tp); + if (tp) + (void) tcp_output(tp); + } + return (tp); +} + +/* + * User issued close, and wish to trail through shutdown states: + * if never received SYN, just forget it. If got a SYN from peer, + * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. + * If already got a FIN from peer, then almost done; go to LAST_ACK + * state. In all other cases, have already sent FIN to peer (e.g. + * after PRU_SHUTDOWN), and just have to play tedious game waiting + * for peer to send FIN or not respond to keep-alives, etc. + * We can let the user exit from the close as soon as the FIN is acked. + */ +static struct tcpcb * +tcp_usrclosed(struct tcpcb *tp) +{ + + switch (tp->t_state) { + + case TCPS_CLOSED: + case TCPS_LISTEN: + tp->t_state = TCPS_CLOSED; + tp = tcp_close(tp); + break; + + case TCPS_SYN_SENT: + case TCPS_SYN_RECEIVED: + tp->t_flags |= TF_NEEDFIN; + break; + + case TCPS_ESTABLISHED: + tp->t_state = TCPS_FIN_WAIT_1; + break; + + case TCPS_CLOSE_WAIT: + tp->t_state = TCPS_LAST_ACK; + break; + } + if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { + soisdisconnected(tp->t_inpcb->inp_socket); + /* To prevent the connection hanging in FIN_WAIT_2 forever. */ + if (tp->t_state == TCPS_FIN_WAIT_2) + tp->t_timer[TCPT_2MSL] = tcp_maxidle; + } + return (tp); +} diff --git a/netinet/tcp_var.h b/netinet/tcp_var.h new file mode 100644 index 0000000..cd70970 --- /dev/null +++ b/netinet/tcp_var.h @@ -0,0 +1,414 @@ +/* + * Copyright (c) 1982, 1986, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 + * $FreeBSD: src/sys/netinet/tcp_var.h,v 1.121 2005/04/21 20:11:01 ps Exp $ + */ + +#ifndef _NETINET_TCP_VAR_H_ +#define _NETINET_TCP_VAR_H_ + +#include + +/* + * Kernel variables for tcp. + */ + +#ifdef __BSD_VISIBLE +#include /* TCPT_NTIMERS */ + +/* + * Tcp control block, one per tcp; fields: + */ +struct tcpcb { + struct tcpiphdr *seg_next; /* sequencing queue */ + struct tcpiphdr *seg_prev; + int t_state; /* state of this connection */ + u_int t_flags; +#define TF_ACKNOW 0x000001 /* ack peer immediately */ +#define TF_DELACK 0x000002 /* ack, but try to delay it */ +#define TF_NODELAY 0x000004 /* don't delay packets to coalesce */ +#define TF_NOOPT 0x000008 /* don't use tcp options */ +#define TF_SENTFIN 0x000010 /* have sent FIN */ +#define TF_REQ_SCALE 0x000020 /* have/will request window scaling */ +#define TF_RCVD_SCALE 0x000040 /* other side has requested scaling */ +#define TF_REQ_TSTMP 0x000080 /* have/will request timestamps */ +#define TF_RCVD_TSTMP 0x000100 /* a timestamp was received in SYN */ +#define TF_SACK_PERMIT 0x000200 /* other side said I could SACK */ +#define TF_NEEDSYN 0x000400 /* send SYN (implicit state) */ +#define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */ +#define TF_NOPUSH 0x001000 /* don't push */ +#define TF_REQ_CC 0x002000 /* have/will request CC */ +#define TF_RCVD_CC 0x004000 /* a CC was received in SYN */ +#define TF_SENDCCNEW 0x008000 /* send CCnew instead of CC in SYN */ +#define TF_MORETOCOME 0x010000 /* More data to be appended to sock */ +#define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */ +#define TF_LASTIDLE 0x040000 /* connection was previously idle */ +#define TF_RXWIN0SENT 0x080000 /* sent a receiver win 0 in response */ +#define TF_FASTRECOVERY 0x100000 /* in NewReno Fast Recovery */ +#define TF_WASFRECOVERY 0x200000 /* was in NewReno Fast Recovery */ +#define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ + int t_force; /* 1 if forcing out a byte */ + int t_timer[TCPT_NTIMERS]; /* tcp timers */ + int 
t_rxtshift; /* log(2) of rexmt exp. backoff */ + int t_rxtcur; /* current retransmit value */ + int t_dupacks; /* consecutive dup acks recd */ + u_int t_maxseg; /* maximum segment size */ + u_int t_maxopd; /* mss plus options */ + struct tcpiphdr *t_template; /* skeletal packet for transmit */ + struct inpcb *t_inpcb; /* back pointer to internet pcb */ +/* + * The following fields are used as in the protocol specification. + * See RFC783, Dec. 1981, page 21. + */ +/* send sequence variables */ + tcp_seq snd_una; /* send unacknowledged */ + tcp_seq snd_max; /* highest sequence number sent; + * used to recognize retransmits + */ + tcp_seq snd_nxt; /* send next */ + tcp_seq snd_up; /* send urgent pointer */ + + tcp_seq snd_wl1; /* window update seg seq number */ + tcp_seq snd_wl2; /* window update seg ack number */ + tcp_seq iss; /* initial send sequence number */ + tcp_seq irs; /* initial receive sequence number */ + + tcp_seq rcv_nxt; /* receive next */ + tcp_seq rcv_adv; /* advertised window */ + u_long rcv_wnd; /* receive window */ + tcp_seq rcv_up; /* receive urgent pointer */ + + u_long snd_wnd; /* send window */ +/* + * Additional variables for this implementation. + */ +/* congestion control (for slow start, source quench, retransmit after loss) */ + u_long snd_cwnd; /* congestion-controlled window */ + u_long snd_ssthresh; /* snd_cwnd size threshold for + * for slow start exponential to + * linear switch + */ +/* + * transmit timing stuff. See below for scale of srtt and rttvar. + * "Variance" is actually smoothed difference. + */ + u_int t_idle; /* inactivity time */ + int t_rtt; /* round trip time */ + tcp_seq t_rtseq; /* sequence number being timed */ + int t_srtt; /* smoothed round-trip time */ + int t_rttvar; /* variance in round-trip time */ + u_int t_rttmin; /* minimum rtt allowed */ + u_long max_sndwnd; /* largest window peer has offered */ + + int t_softerror; /* possible error not yet reported */ +/* out-of-band data */ + char t_oobflags; /* have some */ + char t_iobc; /* input character */ +#define TCPOOB_HAVEDATA 0x01 +#define TCPOOB_HADDATA 0x02 +/* RFC 1323 variables */ + u_char snd_scale; /* window scaling for send window */ + u_char rcv_scale; /* window scaling for recv window */ + u_char request_r_scale; /* pending window scaling */ + u_char requested_s_scale; + u_long ts_recent; /* timestamp echo data */ + + u_long ts_recent_age; /* when last updated */ + tcp_seq last_ack_sent; +/* RFC 1644 variables */ + tcp_cc cc_send; /* send connection count */ + tcp_cc cc_recv; /* receive connection count */ + u_long t_duration; /* connection duration */ + +/* TUBA stuff */ + caddr_t t_tuba_pcb; /* next level down pcb for TCP over z */ +/* More RTT stuff */ + u_long t_rttupdated; /* number of times rtt sampled */ +}; + +/* + * Structure to hold TCP options that are only used during segment + * processing (in tcp_input), but not held in the tcpcb. + * It's basically used to reduce the number of parameters + * to tcp_dooptions. 
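/*
 * The structure this comment introduces (struct tcpopt, defined just below)
 * is filled in by tcp_dooptions() while walking the option bytes of an
 * incoming segment.  An illustrative TLV walk over an option area, with
 * kinds and lengths per RFC 793/1323; this is a stand-alone sketch with ex_
 * names, not the stack's parser.
 */
#include <stdint.h>
#include <string.h>

#define EX_TOF_TS    0x0001
#define EX_TOF_MSS   0x0010
#define EX_TOF_SCALE 0x0020

struct ex_tcpopt {
    unsigned long to_flags;
    uint16_t      to_mss;                 /* still in network byte order */
    uint8_t       to_wscale;
    uint32_t      to_tsval, to_tsecr;     /* still in network byte order */
};

static void
ex_dooptions(struct ex_tcpopt *to, const uint8_t *cp, int cnt)
{
    int opt, optlen;

    memset(to, 0, sizeof(*to));
    for (; cnt > 0; cnt -= optlen, cp += optlen) {
        opt = cp[0];
        if (opt == 0)                     /* end of option list */
            break;
        if (opt == 1) {                   /* no-operation (padding) */
            optlen = 1;
            continue;
        }
        if (cnt < 2 || (optlen = cp[1]) < 2 || optlen > cnt)
            break;                        /* malformed option area */
        switch (opt) {
        case 2:                           /* maximum segment size */
            if (optlen == 4) {
                memcpy(&to->to_mss, cp + 2, 2);
                to->to_flags |= EX_TOF_MSS;
            }
            break;
        case 3:                           /* window scale */
            if (optlen == 3) {
                to->to_wscale = cp[2];
                to->to_flags |= EX_TOF_SCALE;
            }
            break;
        case 8:                           /* timestamps */
            if (optlen == 10) {
                memcpy(&to->to_tsval, cp + 2, 4);
                memcpy(&to->to_tsecr, cp + 6, 4);
                to->to_flags |= EX_TOF_TS;
            }
            break;
        }
    }
}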
+ */ +struct tcpopt { + u_long to_flags; /* which options are present */ +#define TOF_TS 0x0001 /* timestamp */ +#define TOF_CC 0x0002 /* CC and CCnew are exclusive */ +#define TOF_CCNEW 0x0004 +#define TOF_CCECHO 0x0008 +#define TOF_MSS 0x0010 +#define TOF_SCALE 0x0020 +#define TOF_SIGNATURE 0x0040 /* signature option present */ +#define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */ +#define TOF_SACK 0x0100 /* Peer sent SACK option */ + u_int32_t to_tsval; + u_int32_t to_tsecr; + tcp_cc to_cc; /* holds CC or CCnew */ + tcp_cc to_ccecho; +}; + +/* + * The TAO cache entry which is stored in the protocol family specific + * portion of the route metrics. + */ +struct rmxp_tao { + tcp_cc tao_cc; /* latest CC in valid SYN */ + tcp_cc tao_ccsent; /* latest CC sent to peer */ + u_short tao_mssopt; /* peer's cached MSS */ +}; +#define rmx_taop(r) ((struct rmxp_tao *)(r).rmx_filler) + +#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) +#define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) +#define sototcpcb(so) (intotcpcb(sotoinpcb(so))) +#endif /* __BSD_VISIBLE */ + +/* + * The smoothed round-trip time and estimated variance + * are stored as fixed point numbers scaled by the values below. + * For convenience, these scales are also used in smoothing the average + * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). + * With these scales, srtt has 3 bits to the right of the binary point, + * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the + * binary point, and is smoothed with an ALPHA of 0.75. + */ +#define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ +#define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ +#define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ +#define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ +#define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ + +/* + * The initial retransmission should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + * This version of the macro adapted from a paper by Lawrence + * Brakmo and Larry Peterson which outlines a problem caused + * by insufficient precision in the original implementation, + * which results in inappropriately large RTO values for very + * fast networks. + */ +#define TCP_REXMTVAL(tp) \ + ((((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) + +/* XXX + * We want to avoid doing m_pullup on incoming packets but that + * means avoiding dtom on the tcp reassembly code. That in turn means + * keeping an mbuf pointer in the reassembly queue (since we might + * have a cluster). As a quick hack, the source & destination + * port numbers (which are no longer needed once we've located the + * tcpcb) are overlayed with an mbuf pointer. 
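/*
 * A plain-integer sketch of the smoothing described above: srtt is an
 * exponentially weighted average with gain 1/8, rttvar a smoothed mean
 * deviation with gain 1/4, and the retransmit timeout srtt + 4*rttvar,
 * clamped to a range.  The real code keeps both values pre-scaled
 * (TCP_RTT_SCALE/TCP_RTTVAR_SCALE) and folds the scaling into TCP_REXMTVAL,
 * so the shifts there differ; this shows the unscaled idea only.
 */
struct ex_rtt {
    int srtt;    /* smoothed round-trip time, in timer ticks */
    int rttvar;  /* smoothed mean deviation, in timer ticks */
};

static int
ex_rtt_update(struct ex_rtt *r, int sample, int rttmin, int rexmtmax)
{
    int delta = sample - r->srtt;
    int rto;

    r->srtt += delta / 8;                 /* srtt = 7/8 srtt + 1/8 sample */
    if (delta < 0)
        delta = -delta;
    r->rttvar += (delta - r->rttvar) / 4; /* rttvar = 3/4 rttvar + 1/4 |dev| */

    rto = r->srtt + 4 * r->rttvar;        /* analogous to TCP_REXMTVAL */
    if (rto < rttmin)                     /* TCPT_RANGESET-style clamp */
        rto = rttmin;
    else if (rto > rexmtmax)
        rto = rexmtmax;
    return rto;
}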
+ */ +#if (defined(__GNUC__) && (defined(__arm__) || defined(__mips__))) +#define STR32_UNALGN(ti,m) \ + (ti)->ti_sport = (unsigned short)(((unsigned int) m & 0xffff0000) >> 16); \ + (ti)->ti_dport = (unsigned short) ((unsigned int) m & 0x0000ffff); +#define LD32_UNALGN(ti,m) \ + m = (struct mbuf *)((((unsigned int) (ti)->ti_sport) << 16) | ( (unsigned int)(ti)->ti_dport)); + +#else +#define REASS_MBUF(ti) (*(struct mbuf **)&((ti)->ti_t)) +#endif + +/* + * TCP statistics. + * Many of these should be kept per connection, + * but that's inconvenient at the moment. + */ +struct tcpstat { + u_long tcps_connattempt; /* connections initiated */ + u_long tcps_accepts; /* connections accepted */ + u_long tcps_connects; /* connections established */ + u_long tcps_drops; /* connections dropped */ + u_long tcps_conndrops; /* embryonic connections dropped */ + u_long tcps_closed; /* conn. closed (includes drops) */ + u_long tcps_segstimed; /* segs where we tried to get rtt */ + u_long tcps_rttupdated; /* times we succeeded */ + u_long tcps_delack; /* delayed acks sent */ + u_long tcps_timeoutdrop; /* conn. dropped in rxmt timeout */ + u_long tcps_rexmttimeo; /* retransmit timeouts */ + u_long tcps_persisttimeo; /* persist timeouts */ + u_long tcps_keeptimeo; /* keepalive timeouts */ + u_long tcps_keepprobe; /* keepalive probes sent */ + u_long tcps_keepdrops; /* connections dropped in keepalive */ + + u_long tcps_sndtotal; /* total packets sent */ + u_long tcps_sndpack; /* data packets sent */ + u_long tcps_sndbyte; /* data bytes sent */ + u_long tcps_sndrexmitpack; /* data packets retransmitted */ + u_long tcps_sndrexmitbyte; /* data bytes retransmitted */ + u_long tcps_sndacks; /* ack-only packets sent */ + u_long tcps_sndprobe; /* window probes sent */ + u_long tcps_sndurg; /* packets sent with URG only */ + u_long tcps_sndwinup; /* window update-only packets sent */ + u_long tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ + + u_long tcps_rcvtotal; /* total packets received */ + u_long tcps_rcvpack; /* packets received in sequence */ + u_long tcps_rcvbyte; /* bytes received in sequence */ + u_long tcps_rcvbadsum; /* packets received with ccksum errs */ + u_long tcps_rcvbadoff; /* packets received with bad offset */ + u_long tcps_rcvshort; /* packets received too short */ + u_long tcps_rcvduppack; /* duplicate-only packets received */ + u_long tcps_rcvdupbyte; /* duplicate-only bytes received */ + u_long tcps_rcvpartduppack; /* packets with some duplicate data */ + u_long tcps_rcvpartdupbyte; /* dup. bytes in part-dup. 
packets */ + u_long tcps_rcvoopack; /* out-of-order packets received */ + u_long tcps_rcvoobyte; /* out-of-order bytes received */ + u_long tcps_rcvpackafterwin; /* packets with data after window */ + u_long tcps_rcvbyteafterwin; /* bytes rcvd after window */ + u_long tcps_rcvafterclose; /* packets rcvd after "close" */ + u_long tcps_rcvwinprobe; /* rcvd window probe packets */ + u_long tcps_rcvdupack; /* rcvd duplicate acks */ + u_long tcps_rcvacktoomuch; /* rcvd acks for unsent data */ + u_long tcps_rcvackpack; /* rcvd ack packets */ + u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */ + u_long tcps_rcvwinupd; /* rcvd window update packets */ + u_long tcps_pawsdrop; /* segments dropped due to PAWS */ + u_long tcps_predack; /* times hdr predict ok for acks */ + u_long tcps_preddat; /* times hdr predict ok for data pkts */ + u_long tcps_pcbcachemiss; + u_long tcps_cachedrtt; /* times cached RTT in route updated */ + u_long tcps_cachedrttvar; /* times cached rttvar updated */ + u_long tcps_cachedssthresh; /* times cached ssthresh updated */ + u_long tcps_usedrtt; /* times RTT initialized from route */ + u_long tcps_usedrttvar; /* times RTTVAR initialized from rt */ + u_long tcps_usedssthresh; /* times ssthresh initialized from rt*/ + u_long tcps_persistdrop; /* timeout in persist state */ + u_long tcps_badsyn; /* bogus SYN, e.g. premature ACK */ + u_long tcps_mturesent; /* resends due to MTU discovery */ + u_long tcps_listendrop; /* listen queue overflows */ +}; + +/* + * TCB structure exported to user-land via sysctl(3). + * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been + * included. Not all of our clients do. + */ +#if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) +struct xtcpcb { + size_t xt_len; + struct inpcb xt_inp; + struct tcpcb xt_tp; +#if 0 + struct xsocket xt_socket; + u_quad_t xt_alignment_hack; +#endif +}; +#endif + +/* + * Names for TCP sysctl objects + */ +#define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ +#define TCPCTL_DO_RFC1644 2 /* use RFC-1644 extensions */ +#define TCPCTL_MSSDFLT 3 /* MSS default */ +#define TCPCTL_STATS 4 /* statistics (read-only) */ +#define TCPCTL_RTTDFLT 5 /* default RTT estimate */ +#define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ +#define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ +#define TCPCTL_SENDSPACE 8 /* send buffer space */ +#define TCPCTL_RECVSPACE 9 /* receive buffer space */ +#define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ +#define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ +#define TCPCTL_MAXID 12 + +#define TCPCTL_NAMES { \ + { 0, 0 }, \ + { "rfc1323", CTLTYPE_INT }, \ + { "rfc1644", CTLTYPE_INT }, \ + { "mssdflt", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "rttdflt", CTLTYPE_INT }, \ + { "keepidle", CTLTYPE_INT }, \ + { "keepintvl", CTLTYPE_INT }, \ + { "sendspace", CTLTYPE_INT }, \ + { "recvspace", CTLTYPE_INT }, \ + { "keepinit", CTLTYPE_INT }, \ +} + +#ifdef _KERNEL +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet_tcp); +#endif + +extern struct inpcbhead tcb; /* head of queue of active tcpcb's */ +extern struct inpcbinfo tcbinfo; +extern struct tcpstat tcpstat; /* tcp statistics */ +extern int tcp_mssdflt; /* XXX */ +extern u_long tcp_now; /* for RFC 1323 timestamps */ + +void tcp_canceltimers(struct tcpcb *); +struct tcpcb * + tcp_close(struct tcpcb *); +void tcp_ctlinput(int, struct sockaddr *, void *); +int tcp_ctloutput(int, struct socket *, int, int, struct mbuf **); +struct tcpcb * + tcp_drop(struct tcpcb *, int); +void tcp_drain(void); 
+void tcp_fasttimo(void); +struct rmxp_tao * + tcp_gettaocache(struct inpcb *); +void tcp_init(void); +void tcp_input(struct mbuf *, int); +void tcp_mss(struct tcpcb *, int); +int tcp_mssopt(struct tcpcb *); +void tcp_mtudisc(struct inpcb *, int); +struct tcpcb * + tcp_newtcpcb(struct inpcb *); +int tcp_output(struct tcpcb *); +void tcp_quench(struct inpcb *, int); +void tcp_respond(struct tcpcb *, + struct tcpiphdr *, struct mbuf *, tcp_seq, tcp_seq, int); +struct rtentry * + tcp_rtlookup(struct inpcb *); +void tcp_setpersist(struct tcpcb *); +void tcp_slowtimo(void); +struct tcpiphdr * + tcp_template(struct tcpcb *); +struct tcpcb * + tcp_timers(struct tcpcb *, int); +void tcp_trace(short, short, struct tcpcb *, struct tcpiphdr *, int); + +extern struct pr_usrreqs tcp_usrreqs; +extern u_long tcp_sendspace; +extern u_long tcp_recvspace; + +#endif /* _KERNEL */ + +#endif /* _NETINET_TCP_VAR_H_ */ diff --git a/netinet/tcpip.h b/netinet/tcpip.h new file mode 100644 index 0000000..1f16bd0 --- /dev/null +++ b/netinet/tcpip.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcpip.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD: src/sys/netinet/tcpip.h,v 1.12.22.1.4.1 2010/06/14 02:09:06 kensmith Exp $ + */ + + +#ifndef _NETINET_TCPIP_H_ +#define _NETINET_TCPIP_H_ + +#ifdef __BSD_VISIBLE +#include /* struct tcphdr */ +#include /* struct ipovly */ + +/* + * Tcp+ip header, after ip options removed. 
+ */ +struct tcpiphdr { + struct ipovly ti_i; /* overlaid ip structure */ + struct tcphdr ti_t; /* tcp header */ +}; +#define ti_next ti_i.ih_next +#define ti_prev ti_i.ih_prev +#define ti_x1 ti_i.ih_x1 +#define ti_pr ti_i.ih_pr +#define ti_len ti_i.ih_len +#define ti_src ti_i.ih_src +#define ti_dst ti_i.ih_dst +#define ti_sport ti_t.th_sport +#define ti_dport ti_t.th_dport +#define ti_seq ti_t.th_seq +#define ti_ack ti_t.th_ack +#define ti_x2 ti_t.th_x2 +#define ti_off ti_t.th_off +#define ti_flags ti_t.th_flags +#define ti_win ti_t.th_win +#define ti_sum ti_t.th_sum +#define ti_urp ti_t.th_urp +#endif /* __BSD_VISIBLE */ + +#endif diff --git a/netinet/udp.h b/netinet/udp.h new file mode 100644 index 0000000..98fc433 --- /dev/null +++ b/netinet/udp.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp.h 8.1 (Berkeley) 6/10/93 + */ + +#ifndef _NETINET_UDP_H_ +#define _NETINET_UDP_H_ + +/* + * Udp protocol header. + * Per RFC 768, September, 1981. + */ +struct udphdr { + u_short uh_sport; /* source port */ + u_short uh_dport; /* destination port */ + u_short uh_ulen; /* udp length */ + u_short uh_sum; /* udp checksum */ +}; + +#endif diff --git a/netinet/udp_usrreq.c b/netinet/udp_usrreq.c new file mode 100644 index 0000000..e4ae5c0 --- /dev/null +++ b/netinet/udp_usrreq.c @@ -0,0 +1,736 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 + * $FreeBSD: src/sys/netinet/udp_usrreq.c,v 1.170 2004/11/08 14:44:53 phk Exp $ + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * UDP protocol implementation. + * Per RFC 768, August, 1980. + */ +#ifndef COMPAT_42 +static int udpcksum = 1; +#else +static int udpcksum = 0; /* XXX */ +#endif +SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, + &udpcksum, 0, ""); + +static int log_in_vain = 0; +SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, + &log_in_vain, 0, ""); + +struct inpcbhead udb; /* from udp_var.h */ +struct inpcbinfo udbinfo; + +#ifndef UDBHASHSIZE +#define UDBHASHSIZE 64 +#endif + + struct udpstat udpstat; /* from udp_var.h */ +SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RD, + &udpstat, udpstat, ""); + +static struct sockaddr_in udp_in = { sizeof(udp_in), AF_INET, 0, {0}, {0} }; + +static void udp_detach(struct inpcb *); +static int udp_output(struct inpcb *, struct mbuf *, struct mbuf *, + struct mbuf *); +static void udp_notify(struct inpcb *, int); + +void +udp_init(void) +{ + LIST_INIT(&udb); + udbinfo.listhead = &udb; + udbinfo.hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.hashmask); +} + +void +udp_input(struct mbuf *m, int iphlen) +{ + register struct ip *ip; + register struct udphdr *uh; + register struct inpcb *inp; + struct mbuf *opts = 0; + int len; + struct ip save_ip; + + udpstat.udps_ipackets++; + + /* + * Strip IP options, if any; should skip this, + * make available to user, and use on returned packets, + * but we don't yet have a way to check the checksum + * with options still present. + */ + if (iphlen > sizeof (struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + iphlen = sizeof(struct ip); + } + + /* + * Get IP and UDP header together in first mbuf. 
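/*
 * The checksum block below in udp_input() (clearing ih_x1, loading ih_len,
 * then in_cksum() over the headers plus data) is the RFC 768 checksum
 * computed over a pseudo-header.  The same arithmetic on a plain byte
 * buffer, as a stand-alone sketch; the ex_ names are illustrative.
 */
#include <stdint.h>
#include <string.h>

/* one's-complement sum of a byte buffer, RFC 1071 style */
static uint32_t
ex_sum(const void *data, size_t len, uint32_t sum)
{
    const uint8_t *p = data;
    uint16_t word;

    while (len >= 2) {
        memcpy(&word, p, 2);            /* native 16-bit load, any alignment */
        sum += word;
        p += 2;
        len -= 2;
    }
    if (len) {                          /* pad the odd trailing byte with 0 */
        uint8_t pad[2] = { *p, 0 };
        memcpy(&word, pad, 2);
        sum += word;
    }
    return sum;
}

struct ex_pseudo {                      /* mirrors the ipovly overlay */
    uint32_t src, dst;                  /* ih_src, ih_dst */
    uint8_t  zero, proto;               /* ih_x1 (must be 0), ih_pr */
    uint16_t ulen;                      /* ih_len = UDP length, net order */
};

uint16_t
ex_udp_cksum(const struct ex_pseudo *ph, const void *udp, size_t udplen)
{
    uint32_t sum = ex_sum(ph, sizeof(*ph), 0);

    sum = ex_sum(udp, udplen, sum);     /* UDP header (uh_sum = 0) + payload */
    while (sum >> 16)                   /* fold the carries back in */
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)~sum;              /* a result of 0 is sent as 0xffff,
                                         * since 0 means "no checksum" */
}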
+ */ + ip = mtod(m, struct ip *); + if (m->m_len < iphlen + sizeof(struct udphdr)) { + if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { + udpstat.udps_hdrops++; + return; + } + ip = mtod(m, struct ip *); + } + uh = (struct udphdr *)((caddr_t)ip + iphlen); + + /* + * Make mbuf data length reflect UDP length. + * If not enough data to reflect UDP length, drop. + */ + len = ntohs((u_short)uh->uh_ulen); + if (ip->ip_len != len) { + if (len > ip->ip_len || len < sizeof(struct udphdr)) { + udpstat.udps_badlen++; + goto bad; + } + m_adj(m, len - ip->ip_len); + /* ip->ip_len = len; */ + } + /* + * Save a copy of the IP header in case we want restore it + * for sending an ICMP error message in response. + */ + save_ip = *ip; + + /* + * Checksum extended UDP header and data. + */ + if (uh->uh_sum) { + ((struct ipovly *)ip)->ih_next = 0; + ((struct ipovly *)ip)->ih_prev = 0; + ((struct ipovly *)ip)->ih_x1 = 0; + ((struct ipovly *)ip)->ih_len = uh->uh_ulen; + uh->uh_sum = in_cksum(m, len + sizeof (struct ip)); + if (uh->uh_sum) { + udpstat.udps_badsum++; + m_freem(m); + return; + } + } + + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { + struct inpcb *last; + /* + * Deliver a multicast or broadcast datagram to *all* sockets + * for which the local and remote addresses and ports match + * those of the incoming datagram. This allows more than + * one process to receive multi/broadcasts on the same port. + * (This really ought to be done for unicast datagrams as + * well, but that would cause problems with existing + * applications that open both address-specific sockets and + * a wildcard socket listening to the same port -- they would + * end up receiving duplicates of every unicast datagram. + * Those applications open the multiple sockets to overcome an + * inadequacy of the UDP socket interface, but for backwards + * compatibility we avoid the problem here rather than + * fixing the interface. Maybe 4.5BSD will remedy this?) + */ + + /* + * Construct sockaddr format source address. + */ + udp_in.sin_port = uh->uh_sport; + udp_in.sin_addr = ip->ip_src; + m->m_len -= sizeof (struct udpiphdr); + m->m_data += sizeof (struct udpiphdr); + /* + * Locate pcb(s) for datagram. + * (Algorithm copied from raw_intr().) + */ + last = NULL; + for (inp = udb.lh_first; inp != NULL; inp = inp->inp_list.le_next) { + if (inp->inp_lport != uh->uh_dport) + continue; + if (inp->inp_laddr.s_addr != INADDR_ANY) { + if (inp->inp_laddr.s_addr != + ip->ip_dst.s_addr) + continue; + } + if (inp->inp_faddr.s_addr != INADDR_ANY) { + if (inp->inp_faddr.s_addr != + ip->ip_src.s_addr || + inp->inp_fport != uh->uh_sport) + continue; + } + + if (last != NULL) { + struct mbuf *n; + + if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { + if (last->inp_flags & INP_CONTROLOPTS + || last->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(last, &opts, ip, n); + if (sbappendaddr(&last->inp_socket->so_rcv, + (struct sockaddr *)&udp_in, + n, opts) == 0) { + m_freem(n); + if (opts) + m_freem(opts); + udpstat.udps_fullsock++; + } else + sorwakeup(last->inp_socket); + opts = 0; + } + } + last = inp; + /* + * Don't look for additional matches if this one does + * not have either the SO_REUSEPORT or SO_REUSEADDR + * socket options set. This heuristic avoids searching + * through all pcbs in the common case of a non-shared + * port. It * assumes that an application will never + * clear these options after setting them. 
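/*
 * The delivery loop above hands one broadcast/multicast datagram to every
 * matching socket, but only keeps searching when the matched socket has
 * SO_REUSEPORT or SO_REUSEADDR set.  A sketch of two tasks sharing one UDP
 * port the way that loop expects; the port number is made up.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

int
ex_shared_udp_listener(void)
{
    struct sockaddr_in sin;
    int on = 1;
    int fd = socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0)
        return -1;
    /* must be set *before* bind(), and on every socket sharing the port */
    setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));

    memset(&sin, 0, sizeof(sin));
    sin.sin_family = AF_INET;
    sin.sin_port = htons(9999);                  /* example port */
    sin.sin_addr.s_addr = htonl(INADDR_ANY);
    if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
        return -1;
    return fd;    /* each task doing this receives its own copy */
}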
+ */ + if (((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0)) + break; + } + + if (last == NULL) { + /* + * No matching pcb found; discard datagram. + * (No need to send an ICMP Port Unreachable + * for a broadcast or multicast datgram.) + */ + udpstat.udps_noportbcast++; + goto bad; + } + if (last->inp_flags & INP_CONTROLOPTS + || last->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(last, &opts, ip, m); + if (sbappendaddr(&last->inp_socket->so_rcv, + (struct sockaddr *)&udp_in, + m, opts) == 0) { + udpstat.udps_fullsock++; + goto bad; + } + sorwakeup(last->inp_socket); + return; + } + /* + * Locate pcb for datagram. + */ + inp = in_pcblookuphash(&udbinfo, ip->ip_src, uh->uh_sport, + ip->ip_dst, uh->uh_dport, 1); + if (inp == NULL) { + if (log_in_vain) { + char buf0[INET_ADDRSTRLEN]; + char buf1[INET_ADDRSTRLEN]; + + log(LOG_INFO, "Connection attempt to UDP %s:%d" + " from %s:%d\n", + inet_ntoa_r(ip->ip_dst, buf0), ntohs(uh->uh_dport), + inet_ntoa_r(ip->ip_src, buf1), ntohs(uh->uh_sport)); + } + udpstat.udps_noport++; + if (m->m_flags & (M_BCAST | M_MCAST)) { + udpstat.udps_noportbcast++; + goto bad; + } + *ip = save_ip; + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); + return; + } + + /* + * Construct sockaddr format source address. + * Stuff source address and datagram in user buffer. + */ + udp_in.sin_port = uh->uh_sport; + udp_in.sin_addr = ip->ip_src; + if (inp->inp_flags & INP_CONTROLOPTS + || inp->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(inp, &opts, ip, m); + iphlen += sizeof(struct udphdr); + m->m_len -= iphlen; + m->m_pkthdr.len -= iphlen; + m->m_data += iphlen; + if (sbappendaddr(&inp->inp_socket->so_rcv, (struct sockaddr *)&udp_in, + m, opts) == 0) { + udpstat.udps_fullsock++; + goto bad; + } + sorwakeup(inp->inp_socket); + return; +bad: + m_freem(m); + if (opts) + m_freem(opts); +} + +/* + * Notify a udp user of an asynchronous error; + * just wake up so that he can collect error status. + */ +static void +udp_notify(struct inpcb *inp, int errnum) +{ + inp->inp_socket->so_error = errnum; + sorwakeup(inp->inp_socket); + sowwakeup(inp->inp_socket); +} + +void +udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) +{ + register struct ip *ip = vip; + register struct udphdr *uh; + + if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)) + return; + if (ip) { + uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); + in_pcbnotify(&udb, sa, uh->uh_dport, ip->ip_src, uh->uh_sport, + cmd, udp_notify); + } else + in_pcbnotify(&udb, sa, 0, zeroin_addr, 0, cmd, udp_notify); +} + +static int +udp_output(struct inpcb *inp, struct mbuf *m, struct mbuf *addr, + struct mbuf *control) +{ + register struct udpiphdr *ui; + register int len = m->m_pkthdr.len; + struct in_addr laddr; + int s = 0, error = 0; + + laddr.s_addr = 0; + if (control) + m_freem(control); /* XXX */ + + if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { + error = EMSGSIZE; + goto release; + } + + if (addr) { + laddr = inp->inp_laddr; + if (inp->inp_faddr.s_addr != INADDR_ANY) { + error = EISCONN; + goto release; + } + /* + * Must block input while temporarily connected. + */ + s = splnet(); + error = in_pcbconnect(inp, addr); + if (error) { + splx(s); + goto release; + } + } else { + if (inp->inp_faddr.s_addr == INADDR_ANY) { + error = ENOTCONN; + goto release; + } + } + /* + * Calculate data length and get a mbuf + * for UDP and IP headers. 
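/*
 * The "temporary connect" behaviour implemented above is what a plain
 * sendto() on an unconnected UDP socket relies on: the PCB is connected,
 * the datagram transmitted, and the PCB disconnected again, all under
 * splnet().  sendto() with an address on a connected socket fails with
 * EISCONN, and send() without a prior connect() fails with ENOTCONN.
 * Minimal sketch; the port and destination are documentation values.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
ex_udp_send_once(const void *buf, size_t len)
{
    struct sockaddr_in dst;
    int rc;
    int fd = socket(AF_INET, SOCK_DGRAM, 0);     /* PRU_ATTACH */

    if (fd < 0)
        return -1;
    memset(&dst, 0, sizeof(dst));
    dst.sin_family = AF_INET;
    dst.sin_port = htons(7);                     /* example port */
    dst.sin_addr.s_addr = inet_addr("192.0.2.7");

    /* PRU_SEND -> udp_output(): connect, transmit, disconnect */
    rc = sendto(fd, buf, len, 0, (struct sockaddr *)&dst, sizeof(dst));
    close(fd);                                   /* PRU_DETACH */
    return rc < 0 ? -1 : 0;
}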
+ */ + M_PREPEND(m, sizeof(struct udpiphdr), M_DONTWAIT); + if (m == 0) { + error = ENOBUFS; + if (addr) { + in_pcbdisconnect(inp); + inp->inp_laddr = laddr; + splx(s); + } + goto release; + } + + /* + * Fill in mbuf with extended UDP header + * and addresses and length put into network format. + */ + ui = mtod(m, struct udpiphdr *); + ui->ui_next = ui->ui_prev = 0; + ui->ui_x1 = 0; + ui->ui_pr = IPPROTO_UDP; + ui->ui_len = htons((u_short)len + sizeof (struct udphdr)); + ui->ui_src = inp->inp_laddr; + ui->ui_dst = inp->inp_faddr; + ui->ui_sport = inp->inp_lport; + ui->ui_dport = inp->inp_fport; + ui->ui_ulen = ui->ui_len; + + /* + * Stuff checksum and output datagram. + */ + ui->ui_sum = 0; + if (udpcksum) { + if ((ui->ui_sum = in_cksum(m, sizeof (struct udpiphdr) + len)) == 0) + ui->ui_sum = 0xffff; + } + ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; + ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ + ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ + udpstat.udps_opackets++; + error = ip_output(m, inp->inp_options, &inp->inp_route, + inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST), + inp->inp_moptions); + + if (addr) { + in_pcbdisconnect(inp); + inp->inp_laddr = laddr; + splx(s); + } + return (error); + +release: + m_freem(m); + return (error); +} + +#ifdef __rtems__ +#define INP_INFO_RLOCK(a) +#define INP_INFO_RUNLOCK(a) +#define INP_LOCK(a) +#define INP_UNLOCK(a) +#endif + +static int +udp_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = udbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xinpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + gencnt = udbinfo.ipi_gencnt; + n = udbinfo.ipi_count; + splx(s); + + sysctl_wire_old_buffer(req, 2 * (sizeof xig) + + n * sizeof(struct xinpcb)); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; +#if 0 + xig.xig_sogen = so_gencnt; +#endif + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + /* ccj add the exit if count is 0 */ + if (!n) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + INP_INFO_RLOCK(&udbinfo); + for (inp = LIST_FIRST(udbinfo.listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + INP_LOCK(inp); + if (inp->inp_gencnt <= gencnt) +#if 0 + && + cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) +#endif + inp_list[i++] = inp; + INP_UNLOCK(inp); + } + INP_INFO_RUNLOCK(&udbinfo); + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_LOCK(inp); + if (inp->inp_gencnt <= gencnt) { + struct xinpcb xi; + xi.xi_len = sizeof xi; + /* XXX should avoid extra copy */ + bcopy(inp, &xi.xi_inp, sizeof *inp); +#if 0 + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); +#endif + error = SYSCTL_OUT(req, &xi, sizeof xi); + } + INP_UNLOCK(inp); + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. 
+ */ + s = splnet(); + INP_INFO_RLOCK(&udbinfo); + xig.xig_gen = udbinfo.ipi_gencnt; +#if 0 + xig.xig_sogen = so_gencnt; +#endif + xig.xig_count = udbinfo.ipi_count; + INP_INFO_RUNLOCK(&udbinfo); + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + udp_pcblist, "S,xinpcb", "List of active UDP sockets"); + +static u_long udp_sendspace = 9216; /* really max datagram size */ + /* 40 1K datagrams */ +SYSCTL_INT(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, + &udp_sendspace, 0, ""); + +static u_long udp_recvspace = 40 * (1024 + sizeof(struct sockaddr_in)); +SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, + &udp_recvspace, 0, ""); + +#if defined(__rtems__) +void rtems_set_udp_buffer_sizes(u_long sendspace, u_long recvspace) +{ + if ( sendspace != 0 ) + udp_sendspace = sendspace; + if ( recvspace != 0 ) + udp_recvspace = recvspace; +} +#endif + +/*ARGSUSED*/ +int +udp_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *addr, + struct mbuf *control) +{ + struct inpcb *inp = sotoinpcb(so); + int error = 0; + int s; + + if (req == PRU_CONTROL) + return (in_control(so, (uintptr_t)m, (caddr_t)addr, + (struct ifnet *)control)); + if (inp == NULL && req != PRU_ATTACH) { + error = EINVAL; + goto release; + } + /* + * Note: need to block udp_input while changing + * the udp pcb queue and/or pcb addresses. + */ + switch (req) { + + case PRU_ATTACH: + if (inp != NULL) { + error = EINVAL; + break; + } + s = splnet(); + error = in_pcballoc(so, &udbinfo); + splx(s); + if (error) + break; + error = soreserve(so, udp_sendspace, udp_recvspace); + if (error) + break; + ((struct inpcb *) so->so_pcb)->inp_ip_ttl = ip_defttl; + break; + + case PRU_DETACH: + udp_detach(inp); + break; + + case PRU_BIND: + s = splnet(); + error = in_pcbbind(inp, addr); + splx(s); + break; + + case PRU_LISTEN: + error = EOPNOTSUPP; + break; + + case PRU_CONNECT: + if (inp->inp_faddr.s_addr != INADDR_ANY) { + error = EISCONN; + break; + } + s = splnet(); + error = in_pcbconnect(inp, addr); + splx(s); + if (error == 0) + soisconnected(so); + break; + + case PRU_CONNECT2: + error = EOPNOTSUPP; + break; + + case PRU_ACCEPT: + error = EOPNOTSUPP; + break; + + case PRU_DISCONNECT: + if (inp->inp_faddr.s_addr == INADDR_ANY) { + error = ENOTCONN; + break; + } + s = splnet(); + in_pcbdisconnect(inp); + inp->inp_laddr.s_addr = INADDR_ANY; + splx(s); + so->so_state &= ~SS_ISCONNECTED; /* XXX */ + break; + + case PRU_SHUTDOWN: + socantsendmore(so); + break; + + case PRU_SEND: + return (udp_output(inp, m, addr, control)); + + case PRU_ABORT: + soisdisconnected(so); + udp_detach(inp); + break; + + case PRU_SOCKADDR: + in_setsockaddr(inp, addr); + break; + + case PRU_PEERADDR: + in_setpeeraddr(inp, addr); + break; + + case PRU_SENSE: + /* + * stat: don't bother with a blocksize. 
+		 */
+		return (0);
+
+	case PRU_SENDOOB:
+	case PRU_FASTTIMO:
+	case PRU_SLOWTIMO:
+	case PRU_PROTORCV:
+	case PRU_PROTOSEND:
+		error = EOPNOTSUPP;
+		break;
+
+	case PRU_RCVD:
+	case PRU_RCVOOB:
+		return (EOPNOTSUPP);	/* do not free mbuf's */
+
+	default:
+		panic("udp_usrreq");
+	}
+
+release:
+	if (control) {
+		printf("udp control data unexpectedly retained\n");
+		m_freem(control);
+	}
+	if (m)
+		m_freem(m);
+	return (error);
+}
+
+static void
+udp_detach(struct inpcb *inp)
+{
+	int s = splnet();
+
+	in_pcbdetach(inp);
+	splx(s);
+}
diff --git a/netinet/udp_var.h b/netinet/udp_var.h
new file mode 100644
index 0000000..c06e12d
--- /dev/null
+++ b/netinet/udp_var.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)udp_var.h	8.1 (Berkeley) 6/10/93
+ * $FreeBSD: src/sys/netinet/udp_var.h,v 1.29 2005/01/07 01:45:45 imp Exp $
+ */
+
+
+#ifndef _NETINET_UDP_VAR_H_
+#define _NETINET_UDP_VAR_H_
+
+#include <netinet/ip_var.h>	/* struct ipovly */
+#include <netinet/udp.h>	/* struct udphdr */
+
+/*
+ * UDP kernel structures and variables.
+ */ +struct udpiphdr { + struct ipovly ui_i; /* overlaid ip structure */ + struct udphdr ui_u; /* udp header */ +}; +#define ui_next ui_i.ih_next +#define ui_prev ui_i.ih_prev +#define ui_x1 ui_i.ih_x1 +#define ui_pr ui_i.ih_pr +#define ui_len ui_i.ih_len +#define ui_src ui_i.ih_src +#define ui_dst ui_i.ih_dst +#define ui_sport ui_u.uh_sport +#define ui_dport ui_u.uh_dport +#define ui_ulen ui_u.uh_ulen +#define ui_sum ui_u.uh_sum + +struct udpstat { + /* input statistics: */ + u_long udps_ipackets; /* total input packets */ + u_long udps_hdrops; /* packet shorter than header */ + u_long udps_badsum; /* checksum error */ + u_long udps_badlen; /* data length larger than packet */ + u_long udps_noport; /* no socket on port */ + u_long udps_noportbcast; /* of above, arrived as broadcast */ + u_long udps_fullsock; /* not delivered, input socket full */ + u_long udpps_pcbcachemiss; /* input packets missing pcb cache */ + u_long udpps_pcbhashmiss; /* input packets not for hashed pcb */ + /* output statistics: */ + u_long udps_opackets; /* total output packets */ +}; + +/* + * Names for UDP sysctl objects + */ +#define UDPCTL_CHECKSUM 1 /* checksum UDP packets */ +#define UDPCTL_STATS 2 /* statistics (read-only) */ +#define UDPCTL_MAXDGRAM 3 /* max datagram size */ +#define UDPCTL_RECVSPACE 4 /* default receive buffer space */ +#define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */ +#define UDPCTL_MAXID 6 + +#define UDPCTL_NAMES { \ + { 0, 0 }, \ + { "checksum", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "maxdgram", CTLTYPE_INT }, \ + { "recvspace", CTLTYPE_INT }, \ +} + +#ifdef _KERNEL +SYSCTL_DECL(_net_inet_udp); + +extern struct inpcbhead udb; +extern struct inpcbinfo udbinfo; +extern struct udpstat udpstat; + +void udp_ctlinput(int, struct sockaddr *, void *); +void udp_init(void); +void udp_input(struct mbuf *, int); +int udp_usrreq(struct socket *, + int, struct mbuf *, struct mbuf *, struct mbuf *); +#endif + +#endif -- cgit v1.2.3
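The checksum handling in udp_input() and udp_output() above uses the classic BSD trick of overlaying struct ipovly (and struct udpiphdr on output) on the IP header so that in_cksum() sums the RFC 768 pseudo-header -- source address, destination address, zero, protocol, UDP length -- together with the UDP header and data. The flat-buffer sketch below shows the same computation outside the mbuf machinery; udp_cksum_flat() and its pseudo-header layout are illustrative only and are not part of the stack. The caller is assumed to have zeroed uh_sum and to pass a 16-bit aligned buffer.

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <netinet/in.h>

/*
 * Illustration of the RFC 768 pseudo-header checksum that the stack
 * computes in place via struct ipovly / struct udpiphdr plus the
 * mbuf-aware in_cksum().  "udp" points to the UDP header and payload
 * with the checksum field set to zero; the 16-bit result can be
 * stored into uh_sum as-is (the one's-complement sum is byte-order
 * independent in that sense).
 */
static uint16_t
udp_cksum_flat(struct in_addr src, struct in_addr dst,
    const void *udp, uint16_t udp_len)
{
	struct {
		struct in_addr	ph_src;		/* source address */
		struct in_addr	ph_dst;		/* destination address */
		uint8_t		ph_zero;	/* always zero */
		uint8_t		ph_proto;	/* IPPROTO_UDP */
		uint16_t	ph_len;		/* UDP length, network order */
	} ph = { src, dst, 0, IPPROTO_UDP, htons(udp_len) };
	const uint16_t *p;
	uint32_t sum = 0;
	uint16_t i, result;

	/* Sum the 12-byte pseudo-header. */
	p = (const uint16_t *)&ph;
	for (i = 0; i < sizeof(ph) / 2; i++)
		sum += p[i];

	/* Sum the UDP header and data, padding an odd length with zero. */
	p = udp;
	for (i = 0; i + 1 < udp_len; i += 2)
		sum += *p++;
	if (udp_len & 1) {
		uint16_t last = 0;
		memcpy(&last, p, 1);
		sum += last;
	}

	/* Fold the carries and complement; 0 is transmitted as 0xffff,
	 * exactly as udp_output() does above. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	result = (uint16_t)~sum;
	return (result != 0 ? result : 0xffff);
}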
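The long comment in udp_input()'s broadcast/multicast path says that a datagram is copied to every pcb whose addresses and port match, but only as long as each matching socket has SO_REUSEPORT or SO_REUSEADDR set. A minimal application-level sketch of that behaviour, using only standard BSD socket calls (helper name and port are made up, error handling omitted):

#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

/*
 * Open a UDP socket bound to "port" with SO_REUSEPORT so that several
 * sockets can share the port; udp_input() then delivers a copy of each
 * broadcast or multicast datagram to every such socket.
 */
static int
open_shared_udp_socket(unsigned short port)
{
	struct sockaddr_in sin;
	int s, on = 1;

	s = socket(AF_INET, SOCK_DGRAM, 0);
	setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	bind(s, (struct sockaddr *)&sin, sizeof(sin));
	return (s);
}

Calling open_shared_udp_socket(5000) twice yields two sockets that both receive broadcasts sent to port 5000; for unicast traffic only one matching pcb is chosen, as the comment explains.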
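The RTEMS-specific rtems_set_udp_buffer_sizes() added above overrides the udp_sendspace and udp_recvspace defaults (a zero argument leaves the corresponding value unchanged); PRU_ATTACH then passes those defaults to soreserve() for every UDP socket created afterwards, so existing sockets keep their old limits. A minimal usage sketch, assuming the application calls it once during network bring-up before any UDP socket exists; the hook name app_tune_udp_buffers and the call site are hypothetical, and the prototype is repeated locally because the exporting header is not part of this patch:

#include <sys/types.h>

/* Exported by udp_usrreq.c above; declared locally for this sketch. */
void rtems_set_udp_buffer_sizes(u_long sendspace, u_long recvspace);

/*
 * Hypothetical start-up hook: run before the first UDP socket is
 * created so that soreserve() in PRU_ATTACH picks up the new defaults.
 */
void
app_tune_udp_buffers(void)
{
	/* Keep the 9216-byte send default (0 = leave unchanged) and
	 * grow the per-socket receive buffer to 64 KiB. */
	rtems_set_udp_buffer_sizes(0, 64 * 1024);
}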