diff options
author | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2013-11-06 16:20:21 +0100 |
---|---|---|
committer | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2013-11-11 10:08:08 +0100 |
commit | 66659ff1ad6831b0ea7425fa6ecd8a8687523658 (patch) | |
tree | 48e22b475fa8854128e0861a33fed6f78c8094b5 /freebsd/sys/netinet | |
parent | Define __GLOBL1() and __GLOBL() (diff) | |
download | rtems-libbsd-66659ff1ad6831b0ea7425fa6ecd8a8687523658.tar.bz2 |
Update to FreeBSD 9.2
Diffstat (limited to 'freebsd/sys/netinet')
70 files changed, 4015 insertions, 2974 deletions
diff --git a/freebsd/sys/netinet/accf_http.c b/freebsd/sys/netinet/accf_http.c index 97344a2c..3af867b0 100644 --- a/freebsd/sys/netinet/accf_http.c +++ b/freebsd/sys/netinet/accf_http.c @@ -71,7 +71,7 @@ DECLARE_MODULE(accf_http, accf_http_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); static int parse_http_version = 1; -SYSCTL_NODE(_net_inet_accf, OID_AUTO, http, CTLFLAG_RW, 0, +static SYSCTL_NODE(_net_inet_accf, OID_AUTO, http, CTLFLAG_RW, 0, "HTTP accept filter"); SYSCTL_INT(_net_inet_accf_http, OID_AUTO, parsehttpversion, CTLFLAG_RW, &parse_http_version, 1, diff --git a/freebsd/sys/netinet/icmp6.h b/freebsd/sys/netinet/icmp6.h index a6e68864..5483721d 100644 --- a/freebsd/sys/netinet/icmp6.h +++ b/freebsd/sys/netinet/icmp6.h @@ -659,7 +659,8 @@ void kmod_icmp6stat_inc(int statnum); #define ICMPV6CTL_MLD_SOMAXSRC 22 #define ICMPV6CTL_MLD_VERSION 23 #define ICMPV6CTL_ND6_MAXQLEN 24 -#define ICMPV6CTL_MAXID 25 +#define ICMPV6CTL_NODEINFO_OLDMCPREFIX 25 +#define ICMPV6CTL_MAXID 26 #define RTF_PROBEMTU RTF_PROTO1 diff --git a/freebsd/sys/netinet/if_ether.c b/freebsd/sys/netinet/if_ether.c index 6b98161f..98ed0b36 100644 --- a/freebsd/sys/netinet/if_ether.c +++ b/freebsd/sys/netinet/if_ether.c @@ -79,8 +79,8 @@ __FBSDID("$FreeBSD$"); #define SDL(s) ((struct sockaddr_dl *)s) SYSCTL_DECL(_net_link_ether); -SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); -SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, ""); +static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); +static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, ""); /* timer values */ static VNET_DEFINE(int, arpt_keep) = (20*60); /* once resolved, good for 20 @@ -89,8 +89,8 @@ static VNET_DEFINE(int, arp_maxtries) = 5; VNET_DEFINE(int, useloopback) = 1; /* use loopback interface for * local traffic */ static VNET_DEFINE(int, arp_proxyall) = 0; -static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for - * 20 seconds */ +static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for + * 20 seconds */ VNET_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ static VNET_DEFINE(int, arp_maxhold) = 1; @@ -121,7 +121,7 @@ SYSCTL_VNET_STRUCT(_net_link_ether_arp, OID_AUTO, stats, CTLFLAG_RW, &VNET_NAME(arpstat), arpstat, "ARP statistics (struct arpstat, net/if_arp.h)"); SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_RW, - &VNET_NAME(arp_maxhold), 0, + &VNET_NAME(arp_maxhold), 0, "Number of packets to hold per ARP entry"); static void arp_init(void); @@ -169,38 +169,48 @@ arp_ifscrub(struct ifnet *ifp, uint32_t addr) static void arptimer(void *arg) { + struct llentry *lle = (struct llentry *)arg; struct ifnet *ifp; - struct llentry *lle; - int pkts_dropped; - KASSERT(arg != NULL, ("%s: arg NULL", __func__)); - lle = (struct llentry *)arg; + if (lle->la_flags & LLE_STATIC) { + LLE_WUNLOCK(lle); + return; + } + ifp = lle->lle_tbl->llt_ifp; CURVNET_SET(ifp->if_vnet); + + if ((lle->la_flags & LLE_DELETED) == 0) { + int evt; + + if (lle->la_flags & LLE_VALID) + evt = LLENTRY_EXPIRED; + else + evt = LLENTRY_TIMEDOUT; + EVENTHANDLER_INVOKE(lle_event, lle, evt); + } + + callout_stop(&lle->la_timer); + + /* XXX: LOR avoidance. We still have ref on lle. */ + LLE_WUNLOCK(lle); IF_AFDATA_LOCK(ifp); LLE_WLOCK(lle); - if (lle->la_flags & LLE_STATIC) - LLE_WUNLOCK(lle); - else { - if (!callout_pending(&lle->la_timer) && - callout_active(&lle->la_timer)) { - callout_stop(&lle->la_timer); - LLE_REMREF(lle); - pkts_dropped = llentry_free(lle); - ARPSTAT_ADD(dropped, pkts_dropped); - ARPSTAT_INC(timeouts); - } else { -#ifdef DIAGNOSTIC - struct sockaddr *l3addr = L3_ADDR(lle); - log(LOG_INFO, - "arptimer issue: %p, IPv4 address: \"%s\"\n", lle, - inet_ntoa( - ((const struct sockaddr_in *)l3addr)->sin_addr)); -#endif - LLE_WUNLOCK(lle); - } - } + + /* Guard against race with other llentry_free(). */ + if (lle->la_flags & LLE_LINKED) { + size_t pkts_dropped; + + LLE_REMREF(lle); + pkts_dropped = llentry_free(lle); + ARPSTAT_ADD(dropped, pkts_dropped); + } else + LLE_FREE_LOCKED(lle); + IF_AFDATA_UNLOCK(ifp); + + ARPSTAT_INC(timeouts); + CURVNET_RESTORE(); } @@ -235,7 +245,7 @@ arprequest(struct ifnet *ifp, struct in_addr *sip, struct in_addr *tip, SIN(ifa->ifa_netmask)->sin_addr.s_addr) ) break; /* found it. */ } - if (sip == NULL) { + if (sip == NULL) { printf("%s: cannot find matching address\n", __func__); return; } @@ -304,18 +314,16 @@ arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m, return (0); } } - /* XXXXX - */ retry: - IF_AFDATA_RLOCK(ifp); + IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), flags, dst); - IF_AFDATA_RUNLOCK(ifp); + IF_AFDATA_RUNLOCK(ifp); if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0) - && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) { + && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) { flags |= (LLE_CREATE | LLE_EXCLUSIVE); - IF_AFDATA_WLOCK(ifp); + IF_AFDATA_WLOCK(ifp); la = lla_lookup(LLTABLE(ifp), flags, dst); - IF_AFDATA_WUNLOCK(ifp); + IF_AFDATA_WUNLOCK(ifp); } if (la == NULL) { if (flags & LLE_CREATE) @@ -324,10 +332,10 @@ retry: inet_ntoa(SIN(dst)->sin_addr)); m_freem(m); return (EINVAL); - } + } if ((la->la_flags & LLE_VALID) && - ((la->la_flags & LLE_STATIC) || la->la_expire > time_second)) { + ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { bcopy(&la->ll_addr, desten, ifp->if_addrlen); /* * If entry has an expiry time and it is approaching, @@ -335,18 +343,18 @@ retry: * arpt_down interval. */ if (!(la->la_flags & LLE_STATIC) && - time_second + la->la_preempt > la->la_expire) { + time_uptime + la->la_preempt > la->la_expire) { arprequest(ifp, NULL, &SIN(dst)->sin_addr, IF_LLADDR(ifp)); la->la_preempt--; } - + *lle = la; error = 0; goto done; - } - + } + if (la->la_flags & LLE_STATIC) { /* should not happen! */ log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n", inet_ntoa(SIN(dst)->sin_addr)); @@ -355,7 +363,7 @@ retry: goto done; } - renew = (la->la_asked == 0 || la->la_expire != time_second); + renew = (la->la_asked == 0 || la->la_expire != time_uptime); if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) { flags |= LLE_EXCLUSIVE; LLE_RUNLOCK(la); @@ -376,20 +384,20 @@ retry: la->la_numheld--; ARPSTAT_INC(dropped); } - } + } if (la->la_hold != NULL) { curr = la->la_hold; while (curr->m_nextpkt != NULL) curr = curr->m_nextpkt; curr->m_nextpkt = m; - } else + } else la->la_hold = m; la->la_numheld++; if (renew == 0 && (flags & LLE_EXCLUSIVE)) { flags &= ~LLE_EXCLUSIVE; LLE_DOWNGRADE(la); } - + } /* * Return EWOULDBLOCK if we have tried less than arp_maxtries. It @@ -407,7 +415,7 @@ retry: int canceled; LLE_ADDREF(la); - la->la_expire = time_second; + la->la_expire = time_uptime; canceled = callout_reset(&la->la_timer, hz * V_arpt_down, arptimer, la); if (canceled) @@ -437,7 +445,7 @@ arpintr(struct mbuf *m) if (m->m_len < sizeof(struct arphdr) && ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { - log(LOG_ERR, "arp: runt packet -- m_pullup failed\n"); + log(LOG_NOTICE, "arp: runt packet -- m_pullup failed\n"); return; } ar = mtod(m, struct arphdr *); @@ -445,16 +453,19 @@ arpintr(struct mbuf *m) if (ntohs(ar->ar_hrd) != ARPHRD_ETHER && ntohs(ar->ar_hrd) != ARPHRD_IEEE802 && ntohs(ar->ar_hrd) != ARPHRD_ARCNET && - ntohs(ar->ar_hrd) != ARPHRD_IEEE1394) { - log(LOG_ERR, "arp: unknown hardware address format (0x%2D)\n", - (unsigned char *)&ar->ar_hrd, ""); + ntohs(ar->ar_hrd) != ARPHRD_IEEE1394 && + ntohs(ar->ar_hrd) != ARPHRD_INFINIBAND) { + log(LOG_NOTICE, "arp: unknown hardware address format (0x%2D)" + " (from %*D to %*D)\n", (unsigned char *)&ar->ar_hrd, "", + ETHER_ADDR_LEN, (u_char *)ar_sha(ar), ":", + ETHER_ADDR_LEN, (u_char *)ar_tha(ar), ":"); m_freem(m); return; } if (m->m_len < arphdr_len(ar)) { if ((m = m_pullup(m, arphdr_len(ar))) == NULL) { - log(LOG_ERR, "arp: runt packet\n"); + log(LOG_NOTICE, "arp: runt packet\n"); m_freem(m); return; } @@ -490,17 +501,19 @@ arpintr(struct mbuf *m) static int log_arp_wrong_iface = 1; static int log_arp_movements = 1; static int log_arp_permanent_modify = 1; +static int allow_multicast = 0; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, &log_arp_wrong_iface, 0, "log arp packets arriving on the wrong interface"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW, - &log_arp_movements, 0, - "log arp replies from MACs different than the one in the cache"); + &log_arp_movements, 0, + "log arp replies from MACs different than the one in the cache"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW, - &log_arp_permanent_modify, 0, - "log arp replies from MACs different than the one in the permanent arp entry"); - + &log_arp_permanent_modify, 0, + "log arp replies from MACs different than the one in the permanent arp entry"); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW, + &allow_multicast, 0, "accept multicast addresses"); static void in_arpinput(struct mbuf *m) @@ -530,11 +543,27 @@ in_arpinput(struct mbuf *m) req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr)); if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) { - log(LOG_ERR, "in_arp: runt packet -- m_pullup failed\n"); + log(LOG_NOTICE, "in_arp: runt packet -- m_pullup failed\n"); return; } ah = mtod(m, struct arphdr *); + /* + * ARP is only for IPv4 so we can reject packets with + * a protocol length not equal to an IPv4 address. + */ + if (ah->ar_pln != sizeof(struct in_addr)) { + log(LOG_NOTICE, "in_arp: requested protocol length != %zu\n", + sizeof(struct in_addr)); + goto drop; + } + + if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) { + log(LOG_NOTICE, "arp: %*D is multicast\n", + ifp->if_addrlen, (u_char *)ar_sha(ah), ":"); + goto drop; + } + op = ntohs(ah->ar_op); (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr)); (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr)); @@ -553,7 +582,7 @@ in_arpinput(struct mbuf *m) */ IN_IFADDR_RLOCK(); LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { - if (((bridged && ia->ia_ifp->if_bridge != NULL) || + if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { ifa_ref(&ia->ia_ifa); @@ -570,7 +599,7 @@ in_arpinput(struct mbuf *m) } } LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) - if (((bridged && ia->ia_ifp->if_bridge != NULL) || + if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { ifa_ref(&ia->ia_ifa); @@ -633,7 +662,7 @@ match: if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) goto drop; /* it's from me, ignore it. */ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { - log(LOG_ERR, + log(LOG_NOTICE, "arp: link address is broadcast for IP address %s!\n", inet_ntoa(isaddr)); goto drop; @@ -662,14 +691,14 @@ match: sin.sin_addr = isaddr; flags = (itaddr.s_addr == myaddr.s_addr) ? LLE_CREATE : 0; flags |= LLE_EXCLUSIVE; - IF_AFDATA_LOCK(ifp); + IF_AFDATA_LOCK(ifp); la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin); IF_AFDATA_UNLOCK(ifp); if (la != NULL) { /* the following is not an error when doing bridging */ if (!bridged && la->lle_tbl->llt_ifp != ifp && !carp_match) { if (log_arp_wrong_iface) - log(LOG_ERR, "arp: %s is on %s " + log(LOG_WARNING, "arp: %s is on %s " "but got reply from %*D on %s\n", inet_ntoa(isaddr), la->lle_tbl->llt_ifp->if_xname, @@ -692,7 +721,7 @@ match: goto reply; } if (log_arp_movements) { - log(LOG_INFO, "arp: %s moved from %*D " + log(LOG_INFO, "arp: %s moved from %*D " "to %*D on %s\n", inet_ntoa(isaddr), ifp->if_addrlen, @@ -701,23 +730,25 @@ match: ifp->if_xname); } } - + if (ifp->if_addrlen != ah->ar_hln) { LLE_WUNLOCK(la); - log(LOG_WARNING, - "arp from %*D: addr len: new %d, i/f %d (ignored)", - ifp->if_addrlen, (u_char *) ar_sha(ah), ":", - ah->ar_hln, ifp->if_addrlen); - goto reply; + log(LOG_WARNING, "arp from %*D: addr len: new %d, " + "i/f %d (ignored)\n", ifp->if_addrlen, + (u_char *) ar_sha(ah), ":", ah->ar_hln, + ifp->if_addrlen); + goto drop; } (void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen); la->la_flags |= LLE_VALID; + EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); + if (!(la->la_flags & LLE_STATIC)) { int canceled; LLE_ADDREF(la); - la->la_expire = time_second + V_arpt_keep; + la->la_expire = time_uptime + V_arpt_keep; canceled = callout_reset(&la->la_timer, hz * V_arpt_keep, arptimer, la); if (canceled) @@ -725,7 +756,7 @@ match: } la->la_asked = 0; la->la_preempt = V_arp_maxtries; - /* + /* * The packets are all freed within the call to the output * routine. * @@ -747,7 +778,7 @@ match: } } else LLE_WUNLOCK(la); - } /* end of FIB loop */ + } reply: if (op != ARPOP_REQUEST) goto drop; @@ -761,7 +792,7 @@ reply: struct llentry *lle = NULL; sin.sin_addr = itaddr; - IF_AFDATA_LOCK(ifp); + IF_AFDATA_LOCK(ifp); lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin); IF_AFDATA_UNLOCK(ifp); @@ -776,7 +807,7 @@ reply: if (!V_arp_proxyall) goto drop; - + sin.sin_addr = itaddr; /* XXX MRT use table 0 for arp reply */ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); @@ -804,7 +835,7 @@ reply: * wrong network. */ sin.sin_addr = isaddr; - + /* XXX MRT use table 0 for arp checks */ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); if (!rt) @@ -820,8 +851,7 @@ reply: RTFREE_LOCKED(rt); #ifdef DEBUG_PROXY - printf("arp: proxying for %s\n", - inet_ntoa(itaddr)); + printf("arp: proxying for %s\n", inet_ntoa(itaddr)); #endif } } @@ -843,8 +873,8 @@ reply: (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln); ah->ar_op = htons(ARPOP_REPLY); ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */ - m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln); - m->m_pkthdr.len = m->m_len; + m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln); + m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = NULL; sa.sa_family = AF_ARP; sa.sa_len = 2; @@ -865,7 +895,7 @@ arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) { arprequest(ifp, &IA_SIN(ifa)->sin_addr, &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp)); - /* + /* * interface address is considered static entry * because the output of the arp utility shows * that L2 entry as permanent diff --git a/freebsd/sys/netinet/igmp.c b/freebsd/sys/netinet/igmp.c index f2949c14..3056fa3a 100644 --- a/freebsd/sys/netinet/igmp.c +++ b/freebsd/sys/netinet/igmp.c @@ -187,7 +187,7 @@ static const struct netisr_handler igmp_nh = { struct mtx igmp_mtx; struct mbuf *m_raopt; /* Router Alert option */ -MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); +static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); /* * VIMAGE-wide globals. @@ -282,8 +282,9 @@ SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, /* * Non-virtualized sysctls. */ -SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, - sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); +static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, + CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo, + "Per-interface IGMPv3 state"); static __inline void igmp_save_context(struct mbuf *m, struct ifnet *ifp) diff --git a/freebsd/sys/netinet/in.c b/freebsd/sys/netinet/in.c index 7bf52c6b..0c3f72bc 100644 --- a/freebsd/sys/netinet/in.c +++ b/freebsd/sys/netinet/in.c @@ -78,11 +78,6 @@ static int in_ifinit(struct ifnet *, struct in_ifaddr *, struct sockaddr_in *, int); static void in_purgemaddrs(struct ifnet *); -static VNET_DEFINE(int, subnetsarelocal); -#define V_subnetsarelocal VNET(subnetsarelocal) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, - &VNET_NAME(subnetsarelocal), 0, - "Treat all subnets as directly connected"); static VNET_DEFINE(int, sameprefixcarponly); #define V_sameprefixcarponly VNET(sameprefixcarponly) SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW, @@ -97,9 +92,7 @@ VNET_DECLARE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ /* * Return 1 if an internet address is for a ``local'' host - * (one to which we have a connection). If subnetsarelocal - * is true, this includes other subnets of the local net. - * Otherwise, it includes only the directly-connected (sub)nets. + * (one to which we have a connection). */ int in_localaddr(struct in_addr in) @@ -108,19 +101,10 @@ in_localaddr(struct in_addr in) register struct in_ifaddr *ia; IN_IFADDR_RLOCK(); - if (V_subnetsarelocal) { - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { - if ((i & ia->ia_netmask) == ia->ia_net) { - IN_IFADDR_RUNLOCK(); - return (1); - } - } - } else { - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { - if ((i & ia->ia_subnetmask) == ia->ia_subnet) { - IN_IFADDR_RUNLOCK(); - return (1); - } + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + if ((i & ia->ia_subnetmask) == ia->ia_subnet) { + IN_IFADDR_RUNLOCK(); + return (1); } } IN_IFADDR_RUNLOCK(); @@ -541,20 +525,20 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, hostIsNew = 0; } if (ifra->ifra_mask.sin_len) { - /* + /* * QL: XXX * Need to scrub the prefix here in case * the issued command is SIOCAIFADDR with * the same address, but with a different * prefix length. And if the prefix length - * is the same as before, then the call is + * is the same as before, then the call is * un-necessarily executed here. */ in_ifscrub(ifp, ia, LLE_STATIC); ia->ia_sockmask = ifra->ifra_mask; ia->ia_sockmask.sin_family = AF_INET; ia->ia_subnetmask = - ntohl(ia->ia_sockmask.sin_addr.s_addr); + ntohl(ia->ia_sockmask.sin_addr.s_addr); maskIsNew = 1; } if ((ifp->if_flags & IFF_POINTOPOINT) && @@ -567,7 +551,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, (hostIsNew || maskIsNew)) error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); if (error != 0 && iaIsNew) - goto out; + break; if ((ifp->if_flags & IFF_BROADCAST) && (ifra->ifra_broadaddr.sin_family == AF_INET)) @@ -898,23 +882,19 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, in_ifscrub(ifp, ia, LLE_STATIC); ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; } - if (IN_CLASSA(i)) - ia->ia_netmask = IN_CLASSA_NET; - else if (IN_CLASSB(i)) - ia->ia_netmask = IN_CLASSB_NET; - else - ia->ia_netmask = IN_CLASSC_NET; /* - * The subnet mask usually includes at least the standard network part, - * but may may be smaller in the case of supernetting. - * If it is set, we believe it. + * Be compatible with network classes, if netmask isn't supplied, + * guess it based on classes. */ if (ia->ia_subnetmask == 0) { - ia->ia_subnetmask = ia->ia_netmask; + if (IN_CLASSA(i)) + ia->ia_subnetmask = IN_CLASSA_NET; + else if (IN_CLASSB(i)) + ia->ia_subnetmask = IN_CLASSB_NET; + else + ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); - } else - ia->ia_netmask &= ia->ia_subnetmask; - ia->ia_net = i & ia->ia_netmask; + } ia->ia_subnet = i & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); /* @@ -927,10 +907,11 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, */ ia->ia_ifa.ifa_metric = ifp->if_metric; if (ifp->if_flags & IFF_BROADCAST) { - ia->ia_broadaddr.sin_addr.s_addr = - htonl(ia->ia_subnet | ~ia->ia_subnetmask); - ia->ia_netbroadcast.s_addr = - htonl(ia->ia_net | ~ ia->ia_netmask); + if (ia->ia_subnetmask == IN_RFC3021_MASK) + ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; + else + ia->ia_broadaddr.sin_addr.s_addr = + htonl(ia->ia_subnet | ~ia->ia_subnetmask); } else if (ifp->if_flags & IFF_LOOPBACK) { ia->ia_dstaddr = ia->ia_addr; flags |= RTF_HOST; @@ -966,8 +947,8 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, RT_ADDREF(ia_ro.ro_rt); RTFREE_LOCKED(ia_ro.ro_rt); } else - error = ifa_add_loopback_route((struct ifaddr *)ia, - (struct sockaddr *)&ia->ia_addr); + error = ifa_add_loopback_route((struct ifaddr *)ia, + (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags |= IFA_RTSELF; if (ia_ro.ro_rt != NULL) @@ -982,10 +963,10 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, ? RTF_HOST : 0) /* - * Generate a routing message when inserting or deleting + * Generate a routing message when inserting or deleting * an interface address alias. */ -static void in_addralias_rtmsg(int cmd, struct in_addr *prefix, +static void in_addralias_rtmsg(int cmd, struct in_addr *prefix, struct in_ifaddr *target) { struct route pfx_ro; @@ -1008,16 +989,13 @@ static void in_addralias_rtmsg(int cmd, struct in_addr *prefix, /* QL: XXX * Point the gateway to the new interface - * address as if a new prefix route entry has - * been added through the new address alias. - * All other parts of the rtentry is accurate, + * address as if a new prefix route entry has + * been added through the new address alias. + * All other parts of the rtentry is accurate, * e.g., rt_key, rt_mask, rt_ifp etc. */ - msg_rt.rt_gateway = - (struct sockaddr *)&target->ia_addr; - rt_newaddrmsg(cmd, - (struct ifaddr *)target, - 0, &msg_rt); + msg_rt.rt_gateway = (struct sockaddr *)&target->ia_addr; + rt_newaddrmsg(cmd, (struct ifaddr *)target, 0, &msg_rt); RTFREE(pfx_ro.ro_rt); } return; @@ -1065,7 +1043,7 @@ in_addprefix(struct in_ifaddr *target, int flags) */ if (ia->ia_flags & IFA_ROUTE) { #ifdef RADIX_MPATH - if (ia->ia_addr.sin_addr.s_addr == + if (ia->ia_addr.sin_addr.s_addr == target->ia_addr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(); return (EEXIST); @@ -1142,7 +1120,7 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags) } if (freeit && (flags & LLE_STATIC)) { error = ifa_del_loopback_route((struct ifaddr *)target, - (struct sockaddr *)&target->ia_addr); + (struct sockaddr *)&target->ia_addr); if (error == 0) target->ia_flags &= ~IFA_RTSELF; } @@ -1222,8 +1200,8 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags) mask0.sin_len = sizeof(mask0); mask0.sin_family = AF_INET; mask0.sin_addr.s_addr = target->ia_subnetmask; - lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0, - (struct sockaddr *)&mask0, flags); + lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0, + (struct sockaddr *)&mask0, flags); /* * As no-one seem to have this prefix, we can remove the route. @@ -1261,17 +1239,18 @@ in_broadcast(struct in_addr in, struct ifnet *ifp) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || - in.s_addr == ia->ia_netbroadcast.s_addr || /* - * Check for old-style (host 0) broadcast. + * Check for old-style (host 0) broadcast, but + * taking into account that RFC 3021 obsoletes it. */ - t == ia->ia_subnet || t == ia->ia_net) && + (ia->ia_subnetmask != IN_RFC3021_MASK && + t == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ - ia->ia_subnetmask != (u_long)0xffffffff) + ia->ia_subnetmask != (u_long)0xffffffff) return (1); return (0); #undef ia @@ -1343,6 +1322,20 @@ struct in_llentry { struct sockaddr_in l3_addr4; }; +/* + * Deletes an address from the address table. + * This function is called by the timer functions + * such as arptimer() and nd6_llinfo_timer(), and + * the caller does the locking. + */ +static void +in_lltable_free(struct lltable *llt, struct llentry *lle) +{ + LLE_WUNLOCK(lle); + LLE_LOCK_DESTROY(lle); + free(lle, M_LLTABLE); +} + static struct llentry * in_lltable_new(const struct sockaddr *l3addr, u_int flags) { @@ -1352,69 +1345,53 @@ in_lltable_new(const struct sockaddr *l3addr, u_int flags) if (lle == NULL) /* NB: caller generates msg */ return NULL; - callout_init(&lle->base.la_timer, CALLOUT_MPSAFE); /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. */ - lle->base.la_expire = time_second; /* mark expired */ + lle->base.la_expire = time_uptime; /* mark expired */ lle->l3_addr4 = *(const struct sockaddr_in *)l3addr; lle->base.lle_refcnt = 1; + lle->base.lle_free = in_lltable_free; LLE_LOCK_INIT(&lle->base); - return &lle->base; -} + callout_init_rw(&lle->base.la_timer, &lle->base.lle_lock, + CALLOUT_RETURNUNLOCKED); -/* - * Deletes an address from the address table. - * This function is called by the timer functions - * such as arptimer() and nd6_llinfo_timer(), and - * the caller does the locking. - */ -static void -in_lltable_free(struct lltable *llt, struct llentry *lle) -{ - LLE_WUNLOCK(lle); - LLE_LOCK_DESTROY(lle); - free(lle, M_LLTABLE); + return (&lle->base); } - #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((ntohl((d)->sin_addr.s_addr) ^ (a)->sin_addr.s_addr) & (m)->sin_addr.s_addr)) == 0 ) static void -in_lltable_prefix_free(struct lltable *llt, - const struct sockaddr *prefix, - const struct sockaddr *mask, - u_int flags) +in_lltable_prefix_free(struct lltable *llt, const struct sockaddr *prefix, + const struct sockaddr *mask, u_int flags) { const struct sockaddr_in *pfx = (const struct sockaddr_in *)prefix; const struct sockaddr_in *msk = (const struct sockaddr_in *)mask; struct llentry *lle, *next; - register int i; + int i; size_t pkts_dropped; - for (i=0; i < LLTBL_HASHTBL_SIZE; i++) { + IF_AFDATA_WLOCK(llt->llt_ifp); + for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { - - /* + /* * (flags & LLE_STATIC) means deleting all entries - * including static ARP entries + * including static ARP entries. */ - if (IN_ARE_MASKED_ADDR_EQUAL((struct sockaddr_in *)L3_ADDR(lle), - pfx, msk) && - ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC))) { - int canceled; - - canceled = callout_drain(&lle->la_timer); + if (IN_ARE_MASKED_ADDR_EQUAL(satosin(L3_ADDR(lle)), + pfx, msk) && ((flags & LLE_STATIC) || + !(lle->la_flags & LLE_STATIC))) { LLE_WLOCK(lle); - if (canceled) + if (callout_stop(&lle->la_timer)) LLE_REMREF(lle); pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } } } + IF_AFDATA_WUNLOCK(llt->llt_ifp); } @@ -1440,19 +1417,18 @@ in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr */ if (rt->rt_flags & RTF_GATEWAY) { if (!(rt->rt_flags & RTF_HOST) || !rt->rt_ifp || - rt->rt_ifp->if_type != IFT_ETHER || - (rt->rt_ifp->if_flags & - (IFF_NOARP | IFF_STATICARP)) != 0 || - memcmp(rt->rt_gateway->sa_data, l3addr->sa_data, - sizeof(in_addr_t)) != 0) { + rt->rt_ifp->if_type != IFT_ETHER || + (rt->rt_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || + memcmp(rt->rt_gateway->sa_data, l3addr->sa_data, + sizeof(in_addr_t)) != 0) { RTFREE_LOCKED(rt); return (EINVAL); } } /* - * Make sure that at least the destination address is covered - * by the route. This is for handling the case where 2 or more + * Make sure that at least the destination address is covered + * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. @@ -1512,7 +1488,7 @@ in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3add hashkey = sin->sin_addr.s_addr; lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)]; LIST_FOREACH(lle, lleh, lle_next) { - struct sockaddr_in *sa2 = (struct sockaddr_in *)L3_ADDR(lle); + struct sockaddr_in *sa2 = satosin(L3_ADDR(lle)); if (lle->la_flags & LLE_DELETED) continue; if (sa2->sin_addr.s_addr == sin->sin_addr.s_addr) @@ -1521,7 +1497,7 @@ in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3add if (lle == NULL) { #ifdef DIAGNOSTIC if (flags & LLE_DELETE) - log(LOG_INFO, "interface address is missing from cache = %p in delete\n", lle); + log(LOG_INFO, "interface address is missing from cache = %p in delete\n", lle); #endif if (!(flags & LLE_CREATE)) return (NULL); @@ -1547,18 +1523,24 @@ in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3add lle->lle_tbl = llt; lle->lle_head = lleh; + lle->la_flags |= LLE_LINKED; LIST_INSERT_HEAD(lleh, lle, lle_next); } else if (flags & LLE_DELETE) { if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) { LLE_WLOCK(lle); - lle->la_flags = LLE_DELETED; - LLE_WUNLOCK(lle); + lle->la_flags |= LLE_DELETED; + EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC - log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); + log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif + if ((lle->la_flags & + (LLE_STATIC | LLE_IFADDR)) == LLE_STATIC) + llentry_free(lle); + else + LLE_WUNLOCK(lle); } lle = (void *)-1; - + } if (LLE_IS_VALID(lle)) { if (flags & LLE_EXCLUSIVE) @@ -1590,7 +1572,7 @@ in_lltable_dump(struct lltable *llt, struct sysctl_req *wr) for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { struct sockaddr_dl *sdl; - + /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) continue; @@ -1659,7 +1641,6 @@ in_domifattach(struct ifnet *ifp) llt = lltable_init(ifp, AF_INET); if (llt != NULL) { - llt->llt_free = in_lltable_free; llt->llt_prefix_free = in_lltable_prefix_free; llt->llt_lookup = in_lltable_lookup; llt->llt_dump = in_lltable_dump; diff --git a/freebsd/sys/netinet/in.h b/freebsd/sys/netinet/in.h index 16df5f24..92ba45a6 100644 --- a/freebsd/sys/netinet/in.h +++ b/freebsd/sys/netinet/in.h @@ -93,27 +93,7 @@ typedef __socklen_t socklen_t; #define _SOCKLEN_T_DECLARED #endif -/* Avoid collision with original definition in sys/socket.h. */ -#ifndef _STRUCT_SOCKADDR_STORAGE_DECLARED -/* - * RFC 2553: protocol-independent placeholder for socket addresses - */ -#define _SS_MAXSIZE 128U -#define _SS_ALIGNSIZE (sizeof(__int64_t)) -#define _SS_PAD1SIZE (_SS_ALIGNSIZE - sizeof(unsigned char) - \ - sizeof(sa_family_t)) -#define _SS_PAD2SIZE (_SS_MAXSIZE - sizeof(unsigned char) - \ - sizeof(sa_family_t) - _SS_PAD1SIZE - _SS_ALIGNSIZE) - -struct sockaddr_storage { - unsigned char ss_len; /* address length */ - sa_family_t ss_family; /* address family */ - char __ss_pad1[_SS_PAD1SIZE]; - __int64_t __ss_align; /* force desired struct alignment */ - char __ss_pad2[_SS_PAD2SIZE]; -}; -#define _STRUCT_SOCKADDR_STORAGE_DECLARED -#endif +#include <sys/_sockaddr_storage.h> /* Socket address, internet style. */ struct sockaddr_in { @@ -147,6 +127,7 @@ __END_DECLS #endif /* !_KERNEL && __BSD_VISIBLE */ #if __POSIX_VISIBLE >= 200112 +#define IPPROTO_IPV6 41 /* IP6 header */ #define IPPROTO_RAW 255 /* raw IP packet */ #define INET_ADDRSTRLEN 16 #endif @@ -198,7 +179,6 @@ __END_DECLS #define IPPROTO_CMTP 38 /* Control Message Transport */ #define IPPROTO_TPXX 39 /* TP++ Transport */ #define IPPROTO_IL 40 /* IL transport protocol */ -#define IPPROTO_IPV6 41 /* IP6 header */ #define IPPROTO_SDRP 42 /* Source Demand Routing */ #define IPPROTO_ROUTING 43 /* IP6 routing header */ #define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */ @@ -260,10 +240,12 @@ __END_DECLS #define IPPROTO_GMTP 100 /* GMTP*/ #define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ #define IPPROTO_SCTP 132 /* SCTP */ +#define IPPROTO_MH 135 /* IPv6 Mobility Header */ /* 101-254: Partly Unassigned */ #define IPPROTO_PIM 103 /* Protocol Independent Mcast */ #define IPPROTO_CARP 112 /* CARP */ #define IPPROTO_PGM 113 /* PGM */ +#define IPPROTO_MPLS 137 /* MPLS-in-IP */ #define IPPROTO_PFSYNC 240 /* PFSYNC */ /* 255: Reserved */ /* BSD Private, local use, namespace incursion, no longer used */ @@ -275,6 +257,7 @@ __END_DECLS /* Only used internally, so can be outside the range of valid IP protocols. */ #define IPPROTO_DIVERT 258 /* divert pseudo-protocol */ +#define IPPROTO_SEND 259 /* SeND pseudo-protocol */ /* * Defined to avoid confusion. The master value is defined by @@ -414,6 +397,8 @@ __END_DECLS #define IN_LOOPBACKNET 127 /* official! */ +#define IN_RFC3021_MASK (u_int32_t)0xfffffffe + /* * Options for use with [gs]etsockopt at the IP level. * First word of comment is data type; bool is stored in int. diff --git a/freebsd/sys/netinet/in_gif.c b/freebsd/sys/netinet/in_gif.c index 5461334b..332d7ff4 100644 --- a/freebsd/sys/netinet/in_gif.c +++ b/freebsd/sys/netinet/in_gif.c @@ -258,6 +258,8 @@ in_gif_output(struct ifnet *ifp, int family, struct mbuf *m) #endif } + m_addr_changed(m); + error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL); if (!(GIF2IFP(sc)->if_flags & IFF_LINK0) && diff --git a/freebsd/sys/netinet/in_mcast.c b/freebsd/sys/netinet/in_mcast.c index e4b31968..6d748f1f 100644 --- a/freebsd/sys/netinet/in_mcast.c +++ b/freebsd/sys/netinet/in_mcast.c @@ -157,7 +157,8 @@ static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS); -SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv4 multicast"); +static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, + "IPv4 multicast"); static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, @@ -176,7 +177,7 @@ SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RW | CTLFLAG_TUN, &in_mcast_loop, 0, "Loopback multicast datagrams by default"); TUNABLE_INT("net.inet.ip.mcast.loop", &in_mcast_loop); -SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, +static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, "Per-interface stack-wide source filters"); @@ -1861,6 +1862,7 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) ifp = NULL; imf = NULL; + lims = NULL; error = 0; is_new = 0; @@ -1978,34 +1980,47 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) error = EINVAL; goto out_inp_locked; } - /* Throw out duplicates. */ + /* + * Throw out duplicates. + * + * XXX FIXME: This makes a naive assumption that + * even if entries exist for *ssa in this imf, + * they will be rejected as dupes, even if they + * are not valid in the current mode (in-mode). + * + * in_msource is transactioned just as for anything + * else in SSM -- but note naive use of inm_graft() + * below for allocating new filter entries. + * + * This is only an issue if someone mixes the + * full-state SSM API with the delta-based API, + * which is discouraged in the relevant RFCs. + */ lims = imo_match_source(imo, idx, &ssa->sa); - if (lims != NULL) { + if (lims != NULL /*&& + lims->imsl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_inp_locked; } } else { /* - * MCAST_JOIN_GROUP on an existing inclusive - * membership is an error; if you want to change - * filter mode, you must use the userland API - * setsourcefilter(). - */ - if (imf->imf_st[1] == MCAST_INCLUDE) { - error = EINVAL; - goto out_inp_locked; - } - /* * MCAST_JOIN_GROUP on an existing exclusive * membership is an error; return EADDRINUSE * to preserve 4.4BSD API idempotence, and * avoid tedious detour to code below. * NOTE: This is bending RFC 3678 a bit. + * + * On an existing inclusive membership, this is also + * an error; if you want to change filter mode, + * you must use the userland API setsourcefilter(). + * XXX We don't reject this for imf in UNDEFINED + * state at t1, because allocation of a filter + * is atomic with allocation of a membership. */ - if (imf->imf_st[1] == MCAST_EXCLUDE) { + error = EINVAL; + if (imf->imf_st[1] == MCAST_EXCLUDE) error = EADDRINUSE; - goto out_inp_locked; - } + goto out_inp_locked; } } @@ -2040,6 +2055,11 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) * membership of the group. The in_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. + * + * Note: Grafting of exclusive mode filters doesn't happen + * in this path. + * XXX: Should check for non-NULL lims (node exists but may + * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ @@ -2424,8 +2444,10 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) if (error) return (error); - if (msfr.msfr_nsrcs > in_mcast_maxsocksrc || - (msfr.msfr_fmode != MCAST_EXCLUDE && + if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) + return (ENOBUFS); + + if ((msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c index 2b50ae8f..5100ac9b 100644 --- a/freebsd/sys/netinet/in_pcb.c +++ b/freebsd/sys/netinet/in_pcb.c @@ -4,8 +4,12 @@ * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -44,17 +48,20 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_ipsec.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> +#include <rtems/bsd/local/opt_pcbgroup.h> #include <rtems/bsd/sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> #include <sys/mbuf.h> +#include <sys/callout.h> #include <sys/domain.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/priv.h> #include <sys/proc.h> +#include <sys/refcount.h> #include <sys/jail.h> #include <sys/kernel.h> #include <sys/sysctl.h> @@ -70,17 +77,22 @@ __FBSDID("$FreeBSD$"); #include <net/route.h> #include <net/vnet.h> +#if defined(INET) || defined(INET6) #include <netinet/in.h> #include <netinet/in_pcb.h> -#include <netinet/in_var.h> #include <netinet/ip_var.h> #include <netinet/tcp_var.h> #include <netinet/udp.h> #include <netinet/udp_var.h> +#endif +#ifdef INET +#include <netinet/in_var.h> +#endif #ifdef INET6 #include <netinet/ip6.h> -#include <netinet6/ip6_var.h> #include <netinet6/in6_pcb.h> +#include <netinet6/in6_var.h> +#include <netinet6/ip6_var.h> #endif /* INET6 */ @@ -91,6 +103,8 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> +static struct callout ipport_tick_callout; + /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. @@ -120,12 +134,17 @@ static VNET_DEFINE(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) +static void in_pcbremlists(struct inpcb *inp); +#ifdef INET +static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, + struct in_addr faddr, u_int fport_arg, + struct in_addr laddr, u_int lport_arg, + int lookupflags, struct ifnet *ifp); + #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } -static void in_pcbremlists(struct inpcb *inp); - static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { @@ -149,7 +168,8 @@ sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) #undef RANGECHK -SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); +static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, + "IP Ports"); SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0, @@ -182,6 +202,7 @@ SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); +#endif /* * in_pcb.c: manage the Protocol Control Blocks. @@ -192,6 +213,59 @@ SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW, */ /* + * Initialize an inpcbinfo -- we should be able to reduce the number of + * arguments in time. + */ +void +in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, + struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, + char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini, + uint32_t inpcbzone_flags, u_int hashfields) +{ + + INP_INFO_LOCK_INIT(pcbinfo, name); + INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ +#ifdef VIMAGE + pcbinfo->ipi_vnet = curvnet; +#endif + pcbinfo->ipi_listhead = listhead; + LIST_INIT(pcbinfo->ipi_listhead); + pcbinfo->ipi_count = 0; + pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, + &pcbinfo->ipi_hashmask); + pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, + &pcbinfo->ipi_porthashmask); +#ifdef PCBGROUP + in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); +#endif + pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), + NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR, + inpcbzone_flags); + uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); +} + +/* + * Destroy an inpcbinfo. + */ +void +in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) +{ + + KASSERT(pcbinfo->ipi_count == 0, + ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); + + hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); + hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, + pcbinfo->ipi_porthashmask); +#ifdef PCBGROUP + in_pcbgroup_destroy(pcbinfo); +#endif + uma_zdestroy(pcbinfo->ipi_zone); + INP_HASH_LOCK_DESTROY(pcbinfo); + INP_INFO_LOCK_DESTROY(pcbinfo); +} + +/* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. */ @@ -242,7 +316,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) #endif INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; - inp->inp_refcount = 1; /* Reference from the inpcbinfo */ + refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ #if defined(IPSEC) || defined(MAC) out: if (error != 0) { @@ -253,13 +327,14 @@ out: return (error); } +#ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); @@ -278,11 +353,12 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) inp->inp_flags |= INP_ANONPORT; return (0); } +#endif #if defined(INET) || defined(INET6) int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, - struct ucred *cred, int wild) + struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; @@ -299,8 +375,8 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ - INP_INFO_LOCK_ASSERT(pcbinfo); INP_LOCK_ASSERT(inp); + INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ @@ -358,6 +434,7 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, laddr = *laddrp; } #endif + tmpinp = NULL; /* Make compiler happy. */ lport = *lportp; if (dorandom) @@ -376,14 +453,14 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) tmpinp = in6_pcblookup_local(pcbinfo, - &inp->in6p_laddr, lport, wild, cred); + &inp->in6p_laddr, lport, lookupflags, cred); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, - lport, wild, cred); + lport, lookupflags, cred); #endif } while (tmpinp != NULL); @@ -395,8 +472,26 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, return (0); } + +/* + * Return cached socket options. + */ +short +inp_so_options(const struct inpcb *inp) +{ + short so_options; + + so_options = 0; + + if ((inp->inp_flags2 & INP_REUSEPORT) != 0) + so_options |= SO_REUSEPORT; + if ((inp->inp_flags2 & INP_REUSEADDR) != 0) + so_options |= SO_REUSEADDR; + return (so_options); +} #endif /* INET || INET6 */ +#ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can @@ -415,15 +510,14 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; - int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); + int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* - * Because no actual state changes occur here, a global write lock on - * the pcbinfo isn't required. + * No state changes, so read locks are sufficient here. */ - INP_INFO_LOCK_ASSERT(pcbinfo); INP_LOCK_ASSERT(inp); + INP_HASH_LOCK_ASSERT(pcbinfo); if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); @@ -431,7 +525,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) - wild = INPLOOKUP_WILDCARD; + lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); @@ -505,8 +599,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || - (t->inp_socket->so_options & - SO_REUSEPORT) == 0) && + (t->inp_flags2 & INP_REUSEPORT) == 0) && #ifndef __rtems__ (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) @@ -516,7 +609,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, - lport, wild, cred); + lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait @@ -524,19 +617,18 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, * being in use (for now). This is better * than a panic, but not desirable. */ - tw = intotw(inp); + tw = intotw(t); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); - } else if (t && - (reuseport & t->inp_socket->so_options) == 0) { + } else if (t && (reuseport & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || - INP_SOCKAF(so) == - INP_SOCKAF(t->inp_socket)) + (inp->inp_vflag & INP_IPV6PROTO) == 0 || + (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); } @@ -545,7 +637,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, if (*lportp != 0) lport = *lportp; if (lport == 0) { - error = in_pcb_lport(inp, &laddr, &lport, cred, wild); + error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); @@ -562,14 +654,15 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, * then pick one. */ int -in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) +in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, + struct ucred *cred, struct mbuf *m) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; @@ -595,13 +688,20 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; - in_pcbrehash(inp); + in_pcbrehash_mbuf(inp, m); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } +int +in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) +{ + + return (in_pcbconnect_mbuf(inp, nam, cred, NULL)); +} + /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. @@ -857,8 +957,8 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ - INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo); INP_LOCK_ASSERT(inp); + INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; @@ -933,8 +1033,8 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, if (error) return (error); } - oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport, - 0, NULL); + oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, + laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; @@ -957,13 +1057,14 @@ void in_pcbdisconnect(struct inpcb *inp) { - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } +#endif /* * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. @@ -982,53 +1083,18 @@ in_pcbdetach(struct inpcb *inp) } /* - * in_pcbfree_internal() frees an inpcb that has been detached from its - * socket, and whose reference count has reached 0. It will also remove the - * inpcb from any global lists it might remain on. - */ -static void -in_pcbfree_internal(struct inpcb *inp) -{ - struct inpcbinfo *ipi = inp->inp_pcbinfo; - - KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__)); - - INP_INFO_WLOCK_ASSERT(ipi); - INP_WLOCK_ASSERT(inp); - -#ifdef IPSEC - if (inp->inp_sp != NULL) - ipsec_delete_pcbpolicy(inp); -#endif /* IPSEC */ - inp->inp_gencnt = ++ipi->ipi_gencnt; - in_pcbremlists(inp); -#ifdef INET6 - if (inp->inp_vflag & INP_IPV6PROTO) { - ip6_freepcbopts(inp->in6p_outputopts); - if (inp->in6p_moptions != NULL) - ip6_freemoptions(inp->in6p_moptions); - } -#endif - if (inp->inp_options) - (void)m_free(inp->inp_options); - if (inp->inp_moptions != NULL) - inp_freemoptions(inp->inp_moptions); - inp->inp_vflag = 0; - crfree(inp->inp_cred); - -#ifdef MAC - mac_inpcb_destroy(inp); -#endif - INP_WUNLOCK(inp); - uma_zfree(ipi->ipi_zone, inp); -} - -/* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, - * but where the inpcb lock is already held. + * but where the inpcb lock may already held, or when acquiring a reference + * via a pcbgroup. + * + * in_pcbref() should be used only to provide brief memory stability, and + * must always be followed by a call to INP_WLOCK() and in_pcbrele() to + * garbage collect the inpcb if it has been in_pcbfree()'d from another + * context. Until in_pcbrele() has returned that the inpcb is still valid, + * lock and rele are the *only* safe operations that may be performed on the + * inpcb. * * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to @@ -1039,11 +1105,9 @@ void in_pcbref(struct inpcb *inp) { - INP_WLOCK_ASSERT(inp); - KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); - inp->inp_refcount++; + refcount_acquire(&inp->inp_refcount); } /* @@ -1051,47 +1115,118 @@ in_pcbref(struct inpcb *inp) * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. + * + * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a + * reference on an inpcb. Historically more work was done here (actually, in + * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the + * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely + * about memory stability (and continued use of the write lock). */ int -in_pcbrele(struct inpcb *inp) +in_pcbrele_rlocked(struct inpcb *inp) { -#ifdef INVARIANTS - struct inpcbinfo *ipi = inp->inp_pcbinfo; -#endif + struct inpcbinfo *pcbinfo; + + KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); + + INP_RLOCK_ASSERT(inp); + + if (refcount_release(&inp->inp_refcount) == 0) { + /* + * If the inpcb has been freed, let the caller know, even if + * this isn't the last reference. + */ + if (inp->inp_flags2 & INP_FREED) { + INP_RUNLOCK(inp); + return (1); + } + return (0); + } + + KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); + + INP_RUNLOCK(inp); + pcbinfo = inp->inp_pcbinfo; + uma_zfree(pcbinfo->ipi_zone, inp); + return (1); +} + +int +in_pcbrele_wlocked(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); - INP_INFO_WLOCK_ASSERT(ipi); INP_WLOCK_ASSERT(inp); - inp->inp_refcount--; - if (inp->inp_refcount > 0) + if (refcount_release(&inp->inp_refcount) == 0) return (0); - in_pcbfree_internal(inp); + + KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); + + INP_WUNLOCK(inp); + pcbinfo = inp->inp_pcbinfo; + uma_zfree(pcbinfo->ipi_zone, inp); return (1); } /* + * Temporary wrapper. + */ +int +in_pcbrele(struct inpcb *inp) +{ + + return (in_pcbrele_wlocked(inp)); +} + +/* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is - * released using in_pcbrele(), but the inpcb is still unlocked. + * released using in_pcbrele(), but the inpcb is still unlocked. Almost all + * work, including removal from global lists, is done in this context, where + * the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { -#ifdef INVARIANTS - struct inpcbinfo *ipi = inp->inp_pcbinfo; -#endif + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; - KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", - __func__)); + KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - INP_INFO_WLOCK_ASSERT(ipi); + INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); - if (!in_pcbrele(inp)) + /* XXXRW: Do as much as possible here. */ +#ifdef IPSEC + if (inp->inp_sp != NULL) + ipsec_delete_pcbpolicy(inp); +#endif /* IPSEC */ + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + in_pcbremlists(inp); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6PROTO) { + ip6_freepcbopts(inp->in6p_outputopts); + if (inp->in6p_moptions != NULL) + ip6_freemoptions(inp->in6p_moptions); + } +#endif + if (inp->inp_options) + (void)m_free(inp->inp_options); +#ifdef INET + if (inp->inp_moptions != NULL) + inp_freemoptions(inp->inp_moptions); +#endif + inp->inp_vflag = 0; + inp->inp_flags2 |= INP_FREED; + crfree(inp->inp_cred); +#ifdef MAC + mac_inpcb_destroy(inp); +#endif + if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } @@ -1106,12 +1241,6 @@ in_pcbfree(struct inpcb *inp) * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * - * XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash - * lists, but can lead to confusing netstat output, as open sockets with - * closed TCP connections will no longer appear to have their bound port - * number. An explicit flag would be better, as it would allow us to leave - * the port number intact after the connection is dropped. - * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? */ @@ -1119,23 +1248,32 @@ void in_pcbdrop(struct inpcb *inp) { - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); + /* + * XXXRW: Possibly we should protect the setting of INP_DROPPED with + * the hash lock...? + */ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; + INP_HASH_WLOCK(inp->inp_pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } + INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; +#ifdef PCBGROUP + in_pcbgroup_remove(inp); +#endif } } +#ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ @@ -1259,12 +1397,13 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) } /* - * Lookup a PCB based on the local address and port. + * Lookup a PCB based on the local address and port. Caller must hold the + * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, - u_short lport, int wild_okay, struct ucred *cred) + u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 @@ -1274,9 +1413,12 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, #endif int wildcard; - INP_INFO_LOCK_ASSERT(pcbinfo); + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); - if (!wild_okay) { + INP_HASH_LOCK_ASSERT(pcbinfo); + + if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that @@ -1377,19 +1519,166 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, } #undef INP_LOOKUP_MAPPED_PCB_COST +#ifdef PCBGROUP /* - * Lookup PCB in hash list. + * Lookup PCB in hash list, using pcbgroup tables. */ -struct inpcb * -in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, - u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard, +static struct inpcb * +in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, + struct in_addr faddr, u_int fport_arg, struct in_addr laddr, + u_int lport_arg, int lookupflags, struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcb *inp, *tmpinp; + u_short fport = fport_arg, lport = lport_arg; + + /* + * First look for an exact match. + */ + tmpinp = NULL; + INP_GROUP_LOCK(pcbgroup); + head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, + pcbgroup->ipg_hashmask)]; + LIST_FOREACH(inp, head, inp_pcbgrouphash) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == faddr.s_addr && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_fport == fport && + inp->inp_lport == lport) { + /* + * XXX We should be able to directly return + * the inp here, without any checks. + * Well unless both bound with SO_REUSEPORT? + */ + if (prison_flag(inp->inp_cred, PR_IP4)) + goto found; + if (tmpinp == NULL) + tmpinp = inp; + } + } + if (tmpinp != NULL) { + inp = tmpinp; + goto found; + } + + /* + * Then look for a wildcard match, if requested. + */ + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { + struct inpcb *local_wild = NULL, *local_exact = NULL; +#ifdef INET6 + struct inpcb *local_wild_mapped = NULL; +#endif + struct inpcb *jail_wild = NULL; + struct inpcbhead *head; + int injail; + + /* + * Order of socket selection - we always prefer jails. + * 1. jailed, non-wild. + * 2. jailed, wild. + * 3. non-jailed, non-wild. + * 4. non-jailed, wild. + */ + head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, + 0, pcbinfo->ipi_wildmask)]; + LIST_FOREACH(inp, head, inp_pcbgroup_wild) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY || + inp->inp_lport != lport) + continue; + + /* XXX inp locking */ + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; + + injail = prison_flag(inp->inp_cred, PR_IP4); + if (injail) { + if (prison_check_ip4(inp->inp_cred, + &laddr) != 0) + continue; + } else { + if (local_exact != NULL) + continue; + } + + if (inp->inp_laddr.s_addr == laddr.s_addr) { + if (injail) + goto found; + else + local_exact = inp; + } else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#ifdef INET6 + /* XXX inp locking, NULL check */ + if (inp->inp_vflag & INP_IPV6PROTO) + local_wild_mapped = inp; + else +#endif /* INET6 */ + if (injail) + jail_wild = inp; + else + local_wild = inp; + } + } /* LIST_FOREACH */ + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; +#ifdef INET6 + if (inp == NULL) + inp = local_wild_mapped; +#endif /* defined(INET6) */ + if (inp != NULL) + goto found; + } /* if (lookupflags & INPLOOKUP_WILDCARD) */ + INP_GROUP_UNLOCK(pcbgroup); + return (NULL); + +found: + in_pcbref(inp); + INP_GROUP_UNLOCK(pcbgroup); + if (lookupflags & INPLOOKUP_WLOCKPCB) { + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (NULL); + } else if (lookupflags & INPLOOKUP_RLOCKPCB) { + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (NULL); + } else + panic("%s: locking bug", __func__); + return (inp); +} +#endif /* PCBGROUP */ + +/* + * Lookup PCB in hash list, using pcbinfo tables. This variation assumes + * that the caller has locked the hash list, and will not perform any further + * locking or reference operations on either the hash list or the connection. + */ +static struct inpcb * +in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, + u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; - INP_INFO_LOCK_ASSERT(pcbinfo); + KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + + INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. @@ -1424,7 +1713,7 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, /* * Then look for a wildcard match, if requested. */ - if (wildcard == INPLOOKUP_WILDCARD) { + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; @@ -1495,16 +1784,112 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, if (local_wild_mapped != NULL) return (local_wild_mapped); #endif /* defined(INET6) */ - } /* if (wildcard == INPLOOKUP_WILDCARD) */ + } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ return (NULL); } /* + * Lookup PCB in hash list, using pcbinfo tables. This variation locks the + * hash list lock, and will return the inpcb locked (i.e., requires + * INPLOOKUP_LOCKPCB). + */ +static struct inpcb * +in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, + u_int fport, struct in_addr laddr, u_int lport, int lookupflags, + struct ifnet *ifp) +{ + struct inpcb *inp; + + INP_HASH_RLOCK(pcbinfo); + inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, + (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); + if (inp != NULL) { + in_pcbref(inp); + INP_HASH_RUNLOCK(pcbinfo); + if (lookupflags & INPLOOKUP_WLOCKPCB) { + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (NULL); + } else if (lookupflags & INPLOOKUP_RLOCKPCB) { + INP_RLOCK(inp); + if (in_pcbrele_rlocked(inp)) + return (NULL); + } else + panic("%s: locking bug", __func__); + } else + INP_HASH_RUNLOCK(pcbinfo); + return (inp); +} + +/* + * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf + * from which a pre-calculated hash value may be extracted. + * + * Possibly more of this logic should be in in_pcbgroup.c. + */ +struct inpcb * +in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, + struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) +{ +#if defined(PCBGROUP) + struct inpcbgroup *pcbgroup; +#endif + + KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, + ("%s: LOCKPCB not set", __func__)); + +#if defined(PCBGROUP) + if (in_pcbgroup_enabled(pcbinfo)) { + pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, + fport); + return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, + laddr, lport, lookupflags, ifp)); + } +#endif + return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, + lookupflags, ifp)); +} + +struct inpcb * +in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, + u_int fport, struct in_addr laddr, u_int lport, int lookupflags, + struct ifnet *ifp, struct mbuf *m) +{ +#ifdef PCBGROUP + struct inpcbgroup *pcbgroup; +#endif + + KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, + ("%s: invalid lookup flags %d", __func__, lookupflags)); + KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, + ("%s: LOCKPCB not set", __func__)); + +#ifdef PCBGROUP + if (in_pcbgroup_enabled(pcbinfo)) { + pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), + m->m_pkthdr.flowid); + if (pcbgroup != NULL) + return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, + fport, laddr, lport, lookupflags, ifp)); + pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, + fport); + return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, + laddr, lport, lookupflags, ifp)); + } +#endif + return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, + lookupflags, ifp)); +} +#endif /* INET */ + +/* * Insert PCB onto various hash lists. */ -int -in_pcbinshash(struct inpcb *inp) +static int +in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; @@ -1512,8 +1897,9 @@ in_pcbinshash(struct inpcb *inp) struct inpcbport *phd; u_int32_t hashkey_faddr; - INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); + KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); @@ -1553,24 +1939,54 @@ in_pcbinshash(struct inpcb *inp) LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; +#ifdef PCBGROUP + if (do_pcbgroup_update) + in_pcbgroup_update(inp); +#endif return (0); } /* + * For now, there are two public interfaces to insert an inpcb into the hash + * lists -- one that does update pcbgroups, and one that doesn't. The latter + * is used only in the TCP syncache, where in_pcbinshash is called before the + * full 4-tuple is set for the inpcb, and we don't want to install in the + * pcbgroup until later. + * + * XXXRW: This seems like a misfeature. in_pcbinshash should always update + * connection groups, and partially initialised inpcbs should not be exposed + * to either reservation hash tables or pcbgroups. + */ +int +in_pcbinshash(struct inpcb *inp) +{ + + return (in_pcbinshash_internal(inp, 1)); +} + +int +in_pcbinshash_nopcbgroup(struct inpcb *inp) +{ + + return (in_pcbinshash_internal(inp, 0)); +} + +/* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void -in_pcbrehash(struct inpcb *inp) +in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; - INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); + KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); @@ -1586,6 +2002,20 @@ in_pcbrehash(struct inpcb *inp) LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); + +#ifdef PCBGROUP + if (m != NULL) + in_pcbgroup_update_mbuf(inp, m); + else + in_pcbgroup_update(inp); +#endif +} + +void +in_pcbrehash(struct inpcb *inp) +{ + + in_pcbrehash_mbuf(inp, NULL); } /* @@ -1603,16 +2033,21 @@ in_pcbremlists(struct inpcb *inp) if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; + INP_HASH_WLOCK(pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } + INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; +#ifdef PCBGROUP + in_pcbgroup_remove(inp); +#endif } /* @@ -1643,7 +2078,7 @@ in_pcbsosetlabel(struct socket *so) * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. */ -void +static void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); @@ -1664,6 +2099,30 @@ ipport_tick(void *xtp) callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } +static void +ip_fini(void *xtp) +{ + + callout_stop(&ipport_tick_callout); +} + +/* + * The ipport_callout should start running at about the time we attach the + * inet or inet6 domains. + */ +static void +ipport_tick_init(const void *unused __unused) +{ + + /* Start ipport_tick. */ + callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); + callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); + EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, + SHUTDOWN_PRI_DEFAULT); +} +SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, + ipport_tick_init, NULL); + void inp_wlock(struct inpcb *inp) { diff --git a/freebsd/sys/netinet/in_pcb.h b/freebsd/sys/netinet/in_pcb.h index 9f602ce2..a78c6ab6 100644 --- a/freebsd/sys/netinet/in_pcb.h +++ b/freebsd/sys/netinet/in_pcb.h @@ -1,8 +1,12 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -40,8 +44,10 @@ #include <sys/_rwlock.h> #ifdef _KERNEL +#include <rtems/bsd/sys/lock.h> #include <sys/rwlock.h> #include <net/vnet.h> +#include <vm/uma.h> #endif #define in6pcb inpcb /* for KAME src sync over BSD*'s */ @@ -136,6 +142,7 @@ struct icmp6_filter; * * Key: * (c) - Constant after initialization + * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb * (s) - Protected by another subsystem's locks @@ -155,9 +162,12 @@ struct icmp6_filter; */ struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */ + LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ + struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ + LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/p) group wildcard entry */ struct socket *inp_socket; /* (i) back pointer to socket */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ @@ -169,8 +179,9 @@ struct inpcb { u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_flowid; /* (x) flow id / queue id */ u_int inp_refcount; /* (i) refcount */ - void *inp_pspare[4]; /* (x) rtentry / general use */ - u_int inp_ispare[4]; /* general use */ + void *inp_pspare[5]; /* (x) route caching / general use */ + u_int inp_ispare[6]; /* (x) route caching / user cookie / + * general use */ /* Local and foreign ports, local and foreign addr. */ struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */ @@ -259,53 +270,93 @@ struct inpcbport { u_short phd_port; }; -/* +/*- * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. + * + * Each pcbinfo is protected by two locks: ipi_lock and ipi_hash_lock, + * the former covering mutable global fields (such as the global pcb list), + * and the latter covering the hashed lookup tables. The lock order is: + * + * ipi_lock (before) inpcb locks (before) {ipi_hash_lock, pcbgroup locks} + * + * Locking key: + * + * (c) Constant or nearly constant after initialisation + * (g) Locked by ipi_lock + * (h) Read using either ipi_hash_lock or inpcb lock; write requires both + * (p) Protected by one or more pcbgroup locks + * (x) Synchronisation properties poorly defined */ struct inpcbinfo { /* - * Global list of inpcbs on the protocol. + * Global lock protecting global inpcb list, inpcb count, etc. */ - struct inpcbhead *ipi_listhead; - u_int ipi_count; + struct rwlock ipi_lock; /* - * Global hash of inpcbs, hashed by local and foreign addresses and - * port numbers. + * Global list of inpcbs on the protocol. */ - struct inpcbhead *ipi_hashbase; - u_long ipi_hashmask; + struct inpcbhead *ipi_listhead; /* (g) */ + u_int ipi_count; /* (g) */ /* - * Global hash of inpcbs, hashed by only local port number. + * Generation count -- incremented each time a connection is allocated + * or freed. */ - struct inpcbporthead *ipi_porthashbase; - u_long ipi_porthashmask; + u_quad_t ipi_gencnt; /* (g) */ /* * Fields associated with port lookup and allocation. */ - u_short ipi_lastport; - u_short ipi_lastlow; - u_short ipi_lasthi; + u_short ipi_lastport; /* (x) */ + u_short ipi_lastlow; /* (x) */ + u_short ipi_lasthi; /* (x) */ /* * UMA zone from which inpcbs are allocated for this protocol. */ - struct uma_zone *ipi_zone; + struct uma_zone *ipi_zone; /* (c) */ /* - * Generation count--incremented each time a connection is allocated - * or freed. + * Connection groups associated with this protocol. These fields are + * constant, but pcbgroup structures themselves are protected by + * per-pcbgroup locks. */ - u_quad_t ipi_gencnt; - struct rwlock ipi_lock; + struct inpcbgroup *ipi_pcbgroups; /* (c) */ + u_int ipi_npcbgroups; /* (c) */ + u_int ipi_hashfields; /* (c) */ + + /* + * Global lock protecting non-pcbgroup hash lookup tables. + */ + struct rwlock ipi_hash_lock; + + /* + * Global hash of inpcbs, hashed by local and foreign addresses and + * port numbers. + */ + struct inpcbhead *ipi_hashbase; /* (h) */ + u_long ipi_hashmask; /* (h) */ + + /* + * Global hash of inpcbs, hashed by only local port number. + */ + struct inpcbporthead *ipi_porthashbase; /* (h) */ + u_long ipi_porthashmask; /* (h) */ + + /* + * List of wildcard inpcbs for use with pcbgroups. In the past, was + * per-pcbgroup but is now global. All pcbgroup locks must be held + * to modify the list, so any is sufficient to read it. + */ + struct inpcbhead *ipi_wildbase; /* (p) */ + u_long ipi_wildmask; /* (p) */ /* * Pointer to network stack instance */ - struct vnet *ipi_vnet; + struct vnet *ipi_vnet; /* (c) */ /* * general use 2 @@ -313,6 +364,32 @@ struct inpcbinfo { void *ipi_pspare[2]; }; +#ifdef _KERNEL +/* + * Connection groups hold sets of connections that have similar CPU/thread + * affinity. Each connection belongs to exactly one connection group. + */ +struct inpcbgroup { + /* + * Per-connection group hash of inpcbs, hashed by local and foreign + * addresses and port numbers. + */ + struct inpcbhead *ipg_hashbase; /* (c) */ + u_long ipg_hashmask; /* (c) */ + + /* + * Notional affinity of this pcbgroup. + */ + u_int ipg_cpu; /* (p) */ + + /* + * Per-connection group lock, not to be confused with ipi_lock. + * Protects the hash table hung off the group, but also the global + * wildcard list in inpcbinfo. + */ + struct mtx ipg_lock; +} __aligned(CACHE_LINE_SIZE); + #define INP_LOCK_INIT(inp, d, t) \ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) @@ -330,7 +407,6 @@ struct inpcbinfo { #define INP_WLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_WLOCKED) #define INP_UNLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_UNLOCKED) -#ifdef _KERNEL /* * These locking functions are for inpcb consumers outside of sys/netinet, * more specifically, they were added for the benefit of TOE drivers. The @@ -366,6 +442,7 @@ struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp); void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp); +short inp_so_options(const struct inpcb *inp); #endif /* _KERNEL */ @@ -384,6 +461,26 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, #define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) #define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) +#define INP_HASH_LOCK_INIT(ipi, d) \ + rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0) +#define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock) +#define INP_HASH_RLOCK(ipi) rw_rlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_WLOCK(ipi) rw_wlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ + RA_LOCKED) +#define INP_HASH_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ + RA_WLOCKED) + +#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ + MTX_DEF | MTX_DUPOK) +#define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock) + +#define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock) +#define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED) +#define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock) + #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ @@ -444,8 +541,21 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, */ #define INP_LLE_VALID 0x00000001 /* cached lle is valid */ #define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ +#define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ +#define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ +#define INP_FREED 0x00000010 /* inp itself is not valid */ +#define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ + +/* + * Flags passed to in_pcblookup*() functions. + */ +#define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */ +#define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */ +#define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */ + +#define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \ + INPLOOKUP_WLOCKPCB) -#define INPLOOKUP_WILDCARD 1 #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ @@ -453,6 +563,13 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) +/* + * Constants for pcbinfo.ipi_hashfields. + */ +#define IPI_HASHFIELDS_NONE 0 +#define IPI_HASHFIELDS_2TUPLE 1 +#define IPI_HASHFIELDS_4TUPLE 2 + #ifdef _KERNEL VNET_DECLARE(int, ipport_reservedhigh); VNET_DECLARE(int, ipport_reservedlow); @@ -482,7 +599,23 @@ VNET_DECLARE(int, ipport_tcpallocs); #define V_ipport_stoprandom VNET(ipport_stoprandom) #define V_ipport_tcpallocs VNET(ipport_tcpallocs) -extern struct callout ipport_tick_callout; +void in_pcbinfo_destroy(struct inpcbinfo *); +void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, + int, int, char *, uma_init, uma_fini, uint32_t, u_int); + +struct inpcbgroup * + in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t); +struct inpcbgroup * + in_pcbgroup_byinpcb(struct inpcb *); +struct inpcbgroup * + in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short, + struct in_addr, u_short); +void in_pcbgroup_destroy(struct inpcbinfo *); +int in_pcbgroup_enabled(struct inpcbinfo *); +void in_pcbgroup_init(struct inpcbinfo *, u_int, int); +void in_pcbgroup_remove(struct inpcb *); +void in_pcbgroup_update(struct inpcb *); +void in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *); void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); @@ -492,6 +625,8 @@ int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *, int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, struct ucred *); int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *); +int in_pcbconnect_mbuf(struct inpcb *, struct sockaddr *, struct ucred *, + struct mbuf *); int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, in_addr_t *, u_short *, struct inpcb **, struct ucred *); @@ -500,24 +635,30 @@ void in_pcbdisconnect(struct inpcb *); void in_pcbdrop(struct inpcb *); void in_pcbfree(struct inpcb *); int in_pcbinshash(struct inpcb *); +int in_pcbinshash_nopcbgroup(struct inpcb *); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); struct inpcb * - in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, + in_pcblookup(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); +struct inpcb * + in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int, + struct in_addr, u_int, int, struct ifnet *, struct mbuf *); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbref(struct inpcb *); void in_pcbrehash(struct inpcb *); +void in_pcbrehash_mbuf(struct inpcb *, struct mbuf *); int in_pcbrele(struct inpcb *); +int in_pcbrele_rlocked(struct inpcb *); +int in_pcbrele_wlocked(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); int in_getsockaddr(struct socket *so, struct sockaddr **nam); struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr); void in_pcbsosetlabel(struct socket *so); -void ipport_tick(void *xtp); #endif /* _KERNEL */ #endif /* !_NETINET_IN_PCB_H_ */ diff --git a/freebsd/sys/netinet/in_proto.c b/freebsd/sys/netinet/in_proto.c index b479e09e..1eef2c72 100644 --- a/freebsd/sys/netinet/in_proto.c +++ b/freebsd/sys/netinet/in_proto.c @@ -37,8 +37,8 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_ipx.h> #include <rtems/bsd/local/opt_mrouting.h> #include <rtems/bsd/local/opt_ipsec.h> +#include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> -#include <rtems/bsd/local/opt_pf.h> #include <rtems/bsd/local/opt_sctp.h> #include <rtems/bsd/local/opt_mpath.h> @@ -52,14 +52,26 @@ __FBSDID("$FreeBSD$"); #include <sys/queue.h> #include <sys/sysctl.h> +/* + * While this file provides the domain and protocol switch tables for IPv4, it + * also provides the sysctl node declarations for net.inet.* often shared with + * IPv6 for common features or by upper layer protocols. In case of no IPv4 + * support compile out everything but these sysctl nodes. + */ +#ifdef INET #include <net/if.h> #include <net/route.h> #ifdef RADIX_MPATH #include <net/radix_mpath.h> #endif #include <net/vnet.h> +#endif /* INET */ +#if defined(INET) || defined(INET6) #include <netinet/in.h> +#endif + +#ifdef INET #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> @@ -90,11 +102,6 @@ static struct pr_usrreqs nousrreqs; #include <netinet/sctp_var.h> #endif /* SCTP */ -#ifdef DEV_PFSYNC -#include <net/pfvar.h> -#include <net/if_pfsync.h> -#endif - FEATURE(inet, "Internet Protocol version 4"); extern struct domain inetdomain; @@ -306,17 +313,6 @@ struct protosw inetsw[] = { .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }, -#ifdef DEV_PFSYNC -{ - .pr_type = SOCK_RAW, - .pr_domain = &inetdomain, - .pr_protocol = IPPROTO_PFSYNC, - .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = pfsync_input, - .pr_ctloutput = rip_ctloutput, - .pr_usrreqs = &rip_usrreqs -}, -#endif /* DEV_PFSYNC */ /* Spacer n-times for loadable protocols. */ IPPROTOSPACER, IPPROTOSPACER, @@ -364,6 +360,7 @@ struct domain inetdomain = { }; VNET_DOMAIN_SET(inet); +#endif /* INET */ SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW, 0, "Internet Family"); @@ -385,6 +382,3 @@ SYSCTL_NODE(_net_inet, IPPROTO_IPCOMP, ipcomp, CTLFLAG_RW, 0, "IPCOMP"); SYSCTL_NODE(_net_inet, IPPROTO_IPIP, ipip, CTLFLAG_RW, 0, "IPIP"); #endif /* IPSEC */ SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW"); -#ifdef DEV_PFSYNC -SYSCTL_NODE(_net_inet, IPPROTO_PFSYNC, pfsync, CTLFLAG_RW, 0, "PFSYNC"); -#endif diff --git a/freebsd/sys/netinet/in_var.h b/freebsd/sys/netinet/in_var.h index c04d45b9..b8477309 100644 --- a/freebsd/sys/netinet/in_var.h +++ b/freebsd/sys/netinet/in_var.h @@ -60,12 +60,9 @@ struct in_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags - /* ia_{,sub}net{,mask} in host order */ - u_long ia_net; /* network number of interface */ - u_long ia_netmask; /* mask of net part */ - u_long ia_subnet; /* subnet number, including net */ - u_long ia_subnetmask; /* mask of subnet part */ - struct in_addr ia_netbroadcast; /* to recognize net broadcasts */ + /* ia_subnet{,mask} in host order */ + u_long ia_subnet; /* subnet address */ + u_long ia_subnetmask; /* mask of subnet */ LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */ TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */ struct sockaddr_in ia_addr; /* reserve space for interface name */ @@ -162,14 +159,16 @@ do { \ #define IFP_TO_IA(ifp, ia) \ /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ -{ \ +do { \ + IN_IFADDR_RLOCK(); \ for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ (ia) = TAILQ_NEXT((ia), ia_link)) \ continue; \ if ((ia) != NULL) \ ifa_ref(&(ia)->ia_ifa); \ -} + IN_IFADDR_RUNLOCK(); \ +} while (0) #endif /* diff --git a/freebsd/sys/netinet/ip.h b/freebsd/sys/netinet/ip.h index 6c9482f9..79afeb8f 100644 --- a/freebsd/sys/netinet/ip.h +++ b/freebsd/sys/netinet/ip.h @@ -48,11 +48,11 @@ */ struct ip { #if BYTE_ORDER == LITTLE_ENDIAN - u_int ip_hl:4, /* header length */ + u_char ip_hl:4, /* header length */ ip_v:4; /* version */ #endif #if BYTE_ORDER == BIG_ENDIAN - u_int ip_v:4, /* version */ + u_char ip_v:4, /* version */ ip_hl:4; /* header length */ #endif u_char ip_tos; /* type of service */ @@ -167,11 +167,11 @@ struct ip_timestamp { u_char ipt_len; /* size of structure (variable) */ u_char ipt_ptr; /* index of current entry */ #if BYTE_ORDER == LITTLE_ENDIAN - u_int ipt_flg:4, /* flags, see below */ + u_char ipt_flg:4, /* flags, see below */ ipt_oflw:4; /* overflow counter */ #endif #if BYTE_ORDER == BIG_ENDIAN - u_int ipt_oflw:4, /* overflow counter */ + u_char ipt_oflw:4, /* overflow counter */ ipt_flg:4; /* flags, see below */ #endif union ipt_timestamp { diff --git a/freebsd/sys/netinet/ip6.h b/freebsd/sys/netinet/ip6.h index 3fb08a78..8f498410 100644 --- a/freebsd/sys/netinet/ip6.h +++ b/freebsd/sys/netinet/ip6.h @@ -263,7 +263,7 @@ struct ip6_frag { /* * IP6_EXTHDR_CHECK ensures that region between the IP6 header and the * target header (including IPv6 itself, extension headers and - * TCP/UDP/ICMP6 headers) are continuous. KAME requires drivers + * TCP/UDP/ICMP6 headers) are contiguous. KAME requires drivers * to store incoming data into one internal mbuf or one or more external * mbufs(never into two or more internal mbufs). Thus, the third case is * supposed to never be matched but is prepared just in case. @@ -275,24 +275,24 @@ do { \ if (((m)->m_flags & M_LOOP) && \ ((m)->m_len < (off) + (hlen)) && \ (((m) = m_pullup((m), (off) + (hlen))) == NULL)) { \ - V_ip6stat.ip6s_exthdrtoolong++; \ + IP6STAT_INC(ip6s_exthdrtoolong); \ return ret; \ } else if ((m)->m_flags & M_EXT) { \ if ((m)->m_len < (off) + (hlen)) { \ - V_ip6stat.ip6s_exthdrtoolong++; \ + IP6STAT_INC(ip6s_exthdrtoolong); \ m_freem(m); \ return ret; \ } \ } else { \ if ((m)->m_len < (off) + (hlen)) { \ - V_ip6stat.ip6s_exthdrtoolong++; \ + IP6STAT_INC(ip6s_exthdrtoolong); \ m_freem(m); \ return ret; \ } \ } \ } else { \ if ((m)->m_len < (off) + (hlen)) { \ - V_ip6stat.ip6s_tooshort++; \ + IP6STAT_INC(ip6s_tooshort); \ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); \ m_freem(m); \ return ret; \ diff --git a/freebsd/sys/netinet/ip_carp.c b/freebsd/sys/netinet/ip_carp.c index a08c3fb8..a34c10c3 100644 --- a/freebsd/sys/netinet/ip_carp.c +++ b/freebsd/sys/netinet/ip_carp.c @@ -68,14 +68,19 @@ __FBSDID("$FreeBSD$"); #include <net/route.h> #include <net/vnet.h> -#ifdef INET +#if defined(INET) || defined(INET6) #include <netinet/in.h> #include <netinet/in_var.h> -#include <netinet/in_systm.h> +#include <netinet/ip_carp.h> #include <netinet/ip.h> + +#include <machine/in_cksum.h> +#endif + +#ifdef INET +#include <netinet/in_systm.h> #include <netinet/ip_var.h> #include <netinet/if_ether.h> -#include <machine/in_cksum.h> #endif #ifdef INET6 @@ -84,11 +89,11 @@ __FBSDID("$FreeBSD$"); #include <netinet6/ip6protosw.h> #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> +#include <netinet6/in6_var.h> #include <netinet6/nd6.h> #endif #include <crypto/sha1.h> -#include <netinet/ip_carp.h> #define CARP_IFNAME "carp" static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces"); @@ -98,7 +103,9 @@ struct carp_softc { struct ifnet *sc_ifp; /* Interface clue */ struct ifnet *sc_carpdev; /* Pointer to parent interface */ struct in_ifaddr *sc_ia; /* primary iface address */ +#ifdef INET struct ip_moptions sc_imo; +#endif #ifdef INET6 struct in6_ifaddr *sc_ia6; /* primary iface address v6 */ struct ip6_moptions sc_im6o; @@ -208,7 +215,9 @@ static int carp_prepare_ad(struct mbuf *, struct carp_softc *, static void carp_send_ad_all(void); static void carp_send_ad(void *); static void carp_send_ad_locked(struct carp_softc *); +#ifdef INET static void carp_send_arp(struct carp_softc *); +#endif static void carp_master_down(void *); static void carp_master_down_locked(struct carp_softc *); static int carp_ioctl(struct ifnet *, u_long, caddr_t); @@ -217,12 +226,16 @@ static int carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *, static void carp_start(struct ifnet *); static void carp_setrun(struct carp_softc *, sa_family_t); static void carp_set_state(struct carp_softc *, int); +#ifdef INET static int carp_addrcount(struct carp_if *, struct in_ifaddr *, int); +#endif enum { CARP_COUNT_MASTER, CARP_COUNT_RUNNING }; +#ifdef INET static void carp_multicast_cleanup(struct carp_softc *, int dofree); static int carp_set_addr(struct carp_softc *, struct sockaddr_in *); static int carp_del_addr(struct carp_softc *, struct sockaddr_in *); +#endif static void carp_carpdev_state_locked(struct carp_if *); static void carp_sc_state_locked(struct carp_softc *); #ifdef INET6 @@ -371,6 +384,7 @@ carp_setroute(struct carp_softc *sc, int cmd) s = splnet(); TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { +#ifdef INET if (ifa->ifa_addr->sa_family == AF_INET && sc->sc_carpdev != NULL) { int count = carp_addrcount( @@ -381,6 +395,7 @@ carp_setroute(struct carp_softc *sc, int cmd) (cmd == RTM_DELETE && count == 0)) rtinit(ifa, cmd, RTF_UP | RTF_HOST); } +#endif } splx(s); } @@ -406,12 +421,14 @@ carp_clone_create(struct if_clone *ifc, int unit, caddr_t params) sc->sc_advskew = 0; sc->sc_init_counter = 1; sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? */ +#ifdef INET sc->sc_imo.imo_membership = (struct in_multi **)malloc( (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP, M_WAITOK); sc->sc_imo.imo_mfilters = NULL; sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS; sc->sc_imo.imo_multicast_vif = -1; +#endif #ifdef INET6 sc->sc_im6o.im6o_membership = (struct in6_multi **)malloc( (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP, @@ -458,7 +475,9 @@ carp_clone_destroy(struct ifnet *ifp) bpfdetach(ifp); if_detach(ifp); if_free_type(ifp, IFT_ETHER); +#ifdef INET free(sc->sc_imo.imo_membership, M_CARP); +#endif #ifdef INET6 free(sc->sc_im6o.im6o_membership, M_CARP); #endif @@ -497,7 +516,9 @@ carpdetach(struct carp_softc *sc, int unlock) carp_set_state(sc, INIT); SC2IFP(sc)->if_flags &= ~IFF_UP; carp_setrun(sc, 0); +#ifdef INET carp_multicast_cleanup(sc, unlock); +#endif #ifdef INET6 carp_multicast6_cleanup(sc, unlock); #endif @@ -542,6 +563,7 @@ carp_ifdetach(void *arg __unused, struct ifnet *ifp) * we have rearranged checks order compared to the rfc, * but it seems more efficient this way or not possible otherwise. */ +#ifdef INET void carp_input(struct mbuf *m, int hlen) { @@ -632,6 +654,7 @@ carp_input(struct mbuf *m, int hlen) carp_input_c(m, ch, AF_INET); } +#endif #ifdef INET6 int @@ -722,12 +745,16 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) SC2IFP(sc)->if_ibytes += m->m_pkthdr.len; if (bpf_peers_present(SC2IFP(sc)->if_bpf)) { - struct ip *ip = mtod(m, struct ip *); uint32_t af1 = af; +#ifdef INET + struct ip *ip = mtod(m, struct ip *); /* BPF wants net byte order */ - ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2)); - ip->ip_off = htons(ip->ip_off); + if (af == AF_INET) { + ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2)); + ip->ip_off = htons(ip->ip_off); + } +#endif bpf_mtap2(SC2IFP(sc)->if_bpf, &af1, sizeof(af1), m); } @@ -1083,6 +1110,7 @@ carp_send_ad_locked(struct carp_softc *sc) } +#ifdef INET /* * Broadcast a gratuitous ARP request containing * the virtual router MAC address for each IP address @@ -1104,6 +1132,7 @@ carp_send_arp(struct carp_softc *sc) DELAY(1000); /* XXX */ } } +#endif #ifdef INET6 static void @@ -1126,6 +1155,7 @@ carp_send_na(struct carp_softc *sc) } #endif /* INET6 */ +#ifdef INET static int carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type) { @@ -1229,6 +1259,7 @@ carp_iamatch(struct ifnet *ifp, struct in_ifaddr *ia, CARP_UNLOCK(cif); return (0); } +#endif #ifdef INET6 struct ifaddr * @@ -1355,7 +1386,9 @@ carp_master_down_locked(struct carp_softc *sc) case BACKUP: carp_set_state(sc, MASTER); carp_send_ad_locked(sc); +#ifdef INET carp_send_arp(sc); +#endif #ifdef INET6 carp_send_na(sc); #endif /* INET6 */ @@ -1434,6 +1467,7 @@ carp_setrun(struct carp_softc *sc, sa_family_t af) } } +#ifdef INET static void carp_multicast_cleanup(struct carp_softc *sc, int dofree) { @@ -1453,6 +1487,7 @@ carp_multicast_cleanup(struct carp_softc *sc, int dofree) imo->imo_num_memberships = 0; imo->imo_multicast_ifp = NULL; } +#endif #ifdef INET6 static void @@ -1475,6 +1510,7 @@ carp_multicast6_cleanup(struct carp_softc *sc, int dofree) } #endif +#ifdef INET static int carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin) { @@ -1651,6 +1687,7 @@ carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin) return (error); } +#endif #ifdef INET6 static int @@ -2351,13 +2388,13 @@ carp_mod_load(void) printf("carp: error %d attaching to PF_INET6\n", proto_reg[CARP_INET6]); carp_mod_cleanup(); - return (EINVAL); + return (proto_reg[CARP_INET6]); } err = ip6proto_register(IPPROTO_CARP); if (err) { printf("carp: error %d registering with INET6\n", err); carp_mod_cleanup(); - return (EINVAL); + return (err); } #endif #ifdef INET @@ -2367,13 +2404,13 @@ carp_mod_load(void) printf("carp: error %d attaching to PF_INET\n", proto_reg[CARP_INET]); carp_mod_cleanup(); - return (EINVAL); + return (proto_reg[CARP_INET]); } err = ipproto_register(IPPROTO_CARP); if (err) { printf("carp: error %d registering with INET\n", err); carp_mod_cleanup(); - return (EINVAL); + return (err); } #endif return 0; diff --git a/freebsd/sys/netinet/ip_divert.c b/freebsd/sys/netinet/ip_divert.c index 5fb32ba6..879f411f 100644 --- a/freebsd/sys/netinet/ip_divert.c +++ b/freebsd/sys/netinet/ip_divert.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #error "IPDIVERT requires INET." #endif #endif +#include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/sys/param.h> #include <sys/kernel.h> @@ -50,20 +51,13 @@ __FBSDID("$FreeBSD$"); #include <sys/priv.h> #include <sys/proc.h> #include <sys/protosw.h> -#include <sys/rwlock.h> -#include <sys/signalvar.h> #include <sys/socket.h> #include <sys/socketvar.h> -#include <sys/sx.h> #include <sys/sysctl.h> -#include <sys/systm.h> - -#include <vm/uma.h> +#include <net/vnet.h> #include <net/if.h> #include <net/netisr.h> -#include <net/route.h> -#include <net/vnet.h> #include <netinet/in.h> #include <netinet/in_pcb.h> @@ -71,6 +65,10 @@ __FBSDID("$FreeBSD$"); #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#endif #ifdef SCTP #include <netinet/sctp_crc32.h> #endif @@ -156,35 +154,21 @@ static void div_init(void) { - INP_INFO_LOCK_INIT(&V_divcbinfo, "div"); - LIST_INIT(&V_divcb); - V_divcbinfo.ipi_listhead = &V_divcb; -#ifdef VIMAGE - V_divcbinfo.ipi_vnet = curvnet; -#endif /* - * XXX We don't use the hash list for divert IP, but it's easier - * to allocate a one entry hash list than it is to check all - * over the place for hashbase == NULL. + * XXX We don't use the hash list for divert IP, but it's easier to + * allocate one-entry hash lists than it is to check all over the + * place for hashbase == NULL. */ - V_divcbinfo.ipi_hashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_hashmask); - V_divcbinfo.ipi_porthashbase = hashinit(1, M_PCB, - &V_divcbinfo.ipi_porthashmask); - V_divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), - NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR, - UMA_ZONE_NOFREE); - uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); + in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", + div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_NONE); } static void div_destroy(void) { - INP_INFO_LOCK_DESTROY(&V_divcbinfo); - uma_zdestroy(V_divcbinfo.ipi_zone); - hashdestroy(V_divcbinfo.ipi_hashbase, M_PCB, V_divcbinfo.ipi_hashmask); - hashdestroy(V_divcbinfo.ipi_porthashbase, M_PCB, - V_divcbinfo.ipi_porthashmask); + in_pcbinfo_destroy(&V_divcbinfo); } /* @@ -335,10 +319,10 @@ static int div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct mbuf *control) { + struct ip *const ip = mtod(m, struct ip *); struct m_tag *mtag; struct ipfw_rule_ref *dt; int error = 0; - struct mbuf *options; /* * An mbuf may hasn't come from userland, but we pretend @@ -390,71 +374,104 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { - struct ip *const ip = mtod(m, struct ip *); + struct mbuf *options = NULL; struct inpcb *inp; dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT; inp = sotoinpcb(so); INP_RLOCK(inp); - /* - * Don't allow both user specified and setsockopt options, - * and don't allow packet length sizes that will crash - */ - if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || - ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { - error = EINVAL; - INP_RUNLOCK(inp); - m_freem(m); - } else { + switch (ip->ip_v) { + case IPVERSION: + /* + * Don't allow both user specified and setsockopt + * options, and don't allow packet length sizes that + * will crash. + */ + if ((((ip->ip_hl << 2) != sizeof(struct ip)) && + inp->inp_options != NULL) || + ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { + error = EINVAL; + INP_RUNLOCK(inp); + goto cantsend; + } + /* Convert fields to host order for ip_output() */ ip->ip_len = ntohs(ip->ip_len); ip->ip_off = ntohs(ip->ip_off); + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + { + struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *); + + /* Don't allow packet length sizes that will crash */ + if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) { + error = EINVAL; + INP_RUNLOCK(inp); + goto cantsend; + } - /* Send packet to output processing */ - KMOD_IPSTAT_INC(ips_rawout); /* XXX */ + ip6->ip6_plen = ntohs(ip6->ip6_plen); + break; + } +#endif + default: + error = EINVAL; + INP_RUNLOCK(inp); + goto cantsend; + } + + /* Send packet to output processing */ + KMOD_IPSTAT_INC(ips_rawout); /* XXX */ #ifdef MAC - mac_inpcb_create_mbuf(inp, m); + mac_inpcb_create_mbuf(inp, m); #endif - /* - * Get ready to inject the packet into ip_output(). - * Just in case socket options were specified on the - * divert socket, we duplicate them. This is done - * to avoid having to hold the PCB locks over the call - * to ip_output(), as doing this results in a number of - * lock ordering complexities. - * - * Note that we set the multicast options argument for - * ip_output() to NULL since it should be invariant that - * they are not present. - */ - KASSERT(inp->inp_moptions == NULL, - ("multicast options set on a divert socket")); - options = NULL; - /* - * XXXCSJP: It is unclear to me whether or not it makes - * sense for divert sockets to have options. However, - * for now we will duplicate them with the INP locks - * held so we can use them in ip_output() without - * requring a reference to the pcb. - */ - if (inp->inp_options != NULL) { - options = m_dup(inp->inp_options, M_DONTWAIT); - if (options == NULL) - error = ENOBUFS; - } - INP_RUNLOCK(inp); - if (error == ENOBUFS) { - m_freem(m); - return (error); + /* + * Get ready to inject the packet into ip_output(). + * Just in case socket options were specified on the + * divert socket, we duplicate them. This is done + * to avoid having to hold the PCB locks over the call + * to ip_output(), as doing this results in a number of + * lock ordering complexities. + * + * Note that we set the multicast options argument for + * ip_output() to NULL since it should be invariant that + * they are not present. + */ + KASSERT(inp->inp_moptions == NULL, + ("multicast options set on a divert socket")); + /* + * XXXCSJP: It is unclear to me whether or not it makes + * sense for divert sockets to have options. However, + * for now we will duplicate them with the INP locks + * held so we can use them in ip_output() without + * requring a reference to the pcb. + */ + if (inp->inp_options != NULL) { + options = m_dup(inp->inp_options, M_NOWAIT); + if (options == NULL) { + INP_RUNLOCK(inp); + error = ENOBUFS; + goto cantsend; } + } + INP_RUNLOCK(inp); + + switch (ip->ip_v) { + case IPVERSION: error = ip_output(m, options, NULL, - ((so->so_options & SO_DONTROUTE) ? - IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST | - IP_RAWOUTPUT, NULL, NULL); - if (options != NULL) - m_freem(options); + ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) + | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL); + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); + break; +#endif } + if (options != NULL) + m_freem(options); } else { dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN; if (m->m_pkthdr.rcvif == NULL) { @@ -479,14 +496,26 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, mac_socket_create_mbuf(so, m); #endif /* Send packet to input processing via netisr */ - netisr_queue_src(NETISR_IP, (uintptr_t)so, m); + switch (ip->ip_v) { + case IPVERSION: + netisr_queue_src(NETISR_IP, (uintptr_t)so, m); + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m); + break; +#endif + default: + error = EINVAL; + goto cantsend; + } } - return error; + return (error); cantsend: m_freem(m); - return error; + return (error); } static int @@ -554,7 +583,9 @@ div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_divcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_divcbinfo); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_divcbinfo); return error; @@ -683,9 +714,9 @@ div_pcblist(SYSCTL_HANDLER_ARGS) INP_INFO_WLOCK(&V_divcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_divcbinfo); @@ -709,7 +740,8 @@ div_pcblist(SYSCTL_HANDLER_ARGS) } #ifdef SYSCTL_NODE -SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT"); +static SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, + "IPDIVERT"); SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); #endif diff --git a/freebsd/sys/netinet/ip_dummynet.h b/freebsd/sys/netinet/ip_dummynet.h index 0bbc3263..dc2c3412 100644 --- a/freebsd/sys/netinet/ip_dummynet.h +++ b/freebsd/sys/netinet/ip_dummynet.h @@ -87,14 +87,14 @@ enum { DN_SYSCTL_SET, DN_LAST, -} ; +}; enum { /* subtype for schedulers, flowset and the like */ DN_SCHED_UNKNOWN = 0, DN_SCHED_FIFO = 1, DN_SCHED_WF2QP = 2, /* others are in individual modules */ -} ; +}; enum { /* user flags */ DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */ @@ -113,16 +113,16 @@ enum { /* user flags */ struct dn_link { struct dn_id oid; - /* + /* * Userland sets bw and delay in bits/s and milliseconds. * The kernel converts this back and forth to bits/tick and ticks. * XXX what about burst ? - */ + */ int32_t link_nr; int bandwidth; /* bit/s or bits/tick. */ int delay; /* ms and ticks */ uint64_t burst; /* scaled. bits*Hz XXX */ -} ; +}; /* * A flowset, which is a template for flows. Contains parameters @@ -132,13 +132,13 @@ struct dn_link { */ struct dn_fs { struct dn_id oid; - uint32_t fs_nr; /* the flowset number */ - uint32_t flags; /* userland flags */ - int qsize ; /* queue size in slots or bytes */ - int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */ + uint32_t fs_nr; /* the flowset number */ + uint32_t flags; /* userland flags */ + int qsize; /* queue size in slots or bytes */ + int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */ uint32_t buckets; /* buckets used for the queue hash table */ - struct ipfw_flow_id flow_mask ; + struct ipfw_flow_id flow_mask; uint32_t sched_nr; /* the scheduler we attach to */ /* generic scheduler parameters. Leave them at -1 if unset. * Now we use 0: weight, 1: lmax, 2: priority @@ -149,14 +149,14 @@ struct dn_fs { * weight and probabilities are in the range 0..1 represented * in fixed point arithmetic with SCALE_RED decimal bits. */ -#define SCALE_RED 16 -#define SCALE(x) ( (x) << SCALE_RED ) -#define SCALE_VAL(x) ( (x) >> SCALE_RED ) -#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) - int w_q ; /* queue weight (scaled) */ - int max_th ; /* maximum threshold for queue (scaled) */ - int min_th ; /* minimum threshold for queue (scaled) */ - int max_p ; /* maximum value for p_b (scaled) */ +#define SCALE_RED 16 +#define SCALE(x) ( (x) << SCALE_RED ) +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ }; @@ -177,10 +177,10 @@ struct dn_flow { }; - /* +/* * Scheduler template, mostly indicating the name, number, * sched_mask and buckets. - */ + */ struct dn_sch { struct dn_id oid; uint32_t sched_nr; /* N, scheduler number */ @@ -199,14 +199,14 @@ struct dn_sch { #define ED_MAX_SAMPLES_NO 1024 struct dn_profile { struct dn_id oid; - /* fields to simulate a delay profile */ + /* fields to simulate a delay profile */ #define ED_MAX_NAME_LEN 32 - char name[ED_MAX_NAME_LEN]; - int link_nr; - int loss_level; - int bandwidth; // XXX use link bandwidth? - int samples_no; /* actual length of samples[] */ - int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */ + char name[ED_MAX_NAME_LEN]; + int link_nr; + int loss_level; + int bandwidth; // XXX use link bandwidth? + int samples_no; /* actual len of samples[] */ + int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */ }; diff --git a/freebsd/sys/netinet/ip_fastfwd.c b/freebsd/sys/netinet/ip_fastfwd.c index 43f10ef9..863b9a16 100644 --- a/freebsd/sys/netinet/ip_fastfwd.c +++ b/freebsd/sys/netinet/ip_fastfwd.c @@ -153,8 +153,8 @@ ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m) /* * Try to forward a packet based on the destination address. * This is a fast path optimized for the plain forwarding case. - * If the packet is handled (and consumed) here then we return 1; - * otherwise 0 is returned and the packet should be delivered + * If the packet is handled (and consumed) here then we return NULL; + * otherwise mbuf is returned and the packet should be delivered * to ip_input for full processing. */ struct mbuf * @@ -169,9 +169,7 @@ ip_fastforward(struct mbuf *m) u_short sum, ip_len; int error = 0; int hlen, mtu; -#ifdef IPFIREWALL_FORWARD - struct m_tag *fwd_tag; -#endif + struct m_tag *fwd_tag = NULL; /* * Are we active and forwarding packets? @@ -380,14 +378,13 @@ ip_fastforward(struct mbuf *m) * Go on with new destination address */ } -#ifdef IPFIREWALL_FORWARD + if (m->m_flags & M_FASTFWD_OURS) { /* * ipfw changed it for a local address on this host. */ goto forwardlocal; } -#endif /* IPFIREWALL_FORWARD */ passin: /* @@ -457,20 +454,13 @@ passin: /* * Destination address changed? */ -#ifndef IPFIREWALL_FORWARD - if (odest.s_addr != dest.s_addr) { -#else - fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); + if (m->m_flags & M_IP_NEXTHOP) + fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (odest.s_addr != dest.s_addr || fwd_tag != NULL) { -#endif /* IPFIREWALL_FORWARD */ /* * Is it now for a local address on this host? */ -#ifndef IPFIREWALL_FORWARD - if (in_localip(dest)) { -#else if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) { -#endif /* IPFIREWALL_FORWARD */ forwardlocal: /* * Return packet for processing by ip_input(). @@ -485,13 +475,12 @@ forwardlocal: /* * Redo route lookup with new destination address */ -#ifdef IPFIREWALL_FORWARD if (fwd_tag) { dest.s_addr = ((struct sockaddr_in *) (fwd_tag + 1))->sin_addr.s_addr; m_tag_delete(m, fwd_tag); + m->m_flags &= ~M_IP_NEXTHOP; } -#endif /* IPFIREWALL_FORWARD */ RTFREE(ro.ro_rt); if ((dst = ip_findroute(&ro, dest, m)) == NULL) return NULL; /* icmp unreach already sent */ diff --git a/freebsd/sys/netinet/ip_fw.h b/freebsd/sys/netinet/ip_fw.h index 69311a79..14b08f5e 100644 --- a/freebsd/sys/netinet/ip_fw.h +++ b/freebsd/sys/netinet/ip_fw.h @@ -211,12 +211,20 @@ enum ipfw_opcodes { /* arguments (4 byte each) */ O_SETFIB, /* arg1=FIB number */ O_FIB, /* arg1=FIB desired fib number */ + + O_SOCKARG, /* socket argument */ O_CALLRETURN, /* arg1=called rule number */ + O_FORWARD_IP6, /* fwd sockaddr_in6 */ + + O_DSCP, /* 2 u32 = DSCP mask */ + O_SETDSCP, /* arg1=DSCP value */ + O_LAST_OPCODE /* not an opcode! */ }; + /* * The extension header are filtered only for presence using a bit * vector with a flag for each header. @@ -309,6 +317,14 @@ typedef struct _ipfw_insn_sa { } ipfw_insn_sa; /* + * This is used to forward to a given address (ipv6). + */ +typedef struct _ipfw_insn_sa6 { + ipfw_insn o; + struct sockaddr_in6 sa; +} ipfw_insn_sa6; + +/* * This is used for MAC addr-mask pairs. */ typedef struct _ipfw_insn_mac { diff --git a/freebsd/sys/netinet/ip_gre.c b/freebsd/sys/netinet/ip_gre.c index 0fc1770f..25c9698e 100644 --- a/freebsd/sys/netinet/ip_gre.c +++ b/freebsd/sys/netinet/ip_gre.c @@ -19,13 +19,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the NetBSD - * Foundation, Inc. and its contributors. - * 4. Neither the name of The NetBSD Foundation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED @@ -214,6 +207,11 @@ gre_input2(struct mbuf *m ,int hlen, u_char proto) bpf_mtap2(GRE2IFP(sc)->if_bpf, &af, sizeof(af), m); } + if ((GRE2IFP(sc)->if_flags & IFF_MONITOR) != 0) { + m_freem(m); + return(NULL); + } + m->m_pkthdr.rcvif = GRE2IFP(sc); netisr_queue(isr, m); @@ -298,6 +296,11 @@ gre_mobile_input(struct mbuf *m, int hlen) bpf_mtap2(GRE2IFP(sc)->if_bpf, &af, sizeof(af), m); } + if ((GRE2IFP(sc)->if_flags & IFF_MONITOR) != 0) { + m_freem(m); + return; + } + m->m_pkthdr.rcvif = GRE2IFP(sc); netisr_queue(NETISR_IP, m); diff --git a/freebsd/sys/netinet/ip_gre.h b/freebsd/sys/netinet/ip_gre.h index 1fb67d93..d2f3866a 100644 --- a/freebsd/sys/netinet/ip_gre.h +++ b/freebsd/sys/netinet/ip_gre.h @@ -16,13 +16,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the NetBSD - * Foundation, Inc. and its contributors. - * 4. Neither the name of The NetBSD Foundation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED diff --git a/freebsd/sys/netinet/ip_icmp.c b/freebsd/sys/netinet/ip_icmp.c index 728e57ec..b003d03f 100644 --- a/freebsd/sys/netinet/ip_icmp.c +++ b/freebsd/sys/netinet/ip_icmp.c @@ -34,6 +34,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_ipsec.h> #include <rtems/bsd/sys/param.h> @@ -64,6 +65,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcpip.h> #include <netinet/icmp_var.h> +#ifdef INET #ifdef IPSEC #include <netipsec/ipsec.h> #include <netipsec/key.h> @@ -72,12 +74,26 @@ __FBSDID("$FreeBSD$"); #include <machine/in_cksum.h> #include <security/mac/mac_framework.h> +#endif /* INET */ /* * ICMP routines: error generation, receive packet processing, and * routines to turnaround packets back to the originator, and * host table maintenance routines. */ +static VNET_DEFINE(int, icmplim) = 200; +#define V_icmplim VNET(icmplim) +SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, + &VNET_NAME(icmplim), 0, + "Maximum number of ICMP responses per second"); + +static VNET_DEFINE(int, icmplim_output) = 1; +#define V_icmplim_output VNET(icmplim_output) +SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW, + &VNET_NAME(icmplim_output), 0, + "Enable rate limiting of ICMP responses"); + +#ifdef INET VNET_DEFINE(struct icmpstat, icmpstat); SYSCTL_VNET_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW, &VNET_NAME(icmpstat), icmpstat, ""); @@ -102,18 +118,6 @@ SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, &VNET_NAME(log_redirect), 0, "Log ICMP redirects to the console"); -static VNET_DEFINE(int, icmplim) = 200; -#define V_icmplim VNET(icmplim) -SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, - &VNET_NAME(icmplim), 0, - "Maximum number of ICMP responses per second"); - -static VNET_DEFINE(int, icmplim_output) = 1; -#define V_icmplim_output VNET(icmplim_output) -SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW, - &VNET_NAME(icmplim_output), 0, - "Enable rate limiting of ICMP responses"); - static VNET_DEFINE(char, reply_src[IFNAMSIZ]); #define V_reply_src VNET(reply_src) SYSCTL_VNET_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_RW, @@ -702,6 +706,8 @@ icmp_reflect(struct mbuf *m) goto done; /* Ip_output() will check for broadcast */ } + m_addr_changed(m); + t = ip->ip_dst; ip->ip_dst = ip->ip_src; @@ -953,6 +959,7 @@ ip_next_mtu(int mtu, int dir) } return 0; } +#endif /* INET */ /* diff --git a/freebsd/sys/netinet/ip_input.c b/freebsd/sys/netinet/ip_input.c index b1154c79..2dbb2a7a 100644 --- a/freebsd/sys/netinet/ip_input.c +++ b/freebsd/sys/netinet/ip_input.c @@ -42,7 +42,6 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/sys/param.h> #include <sys/systm.h> -#include <sys/callout.h> #include <sys/mbuf.h> #include <sys/malloc.h> #include <sys/domain.h> @@ -104,11 +103,6 @@ SYSCTL_VNET_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); -VNET_DEFINE(int, ip_defttl) = IPDEFTTL; -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, - &VNET_NAME(ip_defttl), 0, - "Maximum TTL on IP packets"); - static VNET_DEFINE(int, ip_keepfaith); #define V_ip_keepfaith VNET(ip_keepfaith) SYSCTL_VNET_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, @@ -196,8 +190,6 @@ SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, &VNET_NAME(maxfragsperpacket), 0, "Maximum number of IPv4 fragments allowed per packet"); -struct callout ipport_tick_callout; - #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); @@ -220,8 +212,6 @@ SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN, "number of entries in the per-cpu output flow caches"); #endif -VNET_DEFINE(int, fw_one_pass) = 1; - static void ip_freef(struct ipqhead *, struct ipq *); /* @@ -356,11 +346,6 @@ ip_init(void) ip_protox[pr->pr_protocol] = pr - inetsw; } - /* Start ipport_tick. */ - callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); - callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); - EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, - SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change, NULL, EVENTHANDLER_PRI_ANY); @@ -385,13 +370,6 @@ ip_destroy(void) } #endif -void -ip_fini(void *xtp) -{ - - callout_stop(&ipport_tick_callout); -} - /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. @@ -540,22 +518,22 @@ tooshort: dchg = (odst.s_addr != ip->ip_dst.s_addr); ifp = m->m_pkthdr.rcvif; -#ifdef IPFIREWALL_FORWARD if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } - if ((dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL)) != 0) { - /* - * Directly ship the packet on. This allows forwarding - * packets originally destined to us to some other directly - * connected host. - */ - ip_forward(m, dchg); - return; + if (m->m_flags & M_IP_NEXTHOP) { + dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL); + if (dchg != 0) { + /* + * Directly ship the packet on. This allows + * forwarding packets originally destined to us + * to some other directly connected host. + */ + ip_forward(m, 1); + return; + } } -#endif /* IPFIREWALL_FORWARD */ - passin: /* * Process options and, if not destined for us, @@ -646,11 +624,6 @@ passin: IF_ADDR_RUNLOCK(ifp); goto ours; } - if (ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) { - ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); - goto ours; - } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { ifa_ref(ifa); @@ -1524,8 +1497,7 @@ ip_forward(struct mbuf *m, int srcrt) if (error == EMSGSIZE && ro.ro_rt) mtu = ro.ro_rt->rt_rmx.rmx_mtu; - if (ro.ro_rt) - RTFREE(ro.ro_rt); + RO_RTFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); diff --git a/freebsd/sys/netinet/ip_ipsec.c b/freebsd/sys/netinet/ip_ipsec.c index 35ea9cd5..f3516f1c 100644 --- a/freebsd/sys/netinet/ip_ipsec.c +++ b/freebsd/sys/netinet/ip_ipsec.c @@ -262,8 +262,7 @@ ip_ipsec_mtu(struct mbuf *m, int mtu) * -1 = packet was reinjected and stop processing packet */ int -ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error, - struct ifnet **ifp) +ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error) { #ifdef IPSEC struct secpolicy *sp = NULL; @@ -392,20 +391,6 @@ ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error, } else { /* No IPsec processing for this packet. */ } -#ifdef notyet - /* - * If deferred crypto processing is needed, check that - * the interface supports it. - */ - mtag = m_tag_find(*m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL); - if (mtag != NULL && ifp != NULL && - ((*ifp)->if_capenable & IFCAP_IPSEC) == 0) { - /* notify IPsec to do its own crypto */ - ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1)); - *error = EHOSTUNREACH; - goto bad; - } -#endif } done: if (sp != NULL) diff --git a/freebsd/sys/netinet/ip_ipsec.h b/freebsd/sys/netinet/ip_ipsec.h index 31bc86a1..2870c114 100644 --- a/freebsd/sys/netinet/ip_ipsec.h +++ b/freebsd/sys/netinet/ip_ipsec.h @@ -36,6 +36,5 @@ int ip_ipsec_filtertunnel(struct mbuf *); int ip_ipsec_fwd(struct mbuf *); int ip_ipsec_input(struct mbuf *); int ip_ipsec_mtu(struct mbuf *, int); -int ip_ipsec_output(struct mbuf **, struct inpcb *, int *, int *, - struct ifnet **); +int ip_ipsec_output(struct mbuf **, struct inpcb *, int *, int *); #endif diff --git a/freebsd/sys/netinet/ip_mroute.c b/freebsd/sys/netinet/ip_mroute.c index 18419a74..6fc5cc68 100644 --- a/freebsd/sys/netinet/ip_mroute.c +++ b/freebsd/sys/netinet/ip_mroute.c @@ -116,8 +116,6 @@ __FBSDID("$FreeBSD$"); #include <machine/in_cksum.h> -#include <security/mac/mac_framework.h> - #ifndef KTR_IPMF #define KTR_IPMF KTR_INET #endif @@ -928,7 +926,6 @@ add_vif(struct vifctl *vifcp) vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; - bzero(&vifp->v_route, sizeof(vifp->v_route)); /* Adjust numvifs up if the vifi is higher than numvifs */ if (V_numvifs <= vifcp->vifc_vifi) @@ -1036,6 +1033,8 @@ expire_mfc(struct mfc *rt) { struct rtdetq *rte, *nrte; + MFC_LOCK_ASSERT(); + free_bw_list(rt->mfc_bw_meter); TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { @@ -1704,7 +1703,7 @@ send_packet(struct vif *vifp, struct mbuf *m) * should get rejected because they appear to come from * the loopback interface, thus preventing looping. */ - error = ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, &imo, NULL); + error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL); CTR3(KTR_IPMF, "%s: vif %td err %d", __func__, (ptrdiff_t)(vifp - V_viftable), error); } @@ -2809,9 +2808,9 @@ out_locked: return (error); } -SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, sysctl_mfctable, - "IPv4 Multicast Forwarding Table (struct *mfc[mfchashsize], " - "netinet/ip_mroute.h)"); +static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, + sysctl_mfctable, "IPv4 Multicast Forwarding Table " + "(struct *mfc[mfchashsize], netinet/ip_mroute.h)"); static void vnet_mroute_init(const void *unused __unused) diff --git a/freebsd/sys/netinet/ip_mroute.h b/freebsd/sys/netinet/ip_mroute.h index c54c75aa..e945b92c 100644 --- a/freebsd/sys/netinet/ip_mroute.h +++ b/freebsd/sys/netinet/ip_mroute.h @@ -262,7 +262,6 @@ struct vif { u_long v_pkt_out; /* # pkts out on interface */ u_long v_bytes_in; /* # bytes in on interface */ u_long v_bytes_out; /* # bytes out on interface */ - struct route v_route; /* cached route */ }; #ifdef _KERNEL diff --git a/freebsd/sys/netinet/ip_options.c b/freebsd/sys/netinet/ip_options.c index 7b190bfd..98a8a2df 100644 --- a/freebsd/sys/netinet/ip_options.c +++ b/freebsd/sys/netinet/ip_options.c @@ -67,8 +67,6 @@ __FBSDID("$FreeBSD$"); #include <sys/socketvar.h> -#include <security/mac/mac_framework.h> - static int ip_dosourceroute = 0; SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, &ip_dosourceroute, 0, "Enable forwarding source routed IP packets"); diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c index 02dc7bdb..a70d3142 100644 --- a/freebsd/sys/netinet/ip_output.c +++ b/freebsd/sys/netinet/ip_output.c @@ -86,12 +86,6 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> -#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ - x, (ntohl(a.s_addr)>>24)&0xFF,\ - (ntohl(a.s_addr)>>16)&0xFF,\ - (ntohl(a.s_addr)>>8)&0xFF,\ - (ntohl(a.s_addr))&0xFF, y); - VNET_DEFINE(u_short, ip_id); #ifdef MBUF_STRESS_TEST @@ -110,8 +104,13 @@ extern struct protosw inetsw[]; /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). + * ip_len and ip_off are in host format. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. + * If route ro is present and has ro_rt initialized, route lookup would be + * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, + * then result of route lookup is stored in ro->ro_rt. + * * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ @@ -124,17 +123,15 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; - int len, error = 0; - int nortfree = 0; - struct sockaddr_in *dst = NULL; /* keep compiler happy */ - struct in_ifaddr *ia = NULL; + int n; /* scratchpad */ + int error = 0; + struct sockaddr_in *dst; + struct in_ifaddr *ia; int isbroadcast, sw_csum; struct route iproute; struct rtentry *rte; /* cache for ro->ro_rt */ struct in_addr odst; -#ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag = NULL; -#endif #ifdef IPSEC int no_route_but_check_spd = 0; #endif @@ -152,30 +149,29 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); + } #ifdef FLOWTABLE - { - struct flentry *fle; + if (ro->ro_rt == NULL) { + struct flentry *fle; - /* - * The flow table returns route entries valid for up to 30 - * seconds; we rely on the remainder of ip_output() taking no - * longer than that long for the stability of ro_rt. The - * flow ID assignment must have happened before this point. - */ - if ((fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET)) != NULL) { - flow_to_route(fle, ro); - nortfree = 1; - } - } -#endif + /* + * The flow table returns route entries valid for up to 30 + * seconds; we rely on the remainder of ip_output() taking no + * longer than that long for the stability of ro_rt. The + * flow ID assignment must have happened before this point. + */ + fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET); + if (fle != NULL) + flow_to_route(fle, ro); } +#endif if (opt) { - len = 0; + int len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) - hlen = len; + hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); @@ -196,11 +192,13 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, ip->ip_id = ip_newid(); IPSTAT_INC(ips_localout); } else { + /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } again: dst = (struct sockaddr_in *)&ro->ro_dst; + ia = NULL; /* * If there is a cached route, * check that it is to the same destination @@ -214,16 +212,11 @@ again: !RT_LINK_IS_UP(rte->rt_ifp) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { - if (!nortfree) - RTFREE(rte); - rte = ro->ro_rt = (struct rtentry *)NULL; - ro->ro_lle = (struct llentry *)NULL; + RO_RTFREE(ro); + ro->ro_lle = NULL; + rte = NULL; } -#ifdef IPFIREWALL_FORWARD if (rte == NULL && fwd_tag == NULL) { -#else - if (rte == NULL) { -#endif bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); @@ -328,6 +321,9 @@ again: } else { mtu = ifp->if_mtu; } + /* Catch a possible divide by zero later. */ + KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p", + __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* @@ -441,18 +437,15 @@ again: * packet or packet fragments, unless ALTQ is enabled on the given * interface in which case packetdrop should be done by queueing. */ + n = ip->ip_len / mtu + 1; /* how many fragments ? */ + if ( #ifdef ALTQ - if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && - ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= - ifp->if_snd.ifq_maxlen)) -#else - if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= - ifp->if_snd.ifq_maxlen) + (!ALTQ_IS_ENABLED(&ifp->if_snd)) && #endif /* ALTQ */ - { + (ifp->if_snd.ifq_len + n) >= ifp->if_snd.ifq_maxlen ) { error = ENOBUFS; IPSTAT_INC(ips_odropped); - ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); + ifp->if_snd.ifq_drops += n; goto bad; } @@ -482,7 +475,7 @@ again: sendit: #ifdef IPSEC - switch(ip_ipsec_output(&m, inp, &flags, &error, &ifp)) { + switch(ip_ipsec_output(&m, inp, &flags, &error)) { case 1: goto bad; case -1: @@ -537,11 +530,13 @@ sendit: #endif error = netisr_queue(NETISR_IP, m); goto done; - } else + } else { + if (ia != NULL) + ifa_free(&ia->ia_ifa); goto again; /* Redo the routing table lookup. */ + } } -#ifdef IPFIREWALL_FORWARD /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) @@ -562,15 +557,17 @@ sendit: goto done; } /* Or forward to some other address? */ - fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); - if (fwd_tag) { + if ((m->m_flags & M_IP_NEXTHOP) && + (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { dst = (struct sockaddr_in *)&ro->ro_dst; bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; + m->m_flags &= ~M_IP_NEXTHOP; m_tag_delete(m, fwd_tag); + if (ia != NULL) + ifa_free(&ia->ia_ifa); goto again; } -#endif /* IPFIREWALL_FORWARD */ passout: /* 127/8 must not appear on wire - RFC1122. */ @@ -677,9 +674,8 @@ passout: IPSTAT_INC(ips_fragmented); done: - if (ro == &iproute && ro->ro_rt && !nortfree) { - RTFREE(ro->ro_rt); - } + if (ro == &iproute) + RO_RTFREE(ro); if (ia != NULL) ifa_free(&ia->ia_ifa); return (error); @@ -725,14 +721,12 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, * If the interface will not calculate checksums on * fragmented packets, then do it here. */ - if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && - (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { + if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP - if (m0->m_pkthdr.csum_flags & CSUM_SCTP && - (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { + if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } @@ -900,12 +894,40 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { - if ((sopt->sopt_level == SOL_SOCKET) && - (sopt->sopt_name == SO_SETFIB)) { - inp->inp_inc.inc_fibnum = so->so_fibnum; - return (0); + error = EINVAL; + + if (sopt->sopt_level == SOL_SOCKET && + sopt->sopt_dir == SOPT_SET) { + switch (sopt->sopt_name) { + case SO_REUSEADDR: + INP_WLOCK(inp); + if ((so->so_options & SO_REUSEADDR) != 0) + inp->inp_flags2 |= INP_REUSEADDR; + else + inp->inp_flags2 &= ~INP_REUSEADDR; + INP_WUNLOCK(inp); + error = 0; + break; + case SO_REUSEPORT: + INP_WLOCK(inp); + if ((so->so_options & SO_REUSEPORT) != 0) + inp->inp_flags2 |= INP_REUSEPORT; + else + inp->inp_flags2 &= ~INP_REUSEPORT; + INP_WUNLOCK(inp); + error = 0; + break; + case SO_SETFIB: + INP_WLOCK(inp); + inp->inp_inc.inc_fibnum = so->so_fibnum; + INP_WUNLOCK(inp); + error = 0; + break; + default: + break; + } } - return (EINVAL); + return (error); } switch (sopt->sopt_dir) { diff --git a/freebsd/sys/netinet/ip_var.h b/freebsd/sys/netinet/ip_var.h index d196fd04..b07ef162 100644 --- a/freebsd/sys/netinet/ip_var.h +++ b/freebsd/sys/netinet/ip_var.h @@ -162,6 +162,7 @@ void kmod_ipstat_dec(int statnum); * mbuf flag used by ip_fastfwd */ #define M_FASTFWD_OURS M_PROTO1 /* changed dst to local */ +#define M_IP_NEXTHOP M_PROTO2 /* explicit ip nexthop */ #ifdef __NO_STRICT_ALIGNMENT #define IP_HDR_ALIGNED_P(ip) 1 @@ -208,7 +209,6 @@ int inp_setmoptions(struct inpcb *, struct sockopt *); int ip_ctloutput(struct socket *, struct sockopt *sopt); void ip_drain(void); -void ip_fini(void *xtp); int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags, int sw_csum); void ip_forward(struct mbuf *m, int srcrt); diff --git a/freebsd/sys/netinet/libalias/alias.h b/freebsd/sys/netinet/libalias/alias.h index b2615c90..b12b353a 100644 --- a/freebsd/sys/netinet/libalias/alias.h +++ b/freebsd/sys/netinet/libalias/alias.h @@ -197,17 +197,6 @@ struct mbuf *m_megapullup(struct mbuf *, int); */ #define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20 -#ifndef NO_FW_PUNCH -/* - * If PKT_ALIAS_PUNCH_FW is set, active FTP and IRC DCC connections will - * create a 'hole' in the firewall to allow the transfers to work. The - * ipfw rule number that the hole is created with is controlled by - * PacketAliasSetFWBase(). The hole will be attached to that - * particular alias_link, so when the link goes away the hole is deleted. - */ -#define PKT_ALIAS_PUNCH_FW 0x100 -#endif - /* * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only * transparent proxying is performed. @@ -220,6 +209,17 @@ struct mbuf *m_megapullup(struct mbuf *, int); */ #define PKT_ALIAS_REVERSE 0x80 +#ifndef NO_FW_PUNCH +/* + * If PKT_ALIAS_PUNCH_FW is set, active FTP and IRC DCC connections will + * create a 'hole' in the firewall to allow the transfers to work. The + * ipfw rule number that the hole is created with is controlled by + * PacketAliasSetFWBase(). The hole will be attached to that + * particular alias_link, so when the link goes away the hole is deleted. + */ +#define PKT_ALIAS_PUNCH_FW 0x100 +#endif + /* * If PKT_ALIAS_SKIP_GLOBAL is set, nat instance is not checked for matching * states in 'ipfw nat global' rule. diff --git a/freebsd/sys/netinet/libalias/alias_db.c b/freebsd/sys/netinet/libalias/alias_db.c index 7385027c..28be85da 100644 --- a/freebsd/sys/netinet/libalias/alias_db.c +++ b/freebsd/sys/netinet/libalias/alias_db.c @@ -2171,7 +2171,6 @@ HouseKeeping(struct libalias *la) int i, n; #ifndef _KERNEL struct timeval tv; - struct timezone tz; #endif LIBALIAS_LOCK_ASSERT(la); @@ -2183,7 +2182,7 @@ HouseKeeping(struct libalias *la) #ifdef _KERNEL la->timeStamp = time_uptime; #else - gettimeofday(&tv, &tz); + gettimeofday(&tv, NULL); la->timeStamp = tv.tv_sec; #endif @@ -2478,7 +2477,6 @@ LibAliasInit(struct libalias *la) int i; #ifndef _KERNEL struct timeval tv; - struct timezone tz; #endif if (la == NULL) { @@ -2505,7 +2503,7 @@ LibAliasInit(struct libalias *la) la->timeStamp = time_uptime; la->lastCleanupTime = time_uptime; #else - gettimeofday(&tv, &tz); + gettimeofday(&tv, NULL); la->timeStamp = tv.tv_sec; la->lastCleanupTime = tv.tv_sec; #endif @@ -2737,7 +2735,6 @@ static void InitPunchFW(struct libalias *la) { - LIBALIAS_LOCK_ASSERT(la); la->fireWallField = malloc(la->fireWallNumNums); if (la->fireWallField) { memset(la->fireWallField, 0, la->fireWallNumNums); @@ -2753,7 +2750,6 @@ static void UninitPunchFW(struct libalias *la) { - LIBALIAS_LOCK_ASSERT(la); ClearAllFWHoles(la); if (la->fireWallFD >= 0) close(la->fireWallFD); @@ -2773,7 +2769,6 @@ PunchFWHole(struct alias_link *lnk) struct ip_fw rule; /* On-the-fly built rule */ int fwhole; /* Where to punch hole */ - LIBALIAS_LOCK_ASSERT(la); la = lnk->la; /* Don't do anything unless we are asked to */ @@ -2847,7 +2842,6 @@ ClearFWHole(struct alias_link *lnk) { struct libalias *la; - LIBALIAS_LOCK_ASSERT(la); la = lnk->la; if (lnk->link_type == LINK_TCP) { int fwhole = lnk->data.tcp->fwhole; /* Where is the firewall @@ -2872,7 +2866,6 @@ ClearAllFWHoles(struct libalias *la) struct ip_fw rule; /* On-the-fly built rule */ int i; - LIBALIAS_LOCK_ASSERT(la); if (la->fireWallFD < 0) return; @@ -2886,7 +2879,7 @@ ClearAllFWHoles(struct libalias *la) memset(la->fireWallField, 0, la->fireWallNumNums); } -#endif +#endif /* !NO_FW_PUNCH */ void LibAliasSetFWBase(struct libalias *la, unsigned int base, unsigned int num) diff --git a/freebsd/sys/netinet/libalias/alias_sctp.c b/freebsd/sys/netinet/libalias/alias_sctp.c index c8d83878..6158149a 100644 --- a/freebsd/sys/netinet/libalias/alias_sctp.c +++ b/freebsd/sys/netinet/libalias/alias_sctp.c @@ -183,7 +183,7 @@ void SctpShowAliasStats(struct libalias *la); #ifdef _KERNEL -MALLOC_DEFINE(M_SCTPNAT, "sctpnat", "sctp nat dbs"); +static MALLOC_DEFINE(M_SCTPNAT, "sctpnat", "sctp nat dbs"); /* Use kernel allocator. */ #ifdef _SYS_MALLOC_H_ #define sn_malloc(x) malloc(x, M_SCTPNAT, M_NOWAIT|M_ZERO) @@ -366,8 +366,8 @@ SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); SYSCTL_DECL(_net_inet_ip_alias); -SYSCTL_NODE(_net_inet_ip_alias, OID_AUTO, sctp, CTLFLAG_RW, NULL, "SCTP NAT"); - +static SYSCTL_NODE(_net_inet_ip_alias, OID_AUTO, sctp, CTLFLAG_RW, NULL, + "SCTP NAT"); SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, log_level, CTLTYPE_UINT | CTLFLAG_RW, &sysctl_log_level, 0, sysctl_chg_loglevel, "IU", "Level of detail (0 - default, 1 - event, 2 - info, 3 - detail, 4 - debug, 5 - max debug)"); diff --git a/freebsd/sys/netinet/libalias/alias_sctp.h b/freebsd/sys/netinet/libalias/alias_sctp.h index 8c244b45..f538d942 100644 --- a/freebsd/sys/netinet/libalias/alias_sctp.h +++ b/freebsd/sys/netinet/libalias/alias_sctp.h @@ -76,7 +76,6 @@ * */ #include <machine/cpufunc.h> -#include <machine/cpu.h> /* The packed define for 64 bit platforms */ #ifndef SCTP_PACKED #define SCTP_PACKED __attribute__((packed)) @@ -136,13 +135,13 @@ struct sctp_nat_assoc { struct in_addr a_addr; /**< alias ip address */ int state; /**< current state of NAT association */ int TableRegister; /**< stores which look up tables association is registered in */ - int exp; /**< timer expiration in seconds from uptime */ + int exp; /**< timer expiration in seconds from uptime */ int exp_loc; /**< current location in timer_Q */ int num_Gaddr; /**< number of global IP addresses in the list */ LIST_HEAD(sctpGlobalAddresshead,sctp_GlobalAddress) Gaddr; /**< List of global addresses */ - LIST_ENTRY (sctp_nat_assoc) list_L; /**< Linked list of pointers for Local table*/ - LIST_ENTRY (sctp_nat_assoc) list_G; /**< Linked list of pointers for Global table */ - LIST_ENTRY (sctp_nat_assoc) timer_Q; /**< Linked list of pointers for timer Q */ + LIST_ENTRY (sctp_nat_assoc) list_L; /**< Linked list of pointers for Local table*/ + LIST_ENTRY (sctp_nat_assoc) list_G; /**< Linked list of pointers for Global table */ + LIST_ENTRY (sctp_nat_assoc) timer_Q; /**< Linked list of pointers for timer Q */ //Using libalias locking }; diff --git a/freebsd/sys/netinet/raw_ip.c b/freebsd/sys/netinet/raw_ip.c index aa6abae9..827eca6e 100644 --- a/freebsd/sys/netinet/raw_ip.c +++ b/freebsd/sys/netinet/raw_ip.c @@ -35,6 +35,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_ipsec.h> @@ -76,6 +77,11 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> +VNET_DEFINE(int, ip_defttl) = IPDEFTTL; +SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, + &VNET_NAME(ip_defttl), 0, + "Maximum TTL on IP packets"); + VNET_DEFINE(struct inpcbhead, ripcb); VNET_DEFINE(struct inpcbinfo, ripcbinfo); @@ -96,6 +102,10 @@ void (*ip_divert_ptr)(struct mbuf *, int); int (*ng_ipfw_input_p)(struct mbuf **, int, struct ip_fw_args *, int); +/* Hook for telling pf that the destination address changed */ +void (*m_addr_chg_pf_p)(struct mbuf *m); + +#ifdef INET /* * Hooks for multicast routing. They all default to NULL, so leave them not * initialized and rely on BSS being set to 0. @@ -121,6 +131,15 @@ u_long (*ip_mcast_src)(int); void (*rsvp_input_p)(struct mbuf *m, int off); int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); +#endif /* INET */ + +u_long rip_sendspace = 9216; +SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, + &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); + +u_long rip_recvspace = 9216; +SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, + &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); /* * Hash functions @@ -130,6 +149,7 @@ void (*ip_rsvp_force_done)(struct socket *); #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ (((proto) + (laddr) + (faddr)) % (mask) + 1) +#ifdef INET static void rip_inshash(struct inpcb *inp) { @@ -160,6 +180,7 @@ rip_delhash(struct inpcb *inp) LIST_REMOVE(inp, inp_hash); } +#endif /* INET */ /* * Raw interface to IP protocol. @@ -188,19 +209,9 @@ void rip_init(void) { - INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip"); - LIST_INIT(&V_ripcb); -#ifdef VIMAGE - V_ripcbinfo.ipi_vnet = curvnet; -#endif - V_ripcbinfo.ipi_listhead = &V_ripcb; - V_ripcbinfo.ipi_hashbase = - hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask); - V_ripcbinfo.ipi_porthashbase = - hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask); - V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), - NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); + in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, + 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_NONE); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -210,20 +221,18 @@ void rip_destroy(void) { - hashdestroy(V_ripcbinfo.ipi_hashbase, M_PCB, - V_ripcbinfo.ipi_hashmask); - hashdestroy(V_ripcbinfo.ipi_porthashbase, M_PCB, - V_ripcbinfo.ipi_porthashmask); + in_pcbinfo_destroy(&V_ripcbinfo); } #endif +#ifdef INET static int rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, struct sockaddr_in *ripsrc) { int policyfail = 0; - INP_RLOCK_ASSERT(last); + INP_LOCK_ASSERT(last); #ifdef IPSEC /* check AH/ESP integrity. */ @@ -771,14 +780,6 @@ rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) } } -u_long rip_sendspace = 9216; -u_long rip_recvspace = 9216; - -SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, - &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); -SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, - &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); - static int rip_attach(struct socket *so, int proto, struct thread *td) { @@ -839,16 +840,19 @@ rip_detach(struct socket *so) static void rip_dodisconnect(struct socket *so, struct inpcb *inp) { + struct inpcbinfo *pcbinfo; - INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); - INP_WLOCK_ASSERT(inp); - + pcbinfo = inp->inp_pcbinfo; + INP_INFO_WLOCK(pcbinfo); + INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr.s_addr = INADDR_ANY; rip_inshash(inp); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(pcbinfo); } static void @@ -859,11 +863,7 @@ rip_abort(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); - INP_INFO_WLOCK(&V_ripcbinfo); - INP_WLOCK(inp); rip_dodisconnect(so, inp); - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); } static void @@ -874,11 +874,7 @@ rip_close(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); - INP_INFO_WLOCK(&V_ripcbinfo); - INP_WLOCK(inp); rip_dodisconnect(so, inp); - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); } static int @@ -892,11 +888,7 @@ rip_disconnect(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); - INP_INFO_WLOCK(&V_ripcbinfo); - INP_WLOCK(inp); rip_dodisconnect(so, inp); - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } @@ -1003,6 +995,7 @@ rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, } return (rip_output(m, so, dst)); } +#endif /* INET */ static int rip_pcblist(SYSCTL_HANDLER_ARGS) @@ -1081,9 +1074,9 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) INP_INFO_WLOCK(&V_ripcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_ripcbinfo); @@ -1109,6 +1102,7 @@ SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); +#ifdef INET struct pr_usrreqs rip_usrreqs = { .pru_abort = rip_abort, .pru_attach = rip_attach, @@ -1124,3 +1118,4 @@ struct pr_usrreqs rip_usrreqs = { .pru_sosetlabel = in_pcbsosetlabel, .pru_close = rip_close, }; +#endif /* INET */ diff --git a/freebsd/sys/netinet/sctp_constants.h b/freebsd/sys/netinet/sctp_constants.h index 18057009..58ca808e 100644 --- a/freebsd/sys/netinet/sctp_constants.h +++ b/freebsd/sys/netinet/sctp_constants.h @@ -521,9 +521,6 @@ __FBSDID("$FreeBSD$"); /* How long a cookie lives in milli-seconds */ #define SCTP_DEFAULT_COOKIE_LIFE 60000 -/* resource limit of streams */ -#define MAX_SCTP_STREAMS 2048 - /* Maximum the mapping array will grow to (TSN mapping array) */ #define SCTP_MAPPING_ARRAY 512 @@ -658,6 +655,7 @@ __FBSDID("$FreeBSD$"); /* How many streams I request initally by default */ #define SCTP_OSTREAM_INITIAL 10 +#define SCTP_ISTREAM_INITIAL 2048 /* * How many smallest_mtu's need to increase before a window update sack is @@ -997,6 +995,10 @@ __FBSDID("$FreeBSD$"); (((uint8_t *)&(a)->s_addr)[2] == 0) && \ (((uint8_t *)&(a)->s_addr)[3] == 1)) +#define IN4_ISLINKLOCAL_ADDRESS(a) \ + ((((uint8_t *)&(a)->s_addr)[0] == 169) && \ + (((uint8_t *)&(a)->s_addr)[1] == 254)) + #if defined(_KERNEL) #define SCTP_GETTIME_TIMEVAL(x) (getmicrouptime(x)) diff --git a/freebsd/sys/netinet/sctp_indata.c b/freebsd/sys/netinet/sctp_indata.c index 273ad6bc..e00a470d 100644 --- a/freebsd/sys/netinet/sctp_indata.c +++ b/freebsd/sys/netinet/sctp_indata.c @@ -1731,7 +1731,6 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, sctp_alloc_a_readq(stcb, control); sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn, protocol_id, - stcb->asoc.context, strmno, strmseq, chunk_flags, dmbuf); @@ -1859,7 +1858,6 @@ failed_pdapi_express_del: sctp_alloc_a_readq(stcb, control); sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn, protocol_id, - stcb->asoc.context, strmno, strmseq, chunk_flags, dmbuf); diff --git a/freebsd/sys/netinet/sctp_indata.h b/freebsd/sys/netinet/sctp_indata.h index 5eaa1f4b..79a86e2a 100644 --- a/freebsd/sys/netinet/sctp_indata.h +++ b/freebsd/sys/netinet/sctp_indata.h @@ -47,14 +47,14 @@ sctp_build_readq_entry(struct sctp_tcb *stcb, struct mbuf *dm); -#define sctp_build_readq_entry_mac(_ctl, in_it, a, net, tsn, ppid, context, stream_no, stream_seq, flags, dm) do { \ +#define sctp_build_readq_entry_mac(_ctl, in_it, context, net, tsn, ppid, stream_no, stream_seq, flags, dm) do { \ if (_ctl) { \ atomic_add_int(&((net)->ref_count), 1); \ (_ctl)->sinfo_stream = stream_no; \ (_ctl)->sinfo_ssn = stream_seq; \ (_ctl)->sinfo_flags = (flags << 8); \ (_ctl)->sinfo_ppid = ppid; \ - (_ctl)->sinfo_context = a; \ + (_ctl)->sinfo_context = context; \ (_ctl)->sinfo_timetolive = 0; \ (_ctl)->sinfo_tsn = tsn; \ (_ctl)->sinfo_cumtsn = tsn; \ diff --git a/freebsd/sys/netinet/sctp_input.c b/freebsd/sys/netinet/sctp_input.c index 645c807e..7cdb5b09 100644 --- a/freebsd/sys/netinet/sctp_input.c +++ b/freebsd/sys/netinet/sctp_input.c @@ -391,9 +391,10 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb) } SCTP_FREE(asoc->strmin, SCTP_M_STRMI); } - asoc->streamincnt = ntohs(init->num_outbound_streams); - if (asoc->streamincnt > MAX_SCTP_STREAMS) { - asoc->streamincnt = MAX_SCTP_STREAMS; + if (asoc->max_inbound_streams > ntohs(init->num_outbound_streams)) { + asoc->streamincnt = ntohs(init->num_outbound_streams); + } else { + asoc->streamincnt = asoc->max_inbound_streams; } SCTP_MALLOC(asoc->strmin, struct sctp_stream_in *, asoc->streamincnt * sizeof(struct sctp_stream_in), SCTP_M_STRMI); @@ -405,11 +406,6 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb) for (i = 0; i < asoc->streamincnt; i++) { asoc->strmin[i].stream_no = i; asoc->strmin[i].last_sequence_delivered = 0xffff; - /* - * U-stream ranges will be set when the cookie is unpacked. - * Or for the INIT sender they are un set (if pr-sctp not - * supported) when the INIT-ACK arrives. - */ TAILQ_INIT(&asoc->strmin[i].inqueue); asoc->strmin[i].delivery_started = 0; } @@ -1030,12 +1026,13 @@ sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED, SCTP_SOCKET_UNLOCK(so, 1); #endif } - /* are the queues empty? */ +#ifdef INVARIANTS if (!TAILQ_EMPTY(&asoc->send_queue) || !TAILQ_EMPTY(&asoc->sent_queue) || !stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { - sctp_report_all_outbound(stcb, 0, 0, SCTP_SO_NOT_LOCKED); + panic("Queues are not empty when handling SHUTDOWN-ACK"); } +#endif /* stop the timer */ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_9); /* send SHUTDOWN-COMPLETE */ @@ -1877,9 +1874,14 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, cookie->tie_tag_peer_vtag != 0) { struct sctpasochead *head; +#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + if (asoc->peer_supports_nat) { /* - * This is a gross gross hack. just call the + * This is a gross gross hack. Just call the * cookie_new code since we are allowing a duplicate * association. I hope this works... */ @@ -1941,6 +1943,10 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, asoc->mapping_array_size); } SCTP_TCB_UNLOCK(stcb); +#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(stcb->sctp_ep); + SCTP_SOCKET_LOCK(so, 1); +#endif SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(stcb->sctp_ep); SCTP_TCB_LOCK(stcb); @@ -1948,7 +1954,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, /* send up all the data */ SCTP_TCB_SEND_LOCK(stcb); - sctp_report_all_outbound(stcb, 0, 1, SCTP_SO_NOT_LOCKED); + sctp_report_all_outbound(stcb, 0, 1, SCTP_SO_LOCKED); for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].chunks_on_queues = 0; stcb->asoc.strmout[i].stream_no = i; @@ -1970,11 +1976,15 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); - /* process the INIT info (peer's info) */ SCTP_TCB_SEND_UNLOCK(stcb); SCTP_INP_WUNLOCK(stcb->sctp_ep); SCTP_INP_INFO_WUNLOCK(); - +#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + asoc->total_flight = 0; + asoc->total_flight_count = 0; + /* process the INIT info (peer's info) */ retval = sctp_process_init(init_cp, stcb); if (retval < 0) { if (how_indx < sizeof(asoc->cookie_how)) @@ -3198,13 +3208,14 @@ sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp SCTP_UNUSE /* notify upper layer protocol */ if (stcb->sctp_socket) { sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); - /* are the queues empty? they should be */ - if (!TAILQ_EMPTY(&asoc->send_queue) || - !TAILQ_EMPTY(&asoc->sent_queue) || - !stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { - sctp_report_all_outbound(stcb, 0, 0, SCTP_SO_NOT_LOCKED); - } } +#ifdef INVARIANTS + if (!TAILQ_EMPTY(&asoc->send_queue) || + !TAILQ_EMPTY(&asoc->sent_queue) || + !stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { + panic("Queues are not empty when handling SHUTDOWN-COMPLETE"); + } +#endif /* stop the timer */ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_22); SCTP_STAT_INCR_COUNTER32(sctps_shutdown); @@ -3493,18 +3504,13 @@ sctp_reset_in_stream(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t * } static void -sctp_reset_out_streams(struct sctp_tcb *stcb, int number_entries, uint16_t * list) +sctp_reset_out_streams(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t * list) { - int i; + uint32_t i; + uint16_t temp; - if (number_entries == 0) { - for (i = 0; i < stcb->asoc.streamoutcnt; i++) { - stcb->asoc.strmout[i].next_sequence_send = 0; - } - } else if (number_entries) { + if (number_entries > 0) { for (i = 0; i < number_entries; i++) { - uint16_t temp; - temp = ntohs(list[i]); if (temp >= stcb->asoc.streamoutcnt) { /* no such stream */ @@ -3512,6 +3518,10 @@ sctp_reset_out_streams(struct sctp_tcb *stcb, int number_entries, uint16_t * lis } stcb->asoc.strmout[temp].next_sequence_send = 0; } + } else { + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + stcb->asoc.strmout[i].next_sequence_send = 0; + } } sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_SEND, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED); } @@ -3598,7 +3608,7 @@ sctp_handle_stream_reset_response(struct sctp_tcb *stcb, struct sctp_association *asoc = &stcb->asoc; struct sctp_tmit_chunk *chk; struct sctp_stream_reset_out_request *srparam; - int number_entries; + uint32_t number_entries; if (asoc->stream_reset_outstanding == 0) { /* duplicate */ @@ -4556,8 +4566,10 @@ __attribute__((noinline)) if ((ch->chunk_type == SCTP_ABORT_ASSOCIATION) || (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) || (ch->chunk_type == SCTP_PACKET_DROPPED)) { - if ((vtag_in == asoc->my_vtag) || - ((ch->chunk_flags & SCTP_HAD_NO_TCB) && + /* Take the T-bit always into account. */ + if ((((ch->chunk_flags & SCTP_HAD_NO_TCB) == 0) && + (vtag_in == asoc->my_vtag)) || + (((ch->chunk_flags & SCTP_HAD_NO_TCB) == SCTP_HAD_NO_TCB) && (vtag_in == asoc->peer_vtag))) { /* this is valid */ } else { @@ -5695,7 +5707,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt #ifdef INET case AF_INET: if (ipsec4_in_reject(m, &inp->ip_inp.inp)) { - MODULE_GLOBAL(ipsec4stat).in_polvio++; + IPSECSTAT_INC(in_polvio); SCTP_STAT_INCR(sctps_hdrops); goto out; } @@ -5704,7 +5716,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt #ifdef INET6 case AF_INET6: if (ipsec6_in_reject(m, &inp->ip_inp.inp)) { - MODULE_GLOBAL(ipsec6stat).in_polvio++; + IPSEC6STAT_INC(in_polvio); SCTP_STAT_INCR(sctps_hdrops); goto out; } diff --git a/freebsd/sys/netinet/sctp_output.c b/freebsd/sys/netinet/sctp_output.c index 1bca9771..61260fb7 100644 --- a/freebsd/sys/netinet/sctp_output.c +++ b/freebsd/sys/netinet/sctp_output.c @@ -1967,7 +1967,7 @@ sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa, uint16_t * len) while (SCTP_BUF_NEXT(mret) != NULL) { mret = SCTP_BUF_NEXT(mret); } - SCTP_BUF_NEXT(mret) = sctp_get_mbuf_for_msg(plen, 0, M_NOWAIT, 1, MT_DATA); + SCTP_BUF_NEXT(mret) = sctp_get_mbuf_for_msg(plen, 0, M_DONTWAIT, 1, MT_DATA); if (SCTP_BUF_NEXT(mret) == NULL) { /* We are hosed, can't add more addresses */ return (m); @@ -4131,10 +4131,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, SCTPDBG(SCTP_DEBUG_OUTPUT3, "IP output returns %d\n", ret); if (net == NULL) { /* free tempy routes */ - if (ro->ro_rt) { - RTFREE(ro->ro_rt); - ro->ro_rt = NULL; - } + RO_RTFREE(ro); } else { /* * PMTU check versus smallest asoc MTU goes @@ -4449,8 +4446,9 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_sendnocrc); #else - sctphdr->checksum = sctp_calculate_cksum(m, sizeof(struct ip6_hdr)); - SCTP_STAT_INCR(sctps_sendswcrc); + m->m_pkthdr.csum_flags = CSUM_SCTP_IPV6; + m->m_pkthdr.csum_data = 0; + SCTP_STAT_INCR(sctps_sendhwcrc); #endif } /* send it out. table id is taken from stcb */ @@ -4487,9 +4485,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, } if (net == NULL) { /* Now if we had a temp route free it */ - if (ro->ro_rt) { - RTFREE(ro->ro_rt); - } + RO_RTFREE(ro); } else { /* * PMTU check versus smallest asoc MTU goes @@ -10683,6 +10679,7 @@ sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked struct sctp_abort_chunk *abort; struct sctp_auth_chunk *auth = NULL; struct sctp_nets *net; + uint32_t vtag; uint32_t auth_offset = 0; uint16_t cause_len, chunk_len, padding_len; @@ -10738,7 +10735,14 @@ sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked /* Fill in the ABORT chunk header. */ abort = mtod(m_abort, struct sctp_abort_chunk *); abort->ch.chunk_type = SCTP_ABORT_ASSOCIATION; - abort->ch.chunk_flags = 0; + if (stcb->asoc.peer_vtag == 0) { + /* This happens iff the assoc is in COOKIE-WAIT state. */ + vtag = stcb->asoc.my_vtag; + abort->ch.chunk_flags = SCTP_HAD_NO_TCB; + } else { + vtag = stcb->asoc.peer_vtag; + abort->ch.chunk_flags = 0; + } abort->ch.chunk_length = htons(chunk_len); /* Add padding, if necessary. */ if (padding_len > 0) { @@ -10750,7 +10754,7 @@ sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked (void)sctp_lowlevel_chunk_output(stcb->sctp_ep, stcb, net, (struct sockaddr *)&net->ro._l_addr, m_out, auth_offset, auth, stcb->asoc.authinfo.active_keyid, 1, 0, 0, - stcb->sctp_ep->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), + stcb->sctp_ep->sctp_lport, stcb->rport, htonl(vtag), stcb->asoc.primary_destination->port, NULL, 0, 0, so_locked); @@ -11032,8 +11036,9 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_sendnocrc); #else - shout->checksum = sctp_calculate_cksum(mout, sizeof(struct ip6_hdr)); - SCTP_STAT_INCR(sctps_sendswcrc); + mout->m_pkthdr.csum_flags = CSUM_SCTP_IPV6; + mout->m_pkthdr.csum_data = 0; + SCTP_STAT_INCR(sctps_sendhwcrc); #endif } #ifdef SCTP_PACKET_LOGGING diff --git a/freebsd/sys/netinet/sctp_pcb.c b/freebsd/sys/netinet/sctp_pcb.c index 47877ef1..e21c2e03 100644 --- a/freebsd/sys/netinet/sctp_pcb.c +++ b/freebsd/sys/netinet/sctp_pcb.c @@ -2378,8 +2378,13 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) inp->sctp_socket = so; inp->ip_inp.inp.inp_socket = so; #ifdef INET6 - if (MODULE_GLOBAL(ip6_auto_flowlabel)) { - inp->ip_inp.inp.inp_flags |= IN6P_AUTOFLOWLABEL; + if (INP_SOCKAF(so) == AF_INET6) { + if (MODULE_GLOBAL(ip6_auto_flowlabel)) { + inp->ip_inp.inp.inp_flags |= IN6P_AUTOFLOWLABEL; + } + if (MODULE_GLOBAL(ip6_v6only)) { + inp->ip_inp.inp.inp_flags |= IN6P_IPV6_V6ONLY; + } } #endif inp->sctp_associd_counter = 1; @@ -2500,9 +2505,6 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) m->initial_rto = SCTP_BASE_SYSCTL(sctp_rto_initial_default); m->initial_init_rto_max = SCTP_BASE_SYSCTL(sctp_init_rto_max_default); m->sctp_sack_freq = SCTP_BASE_SYSCTL(sctp_sack_freq_default); - - m->max_open_streams_intome = MAX_SCTP_STREAMS; - m->max_init_times = SCTP_BASE_SYSCTL(sctp_init_rtx_max_default); m->max_send_times = SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default); m->def_net_failure = SCTP_BASE_SYSCTL(sctp_path_rtx_max_default); @@ -2514,6 +2516,7 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) m->sctp_default_cc_module = SCTP_BASE_SYSCTL(sctp_default_cc_module); m->sctp_default_ss_module = SCTP_BASE_SYSCTL(sctp_default_ss_module); + m->max_open_streams_intome = SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default); /* number of streams to pre-open on a association */ m->pre_open_stream_count = SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default); @@ -4450,23 +4453,21 @@ sctp_delete_from_timewait(uint32_t tag, uint16_t lport, uint16_t rport) int i; chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; - if (!LIST_EMPTY(chain)) { - LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { - for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { - if ((twait_block->vtag_block[i].v_tag == tag) && - (twait_block->vtag_block[i].lport == lport) && - (twait_block->vtag_block[i].rport == rport)) { - twait_block->vtag_block[i].tv_sec_at_expire = 0; - twait_block->vtag_block[i].v_tag = 0; - twait_block->vtag_block[i].lport = 0; - twait_block->vtag_block[i].rport = 0; - found = 1; - break; - } - } - if (found) + LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { + for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { + if ((twait_block->vtag_block[i].v_tag == tag) && + (twait_block->vtag_block[i].lport == lport) && + (twait_block->vtag_block[i].rport == rport)) { + twait_block->vtag_block[i].tv_sec_at_expire = 0; + twait_block->vtag_block[i].v_tag = 0; + twait_block->vtag_block[i].lport = 0; + twait_block->vtag_block[i].rport = 0; + found = 1; break; + } } + if (found) + break; } } @@ -4480,19 +4481,17 @@ sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport) SCTP_INP_INFO_WLOCK(); chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; - if (!LIST_EMPTY(chain)) { - LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { - for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { - if ((twait_block->vtag_block[i].v_tag == tag) && - (twait_block->vtag_block[i].lport == lport) && - (twait_block->vtag_block[i].rport == rport)) { - found = 1; - break; - } - } - if (found) + LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { + for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { + if ((twait_block->vtag_block[i].v_tag == tag) && + (twait_block->vtag_block[i].lport == lport) && + (twait_block->vtag_block[i].rport == rport)) { + found = 1; break; + } } + if (found) + break; } SCTP_INP_INFO_WUNLOCK(); return (found); @@ -4514,42 +4513,40 @@ sctp_add_vtag_to_timewait(uint32_t tag, uint32_t time, uint16_t lport, uint16_t (void)SCTP_GETTIME_TIMEVAL(&now); chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; set = 0; - if (!LIST_EMPTY(chain)) { + LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { /* Block(s) present, lets find space, and expire on the fly */ - LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { - for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { - if ((twait_block->vtag_block[i].v_tag == 0) && - !set) { - twait_block->vtag_block[i].tv_sec_at_expire = - now.tv_sec + time; + for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { + if ((twait_block->vtag_block[i].v_tag == 0) && + !set) { + twait_block->vtag_block[i].tv_sec_at_expire = + now.tv_sec + time; + twait_block->vtag_block[i].v_tag = tag; + twait_block->vtag_block[i].lport = lport; + twait_block->vtag_block[i].rport = rport; + set = 1; + } else if ((twait_block->vtag_block[i].v_tag) && + ((long)twait_block->vtag_block[i].tv_sec_at_expire < now.tv_sec)) { + /* Audit expires this guy */ + twait_block->vtag_block[i].tv_sec_at_expire = 0; + twait_block->vtag_block[i].v_tag = 0; + twait_block->vtag_block[i].lport = 0; + twait_block->vtag_block[i].rport = 0; + if (set == 0) { + /* Reuse it for my new tag */ + twait_block->vtag_block[i].tv_sec_at_expire = now.tv_sec + time; twait_block->vtag_block[i].v_tag = tag; twait_block->vtag_block[i].lport = lport; twait_block->vtag_block[i].rport = rport; set = 1; - } else if ((twait_block->vtag_block[i].v_tag) && - ((long)twait_block->vtag_block[i].tv_sec_at_expire < now.tv_sec)) { - /* Audit expires this guy */ - twait_block->vtag_block[i].tv_sec_at_expire = 0; - twait_block->vtag_block[i].v_tag = 0; - twait_block->vtag_block[i].lport = 0; - twait_block->vtag_block[i].rport = 0; - if (set == 0) { - /* Reuse it for my new tag */ - twait_block->vtag_block[i].tv_sec_at_expire = now.tv_sec + time; - twait_block->vtag_block[i].v_tag = tag; - twait_block->vtag_block[i].lport = lport; - twait_block->vtag_block[i].rport = rport; - set = 1; - } } } - if (set) { - /* - * We only do up to the block where we can - * place our tag for audits - */ - break; - } + } + if (set) { + /* + * We only do up to the block where we can place our + * tag for audits + */ + break; } } /* Need to add a new block to chain */ @@ -6699,30 +6696,28 @@ skip_vtag_check: chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; /* Now what about timed wait ? */ - if (!LIST_EMPTY(chain)) { + LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { /* * Block(s) are present, lets see if we have this tag in the * list */ - LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { - for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { - if (twait_block->vtag_block[i].v_tag == 0) { - /* not used */ - continue; - } else if ((long)twait_block->vtag_block[i].tv_sec_at_expire < - now->tv_sec) { - /* Audit expires this guy */ - twait_block->vtag_block[i].tv_sec_at_expire = 0; - twait_block->vtag_block[i].v_tag = 0; - twait_block->vtag_block[i].lport = 0; - twait_block->vtag_block[i].rport = 0; - } else if ((twait_block->vtag_block[i].v_tag == tag) && - (twait_block->vtag_block[i].lport == lport) && - (twait_block->vtag_block[i].rport == rport)) { - /* Bad tag, sorry :< */ - SCTP_INP_INFO_RUNLOCK(); - return (0); - } + for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { + if (twait_block->vtag_block[i].v_tag == 0) { + /* not used */ + continue; + } else if ((long)twait_block->vtag_block[i].tv_sec_at_expire < + now->tv_sec) { + /* Audit expires this guy */ + twait_block->vtag_block[i].tv_sec_at_expire = 0; + twait_block->vtag_block[i].v_tag = 0; + twait_block->vtag_block[i].lport = 0; + twait_block->vtag_block[i].rport = 0; + } else if ((twait_block->vtag_block[i].v_tag == tag) && + (twait_block->vtag_block[i].lport == lport) && + (twait_block->vtag_block[i].rport == rport)) { + /* Bad tag, sorry :< */ + SCTP_INP_INFO_RUNLOCK(); + return (0); } } } diff --git a/freebsd/sys/netinet/sctp_structs.h b/freebsd/sys/netinet/sctp_structs.h index abecdabd..bc18f0e8 100644 --- a/freebsd/sys/netinet/sctp_structs.h +++ b/freebsd/sys/netinet/sctp_structs.h @@ -189,6 +189,8 @@ struct iterator_control { struct sctp_net_route { sctp_rtentry_t *ro_rt; void *ro_lle; + void *ro_ia; + int ro_flags; union sctp_sockstore _l_addr; /* remote peer addr */ struct sctp_ifa *_s_addr; /* our selected src addr */ }; diff --git a/freebsd/sys/netinet/sctp_sysctl.c b/freebsd/sys/netinet/sctp_sysctl.c index ca462b7a..95e3c589 100644 --- a/freebsd/sys/netinet/sctp_sysctl.c +++ b/freebsd/sys/netinet/sctp_sysctl.c @@ -83,6 +83,7 @@ sctp_init_sysctls() SCTP_BASE_SYSCTL(sctp_path_rtx_max_default) = SCTPCTL_PATH_RTX_MAX_DEFAULT; SCTP_BASE_SYSCTL(sctp_path_pf_threshold) = SCTPCTL_PATH_PF_THRESHOLD_DEFAULT; SCTP_BASE_SYSCTL(sctp_add_more_threshold) = SCTPCTL_ADD_MORE_ON_OUTPUT_DEFAULT; + SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default) = SCTPCTL_INCOMING_STREAMS_DEFAULT; SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default) = SCTPCTL_OUTGOING_STREAMS_DEFAULT; SCTP_BASE_SYSCTL(sctp_cmt_on_off) = SCTPCTL_CMT_ON_OFF_DEFAULT; /* EY */ @@ -625,6 +626,7 @@ sysctl_sctp_check(SYSCTL_HANDLER_ARGS) RANGECHK(SCTP_BASE_SYSCTL(sctp_path_rtx_max_default), SCTPCTL_PATH_RTX_MAX_MIN, SCTPCTL_PATH_RTX_MAX_MAX); RANGECHK(SCTP_BASE_SYSCTL(sctp_path_pf_threshold), SCTPCTL_PATH_PF_THRESHOLD_MIN, SCTPCTL_PATH_PF_THRESHOLD_MAX); RANGECHK(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTPCTL_ADD_MORE_ON_OUTPUT_MIN, SCTPCTL_ADD_MORE_ON_OUTPUT_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default), SCTPCTL_INCOMING_STREAMS_MIN, SCTPCTL_INCOMING_STREAMS_MAX); RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), SCTPCTL_OUTGOING_STREAMS_MIN, SCTPCTL_OUTGOING_STREAMS_MAX); RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_on_off), SCTPCTL_CMT_ON_OFF_MIN, SCTPCTL_CMT_ON_OFF_MAX); /* EY */ @@ -967,6 +969,10 @@ SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, add_more_on_output, CTLTYPE_UINT | CT &SCTP_BASE_SYSCTL(sctp_add_more_threshold), 0, sysctl_sctp_check, "IU", SCTPCTL_ADD_MORE_ON_OUTPUT_DESC); +SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, incoming_streams, CTLTYPE_UINT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_INCOMING_STREAMS_DESC); + SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, outgoing_streams, CTLTYPE_UINT | CTLFLAG_RW, &SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), 0, sysctl_sctp_check, "IU", SCTPCTL_OUTGOING_STREAMS_DESC); diff --git a/freebsd/sys/netinet/sctp_sysctl.h b/freebsd/sys/netinet/sctp_sysctl.h index 4ec37157..8090373e 100644 --- a/freebsd/sys/netinet/sctp_sysctl.h +++ b/freebsd/sys/netinet/sctp_sysctl.h @@ -72,6 +72,7 @@ struct sctp_sysctl { uint32_t sctp_path_rtx_max_default; uint32_t sctp_path_pf_threshold; uint32_t sctp_add_more_threshold; + uint32_t sctp_nr_incoming_streams_default; uint32_t sctp_nr_outgoing_streams_default; uint32_t sctp_cmt_on_off; uint32_t sctp_cmt_use_dac; @@ -322,6 +323,12 @@ struct sctp_sysctl { #define SCTPCTL_ADD_MORE_ON_OUTPUT_MAX 0xFFFFFFFF #define SCTPCTL_ADD_MORE_ON_OUTPUT_DEFAULT SCTP_DEFAULT_ADD_MORE +/* incoming_streams: Default number of incoming streams */ +#define SCTPCTL_INCOMING_STREAMS_DESC "Default number of incoming streams" +#define SCTPCTL_INCOMING_STREAMS_MIN 1 +#define SCTPCTL_INCOMING_STREAMS_MAX 65535 +#define SCTPCTL_INCOMING_STREAMS_DEFAULT SCTP_ISTREAM_INITIAL + /* outgoing_streams: Default number of outgoing streams */ #define SCTPCTL_OUTGOING_STREAMS_DESC "Default number of outgoing streams" #define SCTPCTL_OUTGOING_STREAMS_MIN 1 diff --git a/freebsd/sys/netinet/sctp_uio.h b/freebsd/sys/netinet/sctp_uio.h index d8e7da45..063fd9f1 100644 --- a/freebsd/sys/netinet/sctp_uio.h +++ b/freebsd/sys/netinet/sctp_uio.h @@ -1267,44 +1267,50 @@ sctp_sorecvmsg(struct socket *so, #if !(defined(_KERNEL)) && !(defined(__Userspace__)) __BEGIN_DECLS -int sctp_peeloff __P((int, sctp_assoc_t)); -int sctp_bindx __P((int, struct sockaddr *, int, int)); -int sctp_connectx __P((int, const struct sockaddr *, int, sctp_assoc_t *)); -int sctp_getaddrlen __P((sa_family_t)); -int sctp_getpaddrs __P((int, sctp_assoc_t, struct sockaddr **)); -void sctp_freepaddrs __P((struct sockaddr *)); -int sctp_getladdrs __P((int, sctp_assoc_t, struct sockaddr **)); -void sctp_freeladdrs __P((struct sockaddr *)); -int sctp_opt_info __P((int, sctp_assoc_t, int, void *, socklen_t *)); +int sctp_peeloff(int, sctp_assoc_t); +int sctp_bindx(int, struct sockaddr *, int, int); +int sctp_connectx(int, const struct sockaddr *, int, sctp_assoc_t *); +int sctp_getaddrlen(sa_family_t); +int sctp_getpaddrs(int, sctp_assoc_t, struct sockaddr **); +void sctp_freepaddrs(struct sockaddr *); +int sctp_getladdrs(int, sctp_assoc_t, struct sockaddr **); +void sctp_freeladdrs(struct sockaddr *); +int sctp_opt_info(int, sctp_assoc_t, int, void *, socklen_t *); /* deprecated */ -ssize_t sctp_sendmsg -__P((int, const void *, size_t, const struct sockaddr *, - socklen_t, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t)); +ssize_t +sctp_sendmsg(int, const void *, size_t, const struct sockaddr *, + socklen_t, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t); /* deprecated */ - ssize_t sctp_send __P((int, const void *, size_t, - const struct sctp_sndrcvinfo *, int)); +ssize_t +sctp_send(int, const void *, size_t, + const struct sctp_sndrcvinfo *, int); /* deprecated */ - ssize_t sctp_sendx __P((int, const void *, size_t, struct sockaddr *, - int, struct sctp_sndrcvinfo *, int)); +ssize_t +sctp_sendx(int, const void *, size_t, struct sockaddr *, + int, struct sctp_sndrcvinfo *, int); /* deprecated */ - ssize_t sctp_sendmsgx __P((int sd, const void *, size_t, struct sockaddr *, - int, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t)); +ssize_t +sctp_sendmsgx(int sd, const void *, size_t, struct sockaddr *, + int, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t); - sctp_assoc_t sctp_getassocid __P((int, struct sockaddr *)); +sctp_assoc_t sctp_getassocid(int, struct sockaddr *); /* deprecated */ - ssize_t sctp_recvmsg __P((int, void *, size_t, struct sockaddr *, socklen_t *, - struct sctp_sndrcvinfo *, int *)); +ssize_t +sctp_recvmsg(int, void *, size_t, struct sockaddr *, socklen_t *, + struct sctp_sndrcvinfo *, int *); - ssize_t sctp_sendv __P((int, const struct iovec *, int, struct sockaddr *, - int, void *, socklen_t, unsigned int, int)); +ssize_t +sctp_sendv(int, const struct iovec *, int, struct sockaddr *, + int, void *, socklen_t, unsigned int, int); - ssize_t sctp_recvv __P((int, const struct iovec *, int, struct sockaddr *, - socklen_t *, void *, socklen_t *, unsigned int *, int *)); +ssize_t +sctp_recvv(int, const struct iovec *, int, struct sockaddr *, + socklen_t *, void *, socklen_t *, unsigned int *, int *); __END_DECLS diff --git a/freebsd/sys/netinet/sctp_usrreq.c b/freebsd/sys/netinet/sctp_usrreq.c index 527790ce..81db1dc1 100644 --- a/freebsd/sys/netinet/sctp_usrreq.c +++ b/freebsd/sys/netinet/sctp_usrreq.c @@ -2054,18 +2054,29 @@ flags_out: } case SCTP_MAX_BURST: { - uint8_t *value; + struct sctp_assoc_value *av; - SCTP_CHECK_AND_CAST(value, optval, uint8_t, *optsize); + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); - SCTP_INP_RLOCK(inp); - if (inp->sctp_ep.max_burst < 256) { - *value = inp->sctp_ep.max_burst; + if (stcb) { + av->assoc_value = stcb->asoc.max_burst; + SCTP_TCB_UNLOCK(stcb); } else { - *value = 255; + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_ep.max_burst; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); } - SCTP_INP_RUNLOCK(inp); - *optsize = sizeof(uint8_t); break; } case SCTP_MAXSEG: @@ -4378,13 +4389,34 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } case SCTP_MAX_BURST: { - uint8_t *burst; + struct sctp_assoc_value *av; - SCTP_CHECK_AND_CAST(burst, optval, uint8_t, optsize); + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); - SCTP_INP_WLOCK(inp); - inp->sctp_ep.max_burst = *burst; - SCTP_INP_WUNLOCK(inp); + if (stcb) { + stcb->asoc.max_burst = av->assoc_value; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_WLOCK(inp); + inp->sctp_ep.max_burst = av->assoc_value; + SCTP_INP_WUNLOCK(inp); + } + if ((av->assoc_id == SCTP_CURRENT_ASSOC) || + (av->assoc_id == SCTP_ALL_ASSOC)) { + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + stcb->asoc.max_burst = av->assoc_value; + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } + } break; } case SCTP_MAXSEG: diff --git a/freebsd/sys/netinet/sctp_var.h b/freebsd/sys/netinet/sctp_var.h index 3862b90b..d88a2376 100644 --- a/freebsd/sys/netinet/sctp_var.h +++ b/freebsd/sys/netinet/sctp_var.h @@ -321,48 +321,34 @@ struct sctphdr; void sctp_close(struct socket *so); int sctp_disconnect(struct socket *so); +void sctp_ctlinput(int, struct sockaddr *, void *); +int sctp_ctloutput(struct socket *, struct sockopt *); -void sctp_ctlinput __P((int, struct sockaddr *, void *)); -int sctp_ctloutput __P((struct socket *, struct sockopt *)); - -#ifdef INET -void sctp_input_with_port __P((struct mbuf *, int, uint16_t)); - -#endif #ifdef INET -void sctp_input __P((struct mbuf *, int)); +void sctp_input_with_port(struct mbuf *, int, uint16_t); +void sctp_input(struct mbuf *, int); #endif -void sctp_pathmtu_adjustment __P((struct sctp_tcb *, uint16_t)); -void sctp_drain __P((void)); -void sctp_init __P((void)); - +void sctp_pathmtu_adjustment(struct sctp_tcb *, uint16_t); +void sctp_drain(void); +void sctp_init(void); void sctp_finish(void); - int sctp_flush(struct socket *, int); -int sctp_shutdown __P((struct socket *)); -void sctp_notify -__P((struct sctp_inpcb *, struct ip *ip, struct sctphdr *, +int sctp_shutdown(struct socket *); +void +sctp_notify(struct sctp_inpcb *, struct ip *ip, struct sctphdr *, struct sockaddr *, struct sctp_tcb *, - struct sctp_nets *)); - - int sctp_bindx(struct socket *, int, struct sockaddr_storage *, - int, int, struct proc *); + struct sctp_nets *); +int +sctp_bindx(struct socket *, int, struct sockaddr_storage *, + int, int, struct proc *); /* can't use sctp_assoc_t here */ - int sctp_peeloff(struct socket *, struct socket *, int, caddr_t, int *); - - int sctp_ingetaddr(struct socket *, - struct sockaddr ** -); - - int sctp_peeraddr(struct socket *, - struct sockaddr ** -); - - int sctp_listen(struct socket *, int, struct thread *); - - int sctp_accept(struct socket *, struct sockaddr **); +int sctp_peeloff(struct socket *, struct socket *, int, caddr_t, int *); +int sctp_ingetaddr(struct socket *, struct sockaddr **); +int sctp_peeraddr(struct socket *, struct sockaddr **); +int sctp_listen(struct socket *, int, struct thread *); +int sctp_accept(struct socket *, struct sockaddr **); #endif /* _KERNEL */ diff --git a/freebsd/sys/netinet/sctputil.c b/freebsd/sys/netinet/sctputil.c index 3a88b894..15928d8b 100644 --- a/freebsd/sys/netinet/sctputil.c +++ b/freebsd/sys/netinet/sctputil.c @@ -2690,8 +2690,14 @@ set_error: stcb->sctp_socket->so_error = ECONNRESET; } } else { - SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNABORTED); - stcb->sctp_socket->so_error = ECONNABORTED; + if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED)) { + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ETIMEDOUT); + stcb->sctp_socket->so_error = ETIMEDOUT; + } else { + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNABORTED); + stcb->sctp_socket->so_error = ECONNABORTED; + } } } /* Wake ANY sleepers */ @@ -3532,8 +3538,8 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, if (stcb->sctp_socket->so_rcv.sb_state & SBS_CANTRCVMORE) { return; } - if (stcb && ((stcb->asoc.state & SCTP_STATE_COOKIE_WAIT) || - (stcb->asoc.state & SCTP_STATE_COOKIE_ECHOED))) { + if ((stcb->asoc.state & SCTP_STATE_COOKIE_WAIT) || + (stcb->asoc.state & SCTP_STATE_COOKIE_ECHOED)) { if ((notification == SCTP_NOTIFY_INTERFACE_DOWN) || (notification == SCTP_NOTIFY_INTERFACE_UP) || (notification == SCTP_NOTIFY_INTERFACE_CONFIRMED)) { @@ -3607,16 +3613,16 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, break; } case SCTP_NOTIFY_ASSOC_LOC_ABORTED: - if ((stcb) && (((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_WAIT) || - ((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_ECHOED))) { + if (((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_WAIT) || + ((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_ECHOED)) { sctp_notify_assoc_change(SCTP_CANT_STR_ASSOC, stcb, error, data, 0, so_locked); } else { sctp_notify_assoc_change(SCTP_COMM_LOST, stcb, error, data, 0, so_locked); } break; case SCTP_NOTIFY_ASSOC_REM_ABORTED: - if ((stcb) && (((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_WAIT) || - ((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_ECHOED))) { + if (((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_WAIT) || + ((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_ECHOED)) { sctp_notify_assoc_change(SCTP_CANT_STR_ASSOC, stcb, error, data, 1, so_locked); } else { sctp_notify_assoc_change(SCTP_COMM_LOST, stcb, error, data, 1, so_locked); @@ -3969,7 +3975,7 @@ sctp_abort_an_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb, if (stcb == NULL) { /* Got to have a TCB */ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { - if (LIST_FIRST(&inp->sctp_asoc_list) == NULL) { + if (LIST_EMPTY(&inp->sctp_asoc_list)) { sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, SCTP_CALLED_DIRECTLY_NOCMPSET); } @@ -4024,7 +4030,7 @@ sctp_handle_ootb(struct mbuf *m, int iphlen, int offset, SCTP_STAT_INCR_COUNTER32(sctps_outoftheblue); /* Generate a TO address for future reference */ if (inp && (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { - if (LIST_FIRST(&inp->sctp_asoc_list) == NULL) { + if (LIST_EMPTY(&inp->sctp_asoc_list)) { sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, SCTP_CALLED_DIRECTLY_NOCMPSET); } diff --git a/freebsd/sys/netinet/tcp.h b/freebsd/sys/netinet/tcp.h index affc4df0..5925b4da 100644 --- a/freebsd/sys/netinet/tcp.h +++ b/freebsd/sys/netinet/tcp.h @@ -34,6 +34,7 @@ #define _NETINET_TCP_H_ #include <sys/cdefs.h> +#include <rtems/bsd/sys/types.h> #if __BSD_VISIBLE @@ -52,11 +53,11 @@ struct tcphdr { tcp_seq th_seq; /* sequence number */ tcp_seq th_ack; /* acknowledgement number */ #if BYTE_ORDER == LITTLE_ENDIAN - u_int th_x2:4, /* (unused) */ + u_char th_x2:4, /* (unused) */ th_off:4; /* data offset */ #endif #if BYTE_ORDER == BIG_ENDIAN - u_int th_off:4, /* data offset */ + u_char th_off:4, /* data offset */ th_x2:4; /* (unused) */ #endif u_char th_flags; @@ -103,29 +104,37 @@ struct tcphdr { /* - * Default maximum segment size for TCP. - * With an IP MTU of 576, this is 536, - * but 512 is probably more convenient. - * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)). - */ -#define TCP_MSS 512 -/* - * TCP_MINMSS is defined to be 216 which is fine for the smallest - * link MTU (256 bytes, AX.25 packet radio) in the Internet. - * However it is very unlikely to come across such low MTU interfaces - * these days (anno dato 2003). - * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments. - * Setting this to "0" disables the minmss check. + * The default maximum segment size (MSS) to be used for new TCP connections + * when path MTU discovery is not enabled. + * + * RFC879 derives the default MSS from the largest datagram size hosts are + * minimally required to handle directly or through IP reassembly minus the + * size of the IP and TCP header. With IPv6 the minimum MTU is specified + * in RFC2460. + * + * For IPv4 the MSS is 576 - sizeof(struct tcpiphdr) + * For IPv6 the MSS is IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct tcphdr) + * + * We use explicit numerical definition here to avoid header pollution. */ -#define TCP_MINMSS 216 +#define TCP_MSS 536 +#define TCP6_MSS 1220 /* - * Default maximum segment size for TCP6. - * With an IP6 MSS of 1280, this is 1220, - * but 1024 is probably more convenient. (xxx kazu in doubt) - * This should be defined as MIN(1024, IP6_MSS - sizeof (struct tcpip6hdr)) + * Limit the lowest MSS we accept for path MTU discovery and the TCP SYN MSS + * option. Allowing low values of MSS can consume significant resources and + * be used to mount a resource exhaustion attack. + * Connections requesting lower MSS values will be rounded up to this value + * and the IP_DF flag will be cleared to allow fragmentation along the path. + * + * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments. Setting + * it to "0" disables the minmss check. + * + * The default value is fine for TCP across the Internet's smallest official + * link MTU (256 bytes for AX.25 packet radio). However, a connection is very + * unlikely to come across such low MTU interfaces these days (anno domini 2003). */ -#define TCP6_MSS 1024 +#define TCP_MINMSS 216 #define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ #define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ @@ -152,6 +161,10 @@ struct tcphdr { #define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ #define TCP_INFO 32 /* retrieve tcp_info structure */ #define TCP_CONGESTION 64 /* get/set congestion control algorithm */ +#define TCP_KEEPINIT 128 /* N, time to establish connection */ +#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */ +#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ +#define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */ /* Start of reserved space for third-party user-settable options. */ #define TCP_VENDOR SO_VENDOR @@ -218,7 +231,7 @@ struct tcp_info { /* FreeBSD extensions to tcp_info. */ u_int32_t tcpi_snd_wnd; /* Advertised send window. */ - u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. */ + u_int32_t tcpi_snd_bwnd; /* No longer used. */ u_int32_t tcpi_snd_nxt; /* Next egress seqno */ u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ diff --git a/freebsd/sys/netinet/tcp_hostcache.c b/freebsd/sys/netinet/tcp_hostcache.c index a0d38ff7..ee98af3f 100644 --- a/freebsd/sys/netinet/tcp_hostcache.c +++ b/freebsd/sys/netinet/tcp_hostcache.c @@ -120,7 +120,7 @@ static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS); static void tcp_hc_purge_internal(int); static void tcp_hc_purge(void *); -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, +static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, "TCP Host cache"); SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c index 25afbb26..50dfc1ce 100644 --- a/freebsd/sys/netinet/tcp_input.c +++ b/freebsd/sys/netinet/tcp_input.c @@ -7,6 +7,7 @@ * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> * Copyright (c) 2010 The FreeBSD Foundation + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet @@ -18,6 +19,9 @@ * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -103,6 +107,9 @@ __FBSDID("$FreeBSD$"); #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif /* TCPDEBUG */ +#ifdef TCP_OFFLOAD +#include <netinet/tcp_offload.h> +#endif #ifdef IPSEC #include <netipsec/ipsec.h> @@ -153,6 +160,14 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, experimental, CTLFLAG_RW, 0, + "Experimental TCP extensions"); + +VNET_DEFINE(int, tcp_do_initcwnd10) = 0; +SYSCTL_VNET_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_RW, + &VNET_NAME(tcp_do_initcwnd10), 0, + "Enable draft-ietf-tcpm-initcwnd-05 (Increasing initial CWND to 10)"); + VNET_DEFINE(int, tcp_do_rfc3465) = 1; SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, @@ -163,7 +178,7 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); +static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); VNET_DEFINE(int, tcp_do_ecn) = 0; SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, @@ -181,6 +196,11 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, &VNET_NAME(tcp_insecure_rst), 0, "Follow the old (insecure) criteria for accepting RST packets"); +VNET_DEFINE(int, tcp_recvspace) = 1024*64; +#define V_tcp_recvspace VNET(tcp_recvspace) +SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, + &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size"); + VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, @@ -193,16 +213,12 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_inc), 0, "Incrementor step size of automatic receive buffer"); -VNET_DEFINE(int, tcp_autorcvbuf_max) = 256*1024; +VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); -int tcp_read_locking = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW, - &tcp_read_locking, 0, "Enable read locking strategy"); - VNET_DEFINE(struct inpcbhead, tcb); #define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); @@ -217,18 +233,18 @@ static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); -static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, - uint16_t type); -static void inline cc_conn_init(struct tcpcb *tp); -static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); static void inline tcp_fields_to_host(struct tcphdr *); -static void inline hhook_run_tcp_est_in(struct tcpcb *tp, - struct tcphdr *th, struct tcpopt *to); #ifdef TCP_SIGNATURE static void inline tcp_fields_to_net(struct tcphdr *); static int inline tcp_signature_verify_input(struct mbuf *, int, int, int, struct tcpopt *, struct tcphdr *, u_int); #endif +static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, + uint16_t type); +static void inline cc_conn_init(struct tcpcb *tp); +static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); +static void inline hhook_run_tcp_est_in(struct tcpcb *tp, + struct tcphdr *th, struct tcpopt *to); /* * Kernel module interface for updating tcpstat. The argument is an index @@ -271,7 +287,7 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); - if (tp->snd_cwnd == min(tp->snd_cwnd, tp->snd_wnd)) + if (tp->snd_cwnd <= tp->snd_wnd) tp->ccv->flags |= CCF_CWND_LIMITED; else tp->ccv->flags &= ~CCF_CWND_LIMITED; @@ -303,9 +319,6 @@ cc_conn_init(struct tcpcb *tp) struct hc_metrics_lite metrics; struct inpcb *inp = tp->t_inpcb; int rtt; -#ifdef INET6 - int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; -#endif INP_WLOCK_ASSERT(tp->t_inpcb); @@ -339,44 +352,33 @@ cc_conn_init(struct tcpcb *tp) } /* - * Set the slow-start flight size depending on whether this - * is a local network or not. - * - * Extend this so we cache the cwnd too and retrieve it here. - * Make cwnd even bigger than RFC3390 suggests but only if we - * have previous experience with the remote host. Be careful - * not make cwnd bigger than remote receive window or our own - * send socket buffer. Maybe put some additional upper bound - * on the retrieved cwnd. Should do incremental updates to - * hostcache when cwnd collapses so next connection doesn't - * overloads the path again. + * Set the initial slow-start flight size. * - * XXXAO: Initializing the CWND from the hostcache is broken - * and in its current form not RFC conformant. It is disabled - * until fixed or removed entirely. + * RFC5681 Section 3.1 specifies the default conservative values. + * RFC3390 specifies slightly more aggressive values. + * Draft-ietf-tcpm-initcwnd-05 increases it to ten segments. * - * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. - * We currently check only in syncache_socket for that. + * If a SYN or SYN/ACK was lost and retransmitted, we have to + * reduce the initial CWND to one segment as congestion is likely + * requiring us to be cautious. */ -/* #define TCP_METRICS_CWND */ -#ifdef TCP_METRICS_CWND - if (metrics.rmx_cwnd) - tp->snd_cwnd = max(tp->t_maxseg, min(metrics.rmx_cwnd / 2, - min(tp->snd_wnd, so->so_snd.sb_hiwat))); - else -#endif - if (V_tcp_do_rfc3390) + if (tp->snd_cwnd == 1) + tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ + else if (V_tcp_do_initcwnd10) + tp->snd_cwnd = min(10 * tp->t_maxseg, + max(2 * tp->t_maxseg, 14600)); + else if (V_tcp_do_rfc3390) tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); -#ifdef INET6 - else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || - (!isipv6 && in_localaddr(inp->inp_faddr))) -#else - else if (in_localaddr(inp->inp_faddr)) -#endif - tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local; - else - tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz; + else { + /* Per RFC5681 Section 3.1 */ + if (tp->t_maxseg > 2190) + tp->snd_cwnd = 2 * tp->t_maxseg; + else if (tp->t_maxseg > 1095) + tp->snd_cwnd = 3 * tp->t_maxseg; + else + tp->snd_cwnd = 4 * tp->t_maxseg; + } if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); @@ -546,43 +548,44 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) tcp_input(m, *offp); return IPPROTO_DONE; } -#endif +#endif /* INET6 */ void tcp_input(struct mbuf *m, int off0) { - struct tcphdr *th; + struct tcphdr *th = NULL; struct ip *ip = NULL; +#ifdef INET struct ipovly *ipov; +#endif struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; int optlen = 0; - int len, tlen, off; +#ifdef INET + int len; +#endif + int tlen = 0, off; int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ - uint8_t iptos; #ifdef TCP_SIGNATURE uint8_t sig_checked = 0; #endif -#ifdef IPFIREWALL_FORWARD - struct m_tag *fwd_tag; -#endif + uint8_t iptos = 0; + struct m_tag *fwd_tag = NULL; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; #else const void *ip6 = NULL; - const int isipv6 = 0; -#endif +#endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ int ti_locked; #define TI_UNLOCKED 1 -#define TI_RLOCKED 2 -#define TI_WLOCKED 3 +#define TI_WLOCKED 2 #ifdef TCPDEBUG /* @@ -601,16 +604,34 @@ tcp_input(struct mbuf *m, int off0) to.to_flags = 0; TCPSTAT_INC(tcps_rcvtotal); - if (isipv6) { #ifdef INET6 + if (isipv6) { /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */ + + if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { + m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); + if (m == NULL) { + TCPSTAT_INC(tcps_rcvshort); + return; + } + } + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)((caddr_t)ip6 + off0); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; - if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in6_cksum_pseudo(ip6, tlen, + IPPROTO_TCP, m->m_pkthdr.csum_data); + th->th_sum ^= 0xffff; + } else + th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); + if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } - th = (struct tcphdr *)((caddr_t)ip6 + off0); /* * Be proactive about unspecified IPv6 address in source. @@ -624,10 +645,13 @@ tcp_input(struct mbuf *m, int off0) /* XXX stat */ goto drop; } -#else - th = NULL; /* XXX: Avoid compiler warning. */ + } #endif - } else { +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. @@ -679,13 +703,18 @@ tcp_input(struct mbuf *m, int off0) /* Re-initialization for later version check */ ip->ip_v = IPVERSION; } +#endif /* INET */ #ifdef INET6 if (isipv6) iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; +#endif +#if defined(INET) && defined(INET6) else #endif +#ifdef INET iptos = ip->ip_tos; +#endif /* * Check that TCP offset makes sense, @@ -698,13 +727,18 @@ tcp_input(struct mbuf *m, int off0) } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { - if (isipv6) { #ifdef INET6 + if (isipv6) { IP6_EXTHDR_CHECK(m, off0, off, ); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); + } #endif - } else { +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { @@ -716,6 +750,7 @@ tcp_input(struct mbuf *m, int off0) th = (struct tcphdr *)((caddr_t)ip + off0); } } +#endif optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } @@ -732,39 +767,83 @@ tcp_input(struct mbuf *m, int off0) drop_hdrlen = off0 + off; /* - * Locate pcb for segment, which requires a lock on tcbinfo. - * Optimisticaly acquire a global read lock rather than a write lock - * unless header flags necessarily imply a state change. There are - * two cases where we might discover later we need a write lock - * despite the flags: ACKs moving a connection out of the syncache, - * and ACKs for a connection in TIMEWAIT. + * Locate pcb for segment; if we're likely to add or remove a + * connection then first acquire pcbinfo lock. There are two cases + * where we might discover later we need a write lock despite the + * flags: ACKs moving a connection out of the syncache, and ACKs for + * a connection in TIMEWAIT. */ - if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || - tcp_read_locking == 0) { + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) { INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; - } else { - INP_INFO_RLOCK(&V_tcbinfo); - ti_locked = TI_RLOCKED; - } + } else + ti_locked = TI_UNLOCKED; findpcb: #ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - else - panic("%s: findpcb ti_locked %d\n", __func__, ti_locked); + } else { + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } #endif -#ifdef IPFIREWALL_FORWARD /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ - fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); + if ( +#ifdef INET6 + (isipv6 && (m->m_flags & M_IP6_NEXTHOP)) +#ifdef INET + || (!isipv6 && (m->m_flags & M_IP_NEXTHOP)) +#endif +#endif +#if defined(INET) && !defined(INET6) + (m->m_flags & M_IP_NEXTHOP) +#endif + ) + fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); - if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ +#ifdef INET6 + if (isipv6 && fwd_tag != NULL) { + struct sockaddr_in6 *next_hop6; + + next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); + /* + * Transparently forwarded. Pretend to be the destination. + * Already got one like this? + */ + inp = in6_pcblookup_mbuf(&V_tcbinfo, + &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, + INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); + if (!inp) { + /* + * It's new. Try to find the ambushing socket. + * Because we've rewritten the destination address, + * any hardware-generated hash is ignored. + */ + inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, + th->th_sport, &next_hop6->sin6_addr, + next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : + th->th_dport, INPLOOKUP_WILDCARD | + INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); + } + /* Remove the tag from the packet. We don't need it anymore. */ + m_tag_delete(m, fwd_tag); + m->m_flags &= ~M_IP6_NEXTHOP; + fwd_tag = NULL; + } else if (isipv6) { + inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, + th->th_sport, &ip6->ip6_dst, th->th_dport, + INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, + m->m_pkthdr.rcvif, m); + } +#endif /* INET6 */ +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + if (fwd_tag != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); @@ -772,41 +851,31 @@ findpcb: * Transparently forwarded. Pretend to be the destination. * already got one like this? */ - inp = in_pcblookup_hash(&V_tcbinfo, - ip->ip_src, th->th_sport, - ip->ip_dst, th->th_dport, - 0, m->m_pkthdr.rcvif); + inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB, + m->m_pkthdr.rcvif, m); if (!inp) { - /* It's new. Try to find the ambushing socket. */ - inp = in_pcblookup_hash(&V_tcbinfo, - ip->ip_src, th->th_sport, - next_hop->sin_addr, - next_hop->sin_port ? - ntohs(next_hop->sin_port) : - th->th_dport, - INPLOOKUP_WILDCARD, - m->m_pkthdr.rcvif); + /* + * It's new. Try to find the ambushing socket. + * Because we've rewritten the destination address, + * any hardware-generated hash is ignored. + */ + inp = in_pcblookup(&V_tcbinfo, ip->ip_src, + th->th_sport, next_hop->sin_addr, + next_hop->sin_port ? ntohs(next_hop->sin_port) : + th->th_dport, INPLOOKUP_WILDCARD | + INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); + m->m_flags &= ~M_IP_NEXTHOP; + fwd_tag = NULL; } else -#endif /* IPFIREWALL_FORWARD */ - { - if (isipv6) { -#ifdef INET6 - inp = in6_pcblookup_hash(&V_tcbinfo, - &ip6->ip6_src, th->th_sport, - &ip6->ip6_dst, th->th_dport, - INPLOOKUP_WILDCARD, - m->m_pkthdr.rcvif); -#endif - } else - inp = in_pcblookup_hash(&V_tcbinfo, - ip->ip_src, th->th_sport, - ip->ip_dst, th->th_dport, - INPLOOKUP_WILDCARD, - m->m_pkthdr.rcvif); - } + inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, + th->th_sport, ip->ip_dst, th->th_dport, + INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, + m->m_pkthdr.rcvif, m); +#endif /* INET */ /* * If the INPCB does not exist then all data in the incoming @@ -835,7 +904,7 @@ findpcb: rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } - INP_WLOCK(inp); + INP_WLOCK_ASSERT(inp); if (!(inp->inp_flags & INP_HW_FLOWID) && (m->m_flags & M_FLOWID) && ((inp->inp_socket == NULL) @@ -847,12 +916,12 @@ findpcb: #ifdef IPSEC #ifdef INET6 if (isipv6 && ipsec6_in_reject(m, inp)) { - V_ipsec6stat.in_polvio++; + IPSEC6STAT_INC(in_polvio); goto dropunlock; } else #endif /* INET6 */ if (ipsec4_in_reject(m, inp) != 0) { - V_ipsec4stat.in_polvio++; + IPSECSTAT_INC(in_polvio); goto dropunlock; } #endif /* IPSEC */ @@ -876,28 +945,26 @@ findpcb: * legitimate new connection attempt the old INPCB gets removed and * we can try again to find a listening socket. * - * At this point, due to earlier optimism, we may hold a read lock on - * the inpcbinfo, rather than a write lock. If so, we need to - * upgrade, or if that fails, acquire a reference on the inpcb, drop - * all locks, acquire a global write lock, and then re-acquire the - * inpcb lock. We may at that point discover that another thread has - * tried to free the inpcb, in which case we need to loop back and - * try to find a new inpcb to deliver to. + * At this point, due to earlier optimism, we may hold only an inpcb + * lock, and not the inpcbinfo write lock. If so, we need to try to + * acquire it, or if that fails, acquire a reference on the inpcb, + * drop all locks, acquire a global write lock, and then re-acquire + * the inpcb lock. We may at that point discover that another thread + * has tried to free the inpcb, in which case we need to loop back + * and try to find a new inpcb to deliver to. + * + * XXXRW: It may be time to rethink timewait locking. */ relocked: if (inp->inp_flags & INP_TIMEWAIT) { - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked)); - - if (ti_locked == TI_RLOCKED) { - if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) { + if (ti_locked == TI_UNLOCKED) { + if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; INP_WLOCK(inp); - if (in_pcbrele(inp)) { + if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } @@ -927,28 +994,34 @@ relocked: goto dropwithreset; } +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) { + tcp_offload_input(tp, m); + m = NULL; /* consumed by the TOE driver */ + goto dropunlock; + } +#endif + /* * We've identified a valid inpcb, but it could be that we need an - * inpcbinfo write lock and have only a read lock. In this case, - * attempt to upgrade/relock using the same strategy as the TIMEWAIT - * case above. If we relock, we have to jump back to 'relocked' as - * the connection might now be in TIMEWAIT. + * inpcbinfo write lock but don't hold it. In this case, attempt to + * acquire using the same strategy as the TIMEWAIT case above. If we + * relock, we have to jump back to 'relocked' as the connection might + * now be in TIMEWAIT. */ - if (tp->t_state != TCPS_ESTABLISHED || - (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || - tcp_read_locking == 0) { - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("%s: upgrade check ti_locked %d", __func__, ti_locked)); - - if (ti_locked == TI_RLOCKED) { - if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) { +#ifdef INVARIANTS + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); +#endif + if (tp->t_state != TCPS_ESTABLISHED) { + if (ti_locked == TI_UNLOCKED) { + if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; INP_WLOCK(inp); - if (in_pcbrele(inp)) { + if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } @@ -969,25 +1042,28 @@ relocked: #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; - if (isipv6) { #ifdef INET6 + if (isipv6) { bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); -#endif } else +#endif bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } -#endif +#endif /* TCPDEBUG */ /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection - * attempt or the completion of a previous one. + * attempt or the completion of a previous one. Because listen + * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be + * held in this case. */ if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " "tp not listening", __func__)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); bzero(&inc, sizeof(inc)); #ifdef INET6 @@ -1151,7 +1227,7 @@ relocked: "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); TCPSTAT_INC(tcps_badsyn); - goto dropunlock; + goto dropunlock; } /* * Segment's flags are (SYN) or (SYN|FIN). @@ -1213,7 +1289,7 @@ relocked: if (ia6) ifa_free(&ia6->ia_ifa); } -#endif +#endif /* INET6 */ /* * Basic sanity checks on incoming SYN requests: * Don't respond if the destination is a link layer @@ -1232,8 +1308,8 @@ relocked: "link layer address ignored\n", s, __func__); goto dropunlock; } - if (isipv6) { #ifdef INET6 + if (isipv6) { if (th->th_dport == th->th_sport && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) @@ -1250,8 +1326,13 @@ relocked: "address ignored\n", s, __func__); goto dropunlock; } + } #endif - } else { +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + { if (th->th_dport == th->th_sport && ip->ip_dst.s_addr == ip->ip_src.s_addr) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) @@ -1272,6 +1353,7 @@ relocked: goto dropunlock; } } +#endif /* * SYN appears to be valid. Create compressed TCP state * for syncache. @@ -1289,6 +1371,15 @@ relocked: */ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return; + } else if (tp->t_state == TCPS_LISTEN) { + /* + * When a listen socket is torn down the SO_ACCEPTCONN + * flag is removed first while connections are drained + * from the accept queue in a unlock/lock cycle of the + * ACCEPT_LOCK, opening a race condition allowing a SYN + * attempt go through unhandled. + */ + goto dropunlock; } #ifdef TCP_SIGNATURE @@ -1320,13 +1411,17 @@ relocked: return; dropwithreset: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); - ti_locked = TI_UNLOCKED; + ti_locked = TI_UNLOCKED; + } +#ifdef INVARIANTS + else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } +#endif if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); @@ -1337,13 +1432,17 @@ dropwithreset: goto drop; dropunlock: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropunlock ti_locked %d", __func__, ti_locked); - ti_locked = TI_UNLOCKED; + ti_locked = TI_UNLOCKED; + } +#ifdef INVARIANTS + else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } +#endif if (inp != NULL) INP_WUNLOCK(inp); @@ -1398,13 +1497,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - else - panic("%s: ti_locked %d for EST", __func__, - ti_locked); + else { + KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " + "ti_locked: %d", __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } #endif } INP_WLOCK_ASSERT(tp->t_inpcb); @@ -1421,7 +1520,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) - tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); + tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); /* * Unscale the window into a 32-bit value. @@ -1550,13 +1649,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, /* * This is a pure ack for outstanding data. */ - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: ti_locked %d on pure ACK", - __func__, ti_locked); ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); @@ -1595,7 +1689,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_xmit_timer(tp, ticks - tp->t_rtttime); } - tcp_xmit_bandwidth_limit(tp, th->th_ack); acked = BYTES_THIS_ACK(tp, th); /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ @@ -1660,13 +1753,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * nothing on the reassembly queue and we have enough * buffer space to take it. */ - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: ti_locked %d on pure data " - "segment", __func__, ti_locked); ti_locked = TI_UNLOCKED; /* Clean receiver SACK report if present */ @@ -1877,7 +1965,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, } else { tp->t_state = TCPS_ESTABLISHED; cc_conn_init(tp); - tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); + tcp_timer_activate(tp, TT_KEEP, + TP_KEEPIDLE(tp)); } } else { /* @@ -2281,7 +2370,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, } else { tp->t_state = TCPS_ESTABLISHED; cc_conn_init(tp); - tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); + tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* * If segment contains data or ACK, will call tcp_reass() @@ -2362,7 +2451,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff - * we have less than 1/2 the original window's + * we have less than 1/2 the original window's * worth of data in flight. */ awnd = (tp->snd_nxt - tp->snd_fack) + @@ -2448,6 +2537,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; + int avail; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, @@ -2469,7 +2559,17 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ break; } - (void) tcp_output(tp); + /* + * Only call tcp_output when there + * is new data available to be sent. + * Otherwise we would send pure ACKs. + */ + SOCKBUF_LOCK(&so->so_snd); + avail = so->so_snd.sb_cc - + (tp->snd_nxt - tp->snd_una); + SOCKBUF_UNLOCK(&so->so_snd); + if (avail > 0) + (void) tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && @@ -2529,9 +2629,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, } process_ACK: - INP_INFO_LOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_input: process_ACK ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); @@ -2575,7 +2672,6 @@ process_ACK: tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } - tcp_xmit_bandwidth_limit(tp, th->th_ack); /* * If all outstanding data is acked, stop retransmit @@ -2654,12 +2750,11 @@ process_ACK: * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { - int timeout; - soisdisconnected(so); - timeout = (tcp_fast_finwait2_recycle) ? - tcp_finwait2_timeout : tcp_maxidle; - tcp_timer_activate(tp, TT_2MSL, timeout); + tcp_timer_activate(tp, TT_2MSL, + (tcp_fast_finwait2_recycle ? + tcp_finwait2_timeout : + TP_MAXIDLE(tp))); } tp->t_state = TCPS_FIN_WAIT_2; } @@ -2698,9 +2793,6 @@ process_ACK: } step6: - INP_INFO_LOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_do_segment: step6 ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2786,9 +2878,6 @@ step6: tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ - INP_INFO_LOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_do_segment: dodata ti_locked %d", ti_locked)); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2920,13 +3009,8 @@ dodata: /* XXX */ return; } } - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dodata epilogue ti_locked %d", __func__, - ti_locked); ti_locked = TI_UNLOCKED; #ifdef TCPDEBUG @@ -2955,9 +3039,6 @@ check_delack: return; dropafterack: - KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, - ("tcp_do_segment: dropafterack ti_locked %d", ti_locked)); - /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. @@ -2984,13 +3065,8 @@ dropafterack: tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropafterack epilogue ti_locked %d", __func__, - ti_locked); ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; @@ -3000,12 +3076,8 @@ dropafterack: return; dropwithreset: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); - else - panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); ti_locked = TI_UNLOCKED; if (tp != NULL) { @@ -3016,15 +3088,14 @@ dropwithreset: return; drop: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - else if (ti_locked == TI_WLOCKED) + if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); + ti_locked = TI_UNLOCKED; + } #ifdef INVARIANTS else INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); #endif - ti_locked = TI_UNLOCKED; /* * Drop space held by incoming segment and return. @@ -3048,7 +3119,9 @@ static void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { +#ifdef INET struct ip *ip; +#endif #ifdef INET6 struct ip6_hdr *ip6; #endif @@ -3067,8 +3140,12 @@ tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; /* IPv6 anycast check is done at tcp6_input() */ - } else + } #endif +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET { ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || @@ -3077,6 +3154,7 @@ tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } +#endif /* Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) @@ -3307,10 +3385,8 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt) /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. - * If none, use an mss that can be handled on the outgoing - * interface without forcing IP to fragment; if bigger than - * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES - * to utilize large mbufs. If no route is found, route has no mtu, + * If none, use an mss that can be handled on the outgoing interface + * without forcing IP to fragment. If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening @@ -3331,10 +3407,10 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt) */ void tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, - struct hc_metrics_lite *metricptr, int *mtuflags) + struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap) { - int mss; - u_long maxmtu; + int mss = 0; + u_long maxmtu = 0; struct inpcb *inp = tp->t_inpcb; struct hc_metrics_lite metrics; int origoffer; @@ -3358,14 +3434,19 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, /* Initialize. */ #ifdef INET6 if (isipv6) { - maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags); + maxmtu = tcp_maxmtu6(&inp->inp_inc, cap); tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; - } else + } +#endif +#if defined(INET) && defined(INET6) + else #endif +#ifdef INET { - maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags); + maxmtu = tcp_maxmtu(&inp->inp_inc, cap); tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; } +#endif /* * No route to sender, stay with default mss and return. @@ -3426,14 +3507,19 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, if (!V_path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, V_tcp_v6mssdflt); - } else + } +#endif +#if defined(INET) && defined(INET6) + else #endif +#ifdef INET { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, V_tcp_mssdflt); } +#endif /* * XXX - The above conditional (mss = maxmtu - min_protoh) * probably violates the TCP spec. @@ -3481,13 +3567,6 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; -#if (MCLBYTES & (MCLBYTES - 1)) == 0 - if (mss > MCLBYTES) - mss &= ~(MCLBYTES-1); -#else - if (mss > MCLBYTES) - mss = mss / MCLBYTES * MCLBYTES; -#endif tp->t_maxseg = mss; } @@ -3499,11 +3578,12 @@ tcp_mss(struct tcpcb *tp, int offer) struct inpcb *inp; struct socket *so; struct hc_metrics_lite metrics; - int mtuflags = 0; + struct tcp_ifcap cap; KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); - - tcp_mss_update(tp, offer, -1, &metrics, &mtuflags); + + bzero(&cap, sizeof(cap)); + tcp_mss_update(tp, offer, -1, &metrics, &cap); mss = tp->t_maxseg; inp = tp->t_inpcb; @@ -3517,7 +3597,7 @@ tcp_mss(struct tcpcb *tp, int offer) */ so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); - if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) + if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; @@ -3534,7 +3614,7 @@ tcp_mss(struct tcpcb *tp, int offer) tp->t_maxseg = mss; SOCKBUF_LOCK(&so->so_rcv); - if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) + if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; @@ -3548,8 +3628,10 @@ tcp_mss(struct tcpcb *tp, int offer) SOCKBUF_UNLOCK(&so->so_rcv); /* Check the interface for TSO capabilities. */ - if (mtuflags & CSUM_TSO) + if (cap.ifcap & CSUM_TSO) { tp->t_flags |= TF_TSO; + tp->t_tsomax = cap.tsomax; + } } /* @@ -3569,16 +3651,23 @@ tcp_mssopt(struct in_conninfo *inc) if (inc->inc_flags & INC_ISIPV6) { mss = V_tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc, NULL); - thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); - } else + } +#endif +#if defined(INET) && defined(INET6) + else #endif +#ifdef INET { mss = V_tcp_mssdflt; maxmtu = tcp_maxmtu(inc, NULL); - thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ min_protoh = sizeof(struct tcpiphdr); } +#endif +#if defined(INET6) || defined(INET) + thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ +#endif + if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) diff --git a/freebsd/sys/netinet/tcp_lro.c b/freebsd/sys/netinet/tcp_lro.c index 9f1d13c3..52d92aa0 100644 --- a/freebsd/sys/netinet/tcp_lro.c +++ b/freebsd/sys/netinet/tcp_lro.c @@ -3,8 +3,12 @@ /*- * Copyright (c) 2007, Myricom Inc. * Copyright (c) 2008, Intel Corporation. + * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * + * Portions of this software were developed by Bjoern Zeeb + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -25,359 +29,589 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * $FreeBSD$ */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + #include <rtems/bsd/sys/param.h> #include <sys/systm.h> -#include <sys/endian.h> #include <sys/mbuf.h> #include <sys/kernel.h> #include <sys/socket.h> #include <net/if.h> +#include <net/if_var.h> #include <net/ethernet.h> -#include <net/if_media.h> +#include <net/vnet.h> #include <netinet/in_systm.h> #include <netinet/in.h> +#include <netinet/ip6.h> #include <netinet/ip.h> +#include <netinet/ip_var.h> #include <netinet/tcp.h> #include <netinet/tcp_lro.h> -#include <machine/bus.h> +#include <netinet6/ip6_var.h> + #include <machine/in_cksum.h> +#ifndef LRO_ENTRIES +#define LRO_ENTRIES 8 /* # of LRO entries per RX queue. */ +#endif -static uint16_t do_csum_data(uint16_t *raw, int len) -{ - uint32_t csum; - csum = 0; - while (len > 0) { - csum += *raw; - raw++; - csum += *raw; - raw++; - len -= 4; - } - csum = (csum >> 16) + (csum & 0xffff); - csum = (csum >> 16) + (csum & 0xffff); - return (uint16_t)csum; -} +#define TCP_LRO_UPDATE_CSUM 1 +#ifndef TCP_LRO_UPDATE_CSUM +#define TCP_LRO_INVALID_CSUM 0x0000 +#endif -/* - * Allocate and init the LRO data structures - */ int -tcp_lro_init(struct lro_ctrl *cntl) +tcp_lro_init(struct lro_ctrl *lc) { - struct lro_entry *lro; - int i, error = 0; + struct lro_entry *le; + int error, i; - SLIST_INIT(&cntl->lro_free); - SLIST_INIT(&cntl->lro_active); - - cntl->lro_bad_csum = 0; - cntl->lro_queued = 0; - cntl->lro_flushed = 0; + lc->lro_bad_csum = 0; + lc->lro_queued = 0; + lc->lro_flushed = 0; + lc->lro_cnt = 0; + SLIST_INIT(&lc->lro_free); + SLIST_INIT(&lc->lro_active); + error = 0; for (i = 0; i < LRO_ENTRIES; i++) { - lro = (struct lro_entry *) malloc(sizeof (struct lro_entry), - M_DEVBUF, M_NOWAIT | M_ZERO); - if (lro == NULL) { + le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (le == NULL) { if (i == 0) error = ENOMEM; break; } - cntl->lro_cnt = i; - SLIST_INSERT_HEAD(&cntl->lro_free, lro, next); + lc->lro_cnt = i + 1; + SLIST_INSERT_HEAD(&lc->lro_free, le, next); } return (error); } void -tcp_lro_free(struct lro_ctrl *cntl) +tcp_lro_free(struct lro_ctrl *lc) { - struct lro_entry *entry; + struct lro_entry *le; - while (!SLIST_EMPTY(&cntl->lro_free)) { - entry = SLIST_FIRST(&cntl->lro_free); - SLIST_REMOVE_HEAD(&cntl->lro_free, next); - free(entry, M_DEVBUF); + while (!SLIST_EMPTY(&lc->lro_free)) { + le = SLIST_FIRST(&lc->lro_free); + SLIST_REMOVE_HEAD(&lc->lro_free, next); + free(le, M_DEVBUF); } } +#ifdef TCP_LRO_UPDATE_CSUM +static uint16_t +tcp_lro_csum_th(struct tcphdr *th) +{ + uint32_t ch; + uint16_t *p, l; + + ch = th->th_sum = 0x0000; + l = th->th_off; + p = (uint16_t *)th; + while (l > 0) { + ch += *p; + p++; + ch += *p; + p++; + l--; + } + while (ch > 0xffff) + ch = (ch >> 16) + (ch & 0xffff); + + return (ch & 0xffff); +} + +static uint16_t +tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th, + uint16_t tcp_data_len, uint16_t csum) +{ + uint32_t c; + uint16_t cs; + + c = csum; + + /* Remove length from checksum. */ + switch (le->eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + { + struct ip6_hdr *ip6; + + ip6 = (struct ip6_hdr *)l3hdr; + if (le->append_cnt == 0) + cs = ip6->ip6_plen; + else { + uint32_t cx; + + cx = ntohs(ip6->ip6_plen); + cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0); + } + break; + } +#endif +#ifdef INET + case ETHERTYPE_IP: + { + struct ip *ip4; + + ip4 = (struct ip *)l3hdr; + if (le->append_cnt == 0) + cs = ip4->ip_len; + else { + cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4), + IPPROTO_TCP); + cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr, + htons(cs)); + } + break; + } +#endif + default: + cs = 0; /* Keep compiler happy. */ + } + + cs = ~cs; + c += cs; + + /* Remove TCP header csum. */ + cs = ~tcp_lro_csum_th(th); + c += cs; + while (c > 0xffff) + c = (c >> 16) + (c & 0xffff); + + return (c & 0xffff); +} +#endif + void -tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro) +tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) { - struct ifnet *ifp; - struct ip *ip; - struct tcphdr *tcp; - uint32_t *ts_ptr; - uint32_t tcplen, tcp_csum; - - - if (lro->append_cnt) { - /* incorporate the new len into the ip header and - * re-calculate the checksum */ - ip = lro->ip; - ip->ip_len = htons(lro->len - ETHER_HDR_LEN); - ip->ip_sum = 0; - ip->ip_sum = 0xffff ^ - do_csum_data((uint16_t*)ip, - sizeof (*ip)); - - lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED | - CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - lro->m_head->m_pkthdr.csum_data = 0xffff; - lro->m_head->m_pkthdr.len = lro->len; - - /* incorporate the latest ack into the tcp header */ - tcp = (struct tcphdr *) (ip + 1); - tcp->th_ack = lro->ack_seq; - tcp->th_win = lro->window; - /* incorporate latest timestamp into the tcp header */ - if (lro->timestamp) { - ts_ptr = (uint32_t *)(tcp + 1); - ts_ptr[1] = htonl(lro->tsval); - ts_ptr[2] = lro->tsecr; + + if (le->append_cnt > 0) { + struct tcphdr *th; + uint16_t p_len; + + p_len = htons(le->p_len); + switch (le->eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + { + struct ip6_hdr *ip6; + + ip6 = le->le_ip6; + ip6->ip6_plen = p_len; + th = (struct tcphdr *)(ip6 + 1); + le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | + CSUM_PSEUDO_HDR; + le->p_len += ETHER_HDR_LEN + sizeof(*ip6); + break; } - /* - * update checksum in tcp header by re-calculating the - * tcp pseudoheader checksum, and adding it to the checksum - * of the tcp payload data - */ - tcp->th_sum = 0; - tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN; - tcp_csum = lro->data_csum; - tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, - htons(tcplen + IPPROTO_TCP)); - tcp_csum += do_csum_data((uint16_t*)tcp, - tcp->th_off << 2); - tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16); - tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16); - tcp->th_sum = 0xffff ^ tcp_csum; +#endif +#ifdef INET + case ETHERTYPE_IP: + { + struct ip *ip4; +#ifdef TCP_LRO_UPDATE_CSUM + uint32_t cl; + uint16_t c; +#endif + + ip4 = le->le_ip4; +#ifdef TCP_LRO_UPDATE_CSUM + /* Fix IP header checksum for new length. */ + c = ~ip4->ip_sum; + cl = c; + c = ~ip4->ip_len; + cl += c + p_len; + while (cl > 0xffff) + cl = (cl >> 16) + (cl & 0xffff); + c = cl; + ip4->ip_sum = ~c; +#else + ip4->ip_sum = TCP_LRO_INVALID_CSUM; +#endif + ip4->ip_len = p_len; + th = (struct tcphdr *)(ip4 + 1); + le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | + CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; + le->p_len += ETHER_HDR_LEN; + break; + } +#endif + default: + th = NULL; /* Keep compiler happy. */ + } + le->m_head->m_pkthdr.csum_data = 0xffff; + le->m_head->m_pkthdr.len = le->p_len; + + /* Incorporate the latest ACK into the TCP header. */ + th->th_ack = le->ack_seq; + th->th_win = le->window; + /* Incorporate latest timestamp into the TCP header. */ + if (le->timestamp != 0) { + uint32_t *ts_ptr; + + ts_ptr = (uint32_t *)(th + 1); + ts_ptr[1] = htonl(le->tsval); + ts_ptr[2] = le->tsecr; + } +#ifdef TCP_LRO_UPDATE_CSUM + /* Update the TCP header checksum. */ + le->ulp_csum += p_len; + le->ulp_csum += tcp_lro_csum_th(th); + while (le->ulp_csum > 0xffff) + le->ulp_csum = (le->ulp_csum >> 16) + + (le->ulp_csum & 0xffff); + th->th_sum = (le->ulp_csum & 0xffff); + th->th_sum = ~th->th_sum; +#else + th->th_sum = TCP_LRO_INVALID_CSUM; +#endif } - ifp = cntl->ifp; - (*ifp->if_input)(cntl->ifp, lro->m_head); - cntl->lro_queued += lro->append_cnt + 1; - cntl->lro_flushed++; - lro->m_head = NULL; - lro->timestamp = 0; - lro->append_cnt = 0; - SLIST_INSERT_HEAD(&cntl->lro_free, lro, next); + + (*lc->ifp->if_input)(lc->ifp, le->m_head); + lc->lro_queued += le->append_cnt + 1; + lc->lro_flushed++; + bzero(le, sizeof(*le)); + SLIST_INSERT_HEAD(&lc->lro_free, le, next); } -int -tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum) +#ifdef INET6 +static int +tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6, + struct tcphdr **th) { - struct ether_header *eh; - struct ip *ip; - struct tcphdr *tcp; - uint32_t *ts_ptr; - struct mbuf *m_nxt, *m_tail; - struct lro_entry *lro; - int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len; - int opt_bytes, trim, csum_flags; - uint32_t seq, tmp_csum, device_mtu; - - - eh = mtod(m_head, struct ether_header *); - if (eh->ether_type != htons(ETHERTYPE_IP)) - return 1; - ip = (struct ip *) (eh + 1); - if (ip->ip_p != IPPROTO_TCP) - return 1; - - /* ensure there are no options */ - if ((ip->ip_hl << 2) != sizeof (*ip)) - return -1; - - /* .. and the packet is not fragmented */ - if (ip->ip_off & htons(IP_MF|IP_OFFMASK)) - return -1; - - /* verify that the IP header checksum is correct */ - csum_flags = m_head->m_pkthdr.csum_flags; + + /* XXX-BZ we should check the flow-label. */ + + /* XXX-BZ We do not yet support ext. hdrs. */ + if (ip6->ip6_nxt != IPPROTO_TCP) + return (TCP_LRO_NOT_SUPPORTED); + + /* Find the TCP header. */ + *th = (struct tcphdr *)(ip6 + 1); + + return (0); +} +#endif + +#ifdef INET +static int +tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4, + struct tcphdr **th) +{ + int csum_flags; + uint16_t csum; + + if (ip4->ip_p != IPPROTO_TCP) + return (TCP_LRO_NOT_SUPPORTED); + + /* Ensure there are no options. */ + if ((ip4->ip_hl << 2) != sizeof (*ip4)) + return (TCP_LRO_CANNOT); + + /* .. and the packet is not fragmented. */ + if (ip4->ip_off & htons(IP_MF|IP_OFFMASK)) + return (TCP_LRO_CANNOT); + + /* Legacy IP has a header checksum that needs to be correct. */ + csum_flags = m->m_pkthdr.csum_flags; if (csum_flags & CSUM_IP_CHECKED) { if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { - cntl->lro_bad_csum++; - return -1; + lc->lro_bad_csum++; + return (TCP_LRO_CANNOT); } } else { - tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip)); - if (__predict_false((tmp_csum ^ 0xffff) != 0)) { - cntl->lro_bad_csum++; - return -1; + csum = in_cksum_hdr(ip4); + if (__predict_false((csum) != 0)) { + lc->lro_bad_csum++; + return (TCP_LRO_CANNOT); } } - - /* find the TCP header */ - tcp = (struct tcphdr *) (ip + 1); - - /* Get the TCP checksum if we dont have it */ - if (!csum) - csum = tcp->th_sum; - - /* ensure no bits set besides ack or psh */ - if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0) - return -1; - - /* check for timestamps. Since the only option we handle are - timestamps, we only have to handle the simple case of - aligned timestamps */ - - opt_bytes = (tcp->th_off << 2) - sizeof (*tcp); - tcp_hdr_len = sizeof (*tcp) + opt_bytes; - ts_ptr = (uint32_t *)(tcp + 1); - if (opt_bytes != 0) { - if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) || - (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| - TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))) - return -1; - } - ip_len = ntohs(ip->ip_len); - tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip); - + /* Find the TCP header (we assured there are no IP options). */ + *th = (struct tcphdr *)(ip4 + 1); - /* - * If frame is padded beyond the end of the IP packet, - * then we must trim the extra bytes off the end. - */ - tot_len = m_head->m_pkthdr.len; - trim = tot_len - (ip_len + ETHER_HDR_LEN); - if (trim != 0) { - if (trim < 0) { - /* truncated packet */ - return -1; + return (0); +} +#endif + +int +tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) +{ + struct lro_entry *le; + struct ether_header *eh; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ +#endif +#ifdef INET + struct ip *ip4 = NULL; /* Keep compiler happy. */ +#endif + struct tcphdr *th; + void *l3hdr = NULL; /* Keep compiler happy. */ + uint32_t *ts_ptr; + tcp_seq seq; + int error, ip_len, l; + uint16_t eh_type, tcp_data_len; + + /* We expect a contiguous header [eh, ip, tcp]. */ + + eh = mtod(m, struct ether_header *); + eh_type = ntohs(eh->ether_type); + switch (eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + { + CURVNET_SET(lc->ifp->if_vnet); + if (V_ip6_forwarding != 0) { + /* XXX-BZ stats but changing lro_ctrl is a problem. */ + CURVNET_RESTORE(); + return (TCP_LRO_CANNOT); + } + CURVNET_RESTORE(); + l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); + error = tcp_lro_rx_ipv6(lc, m, ip6, &th); + if (error != 0) + return (error); + tcp_data_len = ntohs(ip6->ip6_plen); + ip_len = sizeof(*ip6) + tcp_data_len; + break; + } +#endif +#ifdef INET + case ETHERTYPE_IP: + { + CURVNET_SET(lc->ifp->if_vnet); + if (V_ipforwarding != 0) { + /* XXX-BZ stats but changing lro_ctrl is a problem. */ + CURVNET_RESTORE(); + return (TCP_LRO_CANNOT); } - m_adj(m_head, -trim); - tot_len = m_head->m_pkthdr.len; + CURVNET_RESTORE(); + l3hdr = ip4 = (struct ip *)(eh + 1); + error = tcp_lro_rx_ipv4(lc, m, ip4, &th); + if (error != 0) + return (error); + ip_len = ntohs(ip4->ip_len); + tcp_data_len = ip_len - sizeof(*ip4); + break; } +#endif + /* XXX-BZ what happens in case of VLAN(s)? */ + default: + return (TCP_LRO_NOT_SUPPORTED); + } + + /* + * If the frame is padded beyond the end of the IP packet, then we must + * trim the extra bytes off. + */ + l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len); + if (l != 0) { + if (l < 0) + /* Truncated packet. */ + return (TCP_LRO_CANNOT); - m_nxt = m_head; - m_tail = NULL; /* -Wuninitialized */ - while (m_nxt != NULL) { - m_tail = m_nxt; - m_nxt = m_tail->m_next; + m_adj(m, -l); } - hlen = ip_len + ETHER_HDR_LEN - tcp_data_len; - seq = ntohl(tcp->th_seq); - - SLIST_FOREACH(lro, &cntl->lro_active, next) { - if (lro->source_port == tcp->th_sport && - lro->dest_port == tcp->th_dport && - lro->source_ip == ip->ip_src.s_addr && - lro->dest_ip == ip->ip_dst.s_addr) { - /* Try to append it */ - - if (__predict_false(seq != lro->next_seq)) { - /* out of order packet */ - SLIST_REMOVE(&cntl->lro_active, lro, - lro_entry, next); - tcp_lro_flush(cntl, lro); - return -1; - } - - if (opt_bytes) { - uint32_t tsval = ntohl(*(ts_ptr + 1)); - /* make sure timestamp values are increasing */ - if (__predict_false(lro->tsval > tsval || - *(ts_ptr + 2) == 0)) { - return -1; - } - lro->tsval = tsval; - lro->tsecr = *(ts_ptr + 2); - } - - lro->next_seq += tcp_data_len; - lro->ack_seq = tcp->th_ack; - lro->window = tcp->th_win; - lro->append_cnt++; - if (tcp_data_len == 0) { - m_freem(m_head); - return 0; - } - /* subtract off the checksum of the tcp header - * from the hardware checksum, and add it to the - * stored tcp data checksum. Byteswap the checksum - * if the total length so far is odd - */ - tmp_csum = do_csum_data((uint16_t*)tcp, - tcp_hdr_len); - csum = csum + (tmp_csum ^ 0xffff); - csum = (csum & 0xffff) + (csum >> 16); - csum = (csum & 0xffff) + (csum >> 16); - if (lro->len & 0x1) { - /* Odd number of bytes so far, flip bytes */ - csum = ((csum << 8) | (csum >> 8)) & 0xffff; - } - csum = csum + lro->data_csum; - csum = (csum & 0xffff) + (csum >> 16); - csum = (csum & 0xffff) + (csum >> 16); - lro->data_csum = csum; - - lro->len += tcp_data_len; - - /* adjust mbuf so that m->m_data points to - the first byte of the payload */ - m_adj(m_head, hlen); - /* append mbuf chain */ - lro->m_tail->m_next = m_head; - /* advance the last pointer */ - lro->m_tail = m_tail; - /* flush packet if required */ - device_mtu = cntl->ifp->if_mtu; - if (lro->len > (65535 - device_mtu)) { - SLIST_REMOVE(&cntl->lro_active, lro, - lro_entry, next); - tcp_lro_flush(cntl, lro); - } - return 0; + /* + * Check TCP header constraints. + */ + /* Ensure no bits set besides ACK or PSH. */ + if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) + return (TCP_LRO_CANNOT); + + /* XXX-BZ We lose a AKC|PUSH flag concatinating multiple segments. */ + /* XXX-BZ Ideally we'd flush on PUSH? */ + + /* + * Check for timestamps. + * Since the only option we handle are timestamps, we only have to + * handle the simple case of aligned timestamps. + */ + l = (th->th_off << 2); + tcp_data_len -= l; + l -= sizeof(*th); + ts_ptr = (uint32_t *)(th + 1); + if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || + (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| + TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) + return (TCP_LRO_CANNOT); + + /* If the driver did not pass in the checksum, set it now. */ + if (csum == 0x0000) + csum = th->th_sum; + + seq = ntohl(th->th_seq); + + /* Try to find a matching previous segment. */ + SLIST_FOREACH(le, &lc->lro_active, next) { + if (le->eh_type != eh_type) + continue; + if (le->source_port != th->th_sport || + le->dest_port != th->th_dport) + continue; + switch (eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + if (bcmp(&le->source_ip6, &ip6->ip6_src, + sizeof(struct in6_addr)) != 0 || + bcmp(&le->dest_ip6, &ip6->ip6_dst, + sizeof(struct in6_addr)) != 0) + continue; + break; +#endif +#ifdef INET + case ETHERTYPE_IP: + if (le->source_ip4 != ip4->ip_src.s_addr || + le->dest_ip4 != ip4->ip_dst.s_addr) + continue; + break; +#endif + } + + /* Flush now if appending will result in overflow. */ + if (le->p_len > (65535 - tcp_data_len)) { + SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); + tcp_lro_flush(lc, le); + break; + } + + /* Try to append the new segment. */ + if (__predict_false(seq != le->next_seq || + (tcp_data_len == 0 && le->ack_seq == th->th_ack))) { + /* Out of order packet or duplicate ACK. */ + SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); + tcp_lro_flush(lc, le); + return (TCP_LRO_CANNOT); } + + if (l != 0) { + uint32_t tsval = ntohl(*(ts_ptr + 1)); + /* Make sure timestamp values are increasing. */ + /* XXX-BZ flip and use TSTMP_GEQ macro for this? */ + if (__predict_false(le->tsval > tsval || + *(ts_ptr + 2) == 0)) + return (TCP_LRO_CANNOT); + le->tsval = tsval; + le->tsecr = *(ts_ptr + 2); + } + + le->next_seq += tcp_data_len; + le->ack_seq = th->th_ack; + le->window = th->th_win; + le->append_cnt++; + +#ifdef TCP_LRO_UPDATE_CSUM + le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th, + tcp_data_len, ~csum); +#endif + + if (tcp_data_len == 0) { + m_freem(m); + return (0); + } + + le->p_len += tcp_data_len; + + /* + * Adjust the mbuf so that m_data points to the first byte of + * the ULP payload. Adjust the mbuf to avoid complications and + * append new segment to existing mbuf chain. + */ + m_adj(m, m->m_pkthdr.len - tcp_data_len); + m->m_flags &= ~M_PKTHDR; + + le->m_tail->m_next = m; + le->m_tail = m_last(m); + + /* + * If a possible next full length packet would cause an + * overflow, pro-actively flush now. + */ + if (le->p_len > (65535 - lc->ifp->if_mtu)) { + SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); + tcp_lro_flush(lc, le); + } + + return (0); } - if (SLIST_EMPTY(&cntl->lro_free)) - return -1; - - /* start a new chain */ - lro = SLIST_FIRST(&cntl->lro_free); - SLIST_REMOVE_HEAD(&cntl->lro_free, next); - SLIST_INSERT_HEAD(&cntl->lro_active, lro, next); - lro->source_port = tcp->th_sport; - lro->dest_port = tcp->th_dport; - lro->source_ip = ip->ip_src.s_addr; - lro->dest_ip = ip->ip_dst.s_addr; - lro->next_seq = seq + tcp_data_len; - lro->mss = tcp_data_len; - lro->ack_seq = tcp->th_ack; - lro->window = tcp->th_win; - - /* save the checksum of just the TCP payload by - * subtracting off the checksum of the TCP header from - * the entire hardware checksum - * Since IP header checksum is correct, checksum over - * the IP header is -0. Substracting -0 is unnecessary. - */ - tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len); - csum = csum + (tmp_csum ^ 0xffff); - csum = (csum & 0xffff) + (csum >> 16); - csum = (csum & 0xffff) + (csum >> 16); - lro->data_csum = csum; - - lro->ip = ip; - /* record timestamp if it is present */ - if (opt_bytes) { - lro->timestamp = 1; - lro->tsval = ntohl(*(ts_ptr + 1)); - lro->tsecr = *(ts_ptr + 2); + /* Try to find an empty slot. */ + if (SLIST_EMPTY(&lc->lro_free)) + return (TCP_LRO_CANNOT); + + /* Start a new segment chain. */ + le = SLIST_FIRST(&lc->lro_free); + SLIST_REMOVE_HEAD(&lc->lro_free, next); + SLIST_INSERT_HEAD(&lc->lro_active, le, next); + + /* Start filling in details. */ + switch (eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + le->le_ip6 = ip6; + le->source_ip6 = ip6->ip6_src; + le->dest_ip6 = ip6->ip6_dst; + le->eh_type = eh_type; + le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); + break; +#endif +#ifdef INET + case ETHERTYPE_IP: + le->le_ip4 = ip4; + le->source_ip4 = ip4->ip_src.s_addr; + le->dest_ip4 = ip4->ip_dst.s_addr; + le->eh_type = eh_type; + le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; + break; +#endif + } + le->source_port = th->th_sport; + le->dest_port = th->th_dport; + + le->next_seq = seq + tcp_data_len; + le->ack_seq = th->th_ack; + le->window = th->th_win; + if (l != 0) { + le->timestamp = 1; + le->tsval = ntohl(*(ts_ptr + 1)); + le->tsecr = *(ts_ptr + 2); } - lro->len = tot_len; - lro->m_head = m_head; - lro->m_tail = m_tail; - return 0; + +#ifdef TCP_LRO_UPDATE_CSUM + /* + * Do not touch the csum of the first packet. However save the + * "adjusted" checksum of just the source and destination addresses, + * the next header and the TCP payload. The length and TCP header + * parts may change, so we remove those from the saved checksum and + * re-add with final values on tcp_lro_flush() if needed. + */ + KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n", + __func__, le, le->ulp_csum)); + + le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, + ~csum); + th->th_sum = csum; /* Restore checksum on first packet. */ +#endif + + le->m_head = m; + le->m_tail = m_last(m); + + return (0); } + +/* end */ diff --git a/freebsd/sys/netinet/tcp_lro.h b/freebsd/sys/netinet/tcp_lro.h index 7e498871..b3a50179 100644 --- a/freebsd/sys/netinet/tcp_lro.h +++ b/freebsd/sys/netinet/tcp_lro.h @@ -30,31 +30,46 @@ #ifndef _TCP_LRO_H_ #define _TCP_LRO_H_ -struct lro_entry; struct lro_entry { - SLIST_ENTRY(lro_entry) next; - struct mbuf *m_head; - struct mbuf *m_tail; - int timestamp; - struct ip *ip; - uint32_t tsval; - uint32_t tsecr; - uint32_t source_ip; - uint32_t dest_ip; - uint32_t next_seq; - uint32_t ack_seq; - uint32_t len; - uint32_t data_csum; - uint16_t window; - uint16_t source_port; - uint16_t dest_port; - uint16_t append_cnt; - uint16_t mss; - + SLIST_ENTRY(lro_entry) next; + struct mbuf *m_head; + struct mbuf *m_tail; + union { + struct ip *ip4; + struct ip6_hdr *ip6; + } leip; + union { + in_addr_t s_ip4; + struct in6_addr s_ip6; + } lesource; + union { + in_addr_t d_ip4; + struct in6_addr d_ip6; + } ledest; + uint16_t source_port; + uint16_t dest_port; + uint16_t eh_type; /* EthernetHeader type. */ + uint16_t append_cnt; + uint32_t p_len; /* IP header payload length. */ + uint32_t ulp_csum; /* TCP, etc. checksum. */ + uint32_t next_seq; /* tcp_seq */ + uint32_t ack_seq; /* tcp_seq */ + uint32_t tsval; + uint32_t tsecr; + uint16_t window; + uint16_t timestamp; /* flag, not a TCP hdr field. */ }; SLIST_HEAD(lro_head, lro_entry); +#define le_ip4 leip.ip4 +#define le_ip6 leip.ip6 +#define source_ip4 lesource.s_ip4 +#define dest_ip4 ledest.d_ip4 +#define source_ip6 lesource.s_ip6 +#define dest_ip6 ledest.d_ip6 + +/* NB: This is part of driver structs. */ struct lro_ctrl { struct ifnet *ifp; int lro_queued; @@ -66,13 +81,12 @@ struct lro_ctrl { struct lro_head lro_free; }; - int tcp_lro_init(struct lro_ctrl *); void tcp_lro_free(struct lro_ctrl *); void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *); int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t); -/* Number of LRO entries - these are per rx queue */ -#define LRO_ENTRIES 8 +#define TCP_LRO_CANNOT -1 +#define TCP_LRO_NOT_SUPPORTED 1 #endif /* _TCP_LRO_H_ */ diff --git a/freebsd/sys/netinet/tcp_offload.c b/freebsd/sys/netinet/tcp_offload.c index 93b7d8de..cd41edab 100644 --- a/freebsd/sys/netinet/tcp_offload.c +++ b/freebsd/sys/netinet/tcp_offload.c @@ -1,147 +1,178 @@ #include <machine/rtems-bsd-kernel-space.h> /*- - * Copyright (c) 2007, Chelsio Inc. + * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Neither the name of the Chelsio Corporation nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <rtems/bsd/local/opt_inet.h> + #include <rtems/bsd/sys/param.h> #include <sys/systm.h> #include <rtems/bsd/sys/types.h> -#include <sys/malloc.h> -#include <sys/kernel.h> -#include <sys/sysctl.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/socketvar.h> - +#include <sys/sockopt.h> #include <net/if.h> -#include <net/if_types.h> -#include <net/if_var.h> #include <net/route.h> -#include <net/vnet.h> - #include <netinet/in.h> -#include <netinet/in_systm.h> #include <netinet/in_pcb.h> #include <netinet/tcp.h> #include <netinet/tcp_var.h> #include <netinet/tcp_offload.h> -#include <netinet/toedev.h> +#define TCPOUTFLAGS +#include <netinet/tcp_fsm.h> +#include <netinet/toecore.h> -uint32_t toedev_registration_count; +int registered_toedevs; +/* + * Provide an opportunity for a TOE driver to offload. + */ int tcp_offload_connect(struct socket *so, struct sockaddr *nam) { struct ifnet *ifp; - struct toedev *tdev; + struct toedev *tod; struct rtentry *rt; - int error; - - if (toedev_registration_count == 0) - return (EINVAL); - - /* - * Look up the route used for the connection to - * determine if it uses an interface capable of - * offloading the connection. - */ - rt = rtalloc1(nam, 0 /*report*/, 0 /*ignflags*/); - if (rt) + int error = EOPNOTSUPP; + + INP_WLOCK_ASSERT(sotoinpcb(so)); + KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6, + ("%s: called with sa_family %d", __func__, nam->sa_family)); + + if (registered_toedevs == 0) + return (error); + + rt = rtalloc1(nam, 0, 0); + if (rt) RT_UNLOCK(rt); - else + else return (EHOSTUNREACH); ifp = rt->rt_ifp; - if ((ifp->if_capenable & IFCAP_TOE) == 0) { - error = EINVAL; - goto fail; - } - - tdev = TOEDEV(ifp); - if (tdev == NULL) { - error = EPERM; - goto fail; - } - - if (tdev->tod_can_offload(tdev, so) == 0) { - error = EPERM; - goto fail; - } - - return (tdev->tod_connect(tdev, so, rt, nam)); -fail: + + if (nam->sa_family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4)) + goto done; + if (nam->sa_family == AF_INET6 && !(ifp->if_capenable & IFCAP_TOE6)) + goto done; + + tod = TOEDEV(ifp); + if (tod != NULL) + error = tod->tod_connect(tod, so, rt, nam); +done: RTFREE(rt); return (error); } +void +tcp_offload_listen_start(struct tcpcb *tp) +{ -/* - * This file contains code as a short-term staging area before it is moved in - * to sys/netinet/tcp_offload.c - */ + INP_WLOCK_ASSERT(tp->t_inpcb); + + EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp); +} void -tcp_offload_twstart(struct tcpcb *tp) +tcp_offload_listen_stop(struct tcpcb *tp) { - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(tp->t_inpcb); - tcp_twstart(tp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + + EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp); } -struct tcpcb * -tcp_offload_close(struct tcpcb *tp) +void +tcp_offload_input(struct tcpcb *tp, struct mbuf *m) { + struct toedev *tod = tp->tod; - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(tp->t_inpcb); - tp = tcp_close(tp); - INP_INFO_WUNLOCK(&V_tcbinfo); - if (tp) - INP_WUNLOCK(tp->t_inpcb); + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); - return (tp); + tod->tod_input(tod, tp, m); } -struct tcpcb * -tcp_offload_drop(struct tcpcb *tp, int error) +int +tcp_offload_output(struct tcpcb *tp) { + struct toedev *tod = tp->tod; + int error, flags; - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(tp->t_inpcb); - tp = tcp_drop(tp, error); - INP_INFO_WUNLOCK(&V_tcbinfo); - if (tp) - INP_WUNLOCK(tp->t_inpcb); + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); - return (tp); + flags = tcp_outflags[tp->t_state]; + + if (flags & TH_RST) { + /* XXX: avoid repeated calls like we do for FIN */ + error = tod->tod_send_rst(tod, tp); + } else if ((flags & TH_FIN || tp->t_flags & TF_NEEDFIN) && + (tp->t_flags & TF_SENTFIN) == 0) { + error = tod->tod_send_fin(tod, tp); + if (error == 0) + tp->t_flags |= TF_SENTFIN; + } else + error = tod->tod_output(tod, tp); + + return (error); +} + +void +tcp_offload_rcvd(struct tcpcb *tp) +{ + struct toedev *tod = tp->tod; + + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + tod->tod_rcvd(tod, tp); +} + +void +tcp_offload_ctloutput(struct tcpcb *tp, int sopt_dir, int sopt_name) +{ + struct toedev *tod = tp->tod; + + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + tod->tod_ctloutput(tod, tp, sopt_dir, sopt_name); } +void +tcp_offload_detach(struct tcpcb *tp) +{ + struct toedev *tod = tp->tod; + + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + tod->tod_pcb_detach(tod, tp); +} diff --git a/freebsd/sys/netinet/tcp_offload.h b/freebsd/sys/netinet/tcp_offload.h index 313185f6..a0523665 100644 --- a/freebsd/sys/netinet/tcp_offload.h +++ b/freebsd/sys/netinet/tcp_offload.h @@ -1,30 +1,30 @@ /*- - * Copyright (c) 2007, Chelsio Inc. + * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Neither the name of the Chelsio Corporation nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * * $FreeBSD$ + * */ #ifndef _NETINET_TCP_OFFLOAD_H_ @@ -34,321 +34,15 @@ #error "no user-serviceable parts inside" #endif -/* - * A driver publishes that it provides offload services - * by setting IFCAP_TOE in the ifnet. The offload connect - * will bypass any further work if the interface that a - * connection would use does not support TCP offload. - * - * The TOE API assumes that the tcp offload engine can offload the - * the entire connection from set up to teardown, with some provision - * being made to allowing the software stack to handle time wait. If - * the device does not meet these criteria, it is the driver's responsibility - * to overload the functions that it needs to in tcp_usrreqs and make - * its own calls to tcp_output if it needs to do so. - * - * There is currently no provision for the device advertising the congestion - * control algorithms it supports as there is currently no API for querying - * an operating system for the protocols that it has loaded. This is a desirable - * future extension. - * - * - * - * It is assumed that individuals deploying TOE will want connections - * to be offloaded without software changes so all connections on an - * interface providing TOE are offloaded unless the SO_NO_OFFLOAD - * flag is set on the socket. - * - * - * The toe_usrreqs structure constitutes the TOE driver's - * interface to the TCP stack for functionality that doesn't - * interact directly with userspace. If one wants to provide - * (optional) functionality to do zero-copy to/from - * userspace one still needs to override soreceive/sosend - * with functions that fault in and pin the user buffers. - * - * + tu_send - * - tells the driver that new data may have been added to the - * socket's send buffer - the driver should not fail if the - * buffer is in fact unchanged - * - the driver is responsible for providing credits (bytes in the send window) - * back to the socket by calling sbdrop() as segments are acknowledged. - * - The driver expects the inpcb lock to be held - the driver is expected - * not to drop the lock. Hence the driver is not allowed to acquire the - * pcbinfo lock during this call. - * - * + tu_rcvd - * - returns credits to the driver and triggers window updates - * to the peer (a credit as used here is a byte in the peer's receive window) - * - the driver is expected to determine how many bytes have been - * consumed and credit that back to the card so that it can grow - * the window again by maintaining its own state between invocations. - * - In principle this could be used to shrink the window as well as - * grow the window, although it is not used for that now. - * - this function needs to correctly handle being called any number of - * times without any bytes being consumed from the receive buffer. - * - The driver expects the inpcb lock to be held - the driver is expected - * not to drop the lock. Hence the driver is not allowed to acquire the - * pcbinfo lock during this call. - * - * + tu_disconnect - * - tells the driver to send FIN to peer - * - driver is expected to send the remaining data and then do a clean half close - * - disconnect implies at least half-close so only send, reset, and detach - * are legal - * - the driver is expected to handle transition through the shutdown - * state machine and allow the stack to support SO_LINGER. - * - The driver expects the inpcb lock to be held - the driver is expected - * not to drop the lock. Hence the driver is not allowed to acquire the - * pcbinfo lock during this call. - * - * + tu_reset - * - closes the connection and sends a RST to peer - * - driver is expectd to trigger an RST and detach the toepcb - * - no further calls are legal after reset - * - The driver expects the inpcb lock to be held - the driver is expected - * not to drop the lock. Hence the driver is not allowed to acquire the - * pcbinfo lock during this call. - * - * The following fields in the tcpcb are expected to be referenced by the driver: - * + iss - * + rcv_nxt - * + rcv_wnd - * + snd_isn - * + snd_max - * + snd_nxt - * + snd_una - * + t_flags - * + t_inpcb - * + t_maxseg - * + t_toe - * - * The following fields in the inpcb are expected to be referenced by the driver: - * + inp_lport - * + inp_fport - * + inp_laddr - * + inp_fport - * + inp_socket - * + inp_ip_tos - * - * The following fields in the socket are expected to be referenced by the - * driver: - * + so_comp - * + so_error - * + so_linger - * + so_options - * + so_rcv - * + so_snd - * + so_state - * + so_timeo - * - * These functions all return 0 on success and can return the following errors - * as appropriate: - * + EPERM: - * + ENOBUFS: memory allocation failed - * + EMSGSIZE: MTU changed during the call - * + EHOSTDOWN: - * + EHOSTUNREACH: - * + ENETDOWN: - * * ENETUNREACH: the peer is no longer reachable - * - * + tu_detach - * - tells driver that the socket is going away so disconnect - * the toepcb and free appropriate resources - * - allows the driver to cleanly handle the case of connection state - * outliving the socket - * - no further calls are legal after detach - * - the driver is expected to provide its own synchronization between - * detach and receiving new data. - * - * + tu_syncache_event - * - even if it is not actually needed, the driver is expected to - * call syncache_add for the initial SYN and then syncache_expand - * for the SYN,ACK - * - tells driver that a connection either has not been added or has - * been dropped from the syncache - * - the driver is expected to maintain state that lives outside the - * software stack so the syncache needs to be able to notify the - * toe driver that the software stack is not going to create a connection - * for a received SYN - * - The driver is responsible for any synchronization required between - * the syncache dropping an entry and the driver processing the SYN,ACK. - * - */ -struct toe_usrreqs { - int (*tu_send)(struct tcpcb *tp); - int (*tu_rcvd)(struct tcpcb *tp); - int (*tu_disconnect)(struct tcpcb *tp); - int (*tu_reset)(struct tcpcb *tp); - void (*tu_detach)(struct tcpcb *tp); - void (*tu_syncache_event)(int event, void *toep); -}; - -/* - * Proxy for struct tcpopt between TOE drivers and TCP functions. - */ -struct toeopt { - u_int64_t to_flags; /* see tcpopt in tcp_var.h */ - u_int16_t to_mss; /* maximum segment size */ - u_int8_t to_wscale; /* window scaling */ +extern int registered_toedevs; - u_int8_t _pad1; /* explicit pad for 64bit alignment */ - u_int32_t _pad2; /* explicit pad for 64bit alignment */ - u_int64_t _pad3[4]; /* TBD */ -}; +int tcp_offload_connect(struct socket *, struct sockaddr *); +void tcp_offload_listen_start(struct tcpcb *); +void tcp_offload_listen_stop(struct tcpcb *); +void tcp_offload_input(struct tcpcb *, struct mbuf *); +int tcp_offload_output(struct tcpcb *); +void tcp_offload_rcvd(struct tcpcb *); +void tcp_offload_ctloutput(struct tcpcb *, int, int); +void tcp_offload_detach(struct tcpcb *); -#define TOE_SC_ENTRY_PRESENT 1 /* 4-tuple already present */ -#define TOE_SC_DROP 2 /* connection was timed out */ - -/* - * Because listen is a one-to-many relationship (a socket can be listening - * on all interfaces on a machine some of which may be using different TCP - * offload devices), listen uses a publish/subscribe mechanism. The TCP - * offload driver registers a listen notification function with the stack. - * When a listen socket is created all TCP offload devices are notified - * so that they can do the appropriate set up to offload connections on the - * port to which the socket is bound. When the listen socket is closed, - * the offload devices are notified so that they will stop listening on that - * port and free any associated resources as well as sending RSTs on any - * connections in the SYN_RCVD state. - * - */ - -typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *); -typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *); - -EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn); -EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn); - -/* - * Check if the socket can be offloaded by the following steps: - * - determine the egress interface - * - check the interface for TOE capability and TOE is enabled - * - check if the device has resources to offload the connection - */ -int tcp_offload_connect(struct socket *so, struct sockaddr *nam); - -/* - * The tcp_output_* routines are wrappers around the toe_usrreqs calls - * which trigger packet transmission. In the non-offloaded case they - * translate to tcp_output. The tcp_offload_* routines notify TOE - * of specific events. I the non-offloaded case they are no-ops. - * - * Listen is a special case because it is a 1 to many relationship - * and there can be more than one offload driver in the system. - */ - -/* - * Connection is offloaded - */ -#define tp_offload(tp) ((tp)->t_flags & TF_TOE) - -/* - * hackish way of allowing this file to also be included by TOE - * which needs to be kept ignorant of socket implementation details - */ -#ifdef _SYS_SOCKETVAR_H_ -/* - * The socket has not been marked as "do not offload" - */ -#define SO_OFFLOADABLE(so) ((so->so_options & SO_NO_OFFLOAD) == 0) - -static __inline int -tcp_output_connect(struct socket *so, struct sockaddr *nam) -{ - struct tcpcb *tp = sototcpcb(so); - int error; - - /* - * If offload has been disabled for this socket or the - * connection cannot be offloaded just call tcp_output - * to start the TCP state machine. - */ -#ifndef TCP_OFFLOAD_DISABLE - if (!SO_OFFLOADABLE(so) || (error = tcp_offload_connect(so, nam)) != 0) -#endif - error = tcp_output(tp); - return (error); -} - -static __inline int -tcp_output_send(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - return (tp->t_tu->tu_send(tp)); -#endif - return (tcp_output(tp)); -} - -static __inline int -tcp_output_rcvd(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - return (tp->t_tu->tu_rcvd(tp)); #endif - return (tcp_output(tp)); -} - -static __inline int -tcp_output_disconnect(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - return (tp->t_tu->tu_disconnect(tp)); -#endif - return (tcp_output(tp)); -} - -static __inline int -tcp_output_reset(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - return (tp->t_tu->tu_reset(tp)); -#endif - return (tcp_output(tp)); -} - -static __inline void -tcp_offload_detach(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (tp_offload(tp)) - tp->t_tu->tu_detach(tp); -#endif -} - -static __inline void -tcp_offload_listen_open(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket)) - EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp); -#endif -} - -static __inline void -tcp_offload_listen_close(struct tcpcb *tp) -{ - -#ifndef TCP_OFFLOAD_DISABLE - EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp); -#endif -} -#undef SO_OFFLOADABLE -#endif /* _SYS_SOCKETVAR_H_ */ -#undef tp_offload - -void tcp_offload_twstart(struct tcpcb *tp); -struct tcpcb *tcp_offload_close(struct tcpcb *tp); -struct tcpcb *tcp_offload_drop(struct tcpcb *tp, int error); - -#endif /* _NETINET_TCP_OFFLOAD_H_ */ diff --git a/freebsd/sys/netinet/tcp_output.c b/freebsd/sys/netinet/tcp_output.c index c73fe099..6215c4e2 100644 --- a/freebsd/sys/netinet/tcp_output.c +++ b/freebsd/sys/netinet/tcp_output.c @@ -77,6 +77,9 @@ __FBSDID("$FreeBSD$"); #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif +#ifdef TCP_OFFLOAD +#include <netinet/tcp_offload.h> +#endif #ifdef IPSEC #include <netipsec/ipsec.h> @@ -86,31 +89,22 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> -#ifdef notyet -extern struct mbuf *m_copypack(); -#endif - VNET_DEFINE(int, path_mtu_discovery) = 1; SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, &VNET_NAME(path_mtu_discovery), 1, "Enable Path MTU Discovery"); -VNET_DEFINE(int, ss_fltsz) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, - &VNET_NAME(ss_fltsz), 1, - "Slow start flight size"); - -VNET_DEFINE(int, ss_fltsz_local) = 4; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, - CTLFLAG_RW, &VNET_NAME(ss_fltsz_local), 1, - "Slow start flight size for local networks"); - VNET_DEFINE(int, tcp_do_tso) = 1; #define V_tcp_do_tso VNET(tcp_do_tso) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); +VNET_DEFINE(int, tcp_sendspace) = 1024*32; +#define V_tcp_sendspace VNET(tcp_sendspace) +SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, + &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); + VNET_DEFINE(int, tcp_do_autosndbuf) = 1; #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, @@ -123,7 +117,7 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); -VNET_DEFINE(int, tcp_autosndbuf_max) = 256*1024; +VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, @@ -175,7 +169,7 @@ tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; - int off, flags, error; + int off, flags, error = 0; /* Keep compiler happy */ struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; @@ -188,7 +182,7 @@ tcp_output(struct tcpcb *tp) int idle, sendalot; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; - int tso; + int tso, mtu; struct tcpopt to; #if 0 int maxburst = TCP_MAXBURST; @@ -202,6 +196,11 @@ tcp_output(struct tcpcb *tp) INP_WLOCK_ASSERT(tp->t_inpcb); +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) + return (tcp_offload_output(tp)); +#endif + /* * Determine length of data that should be transmitted, * and flags that will be used. @@ -229,9 +228,9 @@ again: tcp_sack_adjust(tp); sendalot = 0; tso = 0; + mtu = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); - sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* @@ -472,9 +471,8 @@ after_sack_rexmit: } /* - * Truncate to the maximum segment length or enable TCP Segmentation - * Offloading (if supported by hardware) and ensure that FIN is removed - * if the length no longer contains the last data byte. + * Decide if we can use TCP Segmentation Offloading (if supported by + * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and @@ -482,10 +480,6 @@ after_sack_rexmit: * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. - * - * The length of TSO bursts is limited to TCP_MAXWIN. That limit and - * removal of FIN (if not already catched here) are handled later after - * the exact length of the TCP options are known. */ #ifdef IPSEC /* @@ -494,22 +488,15 @@ after_sack_rexmit: */ ipsec_optlen = ipsec_hdrsiz_tcp(tp); #endif - if (len > tp->t_maxseg) { - if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && - ((tp->t_flags & TF_SIGNATURE) == 0) && - tp->rcv_numsacks == 0 && sack_rxmit == 0 && - tp->t_inpcb->inp_options == NULL && - tp->t_inpcb->in6p_options == NULL + if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && + ((tp->t_flags & TF_SIGNATURE) == 0) && + tp->rcv_numsacks == 0 && sack_rxmit == 0 && #ifdef IPSEC - && ipsec_optlen == 0 + ipsec_optlen == 0 && #endif - ) { - tso = 1; - } else { - len = tp->t_maxseg; - sendalot = 1; - } - } + tp->t_inpcb->inp_options == NULL && + tp->t_inpcb->in6p_options == NULL) + tso = 1; if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) @@ -560,19 +547,39 @@ after_sack_rexmit: } /* - * Compare available window to amount of window - * known to peer (as advertised window less - * next expected input). If the difference is at least two - * max size segments, or at least 50% of the maximum possible - * window, then want to send a window update to peer. - * Skip this if the connection is in T/TCP half-open state. - * Don't send pure window updates when the peer has closed - * the connection and won't ever send more data. + * Sending of standalone window updates. + * + * Window updates are important when we close our window due to a + * full socket buffer and are opening it again after the application + * reads data from it. Once the window has opened again and the + * remote end starts to send again the ACK clock takes over and + * provides the most current window information. + * + * We must avoid the silly window syndrome whereas every read + * from the receive buffer, no matter how small, causes a window + * update to be sent. We also should avoid sending a flurry of + * window updates when the socket buffer had queued a lot of data + * and the application is doing small reads. + * + * Prevent a flurry of pointless window updates by only sending + * an update when we can increase the advertized window by more + * than 1/4th of the socket buffer capacity. When the buffer is + * getting full or is very small be more aggressive and send an + * update whenever we can increase by two mss sized segments. + * In all other situations the ACK's to new incoming data will + * carry further window increases. + * + * Don't send an independent window update if a delayed + * ACK is pending (it will get piggy-backed on it) or the + * remote side already has done a half-close and won't send + * more data. Skip this if the connection is in T/TCP + * half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && + !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* - * "adv" is the amount we can increase the window, + * "adv" is the amount we could increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ @@ -592,9 +599,11 @@ after_sack_rexmit: */ if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) goto dontupdate; - if (adv >= (long) (2 * tp->t_maxseg)) - goto send; - if (2 * adv >= (long) so->so_rcv.sb_hiwat) + + if (adv >= (long)(2 * tp->t_maxseg) && + (adv >= (long)(so->so_rcv.sb_hiwat / 4) || + recwin <= (long)(so->so_rcv.sb_hiwat / 8) || + so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) goto send; } dontupdate: @@ -680,7 +689,7 @@ send: hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif - hdrlen = sizeof (struct tcpiphdr); + hdrlen = sizeof (struct tcpiphdr); /* * Compute options for segment. @@ -753,28 +762,54 @@ send: * bump the packet length beyond the t_maxopd length. * Clear the FIN bit because we cut off the tail of * the segment. - * - * When doing TSO limit a burst to TCP_MAXWIN minus the - * IP, TCP and Options length to keep ip->ip_len from - * overflowing. Prevent the last segment from being - * fractional thus making them all equal sized and set - * the flag to continue sending. TSO is disabled when - * IP options or IPSEC are present. */ if (len + optlen + ipoptlen > tp->t_maxopd) { flags &= ~TH_FIN; + if (tso) { - if (len > TCP_MAXWIN - hdrlen - optlen) { - len = TCP_MAXWIN - hdrlen - optlen; - len = len - (len % (tp->t_maxopd - optlen)); + KASSERT(ipoptlen == 0, + ("%s: TSO can't do IP options", __func__)); + + /* + * Limit a burst to t_tsomax minus IP, + * TCP and options length to keep ip->ip_len + * from overflowing or exceeding the maximum + * length allowed by the network interface. + */ + if (len > tp->t_tsomax - hdrlen) { + len = tp->t_tsomax - hdrlen; + sendalot = 1; + } + + /* + * Prevent the last segment from being + * fractional unless the send sockbuf can + * be emptied. + */ + if (sendalot && off + len < so->so_snd.sb_cc) { + len -= len % (tp->t_maxopd - optlen); sendalot = 1; - } else if (tp->t_flags & TF_NEEDFIN) + } + + /* + * Send the FIN in a separate segment + * after the bulk sending is done. + * We don't trust the TSO implementations + * to clear the FIN flag on all but the + * last segment. + */ + if (tp->t_flags & TF_NEEDFIN) sendalot = 1; + } else { len = tp->t_maxopd - optlen - ipoptlen; sendalot = 1; } - } + } else + tso = 0; + + KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, + ("%s: len > IP_MAXPACKET", __func__)); /*#ifdef DIAGNOSTIC*/ #ifdef INET6 @@ -810,19 +845,6 @@ send: TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); } -#ifdef notyet - if ((m = m_copypack(so->so_snd.sb_mb, off, - (int)len, max_linkhdr + hdrlen)) == 0) { - SOCKBUF_UNLOCK(&so->so_snd); - error = ENOBUFS; - goto out; - } - /* - * m_copypack left space for our hdr; use it. - */ - m->m_len += hdrlen; - m->m_data -= hdrlen; -#else MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(&so->so_snd); @@ -862,7 +884,7 @@ send: goto out; } } -#endif + /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only @@ -1059,19 +1081,24 @@ send: * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 - if (isipv6) + if (isipv6) { /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ - th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), - sizeof(struct tcphdr) + optlen + len); + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + + optlen + len, IPPROTO_TCP, 0); + } +#endif +#if defined(INET6) && defined(INET) else -#endif /* INET6 */ +#endif +#ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; - m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); @@ -1079,6 +1106,7 @@ send: KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } +#endif /* * Enable TSO and specify the size of the segments. @@ -1092,6 +1120,16 @@ send: m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; } +#ifdef IPSEC + KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), + ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u", + __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); +#else + KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), + ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u", + __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); +#endif + /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. @@ -1183,7 +1221,7 @@ timer: #endif ipov->ih_len = save; } -#endif +#endif /* TCPDEBUG */ /* * Fill in IP length and desired time to live and @@ -1197,6 +1235,9 @@ timer: */ #ifdef INET6 if (isipv6) { + struct route_in6 ro; + + bzero(&ro, sizeof(ro)); /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. @@ -1206,13 +1247,23 @@ timer: ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); /* TODO: IPv6 IP6TOS_ECT bit on */ - error = ip6_output(m, - tp->t_inpcb->in6p_outputopts, NULL, - ((so->so_options & SO_DONTROUTE) ? - IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); - } else + error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro, + ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), + NULL, NULL, tp->t_inpcb); + + if (error == EMSGSIZE && ro.ro_rt != NULL) + mtu = ro.ro_rt->rt_rmx.rmx_mtu; + RO_RTFREE(&ro); + } #endif /* INET6 */ +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET { + struct route ro; + + bzero(&ro, sizeof(ro)); ip->ip_len = m->m_pkthdr.len; #ifdef INET6 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) @@ -1229,10 +1280,15 @@ timer: if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) ip->ip_off |= IP_DF; - error = ip_output(m, tp->t_inpcb->inp_options, NULL, + error = ip_output(m, tp->t_inpcb->inp_options, &ro, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); + + if (error == EMSGSIZE && ro.ro_rt != NULL) + mtu = ro.ro_rt->rt_rmx.rmx_mtu; + RO_RTFREE(&ro); } +#endif /* INET */ if (error) { /* @@ -1277,21 +1333,18 @@ out: * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. - * - * tcp_mtudisc() will find out the new MTU and as - * its last action, initiate retransmission, so it - * is important to not do so here. - * * If TSO was active we either got an interface * without TSO capabilits or TSO was turned off. - * Disable it for this connection as too and - * immediatly retry with MSS sized segments generated - * by this function. + * If we obtained mtu from ip_output() then update + * it and try again. */ if (tso) tp->t_flags &= ~TF_TSO; - tcp_mtudisc(tp->t_inpcb, -1); - return (0); + if (mtu != 0) { + tcp_mss_update(tp, -1, mtu, NULL, NULL); + goto again; + } + return (error); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: diff --git a/freebsd/sys/netinet/tcp_reass.c b/freebsd/sys/netinet/tcp_reass.c index 6b2605ce..aebda9db 100644 --- a/freebsd/sys/netinet/tcp_reass.c +++ b/freebsd/sys/netinet/tcp_reass.c @@ -76,24 +76,19 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_debug.h> #endif /* TCPDEBUG */ -static int tcp_reass_sysctl_maxseg(SYSCTL_HANDLER_ARGS); static int tcp_reass_sysctl_qsize(SYSCTL_HANDLER_ARGS); -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, +static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); static VNET_DEFINE(int, tcp_reass_maxseg) = 0; #define V_tcp_reass_maxseg VNET(tcp_reass_maxseg) -SYSCTL_VNET_PROC(_net_inet_tcp_reass, OID_AUTO, maxsegments, - CTLTYPE_INT | CTLFLAG_RDTUN, - &VNET_NAME(tcp_reass_maxseg), 0, &tcp_reass_sysctl_maxseg, "I", +SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, + &VNET_NAME(tcp_reass_maxseg), 0, "Global maximum number of TCP Segments in Reassembly Queue"); -static VNET_DEFINE(int, tcp_reass_qsize) = 0; -#define V_tcp_reass_qsize VNET(tcp_reass_qsize) SYSCTL_VNET_PROC(_net_inet_tcp_reass, OID_AUTO, cursegments, - CTLTYPE_INT | CTLFLAG_RD, - &VNET_NAME(tcp_reass_qsize), 0, &tcp_reass_sysctl_qsize, "I", + (CTLTYPE_INT | CTLFLAG_RD), NULL, 0, &tcp_reass_sysctl_qsize, "I", "Global number of TCP Segments currently in Reassembly Queue"); static VNET_DEFINE(int, tcp_reass_overflows) = 0; @@ -111,8 +106,10 @@ static void tcp_reass_zone_change(void *tag) { + /* Set the zone limit and read back the effective value. */ V_tcp_reass_maxseg = nmbclusters / 16; - uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxseg); + V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone, + V_tcp_reass_maxseg); } void @@ -124,7 +121,9 @@ tcp_reass_init(void) &V_tcp_reass_maxseg); V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxseg); + /* Set the zone limit and read back the effective value. */ + V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone, + V_tcp_reass_maxseg); EVENTHANDLER_REGISTER(nmbclusters_change, tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -158,17 +157,12 @@ tcp_reass_flush(struct tcpcb *tp) } static int -tcp_reass_sysctl_maxseg(SYSCTL_HANDLER_ARGS) -{ - V_tcp_reass_maxseg = uma_zone_get_max(V_tcp_reass_zone); - return (sysctl_handle_int(oidp, arg1, arg2, req)); -} - -static int tcp_reass_sysctl_qsize(SYSCTL_HANDLER_ARGS) { - V_tcp_reass_qsize = uma_zone_get_cur(V_tcp_reass_zone); - return (sysctl_handle_int(oidp, arg1, arg2, req)); + int qsize; + + qsize = uma_zone_get_cur(V_tcp_reass_zone); + return (sysctl_handle_int(oidp, &qsize, 0, req)); } int @@ -299,7 +293,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) th->th_seq += i; } } - tp->t_rcvoopack++; + tp->t_rcvoopack++; TCPSTAT_INC(tcps_rcvoopack); TCPSTAT_ADD(tcps_rcvoobyte, *tlenp); diff --git a/freebsd/sys/netinet/tcp_sack.c b/freebsd/sys/netinet/tcp_sack.c index 449b538f..9cc1d86a 100644 --- a/freebsd/sys/netinet/tcp_sack.c +++ b/freebsd/sys/netinet/tcp_sack.c @@ -579,7 +579,7 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; /* Send one or 2 segments based on how much new data was acked. */ - if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) > 2) + if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) >= 2) num_segs = 2; tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg); diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c index e23a0997..4c6d14eb 100644 --- a/freebsd/sys/netinet/tcp_subr.c +++ b/freebsd/sys/netinet/tcp_subr.c @@ -72,29 +72,25 @@ __FBSDID("$FreeBSD$"); #include <netinet/cc.h> #include <netinet/in.h> +#include <netinet/in_pcb.h> #include <netinet/in_systm.h> +#include <netinet/in_var.h> #include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_var.h> #ifdef INET6 #include <netinet/ip6.h> -#endif -#include <netinet/in_pcb.h> -#ifdef INET6 #include <netinet6/in6_pcb.h> -#endif -#include <netinet/in_var.h> -#include <netinet/ip_var.h> -#ifdef INET6 #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> #include <netinet6/nd6.h> #endif -#include <netinet/ip_icmp.h> + #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_syncache.h> -#include <netinet/tcp_offload.h> #ifdef INET6 #include <netinet6/tcp6_var.h> #endif @@ -102,7 +98,12 @@ __FBSDID("$FreeBSD$"); #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif +#ifdef INET6 #include <netinet6/ip6protosw.h> +#endif +#ifdef TCP_OFFLOAD +#include <netinet/tcp_offload.h> +#endif #ifdef IPSEC #include <netipsec/ipsec.h> @@ -166,15 +167,7 @@ SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); -#endif - -static int -vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) -{ - - VNET_SYSCTL_ARG(req, arg1); - return (sysctl_msec_to_ticks(oidp, arg1, arg2, req)); -} +#endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where @@ -187,7 +180,7 @@ vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, - "Minmum TCP Maximum Segment Size"); + "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, @@ -221,49 +214,9 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); -/* - * TCP bandwidth limiting sysctls. Note that the default lower bound of - * 1024 exists only for debugging. A good production default would be - * something like 6100. - */ -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, - "TCP inflight data limiting"); - -static VNET_DEFINE(int, tcp_inflight_enable) = 0; -#define V_tcp_inflight_enable VNET(tcp_inflight_enable) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_enable), 0, - "Enable automatic TCP inflight data limiting"); - -static int tcp_inflight_debug = 0; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, - &tcp_inflight_debug, 0, - "Debug TCP inflight calculations"); - -static VNET_DEFINE(int, tcp_inflight_rttthresh); -#define V_tcp_inflight_rttthresh VNET(tcp_inflight_rttthresh) -SYSCTL_VNET_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_inflight_rttthresh), 0, - vnet_sysctl_msec_to_ticks, "I", - "RTT threshold below which inflight will deactivate itself"); - -static VNET_DEFINE(int, tcp_inflight_min) = 6144; -#define V_tcp_inflight_min VNET(tcp_inflight_min) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_min), 0, - "Lower-bound for TCP inflight window"); - -static VNET_DEFINE(int, tcp_inflight_max) = TCP_MAXWIN << TCP_MAX_WINSHIFT; -#define V_tcp_inflight_max VNET(tcp_inflight_max) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_max), 0, - "Upper-bound for TCP inflight window"); - -static VNET_DEFINE(int, tcp_inflight_stab) = 20; -#define V_tcp_inflight_stab VNET(tcp_inflight_stab) -SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, - &VNET_NAME(tcp_inflight_stab), 0, - "Inflight Algorithm Stabilization 20 = 2 packets"); +static int tcp_soreceive_stream = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, + &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); #ifdef TCP_SIGNATURE static int tcp_sig_checksigs = 1; @@ -278,7 +231,6 @@ VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); -static void tcp_isn_tick(void *); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); @@ -309,7 +261,6 @@ static VNET_DEFINE(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); -struct callout isn_callout; static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) @@ -342,13 +293,6 @@ tcp_init(void) { int hashsize; - INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp"); - LIST_INIT(&V_tcb); -#ifdef VIMAGE - V_tcbinfo.ipi_vnet = curvnet; -#endif - V_tcbinfo.ipi_listhead = &V_tcb; - if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); @@ -362,14 +306,9 @@ tcp_init(void) printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } - V_tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB, - &V_tcbinfo.ipi_hashmask); - V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, - &V_tcbinfo.ipi_porthashmask); - V_tcbinfo.ipi_zone = uma_zcreate("tcp_inpcb", sizeof(struct inpcb), - NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); - V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; + in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, + "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_4TUPLE); /* * These have to be type stable for the benefit of the timers. @@ -405,6 +344,16 @@ tcp_init(void) tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; tcp_tcbhashsize = hashsize; + TUNABLE_INT_FETCH("net.inet.tcp.soreceive_stream", &tcp_soreceive_stream); + if (tcp_soreceive_stream) { +#ifdef INET + tcp_usrreqs.pru_soreceive = soreceive_stream; +#endif +#ifdef INET6 + tcp6_usrreqs.pru_soreceive = soreceive_stream; +#endif /* INET6 */ + } + #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ @@ -417,8 +366,6 @@ tcp_init(void) #undef TCP_MINPROTOHDR ISN_LOCK_INIT(); - callout_init(&isn_callout, CALLOUT_MPSAFE); - callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, @@ -434,18 +381,9 @@ tcp_destroy(void) tcp_hc_destroy(); syncache_destroy(); tcp_tw_destroy(); - - /* XXX check that hashes are empty! */ - hashdestroy(V_tcbinfo.ipi_hashbase, M_PCB, - V_tcbinfo.ipi_hashmask); - hashdestroy(V_tcbinfo.ipi_porthashbase, M_PCB, - V_tcbinfo.ipi_porthashmask); - + in_pcbinfo_destroy(&V_tcbinfo); uma_zdestroy(V_sack_hole_zone); uma_zdestroy(V_tcpcb_zone); - uma_zdestroy(V_tcbinfo.ipi_zone); - - INP_INFO_LOCK_DESTROY(&V_tcbinfo); } #endif @@ -453,7 +391,6 @@ void tcp_fini(void *xtp) { - callout_stop(&isn_callout); } /* @@ -481,8 +418,12 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; - } else + } +#endif /* INET6 */ +#if defined(INET6) && defined(INET) + else #endif +#ifdef INET { struct ip *ip; @@ -499,6 +440,7 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } +#endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; @@ -560,7 +502,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 - isipv6 = ((struct ip *)ipgen)->ip_v == 6; + isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; @@ -608,6 +550,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; + m_addr_changed(m); /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } @@ -638,11 +581,14 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; - ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + - tlen)); + ip6->ip6_plen = 0; /* Set in ip6_output(). */ tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); - } else + } +#endif +#if defined(INET) && defined(INET6) + else #endif +#ifdef INET { tlen += sizeof (struct tcpiphdr); ip->ip_len = tlen; @@ -650,6 +596,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, if (V_path_mtu_discovery) ip->ip_off |= IP_DF; } +#endif m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; @@ -679,22 +626,27 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, else nth->th_win = htons((u_short)win); nth->th_urp = 0; + + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { - nth->th_sum = 0; - nth->th_sum = in6_cksum(m, IPPROTO_TCP, - sizeof(struct ip6_hdr), - tlen - sizeof(struct ip6_hdr)); + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + nth->th_sum = in6_cksum_pseudo(ip6, + tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : NULL, NULL); - } else + } #endif /* INET6 */ +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET { + m->m_pkthdr.csum_flags = CSUM_TCP; nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); - m->m_pkthdr.csum_flags = CSUM_TCP; - m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); } +#endif /* INET */ #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); @@ -702,9 +654,13 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, #ifdef INET6 if (isipv6) (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); - else #endif /* INET6 */ - (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); +#endif } /* @@ -786,10 +742,8 @@ tcp_newtcpcb(struct inpcb *inp) tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; - tp->t_bw_rtttime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, @@ -878,7 +832,7 @@ tcp_drop(struct tcpcb *tp, int errno) if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_state = TCPS_CLOSED; - (void) tcp_output_reset(tp); + (void) tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); @@ -900,8 +854,19 @@ tcp_discardcb(struct tcpcb *tp) INP_WLOCK_ASSERT(inp); /* - * Make sure that all of our timers are stopped before we - * delete the PCB. + * Make sure that all of our timers are stopped before we delete the + * PCB. + * + * XXXRW: Really, we would like to use callout_drain() here in order + * to avoid races experienced in tcp_timer.c where a timer is already + * executing at this point. However, we can't, both because we're + * running in a context where we can't sleep, and also because we + * hold locks required by the timers. What we instead need to do is + * test to see if callout_drain() is required, and if so, defer some + * portion of the remainder of tcp_discardcb() to an asynchronous + * context that can callout_drain() and then continue. Some care + * will be required to ensure that no further processing takes place + * on the tcpcb, even though it hasn't been freed (a flag?). */ callout_stop(&tp->t_timers->tt_rexmt); callout_stop(&tp->t_timers->tt_persist); @@ -958,8 +923,6 @@ tcp_discardcb(struct tcpcb *tp) metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; - /* XXX: This wraps if the pipe is more than 4 Gbit per second */ - metrics.rmx_bandwidth = tp->snd_bandwidth; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; @@ -969,8 +932,12 @@ tcp_discardcb(struct tcpcb *tp) /* free the reassembly queue, if any */ tcp_reass_flush(tp); + +#ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ - tcp_offload_detach(tp); + if (tp->t_flags & TF_TOE) + tcp_offload_detach(tp); +#endif tcp_free_sackholes(tp); @@ -999,9 +966,10 @@ tcp_close(struct tcpcb *tp) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); - /* Notify any offload devices of listener close */ +#ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) - tcp_offload_listen_close(tp); + tcp_offload_listen_stop(tp); +#endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); @@ -1211,8 +1179,11 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) else if (inp->inp_flags & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); xt.xt_tp.t_state = TCPS_TIME_WAIT; - } else + } else { bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); + if (xt.xt_tp.t_timers) + tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer); + } if (inp->inp_socket != NULL) sotoxsocket(inp->inp_socket, &xt.xt_socket); else { @@ -1228,9 +1199,9 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) INP_INFO_WLOCK(&V_tcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); @@ -1257,6 +1228,7 @@ SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); +#ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { @@ -1271,12 +1243,9 @@ tcp_getcred(SYSCTL_HANDLER_ARGS) error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); - INP_INFO_RLOCK(&V_tcbinfo); - inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr, - addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) @@ -1284,10 +1253,8 @@ tcp_getcred(SYSCTL_HANDLER_ARGS) if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { - INP_INFO_RUNLOCK(&V_tcbinfo); + } else error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1296,6 +1263,7 @@ tcp_getcred(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); +#endif /* INET */ #ifdef INET6 static int @@ -1304,7 +1272,10 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS) struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; - int error, mapped = 0; + int error; +#ifdef INET + int mapped = 0; +#endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) @@ -1317,27 +1288,28 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS) return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { +#ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else +#endif return (EINVAL); } - INP_INFO_RLOCK(&V_tcbinfo); +#ifdef INET if (mapped == 1) - inp = in_pcblookup_hash(&V_tcbinfo, + inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], - addrs[0].sin6_port, - 0, NULL); + addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else - inp = in6_pcblookup_hash(&V_tcbinfo, +#endif + inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, - &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); + &addrs[0].sin6_addr, addrs[0].sin6_port, + INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) @@ -1345,10 +1317,8 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS) if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { - INP_INFO_RUNLOCK(&V_tcbinfo); + } else error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -1357,9 +1327,10 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); -#endif +#endif /* INET6 */ +#ifdef INET void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { @@ -1408,10 +1379,9 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_WLOCK(&V_tcbinfo); - inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport, - ip->ip_src, th->th_sport, 0, NULL); + inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, + ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { - INP_WLOCK(inp); if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { @@ -1473,6 +1443,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) } else in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); } +#endif /* INET */ #ifdef INET6 void @@ -1600,11 +1571,13 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) #define ISN_RANDOM_INCREMENT (4096 - 1) static VNET_DEFINE(u_char, isn_secret[32]); +static VNET_DEFINE(int, isn_last); static VNET_DEFINE(int, isn_last_reseed); static VNET_DEFINE(u_int32_t, isn_offset); static VNET_DEFINE(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) +#define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) @@ -1615,6 +1588,7 @@ tcp_new_isn(struct tcpcb *tp) MD5_CTX isn_ctx; u_int32_t md5_buffer[4]; tcp_seq new_isn; + u_int32_t projected_offset; INP_WLOCK_ASSERT(tp->t_inpcb); @@ -1650,38 +1624,17 @@ tcp_new_isn(struct tcpcb *tp) new_isn = (tcp_seq) md5_buffer[0]; V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); - new_isn += V_isn_offset; - ISN_UNLOCK(); - return (new_isn); -} - -/* - * Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary - * to keep time flowing at a relatively constant rate. If the random - * increments have already pushed us past the projected offset, do nothing. - */ -static void -tcp_isn_tick(void *xtp) -{ - VNET_ITERATOR_DECL(vnet_iter); - u_int32_t projected_offset; - - VNET_LIST_RLOCK_NOSLEEP(); - ISN_LOCK(); - VNET_FOREACH(vnet_iter) { - CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */ - projected_offset = - V_isn_offset_old + ISN_BYTES_PER_SECOND / 100; - + if (ticks != V_isn_last) { + projected_offset = V_isn_offset_old + + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; - V_isn_offset_old = V_isn_offset; - CURVNET_RESTORE(); + V_isn_last = ticks; } + new_isn += V_isn_offset; ISN_UNLOCK(); - VNET_LIST_RUNLOCK_NOSLEEP(); - callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); + return (new_isn); } /* @@ -1755,10 +1708,11 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); - tcp_output_send(tp); + tcp_output(tp); return (inp); } +#ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine @@ -1766,7 +1720,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) * tcp_mss_update to get the peer/interface MTU. */ u_long -tcp_maxmtu(struct in_conninfo *inc, int *flags) +tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct route sro; struct sockaddr_in *dst; @@ -1791,19 +1745,21 @@ tcp_maxmtu(struct in_conninfo *inc, int *flags) maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); /* Report additional interface capabilities. */ - if (flags != NULL) { + if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) - *flags |= CSUM_TSO; + cap->ifcap |= CSUM_TSO; + cap->tsomax = ifp->if_hw_tsomax; } RTFREE(sro.ro_rt); } return (maxmtu); } +#endif /* INET */ #ifdef INET6 u_long -tcp_maxmtu6(struct in_conninfo *inc, int *flags) +tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct route_in6 sro6; struct ifnet *ifp; @@ -1827,10 +1783,11 @@ tcp_maxmtu6(struct in_conninfo *inc, int *flags) IN6_LINKMTU(sro6.ro_rt->rt_ifp)); /* Report additional interface capabilities. */ - if (flags != NULL) { + if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) - *flags |= CSUM_TSO; + cap->ifcap |= CSUM_TSO; + cap->tsomax = ifp->if_hw_tsomax; } RTFREE(sro6.ro_rt); } @@ -1882,154 +1839,6 @@ ipsec_hdrsiz_tcp(struct tcpcb *tp) } #endif /* IPSEC */ -/* - * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING - * - * This code attempts to calculate the bandwidth-delay product as a - * means of determining the optimal window size to maximize bandwidth, - * minimize RTT, and avoid the over-allocation of buffers on interfaces and - * routers. This code also does a fairly good job keeping RTTs in check - * across slow links like modems. We implement an algorithm which is very - * similar (but not meant to be) TCP/Vegas. The code operates on the - * transmitter side of a TCP connection and so only effects the transmit - * side of the connection. - * - * BACKGROUND: TCP makes no provision for the management of buffer space - * at the end points or at the intermediate routers and switches. A TCP - * stream, whether using NewReno or not, will eventually buffer as - * many packets as it is able and the only reason this typically works is - * due to the fairly small default buffers made available for a connection - * (typicaly 16K or 32K). As machines use larger windows and/or window - * scaling it is now fairly easy for even a single TCP connection to blow-out - * all available buffer space not only on the local interface, but on - * intermediate routers and switches as well. NewReno makes a misguided - * attempt to 'solve' this problem by waiting for an actual failure to occur, - * then backing off, then steadily increasing the window again until another - * failure occurs, ad-infinitum. This results in terrible oscillation that - * is only made worse as network loads increase and the idea of intentionally - * blowing out network buffers is, frankly, a terrible way to manage network - * resources. - * - * It is far better to limit the transmit window prior to the failure - * condition being achieved. There are two general ways to do this: First - * you can 'scan' through different transmit window sizes and locate the - * point where the RTT stops increasing, indicating that you have filled the - * pipe, then scan backwards until you note that RTT stops decreasing, then - * repeat ad-infinitum. This method works in principle but has severe - * implementation issues due to RTT variances, timer granularity, and - * instability in the algorithm which can lead to many false positives and - * create oscillations as well as interact badly with other TCP streams - * implementing the same algorithm. - * - * The second method is to limit the window to the bandwidth delay product - * of the link. This is the method we implement. RTT variances and our - * own manipulation of the congestion window, bwnd, can potentially - * destabilize the algorithm. For this reason we have to stabilize the - * elements used to calculate the window. We do this by using the minimum - * observed RTT, the long term average of the observed bandwidth, and - * by adding two segments worth of slop. It isn't perfect but it is able - * to react to changing conditions and gives us a very stable basis on - * which to extend the algorithm. - */ -void -tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) -{ - u_long bw; - u_long bwnd; - int save_ticks; - - INP_WLOCK_ASSERT(tp->t_inpcb); - - /* - * If inflight_enable is disabled in the middle of a tcp connection, - * make sure snd_bwnd is effectively disabled. - */ - if (V_tcp_inflight_enable == 0 || - tp->t_rttlow < V_tcp_inflight_rttthresh) { - tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->snd_bandwidth = 0; - return; - } - - /* - * Figure out the bandwidth. Due to the tick granularity this - * is a very rough number and it MUST be averaged over a fairly - * long period of time. XXX we need to take into account a link - * that is not using all available bandwidth, but for now our - * slop will ramp us up if this case occurs and the bandwidth later - * increases. - * - * Note: if ticks rollover 'bw' may wind up negative. We must - * effectively reset t_bw_rtttime for this case. - */ - save_ticks = ticks; - if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) - return; - - bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / - (save_ticks - tp->t_bw_rtttime); - tp->t_bw_rtttime = save_ticks; - tp->t_bw_rtseq = ack_seq; - if (tp->t_bw_rtttime == 0 || (int)bw < 0) - return; - bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; - - tp->snd_bandwidth = bw; - - /* - * Calculate the semi-static bandwidth delay product, plus two maximal - * segments. The additional slop puts us squarely in the sweet - * spot and also handles the bandwidth run-up case and stabilization. - * Without the slop we could be locking ourselves into a lower - * bandwidth. - * - * Situations Handled: - * (1) Prevents over-queueing of packets on LANs, especially on - * high speed LANs, allowing larger TCP buffers to be - * specified, and also does a good job preventing - * over-queueing of packets over choke points like modems - * (at least for the transmit side). - * - * (2) Is able to handle changing network loads (bandwidth - * drops so bwnd drops, bandwidth increases so bwnd - * increases). - * - * (3) Theoretically should stabilize in the face of multiple - * connections implementing the same algorithm (this may need - * a little work). - * - * (4) Stability value (defaults to 20 = 2 maximal packets) can - * be adjusted with a sysctl but typically only needs to be - * on very slow connections. A value no smaller then 5 - * should be used, but only reduce this default if you have - * no other choice. - */ -#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) - bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10; -#undef USERTT - - if (tcp_inflight_debug > 0) { - static int ltime; - if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { - ltime = ticks; - printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", - tp, - bw, - tp->t_rttbest, - tp->t_srtt, - bwnd - ); - } - } - if ((long)bwnd < V_tcp_inflight_min) - bwnd = V_tcp_inflight_min; - if (bwnd > V_tcp_inflight_max) - bwnd = V_tcp_inflight_max; - if ((long)bwnd < tp->t_maxseg * 2) - bwnd = tp->t_maxseg * 2; - tp->snd_bwnd = bwnd; -} - #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data @@ -2071,11 +1880,15 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, u_char *buf, u_int direction) { union sockaddr_union dst; +#ifdef INET struct ippseudo ippseudo; +#endif MD5_CTX ctx; int doff; struct ip *ip; +#ifdef INET struct ipovly *ipovly; +#endif struct secasvar *sav; struct tcphdr *th; #ifdef INET6 @@ -2097,12 +1910,14 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, ip6 = NULL; /* Make the compiler happy. */ #endif switch (ip->ip_v) { +#ifdef INET case IPVERSION: dst.sa.sa_len = sizeof(struct sockaddr_in); dst.sa.sa_family = AF_INET; dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? ip->ip_src : ip->ip_dst; break; +#endif #ifdef INET6 case (IPV6_VERSION >> 4): ip6 = mtod(m, struct ip6_hdr *); @@ -2142,6 +1957,7 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, * tcp_output(), the underlying ip_len member has not yet been set. */ switch (ip->ip_v) { +#ifdef INET case IPVERSION: ipovly = (struct ipovly *)ip; ippseudo.ippseudo_src = ipovly->ih_src; @@ -2155,6 +1971,7 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip)); doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen; break; +#endif #ifdef INET6 /* * RFC 2385, 2.0 Proposal @@ -2335,6 +2152,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) return (error); break; #endif +#ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; @@ -2342,6 +2160,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; +#endif default: return (EINVAL); } @@ -2349,18 +2168,19 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: - inp = in6_pcblookup_hash(&V_tcbinfo, &fin6->sin6_addr, - fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 0, - NULL); + inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, + fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, + INPLOOKUP_WLOCKPCB, NULL); break; #endif +#ifdef INET case AF_INET: - inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr, - fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); + inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, + lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; +#endif } if (inp != NULL) { - INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an @@ -2387,7 +2207,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) return (error); } -SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, +SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); @@ -2485,6 +2305,7 @@ tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ +#ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); @@ -2493,6 +2314,7 @@ tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); +#endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); diff --git a/freebsd/sys/netinet/tcp_syncache.c b/freebsd/sys/netinet/tcp_syncache.c index 80da0349..10bd00ae 100644 --- a/freebsd/sys/netinet/tcp_syncache.c +++ b/freebsd/sys/netinet/tcp_syncache.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_ipsec.h> +#include <rtems/bsd/local/opt_pcbgroup.h> #include <rtems/bsd/sys/param.h> #include <sys/systm.h> @@ -82,10 +83,12 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcp_syncache.h> -#include <netinet/tcp_offload.h> #ifdef INET6 #include <netinet6/tcp6_var.h> #endif +#ifdef TCP_OFFLOAD +#include <netinet/toecore.h> +#endif #ifdef IPSEC #include <netipsec/ipsec.h> @@ -111,10 +114,8 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW, &VNET_NAME(tcp_syncookiesonly), 0, "Use only TCP SYN cookies"); -#ifdef TCP_OFFLOAD_DISABLE -#define TOEPCB_ISSET(sc) (0) -#else -#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL) +#ifdef TCP_OFFLOAD +#define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL) #endif static void syncache_drop(struct syncache *, struct syncache_head *); @@ -124,6 +125,7 @@ struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **); static int syncache_respond(struct syncache *); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); +static int syncache_sysctl_count(SYSCTL_HANDLER_ARGS); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); static void syncache_timer(void *); @@ -148,7 +150,8 @@ static struct syncache static VNET_DEFINE(struct tcp_syncache, tcp_syncache); #define V_tcp_syncache VNET(tcp_syncache) -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); +static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, + "TCP SYN cache"); SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.bucket_limit), 0, @@ -158,8 +161,8 @@ SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.cache_limit), 0, "Overall entry limit for syncache"); -SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD, - &VNET_NAME(tcp_syncache.cache_count), 0, +SYSCTL_VNET_PROC(_net_inet_tcp_syncache, OID_AUTO, count, (CTLTYPE_UINT|CTLFLAG_RD), + NULL, 0, &syncache_sysctl_count, "IU", "Current number of entries in syncache"); SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN, @@ -225,7 +228,6 @@ syncache_init(void) { int i; - V_tcp_syncache.cache_count = 0; V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; @@ -268,7 +270,8 @@ syncache_init(void) /* Create the syncache entry zone. */ V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); + V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone, + V_tcp_syncache.cache_limit); } #ifdef VIMAGE @@ -296,8 +299,8 @@ syncache_destroy(void) mtx_destroy(&sch->sch_mtx); } - KASSERT(V_tcp_syncache.cache_count == 0, ("%s: cache_count %d not 0", - __func__, V_tcp_syncache.cache_count)); + KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0, + ("%s: cache_count not 0", __func__)); /* Free the allocated global resources. */ uma_zdestroy(V_tcp_syncache.zone); @@ -305,6 +308,15 @@ syncache_destroy(void) } #endif +static int +syncache_sysctl_count(SYSCTL_HANDLER_ARGS) +{ + int count; + + count = uma_zone_get_cur(V_tcp_syncache.zone); + return (sysctl_handle_int(oidp, &count, 0, req)); +} + /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. @@ -332,6 +344,14 @@ syncache_insert(struct syncache *sc, struct syncache_head *sch) TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + tod->tod_syncache_added(tod, sc->sc_todctx); + } +#endif + /* Reinitialize the bucket row's timer. */ if (sch->sch_length == 1) sch->sch_nextc = ticks + INT_MAX; @@ -339,7 +359,6 @@ syncache_insert(struct syncache *sc, struct syncache_head *sch) SCH_UNLOCK(sch); - V_tcp_syncache.cache_count++; TCPSTAT_INC(tcps_sc_added); } @@ -356,12 +375,15 @@ syncache_drop(struct syncache *sc, struct syncache_head *sch) TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; -#ifndef TCP_OFFLOAD_DISABLE - if (sc->sc_tu) - sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb); -#endif +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + tod->tod_syncache_removed(tod, sc->sc_todctx); + } +#endif + syncache_free(sc); - V_tcp_syncache.cache_count--; } /* @@ -629,7 +651,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; - int error = 0; + int error; char *s; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); @@ -663,6 +685,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) inp = sotoinpcb(so); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WLOCK(inp); + INP_HASH_WLOCK(&V_tcbinfo); /* Insert new socket into PCB hash list. */ inp->inp_inc.inc_flags = sc->sc_inc.inc_flags; @@ -677,8 +700,14 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) #ifdef INET6 } #endif + + /* + * Install in the reservation hash table for now, but don't yet + * install a connection group since the full 4-tuple isn't yet + * configured. + */ inp->inp_lport = sc->sc_inc.inc_lport; - if ((error = in_pcbinshash(inp)) != 0) { + if ((error = in_pcbinshash_nopcbgroup(inp)) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. @@ -696,6 +725,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) s, __func__, error); free(s, M_TCPLOG); } + INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } #ifdef IPSEC @@ -730,13 +760,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; + if ((error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6, #ifndef __rtems__ - if ((error = in6_pcbconnect(inp, (struct sockaddr *)&sin6, - thread0.td_ucred)) != 0) { -#else /* __rtems__ */ - if ((error = in6_pcbconnect(inp, (struct sockaddr *)&sin6, - NULL)) != 0) { -#endif /* __rtems__ */ + thread0.td_ucred, m)) != 0) { +#else /* __rtems__ */ + NULL, m)) != 0) { +#endif /* __rtems__ */ inp->in6p_laddr = laddr6; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed " @@ -744,13 +773,18 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) s, __func__, error); free(s, M_TCPLOG); } + INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } /* Override flowlabel from in6_pcbconnect. */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; inp->inp_flow |= sc->sc_flowlabel; - } else + } +#endif /* INET6 */ +#if defined(INET) && defined(INET6) + else #endif +#ifdef INET { struct in_addr laddr; struct sockaddr_in sin; @@ -770,14 +804,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; + if ((error = in_pcbconnect_mbuf(inp, (struct sockaddr *)&sin, #ifndef __rtems__ - if ((error = in_pcbconnect(inp, (struct sockaddr *)&sin, - thread0.td_ucred)) != 0) { -#else /* __rtems__ */ - if ((error = in_pcbconnect(inp, (struct sockaddr *)&sin, - NULL)) != 0) { -#endif /* __rtems__ */ - + thread0.td_ucred, m)) != 0) { +#else /* __rtems__ */ + NULL, m)) != 0) { +#endif /* __rtems__ */ inp->inp_laddr = laddr; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in_pcbconnect failed " @@ -785,9 +817,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) s, __func__, error); free(s, M_TCPLOG); } + INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } } +#endif /* INET */ + INP_HASH_WUNLOCK(&V_tcbinfo); tp = intotcpcb(inp); tp->t_state = TCPS_SYN_RECEIVED; tp->iss = sc->sc_iss; @@ -835,12 +870,33 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) tcp_mss(tp, sc->sc_peer_mss); /* - * If the SYN,ACK was retransmitted, reset cwnd to 1 segment. + * If the SYN,ACK was retransmitted, indicate that CWND to be + * limited to one segment in cc_conn_init(). * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits. */ if (sc->sc_rxmits > 1) - tp->snd_cwnd = tp->t_maxseg; - tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); + tp->snd_cwnd = 1; + +#ifdef TCP_OFFLOAD + /* + * Allow a TOE driver to install its hooks. Note that we hold the + * pcbinfo lock too and that prevents tcp_usr_accept from accepting a + * new connection before the TOE driver has done its thing. + */ + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + tod->tod_offload_socket(tod, sc->sc_todctx, so); + } +#endif + /* + * Copy and activate timers. + */ + tp->t_keepinit = sototcpcb(lso)->t_keepinit; + tp->t_keepidle = sototcpcb(lso)->t_keepidle; + tp->t_keepintvl = sototcpcb(lso)->t_keepintvl; + tp->t_keepcnt = sototcpcb(lso)->t_keepcnt; + tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); INP_WUNLOCK(inp); @@ -913,7 +969,13 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, /* Pull out the entry to unlock the bucket row. */ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; - V_tcp_syncache.cache_count--; +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + tod->tod_syncache_removed(tod, sc->sc_todctx); + } +#endif SCH_UNLOCK(sch); } @@ -921,7 +983,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * Segment validation: * ACK must match our initial sequence number + 1 (the SYN|ACK). */ - if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) { + if (th->th_ack != sc->sc_iss + 1) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " "rejected\n", s, __func__, th->th_ack, sc->sc_iss); @@ -932,9 +994,8 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * The SEQ must fall in the window starting at the received * initial receive sequence number + 1 (the SYN). */ - if ((SEQ_LEQ(th->th_seq, sc->sc_irs) || - SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) && - !TOEPCB_ISSET(sc)) { + if (SEQ_LEQ(th->th_seq, sc->sc_irs) || + SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " "rejected\n", s, __func__, th->th_seq, sc->sc_irs); @@ -951,8 +1012,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * If timestamps were negotiated the reflected timestamp * must be equal to what we actually sent in the SYN|ACK. */ - if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts && - !TOEPCB_ISSET(sc)) { + if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " "segment rejected\n", @@ -980,25 +1040,6 @@ failed: return (0); } -int -tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo, - struct tcphdr *th, struct socket **lsop, struct mbuf *m) -{ - struct tcpopt to; - int rc; - - bzero(&to, sizeof(struct tcpopt)); - to.to_mss = toeo->to_mss; - to.to_wscale = toeo->to_wscale; - to.to_flags = toeo->to_flags; - - INP_INFO_WLOCK(&V_tcbinfo); - rc = syncache_expand(inc, &to, th, lsop, m); - INP_INFO_WUNLOCK(&V_tcbinfo); - - return (rc); -} - /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: @@ -1014,8 +1055,8 @@ tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo, */ static void _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct inpcb *inp, struct socket **lsop, struct mbuf *m, - struct toe_usrreqs *tu, void *toepcb) + struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, + void *todctx) { struct tcpcb *tp; struct socket *so; @@ -1080,7 +1121,11 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, #ifdef INET6 if (!(inc->inc_flags & INC_ISIPV6)) #endif +#ifdef INET ipopts = (m) ? ip_srcroute(m) : NULL; +#else + ipopts = NULL; +#endif /* * See if we already have an entry for this connection. @@ -1097,11 +1142,6 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, sc = syncache_lookup(inc, &sch); /* returns locked entry */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { -#ifndef TCP_OFFLOAD_DISABLE - if (sc->sc_tu) - sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT, - sc->sc_toepcb); -#endif TCPSTAT_INC(tcps_sc_dupsyn); if (ipopts) { /* @@ -1134,7 +1174,7 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, s, __func__); free(s, M_TCPLOG); } - if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) { + if (syncache_respond(sc) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); @@ -1185,9 +1225,9 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, sc->sc_ip_tos = ip_tos; sc->sc_ip_ttl = ip_ttl; } -#ifndef TCP_OFFLOAD_DISABLE - sc->sc_tu = tu; - sc->sc_toepcb = toepcb; +#ifdef TCP_OFFLOAD + sc->sc_tod = tod; + sc->sc_todctx = todctx; #endif sc->sc_irs = th->th_seq; sc->sc_iss = arc4random(); @@ -1282,7 +1322,7 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, /* * Do a standard 3-way handshake. */ - if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) { + if (syncache_respond(sc) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) @@ -1314,8 +1354,8 @@ syncache_respond(struct syncache *sc) { struct ip *ip = NULL; struct mbuf *m; - struct tcphdr *th; - int optlen, error; + struct tcphdr *th = NULL; + int optlen, error = 0; /* Make compiler happy */ u_int16_t hlen, tlen, mssopt; struct tcpopt to; #ifdef INET6 @@ -1363,8 +1403,12 @@ syncache_respond(struct syncache *sc) ip6->ip6_flow |= sc->sc_flowlabel; th = (struct tcphdr *)(ip6 + 1); - } else + } +#endif +#if defined(INET6) && defined(INET) + else #endif +#ifdef INET { ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; @@ -1391,6 +1435,7 @@ syncache_respond(struct syncache *sc) th = (struct tcphdr *)(ip + 1); } +#endif /* INET */ th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; @@ -1451,22 +1496,45 @@ syncache_respond(struct syncache *sc) optlen = 0; M_SETFIB(m, sc->sc_inc.inc_fibnum); + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { - th->th_sum = 0; - th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, - tlen + optlen - hlen); + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen, + IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(NULL, NULL); +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); + + return (error); + } +#endif error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); - } else + } +#endif +#if defined(INET6) && defined(INET) + else #endif +#ifdef INET { + m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen + optlen - hlen + IPPROTO_TCP)); - m->m_pkthdr.csum_flags = CSUM_TCP; - m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); +#ifdef TCP_OFFLOAD + if (ADDED_BY_TOE(sc)) { + struct toedev *tod = sc->sc_tod; + + error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); + + return (error); + } +#endif error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } +#endif return (error); } @@ -1478,23 +1546,12 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, } void -tcp_offload_syncache_add(struct in_conninfo *inc, struct toeopt *toeo, - struct tcphdr *th, struct inpcb *inp, struct socket **lsop, - struct toe_usrreqs *tu, void *toepcb) +tcp_offload_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, + struct inpcb *inp, struct socket **lsop, void *tod, void *todctx) { - struct tcpopt to; - bzero(&to, sizeof(struct tcpopt)); - to.to_mss = toeo->to_mss; - to.to_wscale = toeo->to_wscale; - to.to_flags = toeo->to_flags; - - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(inp); - - _syncache_add(inc, &to, th, inp, lsop, NULL, tu, toepcb); + _syncache_add(inc, to, th, inp, lsop, NULL, tod, todctx); } - /* * The purpose of SYN cookies is to avoid keeping track of all SYN's we * receive and to be able to handle SYN floods from bogus source addresses diff --git a/freebsd/sys/netinet/tcp_syncache.h b/freebsd/sys/netinet/tcp_syncache.h index 93c7aaa2..c55bfbcd 100644 --- a/freebsd/sys/netinet/tcp_syncache.h +++ b/freebsd/sys/netinet/tcp_syncache.h @@ -34,8 +34,6 @@ #define _NETINET_TCP_SYNCACHE_H_ #ifdef _KERNEL -struct toeopt; - void syncache_init(void); #ifdef VIMAGE void syncache_destroy(void); @@ -43,14 +41,10 @@ void syncache_destroy(void); void syncache_unreach(struct in_conninfo *, struct tcphdr *); int syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct socket **, struct mbuf *); -int tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo, - struct tcphdr *th, struct socket **lsop, struct mbuf *m); void syncache_add(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *); -void tcp_offload_syncache_add(struct in_conninfo *, struct toeopt *, - struct tcphdr *, struct inpcb *, struct socket **, - struct toe_usrreqs *tu, void *toepcb); - +void tcp_offload_syncache_add(struct in_conninfo *, struct tcpopt *, + struct tcphdr *, struct inpcb *, struct socket **, void *, void *); void syncache_chkrst(struct in_conninfo *, struct tcphdr *); void syncache_badack(struct in_conninfo *); int syncache_pcbcount(void); @@ -75,12 +69,14 @@ struct syncache { u_int8_t sc_requested_s_scale:4, sc_requested_r_scale:4; u_int16_t sc_flags; -#ifndef TCP_OFFLOAD_DISABLE - struct toe_usrreqs *sc_tu; /* TOE operations */ - void *sc_toepcb; /* TOE protocol block */ -#endif +#if defined(TCP_OFFLOAD) || !defined(TCP_OFFLOAD_DISABLE) + struct toedev *sc_tod; /* entry added by this TOE */ + void *sc_todctx; /* TOE driver context */ +#endif struct label *sc_label; /* MAC label reference */ struct ucred *sc_cred; /* cred cache for jail checks */ + + u_int32_t sc_spare[2]; /* UTO */ }; /* @@ -117,7 +113,6 @@ struct tcp_syncache { u_int hashsize; u_int hashmask; u_int bucket_limit; - u_int cache_count; /* XXX: unprotected */ u_int cache_limit; u_int rexmt_limit; u_int hash_secret; diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c index 77cc1feb..db952e42 100644 --- a/freebsd/sys/netinet/tcp_timer.c +++ b/freebsd/sys/netinet/tcp_timer.c @@ -34,6 +34,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_tcpdebug.h> @@ -43,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include <sys/mbuf.h> #include <sys/mutex.h> #include <sys/protosw.h> +#include <sys/smp.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sysctl.h> @@ -112,18 +114,25 @@ int tcp_finwait2_timeout; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); +int tcp_keepcnt = TCPTV_KEEPCNT; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, + "Number of keepalive probes to send"); -static int tcp_keepcnt = TCPTV_KEEPCNT; /* max idle probes */ int tcp_maxpersistidle; - /* max idle time in persist */ -int tcp_maxidle; static int tcp_rexmit_drop_options = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); +static int per_cpu_timers = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, + &per_cpu_timers , 0, "run tcp timers on all cpus"); + +#define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ + ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) + /* * Tcp protocol timeout routine called every 500 ms. * Updates timestamps used for TCP @@ -137,7 +146,6 @@ tcp_slowtimo(void) VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - tcp_maxidle = tcp_keepcnt * tcp_keepintvl; INP_INFO_WLOCK(&V_tcbinfo); (void) tcp_tw_2msl_scan(0); INP_INFO_WUNLOCK(&V_tcbinfo); @@ -265,9 +273,9 @@ tcp_timer_2msl(void *xtp) tp = tcp_close(tp); } else { if (tp->t_state != TCPS_TIME_WAIT && - ticks - tp->t_rcvtime <= tcp_maxidle) - callout_reset(&tp->t_timers->tt_2msl, tcp_keepintvl, - tcp_timer_2msl, tp); + ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) + callout_reset_on(&tp->t_timers->tt_2msl, + TP_KEEPINTVL(tp), tcp_timer_2msl, tp, INP_CPU(inp)); else tp = tcp_close(tp); } @@ -334,7 +342,7 @@ tcp_timer_keep(void *xtp) goto dropit; if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { - if (ticks - tp->t_rcvtime >= tcp_keepidle + tcp_maxidle) + if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response @@ -356,9 +364,11 @@ tcp_timer_keep(void *xtp) tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } - callout_reset(&tp->t_timers->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); + callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), + tcp_timer_keep, tp, INP_CPU(inp)); } else - callout_reset(&tp->t_timers->tt_keep, tcp_keepidle, tcp_timer_keep, tp); + callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), + tcp_timer_keep, tp, INP_CPU(inp)); #ifdef TCPDEBUG if (inp->inp_socket->so_options & SO_DEBUG) @@ -445,6 +455,16 @@ tcp_timer_persist(void *xtp) tp = tcp_drop(tp, ETIMEDOUT); goto out; } + /* + * If the user has closed the socket then drop a persisting + * connection after a much reduced timeout. + */ + if (tp->t_state > TCPS_CLOSE_WAIT && + (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { + TCPSTAT_INC(tcps_persistdrop); + tp = tcp_drop(tp, ETIMEDOUT); + goto out; + } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; (void) tcp_output(tp); @@ -474,8 +494,7 @@ tcp_timer_rexmt(void * xtp) ostate = tp->t_state; #endif - INP_INFO_WLOCK(&V_tcbinfo); - headlocked = 1; + INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; /* * XXXRW: While this assert is in fact correct, bugs in the tcpcb @@ -486,7 +505,7 @@ tcp_timer_rexmt(void * xtp) */ if (inp == NULL) { tcp_timer_race++; - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } @@ -494,14 +513,14 @@ tcp_timer_rexmt(void * xtp) if (callout_pending(&tp->t_timers->tt_rexmt) || !callout_active(&tp->t_timers->tt_rexmt)) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_rexmt); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } @@ -514,13 +533,37 @@ tcp_timer_rexmt(void * xtp) if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); + in_pcbref(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + INP_WUNLOCK(inp); + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) { + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } + if (inp->inp_flags & INP_DROPPED) { + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } + tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); + headlocked = 1; goto out; } - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); headlocked = 0; - if (tp->t_rxtshift == 1) { + if (tp->t_state == TCPS_SYN_SENT) { + /* + * If the SYN was retransmitted, indicate CWND to be + * limited to 1 segment in cc_conn_init(). + */ + tp->snd_cwnd = 1; + } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can * be recovered if this turns out to be a "bad" retransmit. @@ -547,13 +590,13 @@ tcp_timer_rexmt(void * xtp) tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if (tp->t_state == TCPS_SYN_SENT) - rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; + rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); /* - * Disable rfc1323 if we haven't got any response to + * Disable RFC1323 and SACK if we haven't got any response to * our third SYN to work-around some broken terminal servers * (most of which have hopefully been retired) that have bad VJ * header compression code which trashes TCP segments containing @@ -561,7 +604,7 @@ tcp_timer_rexmt(void * xtp) */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) - tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP); + tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; @@ -572,7 +615,6 @@ tcp_timer_rexmt(void * xtp) #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); - else #endif tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; @@ -610,6 +652,13 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) { struct callout *t_callout; void *f_callout; + struct inpcb *inp = tp->t_inpcb; + int cpu = INP_CPU(inp); + +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) + return; +#endif switch (timer_type) { case TT_DELACK: @@ -638,7 +687,7 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) if (delta == 0) { callout_stop(t_callout); } else { - callout_reset(t_callout, delta, f_callout, tp); + callout_reset_on(t_callout, delta, f_callout, tp, cpu); } } @@ -668,3 +717,24 @@ tcp_timer_active(struct tcpcb *tp, int timer_type) } return callout_active(t_callout); } + +#define ticks_to_msecs(t) (1000*(t) / hz) + +void +tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer) +{ + bzero(xtimer, sizeof(struct xtcp_timer)); + if (timer == NULL) + return; + if (callout_active(&timer->tt_delack)) + xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks); + if (callout_active(&timer->tt_rexmt)) + xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks); + if (callout_active(&timer->tt_persist)) + xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks); + if (callout_active(&timer->tt_keep)) + xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks); + if (callout_active(&timer->tt_2msl)) + xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks); + xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); +} diff --git a/freebsd/sys/netinet/tcp_timer.h b/freebsd/sys/netinet/tcp_timer.h index ff455b6b..0da58fd8 100644 --- a/freebsd/sys/netinet/tcp_timer.h +++ b/freebsd/sys/netinet/tcp_timer.h @@ -86,9 +86,6 @@ #define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ -#define TCPTV_INFLIGHT_RTTTHRESH (10*hz/1000) /* below which inflight - disengages, in msec */ - #define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ /* @@ -121,7 +118,7 @@ #define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ -#define TCPTV_DELACK (hz / PR_FASTHZ / 2) /* 100ms timeout */ +#define TCPTV_DELACK ( hz/10 ) /* 100ms timeout */ #ifdef TCPTIMERS static const char *tcptimers[] = @@ -141,6 +138,8 @@ static const char *tcptimers[] = #ifdef _KERNEL +struct xtcp_timer; + struct tcp_timer { struct callout tt_rexmt; /* retransmit timer */ struct callout tt_persist; /* retransmit persistence */ @@ -154,10 +153,16 @@ struct tcp_timer { #define TT_KEEP 0x08 #define TT_2MSL 0x10 +#define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit) +#define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle) +#define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl) +#define TP_KEEPCNT(tp) ((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt) +#define TP_MAXIDLE(tp) (TP_KEEPCNT(tp) * TP_KEEPINTVL(tp)) + extern int tcp_keepinit; /* time to establish connection */ extern int tcp_keepidle; /* time before keepalive probes begin */ extern int tcp_keepintvl; /* time between keepalive probes */ -extern int tcp_maxidle; /* time to drop after starting probes */ +extern int tcp_keepcnt; /* number of keepalives */ extern int tcp_delacktime; /* time before sending a delayed ACK */ extern int tcp_maxpersistidle; extern int tcp_rexmit_min; @@ -177,6 +182,8 @@ void tcp_timer_keep(void *xtp); void tcp_timer_persist(void *xtp); void tcp_timer_rexmt(void *xtp); void tcp_timer_delack(void *xtp); +void tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, + struct xtcp_timer *xtimer); #endif /* _KERNEL */ diff --git a/freebsd/sys/netinet/tcp_timewait.c b/freebsd/sys/netinet/tcp_timewait.c index f9b613a7..9034fab4 100644 --- a/freebsd/sys/netinet/tcp_timewait.c +++ b/freebsd/sys/netinet/tcp_timewait.c @@ -59,23 +59,19 @@ __FBSDID("$FreeBSD$"); #include <net/vnet.h> #include <netinet/in.h> +#include <netinet/in_pcb.h> #include <netinet/in_systm.h> +#include <netinet/in_var.h> #include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_var.h> #ifdef INET6 #include <netinet/ip6.h> -#endif -#include <netinet/in_pcb.h> -#ifdef INET6 #include <netinet6/in6_pcb.h> -#endif -#include <netinet/in_var.h> -#include <netinet/ip_var.h> -#ifdef INET6 #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> #include <netinet6/nd6.h> #endif -#include <netinet/ip_icmp.h> #include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> @@ -88,7 +84,9 @@ __FBSDID("$FreeBSD$"); #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif +#ifdef INET6 #include <netinet6/ip6protosw.h> +#endif #include <machine/in_cksum.h> @@ -204,15 +202,31 @@ tcp_twstart(struct tcpcb *tp) struct inpcb *inp = tp->t_inpcb; int acknow; struct socket *so; +#ifdef INET6 + int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; +#endif INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */ INP_WLOCK_ASSERT(inp); - if (V_nolocaltimewait && in_localip(inp->inp_faddr)) { - tp = tcp_close(tp); - if (tp != NULL) - INP_WUNLOCK(inp); - return; + if (V_nolocaltimewait) { + int error = 0; +#ifdef INET6 + if (isipv6) + error = in6_localaddr(&inp->in6p_faddr); +#endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + error = in_localip(inp->inp_faddr); +#endif + if (error) { + tp = tcp_close(tp); + if (tp != NULL) + INP_WUNLOCK(inp); + return; + } } tw = uma_zalloc(V_tcptw_zone, M_NOWAIT); @@ -493,16 +507,21 @@ int tcp_twrespond(struct tcptw *tw, int flags) { struct inpcb *inp = tw->tw_inpcb; - struct tcphdr *th; +#if defined(INET6) || defined(INET) + struct tcphdr *th = NULL; +#endif struct mbuf *m; +#ifdef INET struct ip *ip = NULL; +#endif u_int hdrlen, optlen; - int error; + int error = 0; /* Keep compiler happy */ struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif + hdrlen = 0; /* Keep compiler happy */ INP_WLOCK_ASSERT(inp); @@ -521,14 +540,19 @@ tcp_twrespond(struct tcptw *tw, int flags) ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, ip6, th); - } else + } #endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET { hdrlen = sizeof(struct tcpiphdr); ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, ip, th); } +#endif to.to_flags = 0; /* @@ -553,20 +577,25 @@ tcp_twrespond(struct tcptw *tw, int flags) th->th_flags = flags; th->th_win = htons(tw->last_win); + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { - th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), - sizeof(struct tcphdr) + optlen); + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + th->th_sum = in6_cksum_pseudo(ip6, + sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(inp, NULL); error = ip6_output(m, inp->in6p_outputopts, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); - } else + } #endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET { + m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); - m->m_pkthdr.csum_flags = CSUM_TCP; - m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); ip->ip_len = m->m_pkthdr.len; if (V_path_mtu_discovery) ip->ip_off |= IP_DF; @@ -574,6 +603,7 @@ tcp_twrespond(struct tcptw *tw, int flags) ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, inp); } +#endif if (flags & TH_ACK) TCPSTAT_INC(tcps_sndacks); else diff --git a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c index 2c1cd615..61711a6e 100644 --- a/freebsd/sys/netinet/tcp_usrreq.c +++ b/freebsd/sys/netinet/tcp_usrreq.c @@ -4,8 +4,12 @@ * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2006-2007 Robert N. M. Watson + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -43,6 +47,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/sys/param.h> #include <sys/systm.h> +#include <sys/limits.h> #include <sys/malloc.h> #include <sys/kernel.h> #include <sys/sysctl.h> @@ -66,17 +71,13 @@ __FBSDID("$FreeBSD$"); #include <netinet/cc.h> #include <netinet/in.h> -#include <netinet/in_systm.h> -#ifdef INET6 -#include <netinet/ip6.h> -#endif #include <netinet/in_pcb.h> -#ifdef INET6 -#include <netinet6/in6_pcb.h> -#endif +#include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip_var.h> #ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/in6_pcb.h> #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> #endif @@ -88,14 +89,18 @@ __FBSDID("$FreeBSD$"); #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif +#ifdef TCP_OFFLOAD #include <netinet/tcp_offload.h> +#endif /* * TCP protocol interface to socket abstraction. */ static int tcp_attach(struct socket *); +#ifdef INET static int tcp_connect(struct tcpcb *, struct sockaddr *, struct thread *td); +#endif /* INET */ #ifdef INET6 static int tcp6_connect(struct tcpcb *, struct sockaddr *, struct thread *td); @@ -233,6 +238,7 @@ tcp_usr_detach(struct socket *so) INP_INFO_WUNLOCK(&V_tcbinfo); } +#ifdef INET /* * Give the socket an address. */ @@ -256,7 +262,6 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) return (EAFNOSUPPORT); TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); INP_WLOCK(inp); @@ -266,14 +271,16 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) } tp = intotcpcb(inp); TCPDEBUG1(); + INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } +#endif /* INET */ #ifdef INET6 static int @@ -296,7 +303,6 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) return (EAFNOSUPPORT); TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); INP_WLOCK(inp); @@ -306,8 +312,10 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) } tp = intotcpcb(inp); TCPDEBUG1(); + INP_HASH_WLOCK(&V_tcbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; +#ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) inp->inp_vflag |= INP_IPV4; @@ -319,18 +327,21 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } } +#endif error = in6_pcbbind(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET6 */ +#ifdef INET /* * Prepare to accept connections. */ @@ -342,7 +353,6 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) struct tcpcb *tp = NULL; TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); INP_WLOCK(inp); @@ -354,21 +364,26 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); + INP_HASH_WLOCK(&V_tcbinfo); if (error == 0 && inp->inp_lport == 0) error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); - tcp_offload_listen_open(tp); +#ifdef TCP_OFFLOAD + if ((so->so_options & SO_NO_OFFLOAD) == 0) + tcp_offload_listen_start(tp); +#endif } SOCK_UNLOCK(so); out: TCPDEBUG2(PRU_LISTEN); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } +#endif /* INET */ #ifdef INET6 static int @@ -379,7 +394,6 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) struct tcpcb *tp = NULL; TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); INP_WLOCK(inp); @@ -391,26 +405,32 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); + INP_HASH_WLOCK(&V_tcbinfo); if (error == 0 && inp->inp_lport == 0) { inp->inp_vflag &= ~INP_IPV4; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); } + INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tp->t_state = TCPS_LISTEN; solisten_proto(so, backlog); +#ifdef TCP_OFFLOAD + if ((so->so_options & SO_NO_OFFLOAD) == 0) + tcp_offload_listen_start(tp); +#endif } SOCK_UNLOCK(so); out: TCPDEBUG2(PRU_LISTEN); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET6 */ +#ifdef INET /* * Initiate connection to peer. * Create a template for use in transmissions on this connection. @@ -439,7 +459,6 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) return (error); TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); INP_WLOCK(inp); @@ -451,13 +470,20 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) TCPDEBUG1(); if ((error = tcp_connect(tp, nam, td)) != 0) goto out; - error = tcp_output_connect(so, nam); +#ifdef TCP_OFFLOAD + if (registered_toedevs > 0 && + (so->so_options & SO_NO_OFFLOAD) == 0 && + (error = tcp_offload_connect(so, nam)) == 0) + goto out; +#endif + tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); + error = tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } +#endif /* INET */ #ifdef INET6 static int @@ -480,7 +506,6 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) return (EAFNOSUPPORT); - INP_INFO_WLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_WLOCK(inp); @@ -490,6 +515,12 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) } tp = intotcpcb(inp); TCPDEBUG1(); +#ifdef INET + /* + * XXXRW: Some confusion: V4/V6 flags relate to binding, and + * therefore probably require the hash lock, which isn't held here. + * Is this a significant problem? + */ if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { struct sockaddr_in sin; @@ -506,9 +537,16 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) goto out; - error = tcp_output_connect(so, nam); +#ifdef TCP_OFFLOAD + if (registered_toedevs > 0 && + (so->so_options & SO_NO_OFFLOAD) == 0 && + (error = tcp_offload_connect(so, nam)) == 0) + goto out; +#endif + error = tcp_output(tp); goto out; } +#endif inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; inp->inp_inc.inc_flags |= INC_ISIPV6; @@ -516,12 +554,18 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; if ((error = tcp6_connect(tp, nam, td)) != 0) goto out; - error = tcp_output_connect(so, nam); +#ifdef TCP_OFFLOAD + if (registered_toedevs > 0 && + (so->so_options & SO_NO_OFFLOAD) == 0 && + (error = tcp_offload_connect(so, nam)) == 0) + goto out; +#endif + tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); + error = tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET6 */ @@ -563,6 +607,7 @@ out: return (error); } +#ifdef INET /* * Accept a connection. Essentially all the work is done at higher levels; * just return the address of the peer, storing through addr. @@ -614,6 +659,7 @@ out: *nam = in_sockaddr(port, &addr); return error; } +#endif /* INET */ #ifdef INET6 static int @@ -633,6 +679,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; @@ -658,6 +705,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) out: TCPDEBUG2(PRU_ACCEPT); INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) { if (v4) *nam = in6_v4mapsin6_sockaddr(port, &addr); @@ -692,7 +740,7 @@ tcp_usr_shutdown(struct socket *so) socantsendmore(so); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - error = tcp_output_disconnect(tp); + error = tcp_output(tp); out: TCPDEBUG2(PRU_SHUTDOWN); @@ -722,7 +770,12 @@ tcp_usr_rcvd(struct socket *so, int flags) } tp = intotcpcb(inp); TCPDEBUG1(); - tcp_output_rcvd(tp); +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) + tcp_offload_rcvd(tp); + else +#endif + tcp_output(tp); out: TCPDEBUG2(PRU_RCVD); @@ -744,25 +797,17 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; - int headlocked = 0; #ifdef INET6 int isipv6; #endif TCPDEBUG0; /* - * We require the pcbinfo lock in two cases: - * - * (1) An implied connect is taking place, which can result in - * binding IPs and ports and hence modification of the pcb hash - * chains. - * - * (2) PRUS_EOF is set, resulting in explicit close on the send. + * We require the pcbinfo lock if we will close the socket as part of + * this call. */ - if ((nam != NULL) || (flags & PRUS_EOF)) { + if (flags & PRUS_EOF) INP_INFO_WLOCK(&V_tcbinfo); - headlocked = 1; - } inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); @@ -799,13 +844,16 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * initialize maxseg/maxopd using peer's cached * MSS. */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); - else #endif /* INET6 */ - error = tcp_connect(tp, nam, td); +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + error = tcp_connect(tp, nam, td); +#endif if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; @@ -820,14 +868,10 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, socantsendmore(so); tcp_usrclosed(tp); } - if (headlocked) { - INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; - } if (!(inp->inp_flags & INP_DROPPED)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; - error = tcp_output_send(tp); + error = tcp_output(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } @@ -859,33 +903,31 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * initialize maxseg/maxopd using peer's cached * MSS. */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); - else #endif /* INET6 */ - error = tcp_connect(tp, nam, td); +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + error = tcp_connect(tp, nam, td); +#endif if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); - INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; - } else if (nam) { - INP_INFO_WUNLOCK(&V_tcbinfo); - headlocked = 0; } tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_flags |= TF_FORCEDATA; - error = tcp_output_send(tp); + error = tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; } out: TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); INP_WUNLOCK(inp); - if (headlocked) + if (flags & PRUS_EOF) INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } @@ -1009,6 +1051,7 @@ out: return (error); } +#ifdef INET struct pr_usrreqs tcp_usrreqs = { .pru_abort = tcp_usr_abort, .pru_accept = tcp_usr_accept, @@ -1025,12 +1068,10 @@ struct pr_usrreqs tcp_usrreqs = { .pru_send = tcp_usr_send, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in_getsockaddr, -#if 0 - .pru_soreceive = soreceive_stream, -#endif .pru_sosetlabel = in_pcbsosetlabel, .pru_close = tcp_usr_close, }; +#endif /* INET */ #ifdef INET6 struct pr_usrreqs tcp6_usrreqs = { @@ -1049,14 +1090,12 @@ struct pr_usrreqs tcp6_usrreqs = { .pru_send = tcp_usr_send, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in6_mapped_sockaddr, -#if 0 - .pru_soreceive = soreceive_stream, -#endif - .pru_sosetlabel = in_pcbsosetlabel, + .pru_sosetlabel = in_pcbsosetlabel, .pru_close = tcp_usr_close, }; #endif /* INET6 */ +#ifdef INET /* * Common subroutine to open a TCP connection to remote host specified * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local @@ -1076,13 +1115,13 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) u_short lport; int error; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) - return error; + goto out; } /* @@ -1095,11 +1134,14 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); if (error && oinp == NULL) - return error; - if (oinp) - return EADDRINUSE; + goto out; + if (oinp) { + error = EADDRINUSE; + goto out; + } inp->inp_laddr = laddr; in_pcbrehash(inp); + INP_HASH_WUNLOCK(&V_tcbinfo); /* * Compute window scaling to request: @@ -1113,13 +1155,16 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tp->t_state = TCPS_SYN_SENT; - tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); tp->iss = tcp_new_isn(tp); - tp->t_bw_rtseq = tp->iss; tcp_sendseqinit(tp); return 0; + +out: + INP_HASH_WUNLOCK(&V_tcbinfo); + return (error); } +#endif /* INET */ #ifdef INET6 static int @@ -1131,13 +1176,13 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) struct in6_addr addr6; int error; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) - return error; + goto out; } /* @@ -1145,18 +1190,23 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. * in6_pcbladdr() also handles scope zone IDs. + * + * XXXRW: We wouldn't need to expose in6_pcblookup_hash_locked() + * outside of in6_pcb.c if there were an in6_pcbconnect_setup(). */ error = in6_pcbladdr(inp, nam, &addr6); if (error) - return error; - oinp = in6_pcblookup_hash(inp->inp_pcbinfo, + goto out; + oinp = in6_pcblookup_hash_locked(inp->inp_pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? &addr6 : &inp->in6p_laddr, inp->inp_lport, 0, NULL); - if (oinp) - return EADDRINUSE; + if (oinp) { + error = EADDRINUSE; + goto out; + } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = addr6; inp->in6p_faddr = sin6->sin6_addr; @@ -1167,6 +1217,7 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) inp->inp_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); in_pcbrehash(inp); + INP_HASH_WUNLOCK(&V_tcbinfo); /* Compute window scaling to request. */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && @@ -1176,12 +1227,14 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tp->t_state = TCPS_SYN_SENT; - tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); tp->iss = tcp_new_isn(tp); - tp->t_bw_rtseq = tp->iss; tcp_sendseqinit(tp); return 0; + +out: + INP_HASH_WUNLOCK(&V_tcbinfo); + return error; } #endif /* INET6 */ @@ -1224,7 +1277,7 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_rcv_space = tp->rcv_wnd; ti->tcpi_rcv_nxt = tp->rcv_nxt; ti->tcpi_snd_wnd = tp->snd_wnd; - ti->tcpi_snd_bwnd = tp->snd_bwnd; + ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */ ti->tcpi_snd_nxt = tp->snd_nxt; ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_maxseg; @@ -1254,6 +1307,7 @@ int tcp_ctloutput(struct socket *so, struct sockopt *sopt) { int error, opt, optval; + u_int ui; struct inpcb *inp; struct tcpcb *tp; struct tcp_info ti; @@ -1269,11 +1323,15 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) if (inp->inp_vflag & INP_IPV6PROTO) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); - } else { + } #endif /* INET6 */ +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); -#ifdef INET6 } #endif return (error); @@ -1299,9 +1357,9 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_flags |= TF_SIGNATURE; else tp->t_flags &= ~TF_SIGNATURE; - INP_WUNLOCK(inp); - break; + goto unlock_and_done; #endif /* TCP_SIGNATURE */ + case TCP_NODELAY: case TCP_NOOPT: INP_WUNLOCK(inp); @@ -1327,6 +1385,13 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_flags |= opt; else tp->t_flags &= ~opt; +unlock_and_done: +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) { + tcp_offload_ctloutput(tp, sopt->sopt_dir, + sopt->sopt_name); + } +#endif INP_WUNLOCK(inp); break; @@ -1345,8 +1410,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) if (TCPS_HAVEESTABLISHED(tp->t_state)) error = tcp_output(tp); } - INP_WUNLOCK(inp); - break; + goto unlock_and_done; case TCP_MAXSEG: INP_WUNLOCK(inp); @@ -1361,8 +1425,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) tp->t_maxseg = optval; else error = EINVAL; - INP_WUNLOCK(inp); - break; + goto unlock_and_done; case TCP_INFO: INP_WUNLOCK(inp); @@ -1414,6 +1477,64 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) } } CC_LIST_RUNLOCK(); + goto unlock_and_done; + + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPINIT: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); + if (error) + return (error); + + if (ui > (UINT_MAX / hz)) { + error = EINVAL; + break; + } + ui *= hz; + + INP_WLOCK_RECHECK(inp); + switch (sopt->sopt_name) { + case TCP_KEEPIDLE: + tp->t_keepidle = ui; + /* + * XXX: better check current remaining + * timeout and "merge" it with new value. + */ + if ((tp->t_state > TCPS_LISTEN) && + (tp->t_state <= TCPS_CLOSING)) + tcp_timer_activate(tp, TT_KEEP, + TP_KEEPIDLE(tp)); + break; + case TCP_KEEPINTVL: + tp->t_keepintvl = ui; + if ((tp->t_state == TCPS_FIN_WAIT_2) && + (TP_MAXIDLE(tp) > 0)) + tcp_timer_activate(tp, TT_2MSL, + TP_MAXIDLE(tp)); + break; + case TCP_KEEPINIT: + tp->t_keepinit = ui; + if (tp->t_state == TCPS_SYN_RECEIVED || + tp->t_state == TCPS_SYN_SENT) + tcp_timer_activate(tp, TT_KEEP, + TP_KEEPINIT(tp)); + break; + } + goto unlock_and_done; + + case TCP_KEEPCNT: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); + if (error) + return (error); + + INP_WLOCK_RECHECK(inp); + tp->t_keepcnt = ui; + if ((tp->t_state == TCPS_FIN_WAIT_2) && + (TP_MAXIDLE(tp) > 0)) + tcp_timer_activate(tp, TT_2MSL, + TP_MAXIDLE(tp)); INP_WUNLOCK(inp); break; @@ -1478,18 +1599,6 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) #undef INP_WLOCK_RECHECK /* - * tcp_sendspace and tcp_recvspace are the default send and receive window - * sizes, respectively. These are obsolescent (this information should - * be set by the route). - */ -u_long tcp_sendspace = 1024*32; -SYSCTL_ULONG(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, - &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); -u_long tcp_recvspace = 1024*64; -SYSCTL_ULONG(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, - &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); - -/* * Attach TCP protocol to socket, allocating * internet protocol control block, tcp control block, * bufer space, and entering LISTEN state if to accept connections. @@ -1502,7 +1611,7 @@ tcp_attach(struct socket *so) int error; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { - error = soreserve(so, tcp_sendspace, tcp_recvspace); + error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace); if (error) return (error); } @@ -1570,7 +1679,7 @@ tcp_disconnect(struct tcpcb *tp) sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - tcp_output_disconnect(tp); + tcp_output(tp); } } @@ -1593,7 +1702,9 @@ tcp_usrclosed(struct tcpcb *tp) switch (tp->t_state) { case TCPS_LISTEN: - tcp_offload_listen_close(tp); +#ifdef TCP_OFFLOAD + tcp_offload_listen_stop(tp); +#endif /* FALLTHROUGH */ case TCPS_CLOSED: tp->t_state = TCPS_CLOSED; @@ -1626,7 +1737,7 @@ tcp_usrclosed(struct tcpcb *tp) int timeout; timeout = (tcp_fast_finwait2_recycle) ? - tcp_finwait2_timeout : tcp_maxidle; + tcp_finwait2_timeout : TP_MAXIDLE(tp); tcp_timer_activate(tp, TT_2MSL, timeout); } } @@ -1865,26 +1976,24 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); db_print_indent(indent); - db_printf("snd_wnd: %lu snd_cwnd: %lu snd_bwnd: %lu\n", - tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd); + db_printf("snd_wnd: %lu snd_cwnd: %lu\n", + tp->snd_wnd, tp->snd_cwnd); db_print_indent(indent); - db_printf("snd_ssthresh: %lu snd_bandwidth: %lu snd_recover: " - "0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth, - tp->snd_recover); + db_printf("snd_ssthresh: %lu snd_recover: " + "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); db_printf("t_maxopd: %u t_rcvtime: %u t_startime: %u\n", tp->t_maxopd, tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); - db_printf("t_rttime: %u t_rtsq: 0x%08x t_bw_rtttime: %u\n", - tp->t_rtttime, tp->t_rtseq, tp->t_bw_rtttime); + db_printf("t_rttime: %u t_rtsq: 0x%08x\n", + tp->t_rtttime, tp->t_rtseq); db_print_indent(indent); - db_printf("t_bw_rtseq: 0x%08x t_rxtcur: %d t_maxseg: %u " - "t_srtt: %d\n", tp->t_bw_rtseq, tp->t_rxtcur, tp->t_maxseg, - tp->t_srtt); + db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n", + tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); db_print_indent(indent); db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " diff --git a/freebsd/sys/netinet/tcp_var.h b/freebsd/sys/netinet/tcp_var.h index 618250cd..171eafb6 100644 --- a/freebsd/sys/netinet/tcp_var.h +++ b/freebsd/sys/netinet/tcp_var.h @@ -72,6 +72,7 @@ struct sackhint { int sack_bytes_rexmit; tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ + int ispare; /* explicit pad for 64bit alignment */ uint64_t _pad[2]; /* 1 sacked_bytes, 1 TBD */ }; @@ -131,12 +132,12 @@ struct tcpcb { u_long snd_wnd; /* send window */ u_long snd_cwnd; /* congestion-controlled window */ - u_long snd_bwnd; /* bandwidth-controlled window */ + u_long snd_spare1; /* unused */ u_long snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ - u_long snd_bandwidth; /* calculated bandwidth or 0 */ + u_long snd_spare2; /* unused */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ u_int t_maxopd; /* mss plus options */ @@ -146,8 +147,8 @@ struct tcpcb { u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ - u_int t_bw_rtttime; /* used for bandwidth calculation */ - tcp_seq t_bw_rtseq; /* used for bandwidth calculation */ + u_int t_bw_spare1; /* unused */ + tcp_seq t_bw_spare2; /* unused */ int t_rxtcur; /* current retransmit value (ticks) */ u_int t_maxseg; /* maximum segment size */ @@ -177,6 +178,7 @@ struct tcpcb { u_long snd_cwnd_prev; /* cwnd prior to retransmit */ u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ + int t_sndzerowin; /* zero-window updates sent */ u_int t_badrxtwin; /* window for retransmit recovery */ u_char snd_limited; /* segments limited transmitted */ /* SACK related state */ @@ -192,21 +194,25 @@ struct tcpcb { int t_rttlow; /* smallest observerved RTT */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ - struct toe_usrreqs *t_tu; /* offload operations vector */ + struct toedev *tod; /* toedev handling this connection */ + int t_sndrexmitpack; /* retransmit packets sent */ + int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ int t_bytes_acked; /* # bytes acked during current RTT */ - - int t_sndzerowin; /* zero-window updates sent */ - struct cc_algo *cc_algo; /* congestion control algorithm */ struct cc_var *ccv; /* congestion control specific vars */ struct osd *osd; /* storage for Khelp module data */ - void *t_pspare2[3]; /* 3 TBD */ - uint64_t _pad[10]; /* 7 UTO, 3 TBD (1-2 CC/RTT?) */ + u_int t_keepinit; /* time to establish connection */ + u_int t_keepidle; /* time before keepalive probes begin */ + u_int t_keepintvl; /* interval between keepalives */ + u_int t_keepcnt; /* number of keepalives before close */ - uint64_t t_sndrexmitpack;/* retransmit packets sent */ - uint64_t t_rcvoopack; /* out-of-order packets received */ + u_int t_tsomax; /* tso burst length limit */ + + uint32_t t_ispare[7]; /* 5 UTO, 2 TBD */ + void *t_pspare2[4]; /* 4 TBD */ + uint64_t _pad[6]; /* 6 TBD (1-2 CC/RTT?) */ }; /* @@ -301,6 +307,7 @@ struct tcpopt { u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ + u_int32_t to_spare; /* UTO */ }; /* @@ -319,6 +326,15 @@ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ }; +/* + * Used by tcp_maxmtu() to communicate interface specific features + * and limits at the time of connection setup. + */ +struct tcp_ifcap { + int ifcap; + u_int tsomax; +}; + #ifndef _NETINET_IN_PCB_H_ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ @@ -478,7 +494,7 @@ struct tcpstat { u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */ u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */ - u_long tcps_sack_sboverflow; /* times scoreboard overflowed */ + u_long tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ u_long tcps_ecn_ce; /* ECN Congestion Experienced */ @@ -494,7 +510,7 @@ struct tcpstat { u_long tcps_sig_err_sigopt; /* No signature expected by socket */ u_long tcps_sig_err_nosigopt; /* No signature provided by segment */ - u_long _pad[7]; /* 6 UTO, 1 TBD */ + u_long _pad[12]; /* 6 UTO, 6 TBD */ }; #ifdef _KERNEL @@ -535,11 +551,20 @@ struct tcp_hhook_data { * included. Not all of our clients do. */ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) +struct xtcp_timer { + int tt_rexmt; /* retransmit timer */ + int tt_persist; /* retransmit persistence */ + int tt_keep; /* keepalive */ + int tt_2msl; /* 2*msl TIME_WAIT timer */ + int tt_delack; /* delayed ACK timer */ + int t_rcvtime; /* Time since last packet received */ +}; struct xtcpcb { size_t xt_len; struct inpcb xt_inp; struct tcpcb xt_tp; struct xsocket xt_socket; + struct xtcp_timer xt_timer; u_quad_t xt_alignment_hack; }; #endif @@ -597,9 +622,10 @@ VNET_DECLARE(int, tcp_mssdflt); /* XXX */ VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_rfc3390); +VNET_DECLARE(int, tcp_do_initcwnd10); +VNET_DECLARE(int, tcp_sendspace); +VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, path_mtu_discovery); -VNET_DECLARE(int, ss_fltsz); -VNET_DECLARE(int, ss_fltsz_local); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_abc_l_var); #define V_tcb VNET(tcb) @@ -609,9 +635,10 @@ VNET_DECLARE(int, tcp_abc_l_var); #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) +#define V_tcp_do_initcwnd10 VNET(tcp_do_initcwnd10) +#define V_tcp_sendspace VNET(tcp_sendspace) +#define V_tcp_recvspace VNET(tcp_recvspace) #define V_path_mtu_discovery VNET(path_mtu_discovery) -#define V_ss_fltsz VNET(ss_fltsz) -#define V_ss_fltsz_local VNET(ss_fltsz_local) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) @@ -648,7 +675,7 @@ void tcp_init(void); void tcp_destroy(void); #endif void tcp_fini(void *); -char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, +char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *, const void *); @@ -659,10 +686,10 @@ void tcp_reass_flush(struct tcpcb *); void tcp_reass_destroy(void); #endif void tcp_input(struct mbuf *, int); -u_long tcp_maxmtu(struct in_conninfo *, int *); -u_long tcp_maxmtu6(struct in_conninfo *, int *); +u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); +u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, - int *); + struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct inpcb * @@ -695,7 +722,6 @@ void tcpip_fillheaders(struct inpcb *, void *, void *); void tcp_timer_activate(struct tcpcb *, int, u_int); int tcp_timer_active(struct tcpcb *, int); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); -void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ @@ -709,8 +735,6 @@ void tcp_hc_updatemtu(struct in_conninfo *, u_long); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct pr_usrreqs tcp_usrreqs; -extern u_long tcp_sendspace; -extern u_long tcp_recvspace; tcp_seq tcp_new_isn(struct tcpcb *); void tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); diff --git a/freebsd/sys/netinet/toecore.h b/freebsd/sys/netinet/toecore.h new file mode 100644 index 00000000..6ea98518 --- /dev/null +++ b/freebsd/sys/netinet/toecore.h @@ -0,0 +1,130 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_TOE_H_ +#define _NETINET_TOE_H_ + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +struct tcpopt; +struct tcphdr; +struct in_conninfo; + +struct toedev { + TAILQ_ENTRY(toedev) link; /* glue for toedev_list */ + void *tod_softc; /* TOE driver private data */ + + /* + * Active open. If a failure occurs, it is reported back by the driver + * via toe_connect_failed. + */ + int (*tod_connect)(struct toedev *, struct socket *, struct rtentry *, + struct sockaddr *); + + /* Passive open. */ + int (*tod_listen_start)(struct toedev *, struct tcpcb *); + int (*tod_listen_stop)(struct toedev *, struct tcpcb *); + + /* + * The kernel uses this routine to pass on any frame it receives for an + * offloaded connection to the TOE driver. This is an unusual event. + */ + void (*tod_input)(struct toedev *, struct tcpcb *, struct mbuf *); + + /* + * This is called by the kernel during pru_rcvd for an offloaded TCP + * connection and provides an opportunity for the TOE driver to manage + * its rx window and credits. + */ + void (*tod_rcvd)(struct toedev *, struct tcpcb *); + + /* + * Transmit routine. The kernel calls this to have the TOE driver + * evaluate whether there is data to be transmitted, and transmit it. + */ + int (*tod_output)(struct toedev *, struct tcpcb *); + + /* Immediate teardown: send RST to peer. */ + int (*tod_send_rst)(struct toedev *, struct tcpcb *); + + /* Initiate orderly disconnect by sending FIN to the peer. */ + int (*tod_send_fin)(struct toedev *, struct tcpcb *); + + /* Called to indicate that the kernel is done with this TCP PCB. */ + void (*tod_pcb_detach)(struct toedev *, struct tcpcb *); + + /* + * The kernel calls this once it has information about an L2 entry that + * the TOE driver enquired about previously (via toe_l2_resolve). + */ + void (*tod_l2_update)(struct toedev *, struct ifnet *, + struct sockaddr *, uint8_t *, uint16_t); + + /* XXX. Route has been redirected. */ + void (*tod_route_redirect)(struct toedev *, struct ifnet *, + struct rtentry *, struct rtentry *); + + /* Syncache interaction. */ + void (*tod_syncache_added)(struct toedev *, void *); + void (*tod_syncache_removed)(struct toedev *, void *); + int (*tod_syncache_respond)(struct toedev *, void *, struct mbuf *); + void (*tod_offload_socket)(struct toedev *, void *, struct socket *); + + /* TCP socket option */ + void (*tod_ctloutput)(struct toedev *, struct tcpcb *, int, int); +}; + +#include <sys/eventhandler.h> +typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *); +typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *); +EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn); +EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn); + +void init_toedev(struct toedev *); +int register_toedev(struct toedev *); +int unregister_toedev(struct toedev *); + +/* + * General interface for looking up L2 information for an IP address. If an + * answer is not available right away then the TOE driver's tod_l2_update will + * be called later. + */ +int toe_l2_resolve(struct toedev *, struct ifnet *, struct sockaddr *, + uint8_t *, uint16_t *); + +void toe_connect_failed(struct toedev *, struct inpcb *, int); + +void toe_syncache_add(struct in_conninfo *, struct tcpopt *, struct tcphdr *, + struct inpcb *, void *, void *); +int toe_syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *, + struct socket **); + +int toe_4tuple_check(struct in_conninfo *, struct tcphdr *, struct ifnet *); +#endif diff --git a/freebsd/sys/netinet/udp.h b/freebsd/sys/netinet/udp.h index 5ec55970..c2d638dd 100644 --- a/freebsd/sys/netinet/udp.h +++ b/freebsd/sys/netinet/udp.h @@ -57,7 +57,7 @@ struct udphdr { * UDP Encapsulation of IPsec Packets options. */ /* Encapsulation types. */ -#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ +#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ #define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-02+ */ /* Default ESP in UDP encapsulation port. */ diff --git a/freebsd/sys/netinet/udp_usrreq.c b/freebsd/sys/netinet/udp_usrreq.c index 6c0e61c1..bf95e954 100644 --- a/freebsd/sys/netinet/udp_usrreq.c +++ b/freebsd/sys/netinet/udp_usrreq.c @@ -4,8 +4,12 @@ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson + * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * + * Portions of this software were developed by Robert N. M. Watson under + * contract to Juniper Networks, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -41,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_ipfw.h> +#include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_ipsec.h> @@ -149,9 +154,12 @@ SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, &VNET_NAME(udpstat), udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); +#ifdef INET static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); +#endif + #ifdef IPSEC #ifdef IPSEC_NAT_T #define UF_ESPINUDP_ALL (UF_ESPINUDP_NON_IKE|UF_ESPINUDP) @@ -183,25 +191,12 @@ void udp_init(void) { - - INP_INFO_LOCK_INIT(&V_udbinfo, "udp"); - LIST_INIT(&V_udb); -#ifdef VIMAGE - V_udbinfo.ipi_vnet = curvnet; -#endif - V_udbinfo.ipi_listhead = &V_udb; - V_udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB, - &V_udbinfo.ipi_hashmask); - V_udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB, - &V_udbinfo.ipi_porthashmask); - V_udbinfo.ipi_zone = uma_zcreate("udp_inpcb", sizeof(struct inpcb), - NULL, NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); - + in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, + "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, + IPI_HASHFIELDS_2TUPLE); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_udpcb_zone, maxsockets); - EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -244,17 +239,12 @@ void udp_destroy(void) { - hashdestroy(V_udbinfo.ipi_hashbase, M_PCB, - V_udbinfo.ipi_hashmask); - hashdestroy(V_udbinfo.ipi_porthashbase, M_PCB, - V_udbinfo.ipi_porthashmask); - + in_pcbinfo_destroy(&V_udbinfo); uma_zdestroy(V_udpcb_zone); - uma_zdestroy(V_udbinfo.ipi_zone); - INP_INFO_LOCK_DESTROY(&V_udbinfo); } #endif +#ifdef INET /* * Subroutine of udp_input(), which appends the provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that @@ -272,25 +262,32 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, #ifdef INET6 struct sockaddr_in6 udp_in6; #endif -#ifdef IPSEC -#ifdef IPSEC_NAT_T -#ifdef INET struct udpcb *up; -#endif -#endif -#endif - INP_RLOCK_ASSERT(inp); + INP_LOCK_ASSERT(inp); + + /* + * Engage the tunneling protocol. + */ + up = intoudpcb(inp); + if (up->u_tun_func != NULL) { + (*up->u_tun_func)(n, off, inp); + return; + } + + if (n == NULL) + return; + + off += sizeof(struct udphdr); #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec4_in_reject(n, inp)) { m_freem(n); - V_ipsec4stat.in_polvio++; + IPSECSTAT_INC(in_polvio); return; } #ifdef IPSEC_NAT_T -#ifdef INET up = intoudpcb(inp); KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); if (up->u_flags & UF_ESPINUDP_ALL) { /* IPSec UDP encaps. */ @@ -298,7 +295,6 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, if (n == NULL) /* Consumed. */ return; } -#endif /* INET */ #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ #ifdef MAC @@ -306,14 +302,14 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, m_freem(n); return; } -#endif +#endif /* MAC */ if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) (void)ip6_savecontrol_v4(inp, n, &opts, NULL); else -#endif +#endif /* INET6 */ ip_savecontrol(inp, &opts, ip, n); } #ifdef INET6 @@ -324,7 +320,7 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, in6_sin_2_v4mapsin6(udp_in, &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else -#endif +#endif /* INET6 */ append_sa = (struct sockaddr *)udp_in; m_adj(n, off); @@ -348,13 +344,10 @@ udp_input(struct mbuf *m, int off) struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; - struct udpcb *up; int len; struct ip save_ip; struct sockaddr_in udp_in; -#ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; -#endif ifp = m->m_pkthdr.rcvif; UDPSTAT_INC(udps_ipackets); @@ -452,34 +445,12 @@ udp_input(struct mbuf *m, int off) } else UDPSTAT_INC(udps_nosum); -#ifdef IPFIREWALL_FORWARD - /* - * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. - */ - fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); - if (fwd_tag != NULL) { - struct sockaddr_in *next_hop; - - /* - * Do the hack. - */ - next_hop = (struct sockaddr_in *)(fwd_tag + 1); - ip->ip_dst = next_hop->sin_addr; - uh->uh_dport = ntohs(next_hop->sin_port); - - /* - * Remove the tag from the packet. We don't need it anymore. - */ - m_tag_delete(m, fwd_tag); - } -#endif - - INP_INFO_RLOCK(&V_udbinfo); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct ip_moptions *imo; + INP_INFO_RLOCK(&V_udbinfo); last = NULL; LIST_FOREACH(inp, &V_udb, inp_list) { if (inp->inp_lport != uh->uh_dport) @@ -501,24 +472,24 @@ udp_input(struct mbuf *m, int off) INP_RLOCK(inp); /* - * Detached PCBs can linger in the list if someone - * holds a reference. (e.g. udp_pcblist) + * XXXRW: Because we weren't holding either the inpcb + * or the hash lock when we checked for a match + * before, we should probably recheck now that the + * inpcb lock is held. */ - if (inp->inp_socket == NULL) { - INP_RUNLOCK(inp); - continue; - } /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ imo = inp->inp_moptions; - if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && - imo != NULL) { + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct sockaddr_in group; int blocked; - + if (imo == NULL) { + INP_RUNLOCK(inp); + continue; + } bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; @@ -541,24 +512,7 @@ udp_input(struct mbuf *m, int off) struct mbuf *n; n = m_copy(m, 0, M_COPYALL); - up = intoudpcb(last); - if (up->u_tun_func == NULL) { - if (n != NULL) - udp_append(last, - ip, n, - iphlen + - sizeof(struct udphdr), - &udp_in); - } else { - /* - * Engage the tunneling protocol we - * will have to leave the info_lock - * up, since we are hunting through - * multiple UDP's. - */ - - (*up->u_tun_func)(n, iphlen, last); - } + udp_append(last, ip, n, iphlen, &udp_in); INP_RUNLOCK(last); } last = inp; @@ -582,18 +536,12 @@ udp_input(struct mbuf *m, int off) * or multicast datgram.) */ UDPSTAT_INC(udps_noportbcast); - goto badheadlocked; - } - up = intoudpcb(last); - if (up->u_tun_func == NULL) { - udp_append(last, ip, m, iphlen + sizeof(struct udphdr), - &udp_in); - } else { - /* - * Engage the tunneling protocol. - */ - (*up->u_tun_func)(m, iphlen, last); + if (inp) + INP_RUNLOCK(inp); + INP_INFO_RUNLOCK(&V_udbinfo); + goto badunlocked; } + udp_append(last, ip, m, iphlen, &udp_in); INP_RUNLOCK(last); INP_INFO_RUNLOCK(&V_udbinfo); return; @@ -602,8 +550,41 @@ udp_input(struct mbuf *m, int off) /* * Locate pcb for datagram. */ - inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport, - ip->ip_dst, uh->uh_dport, 1, ifp); + + /* + * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. + */ + if ((m->m_flags & M_IP_NEXTHOP) && + (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { + struct sockaddr_in *next_hop; + + next_hop = (struct sockaddr_in *)(fwd_tag + 1); + + /* + * Transparently forwarded. Pretend to be the destination. + * Already got one like this? + */ + inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport, + ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m); + if (!inp) { + /* + * It's new. Try to find the ambushing socket. + * Because we've rewritten the destination address, + * any hardware-generated hash is ignored. + */ + inp = in_pcblookup(&V_udbinfo, ip->ip_src, + uh->uh_sport, next_hop->sin_addr, + next_hop->sin_port ? htons(next_hop->sin_port) : + uh->uh_dport, INPLOOKUP_WILDCARD | + INPLOOKUP_RLOCKPCB, ifp); + } + /* Remove the tag from the packet. We don't need it anymore. */ + m_tag_delete(m, fwd_tag); + m->m_flags &= ~M_IP_NEXTHOP; + } else + inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport, + ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | + INPLOOKUP_RLOCKPCB, ifp, m); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; @@ -617,57 +598,35 @@ udp_input(struct mbuf *m, int off) UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); - goto badheadlocked; + goto badunlocked; } if (V_udp_blackhole) - goto badheadlocked; + goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) - goto badheadlocked; + goto badunlocked; *ip = save_ip; ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); - INP_INFO_RUNLOCK(&V_udbinfo); return; } /* * Check the minimum TTL for socket. */ - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); - - /* - * Detached PCBs can linger in the hash table if someone holds a - * reference. (e.g. udp_pcblist) - */ - if (inp->inp_socket == NULL) { - INP_RUNLOCK(inp); - goto badunlocked; - } + INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); - goto badunlocked; - } - up = intoudpcb(inp); - if (up->u_tun_func == NULL) { - udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in); - } else { - /* - * Engage the tunneling protocol. - */ - - (*up->u_tun_func)(m, iphlen, inp); + m_freem(m); + return; } + udp_append(inp, ip, m, iphlen, &udp_in); INP_RUNLOCK(inp); return; -badheadlocked: - if (inp) - INP_RUNLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); badunlocked: m_freem(m); } +#endif /* INET */ /* * Notify a udp user of an asynchronous error; just wake up so that they can @@ -691,6 +650,7 @@ udp_notify(struct inpcb *inp, int errno) return (inp); } +#ifdef INET void udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { @@ -721,21 +681,20 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - INP_INFO_RLOCK(&V_udbinfo); - inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport, - ip->ip_src, uh->uh_sport, 0, NULL); + inp = in_pcblookup(&V_udbinfo, faddr, uh->uh_dport, + ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); + INP_RLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_udbinfo); } else in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd], udp_notify); } +#endif /* INET */ static int udp_pcblist(SYSCTL_HANDLER_ARGS) @@ -820,9 +779,9 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) INP_INFO_WLOCK(&V_udbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; - INP_WLOCK(inp); - if (!in_pcbrele(inp)) - INP_WUNLOCK(inp); + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_udbinfo); @@ -848,6 +807,7 @@ SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); +#ifdef INET static int udp_getcred(SYSCTL_HANDLER_ARGS) { @@ -862,12 +822,11 @@ udp_getcred(SYSCTL_HANDLER_ARGS) error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); - INP_INFO_RLOCK(&V_udbinfo); - inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, - addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); + inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, + INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { - INP_RLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); + INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) @@ -875,10 +834,8 @@ udp_getcred(SYSCTL_HANDLER_ARGS) if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); - } else { - INP_INFO_RUNLOCK(&V_udbinfo); + } else error = ENOENT; - } if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); @@ -887,6 +844,7 @@ udp_getcred(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); +#endif /* INET */ int udp_ctloutput(struct socket *so, struct sockopt *sopt) @@ -905,11 +863,15 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt) if (INP_CHECK_SOCKAF(so, AF_INET6)) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); - } else { + } #endif +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); -#ifdef INET6 } #endif return (error); @@ -981,6 +943,10 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt) return (error); } +#ifdef INET +#define UH_WLOCKED 2 +#define UH_RLOCKED 1 +#define UH_UNLOCKED 0 static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) @@ -1010,6 +976,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, } src.sin_family = 0; + INP_RLOCK(inp); tos = inp->inp_ip_tos; if (control != NULL) { /* @@ -1017,6 +984,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * stored in a single mbuf. */ if (control->m_next) { + INP_RUNLOCK(inp); m_freem(control); m_freem(m); return (EINVAL); @@ -1066,6 +1034,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, m_freem(control); } if (error) { + INP_RUNLOCK(inp); m_freem(m); return (error); } @@ -1083,29 +1052,26 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * conservative locks than required the second time around, so later * assertions have to accept that. Further analysis of the number of * misses under contention is required. + * + * XXXRW: Check that hash locking update here is correct. */ sin = (struct sockaddr_in *)addr; - INP_RLOCK(inp); if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_RUNLOCK(inp); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); - unlock_udbinfo = 2; + INP_HASH_WLOCK(&V_udbinfo); + unlock_udbinfo = UH_WLOCKED; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { - if (!INP_INFO_TRY_RLOCK(&V_udbinfo)) { - INP_RUNLOCK(inp); - INP_INFO_RLOCK(&V_udbinfo); - INP_RLOCK(inp); - } - unlock_udbinfo = 1; + INP_HASH_RLOCK(&V_udbinfo); + unlock_udbinfo = UH_RLOCKED; } else - unlock_udbinfo = 0; + unlock_udbinfo = UH_UNLOCKED; /* * If the IP_SENDSRCADDR control message was specified, override the @@ -1115,7 +1081,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { - INP_INFO_LOCK_ASSERT(&V_udbinfo); + INP_HASH_LOCK_ASSERT(&V_udbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { @@ -1166,7 +1132,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { - INP_INFO_LOCK_ASSERT(&V_udbinfo); + INP_HASH_LOCK_ASSERT(&V_udbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); @@ -1180,8 +1146,8 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { - INP_INFO_WLOCK_ASSERT(&V_udbinfo); INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(&V_udbinfo); /* * Remember addr if jailed, to prevent * rebinding. @@ -1276,25 +1242,25 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); - if (unlock_udbinfo == 2) - INP_INFO_WUNLOCK(&V_udbinfo); - else if (unlock_udbinfo == 1) - INP_INFO_RUNLOCK(&V_udbinfo); + if (unlock_udbinfo == UH_WLOCKED) + INP_HASH_WUNLOCK(&V_udbinfo); + else if (unlock_udbinfo == UH_RLOCKED) + INP_HASH_RUNLOCK(&V_udbinfo); error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); - if (unlock_udbinfo == 2) + if (unlock_udbinfo == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (error); release: - if (unlock_udbinfo == 2) { + if (unlock_udbinfo == UH_WLOCKED) { + INP_HASH_WUNLOCK(&V_udbinfo); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); - } else if (unlock_udbinfo == 1) { + } else if (unlock_udbinfo == UH_RLOCKED) { + INP_HASH_RUNLOCK(&V_udbinfo); INP_RUNLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); } else INP_RUNLOCK(inp); m_freem(m); @@ -1303,7 +1269,6 @@ release: #if defined(IPSEC) && defined(IPSEC_NAT_T) -#ifdef INET /* * Potentially decap ESP in UDP frame. Check for an ESP header * and optional marker; if present, strip the UDP header and @@ -1332,7 +1297,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) if (minlen > m->m_pkthdr.len) minlen = m->m_pkthdr.len; if ((m = m_pullup(m, minlen)) == NULL) { - V_ipsec4stat.in_inval++; + IPSECSTAT_INC(in_inval); return (NULL); /* Bypass caller processing. */ } data = mtod(m, caddr_t); /* Points to ip header. */ @@ -1372,7 +1337,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) uint32_t spi; if (payload <= sizeof(struct esp)) { - V_ipsec4stat.in_inval++; + IPSECSTAT_INC(in_inval); m_freem(m); return (NULL); /* Discard. */ } @@ -1393,7 +1358,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS, 2 * sizeof(uint16_t), M_NOWAIT); if (tag == NULL) { - V_ipsec4stat.in_nomem++; + IPSECSTAT_INC(in_nomem); m_freem(m); return (NULL); /* Discard. */ } @@ -1435,7 +1400,6 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) (void) ipsec4_common_input(m, iphlen, ip->ip_p); return (NULL); /* NB: consumed, bypass processing. */ } -#endif /* INET */ #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */ static void @@ -1445,15 +1409,15 @@ udp_abort(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { + INP_HASH_WLOCK(&V_udbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; + INP_HASH_WUNLOCK(&V_udbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); } static int @@ -1490,6 +1454,7 @@ udp_attach(struct socket *so, int proto, struct thread *td) INP_INFO_WUNLOCK(&V_udbinfo); return (0); } +#endif /* INET */ int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f) @@ -1512,6 +1477,7 @@ udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f) return (0); } +#ifdef INET static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { @@ -1520,11 +1486,11 @@ udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_udbinfo); error = in_pcbbind(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_udbinfo); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (error); } @@ -1535,15 +1501,15 @@ udp_close(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { + INP_HASH_WLOCK(&V_udbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; + INP_HASH_WUNLOCK(&V_udbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); } static int @@ -1555,25 +1521,23 @@ udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (EISCONN); } sin = (struct sockaddr_in *)nam; error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (error); } + INP_HASH_WLOCK(&V_udbinfo); error = in_pcbconnect(inp, nam, td->td_ucred); + INP_HASH_WUNLOCK(&V_udbinfo); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (error); } @@ -1605,21 +1569,19 @@ udp_disconnect(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); - INP_INFO_WLOCK(&V_udbinfo); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (ENOTCONN); } - + INP_HASH_WLOCK(&V_udbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; + INP_HASH_WUNLOCK(&V_udbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); return (0); } @@ -1633,6 +1595,7 @@ udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, KASSERT(inp != NULL, ("udp_send: inp == NULL")); return (udp_output(inp, m, addr, control, td)); } +#endif /* INET */ int udp_shutdown(struct socket *so) @@ -1647,6 +1610,7 @@ udp_shutdown(struct socket *so) return (0); } +#ifdef INET struct pr_usrreqs udp_usrreqs = { .pru_abort = udp_abort, .pru_attach = udp_attach, @@ -1664,3 +1628,4 @@ struct pr_usrreqs udp_usrreqs = { .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp_close, }; +#endif /* INET */ diff --git a/freebsd/sys/netinet/udp_var.h b/freebsd/sys/netinet/udp_var.h index 5cf7dc9f..6b9b5362 100644 --- a/freebsd/sys/netinet/udp_var.h +++ b/freebsd/sys/netinet/udp_var.h @@ -152,7 +152,7 @@ int udp_newudpcb(struct inpcb *); void udp_discardcb(struct udpcb *); void udp_ctlinput(int, struct sockaddr *, void *); -int udp_ctloutput(struct socket *, struct sockopt *); +int udp_ctloutput(struct socket *, struct sockopt *); void udp_init(void); #ifdef VIMAGE void udp_destroy(void); |