author     Sebastian Huber <sebastian.huber@embedded-brains.de>  2013-11-06 16:20:21 +0100
committer  Sebastian Huber <sebastian.huber@embedded-brains.de>  2013-11-11 10:08:08 +0100
commit     66659ff1ad6831b0ea7425fa6ecd8a8687523658 (patch)
tree       48e22b475fa8854128e0861a33fed6f78c8094b5 /freebsd/sys/netinet
parent     Define __GLOBL1() and __GLOBL() (diff)
download   rtems-libbsd-66659ff1ad6831b0ea7425fa6ecd8a8687523658.tar.bz2
Update to FreeBSD 9.2
Diffstat (limited to 'freebsd/sys/netinet')
-rw-r--r--  freebsd/sys/netinet/accf_http.c | 2
-rw-r--r--  freebsd/sys/netinet/icmp6.h | 3
-rw-r--r--  freebsd/sys/netinet/if_ether.c | 196
-rw-r--r--  freebsd/sys/netinet/igmp.c | 7
-rw-r--r--  freebsd/sys/netinet/in.c | 201
-rw-r--r--  freebsd/sys/netinet/in.h | 29
-rw-r--r--  freebsd/sys/netinet/in_gif.c | 2
-rw-r--r--  freebsd/sys/netinet/in_mcast.c | 60
-rw-r--r--  freebsd/sys/netinet/in_pcb.c | 697
-rw-r--r--  freebsd/sys/netinet/in_pcb.h | 195
-rw-r--r--  freebsd/sys/netinet/in_proto.c | 34
-rw-r--r--  freebsd/sys/netinet/in_var.h | 15
-rw-r--r--  freebsd/sys/netinet/ip.h | 8
-rw-r--r--  freebsd/sys/netinet/ip6.h | 10
-rw-r--r--  freebsd/sys/netinet/ip_carp.c | 59
-rw-r--r--  freebsd/sys/netinet/ip_divert.c | 206
-rw-r--r--  freebsd/sys/netinet/ip_dummynet.h | 54
-rw-r--r--  freebsd/sys/netinet/ip_fastfwd.c | 25
-rw-r--r--  freebsd/sys/netinet/ip_fw.h | 16
-rw-r--r--  freebsd/sys/netinet/ip_gre.c | 17
-rw-r--r--  freebsd/sys/netinet/ip_gre.h | 7
-rw-r--r--  freebsd/sys/netinet/ip_icmp.c | 31
-rw-r--r--  freebsd/sys/netinet/ip_input.c | 52
-rw-r--r--  freebsd/sys/netinet/ip_ipsec.c | 17
-rw-r--r--  freebsd/sys/netinet/ip_ipsec.h | 3
-rw-r--r--  freebsd/sys/netinet/ip_mroute.c | 13
-rw-r--r--  freebsd/sys/netinet/ip_mroute.h | 1
-rw-r--r--  freebsd/sys/netinet/ip_options.c | 2
-rw-r--r--  freebsd/sys/netinet/ip_output.c | 146
-rw-r--r--  freebsd/sys/netinet/ip_var.h | 2
-rw-r--r--  freebsd/sys/netinet/libalias/alias.h | 22
-rw-r--r--  freebsd/sys/netinet/libalias/alias_db.c | 13
-rw-r--r--  freebsd/sys/netinet/libalias/alias_sctp.c | 6
-rw-r--r--  freebsd/sys/netinet/libalias/alias_sctp.h | 9
-rw-r--r--  freebsd/sys/netinet/raw_ip.c | 83
-rw-r--r--  freebsd/sys/netinet/sctp_constants.h | 8
-rw-r--r--  freebsd/sys/netinet/sctp_indata.c | 2
-rw-r--r--  freebsd/sys/netinet/sctp_indata.h | 4
-rw-r--r--  freebsd/sys/netinet/sctp_input.c | 80
-rw-r--r--  freebsd/sys/netinet/sctp_output.c | 33
-rw-r--r--  freebsd/sys/netinet/sctp_pcb.c | 155
-rw-r--r--  freebsd/sys/netinet/sctp_structs.h | 2
-rw-r--r--  freebsd/sys/netinet/sctp_sysctl.c | 6
-rw-r--r--  freebsd/sys/netinet/sctp_sysctl.h | 7
-rw-r--r--  freebsd/sys/netinet/sctp_uio.h | 56
-rw-r--r--  freebsd/sys/netinet/sctp_usrreq.c | 58
-rw-r--r--  freebsd/sys/netinet/sctp_var.h | 52
-rw-r--r--  freebsd/sys/netinet/sctputil.c | 26
-rw-r--r--  freebsd/sys/netinet/tcp.h | 57
-rw-r--r--  freebsd/sys/netinet/tcp_hostcache.c | 2
-rw-r--r--  freebsd/sys/netinet/tcp_input.c | 609
-rw-r--r--  freebsd/sys/netinet/tcp_lro.c | 814
-rw-r--r--  freebsd/sys/netinet/tcp_lro.h | 60
-rw-r--r--  freebsd/sys/netinet/tcp_offload.c | 209
-rw-r--r--  freebsd/sys/netinet/tcp_offload.h | 364
-rw-r--r--  freebsd/sys/netinet/tcp_output.c | 253
-rw-r--r--  freebsd/sys/netinet/tcp_reass.c | 36
-rw-r--r--  freebsd/sys/netinet/tcp_sack.c | 2
-rw-r--r--  freebsd/sys/netinet/tcp_subr.c | 488
-rw-r--r--  freebsd/sys/netinet/tcp_syncache.c | 253
-rw-r--r--  freebsd/sys/netinet/tcp_syncache.h | 21
-rw-r--r--  freebsd/sys/netinet/tcp_timer.c | 114
-rw-r--r--  freebsd/sys/netinet/tcp_timer.h | 17
-rw-r--r--  freebsd/sys/netinet/tcp_timewait.c | 72
-rw-r--r--  freebsd/sys/netinet/tcp_usrreq.c | 331
-rw-r--r--  freebsd/sys/netinet/tcp_var.h | 74
-rw-r--r--  freebsd/sys/netinet/toecore.h | 130
-rw-r--r--  freebsd/sys/netinet/udp.h | 2
-rw-r--r--  freebsd/sys/netinet/udp_usrreq.c | 347
-rw-r--r--  freebsd/sys/netinet/udp_var.h | 2
70 files changed, 4015 insertions, 2974 deletions
diff --git a/freebsd/sys/netinet/accf_http.c b/freebsd/sys/netinet/accf_http.c
index 97344a2c..3af867b0 100644
--- a/freebsd/sys/netinet/accf_http.c
+++ b/freebsd/sys/netinet/accf_http.c
@@ -71,7 +71,7 @@ DECLARE_MODULE(accf_http, accf_http_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
static int parse_http_version = 1;
-SYSCTL_NODE(_net_inet_accf, OID_AUTO, http, CTLFLAG_RW, 0,
+static SYSCTL_NODE(_net_inet_accf, OID_AUTO, http, CTLFLAG_RW, 0,
"HTTP accept filter");
SYSCTL_INT(_net_inet_accf_http, OID_AUTO, parsehttpversion, CTLFLAG_RW,
&parse_http_version, 1,
diff --git a/freebsd/sys/netinet/icmp6.h b/freebsd/sys/netinet/icmp6.h
index a6e68864..5483721d 100644
--- a/freebsd/sys/netinet/icmp6.h
+++ b/freebsd/sys/netinet/icmp6.h
@@ -659,7 +659,8 @@ void kmod_icmp6stat_inc(int statnum);
#define ICMPV6CTL_MLD_SOMAXSRC 22
#define ICMPV6CTL_MLD_VERSION 23
#define ICMPV6CTL_ND6_MAXQLEN 24
-#define ICMPV6CTL_MAXID 25
+#define ICMPV6CTL_NODEINFO_OLDMCPREFIX 25
+#define ICMPV6CTL_MAXID 26
#define RTF_PROBEMTU RTF_PROTO1
diff --git a/freebsd/sys/netinet/if_ether.c b/freebsd/sys/netinet/if_ether.c
index 6b98161f..98ed0b36 100644
--- a/freebsd/sys/netinet/if_ether.c
+++ b/freebsd/sys/netinet/if_ether.c
@@ -79,8 +79,8 @@ __FBSDID("$FreeBSD$");
#define SDL(s) ((struct sockaddr_dl *)s)
SYSCTL_DECL(_net_link_ether);
-SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, "");
-SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, "");
+static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, "");
+static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, "");
/* timer values */
static VNET_DEFINE(int, arpt_keep) = (20*60); /* once resolved, good for 20
@@ -89,8 +89,8 @@ static VNET_DEFINE(int, arp_maxtries) = 5;
VNET_DEFINE(int, useloopback) = 1; /* use loopback interface for
* local traffic */
static VNET_DEFINE(int, arp_proxyall) = 0;
-static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for
- * 20 seconds */
+static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for
+ * 20 seconds */
VNET_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */
static VNET_DEFINE(int, arp_maxhold) = 1;
@@ -121,7 +121,7 @@ SYSCTL_VNET_STRUCT(_net_link_ether_arp, OID_AUTO, stats, CTLFLAG_RW,
&VNET_NAME(arpstat), arpstat,
"ARP statistics (struct arpstat, net/if_arp.h)");
SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_RW,
- &VNET_NAME(arp_maxhold), 0,
+ &VNET_NAME(arp_maxhold), 0,
"Number of packets to hold per ARP entry");
static void arp_init(void);
@@ -169,38 +169,48 @@ arp_ifscrub(struct ifnet *ifp, uint32_t addr)
static void
arptimer(void *arg)
{
+ struct llentry *lle = (struct llentry *)arg;
struct ifnet *ifp;
- struct llentry *lle;
- int pkts_dropped;
- KASSERT(arg != NULL, ("%s: arg NULL", __func__));
- lle = (struct llentry *)arg;
+ if (lle->la_flags & LLE_STATIC) {
+ LLE_WUNLOCK(lle);
+ return;
+ }
+
ifp = lle->lle_tbl->llt_ifp;
CURVNET_SET(ifp->if_vnet);
+
+ if ((lle->la_flags & LLE_DELETED) == 0) {
+ int evt;
+
+ if (lle->la_flags & LLE_VALID)
+ evt = LLENTRY_EXPIRED;
+ else
+ evt = LLENTRY_TIMEDOUT;
+ EVENTHANDLER_INVOKE(lle_event, lle, evt);
+ }
+
+ callout_stop(&lle->la_timer);
+
+ /* XXX: LOR avoidance. We still have ref on lle. */
+ LLE_WUNLOCK(lle);
IF_AFDATA_LOCK(ifp);
LLE_WLOCK(lle);
- if (lle->la_flags & LLE_STATIC)
- LLE_WUNLOCK(lle);
- else {
- if (!callout_pending(&lle->la_timer) &&
- callout_active(&lle->la_timer)) {
- callout_stop(&lle->la_timer);
- LLE_REMREF(lle);
- pkts_dropped = llentry_free(lle);
- ARPSTAT_ADD(dropped, pkts_dropped);
- ARPSTAT_INC(timeouts);
- } else {
-#ifdef DIAGNOSTIC
- struct sockaddr *l3addr = L3_ADDR(lle);
- log(LOG_INFO,
- "arptimer issue: %p, IPv4 address: \"%s\"\n", lle,
- inet_ntoa(
- ((const struct sockaddr_in *)l3addr)->sin_addr));
-#endif
- LLE_WUNLOCK(lle);
- }
- }
+
+ /* Guard against race with other llentry_free(). */
+ if (lle->la_flags & LLE_LINKED) {
+ size_t pkts_dropped;
+
+ LLE_REMREF(lle);
+ pkts_dropped = llentry_free(lle);
+ ARPSTAT_ADD(dropped, pkts_dropped);
+ } else
+ LLE_FREE_LOCKED(lle);
+
IF_AFDATA_UNLOCK(ifp);
+
+ ARPSTAT_INC(timeouts);
+
CURVNET_RESTORE();
}
@@ -235,7 +245,7 @@ arprequest(struct ifnet *ifp, struct in_addr *sip, struct in_addr *tip,
SIN(ifa->ifa_netmask)->sin_addr.s_addr) )
break; /* found it. */
}
- if (sip == NULL) {
+ if (sip == NULL) {
printf("%s: cannot find matching address\n", __func__);
return;
}
@@ -304,18 +314,16 @@ arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m,
return (0);
}
}
- /* XXXXX
- */
retry:
- IF_AFDATA_RLOCK(ifp);
+ IF_AFDATA_RLOCK(ifp);
la = lla_lookup(LLTABLE(ifp), flags, dst);
- IF_AFDATA_RUNLOCK(ifp);
+ IF_AFDATA_RUNLOCK(ifp);
if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0)
- && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) {
+ && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) {
flags |= (LLE_CREATE | LLE_EXCLUSIVE);
- IF_AFDATA_WLOCK(ifp);
+ IF_AFDATA_WLOCK(ifp);
la = lla_lookup(LLTABLE(ifp), flags, dst);
- IF_AFDATA_WUNLOCK(ifp);
+ IF_AFDATA_WUNLOCK(ifp);
}
if (la == NULL) {
if (flags & LLE_CREATE)
@@ -324,10 +332,10 @@ retry:
inet_ntoa(SIN(dst)->sin_addr));
m_freem(m);
return (EINVAL);
- }
+ }
if ((la->la_flags & LLE_VALID) &&
- ((la->la_flags & LLE_STATIC) || la->la_expire > time_second)) {
+ ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
bcopy(&la->ll_addr, desten, ifp->if_addrlen);
/*
* If entry has an expiry time and it is approaching,
@@ -335,18 +343,18 @@ retry:
* arpt_down interval.
*/
if (!(la->la_flags & LLE_STATIC) &&
- time_second + la->la_preempt > la->la_expire) {
+ time_uptime + la->la_preempt > la->la_expire) {
arprequest(ifp, NULL,
&SIN(dst)->sin_addr, IF_LLADDR(ifp));
la->la_preempt--;
}
-
+
*lle = la;
error = 0;
goto done;
- }
-
+ }
+
if (la->la_flags & LLE_STATIC) { /* should not happen! */
log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n",
inet_ntoa(SIN(dst)->sin_addr));
@@ -355,7 +363,7 @@ retry:
goto done;
}
- renew = (la->la_asked == 0 || la->la_expire != time_second);
+ renew = (la->la_asked == 0 || la->la_expire != time_uptime);
if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) {
flags |= LLE_EXCLUSIVE;
LLE_RUNLOCK(la);
@@ -376,20 +384,20 @@ retry:
la->la_numheld--;
ARPSTAT_INC(dropped);
}
- }
+ }
if (la->la_hold != NULL) {
curr = la->la_hold;
while (curr->m_nextpkt != NULL)
curr = curr->m_nextpkt;
curr->m_nextpkt = m;
- } else
+ } else
la->la_hold = m;
la->la_numheld++;
if (renew == 0 && (flags & LLE_EXCLUSIVE)) {
flags &= ~LLE_EXCLUSIVE;
LLE_DOWNGRADE(la);
}
-
+
}
/*
* Return EWOULDBLOCK if we have tried less than arp_maxtries. It
@@ -407,7 +415,7 @@ retry:
int canceled;
LLE_ADDREF(la);
- la->la_expire = time_second;
+ la->la_expire = time_uptime;
canceled = callout_reset(&la->la_timer, hz * V_arpt_down,
arptimer, la);
if (canceled)
@@ -437,7 +445,7 @@ arpintr(struct mbuf *m)
if (m->m_len < sizeof(struct arphdr) &&
((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
- log(LOG_ERR, "arp: runt packet -- m_pullup failed\n");
+ log(LOG_NOTICE, "arp: runt packet -- m_pullup failed\n");
return;
}
ar = mtod(m, struct arphdr *);
@@ -445,16 +453,19 @@ arpintr(struct mbuf *m)
if (ntohs(ar->ar_hrd) != ARPHRD_ETHER &&
ntohs(ar->ar_hrd) != ARPHRD_IEEE802 &&
ntohs(ar->ar_hrd) != ARPHRD_ARCNET &&
- ntohs(ar->ar_hrd) != ARPHRD_IEEE1394) {
- log(LOG_ERR, "arp: unknown hardware address format (0x%2D)\n",
- (unsigned char *)&ar->ar_hrd, "");
+ ntohs(ar->ar_hrd) != ARPHRD_IEEE1394 &&
+ ntohs(ar->ar_hrd) != ARPHRD_INFINIBAND) {
+ log(LOG_NOTICE, "arp: unknown hardware address format (0x%2D)"
+ " (from %*D to %*D)\n", (unsigned char *)&ar->ar_hrd, "",
+ ETHER_ADDR_LEN, (u_char *)ar_sha(ar), ":",
+ ETHER_ADDR_LEN, (u_char *)ar_tha(ar), ":");
m_freem(m);
return;
}
if (m->m_len < arphdr_len(ar)) {
if ((m = m_pullup(m, arphdr_len(ar))) == NULL) {
- log(LOG_ERR, "arp: runt packet\n");
+ log(LOG_NOTICE, "arp: runt packet\n");
m_freem(m);
return;
}
@@ -490,17 +501,19 @@ arpintr(struct mbuf *m)
static int log_arp_wrong_iface = 1;
static int log_arp_movements = 1;
static int log_arp_permanent_modify = 1;
+static int allow_multicast = 0;
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
&log_arp_wrong_iface, 0,
"log arp packets arriving on the wrong interface");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
- &log_arp_movements, 0,
- "log arp replies from MACs different than the one in the cache");
+ &log_arp_movements, 0,
+ "log arp replies from MACs different than the one in the cache");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
- &log_arp_permanent_modify, 0,
- "log arp replies from MACs different than the one in the permanent arp entry");
-
+ &log_arp_permanent_modify, 0,
+ "log arp replies from MACs different than the one in the permanent arp entry");
+SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW,
+ &allow_multicast, 0, "accept multicast addresses");
static void
in_arpinput(struct mbuf *m)
@@ -530,11 +543,27 @@ in_arpinput(struct mbuf *m)
req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr));
if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) {
- log(LOG_ERR, "in_arp: runt packet -- m_pullup failed\n");
+ log(LOG_NOTICE, "in_arp: runt packet -- m_pullup failed\n");
return;
}
ah = mtod(m, struct arphdr *);
+ /*
+ * ARP is only for IPv4 so we can reject packets with
+ * a protocol length not equal to an IPv4 address.
+ */
+ if (ah->ar_pln != sizeof(struct in_addr)) {
+ log(LOG_NOTICE, "in_arp: requested protocol length != %zu\n",
+ sizeof(struct in_addr));
+ goto drop;
+ }
+
+ if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) {
+ log(LOG_NOTICE, "arp: %*D is multicast\n",
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":");
+ goto drop;
+ }
+
op = ntohs(ah->ar_op);
(void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
(void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
@@ -553,7 +582,7 @@ in_arpinput(struct mbuf *m)
*/
IN_IFADDR_RLOCK();
LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
- if (((bridged && ia->ia_ifp->if_bridge != NULL) ||
+ if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
ia->ia_ifp == ifp) &&
itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
ifa_ref(&ia->ia_ifa);
@@ -570,7 +599,7 @@ in_arpinput(struct mbuf *m)
}
}
LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
- if (((bridged && ia->ia_ifp->if_bridge != NULL) ||
+ if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
ia->ia_ifp == ifp) &&
isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
ifa_ref(&ia->ia_ifa);
@@ -633,7 +662,7 @@ match:
if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
goto drop; /* it's from me, ignore it. */
if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
- log(LOG_ERR,
+ log(LOG_NOTICE,
"arp: link address is broadcast for IP address %s!\n",
inet_ntoa(isaddr));
goto drop;
@@ -662,14 +691,14 @@ match:
sin.sin_addr = isaddr;
flags = (itaddr.s_addr == myaddr.s_addr) ? LLE_CREATE : 0;
flags |= LLE_EXCLUSIVE;
- IF_AFDATA_LOCK(ifp);
+ IF_AFDATA_LOCK(ifp);
la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin);
IF_AFDATA_UNLOCK(ifp);
if (la != NULL) {
/* the following is not an error when doing bridging */
if (!bridged && la->lle_tbl->llt_ifp != ifp && !carp_match) {
if (log_arp_wrong_iface)
- log(LOG_ERR, "arp: %s is on %s "
+ log(LOG_WARNING, "arp: %s is on %s "
"but got reply from %*D on %s\n",
inet_ntoa(isaddr),
la->lle_tbl->llt_ifp->if_xname,
@@ -692,7 +721,7 @@ match:
goto reply;
}
if (log_arp_movements) {
- log(LOG_INFO, "arp: %s moved from %*D "
+ log(LOG_INFO, "arp: %s moved from %*D "
"to %*D on %s\n",
inet_ntoa(isaddr),
ifp->if_addrlen,
@@ -701,23 +730,25 @@ match:
ifp->if_xname);
}
}
-
+
if (ifp->if_addrlen != ah->ar_hln) {
LLE_WUNLOCK(la);
- log(LOG_WARNING,
- "arp from %*D: addr len: new %d, i/f %d (ignored)",
- ifp->if_addrlen, (u_char *) ar_sha(ah), ":",
- ah->ar_hln, ifp->if_addrlen);
- goto reply;
+ log(LOG_WARNING, "arp from %*D: addr len: new %d, "
+ "i/f %d (ignored)\n", ifp->if_addrlen,
+ (u_char *) ar_sha(ah), ":", ah->ar_hln,
+ ifp->if_addrlen);
+ goto drop;
}
(void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
la->la_flags |= LLE_VALID;
+ EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
+
if (!(la->la_flags & LLE_STATIC)) {
int canceled;
LLE_ADDREF(la);
- la->la_expire = time_second + V_arpt_keep;
+ la->la_expire = time_uptime + V_arpt_keep;
canceled = callout_reset(&la->la_timer,
hz * V_arpt_keep, arptimer, la);
if (canceled)
@@ -725,7 +756,7 @@ match:
}
la->la_asked = 0;
la->la_preempt = V_arp_maxtries;
- /*
+ /*
* The packets are all freed within the call to the output
* routine.
*
@@ -747,7 +778,7 @@ match:
}
} else
LLE_WUNLOCK(la);
- } /* end of FIB loop */
+ }
reply:
if (op != ARPOP_REQUEST)
goto drop;
@@ -761,7 +792,7 @@ reply:
struct llentry *lle = NULL;
sin.sin_addr = itaddr;
- IF_AFDATA_LOCK(ifp);
+ IF_AFDATA_LOCK(ifp);
lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
IF_AFDATA_UNLOCK(ifp);
@@ -776,7 +807,7 @@ reply:
if (!V_arp_proxyall)
goto drop;
-
+
sin.sin_addr = itaddr;
/* XXX MRT use table 0 for arp reply */
rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
@@ -804,7 +835,7 @@ reply:
* wrong network.
*/
sin.sin_addr = isaddr;
-
+
/* XXX MRT use table 0 for arp checks */
rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
if (!rt)
@@ -820,8 +851,7 @@ reply:
RTFREE_LOCKED(rt);
#ifdef DEBUG_PROXY
- printf("arp: proxying for %s\n",
- inet_ntoa(itaddr));
+ printf("arp: proxying for %s\n", inet_ntoa(itaddr));
#endif
}
}
@@ -843,8 +873,8 @@ reply:
(void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
ah->ar_op = htons(ARPOP_REPLY);
ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
- m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
- m->m_pkthdr.len = m->m_len;
+ m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
+ m->m_pkthdr.len = m->m_len;
m->m_pkthdr.rcvif = NULL;
sa.sa_family = AF_ARP;
sa.sa_len = 2;
@@ -865,7 +895,7 @@ arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) {
arprequest(ifp, &IA_SIN(ifa)->sin_addr,
&IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp));
- /*
+ /*
* interface address is considered static entry
* because the output of the arp utility shows
* that L2 entry as permanent
diff --git a/freebsd/sys/netinet/igmp.c b/freebsd/sys/netinet/igmp.c
index f2949c14..3056fa3a 100644
--- a/freebsd/sys/netinet/igmp.c
+++ b/freebsd/sys/netinet/igmp.c
@@ -187,7 +187,7 @@ static const struct netisr_handler igmp_nh = {
struct mtx igmp_mtx;
struct mbuf *m_raopt; /* Router Alert option */
-MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");
+static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");
/*
* VIMAGE-wide globals.
@@ -282,8 +282,9 @@ SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
/*
* Non-virtualized sysctls.
*/
-SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE,
- sysctl_igmp_ifinfo, "Per-interface IGMPv3 state");
+static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo,
+ "Per-interface IGMPv3 state");
static __inline void
igmp_save_context(struct mbuf *m, struct ifnet *ifp)
diff --git a/freebsd/sys/netinet/in.c b/freebsd/sys/netinet/in.c
index 7bf52c6b..0c3f72bc 100644
--- a/freebsd/sys/netinet/in.c
+++ b/freebsd/sys/netinet/in.c
@@ -78,11 +78,6 @@ static int in_ifinit(struct ifnet *,
struct in_ifaddr *, struct sockaddr_in *, int);
static void in_purgemaddrs(struct ifnet *);
-static VNET_DEFINE(int, subnetsarelocal);
-#define V_subnetsarelocal VNET(subnetsarelocal)
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW,
- &VNET_NAME(subnetsarelocal), 0,
- "Treat all subnets as directly connected");
static VNET_DEFINE(int, sameprefixcarponly);
#define V_sameprefixcarponly VNET(sameprefixcarponly)
SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW,
@@ -97,9 +92,7 @@ VNET_DECLARE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */
/*
* Return 1 if an internet address is for a ``local'' host
- * (one to which we have a connection). If subnetsarelocal
- * is true, this includes other subnets of the local net.
- * Otherwise, it includes only the directly-connected (sub)nets.
+ * (one to which we have a connection).
*/
int
in_localaddr(struct in_addr in)
@@ -108,19 +101,10 @@ in_localaddr(struct in_addr in)
register struct in_ifaddr *ia;
IN_IFADDR_RLOCK();
- if (V_subnetsarelocal) {
- TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
- if ((i & ia->ia_netmask) == ia->ia_net) {
- IN_IFADDR_RUNLOCK();
- return (1);
- }
- }
- } else {
- TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
- if ((i & ia->ia_subnetmask) == ia->ia_subnet) {
- IN_IFADDR_RUNLOCK();
- return (1);
- }
+ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ if ((i & ia->ia_subnetmask) == ia->ia_subnet) {
+ IN_IFADDR_RUNLOCK();
+ return (1);
}
}
IN_IFADDR_RUNLOCK();
@@ -541,20 +525,20 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
hostIsNew = 0;
}
if (ifra->ifra_mask.sin_len) {
- /*
+ /*
* QL: XXX
* Need to scrub the prefix here in case
* the issued command is SIOCAIFADDR with
* the same address, but with a different
* prefix length. And if the prefix length
- * is the same as before, then the call is
+ * is the same as before, then the call is
* un-necessarily executed here.
*/
in_ifscrub(ifp, ia, LLE_STATIC);
ia->ia_sockmask = ifra->ifra_mask;
ia->ia_sockmask.sin_family = AF_INET;
ia->ia_subnetmask =
- ntohl(ia->ia_sockmask.sin_addr.s_addr);
+ ntohl(ia->ia_sockmask.sin_addr.s_addr);
maskIsNew = 1;
}
if ((ifp->if_flags & IFF_POINTOPOINT) &&
@@ -567,7 +551,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
(hostIsNew || maskIsNew))
error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0);
if (error != 0 && iaIsNew)
- goto out;
+ break;
if ((ifp->if_flags & IFF_BROADCAST) &&
(ifra->ifra_broadaddr.sin_family == AF_INET))
@@ -898,23 +882,19 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin,
in_ifscrub(ifp, ia, LLE_STATIC);
ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
}
- if (IN_CLASSA(i))
- ia->ia_netmask = IN_CLASSA_NET;
- else if (IN_CLASSB(i))
- ia->ia_netmask = IN_CLASSB_NET;
- else
- ia->ia_netmask = IN_CLASSC_NET;
/*
- * The subnet mask usually includes at least the standard network part,
- * but may may be smaller in the case of supernetting.
- * If it is set, we believe it.
+ * Be compatible with network classes, if netmask isn't supplied,
+ * guess it based on classes.
*/
if (ia->ia_subnetmask == 0) {
- ia->ia_subnetmask = ia->ia_netmask;
+ if (IN_CLASSA(i))
+ ia->ia_subnetmask = IN_CLASSA_NET;
+ else if (IN_CLASSB(i))
+ ia->ia_subnetmask = IN_CLASSB_NET;
+ else
+ ia->ia_subnetmask = IN_CLASSC_NET;
ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask);
- } else
- ia->ia_netmask &= ia->ia_subnetmask;
- ia->ia_net = i & ia->ia_netmask;
+ }
ia->ia_subnet = i & ia->ia_subnetmask;
in_socktrim(&ia->ia_sockmask);
/*
@@ -927,10 +907,11 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin,
*/
ia->ia_ifa.ifa_metric = ifp->if_metric;
if (ifp->if_flags & IFF_BROADCAST) {
- ia->ia_broadaddr.sin_addr.s_addr =
- htonl(ia->ia_subnet | ~ia->ia_subnetmask);
- ia->ia_netbroadcast.s_addr =
- htonl(ia->ia_net | ~ ia->ia_netmask);
+ if (ia->ia_subnetmask == IN_RFC3021_MASK)
+ ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST;
+ else
+ ia->ia_broadaddr.sin_addr.s_addr =
+ htonl(ia->ia_subnet | ~ia->ia_subnetmask);
} else if (ifp->if_flags & IFF_LOOPBACK) {
ia->ia_dstaddr = ia->ia_addr;
flags |= RTF_HOST;
@@ -966,8 +947,8 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin,
RT_ADDREF(ia_ro.ro_rt);
RTFREE_LOCKED(ia_ro.ro_rt);
} else
- error = ifa_add_loopback_route((struct ifaddr *)ia,
- (struct sockaddr *)&ia->ia_addr);
+ error = ifa_add_loopback_route((struct ifaddr *)ia,
+ (struct sockaddr *)&ia->ia_addr);
if (error == 0)
ia->ia_flags |= IFA_RTSELF;
if (ia_ro.ro_rt != NULL)
@@ -982,10 +963,10 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin,
? RTF_HOST : 0)
/*
- * Generate a routing message when inserting or deleting
+ * Generate a routing message when inserting or deleting
* an interface address alias.
*/
-static void in_addralias_rtmsg(int cmd, struct in_addr *prefix,
+static void in_addralias_rtmsg(int cmd, struct in_addr *prefix,
struct in_ifaddr *target)
{
struct route pfx_ro;
@@ -1008,16 +989,13 @@ static void in_addralias_rtmsg(int cmd, struct in_addr *prefix,
/* QL: XXX
* Point the gateway to the new interface
- * address as if a new prefix route entry has
- * been added through the new address alias.
- * All other parts of the rtentry is accurate,
+ * address as if a new prefix route entry has
+ * been added through the new address alias.
+ * All other parts of the rtentry is accurate,
* e.g., rt_key, rt_mask, rt_ifp etc.
*/
- msg_rt.rt_gateway =
- (struct sockaddr *)&target->ia_addr;
- rt_newaddrmsg(cmd,
- (struct ifaddr *)target,
- 0, &msg_rt);
+ msg_rt.rt_gateway = (struct sockaddr *)&target->ia_addr;
+ rt_newaddrmsg(cmd, (struct ifaddr *)target, 0, &msg_rt);
RTFREE(pfx_ro.ro_rt);
}
return;
@@ -1065,7 +1043,7 @@ in_addprefix(struct in_ifaddr *target, int flags)
*/
if (ia->ia_flags & IFA_ROUTE) {
#ifdef RADIX_MPATH
- if (ia->ia_addr.sin_addr.s_addr ==
+ if (ia->ia_addr.sin_addr.s_addr ==
target->ia_addr.sin_addr.s_addr) {
IN_IFADDR_RUNLOCK();
return (EEXIST);
@@ -1142,7 +1120,7 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags)
}
if (freeit && (flags & LLE_STATIC)) {
error = ifa_del_loopback_route((struct ifaddr *)target,
- (struct sockaddr *)&target->ia_addr);
+ (struct sockaddr *)&target->ia_addr);
if (error == 0)
target->ia_flags &= ~IFA_RTSELF;
}
@@ -1222,8 +1200,8 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags)
mask0.sin_len = sizeof(mask0);
mask0.sin_family = AF_INET;
mask0.sin_addr.s_addr = target->ia_subnetmask;
- lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0,
- (struct sockaddr *)&mask0, flags);
+ lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0,
+ (struct sockaddr *)&mask0, flags);
/*
* As no-one seem to have this prefix, we can remove the route.
@@ -1261,17 +1239,18 @@ in_broadcast(struct in_addr in, struct ifnet *ifp)
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (ifa->ifa_addr->sa_family == AF_INET &&
(in.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
- in.s_addr == ia->ia_netbroadcast.s_addr ||
/*
- * Check for old-style (host 0) broadcast.
+ * Check for old-style (host 0) broadcast, but
+ * taking into account that RFC 3021 obsoletes it.
*/
- t == ia->ia_subnet || t == ia->ia_net) &&
+ (ia->ia_subnetmask != IN_RFC3021_MASK &&
+ t == ia->ia_subnet)) &&
/*
* Check for an all one subnetmask. These
* only exist when an interface gets a secondary
* address.
*/
- ia->ia_subnetmask != (u_long)0xffffffff)
+ ia->ia_subnetmask != (u_long)0xffffffff)
return (1);
return (0);
#undef ia
@@ -1343,6 +1322,20 @@ struct in_llentry {
struct sockaddr_in l3_addr4;
};
+/*
+ * Deletes an address from the address table.
+ * This function is called by the timer functions
+ * such as arptimer() and nd6_llinfo_timer(), and
+ * the caller does the locking.
+ */
+static void
+in_lltable_free(struct lltable *llt, struct llentry *lle)
+{
+ LLE_WUNLOCK(lle);
+ LLE_LOCK_DESTROY(lle);
+ free(lle, M_LLTABLE);
+}
+
static struct llentry *
in_lltable_new(const struct sockaddr *l3addr, u_int flags)
{
@@ -1352,69 +1345,53 @@ in_lltable_new(const struct sockaddr *l3addr, u_int flags)
if (lle == NULL) /* NB: caller generates msg */
return NULL;
- callout_init(&lle->base.la_timer, CALLOUT_MPSAFE);
/*
* For IPv4 this will trigger "arpresolve" to generate
* an ARP request.
*/
- lle->base.la_expire = time_second; /* mark expired */
+ lle->base.la_expire = time_uptime; /* mark expired */
lle->l3_addr4 = *(const struct sockaddr_in *)l3addr;
lle->base.lle_refcnt = 1;
+ lle->base.lle_free = in_lltable_free;
LLE_LOCK_INIT(&lle->base);
- return &lle->base;
-}
+ callout_init_rw(&lle->base.la_timer, &lle->base.lle_lock,
+ CALLOUT_RETURNUNLOCKED);
-/*
- * Deletes an address from the address table.
- * This function is called by the timer functions
- * such as arptimer() and nd6_llinfo_timer(), and
- * the caller does the locking.
- */
-static void
-in_lltable_free(struct lltable *llt, struct llentry *lle)
-{
- LLE_WUNLOCK(lle);
- LLE_LOCK_DESTROY(lle);
- free(lle, M_LLTABLE);
+ return (&lle->base);
}
-
#define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \
(((ntohl((d)->sin_addr.s_addr) ^ (a)->sin_addr.s_addr) & (m)->sin_addr.s_addr)) == 0 )
static void
-in_lltable_prefix_free(struct lltable *llt,
- const struct sockaddr *prefix,
- const struct sockaddr *mask,
- u_int flags)
+in_lltable_prefix_free(struct lltable *llt, const struct sockaddr *prefix,
+ const struct sockaddr *mask, u_int flags)
{
const struct sockaddr_in *pfx = (const struct sockaddr_in *)prefix;
const struct sockaddr_in *msk = (const struct sockaddr_in *)mask;
struct llentry *lle, *next;
- register int i;
+ int i;
size_t pkts_dropped;
- for (i=0; i < LLTBL_HASHTBL_SIZE; i++) {
+ IF_AFDATA_WLOCK(llt->llt_ifp);
+ for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) {
LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) {
-
- /*
+ /*
* (flags & LLE_STATIC) means deleting all entries
- * including static ARP entries
+ * including static ARP entries.
*/
- if (IN_ARE_MASKED_ADDR_EQUAL((struct sockaddr_in *)L3_ADDR(lle),
- pfx, msk) &&
- ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC))) {
- int canceled;
-
- canceled = callout_drain(&lle->la_timer);
+ if (IN_ARE_MASKED_ADDR_EQUAL(satosin(L3_ADDR(lle)),
+ pfx, msk) && ((flags & LLE_STATIC) ||
+ !(lle->la_flags & LLE_STATIC))) {
LLE_WLOCK(lle);
- if (canceled)
+ if (callout_stop(&lle->la_timer))
LLE_REMREF(lle);
pkts_dropped = llentry_free(lle);
ARPSTAT_ADD(dropped, pkts_dropped);
}
}
}
+ IF_AFDATA_WUNLOCK(llt->llt_ifp);
}
@@ -1440,19 +1417,18 @@ in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr
*/
if (rt->rt_flags & RTF_GATEWAY) {
if (!(rt->rt_flags & RTF_HOST) || !rt->rt_ifp ||
- rt->rt_ifp->if_type != IFT_ETHER ||
- (rt->rt_ifp->if_flags &
- (IFF_NOARP | IFF_STATICARP)) != 0 ||
- memcmp(rt->rt_gateway->sa_data, l3addr->sa_data,
- sizeof(in_addr_t)) != 0) {
+ rt->rt_ifp->if_type != IFT_ETHER ||
+ (rt->rt_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 ||
+ memcmp(rt->rt_gateway->sa_data, l3addr->sa_data,
+ sizeof(in_addr_t)) != 0) {
RTFREE_LOCKED(rt);
return (EINVAL);
}
}
/*
- * Make sure that at least the destination address is covered
- * by the route. This is for handling the case where 2 or more
+ * Make sure that at least the destination address is covered
+ * by the route. This is for handling the case where 2 or more
* interfaces have the same prefix. An incoming packet arrives
* on one interface and the corresponding outgoing packet leaves
* another interface.
@@ -1512,7 +1488,7 @@ in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3add
hashkey = sin->sin_addr.s_addr;
lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)];
LIST_FOREACH(lle, lleh, lle_next) {
- struct sockaddr_in *sa2 = (struct sockaddr_in *)L3_ADDR(lle);
+ struct sockaddr_in *sa2 = satosin(L3_ADDR(lle));
if (lle->la_flags & LLE_DELETED)
continue;
if (sa2->sin_addr.s_addr == sin->sin_addr.s_addr)
@@ -1521,7 +1497,7 @@ in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3add
if (lle == NULL) {
#ifdef DIAGNOSTIC
if (flags & LLE_DELETE)
- log(LOG_INFO, "interface address is missing from cache = %p in delete\n", lle);
+ log(LOG_INFO, "interface address is missing from cache = %p in delete\n", lle);
#endif
if (!(flags & LLE_CREATE))
return (NULL);
@@ -1547,18 +1523,24 @@ in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3add
lle->lle_tbl = llt;
lle->lle_head = lleh;
+ lle->la_flags |= LLE_LINKED;
LIST_INSERT_HEAD(lleh, lle, lle_next);
} else if (flags & LLE_DELETE) {
if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) {
LLE_WLOCK(lle);
- lle->la_flags = LLE_DELETED;
- LLE_WUNLOCK(lle);
+ lle->la_flags |= LLE_DELETED;
+ EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED);
#ifdef DIAGNOSTIC
- log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle);
+ log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle);
#endif
+ if ((lle->la_flags &
+ (LLE_STATIC | LLE_IFADDR)) == LLE_STATIC)
+ llentry_free(lle);
+ else
+ LLE_WUNLOCK(lle);
}
lle = (void *)-1;
-
+
}
if (LLE_IS_VALID(lle)) {
if (flags & LLE_EXCLUSIVE)
@@ -1590,7 +1572,7 @@ in_lltable_dump(struct lltable *llt, struct sysctl_req *wr)
for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) {
LIST_FOREACH(lle, &llt->lle_head[i], lle_next) {
struct sockaddr_dl *sdl;
-
+
/* skip deleted entries */
if ((lle->la_flags & LLE_DELETED) == LLE_DELETED)
continue;
@@ -1659,7 +1641,6 @@ in_domifattach(struct ifnet *ifp)
llt = lltable_init(ifp, AF_INET);
if (llt != NULL) {
- llt->llt_free = in_lltable_free;
llt->llt_prefix_free = in_lltable_prefix_free;
llt->llt_lookup = in_lltable_lookup;
llt->llt_dump = in_lltable_dump;
diff --git a/freebsd/sys/netinet/in.h b/freebsd/sys/netinet/in.h
index 16df5f24..92ba45a6 100644
--- a/freebsd/sys/netinet/in.h
+++ b/freebsd/sys/netinet/in.h
@@ -93,27 +93,7 @@ typedef __socklen_t socklen_t;
#define _SOCKLEN_T_DECLARED
#endif
-/* Avoid collision with original definition in sys/socket.h. */
-#ifndef _STRUCT_SOCKADDR_STORAGE_DECLARED
-/*
- * RFC 2553: protocol-independent placeholder for socket addresses
- */
-#define _SS_MAXSIZE 128U
-#define _SS_ALIGNSIZE (sizeof(__int64_t))
-#define _SS_PAD1SIZE (_SS_ALIGNSIZE - sizeof(unsigned char) - \
- sizeof(sa_family_t))
-#define _SS_PAD2SIZE (_SS_MAXSIZE - sizeof(unsigned char) - \
- sizeof(sa_family_t) - _SS_PAD1SIZE - _SS_ALIGNSIZE)
-
-struct sockaddr_storage {
- unsigned char ss_len; /* address length */
- sa_family_t ss_family; /* address family */
- char __ss_pad1[_SS_PAD1SIZE];
- __int64_t __ss_align; /* force desired struct alignment */
- char __ss_pad2[_SS_PAD2SIZE];
-};
-#define _STRUCT_SOCKADDR_STORAGE_DECLARED
-#endif
+#include <sys/_sockaddr_storage.h>
/* Socket address, internet style. */
struct sockaddr_in {
@@ -147,6 +127,7 @@ __END_DECLS
#endif /* !_KERNEL && __BSD_VISIBLE */
#if __POSIX_VISIBLE >= 200112
+#define IPPROTO_IPV6 41 /* IP6 header */
#define IPPROTO_RAW 255 /* raw IP packet */
#define INET_ADDRSTRLEN 16
#endif
@@ -198,7 +179,6 @@ __END_DECLS
#define IPPROTO_CMTP 38 /* Control Message Transport */
#define IPPROTO_TPXX 39 /* TP++ Transport */
#define IPPROTO_IL 40 /* IL transport protocol */
-#define IPPROTO_IPV6 41 /* IP6 header */
#define IPPROTO_SDRP 42 /* Source Demand Routing */
#define IPPROTO_ROUTING 43 /* IP6 routing header */
#define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */
@@ -260,10 +240,12 @@ __END_DECLS
#define IPPROTO_GMTP 100 /* GMTP*/
#define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */
#define IPPROTO_SCTP 132 /* SCTP */
+#define IPPROTO_MH 135 /* IPv6 Mobility Header */
/* 101-254: Partly Unassigned */
#define IPPROTO_PIM 103 /* Protocol Independent Mcast */
#define IPPROTO_CARP 112 /* CARP */
#define IPPROTO_PGM 113 /* PGM */
+#define IPPROTO_MPLS 137 /* MPLS-in-IP */
#define IPPROTO_PFSYNC 240 /* PFSYNC */
/* 255: Reserved */
/* BSD Private, local use, namespace incursion, no longer used */
@@ -275,6 +257,7 @@ __END_DECLS
/* Only used internally, so can be outside the range of valid IP protocols. */
#define IPPROTO_DIVERT 258 /* divert pseudo-protocol */
+#define IPPROTO_SEND 259 /* SeND pseudo-protocol */
/*
* Defined to avoid confusion. The master value is defined by
@@ -414,6 +397,8 @@ __END_DECLS
#define IN_LOOPBACKNET 127 /* official! */
+#define IN_RFC3021_MASK (u_int32_t)0xfffffffe
+
/*
* Options for use with [gs]etsockopt at the IP level.
* First word of comment is data type; bool is stored in int.
diff --git a/freebsd/sys/netinet/in_gif.c b/freebsd/sys/netinet/in_gif.c
index 5461334b..332d7ff4 100644
--- a/freebsd/sys/netinet/in_gif.c
+++ b/freebsd/sys/netinet/in_gif.c
@@ -258,6 +258,8 @@ in_gif_output(struct ifnet *ifp, int family, struct mbuf *m)
#endif
}
+ m_addr_changed(m);
+
error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL);
if (!(GIF2IFP(sc)->if_flags & IFF_LINK0) &&
diff --git a/freebsd/sys/netinet/in_mcast.c b/freebsd/sys/netinet/in_mcast.c
index e4b31968..6d748f1f 100644
--- a/freebsd/sys/netinet/in_mcast.c
+++ b/freebsd/sys/netinet/in_mcast.c
@@ -157,7 +157,8 @@ static int inp_set_multicast_if(struct inpcb *, struct sockopt *);
static int inp_set_source_filters(struct inpcb *, struct sockopt *);
static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS);
-SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv4 multicast");
+static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0,
+ "IPv4 multicast");
static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER;
SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc,
@@ -176,7 +177,7 @@ SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RW | CTLFLAG_TUN,
&in_mcast_loop, 0, "Loopback multicast datagrams by default");
TUNABLE_INT("net.inet.ip.mcast.loop", &in_mcast_loop);
-SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters,
+static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters,
"Per-interface stack-wide source filters");
@@ -1861,6 +1862,7 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt)
ifp = NULL;
imf = NULL;
+ lims = NULL;
error = 0;
is_new = 0;
@@ -1978,34 +1980,47 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt)
error = EINVAL;
goto out_inp_locked;
}
- /* Throw out duplicates. */
+ /*
+ * Throw out duplicates.
+ *
+ * XXX FIXME: This makes a naive assumption that
+ * even if entries exist for *ssa in this imf,
+ * they will be rejected as dupes, even if they
+ * are not valid in the current mode (in-mode).
+ *
+ * in_msource is transactioned just as for anything
+ * else in SSM -- but note naive use of inm_graft()
+ * below for allocating new filter entries.
+ *
+ * This is only an issue if someone mixes the
+ * full-state SSM API with the delta-based API,
+ * which is discouraged in the relevant RFCs.
+ */
lims = imo_match_source(imo, idx, &ssa->sa);
- if (lims != NULL) {
+ if (lims != NULL /*&&
+ lims->imsl_st[1] == MCAST_INCLUDE*/) {
error = EADDRNOTAVAIL;
goto out_inp_locked;
}
} else {
/*
- * MCAST_JOIN_GROUP on an existing inclusive
- * membership is an error; if you want to change
- * filter mode, you must use the userland API
- * setsourcefilter().
- */
- if (imf->imf_st[1] == MCAST_INCLUDE) {
- error = EINVAL;
- goto out_inp_locked;
- }
- /*
* MCAST_JOIN_GROUP on an existing exclusive
* membership is an error; return EADDRINUSE
* to preserve 4.4BSD API idempotence, and
* avoid tedious detour to code below.
* NOTE: This is bending RFC 3678 a bit.
+ *
+ * On an existing inclusive membership, this is also
+ * an error; if you want to change filter mode,
+ * you must use the userland API setsourcefilter().
+ * XXX We don't reject this for imf in UNDEFINED
+ * state at t1, because allocation of a filter
+ * is atomic with allocation of a membership.
*/
- if (imf->imf_st[1] == MCAST_EXCLUDE) {
+ error = EINVAL;
+ if (imf->imf_st[1] == MCAST_EXCLUDE)
error = EADDRINUSE;
- goto out_inp_locked;
- }
+ goto out_inp_locked;
}
}
@@ -2040,6 +2055,11 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt)
* membership of the group. The in_multi may not have
* been allocated yet if this is a new membership, however,
* the in_mfilter slot will be allocated and must be initialized.
+ *
+ * Note: Grafting of exclusive mode filters doesn't happen
+ * in this path.
+ * XXX: Should check for non-NULL lims (node exists but may
+ * not be in-mode) for interop with full-state API.
*/
if (ssa->ss.ss_family != AF_UNSPEC) {
/* Membership starts in IN mode */
@@ -2424,8 +2444,10 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
if (error)
return (error);
- if (msfr.msfr_nsrcs > in_mcast_maxsocksrc ||
- (msfr.msfr_fmode != MCAST_EXCLUDE &&
+ if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
+ return (ENOBUFS);
+
+ if ((msfr.msfr_fmode != MCAST_EXCLUDE &&
msfr.msfr_fmode != MCAST_INCLUDE))
return (EINVAL);
diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c
index 2b50ae8f..5100ac9b 100644
--- a/freebsd/sys/netinet/in_pcb.c
+++ b/freebsd/sys/netinet/in_pcb.c
@@ -4,8 +4,12 @@
* Copyright (c) 1982, 1986, 1991, 1993, 1995
* The Regents of the University of California.
* Copyright (c) 2007-2009 Robert N. M. Watson
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
* All rights reserved.
*
+ * Portions of this software were developed by Robert N. M. Watson under
+ * contract to Juniper Networks, Inc.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -44,17 +48,20 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_ipsec.h>
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
+#include <rtems/bsd/local/opt_pcbgroup.h>
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
+#include <sys/callout.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/refcount.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
@@ -70,17 +77,22 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/vnet.h>
+#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/in_pcb.h>
-#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
+#endif
+#ifdef INET
+#include <netinet/in_var.h>
+#endif
#ifdef INET6
#include <netinet/ip6.h>
-#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
+#include <netinet6/in6_var.h>
+#include <netinet6/ip6_var.h>
#endif /* INET6 */
@@ -91,6 +103,8 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
+static struct callout ipport_tick_callout;
+
/*
* These configure the range of local port addresses assigned to
* "unspecified" outgoing connections/packets/whatever.
@@ -120,12 +134,17 @@ static VNET_DEFINE(int, ipport_tcplastcount);
#define V_ipport_tcplastcount VNET(ipport_tcplastcount)
+static void in_pcbremlists(struct inpcb *inp);
+#ifdef INET
+static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
+ struct in_addr faddr, u_int fport_arg,
+ struct in_addr laddr, u_int lport_arg,
+ int lookupflags, struct ifnet *ifp);
+
#define RANGECHK(var, min, max) \
if ((var) < (min)) { (var) = (min); } \
else if ((var) > (max)) { (var) = (max); }
-static void in_pcbremlists(struct inpcb *inp);
-
static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
{
@@ -149,7 +168,8 @@ sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
#undef RANGECHK
-SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
+static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
+ "IP Ports");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0,
@@ -182,6 +202,7 @@ SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
&VNET_NAME(ipport_randomtime), 0,
"Minimum time to keep sequental port "
"allocation before switching to a random one");
+#endif
/*
* in_pcb.c: manage the Protocol Control Blocks.
@@ -192,6 +213,59 @@ SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
*/
/*
+ * Initialize an inpcbinfo -- we should be able to reduce the number of
+ * arguments in time.
+ */
+void
+in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
+ struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
+ char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini,
+ uint32_t inpcbzone_flags, u_int hashfields)
+{
+
+ INP_INFO_LOCK_INIT(pcbinfo, name);
+ INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */
+#ifdef VIMAGE
+ pcbinfo->ipi_vnet = curvnet;
+#endif
+ pcbinfo->ipi_listhead = listhead;
+ LIST_INIT(pcbinfo->ipi_listhead);
+ pcbinfo->ipi_count = 0;
+ pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
+ &pcbinfo->ipi_hashmask);
+ pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
+ &pcbinfo->ipi_porthashmask);
+#ifdef PCBGROUP
+ in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
+#endif
+ pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
+ NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR,
+ inpcbzone_flags);
+ uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
+}
+
+/*
+ * Destroy an inpcbinfo.
+ */
+void
+in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
+{
+
+ KASSERT(pcbinfo->ipi_count == 0,
+ ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
+
+ hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
+ hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
+ pcbinfo->ipi_porthashmask);
+#ifdef PCBGROUP
+ in_pcbgroup_destroy(pcbinfo);
+#endif
+ uma_zdestroy(pcbinfo->ipi_zone);
+ INP_HASH_LOCK_DESTROY(pcbinfo);
+ INP_INFO_LOCK_DESTROY(pcbinfo);
+}
+
+/*
* Allocate a PCB and associate it with the socket.
* On success return with the PCB locked.
*/
@@ -242,7 +316,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
#endif
INP_WLOCK(inp);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
- inp->inp_refcount = 1; /* Reference from the inpcbinfo */
+ refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */
#if defined(IPSEC) || defined(MAC)
out:
if (error != 0) {
@@ -253,13 +327,14 @@ out:
return (error);
}
+#ifdef INET
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
int anonport, error;
- INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
return (EINVAL);
@@ -278,11 +353,12 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
inp->inp_flags |= INP_ANONPORT;
return (0);
}
+#endif
#if defined(INET) || defined(INET6)
int
in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
- struct ucred *cred, int wild)
+ struct ucred *cred, int lookupflags)
{
struct inpcbinfo *pcbinfo;
struct inpcb *tmpinp;
@@ -299,8 +375,8 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
* Because no actual state changes occur here, a global write lock on
* the pcbinfo isn't required.
*/
- INP_INFO_LOCK_ASSERT(pcbinfo);
INP_LOCK_ASSERT(inp);
+ INP_HASH_LOCK_ASSERT(pcbinfo);
if (inp->inp_flags & INP_HIGHPORT) {
first = V_ipport_hifirstauto; /* sysctl */
@@ -358,6 +434,7 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
laddr = *laddrp;
}
#endif
+ tmpinp = NULL; /* Make compiler happy. */
lport = *lportp;
if (dorandom)
@@ -376,14 +453,14 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
#ifdef INET6
if ((inp->inp_vflag & INP_IPV6) != 0)
tmpinp = in6_pcblookup_local(pcbinfo,
- &inp->in6p_laddr, lport, wild, cred);
+ &inp->in6p_laddr, lport, lookupflags, cred);
#endif
#if defined(INET) && defined(INET6)
else
#endif
#ifdef INET
tmpinp = in_pcblookup_local(pcbinfo, laddr,
- lport, wild, cred);
+ lport, lookupflags, cred);
#endif
} while (tmpinp != NULL);
@@ -395,8 +472,26 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
return (0);
}
+
+/*
+ * Return cached socket options.
+ */
+short
+inp_so_options(const struct inpcb *inp)
+{
+ short so_options;
+
+ so_options = 0;
+
+ if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
+ so_options |= SO_REUSEPORT;
+ if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
+ so_options |= SO_REUSEADDR;
+ return (so_options);
+}
#endif /* INET || INET6 */
+#ifdef INET
/*
* Set up a bind operation on a PCB, performing port allocation
* as required, but do not actually modify the PCB. Callers can
@@ -415,15 +510,14 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct in_addr laddr;
u_short lport = 0;
- int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
+ int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
int error;
/*
- * Because no actual state changes occur here, a global write lock on
- * the pcbinfo isn't required.
+ * No state changes, so read locks are sufficient here.
*/
- INP_INFO_LOCK_ASSERT(pcbinfo);
INP_LOCK_ASSERT(inp);
+ INP_HASH_LOCK_ASSERT(pcbinfo);
if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
return (EADDRNOTAVAIL);
@@ -431,7 +525,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
if (nam != NULL && laddr.s_addr != INADDR_ANY)
return (EINVAL);
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
- wild = INPLOOKUP_WILDCARD;
+ lookupflags = INPLOOKUP_WILDCARD;
if (nam == NULL) {
if ((error = prison_local_ip4(cred, &laddr)) != 0)
return (error);
@@ -505,8 +599,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
(ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
- (t->inp_socket->so_options &
- SO_REUSEPORT) == 0) &&
+ (t->inp_flags2 & INP_REUSEPORT) == 0) &&
#ifndef __rtems__
(inp->inp_cred->cr_uid !=
t->inp_cred->cr_uid))
@@ -516,7 +609,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
return (EADDRINUSE);
}
t = in_pcblookup_local(pcbinfo, sin->sin_addr,
- lport, wild, cred);
+ lport, lookupflags, cred);
if (t && (t->inp_flags & INP_TIMEWAIT)) {
/*
* XXXRW: If an incpb has had its timewait
@@ -524,19 +617,18 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
* being in use (for now). This is better
* than a panic, but not desirable.
*/
- tw = intotw(inp);
+ tw = intotw(t);
if (tw == NULL ||
(reuseport & tw->tw_so_options) == 0)
return (EADDRINUSE);
- } else if (t &&
- (reuseport & t->inp_socket->so_options) == 0) {
+ } else if (t && (reuseport & inp_so_options(t)) == 0) {
#ifdef INET6
if (ntohl(sin->sin_addr.s_addr) !=
INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) !=
INADDR_ANY ||
- INP_SOCKAF(so) ==
- INP_SOCKAF(t->inp_socket))
+ (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
+ (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
return (EADDRINUSE);
}
@@ -545,7 +637,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
if (*lportp != 0)
lport = *lportp;
if (lport == 0) {
- error = in_pcb_lport(inp, &laddr, &lport, cred, wild);
+ error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
if (error != 0)
return (error);
@@ -562,14 +654,15 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
* then pick one.
*/
int
-in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
+in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
+ struct ucred *cred, struct mbuf *m)
{
u_short lport, fport;
in_addr_t laddr, faddr;
int anonport, error;
- INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
lport = inp->inp_lport;
laddr = inp->inp_laddr.s_addr;
@@ -595,13 +688,20 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
inp->inp_laddr.s_addr = laddr;
inp->inp_faddr.s_addr = faddr;
inp->inp_fport = fport;
- in_pcbrehash(inp);
+ in_pcbrehash_mbuf(inp, m);
if (anonport)
inp->inp_flags |= INP_ANONPORT;
return (0);
}
+int
+in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
+{
+
+ return (in_pcbconnect_mbuf(inp, nam, cred, NULL));
+}
+
/*
* Do proper source address selection on an unbound socket in case
* of connect. Take jails into account as well.
@@ -857,8 +957,8 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
* Because a global state change doesn't actually occur here, a read
* lock is sufficient.
*/
- INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo);
INP_LOCK_ASSERT(inp);
+ INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
if (oinpp != NULL)
*oinpp = NULL;
@@ -933,8 +1033,8 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
if (error)
return (error);
}
- oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
- 0, NULL);
+ oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport,
+ laddr, lport, 0, NULL);
if (oinp != NULL) {
if (oinpp != NULL)
*oinpp = oinp;
@@ -957,13 +1057,14 @@ void
in_pcbdisconnect(struct inpcb *inp)
{
- INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
inp->inp_faddr.s_addr = INADDR_ANY;
inp->inp_fport = 0;
in_pcbrehash(inp);
}
+#endif
/*
* in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
@@ -982,53 +1083,18 @@ in_pcbdetach(struct inpcb *inp)
}
/*
- * in_pcbfree_internal() frees an inpcb that has been detached from its
- * socket, and whose reference count has reached 0. It will also remove the
- * inpcb from any global lists it might remain on.
- */
-static void
-in_pcbfree_internal(struct inpcb *inp)
-{
- struct inpcbinfo *ipi = inp->inp_pcbinfo;
-
- KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
- KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__));
-
- INP_INFO_WLOCK_ASSERT(ipi);
- INP_WLOCK_ASSERT(inp);
-
-#ifdef IPSEC
- if (inp->inp_sp != NULL)
- ipsec_delete_pcbpolicy(inp);
-#endif /* IPSEC */
- inp->inp_gencnt = ++ipi->ipi_gencnt;
- in_pcbremlists(inp);
-#ifdef INET6
- if (inp->inp_vflag & INP_IPV6PROTO) {
- ip6_freepcbopts(inp->in6p_outputopts);
- if (inp->in6p_moptions != NULL)
- ip6_freemoptions(inp->in6p_moptions);
- }
-#endif
- if (inp->inp_options)
- (void)m_free(inp->inp_options);
- if (inp->inp_moptions != NULL)
- inp_freemoptions(inp->inp_moptions);
- inp->inp_vflag = 0;
- crfree(inp->inp_cred);
-
-#ifdef MAC
- mac_inpcb_destroy(inp);
-#endif
- INP_WUNLOCK(inp);
- uma_zfree(ipi->ipi_zone, inp);
-}
-
-/*
* in_pcbref() bumps the reference count on an inpcb in order to maintain
* stability of an inpcb pointer despite the inpcb lock being released. This
* is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
- * but where the inpcb lock is already held.
+ * but where the inpcb lock may already held, or when acquiring a reference
+ * via a pcbgroup.
+ *
+ * in_pcbref() should be used only to provide brief memory stability, and
+ * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
+ * garbage collect the inpcb if it has been in_pcbfree()'d from another
+ * context. Until in_pcbrele() has returned that the inpcb is still valid,
+ * lock and rele are the *only* safe operations that may be performed on the
+ * inpcb.
*
* While the inpcb will not be freed, releasing the inpcb lock means that the
* connection's state may change, so the caller should be careful to
@@ -1039,11 +1105,9 @@ void
in_pcbref(struct inpcb *inp)
{
- INP_WLOCK_ASSERT(inp);
-
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
- inp->inp_refcount++;
+ refcount_acquire(&inp->inp_refcount);
}
/*
@@ -1051,47 +1115,118 @@ in_pcbref(struct inpcb *inp)
* in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
* return a flag indicating whether or not the inpcb remains valid. If it is
* valid, we return with the inpcb lock held.
+ *
+ * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
+ * reference on an inpcb. Historically more work was done here (actually, in
+ * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
+ * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely
+ * about memory stability (and continued use of the write lock).
*/
int
-in_pcbrele(struct inpcb *inp)
+in_pcbrele_rlocked(struct inpcb *inp)
{
-#ifdef INVARIANTS
- struct inpcbinfo *ipi = inp->inp_pcbinfo;
-#endif
+ struct inpcbinfo *pcbinfo;
+
+ KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+
+ INP_RLOCK_ASSERT(inp);
+
+ if (refcount_release(&inp->inp_refcount) == 0) {
+ /*
+ * If the inpcb has been freed, let the caller know, even if
+ * this isn't the last reference.
+ */
+ if (inp->inp_flags2 & INP_FREED) {
+ INP_RUNLOCK(inp);
+ return (1);
+ }
+ return (0);
+ }
+
+ KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
+
+ INP_RUNLOCK(inp);
+ pcbinfo = inp->inp_pcbinfo;
+ uma_zfree(pcbinfo->ipi_zone, inp);
+ return (1);
+}
+
+int
+in_pcbrele_wlocked(struct inpcb *inp)
+{
+ struct inpcbinfo *pcbinfo;
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
- INP_INFO_WLOCK_ASSERT(ipi);
INP_WLOCK_ASSERT(inp);
- inp->inp_refcount--;
- if (inp->inp_refcount > 0)
+ if (refcount_release(&inp->inp_refcount) == 0)
return (0);
- in_pcbfree_internal(inp);
+
+ KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
+
+ INP_WUNLOCK(inp);
+ pcbinfo = inp->inp_pcbinfo;
+ uma_zfree(pcbinfo->ipi_zone, inp);
return (1);
}
/*
+ * Temporary wrapper.
+ */
+int
+in_pcbrele(struct inpcb *inp)
+{
+
+ return (in_pcbrele_wlocked(inp));
+}
+
+/*
* Unconditionally schedule an inpcb to be freed by decrementing its
* reference count, which should occur only after the inpcb has been detached
* from its socket. If another thread holds a temporary reference (acquired
* using in_pcbref()) then the free is deferred until that reference is
- * released using in_pcbrele(), but the inpcb is still unlocked.
+ * released using in_pcbrele(), but the inpcb is still unlocked. Almost all
+ * work, including removal from global lists, is done in this context, where
+ * the pcbinfo lock is held.
*/
void
in_pcbfree(struct inpcb *inp)
{
-#ifdef INVARIANTS
- struct inpcbinfo *ipi = inp->inp_pcbinfo;
-#endif
+ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
- KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL",
- __func__));
+ KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
- INP_INFO_WLOCK_ASSERT(ipi);
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
- if (!in_pcbrele(inp))
+ /* XXXRW: Do as much as possible here. */
+#ifdef IPSEC
+ if (inp->inp_sp != NULL)
+ ipsec_delete_pcbpolicy(inp);
+#endif /* IPSEC */
+ inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+ in_pcbremlists(inp);
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO) {
+ ip6_freepcbopts(inp->in6p_outputopts);
+ if (inp->in6p_moptions != NULL)
+ ip6_freemoptions(inp->in6p_moptions);
+ }
+#endif
+ if (inp->inp_options)
+ (void)m_free(inp->inp_options);
+#ifdef INET
+ if (inp->inp_moptions != NULL)
+ inp_freemoptions(inp->inp_moptions);
+#endif
+ inp->inp_vflag = 0;
+ inp->inp_flags2 |= INP_FREED;
+ crfree(inp->inp_cred);
+#ifdef MAC
+ mac_inpcb_destroy(inp);
+#endif
+ if (!in_pcbrele_wlocked(inp))
INP_WUNLOCK(inp);
}
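
As a reading aid, the stability pattern that the comments above describe (pin the inpcb with in_pcbref(), drop its lock, reacquire locks in order, then in_pcbrele_*()) might look roughly like the sketch below. This is not code from the patch; the helper name is hypothetical and the fragment assumes the netinet headers touched in this change.

/*
 * Sketch only (not part of this patch): the in_pcbref()/in_pcbrele_wlocked()
 * pattern.  "example_upgrade" is a hypothetical caller that already holds
 * the inpcb write lock and needs the pcbinfo lock, which by lock order must
 * be taken before the inpcb lock.
 */
static void
example_upgrade(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	in_pcbref(inp);			/* keep the memory stable */
	INP_WUNLOCK(inp);		/* drop pcb lock to honour lock order */
	INP_INFO_WLOCK(pcbinfo);
	INP_WLOCK(inp);
	if (in_pcbrele_wlocked(inp)) {
		/* Freed by another thread; inp is gone and unlocked. */
		INP_INFO_WUNLOCK(pcbinfo);
		return;
	}
	/* inp is still valid and write locked here. */
	INP_INFO_WUNLOCK(pcbinfo);
	INP_WUNLOCK(inp);
}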
@@ -1106,12 +1241,6 @@ in_pcbfree(struct inpcb *inp)
* maintaining the invariant that so_pcb always points to a valid inpcb until
* in_pcbdetach().
*
- * XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash
- * lists, but can lead to confusing netstat output, as open sockets with
- * closed TCP connections will no longer appear to have their bound port
- * number. An explicit flag would be better, as it would allow us to leave
- * the port number intact after the connection is dropped.
- *
* XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
* in_pcbnotifyall() and in_pcbpurgeif0()?
*/
@@ -1119,23 +1248,32 @@ void
in_pcbdrop(struct inpcb *inp)
{
- INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
+ /*
+ * XXXRW: Possibly we should protect the setting of INP_DROPPED with
+ * the hash lock...?
+ */
inp->inp_flags |= INP_DROPPED;
if (inp->inp_flags & INP_INHASHLIST) {
struct inpcbport *phd = inp->inp_phd;
+ INP_HASH_WLOCK(inp->inp_pcbinfo);
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
LIST_REMOVE(phd, phd_hash);
free(phd, M_PCB);
}
+ INP_HASH_WUNLOCK(inp->inp_pcbinfo);
inp->inp_flags &= ~INP_INHASHLIST;
+#ifdef PCBGROUP
+ in_pcbgroup_remove(inp);
+#endif
}
}
+#ifdef INET
/*
* Common routines to return the socket addresses associated with inpcbs.
*/
@@ -1259,12 +1397,13 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
}
/*
- * Lookup a PCB based on the local address and port.
+ * Lookup a PCB based on the local address and port. Caller must hold the
+ * hash lock. No inpcb locks or references are acquired.
*/
#define INP_LOOKUP_MAPPED_PCB_COST 3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
- u_short lport, int wild_okay, struct ucred *cred)
+ u_short lport, int lookupflags, struct ucred *cred)
{
struct inpcb *inp;
#ifdef INET6
@@ -1274,9 +1413,12 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
#endif
int wildcard;
- INP_INFO_LOCK_ASSERT(pcbinfo);
+ KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
+ ("%s: invalid lookup flags %d", __func__, lookupflags));
- if (!wild_okay) {
+ INP_HASH_LOCK_ASSERT(pcbinfo);
+
+ if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
struct inpcbhead *head;
/*
* Look for an unconnected (wildcard foreign addr) PCB that
@@ -1377,19 +1519,166 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
}
#undef INP_LOOKUP_MAPPED_PCB_COST
+#ifdef PCBGROUP
/*
- * Lookup PCB in hash list.
+ * Lookup PCB in hash list, using pcbgroup tables.
*/
-struct inpcb *
-in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
- u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
+static struct inpcb *
+in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
+ struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
+ u_int lport_arg, int lookupflags, struct ifnet *ifp)
+{
+ struct inpcbhead *head;
+ struct inpcb *inp, *tmpinp;
+ u_short fport = fport_arg, lport = lport_arg;
+
+ /*
+ * First look for an exact match.
+ */
+ tmpinp = NULL;
+ INP_GROUP_LOCK(pcbgroup);
+ head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
+ pcbgroup->ipg_hashmask)];
+ LIST_FOREACH(inp, head, inp_pcbgrouphash) {
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_faddr.s_addr == faddr.s_addr &&
+ inp->inp_laddr.s_addr == laddr.s_addr &&
+ inp->inp_fport == fport &&
+ inp->inp_lport == lport) {
+ /*
+ * XXX We should be able to directly return
+ * the inp here, without any checks.
+ * Well unless both bound with SO_REUSEPORT?
+ */
+ if (prison_flag(inp->inp_cred, PR_IP4))
+ goto found;
+ if (tmpinp == NULL)
+ tmpinp = inp;
+ }
+ }
+ if (tmpinp != NULL) {
+ inp = tmpinp;
+ goto found;
+ }
+
+ /*
+ * Then look for a wildcard match, if requested.
+ */
+ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
+ struct inpcb *local_wild = NULL, *local_exact = NULL;
+#ifdef INET6
+ struct inpcb *local_wild_mapped = NULL;
+#endif
+ struct inpcb *jail_wild = NULL;
+ struct inpcbhead *head;
+ int injail;
+
+ /*
+ * Order of socket selection - we always prefer jails.
+ * 1. jailed, non-wild.
+ * 2. jailed, wild.
+ * 3. non-jailed, non-wild.
+ * 4. non-jailed, wild.
+ */
+ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
+ 0, pcbinfo->ipi_wildmask)];
+ LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
+#ifdef INET6
+ /* XXX inp locking */
+ if ((inp->inp_vflag & INP_IPV4) == 0)
+ continue;
+#endif
+ if (inp->inp_faddr.s_addr != INADDR_ANY ||
+ inp->inp_lport != lport)
+ continue;
+
+ /* XXX inp locking */
+ if (ifp && ifp->if_type == IFT_FAITH &&
+ (inp->inp_flags & INP_FAITH) == 0)
+ continue;
+
+ injail = prison_flag(inp->inp_cred, PR_IP4);
+ if (injail) {
+ if (prison_check_ip4(inp->inp_cred,
+ &laddr) != 0)
+ continue;
+ } else {
+ if (local_exact != NULL)
+ continue;
+ }
+
+ if (inp->inp_laddr.s_addr == laddr.s_addr) {
+ if (injail)
+ goto found;
+ else
+ local_exact = inp;
+ } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
+#ifdef INET6
+ /* XXX inp locking, NULL check */
+ if (inp->inp_vflag & INP_IPV6PROTO)
+ local_wild_mapped = inp;
+ else
+#endif /* INET6 */
+ if (injail)
+ jail_wild = inp;
+ else
+ local_wild = inp;
+ }
+ } /* LIST_FOREACH */
+ inp = jail_wild;
+ if (inp == NULL)
+ inp = local_exact;
+ if (inp == NULL)
+ inp = local_wild;
+#ifdef INET6
+ if (inp == NULL)
+ inp = local_wild_mapped;
+#endif /* defined(INET6) */
+ if (inp != NULL)
+ goto found;
+ } /* if (lookupflags & INPLOOKUP_WILDCARD) */
+ INP_GROUP_UNLOCK(pcbgroup);
+ return (NULL);
+
+found:
+ in_pcbref(inp);
+ INP_GROUP_UNLOCK(pcbgroup);
+ if (lookupflags & INPLOOKUP_WLOCKPCB) {
+ INP_WLOCK(inp);
+ if (in_pcbrele_wlocked(inp))
+ return (NULL);
+ } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
+ INP_RLOCK(inp);
+ if (in_pcbrele_rlocked(inp))
+ return (NULL);
+ } else
+ panic("%s: locking bug", __func__);
+ return (inp);
+}
+#endif /* PCBGROUP */
+
+/*
+ * Lookup PCB in hash list, using pcbinfo tables. This variation assumes
+ * that the caller has locked the hash list, and will not perform any further
+ * locking or reference operations on either the hash list or the connection.
+ */
+static struct inpcb *
+in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
+ u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
struct ifnet *ifp)
{
struct inpcbhead *head;
struct inpcb *inp, *tmpinp;
u_short fport = fport_arg, lport = lport_arg;
- INP_INFO_LOCK_ASSERT(pcbinfo);
+ KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
+ ("%s: invalid lookup flags %d", __func__, lookupflags));
+
+ INP_HASH_LOCK_ASSERT(pcbinfo);
/*
* First look for an exact match.
@@ -1424,7 +1713,7 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
/*
* Then look for a wildcard match, if requested.
*/
- if (wildcard == INPLOOKUP_WILDCARD) {
+ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
struct inpcb *local_wild_mapped = NULL;
@@ -1495,16 +1784,112 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
if (local_wild_mapped != NULL)
return (local_wild_mapped);
#endif /* defined(INET6) */
- } /* if (wildcard == INPLOOKUP_WILDCARD) */
+ } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
return (NULL);
}
/*
+ * Lookup PCB in hash list, using pcbinfo tables. This variation locks the
+ * hash list lock, and will return the inpcb locked (i.e., requires
+ * INPLOOKUP_LOCKPCB).
+ */
+static struct inpcb *
+in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
+ u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
+ struct ifnet *ifp)
+{
+ struct inpcb *inp;
+
+ INP_HASH_RLOCK(pcbinfo);
+ inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
+ (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
+ if (inp != NULL) {
+ in_pcbref(inp);
+ INP_HASH_RUNLOCK(pcbinfo);
+ if (lookupflags & INPLOOKUP_WLOCKPCB) {
+ INP_WLOCK(inp);
+ if (in_pcbrele_wlocked(inp))
+ return (NULL);
+ } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
+ INP_RLOCK(inp);
+ if (in_pcbrele_rlocked(inp))
+ return (NULL);
+ } else
+ panic("%s: locking bug", __func__);
+ } else
+ INP_HASH_RUNLOCK(pcbinfo);
+ return (inp);
+}
+
+/*
+ * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
+ * from which a pre-calculated hash value may be extracted.
+ *
+ * Possibly more of this logic should be in in_pcbgroup.c.
+ */
+struct inpcb *
+in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
+ struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
+{
+#if defined(PCBGROUP)
+ struct inpcbgroup *pcbgroup;
+#endif
+
+ KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
+ ("%s: invalid lookup flags %d", __func__, lookupflags));
+ KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
+ ("%s: LOCKPCB not set", __func__));
+
+#if defined(PCBGROUP)
+ if (in_pcbgroup_enabled(pcbinfo)) {
+ pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
+ fport);
+ return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
+ laddr, lport, lookupflags, ifp));
+ }
+#endif
+ return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
+ lookupflags, ifp));
+}
+
+struct inpcb *
+in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
+ u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
+ struct ifnet *ifp, struct mbuf *m)
+{
+#ifdef PCBGROUP
+ struct inpcbgroup *pcbgroup;
+#endif
+
+ KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
+ ("%s: invalid lookup flags %d", __func__, lookupflags));
+ KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
+ ("%s: LOCKPCB not set", __func__));
+
+#ifdef PCBGROUP
+ if (in_pcbgroup_enabled(pcbinfo)) {
+ pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
+ m->m_pkthdr.flowid);
+ if (pcbgroup != NULL)
+ return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
+ fport, laddr, lport, lookupflags, ifp));
+ pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
+ fport);
+ return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
+ laddr, lport, lookupflags, ifp));
+ }
+#endif
+ return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
+ lookupflags, ifp));
+}
+#endif /* INET */
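
For orientation, a caller-side sketch of the new locked-lookup API follows. Names such as V_udbinfo, ip, uh, ifp and m are placeholders for whatever a real protocol input path has in scope; ports stay in network byte order as received.

/*
 * Sketch only (not part of this patch): using in_pcblookup_mbuf() from an
 * input path.  All identifiers below are placeholders.
 */
struct inpcb *inp;

inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport,
    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB,
    ifp, m);
if (inp == NULL)
	return;			/* no listener; caller handles the miss */
/* inp comes back valid and read locked; no extra reference is held. */
/* ... hand the datagram to the socket ... */
INP_RUNLOCK(inp);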
+
+/*
* Insert PCB onto various hash lists.
*/
-int
-in_pcbinshash(struct inpcb *inp)
+static int
+in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
{
struct inpcbhead *pcbhash;
struct inpcbporthead *pcbporthash;
@@ -1512,8 +1897,9 @@ in_pcbinshash(struct inpcb *inp)
struct inpcbport *phd;
u_int32_t hashkey_faddr;
- INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK_ASSERT(pcbinfo);
+
KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
("in_pcbinshash: INP_INHASHLIST"));
@@ -1553,24 +1939,54 @@ in_pcbinshash(struct inpcb *inp)
LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
inp->inp_flags |= INP_INHASHLIST;
+#ifdef PCBGROUP
+ if (do_pcbgroup_update)
+ in_pcbgroup_update(inp);
+#endif
return (0);
}
/*
+ * For now, there are two public interfaces to insert an inpcb into the hash
+ * lists -- one that does update pcbgroups, and one that doesn't. The latter
+ * is used only in the TCP syncache, where in_pcbinshash is called before the
+ * full 4-tuple is set for the inpcb, and we don't want to install in the
+ * pcbgroup until later.
+ *
+ * XXXRW: This seems like a misfeature. in_pcbinshash should always update
+ * connection groups, and partially initialised inpcbs should not be exposed
+ * to either reservation hash tables or pcbgroups.
+ */
+int
+in_pcbinshash(struct inpcb *inp)
+{
+
+ return (in_pcbinshash_internal(inp, 1));
+}
+
+int
+in_pcbinshash_nopcbgroup(struct inpcb *inp)
+{
+
+ return (in_pcbinshash_internal(inp, 0));
+}
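
A compressed sketch of how an insertion call site satisfies the asserts in in_pcbinshash_internal(); inp, pcbinfo and error are placeholders, and the inpcb write lock is assumed to be held already.

/*
 * Sketch only (not part of this patch): in_pcbinshash() expects both the
 * inpcb write lock and the pcbinfo hash lock, per the asserts above.
 */
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK(pcbinfo);
error = in_pcbinshash(inp);	/* normal path: pcbgroups updated too */
INP_HASH_WUNLOCK(pcbinfo);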
+
+/*
* Move PCB to the proper hash bucket when { faddr, fport } have been
* changed. NOTE: This does not handle the case of the lport changing (the
* hashed port list would have to be updated as well), so the lport must
* not change after in_pcbinshash() has been called.
*/
void
-in_pcbrehash(struct inpcb *inp)
+in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbhead *head;
u_int32_t hashkey_faddr;
- INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK_ASSERT(pcbinfo);
+
KASSERT(inp->inp_flags & INP_INHASHLIST,
("in_pcbrehash: !INP_INHASHLIST"));
@@ -1586,6 +2002,20 @@ in_pcbrehash(struct inpcb *inp)
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
+
+#ifdef PCBGROUP
+ if (m != NULL)
+ in_pcbgroup_update_mbuf(inp, m);
+ else
+ in_pcbgroup_update(inp);
+#endif
+}
+
+void
+in_pcbrehash(struct inpcb *inp)
+{
+
+ in_pcbrehash_mbuf(inp, NULL);
}
/*
@@ -1603,16 +2033,21 @@ in_pcbremlists(struct inpcb *inp)
if (inp->inp_flags & INP_INHASHLIST) {
struct inpcbport *phd = inp->inp_phd;
+ INP_HASH_WLOCK(pcbinfo);
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
LIST_REMOVE(phd, phd_hash);
free(phd, M_PCB);
}
+ INP_HASH_WUNLOCK(pcbinfo);
inp->inp_flags &= ~INP_INHASHLIST;
}
LIST_REMOVE(inp, inp_list);
pcbinfo->ipi_count--;
+#ifdef PCBGROUP
+ in_pcbgroup_remove(inp);
+#endif
}
/*
@@ -1643,7 +2078,7 @@ in_pcbsosetlabel(struct socket *so)
* allocation. We return to random allocation only once we drop below
* ipport_randomcps for at least ipport_randomtime seconds.
*/
-void
+static void
ipport_tick(void *xtp)
{
VNET_ITERATOR_DECL(vnet_iter);
@@ -1664,6 +2099,30 @@ ipport_tick(void *xtp)
callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
}
+static void
+ip_fini(void *xtp)
+{
+
+ callout_stop(&ipport_tick_callout);
+}
+
+/*
+ * The ipport_callout should start running at about the time we attach the
+ * inet or inet6 domains.
+ */
+static void
+ipport_tick_init(const void *unused __unused)
+{
+
+ /* Start ipport_tick. */
+ callout_init(&ipport_tick_callout, CALLOUT_MPSAFE);
+ callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
+ SHUTDOWN_PRI_DEFAULT);
+}
+SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
+ ipport_tick_init, NULL);
+
void
inp_wlock(struct inpcb *inp)
{
diff --git a/freebsd/sys/netinet/in_pcb.h b/freebsd/sys/netinet/in_pcb.h
index 9f602ce2..a78c6ab6 100644
--- a/freebsd/sys/netinet/in_pcb.h
+++ b/freebsd/sys/netinet/in_pcb.h
@@ -1,8 +1,12 @@
/*-
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California.
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
* All rights reserved.
*
+ * Portions of this software were developed by Robert N. M. Watson under
+ * contract to Juniper Networks, Inc.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -40,8 +44,10 @@
#include <sys/_rwlock.h>
#ifdef _KERNEL
+#include <rtems/bsd/sys/lock.h>
#include <sys/rwlock.h>
#include <net/vnet.h>
+#include <vm/uma.h>
#endif
#define in6pcb inpcb /* for KAME src sync over BSD*'s */
@@ -136,6 +142,7 @@ struct icmp6_filter;
*
* Key:
* (c) - Constant after initialization
+ * (g) - Protected by the pcbgroup lock
* (i) - Protected by the inpcb lock
* (p) - Protected by the pcbinfo lock for the inpcb
* (s) - Protected by another subsystem's locks
@@ -155,9 +162,12 @@ struct icmp6_filter;
*/
struct inpcb {
LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */
+ LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
+ struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
+ LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/p) group wildcard entry */
struct socket *inp_socket; /* (i) back pointer to socket */
struct ucred *inp_cred; /* (c) cache of socket cred */
u_int32_t inp_flow; /* (i) IPv6 flow information */
@@ -169,8 +179,9 @@ struct inpcb {
u_char inp_ip_minttl; /* (i) minimum TTL or drop */
uint32_t inp_flowid; /* (x) flow id / queue id */
u_int inp_refcount; /* (i) refcount */
- void *inp_pspare[4]; /* (x) rtentry / general use */
- u_int inp_ispare[4]; /* general use */
+ void *inp_pspare[5]; /* (x) route caching / general use */
+ u_int inp_ispare[6]; /* (x) route caching / user cookie /
+ * general use */
/* Local and foreign ports, local and foreign addr. */
struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */
@@ -259,53 +270,93 @@ struct inpcbport {
u_short phd_port;
};
-/*
+/*-
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
+ *
+ * Each pcbinfo is protected by two locks: ipi_lock and ipi_hash_lock,
+ * the former covering mutable global fields (such as the global pcb list),
+ * and the latter covering the hashed lookup tables. The lock order is:
+ *
+ * ipi_lock (before) inpcb locks (before) {ipi_hash_lock, pcbgroup locks}
+ *
+ * Locking key:
+ *
+ * (c) Constant or nearly constant after initialisation
+ * (g) Locked by ipi_lock
+ * (h) Read using either ipi_hash_lock or inpcb lock; write requires both
+ * (p) Protected by one or more pcbgroup locks
+ * (x) Synchronisation properties poorly defined
*/
struct inpcbinfo {
/*
- * Global list of inpcbs on the protocol.
+ * Global lock protecting global inpcb list, inpcb count, etc.
*/
- struct inpcbhead *ipi_listhead;
- u_int ipi_count;
+ struct rwlock ipi_lock;
/*
- * Global hash of inpcbs, hashed by local and foreign addresses and
- * port numbers.
+ * Global list of inpcbs on the protocol.
*/
- struct inpcbhead *ipi_hashbase;
- u_long ipi_hashmask;
+ struct inpcbhead *ipi_listhead; /* (g) */
+ u_int ipi_count; /* (g) */
/*
- * Global hash of inpcbs, hashed by only local port number.
+ * Generation count -- incremented each time a connection is allocated
+ * or freed.
*/
- struct inpcbporthead *ipi_porthashbase;
- u_long ipi_porthashmask;
+ u_quad_t ipi_gencnt; /* (g) */
/*
* Fields associated with port lookup and allocation.
*/
- u_short ipi_lastport;
- u_short ipi_lastlow;
- u_short ipi_lasthi;
+ u_short ipi_lastport; /* (x) */
+ u_short ipi_lastlow; /* (x) */
+ u_short ipi_lasthi; /* (x) */
/*
* UMA zone from which inpcbs are allocated for this protocol.
*/
- struct uma_zone *ipi_zone;
+ struct uma_zone *ipi_zone; /* (c) */
/*
- * Generation count--incremented each time a connection is allocated
- * or freed.
+ * Connection groups associated with this protocol. These fields are
+ * constant, but pcbgroup structures themselves are protected by
+ * per-pcbgroup locks.
*/
- u_quad_t ipi_gencnt;
- struct rwlock ipi_lock;
+ struct inpcbgroup *ipi_pcbgroups; /* (c) */
+ u_int ipi_npcbgroups; /* (c) */
+ u_int ipi_hashfields; /* (c) */
+
+ /*
+ * Global lock protecting non-pcbgroup hash lookup tables.
+ */
+ struct rwlock ipi_hash_lock;
+
+ /*
+ * Global hash of inpcbs, hashed by local and foreign addresses and
+ * port numbers.
+ */
+ struct inpcbhead *ipi_hashbase; /* (h) */
+ u_long ipi_hashmask; /* (h) */
+
+ /*
+ * Global hash of inpcbs, hashed by only local port number.
+ */
+ struct inpcbporthead *ipi_porthashbase; /* (h) */
+ u_long ipi_porthashmask; /* (h) */
+
+ /*
+ * List of wildcard inpcbs for use with pcbgroups. In the past, was
+ * per-pcbgroup but is now global. All pcbgroup locks must be held
+ * to modify the list, so any is sufficient to read it.
+ */
+ struct inpcbhead *ipi_wildbase; /* (p) */
+ u_long ipi_wildmask; /* (p) */
/*
* Pointer to network stack instance
*/
- struct vnet *ipi_vnet;
+ struct vnet *ipi_vnet; /* (c) */
/*
* general use 2
@@ -313,6 +364,32 @@ struct inpcbinfo {
void *ipi_pspare[2];
};
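
The ipi_lock / inpcb lock / ipi_hash_lock ordering documented in the comment above is what callers follow; the divert bind path changed later in this patch shows the shape, compressed here as a sketch (pcbinfo, inp, nam, cred and error are placeholders).

/*
 * Sketch only: the documented lock order, as a bind-style caller takes it
 * (compare the div_bind() hunk later in this patch).
 */
INP_INFO_WLOCK(pcbinfo);	/* ipi_lock first */
INP_WLOCK(inp);			/* then the inpcb lock */
INP_HASH_WLOCK(pcbinfo);	/* ipi_hash_lock last */
error = in_pcbbind(inp, nam, cred);
INP_HASH_WUNLOCK(pcbinfo);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(pcbinfo);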
+#ifdef _KERNEL
+/*
+ * Connection groups hold sets of connections that have similar CPU/thread
+ * affinity. Each connection belongs to exactly one connection group.
+ */
+struct inpcbgroup {
+ /*
+ * Per-connection group hash of inpcbs, hashed by local and foreign
+ * addresses and port numbers.
+ */
+ struct inpcbhead *ipg_hashbase; /* (c) */
+ u_long ipg_hashmask; /* (c) */
+
+ /*
+ * Notional affinity of this pcbgroup.
+ */
+ u_int ipg_cpu; /* (p) */
+
+ /*
+ * Per-connection group lock, not to be confused with ipi_lock.
+ * Protects the hash table hung off the group, but also the global
+ * wildcard list in inpcbinfo.
+ */
+ struct mtx ipg_lock;
+} __aligned(CACHE_LINE_SIZE);
+
#define INP_LOCK_INIT(inp, d, t) \
rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK)
#define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock)
@@ -330,7 +407,6 @@ struct inpcbinfo {
#define INP_WLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_WLOCKED)
#define INP_UNLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_UNLOCKED)
-#ifdef _KERNEL
/*
* These locking functions are for inpcb consumers outside of sys/netinet,
* more specifically, they were added for the benefit of TOE drivers. The
@@ -366,6 +442,7 @@ struct tcpcb *
inp_inpcbtotcpcb(struct inpcb *inp);
void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
uint32_t *faddr, uint16_t *fp);
+short inp_so_options(const struct inpcb *inp);
#endif /* _KERNEL */
@@ -384,6 +461,26 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
+#define INP_HASH_LOCK_INIT(ipi, d) \
+ rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0)
+#define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock)
+#define INP_HASH_RLOCK(ipi) rw_rlock(&(ipi)->ipi_hash_lock)
+#define INP_HASH_WLOCK(ipi) rw_wlock(&(ipi)->ipi_hash_lock)
+#define INP_HASH_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_hash_lock)
+#define INP_HASH_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_hash_lock)
+#define INP_HASH_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \
+ RA_LOCKED)
+#define INP_HASH_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \
+ RA_WLOCKED)
+
+#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \
+ MTX_DEF | MTX_DUPOK)
+#define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock)
+
+#define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock)
+#define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED)
+#define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock)
+
#define INP_PCBHASH(faddr, lport, fport, mask) \
(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
#define INP_PCBPORTHASH(lport, mask) \
@@ -444,8 +541,21 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
*/
#define INP_LLE_VALID 0x00000001 /* cached lle is valid */
#define INP_RT_VALID 0x00000002 /* cached rtentry is valid */
+#define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */
+#define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */
+#define INP_FREED 0x00000010 /* inp itself is not valid */
+#define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */
+
+/*
+ * Flags passed to in_pcblookup*() functions.
+ */
+#define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */
+#define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */
+#define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */
+
+#define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
+ INPLOOKUP_WLOCKPCB)
-#define INPLOOKUP_WILDCARD 1
#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb)
#define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */
@@ -453,6 +563,13 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
#define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af)
+/*
+ * Constants for pcbinfo.ipi_hashfields.
+ */
+#define IPI_HASHFIELDS_NONE 0
+#define IPI_HASHFIELDS_2TUPLE 1
+#define IPI_HASHFIELDS_4TUPLE 2
+
#ifdef _KERNEL
VNET_DECLARE(int, ipport_reservedhigh);
VNET_DECLARE(int, ipport_reservedlow);
@@ -482,7 +599,23 @@ VNET_DECLARE(int, ipport_tcpallocs);
#define V_ipport_stoprandom VNET(ipport_stoprandom)
#define V_ipport_tcpallocs VNET(ipport_tcpallocs)
-extern struct callout ipport_tick_callout;
+void in_pcbinfo_destroy(struct inpcbinfo *);
+void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *,
+ int, int, char *, uma_init, uma_fini, uint32_t, u_int);
+
+struct inpcbgroup *
+ in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t);
+struct inpcbgroup *
+ in_pcbgroup_byinpcb(struct inpcb *);
+struct inpcbgroup *
+ in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short,
+ struct in_addr, u_short);
+void in_pcbgroup_destroy(struct inpcbinfo *);
+int in_pcbgroup_enabled(struct inpcbinfo *);
+void in_pcbgroup_init(struct inpcbinfo *, u_int, int);
+void in_pcbgroup_remove(struct inpcb *);
+void in_pcbgroup_update(struct inpcb *);
+void in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *);
void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
int in_pcballoc(struct socket *, struct inpcbinfo *);
@@ -492,6 +625,8 @@ int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *,
int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
u_short *, struct ucred *);
int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *);
+int in_pcbconnect_mbuf(struct inpcb *, struct sockaddr *, struct ucred *,
+ struct mbuf *);
int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
u_short *, in_addr_t *, u_short *, struct inpcb **,
struct ucred *);
@@ -500,24 +635,30 @@ void in_pcbdisconnect(struct inpcb *);
void in_pcbdrop(struct inpcb *);
void in_pcbfree(struct inpcb *);
int in_pcbinshash(struct inpcb *);
+int in_pcbinshash_nopcbgroup(struct inpcb *);
struct inpcb *
in_pcblookup_local(struct inpcbinfo *,
struct in_addr, u_short, int, struct ucred *);
struct inpcb *
- in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int,
+ in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
struct in_addr, u_int, int, struct ifnet *);
+struct inpcb *
+ in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
+ struct in_addr, u_int, int, struct ifnet *, struct mbuf *);
void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
int, struct inpcb *(*)(struct inpcb *, int));
void in_pcbref(struct inpcb *);
void in_pcbrehash(struct inpcb *);
+void in_pcbrehash_mbuf(struct inpcb *, struct mbuf *);
int in_pcbrele(struct inpcb *);
+int in_pcbrele_rlocked(struct inpcb *);
+int in_pcbrele_wlocked(struct inpcb *);
void in_pcbsetsolabel(struct socket *so);
int in_getpeeraddr(struct socket *so, struct sockaddr **nam);
int in_getsockaddr(struct socket *so, struct sockaddr **nam);
struct sockaddr *
in_sockaddr(in_port_t port, struct in_addr *addr);
void in_pcbsosetlabel(struct socket *so);
-void ipport_tick(void *xtp);
#endif /* _KERNEL */
#endif /* !_NETINET_IN_PCB_H_ */
diff --git a/freebsd/sys/netinet/in_proto.c b/freebsd/sys/netinet/in_proto.c
index b479e09e..1eef2c72 100644
--- a/freebsd/sys/netinet/in_proto.c
+++ b/freebsd/sys/netinet/in_proto.c
@@ -37,8 +37,8 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_ipx.h>
#include <rtems/bsd/local/opt_mrouting.h>
#include <rtems/bsd/local/opt_ipsec.h>
+#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
-#include <rtems/bsd/local/opt_pf.h>
#include <rtems/bsd/local/opt_sctp.h>
#include <rtems/bsd/local/opt_mpath.h>
@@ -52,14 +52,26 @@ __FBSDID("$FreeBSD$");
#include <sys/queue.h>
#include <sys/sysctl.h>
+/*
+ * While this file provides the domain and protocol switch tables for IPv4, it
+ * also provides the sysctl node declarations for net.inet.* often shared with
+ * IPv6 for common features or by upper layer protocols. In case of no IPv4
+ * support compile out everything but these sysctl nodes.
+ */
+#ifdef INET
#include <net/if.h>
#include <net/route.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <net/vnet.h>
+#endif /* INET */
+#if defined(INET) || defined(INET6)
#include <netinet/in.h>
+#endif
+
+#ifdef INET
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
@@ -90,11 +102,6 @@ static struct pr_usrreqs nousrreqs;
#include <netinet/sctp_var.h>
#endif /* SCTP */
-#ifdef DEV_PFSYNC
-#include <net/pfvar.h>
-#include <net/if_pfsync.h>
-#endif
-
FEATURE(inet, "Internet Protocol version 4");
extern struct domain inetdomain;
@@ -306,17 +313,6 @@ struct protosw inetsw[] = {
.pr_ctloutput = rip_ctloutput,
.pr_usrreqs = &rip_usrreqs
},
-#ifdef DEV_PFSYNC
-{
- .pr_type = SOCK_RAW,
- .pr_domain = &inetdomain,
- .pr_protocol = IPPROTO_PFSYNC,
- .pr_flags = PR_ATOMIC|PR_ADDR,
- .pr_input = pfsync_input,
- .pr_ctloutput = rip_ctloutput,
- .pr_usrreqs = &rip_usrreqs
-},
-#endif /* DEV_PFSYNC */
/* Spacer n-times for loadable protocols. */
IPPROTOSPACER,
IPPROTOSPACER,
@@ -364,6 +360,7 @@ struct domain inetdomain = {
};
VNET_DOMAIN_SET(inet);
+#endif /* INET */
SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW, 0,
"Internet Family");
@@ -385,6 +382,3 @@ SYSCTL_NODE(_net_inet, IPPROTO_IPCOMP, ipcomp, CTLFLAG_RW, 0, "IPCOMP");
SYSCTL_NODE(_net_inet, IPPROTO_IPIP, ipip, CTLFLAG_RW, 0, "IPIP");
#endif /* IPSEC */
SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW");
-#ifdef DEV_PFSYNC
-SYSCTL_NODE(_net_inet, IPPROTO_PFSYNC, pfsync, CTLFLAG_RW, 0, "PFSYNC");
-#endif
diff --git a/freebsd/sys/netinet/in_var.h b/freebsd/sys/netinet/in_var.h
index c04d45b9..b8477309 100644
--- a/freebsd/sys/netinet/in_var.h
+++ b/freebsd/sys/netinet/in_var.h
@@ -60,12 +60,9 @@ struct in_ifaddr {
struct ifaddr ia_ifa; /* protocol-independent info */
#define ia_ifp ia_ifa.ifa_ifp
#define ia_flags ia_ifa.ifa_flags
- /* ia_{,sub}net{,mask} in host order */
- u_long ia_net; /* network number of interface */
- u_long ia_netmask; /* mask of net part */
- u_long ia_subnet; /* subnet number, including net */
- u_long ia_subnetmask; /* mask of subnet part */
- struct in_addr ia_netbroadcast; /* to recognize net broadcasts */
+ /* ia_subnet{,mask} in host order */
+ u_long ia_subnet; /* subnet address */
+ u_long ia_subnetmask; /* mask of subnet */
LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */
TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */
struct sockaddr_in ia_addr; /* reserve space for interface name */
@@ -162,14 +159,16 @@ do { \
#define IFP_TO_IA(ifp, ia) \
/* struct ifnet *ifp; */ \
/* struct in_ifaddr *ia; */ \
-{ \
+do { \
+ IN_IFADDR_RLOCK(); \
for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \
(ia) != NULL && (ia)->ia_ifp != (ifp); \
(ia) = TAILQ_NEXT((ia), ia_link)) \
continue; \
if ((ia) != NULL) \
ifa_ref(&(ia)->ia_ifa); \
-}
+ IN_IFADDR_RUNLOCK(); \
+} while (0)
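
The switch to a do { ... } while (0) wrapper is the usual multi-statement-macro guard; a minimal illustration of the if/else case it protects against is sketched below (ifp and ia are placeholders).

/*
 * Sketch only: with a bare { ... } block, the semicolon after the macro
 * below would end the if statement and orphan the else branch; the
 * do { ... } while (0) form expands to a single statement and stays legal.
 */
if (ifp != NULL)
	IFP_TO_IA(ifp, ia);
else
	ia = NULL;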
#endif
/*
diff --git a/freebsd/sys/netinet/ip.h b/freebsd/sys/netinet/ip.h
index 6c9482f9..79afeb8f 100644
--- a/freebsd/sys/netinet/ip.h
+++ b/freebsd/sys/netinet/ip.h
@@ -48,11 +48,11 @@
*/
struct ip {
#if BYTE_ORDER == LITTLE_ENDIAN
- u_int ip_hl:4, /* header length */
+ u_char ip_hl:4, /* header length */
ip_v:4; /* version */
#endif
#if BYTE_ORDER == BIG_ENDIAN
- u_int ip_v:4, /* version */
+ u_char ip_v:4, /* version */
ip_hl:4; /* header length */
#endif
u_char ip_tos; /* type of service */
@@ -167,11 +167,11 @@ struct ip_timestamp {
u_char ipt_len; /* size of structure (variable) */
u_char ipt_ptr; /* index of current entry */
#if BYTE_ORDER == LITTLE_ENDIAN
- u_int ipt_flg:4, /* flags, see below */
+ u_char ipt_flg:4, /* flags, see below */
ipt_oflw:4; /* overflow counter */
#endif
#if BYTE_ORDER == BIG_ENDIAN
- u_int ipt_oflw:4, /* overflow counter */
+ u_char ipt_oflw:4, /* overflow counter */
ipt_flg:4; /* flags, see below */
#endif
union ipt_timestamp {
diff --git a/freebsd/sys/netinet/ip6.h b/freebsd/sys/netinet/ip6.h
index 3fb08a78..8f498410 100644
--- a/freebsd/sys/netinet/ip6.h
+++ b/freebsd/sys/netinet/ip6.h
@@ -263,7 +263,7 @@ struct ip6_frag {
/*
* IP6_EXTHDR_CHECK ensures that region between the IP6 header and the
* target header (including IPv6 itself, extension headers and
- * TCP/UDP/ICMP6 headers) are continuous. KAME requires drivers
+ * TCP/UDP/ICMP6 headers) are contiguous. KAME requires drivers
* to store incoming data into one internal mbuf or one or more external
* mbufs(never into two or more internal mbufs). Thus, the third case is
* supposed to never be matched but is prepared just in case.
@@ -275,24 +275,24 @@ do { \
if (((m)->m_flags & M_LOOP) && \
((m)->m_len < (off) + (hlen)) && \
(((m) = m_pullup((m), (off) + (hlen))) == NULL)) { \
- V_ip6stat.ip6s_exthdrtoolong++; \
+ IP6STAT_INC(ip6s_exthdrtoolong); \
return ret; \
} else if ((m)->m_flags & M_EXT) { \
if ((m)->m_len < (off) + (hlen)) { \
- V_ip6stat.ip6s_exthdrtoolong++; \
+ IP6STAT_INC(ip6s_exthdrtoolong); \
m_freem(m); \
return ret; \
} \
} else { \
if ((m)->m_len < (off) + (hlen)) { \
- V_ip6stat.ip6s_exthdrtoolong++; \
+ IP6STAT_INC(ip6s_exthdrtoolong); \
m_freem(m); \
return ret; \
} \
} \
} else { \
if ((m)->m_len < (off) + (hlen)) { \
- V_ip6stat.ip6s_tooshort++; \
+ IP6STAT_INC(ip6s_tooshort); \
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); \
m_freem(m); \
return ret; \
diff --git a/freebsd/sys/netinet/ip_carp.c b/freebsd/sys/netinet/ip_carp.c
index a08c3fb8..a34c10c3 100644
--- a/freebsd/sys/netinet/ip_carp.c
+++ b/freebsd/sys/netinet/ip_carp.c
@@ -68,14 +68,19 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/vnet.h>
-#ifdef INET
+#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/in_var.h>
-#include <netinet/in_systm.h>
+#include <netinet/ip_carp.h>
#include <netinet/ip.h>
+
+#include <machine/in_cksum.h>
+#endif
+
+#ifdef INET
+#include <netinet/in_systm.h>
#include <netinet/ip_var.h>
#include <netinet/if_ether.h>
-#include <machine/in_cksum.h>
#endif
#ifdef INET6
@@ -84,11 +89,11 @@ __FBSDID("$FreeBSD$");
#include <netinet6/ip6protosw.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
+#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#endif
#include <crypto/sha1.h>
-#include <netinet/ip_carp.h>
#define CARP_IFNAME "carp"
static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces");
@@ -98,7 +103,9 @@ struct carp_softc {
struct ifnet *sc_ifp; /* Interface clue */
struct ifnet *sc_carpdev; /* Pointer to parent interface */
struct in_ifaddr *sc_ia; /* primary iface address */
+#ifdef INET
struct ip_moptions sc_imo;
+#endif
#ifdef INET6
struct in6_ifaddr *sc_ia6; /* primary iface address v6 */
struct ip6_moptions sc_im6o;
@@ -208,7 +215,9 @@ static int carp_prepare_ad(struct mbuf *, struct carp_softc *,
static void carp_send_ad_all(void);
static void carp_send_ad(void *);
static void carp_send_ad_locked(struct carp_softc *);
+#ifdef INET
static void carp_send_arp(struct carp_softc *);
+#endif
static void carp_master_down(void *);
static void carp_master_down_locked(struct carp_softc *);
static int carp_ioctl(struct ifnet *, u_long, caddr_t);
@@ -217,12 +226,16 @@ static int carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *,
static void carp_start(struct ifnet *);
static void carp_setrun(struct carp_softc *, sa_family_t);
static void carp_set_state(struct carp_softc *, int);
+#ifdef INET
static int carp_addrcount(struct carp_if *, struct in_ifaddr *, int);
+#endif
enum { CARP_COUNT_MASTER, CARP_COUNT_RUNNING };
+#ifdef INET
static void carp_multicast_cleanup(struct carp_softc *, int dofree);
static int carp_set_addr(struct carp_softc *, struct sockaddr_in *);
static int carp_del_addr(struct carp_softc *, struct sockaddr_in *);
+#endif
static void carp_carpdev_state_locked(struct carp_if *);
static void carp_sc_state_locked(struct carp_softc *);
#ifdef INET6
@@ -371,6 +384,7 @@ carp_setroute(struct carp_softc *sc, int cmd)
s = splnet();
TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
+#ifdef INET
if (ifa->ifa_addr->sa_family == AF_INET &&
sc->sc_carpdev != NULL) {
int count = carp_addrcount(
@@ -381,6 +395,7 @@ carp_setroute(struct carp_softc *sc, int cmd)
(cmd == RTM_DELETE && count == 0))
rtinit(ifa, cmd, RTF_UP | RTF_HOST);
}
+#endif
}
splx(s);
}
@@ -406,12 +421,14 @@ carp_clone_create(struct if_clone *ifc, int unit, caddr_t params)
sc->sc_advskew = 0;
sc->sc_init_counter = 1;
sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? */
+#ifdef INET
sc->sc_imo.imo_membership = (struct in_multi **)malloc(
(sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
M_WAITOK);
sc->sc_imo.imo_mfilters = NULL;
sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
sc->sc_imo.imo_multicast_vif = -1;
+#endif
#ifdef INET6
sc->sc_im6o.im6o_membership = (struct in6_multi **)malloc(
(sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP,
@@ -458,7 +475,9 @@ carp_clone_destroy(struct ifnet *ifp)
bpfdetach(ifp);
if_detach(ifp);
if_free_type(ifp, IFT_ETHER);
+#ifdef INET
free(sc->sc_imo.imo_membership, M_CARP);
+#endif
#ifdef INET6
free(sc->sc_im6o.im6o_membership, M_CARP);
#endif
@@ -497,7 +516,9 @@ carpdetach(struct carp_softc *sc, int unlock)
carp_set_state(sc, INIT);
SC2IFP(sc)->if_flags &= ~IFF_UP;
carp_setrun(sc, 0);
+#ifdef INET
carp_multicast_cleanup(sc, unlock);
+#endif
#ifdef INET6
carp_multicast6_cleanup(sc, unlock);
#endif
@@ -542,6 +563,7 @@ carp_ifdetach(void *arg __unused, struct ifnet *ifp)
* we have rearranged checks order compared to the rfc,
* but it seems more efficient this way or not possible otherwise.
*/
+#ifdef INET
void
carp_input(struct mbuf *m, int hlen)
{
@@ -632,6 +654,7 @@ carp_input(struct mbuf *m, int hlen)
carp_input_c(m, ch, AF_INET);
}
+#endif
#ifdef INET6
int
@@ -722,12 +745,16 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
SC2IFP(sc)->if_ibytes += m->m_pkthdr.len;
if (bpf_peers_present(SC2IFP(sc)->if_bpf)) {
- struct ip *ip = mtod(m, struct ip *);
uint32_t af1 = af;
+#ifdef INET
+ struct ip *ip = mtod(m, struct ip *);
/* BPF wants net byte order */
- ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2));
- ip->ip_off = htons(ip->ip_off);
+ if (af == AF_INET) {
+ ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2));
+ ip->ip_off = htons(ip->ip_off);
+ }
+#endif
bpf_mtap2(SC2IFP(sc)->if_bpf, &af1, sizeof(af1), m);
}
@@ -1083,6 +1110,7 @@ carp_send_ad_locked(struct carp_softc *sc)
}
+#ifdef INET
/*
* Broadcast a gratuitous ARP request containing
* the virtual router MAC address for each IP address
@@ -1104,6 +1132,7 @@ carp_send_arp(struct carp_softc *sc)
DELAY(1000); /* XXX */
}
}
+#endif
#ifdef INET6
static void
@@ -1126,6 +1155,7 @@ carp_send_na(struct carp_softc *sc)
}
#endif /* INET6 */
+#ifdef INET
static int
carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type)
{
@@ -1229,6 +1259,7 @@ carp_iamatch(struct ifnet *ifp, struct in_ifaddr *ia,
CARP_UNLOCK(cif);
return (0);
}
+#endif
#ifdef INET6
struct ifaddr *
@@ -1355,7 +1386,9 @@ carp_master_down_locked(struct carp_softc *sc)
case BACKUP:
carp_set_state(sc, MASTER);
carp_send_ad_locked(sc);
+#ifdef INET
carp_send_arp(sc);
+#endif
#ifdef INET6
carp_send_na(sc);
#endif /* INET6 */
@@ -1434,6 +1467,7 @@ carp_setrun(struct carp_softc *sc, sa_family_t af)
}
}
+#ifdef INET
static void
carp_multicast_cleanup(struct carp_softc *sc, int dofree)
{
@@ -1453,6 +1487,7 @@ carp_multicast_cleanup(struct carp_softc *sc, int dofree)
imo->imo_num_memberships = 0;
imo->imo_multicast_ifp = NULL;
}
+#endif
#ifdef INET6
static void
@@ -1475,6 +1510,7 @@ carp_multicast6_cleanup(struct carp_softc *sc, int dofree)
}
#endif
+#ifdef INET
static int
carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin)
{
@@ -1651,6 +1687,7 @@ carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin)
return (error);
}
+#endif
#ifdef INET6
static int
@@ -2351,13 +2388,13 @@ carp_mod_load(void)
printf("carp: error %d attaching to PF_INET6\n",
proto_reg[CARP_INET6]);
carp_mod_cleanup();
- return (EINVAL);
+ return (proto_reg[CARP_INET6]);
}
err = ip6proto_register(IPPROTO_CARP);
if (err) {
printf("carp: error %d registering with INET6\n", err);
carp_mod_cleanup();
- return (EINVAL);
+ return (err);
}
#endif
#ifdef INET
@@ -2367,13 +2404,13 @@ carp_mod_load(void)
printf("carp: error %d attaching to PF_INET\n",
proto_reg[CARP_INET]);
carp_mod_cleanup();
- return (EINVAL);
+ return (proto_reg[CARP_INET]);
}
err = ipproto_register(IPPROTO_CARP);
if (err) {
printf("carp: error %d registering with INET\n", err);
carp_mod_cleanup();
- return (EINVAL);
+ return (err);
}
#endif
return 0;
diff --git a/freebsd/sys/netinet/ip_divert.c b/freebsd/sys/netinet/ip_divert.c
index 5fb32ba6..879f411f 100644
--- a/freebsd/sys/netinet/ip_divert.c
+++ b/freebsd/sys/netinet/ip_divert.c
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
#error "IPDIVERT requires INET."
#endif
#endif
+#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/sys/param.h>
#include <sys/kernel.h>
@@ -50,20 +51,13 @@ __FBSDID("$FreeBSD$");
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
-#include <sys/rwlock.h>
-#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
-#include <sys/sx.h>
#include <sys/sysctl.h>
-#include <sys/systm.h>
-
-#include <vm/uma.h>
+#include <net/vnet.h>
#include <net/if.h>
#include <net/netisr.h>
-#include <net/route.h>
-#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
@@ -71,6 +65,10 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#endif
#ifdef SCTP
#include <netinet/sctp_crc32.h>
#endif
@@ -156,35 +154,21 @@ static void
div_init(void)
{
- INP_INFO_LOCK_INIT(&V_divcbinfo, "div");
- LIST_INIT(&V_divcb);
- V_divcbinfo.ipi_listhead = &V_divcb;
-#ifdef VIMAGE
- V_divcbinfo.ipi_vnet = curvnet;
-#endif
/*
- * XXX We don't use the hash list for divert IP, but it's easier
- * to allocate a one entry hash list than it is to check all
- * over the place for hashbase == NULL.
+ * XXX We don't use the hash list for divert IP, but it's easier to
+ * allocate one-entry hash lists than it is to check all over the
+ * place for hashbase == NULL.
*/
- V_divcbinfo.ipi_hashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_hashmask);
- V_divcbinfo.ipi_porthashbase = hashinit(1, M_PCB,
- &V_divcbinfo.ipi_porthashmask);
- V_divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb),
- NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR,
- UMA_ZONE_NOFREE);
- uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets);
+ in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb",
+ div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE,
+ IPI_HASHFIELDS_NONE);
}
static void
div_destroy(void)
{
- INP_INFO_LOCK_DESTROY(&V_divcbinfo);
- uma_zdestroy(V_divcbinfo.ipi_zone);
- hashdestroy(V_divcbinfo.ipi_hashbase, M_PCB, V_divcbinfo.ipi_hashmask);
- hashdestroy(V_divcbinfo.ipi_porthashbase, M_PCB,
- V_divcbinfo.ipi_porthashmask);
+ in_pcbinfo_destroy(&V_divcbinfo);
}
/*
@@ -335,10 +319,10 @@ static int
div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
struct mbuf *control)
{
+ struct ip *const ip = mtod(m, struct ip *);
struct m_tag *mtag;
struct ipfw_rule_ref *dt;
int error = 0;
- struct mbuf *options;
/*
	 * An mbuf may not have come from userland, but we pretend
@@ -390,71 +374,104 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
/* Reinject packet into the system as incoming or outgoing */
if (!sin || sin->sin_addr.s_addr == 0) {
- struct ip *const ip = mtod(m, struct ip *);
+ struct mbuf *options = NULL;
struct inpcb *inp;
dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT;
inp = sotoinpcb(so);
INP_RLOCK(inp);
- /*
- * Don't allow both user specified and setsockopt options,
- * and don't allow packet length sizes that will crash
- */
- if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) ||
- ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
- error = EINVAL;
- INP_RUNLOCK(inp);
- m_freem(m);
- } else {
+ switch (ip->ip_v) {
+ case IPVERSION:
+ /*
+ * Don't allow both user specified and setsockopt
+ * options, and don't allow packet length sizes that
+ * will crash.
+ */
+ if ((((ip->ip_hl << 2) != sizeof(struct ip)) &&
+ inp->inp_options != NULL) ||
+ ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
+ error = EINVAL;
+ INP_RUNLOCK(inp);
+ goto cantsend;
+ }
+
/* Convert fields to host order for ip_output() */
ip->ip_len = ntohs(ip->ip_len);
ip->ip_off = ntohs(ip->ip_off);
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ {
+ struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *);
+
+ /* Don't allow packet length sizes that will crash */
+ if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) {
+ error = EINVAL;
+ INP_RUNLOCK(inp);
+ goto cantsend;
+ }
- /* Send packet to output processing */
- KMOD_IPSTAT_INC(ips_rawout); /* XXX */
+ ip6->ip6_plen = ntohs(ip6->ip6_plen);
+ break;
+ }
+#endif
+ default:
+ error = EINVAL;
+ INP_RUNLOCK(inp);
+ goto cantsend;
+ }
+
+ /* Send packet to output processing */
+ KMOD_IPSTAT_INC(ips_rawout); /* XXX */
#ifdef MAC
- mac_inpcb_create_mbuf(inp, m);
+ mac_inpcb_create_mbuf(inp, m);
#endif
- /*
- * Get ready to inject the packet into ip_output().
- * Just in case socket options were specified on the
- * divert socket, we duplicate them. This is done
- * to avoid having to hold the PCB locks over the call
- * to ip_output(), as doing this results in a number of
- * lock ordering complexities.
- *
- * Note that we set the multicast options argument for
- * ip_output() to NULL since it should be invariant that
- * they are not present.
- */
- KASSERT(inp->inp_moptions == NULL,
- ("multicast options set on a divert socket"));
- options = NULL;
- /*
- * XXXCSJP: It is unclear to me whether or not it makes
- * sense for divert sockets to have options. However,
- * for now we will duplicate them with the INP locks
- * held so we can use them in ip_output() without
- * requring a reference to the pcb.
- */
- if (inp->inp_options != NULL) {
- options = m_dup(inp->inp_options, M_DONTWAIT);
- if (options == NULL)
- error = ENOBUFS;
- }
- INP_RUNLOCK(inp);
- if (error == ENOBUFS) {
- m_freem(m);
- return (error);
+ /*
+ * Get ready to inject the packet into ip_output().
+ * Just in case socket options were specified on the
+ * divert socket, we duplicate them. This is done
+ * to avoid having to hold the PCB locks over the call
+ * to ip_output(), as doing this results in a number of
+ * lock ordering complexities.
+ *
+ * Note that we set the multicast options argument for
+ * ip_output() to NULL since it should be invariant that
+ * they are not present.
+ */
+ KASSERT(inp->inp_moptions == NULL,
+ ("multicast options set on a divert socket"));
+ /*
+ * XXXCSJP: It is unclear to me whether or not it makes
+ * sense for divert sockets to have options. However,
+ * for now we will duplicate them with the INP locks
+ * held so we can use them in ip_output() without
+	 * requiring a reference to the pcb.
+ */
+ if (inp->inp_options != NULL) {
+ options = m_dup(inp->inp_options, M_NOWAIT);
+ if (options == NULL) {
+ INP_RUNLOCK(inp);
+ error = ENOBUFS;
+ goto cantsend;
}
+ }
+ INP_RUNLOCK(inp);
+
+ switch (ip->ip_v) {
+ case IPVERSION:
error = ip_output(m, options, NULL,
- ((so->so_options & SO_DONTROUTE) ?
- IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST |
- IP_RAWOUTPUT, NULL, NULL);
- if (options != NULL)
- m_freem(options);
+ ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0)
+ | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL);
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
+ break;
+#endif
}
+ if (options != NULL)
+ m_freem(options);
} else {
dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN;
if (m->m_pkthdr.rcvif == NULL) {
@@ -479,14 +496,26 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
mac_socket_create_mbuf(so, m);
#endif
/* Send packet to input processing via netisr */
- netisr_queue_src(NETISR_IP, (uintptr_t)so, m);
+ switch (ip->ip_v) {
+ case IPVERSION:
+ netisr_queue_src(NETISR_IP, (uintptr_t)so, m);
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m);
+ break;
+#endif
+ default:
+ error = EINVAL;
+ goto cantsend;
+ }
}
- return error;
+ return (error);
cantsend:
m_freem(m);
- return error;
+ return (error);
}
static int
@@ -554,7 +583,9 @@ div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
INP_INFO_WLOCK(&V_divcbinfo);
INP_WLOCK(inp);
+ INP_HASH_WLOCK(&V_divcbinfo);
error = in_pcbbind(inp, nam, td->td_ucred);
+ INP_HASH_WUNLOCK(&V_divcbinfo);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_divcbinfo);
return error;
@@ -683,9 +714,9 @@ div_pcblist(SYSCTL_HANDLER_ARGS)
INP_INFO_WLOCK(&V_divcbinfo);
for (i = 0; i < n; i++) {
inp = inp_list[i];
- INP_WLOCK(inp);
- if (!in_pcbrele(inp))
- INP_WUNLOCK(inp);
+ INP_RLOCK(inp);
+ if (!in_pcbrele_rlocked(inp))
+ INP_RUNLOCK(inp);
}
INP_INFO_WUNLOCK(&V_divcbinfo);
@@ -709,7 +740,8 @@ div_pcblist(SYSCTL_HANDLER_ARGS)
}
#ifdef SYSCTL_NODE
-SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT");
+static SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0,
+ "IPDIVERT");
SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD,
NULL, 0, div_pcblist, "S,xinpcb", "List of active divert sockets");
#endif
diff --git a/freebsd/sys/netinet/ip_dummynet.h b/freebsd/sys/netinet/ip_dummynet.h
index 0bbc3263..dc2c3412 100644
--- a/freebsd/sys/netinet/ip_dummynet.h
+++ b/freebsd/sys/netinet/ip_dummynet.h
@@ -87,14 +87,14 @@ enum {
DN_SYSCTL_SET,
DN_LAST,
-} ;
+};
enum { /* subtype for schedulers, flowset and the like */
DN_SCHED_UNKNOWN = 0,
DN_SCHED_FIFO = 1,
DN_SCHED_WF2QP = 2,
/* others are in individual modules */
-} ;
+};
enum { /* user flags */
DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */
@@ -113,16 +113,16 @@ enum { /* user flags */
struct dn_link {
struct dn_id oid;
- /*
+ /*
* Userland sets bw and delay in bits/s and milliseconds.
* The kernel converts this back and forth to bits/tick and ticks.
* XXX what about burst ?
- */
+ */
int32_t link_nr;
int bandwidth; /* bit/s or bits/tick. */
int delay; /* ms and ticks */
uint64_t burst; /* scaled. bits*Hz XXX */
-} ;
+};
/*
* A flowset, which is a template for flows. Contains parameters
@@ -132,13 +132,13 @@ struct dn_link {
*/
struct dn_fs {
struct dn_id oid;
- uint32_t fs_nr; /* the flowset number */
- uint32_t flags; /* userland flags */
- int qsize ; /* queue size in slots or bytes */
- int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */
+ uint32_t fs_nr; /* the flowset number */
+ uint32_t flags; /* userland flags */
+ int qsize; /* queue size in slots or bytes */
+ int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */
uint32_t buckets; /* buckets used for the queue hash table */
- struct ipfw_flow_id flow_mask ;
+ struct ipfw_flow_id flow_mask;
uint32_t sched_nr; /* the scheduler we attach to */
/* generic scheduler parameters. Leave them at -1 if unset.
* Now we use 0: weight, 1: lmax, 2: priority
@@ -149,14 +149,14 @@ struct dn_fs {
* weight and probabilities are in the range 0..1 represented
* in fixed point arithmetic with SCALE_RED decimal bits.
*/
-#define SCALE_RED 16
-#define SCALE(x) ( (x) << SCALE_RED )
-#define SCALE_VAL(x) ( (x) >> SCALE_RED )
-#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED )
- int w_q ; /* queue weight (scaled) */
- int max_th ; /* maximum threshold for queue (scaled) */
- int min_th ; /* minimum threshold for queue (scaled) */
- int max_p ; /* maximum value for p_b (scaled) */
+#define SCALE_RED 16
+#define SCALE(x) ( (x) << SCALE_RED )
+#define SCALE_VAL(x) ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED )
+ int w_q ; /* queue weight (scaled) */
+ int max_th ; /* maximum threshold for queue (scaled) */
+ int min_th ; /* minimum threshold for queue (scaled) */
+ int max_p ; /* maximum value for p_b (scaled) */
};
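
A quick worked example of the SCALE_RED fixed-point helpers above, to make the 0..1 representation concrete (the variable names are placeholders).

/*
 * Sketch only: SCALE_RED = 16, so 1.0 is stored as 1 << 16 = 65536.
 */
int w_q   = SCALE(1) / 2;			/* 0.5  -> 32768 */
int p_b   = SCALE_MUL(w_q, SCALE(1) / 4);	/* 0.5 * 0.25 = 0.125 -> 8192 */
int whole = SCALE_VAL(SCALE(3));		/* back to a plain integer: 3 */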
@@ -177,10 +177,10 @@ struct dn_flow {
};
- /*
+/*
* Scheduler template, mostly indicating the name, number,
* sched_mask and buckets.
- */
+ */
struct dn_sch {
struct dn_id oid;
uint32_t sched_nr; /* N, scheduler number */
@@ -199,14 +199,14 @@ struct dn_sch {
#define ED_MAX_SAMPLES_NO 1024
struct dn_profile {
struct dn_id oid;
- /* fields to simulate a delay profile */
+ /* fields to simulate a delay profile */
#define ED_MAX_NAME_LEN 32
- char name[ED_MAX_NAME_LEN];
- int link_nr;
- int loss_level;
- int bandwidth; // XXX use link bandwidth?
- int samples_no; /* actual length of samples[] */
- int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */
+ char name[ED_MAX_NAME_LEN];
+ int link_nr;
+ int loss_level;
+ int bandwidth; // XXX use link bandwidth?
+ int samples_no; /* actual len of samples[] */
+ int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */
};
diff --git a/freebsd/sys/netinet/ip_fastfwd.c b/freebsd/sys/netinet/ip_fastfwd.c
index 43f10ef9..863b9a16 100644
--- a/freebsd/sys/netinet/ip_fastfwd.c
+++ b/freebsd/sys/netinet/ip_fastfwd.c
@@ -153,8 +153,8 @@ ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m)
/*
* Try to forward a packet based on the destination address.
* This is a fast path optimized for the plain forwarding case.
- * If the packet is handled (and consumed) here then we return 1;
- * otherwise 0 is returned and the packet should be delivered
+ * If the packet is handled (and consumed) here then we return NULL;
+ * otherwise mbuf is returned and the packet should be delivered
* to ip_input for full processing.
*/
struct mbuf *
@@ -169,9 +169,7 @@ ip_fastforward(struct mbuf *m)
u_short sum, ip_len;
int error = 0;
int hlen, mtu;
-#ifdef IPFIREWALL_FORWARD
- struct m_tag *fwd_tag;
-#endif
+ struct m_tag *fwd_tag = NULL;
/*
* Are we active and forwarding packets?
@@ -380,14 +378,13 @@ ip_fastforward(struct mbuf *m)
* Go on with new destination address
*/
}
-#ifdef IPFIREWALL_FORWARD
+
if (m->m_flags & M_FASTFWD_OURS) {
/*
* ipfw changed it for a local address on this host.
*/
goto forwardlocal;
}
-#endif /* IPFIREWALL_FORWARD */
passin:
/*
@@ -457,20 +454,13 @@ passin:
/*
* Destination address changed?
*/
-#ifndef IPFIREWALL_FORWARD
- if (odest.s_addr != dest.s_addr) {
-#else
- fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
+ if (m->m_flags & M_IP_NEXTHOP)
+ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
if (odest.s_addr != dest.s_addr || fwd_tag != NULL) {
-#endif /* IPFIREWALL_FORWARD */
/*
* Is it now for a local address on this host?
*/
-#ifndef IPFIREWALL_FORWARD
- if (in_localip(dest)) {
-#else
if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) {
-#endif /* IPFIREWALL_FORWARD */
forwardlocal:
/*
* Return packet for processing by ip_input().
@@ -485,13 +475,12 @@ forwardlocal:
/*
* Redo route lookup with new destination address
*/
-#ifdef IPFIREWALL_FORWARD
if (fwd_tag) {
dest.s_addr = ((struct sockaddr_in *)
(fwd_tag + 1))->sin_addr.s_addr;
m_tag_delete(m, fwd_tag);
+ m->m_flags &= ~M_IP_NEXTHOP;
}
-#endif /* IPFIREWALL_FORWARD */
RTFREE(ro.ro_rt);
if ((dst = ip_findroute(&ro, dest, m)) == NULL)
return NULL; /* icmp unreach already sent */
diff --git a/freebsd/sys/netinet/ip_fw.h b/freebsd/sys/netinet/ip_fw.h
index 69311a79..14b08f5e 100644
--- a/freebsd/sys/netinet/ip_fw.h
+++ b/freebsd/sys/netinet/ip_fw.h
@@ -211,12 +211,20 @@ enum ipfw_opcodes { /* arguments (4 byte each) */
O_SETFIB, /* arg1=FIB number */
O_FIB, /* arg1=FIB desired fib number */
+
+ O_SOCKARG, /* socket argument */
O_CALLRETURN, /* arg1=called rule number */
+ O_FORWARD_IP6, /* fwd sockaddr_in6 */
+
+ O_DSCP, /* 2 u32 = DSCP mask */
+ O_SETDSCP, /* arg1=DSCP value */
+
O_LAST_OPCODE /* not an opcode! */
};
+
/*
* The extension header are filtered only for presence using a bit
* vector with a flag for each header.
@@ -309,6 +317,14 @@ typedef struct _ipfw_insn_sa {
} ipfw_insn_sa;
/*
+ * This is used to forward to a given address (ipv6).
+ */
+typedef struct _ipfw_insn_sa6 {
+ ipfw_insn o;
+ struct sockaddr_in6 sa;
+} ipfw_insn_sa6;
+
+/*
* This is used for MAC addr-mask pairs.
*/
typedef struct _ipfw_insn_mac {
diff --git a/freebsd/sys/netinet/ip_gre.c b/freebsd/sys/netinet/ip_gre.c
index 0fc1770f..25c9698e 100644
--- a/freebsd/sys/netinet/ip_gre.c
+++ b/freebsd/sys/netinet/ip_gre.c
@@ -19,13 +19,6 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the NetBSD
- * Foundation, Inc. and its contributors.
- * 4. Neither the name of The NetBSD Foundation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
@@ -214,6 +207,11 @@ gre_input2(struct mbuf *m ,int hlen, u_char proto)
bpf_mtap2(GRE2IFP(sc)->if_bpf, &af, sizeof(af), m);
}
+ if ((GRE2IFP(sc)->if_flags & IFF_MONITOR) != 0) {
+ m_freem(m);
+ return(NULL);
+ }
+
m->m_pkthdr.rcvif = GRE2IFP(sc);
netisr_queue(isr, m);
@@ -298,6 +296,11 @@ gre_mobile_input(struct mbuf *m, int hlen)
bpf_mtap2(GRE2IFP(sc)->if_bpf, &af, sizeof(af), m);
}
+ if ((GRE2IFP(sc)->if_flags & IFF_MONITOR) != 0) {
+ m_freem(m);
+ return;
+ }
+
m->m_pkthdr.rcvif = GRE2IFP(sc);
netisr_queue(NETISR_IP, m);
diff --git a/freebsd/sys/netinet/ip_gre.h b/freebsd/sys/netinet/ip_gre.h
index 1fb67d93..d2f3866a 100644
--- a/freebsd/sys/netinet/ip_gre.h
+++ b/freebsd/sys/netinet/ip_gre.h
@@ -16,13 +16,6 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the NetBSD
- * Foundation, Inc. and its contributors.
- * 4. Neither the name of The NetBSD Foundation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
diff --git a/freebsd/sys/netinet/ip_icmp.c b/freebsd/sys/netinet/ip_icmp.c
index 728e57ec..b003d03f 100644
--- a/freebsd/sys/netinet/ip_icmp.c
+++ b/freebsd/sys/netinet/ip_icmp.c
@@ -34,6 +34,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_ipsec.h>
#include <rtems/bsd/sys/param.h>
@@ -64,6 +65,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcpip.h>
#include <netinet/icmp_var.h>
+#ifdef INET
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
@@ -72,12 +74,26 @@ __FBSDID("$FreeBSD$");
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
+#endif /* INET */
/*
* ICMP routines: error generation, receive packet processing, and
* routines to turnaround packets back to the originator, and
* host table maintenance routines.
*/
+static VNET_DEFINE(int, icmplim) = 200;
+#define V_icmplim VNET(icmplim)
+SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW,
+ &VNET_NAME(icmplim), 0,
+ "Maximum number of ICMP responses per second");
+
+static VNET_DEFINE(int, icmplim_output) = 1;
+#define V_icmplim_output VNET(icmplim_output)
+SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW,
+ &VNET_NAME(icmplim_output), 0,
+ "Enable rate limiting of ICMP responses");
+
+#ifdef INET
VNET_DEFINE(struct icmpstat, icmpstat);
SYSCTL_VNET_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW,
&VNET_NAME(icmpstat), icmpstat, "");
@@ -102,18 +118,6 @@ SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW,
&VNET_NAME(log_redirect), 0,
"Log ICMP redirects to the console");
-static VNET_DEFINE(int, icmplim) = 200;
-#define V_icmplim VNET(icmplim)
-SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW,
- &VNET_NAME(icmplim), 0,
- "Maximum number of ICMP responses per second");
-
-static VNET_DEFINE(int, icmplim_output) = 1;
-#define V_icmplim_output VNET(icmplim_output)
-SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW,
- &VNET_NAME(icmplim_output), 0,
- "Enable rate limiting of ICMP responses");
-
static VNET_DEFINE(char, reply_src[IFNAMSIZ]);
#define V_reply_src VNET(reply_src)
SYSCTL_VNET_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_RW,
@@ -702,6 +706,8 @@ icmp_reflect(struct mbuf *m)
goto done; /* Ip_output() will check for broadcast */
}
+ m_addr_changed(m);
+
t = ip->ip_dst;
ip->ip_dst = ip->ip_src;
@@ -953,6 +959,7 @@ ip_next_mtu(int mtu, int dir)
}
return 0;
}
+#endif /* INET */
/*
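
The icmplim and icmplim_output sysctls above are only moved outside the INET-specific block; their names are unchanged. A small userland sketch that reads the rate limit, assuming the usual net.inet.icmp.icmplim name derived from ICMPCTL_ICMPLIM:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int lim;
	size_t len = sizeof(lim);

	if (sysctlbyname("net.inet.icmp.icmplim", &lim, &len, NULL, 0) != 0) {
		perror("sysctlbyname");
		return (1);
	}
	printf("ICMP responses per second: %d\n", lim);
	return (0);
}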
diff --git a/freebsd/sys/netinet/ip_input.c b/freebsd/sys/netinet/ip_input.c
index b1154c79..2dbb2a7a 100644
--- a/freebsd/sys/netinet/ip_input.c
+++ b/freebsd/sys/netinet/ip_input.c
@@ -42,7 +42,6 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
-#include <sys/callout.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/domain.h>
@@ -104,11 +103,6 @@ SYSCTL_VNET_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW,
&VNET_NAME(ipsendredirects), 0,
"Enable sending IP redirects");
-VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
-SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW,
- &VNET_NAME(ip_defttl), 0,
- "Maximum TTL on IP packets");
-
static VNET_DEFINE(int, ip_keepfaith);
#define V_ip_keepfaith VNET(ip_keepfaith)
SYSCTL_VNET_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW,
@@ -196,8 +190,6 @@ SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW,
&VNET_NAME(maxfragsperpacket), 0,
"Maximum number of IPv4 fragments allowed per packet");
-struct callout ipport_tick_callout;
-
#ifdef IPCTL_DEFMTU
SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
&ip_mtu, 0, "Default MTU");
@@ -220,8 +212,6 @@ SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN,
"number of entries in the per-cpu output flow caches");
#endif
-VNET_DEFINE(int, fw_one_pass) = 1;
-
static void ip_freef(struct ipqhead *, struct ipq *);
/*
@@ -356,11 +346,6 @@ ip_init(void)
ip_protox[pr->pr_protocol] = pr - inetsw;
}
- /* Start ipport_tick. */
- callout_init(&ipport_tick_callout, CALLOUT_MPSAFE);
- callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
- EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
- SHUTDOWN_PRI_DEFAULT);
EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change,
NULL, EVENTHANDLER_PRI_ANY);
@@ -385,13 +370,6 @@ ip_destroy(void)
}
#endif
-void
-ip_fini(void *xtp)
-{
-
- callout_stop(&ipport_tick_callout);
-}
-
/*
* Ip input routine. Checksum and byte swap header. If fragmented
* try to reassemble. Process options. Pass to next level.
@@ -540,22 +518,22 @@ tooshort:
dchg = (odst.s_addr != ip->ip_dst.s_addr);
ifp = m->m_pkthdr.rcvif;
-#ifdef IPFIREWALL_FORWARD
if (m->m_flags & M_FASTFWD_OURS) {
m->m_flags &= ~M_FASTFWD_OURS;
goto ours;
}
- if ((dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL)) != 0) {
- /*
- * Directly ship the packet on. This allows forwarding
- * packets originally destined to us to some other directly
- * connected host.
- */
- ip_forward(m, dchg);
- return;
+ if (m->m_flags & M_IP_NEXTHOP) {
+ dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL);
+ if (dchg != 0) {
+ /*
+ * Directly ship the packet on. This allows
+ * forwarding packets originally destined to us
+ * to some other directly connected host.
+ */
+ ip_forward(m, 1);
+ return;
+ }
}
-#endif /* IPFIREWALL_FORWARD */
-
passin:
/*
* Process options and, if not destined for us,
@@ -646,11 +624,6 @@ passin:
IF_ADDR_RUNLOCK(ifp);
goto ours;
}
- if (ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) {
- ifa_ref(ifa);
- IF_ADDR_RUNLOCK(ifp);
- goto ours;
- }
#ifdef BOOTP_COMPAT
if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
ifa_ref(ifa);
@@ -1524,8 +1497,7 @@ ip_forward(struct mbuf *m, int srcrt)
if (error == EMSGSIZE && ro.ro_rt)
mtu = ro.ro_rt->rt_rmx.rmx_mtu;
- if (ro.ro_rt)
- RTFREE(ro.ro_rt);
+ RO_RTFREE(&ro);
if (error)
IPSTAT_INC(ips_cantforward);
diff --git a/freebsd/sys/netinet/ip_ipsec.c b/freebsd/sys/netinet/ip_ipsec.c
index 35ea9cd5..f3516f1c 100644
--- a/freebsd/sys/netinet/ip_ipsec.c
+++ b/freebsd/sys/netinet/ip_ipsec.c
@@ -262,8 +262,7 @@ ip_ipsec_mtu(struct mbuf *m, int mtu)
* -1 = packet was reinjected and stop processing packet
*/
int
-ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error,
- struct ifnet **ifp)
+ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error)
{
#ifdef IPSEC
struct secpolicy *sp = NULL;
@@ -392,20 +391,6 @@ ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error,
} else {
/* No IPsec processing for this packet. */
}
-#ifdef notyet
- /*
- * If deferred crypto processing is needed, check that
- * the interface supports it.
- */
- mtag = m_tag_find(*m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL);
- if (mtag != NULL && ifp != NULL &&
- ((*ifp)->if_capenable & IFCAP_IPSEC) == 0) {
- /* notify IPsec to do its own crypto */
- ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
- *error = EHOSTUNREACH;
- goto bad;
- }
-#endif
}
done:
if (sp != NULL)
diff --git a/freebsd/sys/netinet/ip_ipsec.h b/freebsd/sys/netinet/ip_ipsec.h
index 31bc86a1..2870c114 100644
--- a/freebsd/sys/netinet/ip_ipsec.h
+++ b/freebsd/sys/netinet/ip_ipsec.h
@@ -36,6 +36,5 @@ int ip_ipsec_filtertunnel(struct mbuf *);
int ip_ipsec_fwd(struct mbuf *);
int ip_ipsec_input(struct mbuf *);
int ip_ipsec_mtu(struct mbuf *, int);
-int ip_ipsec_output(struct mbuf **, struct inpcb *, int *, int *,
- struct ifnet **);
+int ip_ipsec_output(struct mbuf **, struct inpcb *, int *, int *);
#endif
diff --git a/freebsd/sys/netinet/ip_mroute.c b/freebsd/sys/netinet/ip_mroute.c
index 18419a74..6fc5cc68 100644
--- a/freebsd/sys/netinet/ip_mroute.c
+++ b/freebsd/sys/netinet/ip_mroute.c
@@ -116,8 +116,6 @@ __FBSDID("$FreeBSD$");
#include <machine/in_cksum.h>
-#include <security/mac/mac_framework.h>
-
#ifndef KTR_IPMF
#define KTR_IPMF KTR_INET
#endif
@@ -928,7 +926,6 @@ add_vif(struct vifctl *vifcp)
vifp->v_pkt_out = 0;
vifp->v_bytes_in = 0;
vifp->v_bytes_out = 0;
- bzero(&vifp->v_route, sizeof(vifp->v_route));
/* Adjust numvifs up if the vifi is higher than numvifs */
if (V_numvifs <= vifcp->vifc_vifi)
@@ -1036,6 +1033,8 @@ expire_mfc(struct mfc *rt)
{
struct rtdetq *rte, *nrte;
+ MFC_LOCK_ASSERT();
+
free_bw_list(rt->mfc_bw_meter);
TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) {
@@ -1704,7 +1703,7 @@ send_packet(struct vif *vifp, struct mbuf *m)
* should get rejected because they appear to come from
* the loopback interface, thus preventing looping.
*/
- error = ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, &imo, NULL);
+ error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL);
CTR3(KTR_IPMF, "%s: vif %td err %d", __func__,
(ptrdiff_t)(vifp - V_viftable), error);
}
@@ -2809,9 +2808,9 @@ out_locked:
return (error);
}
-SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, sysctl_mfctable,
- "IPv4 Multicast Forwarding Table (struct *mfc[mfchashsize], "
- "netinet/ip_mroute.h)");
+static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD,
+ sysctl_mfctable, "IPv4 Multicast Forwarding Table "
+ "(struct *mfc[mfchashsize], netinet/ip_mroute.h)");
static void
vnet_mroute_init(const void *unused __unused)
diff --git a/freebsd/sys/netinet/ip_mroute.h b/freebsd/sys/netinet/ip_mroute.h
index c54c75aa..e945b92c 100644
--- a/freebsd/sys/netinet/ip_mroute.h
+++ b/freebsd/sys/netinet/ip_mroute.h
@@ -262,7 +262,6 @@ struct vif {
u_long v_pkt_out; /* # pkts out on interface */
u_long v_bytes_in; /* # bytes in on interface */
u_long v_bytes_out; /* # bytes out on interface */
- struct route v_route; /* cached route */
};
#ifdef _KERNEL
diff --git a/freebsd/sys/netinet/ip_options.c b/freebsd/sys/netinet/ip_options.c
index 7b190bfd..98a8a2df 100644
--- a/freebsd/sys/netinet/ip_options.c
+++ b/freebsd/sys/netinet/ip_options.c
@@ -67,8 +67,6 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
-#include <security/mac/mac_framework.h>
-
static int ip_dosourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW,
&ip_dosourceroute, 0, "Enable forwarding source routed IP packets");
diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c
index 02dc7bdb..a70d3142 100644
--- a/freebsd/sys/netinet/ip_output.c
+++ b/freebsd/sys/netinet/ip_output.c
@@ -86,12 +86,6 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
-#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\
- x, (ntohl(a.s_addr)>>24)&0xFF,\
- (ntohl(a.s_addr)>>16)&0xFF,\
- (ntohl(a.s_addr)>>8)&0xFF,\
- (ntohl(a.s_addr))&0xFF, y);
-
VNET_DEFINE(u_short, ip_id);
#ifdef MBUF_STRESS_TEST
@@ -110,8 +104,13 @@ extern struct protosw inetsw[];
/*
* IP output. The packet in mbuf chain m contains a skeletal IP
* header (with len, off, ttl, proto, tos, src, dst).
+ * ip_len and ip_off are in host format.
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
+ * If route ro is present and has ro_rt initialized, route lookup would be
+ * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
+ * then result of route lookup is stored in ro->ro_rt.
+ *
* In the IP forwarding case, the packet will arrive with options already
* inserted, so must have a NULL opt pointer.
*/
@@ -124,17 +123,15 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
struct mbuf *m0;
int hlen = sizeof (struct ip);
int mtu;
- int len, error = 0;
- int nortfree = 0;
- struct sockaddr_in *dst = NULL; /* keep compiler happy */
- struct in_ifaddr *ia = NULL;
+ int n; /* scratchpad */
+ int error = 0;
+ struct sockaddr_in *dst;
+ struct in_ifaddr *ia;
int isbroadcast, sw_csum;
struct route iproute;
struct rtentry *rte; /* cache for ro->ro_rt */
struct in_addr odst;
-#ifdef IPFIREWALL_FORWARD
struct m_tag *fwd_tag = NULL;
-#endif
#ifdef IPSEC
int no_route_but_check_spd = 0;
#endif
@@ -152,30 +149,29 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
if (ro == NULL) {
ro = &iproute;
bzero(ro, sizeof (*ro));
+ }
#ifdef FLOWTABLE
- {
- struct flentry *fle;
+ if (ro->ro_rt == NULL) {
+ struct flentry *fle;
- /*
- * The flow table returns route entries valid for up to 30
- * seconds; we rely on the remainder of ip_output() taking no
- * longer than that long for the stability of ro_rt. The
- * flow ID assignment must have happened before this point.
- */
- if ((fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET)) != NULL) {
- flow_to_route(fle, ro);
- nortfree = 1;
- }
- }
-#endif
+ /*
+ * The flow table returns route entries valid for up to 30
+ * seconds; we rely on the remainder of ip_output() taking no
+ * longer than that long for the stability of ro_rt. The
+ * flow ID assignment must have happened before this point.
+ */
+ fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET);
+ if (fle != NULL)
+ flow_to_route(fle, ro);
}
+#endif
if (opt) {
- len = 0;
+ int len = 0;
m = ip_insertoptions(m, opt, &len);
if (len != 0)
- hlen = len;
+ hlen = len; /* ip->ip_hl is updated above */
}
ip = mtod(m, struct ip *);
@@ -196,11 +192,13 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
ip->ip_id = ip_newid();
IPSTAT_INC(ips_localout);
} else {
+ /* Header already set, fetch hlen from there */
hlen = ip->ip_hl << 2;
}
again:
dst = (struct sockaddr_in *)&ro->ro_dst;
+ ia = NULL;
/*
* If there is a cached route,
* check that it is to the same destination
@@ -214,16 +212,11 @@ again:
!RT_LINK_IS_UP(rte->rt_ifp) ||
dst->sin_family != AF_INET ||
dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
- if (!nortfree)
- RTFREE(rte);
- rte = ro->ro_rt = (struct rtentry *)NULL;
- ro->ro_lle = (struct llentry *)NULL;
+ RO_RTFREE(ro);
+ ro->ro_lle = NULL;
+ rte = NULL;
}
-#ifdef IPFIREWALL_FORWARD
if (rte == NULL && fwd_tag == NULL) {
-#else
- if (rte == NULL) {
-#endif
bzero(dst, sizeof(*dst));
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
@@ -328,6 +321,9 @@ again:
} else {
mtu = ifp->if_mtu;
}
+ /* Catch a possible divide by zero later. */
+ KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p",
+ __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp));
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
m->m_flags |= M_MCAST;
/*
@@ -441,18 +437,15 @@ again:
* packet or packet fragments, unless ALTQ is enabled on the given
* interface in which case packetdrop should be done by queueing.
*/
+ n = ip->ip_len / mtu + 1; /* how many fragments ? */
+ if (
#ifdef ALTQ
- if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
- ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
- ifp->if_snd.ifq_maxlen))
-#else
- if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
- ifp->if_snd.ifq_maxlen)
+ (!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
#endif /* ALTQ */
- {
+ (ifp->if_snd.ifq_len + n) >= ifp->if_snd.ifq_maxlen ) {
error = ENOBUFS;
IPSTAT_INC(ips_odropped);
- ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
+ ifp->if_snd.ifq_drops += n;
goto bad;
}
@@ -482,7 +475,7 @@ again:
sendit:
#ifdef IPSEC
- switch(ip_ipsec_output(&m, inp, &flags, &error, &ifp)) {
+ switch(ip_ipsec_output(&m, inp, &flags, &error)) {
case 1:
goto bad;
case -1:
@@ -537,11 +530,13 @@ sendit:
#endif
error = netisr_queue(NETISR_IP, m);
goto done;
- } else
+ } else {
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
goto again; /* Redo the routing table lookup. */
+ }
}
-#ifdef IPFIREWALL_FORWARD
/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
if (m->m_flags & M_FASTFWD_OURS) {
if (m->m_pkthdr.rcvif == NULL)
@@ -562,15 +557,17 @@ sendit:
goto done;
}
/* Or forward to some other address? */
- fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
- if (fwd_tag) {
+ if ((m->m_flags & M_IP_NEXTHOP) &&
+ (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
dst = (struct sockaddr_in *)&ro->ro_dst;
bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
m->m_flags |= M_SKIP_FIREWALL;
+ m->m_flags &= ~M_IP_NEXTHOP;
m_tag_delete(m, fwd_tag);
+ if (ia != NULL)
+ ifa_free(&ia->ia_ifa);
goto again;
}
-#endif /* IPFIREWALL_FORWARD */
passout:
/* 127/8 must not appear on wire - RFC1122. */
@@ -677,9 +674,8 @@ passout:
IPSTAT_INC(ips_fragmented);
done:
- if (ro == &iproute && ro->ro_rt && !nortfree) {
- RTFREE(ro->ro_rt);
- }
+ if (ro == &iproute)
+ RO_RTFREE(ro);
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (error);
@@ -725,14 +721,12 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
* If the interface will not calculate checksums on
* fragmented packets, then do it here.
*/
- if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
- (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
+ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
#ifdef SCTP
- if (m0->m_pkthdr.csum_flags & CSUM_SCTP &&
- (if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
+ if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
sctp_delayed_cksum(m0, hlen);
m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
@@ -900,12 +894,40 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
error = optval = 0;
if (sopt->sopt_level != IPPROTO_IP) {
- if ((sopt->sopt_level == SOL_SOCKET) &&
- (sopt->sopt_name == SO_SETFIB)) {
- inp->inp_inc.inc_fibnum = so->so_fibnum;
- return (0);
+ error = EINVAL;
+
+ if (sopt->sopt_level == SOL_SOCKET &&
+ sopt->sopt_dir == SOPT_SET) {
+ switch (sopt->sopt_name) {
+ case SO_REUSEADDR:
+ INP_WLOCK(inp);
+ if ((so->so_options & SO_REUSEADDR) != 0)
+ inp->inp_flags2 |= INP_REUSEADDR;
+ else
+ inp->inp_flags2 &= ~INP_REUSEADDR;
+ INP_WUNLOCK(inp);
+ error = 0;
+ break;
+ case SO_REUSEPORT:
+ INP_WLOCK(inp);
+ if ((so->so_options & SO_REUSEPORT) != 0)
+ inp->inp_flags2 |= INP_REUSEPORT;
+ else
+ inp->inp_flags2 &= ~INP_REUSEPORT;
+ INP_WUNLOCK(inp);
+ error = 0;
+ break;
+ case SO_SETFIB:
+ INP_WLOCK(inp);
+ inp->inp_inc.inc_fibnum = so->so_fibnum;
+ INP_WUNLOCK(inp);
+ error = 0;
+ break;
+ default:
+ break;
+ }
}
- return (EINVAL);
+ return (error);
}
switch (sopt->sopt_dir) {
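
With the ip_ctloutput() change above, SO_REUSEADDR, SO_REUSEPORT and SO_SETFIB set at the socket level are now mirrored into the inpcb (inp_flags2 and inc_fibnum). A plain userland sketch of the options involved; it shows only the socket-level side and assumes nothing beyond standard setsockopt(2):

#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>

int
main(void)
{
	int s, on = 1;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
		perror("socket");
		return (1);
	}
	/* Both options are now also reflected into the inpcb's INP_REUSE* flags. */
	if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0)
		perror("SO_REUSEADDR");
	if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)) < 0)
		perror("SO_REUSEPORT");
	return (0);
}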
diff --git a/freebsd/sys/netinet/ip_var.h b/freebsd/sys/netinet/ip_var.h
index d196fd04..b07ef162 100644
--- a/freebsd/sys/netinet/ip_var.h
+++ b/freebsd/sys/netinet/ip_var.h
@@ -162,6 +162,7 @@ void kmod_ipstat_dec(int statnum);
* mbuf flag used by ip_fastfwd
*/
#define M_FASTFWD_OURS M_PROTO1 /* changed dst to local */
+#define M_IP_NEXTHOP M_PROTO2 /* explicit ip nexthop */
#ifdef __NO_STRICT_ALIGNMENT
#define IP_HDR_ALIGNED_P(ip) 1
@@ -208,7 +209,6 @@ int inp_setmoptions(struct inpcb *, struct sockopt *);
int ip_ctloutput(struct socket *, struct sockopt *sopt);
void ip_drain(void);
-void ip_fini(void *xtp);
int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
u_long if_hwassist_flags, int sw_csum);
void ip_forward(struct mbuf *m, int srcrt);
diff --git a/freebsd/sys/netinet/libalias/alias.h b/freebsd/sys/netinet/libalias/alias.h
index b2615c90..b12b353a 100644
--- a/freebsd/sys/netinet/libalias/alias.h
+++ b/freebsd/sys/netinet/libalias/alias.h
@@ -197,17 +197,6 @@ struct mbuf *m_megapullup(struct mbuf *, int);
*/
#define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20
-#ifndef NO_FW_PUNCH
-/*
- * If PKT_ALIAS_PUNCH_FW is set, active FTP and IRC DCC connections will
- * create a 'hole' in the firewall to allow the transfers to work. The
- * ipfw rule number that the hole is created with is controlled by
- * PacketAliasSetFWBase(). The hole will be attached to that
- * particular alias_link, so when the link goes away the hole is deleted.
- */
-#define PKT_ALIAS_PUNCH_FW 0x100
-#endif
-
/*
* If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only
* transparent proxying is performed.
@@ -220,6 +209,17 @@ struct mbuf *m_megapullup(struct mbuf *, int);
*/
#define PKT_ALIAS_REVERSE 0x80
+#ifndef NO_FW_PUNCH
+/*
+ * If PKT_ALIAS_PUNCH_FW is set, active FTP and IRC DCC connections will
+ * create a 'hole' in the firewall to allow the transfers to work. The
+ * ipfw rule number that the hole is created with is controlled by
+ * PacketAliasSetFWBase(). The hole will be attached to that
+ * particular alias_link, so when the link goes away the hole is deleted.
+ */
+#define PKT_ALIAS_PUNCH_FW 0x100
+#endif
+
/*
* If PKT_ALIAS_SKIP_GLOBAL is set, nat instance is not checked for matching
* states in 'ipfw nat global' rule.
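
The PKT_ALIAS_PUNCH_FW block is only relocated here, but since it describes a mechanism, a hedged userland sketch of enabling it may help; it assumes the installed <alias.h> header and -lalias, uses LibAliasSetFWBase() as the instance-API counterpart of the PacketAliasSetFWBase() named in the comment, and the rule numbers are illustrative:

#include <sys/types.h>
#include <netinet/in.h>
#include <alias.h>

int
main(void)
{
	struct libalias *la;

	if ((la = LibAliasInit(NULL)) == NULL)
		return (1);
	/* Punched ipfw holes will use rule numbers 10000..10099. */
	LibAliasSetFWBase(la, 10000, 100);
	LibAliasSetMode(la, PKT_ALIAS_PUNCH_FW, PKT_ALIAS_PUNCH_FW);
	/* ... pass packets through LibAliasIn()/LibAliasOut() here ... */
	LibAliasUninit(la);
	return (0);
}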
diff --git a/freebsd/sys/netinet/libalias/alias_db.c b/freebsd/sys/netinet/libalias/alias_db.c
index 7385027c..28be85da 100644
--- a/freebsd/sys/netinet/libalias/alias_db.c
+++ b/freebsd/sys/netinet/libalias/alias_db.c
@@ -2171,7 +2171,6 @@ HouseKeeping(struct libalias *la)
int i, n;
#ifndef _KERNEL
struct timeval tv;
- struct timezone tz;
#endif
LIBALIAS_LOCK_ASSERT(la);
@@ -2183,7 +2182,7 @@ HouseKeeping(struct libalias *la)
#ifdef _KERNEL
la->timeStamp = time_uptime;
#else
- gettimeofday(&tv, &tz);
+ gettimeofday(&tv, NULL);
la->timeStamp = tv.tv_sec;
#endif
@@ -2478,7 +2477,6 @@ LibAliasInit(struct libalias *la)
int i;
#ifndef _KERNEL
struct timeval tv;
- struct timezone tz;
#endif
if (la == NULL) {
@@ -2505,7 +2503,7 @@ LibAliasInit(struct libalias *la)
la->timeStamp = time_uptime;
la->lastCleanupTime = time_uptime;
#else
- gettimeofday(&tv, &tz);
+ gettimeofday(&tv, NULL);
la->timeStamp = tv.tv_sec;
la->lastCleanupTime = tv.tv_sec;
#endif
@@ -2737,7 +2735,6 @@ static void
InitPunchFW(struct libalias *la)
{
- LIBALIAS_LOCK_ASSERT(la);
la->fireWallField = malloc(la->fireWallNumNums);
if (la->fireWallField) {
memset(la->fireWallField, 0, la->fireWallNumNums);
@@ -2753,7 +2750,6 @@ static void
UninitPunchFW(struct libalias *la)
{
- LIBALIAS_LOCK_ASSERT(la);
ClearAllFWHoles(la);
if (la->fireWallFD >= 0)
close(la->fireWallFD);
@@ -2773,7 +2769,6 @@ PunchFWHole(struct alias_link *lnk)
struct ip_fw rule; /* On-the-fly built rule */
int fwhole; /* Where to punch hole */
- LIBALIAS_LOCK_ASSERT(la);
la = lnk->la;
/* Don't do anything unless we are asked to */
@@ -2847,7 +2842,6 @@ ClearFWHole(struct alias_link *lnk)
{
struct libalias *la;
- LIBALIAS_LOCK_ASSERT(la);
la = lnk->la;
if (lnk->link_type == LINK_TCP) {
int fwhole = lnk->data.tcp->fwhole; /* Where is the firewall
@@ -2872,7 +2866,6 @@ ClearAllFWHoles(struct libalias *la)
struct ip_fw rule; /* On-the-fly built rule */
int i;
- LIBALIAS_LOCK_ASSERT(la);
if (la->fireWallFD < 0)
return;
@@ -2886,7 +2879,7 @@ ClearAllFWHoles(struct libalias *la)
memset(la->fireWallField, 0, la->fireWallNumNums);
}
-#endif
+#endif /* !NO_FW_PUNCH */
void
LibAliasSetFWBase(struct libalias *la, unsigned int base, unsigned int num)
diff --git a/freebsd/sys/netinet/libalias/alias_sctp.c b/freebsd/sys/netinet/libalias/alias_sctp.c
index c8d83878..6158149a 100644
--- a/freebsd/sys/netinet/libalias/alias_sctp.c
+++ b/freebsd/sys/netinet/libalias/alias_sctp.c
@@ -183,7 +183,7 @@ void SctpShowAliasStats(struct libalias *la);
#ifdef _KERNEL
-MALLOC_DEFINE(M_SCTPNAT, "sctpnat", "sctp nat dbs");
+static MALLOC_DEFINE(M_SCTPNAT, "sctpnat", "sctp nat dbs");
/* Use kernel allocator. */
#ifdef _SYS_MALLOC_H_
#define sn_malloc(x) malloc(x, M_SCTPNAT, M_NOWAIT|M_ZERO)
@@ -366,8 +366,8 @@ SYSCTL_DECL(_net_inet);
SYSCTL_DECL(_net_inet_ip);
SYSCTL_DECL(_net_inet_ip_alias);
-SYSCTL_NODE(_net_inet_ip_alias, OID_AUTO, sctp, CTLFLAG_RW, NULL, "SCTP NAT");
-
+static SYSCTL_NODE(_net_inet_ip_alias, OID_AUTO, sctp, CTLFLAG_RW, NULL,
+ "SCTP NAT");
SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, log_level, CTLTYPE_UINT | CTLFLAG_RW,
&sysctl_log_level, 0, sysctl_chg_loglevel, "IU",
"Level of detail (0 - default, 1 - event, 2 - info, 3 - detail, 4 - debug, 5 - max debug)");
diff --git a/freebsd/sys/netinet/libalias/alias_sctp.h b/freebsd/sys/netinet/libalias/alias_sctp.h
index 8c244b45..f538d942 100644
--- a/freebsd/sys/netinet/libalias/alias_sctp.h
+++ b/freebsd/sys/netinet/libalias/alias_sctp.h
@@ -76,7 +76,6 @@
*
*/
#include <machine/cpufunc.h>
-#include <machine/cpu.h>
/* The packed define for 64 bit platforms */
#ifndef SCTP_PACKED
#define SCTP_PACKED __attribute__((packed))
@@ -136,13 +135,13 @@ struct sctp_nat_assoc {
struct in_addr a_addr; /**< alias ip address */
int state; /**< current state of NAT association */
int TableRegister; /**< stores which look up tables association is registered in */
- int exp; /**< timer expiration in seconds from uptime */
+ int exp; /**< timer expiration in seconds from uptime */
int exp_loc; /**< current location in timer_Q */
int num_Gaddr; /**< number of global IP addresses in the list */
LIST_HEAD(sctpGlobalAddresshead,sctp_GlobalAddress) Gaddr; /**< List of global addresses */
- LIST_ENTRY (sctp_nat_assoc) list_L; /**< Linked list of pointers for Local table*/
- LIST_ENTRY (sctp_nat_assoc) list_G; /**< Linked list of pointers for Global table */
- LIST_ENTRY (sctp_nat_assoc) timer_Q; /**< Linked list of pointers for timer Q */
+ LIST_ENTRY (sctp_nat_assoc) list_L; /**< Linked list of pointers for Local table*/
+ LIST_ENTRY (sctp_nat_assoc) list_G; /**< Linked list of pointers for Global table */
+ LIST_ENTRY (sctp_nat_assoc) timer_Q; /**< Linked list of pointers for timer Q */
//Using libalias locking
};
diff --git a/freebsd/sys/netinet/raw_ip.c b/freebsd/sys/netinet/raw_ip.c
index aa6abae9..827eca6e 100644
--- a/freebsd/sys/netinet/raw_ip.c
+++ b/freebsd/sys/netinet/raw_ip.c
@@ -35,6 +35,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_ipsec.h>
@@ -76,6 +77,11 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
+VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
+SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW,
+ &VNET_NAME(ip_defttl), 0,
+ "Maximum TTL on IP packets");
+
VNET_DEFINE(struct inpcbhead, ripcb);
VNET_DEFINE(struct inpcbinfo, ripcbinfo);
@@ -96,6 +102,10 @@ void (*ip_divert_ptr)(struct mbuf *, int);
int (*ng_ipfw_input_p)(struct mbuf **, int,
struct ip_fw_args *, int);
+/* Hook for telling pf that the destination address changed */
+void (*m_addr_chg_pf_p)(struct mbuf *m);
+
+#ifdef INET
/*
* Hooks for multicast routing. They all default to NULL, so leave them not
* initialized and rely on BSS being set to 0.
@@ -121,6 +131,15 @@ u_long (*ip_mcast_src)(int);
void (*rsvp_input_p)(struct mbuf *m, int off);
int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
void (*ip_rsvp_force_done)(struct socket *);
+#endif /* INET */
+
+u_long rip_sendspace = 9216;
+SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
+ &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
+
+u_long rip_recvspace = 9216;
+SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
+ &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
/*
* Hash functions
@@ -130,6 +149,7 @@ void (*ip_rsvp_force_done)(struct socket *);
#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
(((proto) + (laddr) + (faddr)) % (mask) + 1)
+#ifdef INET
static void
rip_inshash(struct inpcb *inp)
{
@@ -160,6 +180,7 @@ rip_delhash(struct inpcb *inp)
LIST_REMOVE(inp, inp_hash);
}
+#endif /* INET */
/*
* Raw interface to IP protocol.
@@ -188,19 +209,9 @@ void
rip_init(void)
{
- INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip");
- LIST_INIT(&V_ripcb);
-#ifdef VIMAGE
- V_ripcbinfo.ipi_vnet = curvnet;
-#endif
- V_ripcbinfo.ipi_listhead = &V_ripcb;
- V_ripcbinfo.ipi_hashbase =
- hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask);
- V_ripcbinfo.ipi_porthashbase =
- hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask);
- V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
- NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
- uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
+ in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE,
+ 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE,
+ IPI_HASHFIELDS_NONE);
EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
}
@@ -210,20 +221,18 @@ void
rip_destroy(void)
{
- hashdestroy(V_ripcbinfo.ipi_hashbase, M_PCB,
- V_ripcbinfo.ipi_hashmask);
- hashdestroy(V_ripcbinfo.ipi_porthashbase, M_PCB,
- V_ripcbinfo.ipi_porthashmask);
+ in_pcbinfo_destroy(&V_ripcbinfo);
}
#endif
+#ifdef INET
static int
rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
struct sockaddr_in *ripsrc)
{
int policyfail = 0;
- INP_RLOCK_ASSERT(last);
+ INP_LOCK_ASSERT(last);
#ifdef IPSEC
/* check AH/ESP integrity. */
@@ -771,14 +780,6 @@ rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
}
}
-u_long rip_sendspace = 9216;
-u_long rip_recvspace = 9216;
-
-SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
- &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
-SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
- &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
-
static int
rip_attach(struct socket *so, int proto, struct thread *td)
{
@@ -839,16 +840,19 @@ rip_detach(struct socket *so)
static void
rip_dodisconnect(struct socket *so, struct inpcb *inp)
{
+ struct inpcbinfo *pcbinfo;
- INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
- INP_WLOCK_ASSERT(inp);
-
+ pcbinfo = inp->inp_pcbinfo;
+ INP_INFO_WLOCK(pcbinfo);
+ INP_WLOCK(inp);
rip_delhash(inp);
inp->inp_faddr.s_addr = INADDR_ANY;
rip_inshash(inp);
SOCK_LOCK(so);
so->so_state &= ~SS_ISCONNECTED;
SOCK_UNLOCK(so);
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(pcbinfo);
}
static void
@@ -859,11 +863,7 @@ rip_abort(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
- INP_INFO_WLOCK(&V_ripcbinfo);
- INP_WLOCK(inp);
rip_dodisconnect(so, inp);
- INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
}
static void
@@ -874,11 +874,7 @@ rip_close(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_close: inp == NULL"));
- INP_INFO_WLOCK(&V_ripcbinfo);
- INP_WLOCK(inp);
rip_dodisconnect(so, inp);
- INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
}
static int
@@ -892,11 +888,7 @@ rip_disconnect(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
- INP_INFO_WLOCK(&V_ripcbinfo);
- INP_WLOCK(inp);
rip_dodisconnect(so, inp);
- INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
@@ -1003,6 +995,7 @@ rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
}
return (rip_output(m, so, dst));
}
+#endif /* INET */
static int
rip_pcblist(SYSCTL_HANDLER_ARGS)
@@ -1081,9 +1074,9 @@ rip_pcblist(SYSCTL_HANDLER_ARGS)
INP_INFO_WLOCK(&V_ripcbinfo);
for (i = 0; i < n; i++) {
inp = inp_list[i];
- INP_WLOCK(inp);
- if (!in_pcbrele(inp))
- INP_WUNLOCK(inp);
+ INP_RLOCK(inp);
+ if (!in_pcbrele_rlocked(inp))
+ INP_RUNLOCK(inp);
}
INP_INFO_WUNLOCK(&V_ripcbinfo);
@@ -1109,6 +1102,7 @@ SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist,
CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
+#ifdef INET
struct pr_usrreqs rip_usrreqs = {
.pru_abort = rip_abort,
.pru_attach = rip_attach,
@@ -1124,3 +1118,4 @@ struct pr_usrreqs rip_usrreqs = {
.pru_sosetlabel = in_pcbsosetlabel,
.pru_close = rip_close,
};
+#endif /* INET */
diff --git a/freebsd/sys/netinet/sctp_constants.h b/freebsd/sys/netinet/sctp_constants.h
index 18057009..58ca808e 100644
--- a/freebsd/sys/netinet/sctp_constants.h
+++ b/freebsd/sys/netinet/sctp_constants.h
@@ -521,9 +521,6 @@ __FBSDID("$FreeBSD$");
/* How long a cookie lives in milli-seconds */
#define SCTP_DEFAULT_COOKIE_LIFE 60000
-/* resource limit of streams */
-#define MAX_SCTP_STREAMS 2048
-
/* Maximum the mapping array will grow to (TSN mapping array) */
#define SCTP_MAPPING_ARRAY 512
@@ -658,6 +655,7 @@ __FBSDID("$FreeBSD$");
/* How many streams I request initally by default */
#define SCTP_OSTREAM_INITIAL 10
+#define SCTP_ISTREAM_INITIAL 2048
/*
* How many smallest_mtu's need to increase before a window update sack is
@@ -997,6 +995,10 @@ __FBSDID("$FreeBSD$");
(((uint8_t *)&(a)->s_addr)[2] == 0) && \
(((uint8_t *)&(a)->s_addr)[3] == 1))
+#define IN4_ISLINKLOCAL_ADDRESS(a) \
+ ((((uint8_t *)&(a)->s_addr)[0] == 169) && \
+ (((uint8_t *)&(a)->s_addr)[1] == 254))
+
#if defined(_KERNEL)
#define SCTP_GETTIME_TIMEVAL(x) (getmicrouptime(x))
diff --git a/freebsd/sys/netinet/sctp_indata.c b/freebsd/sys/netinet/sctp_indata.c
index 273ad6bc..e00a470d 100644
--- a/freebsd/sys/netinet/sctp_indata.c
+++ b/freebsd/sys/netinet/sctp_indata.c
@@ -1731,7 +1731,6 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc,
sctp_alloc_a_readq(stcb, control);
sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn,
protocol_id,
- stcb->asoc.context,
strmno, strmseq,
chunk_flags,
dmbuf);
@@ -1859,7 +1858,6 @@ failed_pdapi_express_del:
sctp_alloc_a_readq(stcb, control);
sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn,
protocol_id,
- stcb->asoc.context,
strmno, strmseq,
chunk_flags,
dmbuf);
diff --git a/freebsd/sys/netinet/sctp_indata.h b/freebsd/sys/netinet/sctp_indata.h
index 5eaa1f4b..79a86e2a 100644
--- a/freebsd/sys/netinet/sctp_indata.h
+++ b/freebsd/sys/netinet/sctp_indata.h
@@ -47,14 +47,14 @@ sctp_build_readq_entry(struct sctp_tcb *stcb,
struct mbuf *dm);
-#define sctp_build_readq_entry_mac(_ctl, in_it, a, net, tsn, ppid, context, stream_no, stream_seq, flags, dm) do { \
+#define sctp_build_readq_entry_mac(_ctl, in_it, context, net, tsn, ppid, stream_no, stream_seq, flags, dm) do { \
if (_ctl) { \
atomic_add_int(&((net)->ref_count), 1); \
(_ctl)->sinfo_stream = stream_no; \
(_ctl)->sinfo_ssn = stream_seq; \
(_ctl)->sinfo_flags = (flags << 8); \
(_ctl)->sinfo_ppid = ppid; \
- (_ctl)->sinfo_context = a; \
+ (_ctl)->sinfo_context = context; \
(_ctl)->sinfo_timetolive = 0; \
(_ctl)->sinfo_tsn = tsn; \
(_ctl)->sinfo_cumtsn = tsn; \
diff --git a/freebsd/sys/netinet/sctp_input.c b/freebsd/sys/netinet/sctp_input.c
index 645c807e..7cdb5b09 100644
--- a/freebsd/sys/netinet/sctp_input.c
+++ b/freebsd/sys/netinet/sctp_input.c
@@ -391,9 +391,10 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb)
}
SCTP_FREE(asoc->strmin, SCTP_M_STRMI);
}
- asoc->streamincnt = ntohs(init->num_outbound_streams);
- if (asoc->streamincnt > MAX_SCTP_STREAMS) {
- asoc->streamincnt = MAX_SCTP_STREAMS;
+ if (asoc->max_inbound_streams > ntohs(init->num_outbound_streams)) {
+ asoc->streamincnt = ntohs(init->num_outbound_streams);
+ } else {
+ asoc->streamincnt = asoc->max_inbound_streams;
}
SCTP_MALLOC(asoc->strmin, struct sctp_stream_in *, asoc->streamincnt *
sizeof(struct sctp_stream_in), SCTP_M_STRMI);
@@ -405,11 +406,6 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb)
for (i = 0; i < asoc->streamincnt; i++) {
asoc->strmin[i].stream_no = i;
asoc->strmin[i].last_sequence_delivered = 0xffff;
- /*
- * U-stream ranges will be set when the cookie is unpacked.
- * Or for the INIT sender they are un set (if pr-sctp not
- * supported) when the INIT-ACK arrives.
- */
TAILQ_INIT(&asoc->strmin[i].inqueue);
asoc->strmin[i].delivery_started = 0;
}
@@ -1030,12 +1026,13 @@ sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED,
SCTP_SOCKET_UNLOCK(so, 1);
#endif
}
- /* are the queues empty? */
+#ifdef INVARIANTS
if (!TAILQ_EMPTY(&asoc->send_queue) ||
!TAILQ_EMPTY(&asoc->sent_queue) ||
!stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) {
- sctp_report_all_outbound(stcb, 0, 0, SCTP_SO_NOT_LOCKED);
+ panic("Queues are not empty when handling SHUTDOWN-ACK");
}
+#endif
/* stop the timer */
sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_9);
/* send SHUTDOWN-COMPLETE */
@@ -1877,9 +1874,14 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
cookie->tie_tag_peer_vtag != 0) {
struct sctpasochead *head;
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+
if (asoc->peer_supports_nat) {
/*
- * This is a gross gross hack. just call the
+ * This is a gross gross hack. Just call the
* cookie_new code since we are allowing a duplicate
* association. I hope this works...
*/
@@ -1941,6 +1943,10 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
asoc->mapping_array_size);
}
SCTP_TCB_UNLOCK(stcb);
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ so = SCTP_INP_SO(stcb->sctp_ep);
+ SCTP_SOCKET_LOCK(so, 1);
+#endif
SCTP_INP_INFO_WLOCK();
SCTP_INP_WLOCK(stcb->sctp_ep);
SCTP_TCB_LOCK(stcb);
@@ -1948,7 +1954,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
/* send up all the data */
SCTP_TCB_SEND_LOCK(stcb);
- sctp_report_all_outbound(stcb, 0, 1, SCTP_SO_NOT_LOCKED);
+ sctp_report_all_outbound(stcb, 0, 1, SCTP_SO_LOCKED);
for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
stcb->asoc.strmout[i].chunks_on_queues = 0;
stcb->asoc.strmout[i].stream_no = i;
@@ -1970,11 +1976,15 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset,
*/
LIST_INSERT_HEAD(head, stcb, sctp_asocs);
- /* process the INIT info (peer's info) */
SCTP_TCB_SEND_UNLOCK(stcb);
SCTP_INP_WUNLOCK(stcb->sctp_ep);
SCTP_INP_INFO_WUNLOCK();
-
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ SCTP_SOCKET_UNLOCK(so, 1);
+#endif
+ asoc->total_flight = 0;
+ asoc->total_flight_count = 0;
+ /* process the INIT info (peer's info) */
retval = sctp_process_init(init_cp, stcb);
if (retval < 0) {
if (how_indx < sizeof(asoc->cookie_how))
@@ -3198,13 +3208,14 @@ sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp SCTP_UNUSE
/* notify upper layer protocol */
if (stcb->sctp_socket) {
sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED);
- /* are the queues empty? they should be */
- if (!TAILQ_EMPTY(&asoc->send_queue) ||
- !TAILQ_EMPTY(&asoc->sent_queue) ||
- !stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) {
- sctp_report_all_outbound(stcb, 0, 0, SCTP_SO_NOT_LOCKED);
- }
}
+#ifdef INVARIANTS
+ if (!TAILQ_EMPTY(&asoc->send_queue) ||
+ !TAILQ_EMPTY(&asoc->sent_queue) ||
+ !stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) {
+ panic("Queues are not empty when handling SHUTDOWN-COMPLETE");
+ }
+#endif
/* stop the timer */
sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_22);
SCTP_STAT_INCR_COUNTER32(sctps_shutdown);
@@ -3493,18 +3504,13 @@ sctp_reset_in_stream(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *
}
static void
-sctp_reset_out_streams(struct sctp_tcb *stcb, int number_entries, uint16_t * list)
+sctp_reset_out_streams(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t * list)
{
- int i;
+ uint32_t i;
+ uint16_t temp;
- if (number_entries == 0) {
- for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
- stcb->asoc.strmout[i].next_sequence_send = 0;
- }
- } else if (number_entries) {
+ if (number_entries > 0) {
for (i = 0; i < number_entries; i++) {
- uint16_t temp;
-
temp = ntohs(list[i]);
if (temp >= stcb->asoc.streamoutcnt) {
/* no such stream */
@@ -3512,6 +3518,10 @@ sctp_reset_out_streams(struct sctp_tcb *stcb, int number_entries, uint16_t * lis
}
stcb->asoc.strmout[temp].next_sequence_send = 0;
}
+ } else {
+ for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+ stcb->asoc.strmout[i].next_sequence_send = 0;
+ }
}
sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_SEND, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED);
}
@@ -3598,7 +3608,7 @@ sctp_handle_stream_reset_response(struct sctp_tcb *stcb,
struct sctp_association *asoc = &stcb->asoc;
struct sctp_tmit_chunk *chk;
struct sctp_stream_reset_out_request *srparam;
- int number_entries;
+ uint32_t number_entries;
if (asoc->stream_reset_outstanding == 0) {
/* duplicate */
@@ -4556,8 +4566,10 @@ __attribute__((noinline))
if ((ch->chunk_type == SCTP_ABORT_ASSOCIATION) ||
(ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) ||
(ch->chunk_type == SCTP_PACKET_DROPPED)) {
- if ((vtag_in == asoc->my_vtag) ||
- ((ch->chunk_flags & SCTP_HAD_NO_TCB) &&
+ /* Take the T-bit always into account. */
+ if ((((ch->chunk_flags & SCTP_HAD_NO_TCB) == 0) &&
+ (vtag_in == asoc->my_vtag)) ||
+ (((ch->chunk_flags & SCTP_HAD_NO_TCB) == SCTP_HAD_NO_TCB) &&
(vtag_in == asoc->peer_vtag))) {
/* this is valid */
} else {
@@ -5695,7 +5707,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
#ifdef INET
case AF_INET:
if (ipsec4_in_reject(m, &inp->ip_inp.inp)) {
- MODULE_GLOBAL(ipsec4stat).in_polvio++;
+ IPSECSTAT_INC(in_polvio);
SCTP_STAT_INCR(sctps_hdrops);
goto out;
}
@@ -5704,7 +5716,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt
#ifdef INET6
case AF_INET6:
if (ipsec6_in_reject(m, &inp->ip_inp.inp)) {
- MODULE_GLOBAL(ipsec6stat).in_polvio++;
+ IPSEC6STAT_INC(in_polvio);
SCTP_STAT_INCR(sctps_hdrops);
goto out;
}
diff --git a/freebsd/sys/netinet/sctp_output.c b/freebsd/sys/netinet/sctp_output.c
index 1bca9771..61260fb7 100644
--- a/freebsd/sys/netinet/sctp_output.c
+++ b/freebsd/sys/netinet/sctp_output.c
@@ -1967,7 +1967,7 @@ sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa, uint16_t * len)
while (SCTP_BUF_NEXT(mret) != NULL) {
mret = SCTP_BUF_NEXT(mret);
}
- SCTP_BUF_NEXT(mret) = sctp_get_mbuf_for_msg(plen, 0, M_NOWAIT, 1, MT_DATA);
+ SCTP_BUF_NEXT(mret) = sctp_get_mbuf_for_msg(plen, 0, M_DONTWAIT, 1, MT_DATA);
if (SCTP_BUF_NEXT(mret) == NULL) {
/* We are hosed, can't add more addresses */
return (m);
@@ -4131,10 +4131,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
SCTPDBG(SCTP_DEBUG_OUTPUT3, "IP output returns %d\n", ret);
if (net == NULL) {
/* free tempy routes */
- if (ro->ro_rt) {
- RTFREE(ro->ro_rt);
- ro->ro_rt = NULL;
- }
+ RO_RTFREE(ro);
} else {
/*
* PMTU check versus smallest asoc MTU goes
@@ -4449,8 +4446,9 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
#if defined(SCTP_WITH_NO_CSUM)
SCTP_STAT_INCR(sctps_sendnocrc);
#else
- sctphdr->checksum = sctp_calculate_cksum(m, sizeof(struct ip6_hdr));
- SCTP_STAT_INCR(sctps_sendswcrc);
+ m->m_pkthdr.csum_flags = CSUM_SCTP_IPV6;
+ m->m_pkthdr.csum_data = 0;
+ SCTP_STAT_INCR(sctps_sendhwcrc);
#endif
}
/* send it out. table id is taken from stcb */
@@ -4487,9 +4485,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp,
}
if (net == NULL) {
/* Now if we had a temp route free it */
- if (ro->ro_rt) {
- RTFREE(ro->ro_rt);
- }
+ RO_RTFREE(ro);
} else {
/*
* PMTU check versus smallest asoc MTU goes
@@ -10683,6 +10679,7 @@ sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked
struct sctp_abort_chunk *abort;
struct sctp_auth_chunk *auth = NULL;
struct sctp_nets *net;
+ uint32_t vtag;
uint32_t auth_offset = 0;
uint16_t cause_len, chunk_len, padding_len;
@@ -10738,7 +10735,14 @@ sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked
/* Fill in the ABORT chunk header. */
abort = mtod(m_abort, struct sctp_abort_chunk *);
abort->ch.chunk_type = SCTP_ABORT_ASSOCIATION;
- abort->ch.chunk_flags = 0;
+ if (stcb->asoc.peer_vtag == 0) {
+ /* This happens iff the assoc is in COOKIE-WAIT state. */
+ vtag = stcb->asoc.my_vtag;
+ abort->ch.chunk_flags = SCTP_HAD_NO_TCB;
+ } else {
+ vtag = stcb->asoc.peer_vtag;
+ abort->ch.chunk_flags = 0;
+ }
abort->ch.chunk_length = htons(chunk_len);
/* Add padding, if necessary. */
if (padding_len > 0) {
@@ -10750,7 +10754,7 @@ sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked
(void)sctp_lowlevel_chunk_output(stcb->sctp_ep, stcb, net,
(struct sockaddr *)&net->ro._l_addr,
m_out, auth_offset, auth, stcb->asoc.authinfo.active_keyid, 1, 0, 0,
- stcb->sctp_ep->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag),
+ stcb->sctp_ep->sctp_lport, stcb->rport, htonl(vtag),
stcb->asoc.primary_destination->port, NULL,
0, 0,
so_locked);
@@ -11032,8 +11036,9 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
#if defined(SCTP_WITH_NO_CSUM)
SCTP_STAT_INCR(sctps_sendnocrc);
#else
- shout->checksum = sctp_calculate_cksum(mout, sizeof(struct ip6_hdr));
- SCTP_STAT_INCR(sctps_sendswcrc);
+ mout->m_pkthdr.csum_flags = CSUM_SCTP_IPV6;
+ mout->m_pkthdr.csum_data = 0;
+ SCTP_STAT_INCR(sctps_sendhwcrc);
#endif
}
#ifdef SCTP_PACKET_LOGGING
diff --git a/freebsd/sys/netinet/sctp_pcb.c b/freebsd/sys/netinet/sctp_pcb.c
index 47877ef1..e21c2e03 100644
--- a/freebsd/sys/netinet/sctp_pcb.c
+++ b/freebsd/sys/netinet/sctp_pcb.c
@@ -2378,8 +2378,13 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id)
inp->sctp_socket = so;
inp->ip_inp.inp.inp_socket = so;
#ifdef INET6
- if (MODULE_GLOBAL(ip6_auto_flowlabel)) {
- inp->ip_inp.inp.inp_flags |= IN6P_AUTOFLOWLABEL;
+ if (INP_SOCKAF(so) == AF_INET6) {
+ if (MODULE_GLOBAL(ip6_auto_flowlabel)) {
+ inp->ip_inp.inp.inp_flags |= IN6P_AUTOFLOWLABEL;
+ }
+ if (MODULE_GLOBAL(ip6_v6only)) {
+ inp->ip_inp.inp.inp_flags |= IN6P_IPV6_V6ONLY;
+ }
}
#endif
inp->sctp_associd_counter = 1;
@@ -2500,9 +2505,6 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id)
m->initial_rto = SCTP_BASE_SYSCTL(sctp_rto_initial_default);
m->initial_init_rto_max = SCTP_BASE_SYSCTL(sctp_init_rto_max_default);
m->sctp_sack_freq = SCTP_BASE_SYSCTL(sctp_sack_freq_default);
-
- m->max_open_streams_intome = MAX_SCTP_STREAMS;
-
m->max_init_times = SCTP_BASE_SYSCTL(sctp_init_rtx_max_default);
m->max_send_times = SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default);
m->def_net_failure = SCTP_BASE_SYSCTL(sctp_path_rtx_max_default);
@@ -2514,6 +2516,7 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id)
m->sctp_default_cc_module = SCTP_BASE_SYSCTL(sctp_default_cc_module);
m->sctp_default_ss_module = SCTP_BASE_SYSCTL(sctp_default_ss_module);
+ m->max_open_streams_intome = SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default);
/* number of streams to pre-open on a association */
m->pre_open_stream_count = SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default);
@@ -4450,23 +4453,21 @@ sctp_delete_from_timewait(uint32_t tag, uint16_t lport, uint16_t rport)
int i;
chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
- if (!LIST_EMPTY(chain)) {
- LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
- for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
- if ((twait_block->vtag_block[i].v_tag == tag) &&
- (twait_block->vtag_block[i].lport == lport) &&
- (twait_block->vtag_block[i].rport == rport)) {
- twait_block->vtag_block[i].tv_sec_at_expire = 0;
- twait_block->vtag_block[i].v_tag = 0;
- twait_block->vtag_block[i].lport = 0;
- twait_block->vtag_block[i].rport = 0;
- found = 1;
- break;
- }
- }
- if (found)
+ LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
+ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
+ if ((twait_block->vtag_block[i].v_tag == tag) &&
+ (twait_block->vtag_block[i].lport == lport) &&
+ (twait_block->vtag_block[i].rport == rport)) {
+ twait_block->vtag_block[i].tv_sec_at_expire = 0;
+ twait_block->vtag_block[i].v_tag = 0;
+ twait_block->vtag_block[i].lport = 0;
+ twait_block->vtag_block[i].rport = 0;
+ found = 1;
break;
+ }
}
+ if (found)
+ break;
}
}
@@ -4480,19 +4481,17 @@ sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport)
SCTP_INP_INFO_WLOCK();
chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
- if (!LIST_EMPTY(chain)) {
- LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
- for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
- if ((twait_block->vtag_block[i].v_tag == tag) &&
- (twait_block->vtag_block[i].lport == lport) &&
- (twait_block->vtag_block[i].rport == rport)) {
- found = 1;
- break;
- }
- }
- if (found)
+ LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
+ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
+ if ((twait_block->vtag_block[i].v_tag == tag) &&
+ (twait_block->vtag_block[i].lport == lport) &&
+ (twait_block->vtag_block[i].rport == rport)) {
+ found = 1;
break;
+ }
}
+ if (found)
+ break;
}
SCTP_INP_INFO_WUNLOCK();
return (found);
@@ -4514,42 +4513,40 @@ sctp_add_vtag_to_timewait(uint32_t tag, uint32_t time, uint16_t lport, uint16_t
(void)SCTP_GETTIME_TIMEVAL(&now);
chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
set = 0;
- if (!LIST_EMPTY(chain)) {
+ LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
/* Block(s) present, lets find space, and expire on the fly */
- LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
- for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
- if ((twait_block->vtag_block[i].v_tag == 0) &&
- !set) {
- twait_block->vtag_block[i].tv_sec_at_expire =
- now.tv_sec + time;
+ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
+ if ((twait_block->vtag_block[i].v_tag == 0) &&
+ !set) {
+ twait_block->vtag_block[i].tv_sec_at_expire =
+ now.tv_sec + time;
+ twait_block->vtag_block[i].v_tag = tag;
+ twait_block->vtag_block[i].lport = lport;
+ twait_block->vtag_block[i].rport = rport;
+ set = 1;
+ } else if ((twait_block->vtag_block[i].v_tag) &&
+ ((long)twait_block->vtag_block[i].tv_sec_at_expire < now.tv_sec)) {
+ /* Audit expires this guy */
+ twait_block->vtag_block[i].tv_sec_at_expire = 0;
+ twait_block->vtag_block[i].v_tag = 0;
+ twait_block->vtag_block[i].lport = 0;
+ twait_block->vtag_block[i].rport = 0;
+ if (set == 0) {
+ /* Reuse it for my new tag */
+ twait_block->vtag_block[i].tv_sec_at_expire = now.tv_sec + time;
twait_block->vtag_block[i].v_tag = tag;
twait_block->vtag_block[i].lport = lport;
twait_block->vtag_block[i].rport = rport;
set = 1;
- } else if ((twait_block->vtag_block[i].v_tag) &&
- ((long)twait_block->vtag_block[i].tv_sec_at_expire < now.tv_sec)) {
- /* Audit expires this guy */
- twait_block->vtag_block[i].tv_sec_at_expire = 0;
- twait_block->vtag_block[i].v_tag = 0;
- twait_block->vtag_block[i].lport = 0;
- twait_block->vtag_block[i].rport = 0;
- if (set == 0) {
- /* Reuse it for my new tag */
- twait_block->vtag_block[i].tv_sec_at_expire = now.tv_sec + time;
- twait_block->vtag_block[i].v_tag = tag;
- twait_block->vtag_block[i].lport = lport;
- twait_block->vtag_block[i].rport = rport;
- set = 1;
- }
}
}
- if (set) {
- /*
- * We only do up to the block where we can
- * place our tag for audits
- */
- break;
- }
+ }
+ if (set) {
+ /*
+ * We only do up to the block where we can place our
+ * tag for audits
+ */
+ break;
}
}
/* Need to add a new block to chain */
@@ -6699,30 +6696,28 @@ skip_vtag_check:
chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)];
/* Now what about timed wait ? */
- if (!LIST_EMPTY(chain)) {
+ LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
/*
* Block(s) are present, lets see if we have this tag in the
* list
*/
- LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) {
- for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
- if (twait_block->vtag_block[i].v_tag == 0) {
- /* not used */
- continue;
- } else if ((long)twait_block->vtag_block[i].tv_sec_at_expire <
- now->tv_sec) {
- /* Audit expires this guy */
- twait_block->vtag_block[i].tv_sec_at_expire = 0;
- twait_block->vtag_block[i].v_tag = 0;
- twait_block->vtag_block[i].lport = 0;
- twait_block->vtag_block[i].rport = 0;
- } else if ((twait_block->vtag_block[i].v_tag == tag) &&
- (twait_block->vtag_block[i].lport == lport) &&
- (twait_block->vtag_block[i].rport == rport)) {
- /* Bad tag, sorry :< */
- SCTP_INP_INFO_RUNLOCK();
- return (0);
- }
+ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) {
+ if (twait_block->vtag_block[i].v_tag == 0) {
+ /* not used */
+ continue;
+ } else if ((long)twait_block->vtag_block[i].tv_sec_at_expire <
+ now->tv_sec) {
+ /* Audit expires this guy */
+ twait_block->vtag_block[i].tv_sec_at_expire = 0;
+ twait_block->vtag_block[i].v_tag = 0;
+ twait_block->vtag_block[i].lport = 0;
+ twait_block->vtag_block[i].rport = 0;
+ } else if ((twait_block->vtag_block[i].v_tag == tag) &&
+ (twait_block->vtag_block[i].lport == lport) &&
+ (twait_block->vtag_block[i].rport == rport)) {
+ /* Bad tag, sorry :< */
+ SCTP_INP_INFO_RUNLOCK();
+ return (0);
}
}
}
diff --git a/freebsd/sys/netinet/sctp_structs.h b/freebsd/sys/netinet/sctp_structs.h
index abecdabd..bc18f0e8 100644
--- a/freebsd/sys/netinet/sctp_structs.h
+++ b/freebsd/sys/netinet/sctp_structs.h
@@ -189,6 +189,8 @@ struct iterator_control {
struct sctp_net_route {
sctp_rtentry_t *ro_rt;
void *ro_lle;
+ void *ro_ia;
+ int ro_flags;
union sctp_sockstore _l_addr; /* remote peer addr */
struct sctp_ifa *_s_addr; /* our selected src addr */
};
diff --git a/freebsd/sys/netinet/sctp_sysctl.c b/freebsd/sys/netinet/sctp_sysctl.c
index ca462b7a..95e3c589 100644
--- a/freebsd/sys/netinet/sctp_sysctl.c
+++ b/freebsd/sys/netinet/sctp_sysctl.c
@@ -83,6 +83,7 @@ sctp_init_sysctls()
SCTP_BASE_SYSCTL(sctp_path_rtx_max_default) = SCTPCTL_PATH_RTX_MAX_DEFAULT;
SCTP_BASE_SYSCTL(sctp_path_pf_threshold) = SCTPCTL_PATH_PF_THRESHOLD_DEFAULT;
SCTP_BASE_SYSCTL(sctp_add_more_threshold) = SCTPCTL_ADD_MORE_ON_OUTPUT_DEFAULT;
+ SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default) = SCTPCTL_INCOMING_STREAMS_DEFAULT;
SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default) = SCTPCTL_OUTGOING_STREAMS_DEFAULT;
SCTP_BASE_SYSCTL(sctp_cmt_on_off) = SCTPCTL_CMT_ON_OFF_DEFAULT;
/* EY */
@@ -625,6 +626,7 @@ sysctl_sctp_check(SYSCTL_HANDLER_ARGS)
RANGECHK(SCTP_BASE_SYSCTL(sctp_path_rtx_max_default), SCTPCTL_PATH_RTX_MAX_MIN, SCTPCTL_PATH_RTX_MAX_MAX);
RANGECHK(SCTP_BASE_SYSCTL(sctp_path_pf_threshold), SCTPCTL_PATH_PF_THRESHOLD_MIN, SCTPCTL_PATH_PF_THRESHOLD_MAX);
RANGECHK(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTPCTL_ADD_MORE_ON_OUTPUT_MIN, SCTPCTL_ADD_MORE_ON_OUTPUT_MAX);
+ RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default), SCTPCTL_INCOMING_STREAMS_MIN, SCTPCTL_INCOMING_STREAMS_MAX);
RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), SCTPCTL_OUTGOING_STREAMS_MIN, SCTPCTL_OUTGOING_STREAMS_MAX);
RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_on_off), SCTPCTL_CMT_ON_OFF_MIN, SCTPCTL_CMT_ON_OFF_MAX);
/* EY */
@@ -967,6 +969,10 @@ SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, add_more_on_output, CTLTYPE_UINT | CT
&SCTP_BASE_SYSCTL(sctp_add_more_threshold), 0, sysctl_sctp_check, "IU",
SCTPCTL_ADD_MORE_ON_OUTPUT_DESC);
+SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, incoming_streams, CTLTYPE_UINT | CTLFLAG_RW,
+ &SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default), 0, sysctl_sctp_check, "IU",
+ SCTPCTL_INCOMING_STREAMS_DESC);
+
SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, outgoing_streams, CTLTYPE_UINT | CTLFLAG_RW,
&SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), 0, sysctl_sctp_check, "IU",
SCTPCTL_OUTGOING_STREAMS_DESC);
diff --git a/freebsd/sys/netinet/sctp_sysctl.h b/freebsd/sys/netinet/sctp_sysctl.h
index 4ec37157..8090373e 100644
--- a/freebsd/sys/netinet/sctp_sysctl.h
+++ b/freebsd/sys/netinet/sctp_sysctl.h
@@ -72,6 +72,7 @@ struct sctp_sysctl {
uint32_t sctp_path_rtx_max_default;
uint32_t sctp_path_pf_threshold;
uint32_t sctp_add_more_threshold;
+ uint32_t sctp_nr_incoming_streams_default;
uint32_t sctp_nr_outgoing_streams_default;
uint32_t sctp_cmt_on_off;
uint32_t sctp_cmt_use_dac;
@@ -322,6 +323,12 @@ struct sctp_sysctl {
#define SCTPCTL_ADD_MORE_ON_OUTPUT_MAX 0xFFFFFFFF
#define SCTPCTL_ADD_MORE_ON_OUTPUT_DEFAULT SCTP_DEFAULT_ADD_MORE
+/* incoming_streams: Default number of incoming streams */
+#define SCTPCTL_INCOMING_STREAMS_DESC "Default number of incoming streams"
+#define SCTPCTL_INCOMING_STREAMS_MIN 1
+#define SCTPCTL_INCOMING_STREAMS_MAX 65535
+#define SCTPCTL_INCOMING_STREAMS_DEFAULT SCTP_ISTREAM_INITIAL
+
/* outgoing_streams: Default number of outgoing streams */
#define SCTPCTL_OUTGOING_STREAMS_DESC "Default number of outgoing streams"
#define SCTPCTL_OUTGOING_STREAMS_MIN 1
diff --git a/freebsd/sys/netinet/sctp_uio.h b/freebsd/sys/netinet/sctp_uio.h
index d8e7da45..063fd9f1 100644
--- a/freebsd/sys/netinet/sctp_uio.h
+++ b/freebsd/sys/netinet/sctp_uio.h
@@ -1267,44 +1267,50 @@ sctp_sorecvmsg(struct socket *so,
#if !(defined(_KERNEL)) && !(defined(__Userspace__))
__BEGIN_DECLS
-int sctp_peeloff __P((int, sctp_assoc_t));
-int sctp_bindx __P((int, struct sockaddr *, int, int));
-int sctp_connectx __P((int, const struct sockaddr *, int, sctp_assoc_t *));
-int sctp_getaddrlen __P((sa_family_t));
-int sctp_getpaddrs __P((int, sctp_assoc_t, struct sockaddr **));
-void sctp_freepaddrs __P((struct sockaddr *));
-int sctp_getladdrs __P((int, sctp_assoc_t, struct sockaddr **));
-void sctp_freeladdrs __P((struct sockaddr *));
-int sctp_opt_info __P((int, sctp_assoc_t, int, void *, socklen_t *));
+int sctp_peeloff(int, sctp_assoc_t);
+int sctp_bindx(int, struct sockaddr *, int, int);
+int sctp_connectx(int, const struct sockaddr *, int, sctp_assoc_t *);
+int sctp_getaddrlen(sa_family_t);
+int sctp_getpaddrs(int, sctp_assoc_t, struct sockaddr **);
+void sctp_freepaddrs(struct sockaddr *);
+int sctp_getladdrs(int, sctp_assoc_t, struct sockaddr **);
+void sctp_freeladdrs(struct sockaddr *);
+int sctp_opt_info(int, sctp_assoc_t, int, void *, socklen_t *);
/* deprecated */
-ssize_t sctp_sendmsg
-__P((int, const void *, size_t, const struct sockaddr *,
- socklen_t, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t));
+ssize_t
+sctp_sendmsg(int, const void *, size_t, const struct sockaddr *,
+ socklen_t, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t);
/* deprecated */
- ssize_t sctp_send __P((int, const void *, size_t,
- const struct sctp_sndrcvinfo *, int));
+ssize_t
+sctp_send(int, const void *, size_t,
+ const struct sctp_sndrcvinfo *, int);
/* deprecated */
- ssize_t sctp_sendx __P((int, const void *, size_t, struct sockaddr *,
- int, struct sctp_sndrcvinfo *, int));
+ssize_t
+sctp_sendx(int, const void *, size_t, struct sockaddr *,
+ int, struct sctp_sndrcvinfo *, int);
/* deprecated */
- ssize_t sctp_sendmsgx __P((int sd, const void *, size_t, struct sockaddr *,
- int, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t));
+ssize_t
+sctp_sendmsgx(int sd, const void *, size_t, struct sockaddr *,
+ int, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t);
- sctp_assoc_t sctp_getassocid __P((int, struct sockaddr *));
+sctp_assoc_t sctp_getassocid(int, struct sockaddr *);
/* deprecated */
- ssize_t sctp_recvmsg __P((int, void *, size_t, struct sockaddr *, socklen_t *,
- struct sctp_sndrcvinfo *, int *));
+ssize_t
+sctp_recvmsg(int, void *, size_t, struct sockaddr *, socklen_t *,
+ struct sctp_sndrcvinfo *, int *);
- ssize_t sctp_sendv __P((int, const struct iovec *, int, struct sockaddr *,
- int, void *, socklen_t, unsigned int, int));
+ssize_t
+sctp_sendv(int, const struct iovec *, int, struct sockaddr *,
+ int, void *, socklen_t, unsigned int, int);
- ssize_t sctp_recvv __P((int, const struct iovec *, int, struct sockaddr *,
- socklen_t *, void *, socklen_t *, unsigned int *, int *));
+ssize_t
+sctp_recvv(int, const struct iovec *, int, struct sockaddr *,
+ socklen_t *, void *, socklen_t *, unsigned int *, int *);
__END_DECLS
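The __P() wrapper being dropped above dates from pre-ANSI compilers; the historical sys/cdefs.h definition was roughly as follows (background sketch, not part of this change):

#if defined(__STDC__) || defined(__cplusplus)
#define	__P(protos)	protos		/* ANSI C: keep the parameter list */
#else
#define	__P(protos)	()		/* traditional K&R C: drop it */
#endif

With K&R support long gone, spelling the ANSI prototypes directly is equivalent and easier to read.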
diff --git a/freebsd/sys/netinet/sctp_usrreq.c b/freebsd/sys/netinet/sctp_usrreq.c
index 527790ce..81db1dc1 100644
--- a/freebsd/sys/netinet/sctp_usrreq.c
+++ b/freebsd/sys/netinet/sctp_usrreq.c
@@ -2054,18 +2054,29 @@ flags_out:
}
case SCTP_MAX_BURST:
{
- uint8_t *value;
+ struct sctp_assoc_value *av;
- SCTP_CHECK_AND_CAST(value, optval, uint8_t, *optsize);
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
- SCTP_INP_RLOCK(inp);
- if (inp->sctp_ep.max_burst < 256) {
- *value = inp->sctp_ep.max_burst;
+ if (stcb) {
+ av->assoc_value = stcb->asoc.max_burst;
+ SCTP_TCB_UNLOCK(stcb);
} else {
- *value = 255;
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC)) {
+ SCTP_INP_RLOCK(inp);
+ av->assoc_value = inp->sctp_ep.max_burst;
+ SCTP_INP_RUNLOCK(inp);
+ } else {
+ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+ error = EINVAL;
+ }
+ }
+ if (error == 0) {
+ *optsize = sizeof(struct sctp_assoc_value);
}
- SCTP_INP_RUNLOCK(inp);
- *optsize = sizeof(uint8_t);
break;
}
case SCTP_MAXSEG:
@@ -4378,13 +4389,34 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
}
case SCTP_MAX_BURST:
{
- uint8_t *burst;
+ struct sctp_assoc_value *av;
- SCTP_CHECK_AND_CAST(burst, optval, uint8_t, optsize);
+ SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+ SCTP_FIND_STCB(inp, stcb, av->assoc_id);
- SCTP_INP_WLOCK(inp);
- inp->sctp_ep.max_burst = *burst;
- SCTP_INP_WUNLOCK(inp);
+ if (stcb) {
+ stcb->asoc.max_burst = av->assoc_value;
+ SCTP_TCB_UNLOCK(stcb);
+ } else {
+ if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
+ (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
+ (av->assoc_id == SCTP_FUTURE_ASSOC) ||
+ (av->assoc_id == SCTP_ALL_ASSOC)) {
+ SCTP_INP_WLOCK(inp);
+ inp->sctp_ep.max_burst = av->assoc_value;
+ SCTP_INP_WUNLOCK(inp);
+ }
+ if ((av->assoc_id == SCTP_CURRENT_ASSOC) ||
+ (av->assoc_id == SCTP_ALL_ASSOC)) {
+ SCTP_INP_RLOCK(inp);
+ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
+ SCTP_TCB_LOCK(stcb);
+ stcb->asoc.max_burst = av->assoc_value;
+ SCTP_TCB_UNLOCK(stcb);
+ }
+ SCTP_INP_RUNLOCK(inp);
+ }
+ }
break;
}
case SCTP_MAXSEG:
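SCTP_MAX_BURST is now keyed by struct sctp_assoc_value rather than a single uint8_t, so the burst limit is no longer capped at 255 and can be applied per association or as an endpoint default. A usage sketch, assuming an SCTP socket sd (the wrapper name and burst value are illustrative; SCTP_FUTURE_ASSOC is the endpoint-default selector used in the code above):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/sctp.h>
#include <stdint.h>

static int
set_max_burst(int sd, uint32_t burst)
{
	struct sctp_assoc_value av;

	av.assoc_id = SCTP_FUTURE_ASSOC;	/* endpoint default */
	av.assoc_value = burst;
	return (setsockopt(sd, IPPROTO_SCTP, SCTP_MAX_BURST,
	    &av, sizeof(av)));
}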
diff --git a/freebsd/sys/netinet/sctp_var.h b/freebsd/sys/netinet/sctp_var.h
index 3862b90b..d88a2376 100644
--- a/freebsd/sys/netinet/sctp_var.h
+++ b/freebsd/sys/netinet/sctp_var.h
@@ -321,48 +321,34 @@ struct sctphdr;
void sctp_close(struct socket *so);
int sctp_disconnect(struct socket *so);
+void sctp_ctlinput(int, struct sockaddr *, void *);
+int sctp_ctloutput(struct socket *, struct sockopt *);
-void sctp_ctlinput __P((int, struct sockaddr *, void *));
-int sctp_ctloutput __P((struct socket *, struct sockopt *));
-
-#ifdef INET
-void sctp_input_with_port __P((struct mbuf *, int, uint16_t));
-
-#endif
#ifdef INET
-void sctp_input __P((struct mbuf *, int));
+void sctp_input_with_port(struct mbuf *, int, uint16_t);
+void sctp_input(struct mbuf *, int);
#endif
-void sctp_pathmtu_adjustment __P((struct sctp_tcb *, uint16_t));
-void sctp_drain __P((void));
-void sctp_init __P((void));
-
+void sctp_pathmtu_adjustment(struct sctp_tcb *, uint16_t);
+void sctp_drain(void);
+void sctp_init(void);
void sctp_finish(void);
-
int sctp_flush(struct socket *, int);
-int sctp_shutdown __P((struct socket *));
-void sctp_notify
-__P((struct sctp_inpcb *, struct ip *ip, struct sctphdr *,
+int sctp_shutdown(struct socket *);
+void
+sctp_notify(struct sctp_inpcb *, struct ip *ip, struct sctphdr *,
struct sockaddr *, struct sctp_tcb *,
- struct sctp_nets *));
-
- int sctp_bindx(struct socket *, int, struct sockaddr_storage *,
- int, int, struct proc *);
+ struct sctp_nets *);
+int
+sctp_bindx(struct socket *, int, struct sockaddr_storage *,
+ int, int, struct proc *);
/* can't use sctp_assoc_t here */
- int sctp_peeloff(struct socket *, struct socket *, int, caddr_t, int *);
-
- int sctp_ingetaddr(struct socket *,
- struct sockaddr **
-);
-
- int sctp_peeraddr(struct socket *,
- struct sockaddr **
-);
-
- int sctp_listen(struct socket *, int, struct thread *);
-
- int sctp_accept(struct socket *, struct sockaddr **);
+int sctp_peeloff(struct socket *, struct socket *, int, caddr_t, int *);
+int sctp_ingetaddr(struct socket *, struct sockaddr **);
+int sctp_peeraddr(struct socket *, struct sockaddr **);
+int sctp_listen(struct socket *, int, struct thread *);
+int sctp_accept(struct socket *, struct sockaddr **);
#endif /* _KERNEL */
diff --git a/freebsd/sys/netinet/sctputil.c b/freebsd/sys/netinet/sctputil.c
index 3a88b894..15928d8b 100644
--- a/freebsd/sys/netinet/sctputil.c
+++ b/freebsd/sys/netinet/sctputil.c
@@ -2690,8 +2690,14 @@ set_error:
stcb->sctp_socket->so_error = ECONNRESET;
}
} else {
- SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNABORTED);
- stcb->sctp_socket->so_error = ECONNABORTED;
+ if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_WAIT) ||
+ (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED)) {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ETIMEDOUT);
+ stcb->sctp_socket->so_error = ETIMEDOUT;
+ } else {
+ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNABORTED);
+ stcb->sctp_socket->so_error = ECONNABORTED;
+ }
}
}
/* Wake ANY sleepers */
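For applications this means an association aborted while still in COOKIE_WAIT or COOKIE_ECHOED now reports ETIMEDOUT instead of ECONNABORTED, matching what TCP does for a failed handshake. A sketch, assuming a blocking one-to-one SCTP socket sd and a peer address sin prepared by the caller (both hypothetical here):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdio.h>

static int
sctp_connect_report(int sd, const struct sockaddr_in *sin)
{
	if (connect(sd, (const struct sockaddr *)sin, sizeof(*sin)) == -1) {
		if (errno == ETIMEDOUT)
			fprintf(stderr, "association setup timed out\n");
		else
			perror("connect");
		return (-1);
	}
	return (0);
}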
@@ -3532,8 +3538,8 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb,
if (stcb->sctp_socket->so_rcv.sb_state & SBS_CANTRCVMORE) {
return;
}
- if (stcb && ((stcb->asoc.state & SCTP_STATE_COOKIE_WAIT) ||
- (stcb->asoc.state & SCTP_STATE_COOKIE_ECHOED))) {
+ if ((stcb->asoc.state & SCTP_STATE_COOKIE_WAIT) ||
+ (stcb->asoc.state & SCTP_STATE_COOKIE_ECHOED)) {
if ((notification == SCTP_NOTIFY_INTERFACE_DOWN) ||
(notification == SCTP_NOTIFY_INTERFACE_UP) ||
(notification == SCTP_NOTIFY_INTERFACE_CONFIRMED)) {
@@ -3607,16 +3613,16 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb,
break;
}
case SCTP_NOTIFY_ASSOC_LOC_ABORTED:
- if ((stcb) && (((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_WAIT) ||
- ((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_ECHOED))) {
+ if (((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_WAIT) ||
+ ((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_ECHOED)) {
sctp_notify_assoc_change(SCTP_CANT_STR_ASSOC, stcb, error, data, 0, so_locked);
} else {
sctp_notify_assoc_change(SCTP_COMM_LOST, stcb, error, data, 0, so_locked);
}
break;
case SCTP_NOTIFY_ASSOC_REM_ABORTED:
- if ((stcb) && (((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_WAIT) ||
- ((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_ECHOED))) {
+ if (((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_WAIT) ||
+ ((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_ECHOED)) {
sctp_notify_assoc_change(SCTP_CANT_STR_ASSOC, stcb, error, data, 1, so_locked);
} else {
sctp_notify_assoc_change(SCTP_COMM_LOST, stcb, error, data, 1, so_locked);
@@ -3969,7 +3975,7 @@ sctp_abort_an_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
if (stcb == NULL) {
/* Got to have a TCB */
if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) {
- if (LIST_FIRST(&inp->sctp_asoc_list) == NULL) {
+ if (LIST_EMPTY(&inp->sctp_asoc_list)) {
sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
SCTP_CALLED_DIRECTLY_NOCMPSET);
}
@@ -4024,7 +4030,7 @@ sctp_handle_ootb(struct mbuf *m, int iphlen, int offset,
SCTP_STAT_INCR_COUNTER32(sctps_outoftheblue);
/* Generate a TO address for future reference */
if (inp && (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
- if (LIST_FIRST(&inp->sctp_asoc_list) == NULL) {
+ if (LIST_EMPTY(&inp->sctp_asoc_list)) {
sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
SCTP_CALLED_DIRECTLY_NOCMPSET);
}
diff --git a/freebsd/sys/netinet/tcp.h b/freebsd/sys/netinet/tcp.h
index affc4df0..5925b4da 100644
--- a/freebsd/sys/netinet/tcp.h
+++ b/freebsd/sys/netinet/tcp.h
@@ -34,6 +34,7 @@
#define _NETINET_TCP_H_
#include <sys/cdefs.h>
+#include <rtems/bsd/sys/types.h>
#if __BSD_VISIBLE
@@ -52,11 +53,11 @@ struct tcphdr {
tcp_seq th_seq; /* sequence number */
tcp_seq th_ack; /* acknowledgement number */
#if BYTE_ORDER == LITTLE_ENDIAN
- u_int th_x2:4, /* (unused) */
+ u_char th_x2:4, /* (unused) */
th_off:4; /* data offset */
#endif
#if BYTE_ORDER == BIG_ENDIAN
- u_int th_off:4, /* data offset */
+ u_char th_off:4, /* data offset */
th_x2:4; /* (unused) */
#endif
u_char th_flags;
@@ -103,29 +104,37 @@ struct tcphdr {
/*
- * Default maximum segment size for TCP.
- * With an IP MTU of 576, this is 536,
- * but 512 is probably more convenient.
- * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)).
- */
-#define TCP_MSS 512
-/*
- * TCP_MINMSS is defined to be 216 which is fine for the smallest
- * link MTU (256 bytes, AX.25 packet radio) in the Internet.
- * However it is very unlikely to come across such low MTU interfaces
- * these days (anno dato 2003).
- * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments.
- * Setting this to "0" disables the minmss check.
+ * The default maximum segment size (MSS) to be used for new TCP connections
+ * when path MTU discovery is not enabled.
+ *
+ * RFC879 derives the default MSS from the largest datagram size hosts are
+ * minimally required to handle directly or through IP reassembly minus the
+ * size of the IP and TCP header. With IPv6 the minimum MTU is specified
+ * in RFC2460.
+ *
+ * For IPv4 the MSS is 576 - sizeof(struct tcpiphdr)
+ * For IPv6 the MSS is IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct tcphdr)
+ *
+ * We use explicit numerical definition here to avoid header pollution.
*/
-#define TCP_MINMSS 216
+#define TCP_MSS 536
+#define TCP6_MSS 1220
/*
- * Default maximum segment size for TCP6.
- * With an IP6 MSS of 1280, this is 1220,
- * but 1024 is probably more convenient. (xxx kazu in doubt)
- * This should be defined as MIN(1024, IP6_MSS - sizeof (struct tcpip6hdr))
+ * Limit the lowest MSS we accept for path MTU discovery and the TCP SYN MSS
+ * option. Allowing low values of MSS can consume significant resources and
+ * be used to mount a resource exhaustion attack.
+ * Connections requesting lower MSS values will be rounded up to this value
+ * and the IP_DF flag will be cleared to allow fragmentation along the path.
+ *
+ * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments. Setting
+ * it to "0" disables the minmss check.
+ *
+ * The default value is fine for TCP across the Internet's smallest official
+ * link MTU (256 bytes for AX.25 packet radio). However, a connection is very
+ * unlikely to come across such low MTU interfaces these days (anno domini 2003).
*/
-#define TCP6_MSS 1024
+#define TCP_MINMSS 216
#define TCP_MAXWIN 65535 /* largest value for (unscaled) window */
#define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */
@@ -152,6 +161,10 @@ struct tcphdr {
#define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */
#define TCP_INFO 32 /* retrieve tcp_info structure */
#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
+#define TCP_KEEPINIT 128 /* N, time to establish connection */
+#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */
+#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */
+#define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
@@ -218,7 +231,7 @@ struct tcp_info {
/* FreeBSD extensions to tcp_info. */
u_int32_t tcpi_snd_wnd; /* Advertised send window. */
- u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. */
+ u_int32_t tcpi_snd_bwnd; /* No longer used. */
u_int32_t tcpi_snd_nxt; /* Next egress seqno */
u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */
u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */
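The rewritten MSS comments also fix the numbers: 576 - sizeof(struct tcpiphdr) gives 536 for IPv4, and IPV6_MMTU (1280) minus the 40-byte IPv6 header and 20-byte TCP header gives 1220, replacing the older "convenient" 512/1024 values. The four TCP_KEEP* options added above expose per-connection keepalive tuning that previously required global sysctls. A usage sketch, assuming a connected TCP socket sd (the chosen values are illustrative; idle and interval are in seconds, TCP_KEEPCNT is a probe count):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int
tune_keepalive(int sd)
{
	int on = 1, idle = 120, intvl = 10, cnt = 5;

	if (setsockopt(sd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) == -1)
		return (-1);
	if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) == -1)
		return (-1);
	if (setsockopt(sd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) == -1)
		return (-1);
	return (setsockopt(sd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt)));
}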
diff --git a/freebsd/sys/netinet/tcp_hostcache.c b/freebsd/sys/netinet/tcp_hostcache.c
index a0d38ff7..ee98af3f 100644
--- a/freebsd/sys/netinet/tcp_hostcache.c
+++ b/freebsd/sys/netinet/tcp_hostcache.c
@@ -120,7 +120,7 @@ static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
static void tcp_hc_purge_internal(int);
static void tcp_hc_purge(void *);
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0,
+static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0,
"TCP Host cache");
SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c
index 25afbb26..50dfc1ce 100644
--- a/freebsd/sys/netinet/tcp_input.c
+++ b/freebsd/sys/netinet/tcp_input.c
@@ -7,6 +7,7 @@
* Swinburne University of Technology, Melbourne, Australia.
* Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
* Copyright (c) 2010 The FreeBSD Foundation
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
* All rights reserved.
*
* Portions of this software were developed at the Centre for Advanced Internet
@@ -18,6 +19,9 @@
* Internet Architectures, Swinburne University of Technology, Melbourne,
* Australia by David Hayes under sponsorship from the FreeBSD Foundation.
*
+ * Portions of this software were developed by Robert N. M. Watson under
+ * contract to Juniper Networks, Inc.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -103,6 +107,9 @@ __FBSDID("$FreeBSD$");
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
@@ -153,6 +160,14 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
&VNET_NAME(tcp_do_rfc3390), 0,
"Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, experimental, CTLFLAG_RW, 0,
+ "Experimental TCP extensions");
+
+VNET_DEFINE(int, tcp_do_initcwnd10) = 0;
+SYSCTL_VNET_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_initcwnd10), 0,
+ "Enable draft-ietf-tcpm-initcwnd-05 (Increasing initial CWND to 10)");
+
VNET_DEFINE(int, tcp_do_rfc3465) = 1;
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
&VNET_NAME(tcp_do_rfc3465), 0,
@@ -163,7 +178,7 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
&VNET_NAME(tcp_abc_l_var), 2,
"Cap the max cwnd increment during slow-start to this number of segments");
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
+static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
VNET_DEFINE(int, tcp_do_ecn) = 0;
SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
@@ -181,6 +196,11 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
&VNET_NAME(tcp_insecure_rst), 0,
"Follow the old (insecure) criteria for accepting RST packets");
+VNET_DEFINE(int, tcp_recvspace) = 1024*64;
+#define V_tcp_recvspace VNET(tcp_recvspace)
+SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
+ &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size");
+
VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
@@ -193,16 +213,12 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
&VNET_NAME(tcp_autorcvbuf_inc), 0,
"Incrementor step size of automatic receive buffer");
-VNET_DEFINE(int, tcp_autorcvbuf_max) = 256*1024;
+VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024;
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
&VNET_NAME(tcp_autorcvbuf_max), 0,
"Max size of automatic receive buffer");
-int tcp_read_locking = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW,
- &tcp_read_locking, 0, "Enable read locking strategy");
-
VNET_DEFINE(struct inpcbhead, tcb);
#define tcb6 tcb /* for KAME src sync over BSD*'s */
VNET_DEFINE(struct inpcbinfo, tcbinfo);
@@ -217,18 +233,18 @@ static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static void tcp_xmit_timer(struct tcpcb *, int);
static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
-static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
- uint16_t type);
-static void inline cc_conn_init(struct tcpcb *tp);
-static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
static void inline tcp_fields_to_host(struct tcphdr *);
-static void inline hhook_run_tcp_est_in(struct tcpcb *tp,
- struct tcphdr *th, struct tcpopt *to);
#ifdef TCP_SIGNATURE
static void inline tcp_fields_to_net(struct tcphdr *);
static int inline tcp_signature_verify_input(struct mbuf *, int, int,
int, struct tcpopt *, struct tcphdr *, u_int);
#endif
+static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
+ uint16_t type);
+static void inline cc_conn_init(struct tcpcb *tp);
+static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
+static void inline hhook_run_tcp_est_in(struct tcpcb *tp,
+ struct tcphdr *th, struct tcpopt *to);
/*
* Kernel module interface for updating tcpstat. The argument is an index
@@ -271,7 +287,7 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
INP_WLOCK_ASSERT(tp->t_inpcb);
tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
- if (tp->snd_cwnd == min(tp->snd_cwnd, tp->snd_wnd))
+ if (tp->snd_cwnd <= tp->snd_wnd)
tp->ccv->flags |= CCF_CWND_LIMITED;
else
tp->ccv->flags &= ~CCF_CWND_LIMITED;
@@ -303,9 +319,6 @@ cc_conn_init(struct tcpcb *tp)
struct hc_metrics_lite metrics;
struct inpcb *inp = tp->t_inpcb;
int rtt;
-#ifdef INET6
- int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
-#endif
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -339,44 +352,33 @@ cc_conn_init(struct tcpcb *tp)
}
/*
- * Set the slow-start flight size depending on whether this
- * is a local network or not.
- *
- * Extend this so we cache the cwnd too and retrieve it here.
- * Make cwnd even bigger than RFC3390 suggests but only if we
- * have previous experience with the remote host. Be careful
- * not make cwnd bigger than remote receive window or our own
- * send socket buffer. Maybe put some additional upper bound
- * on the retrieved cwnd. Should do incremental updates to
- * hostcache when cwnd collapses so next connection doesn't
- * overloads the path again.
+ * Set the initial slow-start flight size.
*
- * XXXAO: Initializing the CWND from the hostcache is broken
- * and in its current form not RFC conformant. It is disabled
- * until fixed or removed entirely.
+ * RFC5681 Section 3.1 specifies the default conservative values.
+ * RFC3390 specifies slightly more aggressive values.
+ * Draft-ietf-tcpm-initcwnd-05 increases it to ten segments.
*
- * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
- * We currently check only in syncache_socket for that.
+ * If a SYN or SYN/ACK was lost and retransmitted, we have to
+ * reduce the initial CWND to one segment as congestion is likely
+ * requiring us to be cautious.
*/
-/* #define TCP_METRICS_CWND */
-#ifdef TCP_METRICS_CWND
- if (metrics.rmx_cwnd)
- tp->snd_cwnd = max(tp->t_maxseg, min(metrics.rmx_cwnd / 2,
- min(tp->snd_wnd, so->so_snd.sb_hiwat)));
- else
-#endif
- if (V_tcp_do_rfc3390)
+ if (tp->snd_cwnd == 1)
+ tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */
+ else if (V_tcp_do_initcwnd10)
+ tp->snd_cwnd = min(10 * tp->t_maxseg,
+ max(2 * tp->t_maxseg, 14600));
+ else if (V_tcp_do_rfc3390)
tp->snd_cwnd = min(4 * tp->t_maxseg,
max(2 * tp->t_maxseg, 4380));
-#ifdef INET6
- else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
- (!isipv6 && in_localaddr(inp->inp_faddr)))
-#else
- else if (in_localaddr(inp->inp_faddr))
-#endif
- tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local;
- else
- tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz;
+ else {
+ /* Per RFC5681 Section 3.1 */
+ if (tp->t_maxseg > 2190)
+ tp->snd_cwnd = 2 * tp->t_maxseg;
+ else if (tp->t_maxseg > 1095)
+ tp->snd_cwnd = 3 * tp->t_maxseg;
+ else
+ tp->snd_cwnd = 4 * tp->t_maxseg;
+ }
if (CC_ALGO(tp)->conn_init != NULL)
CC_ALGO(tp)->conn_init(tp->ccv);
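For a common Ethernet-derived MSS of 1460 bytes the branches above work out as follows (worked example only, mirroring the code):

/*
 *   SYN(-ACK) was retransmitted : snd_cwnd = 1 * 1460                  =  1460
 *   initcwnd10 enabled          : min(10 * 1460, max(2 * 1460, 14600)) = 14600
 *   RFC3390 enabled             : min( 4 * 1460, max(2 * 1460,  4380)) =  4380
 *   RFC5681 default             : 1095 < 1460 <= 2190, so 3 * 1460     =  4380
 */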
@@ -546,43 +548,44 @@ tcp6_input(struct mbuf **mp, int *offp, int proto)
tcp_input(m, *offp);
return IPPROTO_DONE;
}
-#endif
+#endif /* INET6 */
void
tcp_input(struct mbuf *m, int off0)
{
- struct tcphdr *th;
+ struct tcphdr *th = NULL;
struct ip *ip = NULL;
+#ifdef INET
struct ipovly *ipov;
+#endif
struct inpcb *inp = NULL;
struct tcpcb *tp = NULL;
struct socket *so = NULL;
u_char *optp = NULL;
int optlen = 0;
- int len, tlen, off;
+#ifdef INET
+ int len;
+#endif
+ int tlen = 0, off;
int drop_hdrlen;
int thflags;
int rstreason = 0; /* For badport_bandlim accounting purposes */
- uint8_t iptos;
#ifdef TCP_SIGNATURE
uint8_t sig_checked = 0;
#endif
-#ifdef IPFIREWALL_FORWARD
- struct m_tag *fwd_tag;
-#endif
+ uint8_t iptos = 0;
+ struct m_tag *fwd_tag = NULL;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int isipv6;
#else
const void *ip6 = NULL;
- const int isipv6 = 0;
-#endif
+#endif /* INET6 */
struct tcpopt to; /* options in this segment */
char *s = NULL; /* address and port logging */
int ti_locked;
#define TI_UNLOCKED 1
-#define TI_RLOCKED 2
-#define TI_WLOCKED 3
+#define TI_WLOCKED 2
#ifdef TCPDEBUG
/*
@@ -601,16 +604,34 @@ tcp_input(struct mbuf *m, int off0)
to.to_flags = 0;
TCPSTAT_INC(tcps_rcvtotal);
- if (isipv6) {
#ifdef INET6
+ if (isipv6) {
/* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */
+
+ if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
+ m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
+ if (m == NULL) {
+ TCPSTAT_INC(tcps_rcvshort);
+ return;
+ }
+ }
+
ip6 = mtod(m, struct ip6_hdr *);
+ th = (struct tcphdr *)((caddr_t)ip6 + off0);
tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
- if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in6_cksum_pseudo(ip6, tlen,
+ IPPROTO_TCP, m->m_pkthdr.csum_data);
+ th->th_sum ^= 0xffff;
+ } else
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen);
+ if (th->th_sum) {
TCPSTAT_INC(tcps_rcvbadsum);
goto drop;
}
- th = (struct tcphdr *)((caddr_t)ip6 + off0);
/*
* Be proactive about unspecified IPv6 address in source.
@@ -624,10 +645,13 @@ tcp_input(struct mbuf *m, int off0)
/* XXX stat */
goto drop;
}
-#else
- th = NULL; /* XXX: Avoid compiler warning. */
+ }
#endif
- } else {
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ {
/*
* Get IP and TCP header together in first mbuf.
* Note: IP leaves IP header in first mbuf.
@@ -679,13 +703,18 @@ tcp_input(struct mbuf *m, int off0)
/* Re-initialization for later version check */
ip->ip_v = IPVERSION;
}
+#endif /* INET */
#ifdef INET6
if (isipv6)
iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+#endif
+#if defined(INET) && defined(INET6)
else
#endif
+#ifdef INET
iptos = ip->ip_tos;
+#endif
/*
* Check that TCP offset makes sense,
@@ -698,13 +727,18 @@ tcp_input(struct mbuf *m, int off0)
}
tlen -= off; /* tlen is used instead of ti->ti_len */
if (off > sizeof (struct tcphdr)) {
- if (isipv6) {
#ifdef INET6
+ if (isipv6) {
IP6_EXTHDR_CHECK(m, off0, off, );
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)((caddr_t)ip6 + off0);
+ }
#endif
- } else {
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ {
if (m->m_len < sizeof(struct ip) + off) {
if ((m = m_pullup(m, sizeof (struct ip) + off))
== NULL) {
@@ -716,6 +750,7 @@ tcp_input(struct mbuf *m, int off0)
th = (struct tcphdr *)((caddr_t)ip + off0);
}
}
+#endif
optlen = off - sizeof (struct tcphdr);
optp = (u_char *)(th + 1);
}
@@ -732,39 +767,83 @@ tcp_input(struct mbuf *m, int off0)
drop_hdrlen = off0 + off;
/*
- * Locate pcb for segment, which requires a lock on tcbinfo.
- * Optimisticaly acquire a global read lock rather than a write lock
- * unless header flags necessarily imply a state change. There are
- * two cases where we might discover later we need a write lock
- * despite the flags: ACKs moving a connection out of the syncache,
- * and ACKs for a connection in TIMEWAIT.
+ * Locate pcb for segment; if we're likely to add or remove a
+ * connection then first acquire pcbinfo lock. There are two cases
+ * where we might discover later we need a write lock despite the
+ * flags: ACKs moving a connection out of the syncache, and ACKs for
+ * a connection in TIMEWAIT.
*/
- if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
- tcp_read_locking == 0) {
+ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) {
INP_INFO_WLOCK(&V_tcbinfo);
ti_locked = TI_WLOCKED;
- } else {
- INP_INFO_RLOCK(&V_tcbinfo);
- ti_locked = TI_RLOCKED;
- }
+ } else
+ ti_locked = TI_UNLOCKED;
findpcb:
#ifdef INVARIANTS
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED) {
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- else
- panic("%s: findpcb ti_locked %d\n", __func__, ti_locked);
+ } else {
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
#endif
-#ifdef IPFIREWALL_FORWARD
/*
* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
*/
- fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
+ if (
+#ifdef INET6
+ (isipv6 && (m->m_flags & M_IP6_NEXTHOP))
+#ifdef INET
+ || (!isipv6 && (m->m_flags & M_IP_NEXTHOP))
+#endif
+#endif
+#if defined(INET) && !defined(INET6)
+ (m->m_flags & M_IP_NEXTHOP)
+#endif
+ )
+ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
- if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */
+#ifdef INET6
+ if (isipv6 && fwd_tag != NULL) {
+ struct sockaddr_in6 *next_hop6;
+
+ next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
+ /*
+ * Transparently forwarded. Pretend to be the destination.
+ * Already got one like this?
+ */
+ inp = in6_pcblookup_mbuf(&V_tcbinfo,
+ &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport,
+ INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m);
+ if (!inp) {
+ /*
+ * It's new. Try to find the ambushing socket.
+ * Because we've rewritten the destination address,
+ * any hardware-generated hash is ignored.
+ */
+ inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src,
+ th->th_sport, &next_hop6->sin6_addr,
+ next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) :
+ th->th_dport, INPLOOKUP_WILDCARD |
+ INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
+ }
+ /* Remove the tag from the packet. We don't need it anymore. */
+ m_tag_delete(m, fwd_tag);
+ m->m_flags &= ~M_IP6_NEXTHOP;
+ fwd_tag = NULL;
+ } else if (isipv6) {
+ inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src,
+ th->th_sport, &ip6->ip6_dst, th->th_dport,
+ INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
+ m->m_pkthdr.rcvif, m);
+ }
+#endif /* INET6 */
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ if (fwd_tag != NULL) {
struct sockaddr_in *next_hop;
next_hop = (struct sockaddr_in *)(fwd_tag+1);
@@ -772,41 +851,31 @@ findpcb:
* Transparently forwarded. Pretend to be the destination.
* already got one like this?
*/
- inp = in_pcblookup_hash(&V_tcbinfo,
- ip->ip_src, th->th_sport,
- ip->ip_dst, th->th_dport,
- 0, m->m_pkthdr.rcvif);
+ inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport,
+ ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB,
+ m->m_pkthdr.rcvif, m);
if (!inp) {
- /* It's new. Try to find the ambushing socket. */
- inp = in_pcblookup_hash(&V_tcbinfo,
- ip->ip_src, th->th_sport,
- next_hop->sin_addr,
- next_hop->sin_port ?
- ntohs(next_hop->sin_port) :
- th->th_dport,
- INPLOOKUP_WILDCARD,
- m->m_pkthdr.rcvif);
+ /*
+ * It's new. Try to find the ambushing socket.
+ * Because we've rewritten the destination address,
+ * any hardware-generated hash is ignored.
+ */
+ inp = in_pcblookup(&V_tcbinfo, ip->ip_src,
+ th->th_sport, next_hop->sin_addr,
+ next_hop->sin_port ? ntohs(next_hop->sin_port) :
+ th->th_dport, INPLOOKUP_WILDCARD |
+ INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
}
/* Remove the tag from the packet. We don't need it anymore. */
m_tag_delete(m, fwd_tag);
+ m->m_flags &= ~M_IP_NEXTHOP;
+ fwd_tag = NULL;
} else
-#endif /* IPFIREWALL_FORWARD */
- {
- if (isipv6) {
-#ifdef INET6
- inp = in6_pcblookup_hash(&V_tcbinfo,
- &ip6->ip6_src, th->th_sport,
- &ip6->ip6_dst, th->th_dport,
- INPLOOKUP_WILDCARD,
- m->m_pkthdr.rcvif);
-#endif
- } else
- inp = in_pcblookup_hash(&V_tcbinfo,
- ip->ip_src, th->th_sport,
- ip->ip_dst, th->th_dport,
- INPLOOKUP_WILDCARD,
- m->m_pkthdr.rcvif);
- }
+ inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
+ th->th_sport, ip->ip_dst, th->th_dport,
+ INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
+ m->m_pkthdr.rcvif, m);
+#endif /* INET */
/*
* If the INPCB does not exist then all data in the incoming
@@ -835,7 +904,7 @@ findpcb:
rstreason = BANDLIM_RST_CLOSEDPORT;
goto dropwithreset;
}
- INP_WLOCK(inp);
+ INP_WLOCK_ASSERT(inp);
if (!(inp->inp_flags & INP_HW_FLOWID)
&& (m->m_flags & M_FLOWID)
&& ((inp->inp_socket == NULL)
@@ -847,12 +916,12 @@ findpcb:
#ifdef IPSEC
#ifdef INET6
if (isipv6 && ipsec6_in_reject(m, inp)) {
- V_ipsec6stat.in_polvio++;
+ IPSEC6STAT_INC(in_polvio);
goto dropunlock;
} else
#endif /* INET6 */
if (ipsec4_in_reject(m, inp) != 0) {
- V_ipsec4stat.in_polvio++;
+ IPSECSTAT_INC(in_polvio);
goto dropunlock;
}
#endif /* IPSEC */
@@ -876,28 +945,26 @@ findpcb:
* legitimate new connection attempt the old INPCB gets removed and
* we can try again to find a listening socket.
*
- * At this point, due to earlier optimism, we may hold a read lock on
- * the inpcbinfo, rather than a write lock. If so, we need to
- * upgrade, or if that fails, acquire a reference on the inpcb, drop
- * all locks, acquire a global write lock, and then re-acquire the
- * inpcb lock. We may at that point discover that another thread has
- * tried to free the inpcb, in which case we need to loop back and
- * try to find a new inpcb to deliver to.
+ * At this point, due to earlier optimism, we may hold only an inpcb
+ * lock, and not the inpcbinfo write lock. If so, we need to try to
+ * acquire it, or if that fails, acquire a reference on the inpcb,
+ * drop all locks, acquire a global write lock, and then re-acquire
+ * the inpcb lock. We may at that point discover that another thread
+ * has tried to free the inpcb, in which case we need to loop back
+ * and try to find a new inpcb to deliver to.
+ *
+ * XXXRW: It may be time to rethink timewait locking.
*/
relocked:
if (inp->inp_flags & INP_TIMEWAIT) {
- KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
- ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked));
-
- if (ti_locked == TI_RLOCKED) {
- if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) {
+ if (ti_locked == TI_UNLOCKED) {
+ if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
in_pcbref(inp);
INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
INP_INFO_WLOCK(&V_tcbinfo);
ti_locked = TI_WLOCKED;
INP_WLOCK(inp);
- if (in_pcbrele(inp)) {
+ if (in_pcbrele_wlocked(inp)) {
inp = NULL;
goto findpcb;
}
@@ -927,28 +994,34 @@ relocked:
goto dropwithreset;
}
+#ifdef TCP_OFFLOAD
+ if (tp->t_flags & TF_TOE) {
+ tcp_offload_input(tp, m);
+ m = NULL; /* consumed by the TOE driver */
+ goto dropunlock;
+ }
+#endif
+
/*
* We've identified a valid inpcb, but it could be that we need an
- * inpcbinfo write lock and have only a read lock. In this case,
- * attempt to upgrade/relock using the same strategy as the TIMEWAIT
- * case above. If we relock, we have to jump back to 'relocked' as
- * the connection might now be in TIMEWAIT.
+ * inpcbinfo write lock but don't hold it. In this case, attempt to
+ * acquire using the same strategy as the TIMEWAIT case above. If we
+ * relock, we have to jump back to 'relocked' as the connection might
+ * now be in TIMEWAIT.
*/
- if (tp->t_state != TCPS_ESTABLISHED ||
- (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
- tcp_read_locking == 0) {
- KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
- ("%s: upgrade check ti_locked %d", __func__, ti_locked));
-
- if (ti_locked == TI_RLOCKED) {
- if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) {
+#ifdef INVARIANTS
+ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0)
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+#endif
+ if (tp->t_state != TCPS_ESTABLISHED) {
+ if (ti_locked == TI_UNLOCKED) {
+ if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) {
in_pcbref(inp);
INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
INP_INFO_WLOCK(&V_tcbinfo);
ti_locked = TI_WLOCKED;
INP_WLOCK(inp);
- if (in_pcbrele(inp)) {
+ if (in_pcbrele_wlocked(inp)) {
inp = NULL;
goto findpcb;
}
@@ -969,25 +1042,28 @@ relocked:
#ifdef TCPDEBUG
if (so->so_options & SO_DEBUG) {
ostate = tp->t_state;
- if (isipv6) {
#ifdef INET6
+ if (isipv6) {
bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6));
-#endif
} else
+#endif
bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
tcp_savetcp = *th;
}
-#endif
+#endif /* TCPDEBUG */
/*
* When the socket is accepting connections (the INPCB is in LISTEN
* state) we look into the SYN cache if this is a new connection
- * attempt or the completion of a previous one.
+ * attempt or the completion of a previous one. Because listen
+ * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be
+ * held in this case.
*/
if (so->so_options & SO_ACCEPTCONN) {
struct in_conninfo inc;
KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but "
"tp not listening", __func__));
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
bzero(&inc, sizeof(inc));
#ifdef INET6
@@ -1151,7 +1227,7 @@ relocked:
"SYN|FIN segment ignored (based on "
"sysctl setting)\n", s, __func__);
TCPSTAT_INC(tcps_badsyn);
- goto dropunlock;
+ goto dropunlock;
}
/*
* Segment's flags are (SYN) or (SYN|FIN).
@@ -1213,7 +1289,7 @@ relocked:
if (ia6)
ifa_free(&ia6->ia_ifa);
}
-#endif
+#endif /* INET6 */
/*
* Basic sanity checks on incoming SYN requests:
* Don't respond if the destination is a link layer
@@ -1232,8 +1308,8 @@ relocked:
"link layer address ignored\n", s, __func__);
goto dropunlock;
}
- if (isipv6) {
#ifdef INET6
+ if (isipv6) {
if (th->th_dport == th->th_sport &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
@@ -1250,8 +1326,13 @@ relocked:
"address ignored\n", s, __func__);
goto dropunlock;
}
+ }
#endif
- } else {
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ {
if (th->th_dport == th->th_sport &&
ip->ip_dst.s_addr == ip->ip_src.s_addr) {
if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
@@ -1272,6 +1353,7 @@ relocked:
goto dropunlock;
}
}
+#endif
/*
* SYN appears to be valid. Create compressed TCP state
* for syncache.
@@ -1289,6 +1371,15 @@ relocked:
*/
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
return;
+ } else if (tp->t_state == TCPS_LISTEN) {
+ /*
+ * When a listen socket is torn down the SO_ACCEPTCONN
+ * flag is removed first while connections are drained
+ * from the accept queue in a unlock/lock cycle of the
+ * ACCEPT_LOCK, opening a race condition allowing a SYN
+ * attempt go through unhandled.
+ */
+ goto dropunlock;
}
#ifdef TCP_SIGNATURE
@@ -1320,13 +1411,17 @@ relocked:
return;
dropwithreset:
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED) {
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
- ti_locked = TI_UNLOCKED;
+ ti_locked = TI_UNLOCKED;
+ }
+#ifdef INVARIANTS
+ else {
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset "
+ "ti_locked: %d", __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
if (inp != NULL) {
tcp_dropwithreset(m, th, tp, tlen, rstreason);
@@ -1337,13 +1432,17 @@ dropwithreset:
goto drop;
dropunlock:
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED) {
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: dropunlock ti_locked %d", __func__, ti_locked);
- ti_locked = TI_UNLOCKED;
+ ti_locked = TI_UNLOCKED;
+ }
+#ifdef INVARIANTS
+ else {
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock "
+ "ti_locked: %d", __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
if (inp != NULL)
INP_WUNLOCK(inp);
@@ -1398,13 +1497,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
} else {
#ifdef INVARIANTS
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED)
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- else
- panic("%s: ti_locked %d for EST", __func__,
- ti_locked);
+ else {
+ KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
+ "ti_locked: %d", __func__, ti_locked));
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ }
#endif
}
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -1421,7 +1520,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
tp->t_rcvtime = ticks;
if (TCPS_HAVEESTABLISHED(tp->t_state))
- tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
+ tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
/*
* Unscale the window into a 32-bit value.
@@ -1550,13 +1649,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
/*
* This is a pure ack for outstanding data.
*/
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED)
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: ti_locked %d on pure ACK",
- __func__, ti_locked);
ti_locked = TI_UNLOCKED;
TCPSTAT_INC(tcps_predack);
@@ -1595,7 +1689,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_xmit_timer(tp,
ticks - tp->t_rtttime);
}
- tcp_xmit_bandwidth_limit(tp, th->th_ack);
acked = BYTES_THIS_ACK(tp, th);
/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
@@ -1660,13 +1753,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* nothing on the reassembly queue and we have enough
* buffer space to take it.
*/
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED)
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: ti_locked %d on pure data "
- "segment", __func__, ti_locked);
ti_locked = TI_UNLOCKED;
/* Clean receiver SACK report if present */
@@ -1877,7 +1965,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
} else {
tp->t_state = TCPS_ESTABLISHED;
cc_conn_init(tp);
- tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
+ tcp_timer_activate(tp, TT_KEEP,
+ TP_KEEPIDLE(tp));
}
} else {
/*
@@ -2281,7 +2370,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
} else {
tp->t_state = TCPS_ESTABLISHED;
cc_conn_init(tp);
- tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
+ tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
}
/*
* If segment contains data or ACK, will call tcp_reass()
@@ -2362,7 +2451,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
/*
* Compute the amount of data in flight first.
* We can inject new data into the pipe iff
- * we have less than 1/2 the original window's
+ * we have less than 1/2 the original window's
* worth of data in flight.
*/
awnd = (tp->snd_nxt - tp->snd_fack) +
@@ -2448,6 +2537,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
u_long oldcwnd = tp->snd_cwnd;
tcp_seq oldsndmax = tp->snd_max;
u_int sent;
+ int avail;
KASSERT(tp->t_dupacks == 1 ||
tp->t_dupacks == 2,
@@ -2469,7 +2559,17 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
break;
}
- (void) tcp_output(tp);
+ /*
+ * Only call tcp_output when there
+ * is new data available to be sent.
+ * Otherwise we would send pure ACKs.
+ */
+ SOCKBUF_LOCK(&so->so_snd);
+ avail = so->so_snd.sb_cc -
+ (tp->snd_nxt - tp->snd_una);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (avail > 0)
+ (void) tcp_output(tp);
sent = tp->snd_max - oldsndmax;
if (sent > tp->t_maxseg) {
KASSERT((tp->t_dupacks == 2 &&
@@ -2529,9 +2629,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
process_ACK:
- INP_INFO_LOCK_ASSERT(&V_tcbinfo);
- KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
- ("tcp_input: process_ACK ti_locked %d", ti_locked));
INP_WLOCK_ASSERT(tp->t_inpcb);
acked = BYTES_THIS_ACK(tp, th);
@@ -2575,7 +2672,6 @@ process_ACK:
tp->t_rttlow = ticks - tp->t_rtttime;
tcp_xmit_timer(tp, ticks - tp->t_rtttime);
}
- tcp_xmit_bandwidth_limit(tp, th->th_ack);
/*
* If all outstanding data is acked, stop retransmit
@@ -2654,12 +2750,11 @@ process_ACK:
* compressed state.
*/
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
- int timeout;
-
soisdisconnected(so);
- timeout = (tcp_fast_finwait2_recycle) ?
- tcp_finwait2_timeout : tcp_maxidle;
- tcp_timer_activate(tp, TT_2MSL, timeout);
+ tcp_timer_activate(tp, TT_2MSL,
+ (tcp_fast_finwait2_recycle ?
+ tcp_finwait2_timeout :
+ TP_MAXIDLE(tp)));
}
tp->t_state = TCPS_FIN_WAIT_2;
}
@@ -2698,9 +2793,6 @@ process_ACK:
}
step6:
- INP_INFO_LOCK_ASSERT(&V_tcbinfo);
- KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
- ("tcp_do_segment: step6 ti_locked %d", ti_locked));
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
@@ -2786,9 +2878,6 @@ step6:
tp->rcv_up = tp->rcv_nxt;
}
dodata: /* XXX */
- INP_INFO_LOCK_ASSERT(&V_tcbinfo);
- KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
- ("tcp_do_segment: dodata ti_locked %d", ti_locked));
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
@@ -2920,13 +3009,8 @@ dodata: /* XXX */
return;
}
}
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED)
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: dodata epilogue ti_locked %d", __func__,
- ti_locked);
ti_locked = TI_UNLOCKED;
#ifdef TCPDEBUG
@@ -2955,9 +3039,6 @@ check_delack:
return;
dropafterack:
- KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
- ("tcp_do_segment: dropafterack ti_locked %d", ti_locked));
-
/*
* Generate an ACK dropping incoming segment if it occupies
* sequence space, where the ACK reflects our state.
@@ -2984,13 +3065,8 @@ dropafterack:
tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED)
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: dropafterack epilogue ti_locked %d", __func__,
- ti_locked);
ti_locked = TI_UNLOCKED;
tp->t_flags |= TF_ACKNOW;
@@ -3000,12 +3076,8 @@ dropafterack:
return;
dropwithreset:
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED)
INP_INFO_WUNLOCK(&V_tcbinfo);
- else
- panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
ti_locked = TI_UNLOCKED;
if (tp != NULL) {
@@ -3016,15 +3088,14 @@ dropwithreset:
return;
drop:
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK(&V_tcbinfo);
- else if (ti_locked == TI_WLOCKED)
+ if (ti_locked == TI_WLOCKED) {
INP_INFO_WUNLOCK(&V_tcbinfo);
+ ti_locked = TI_UNLOCKED;
+ }
#ifdef INVARIANTS
else
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
#endif
- ti_locked = TI_UNLOCKED;
/*
* Drop space held by incoming segment and return.
@@ -3048,7 +3119,9 @@ static void
tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
int tlen, int rstreason)
{
+#ifdef INET
struct ip *ip;
+#endif
#ifdef INET6
struct ip6_hdr *ip6;
#endif
@@ -3067,8 +3140,12 @@ tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
goto drop;
/* IPv6 anycast check is done at tcp6_input() */
- } else
+ }
#endif
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
{
ip = mtod(m, struct ip *);
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
@@ -3077,6 +3154,7 @@ tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
goto drop;
}
+#endif
/* Perform bandwidth limiting. */
if (badport_bandlim(rstreason) < 0)
@@ -3307,10 +3385,8 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt)
/*
* Determine a reasonable value for maxseg size.
* If the route is known, check route for mtu.
- * If none, use an mss that can be handled on the outgoing
- * interface without forcing IP to fragment; if bigger than
- * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
- * to utilize large mbufs. If no route is found, route has no mtu,
+ * If none, use an mss that can be handled on the outgoing interface
+ * without forcing IP to fragment. If no route is found, route has no mtu,
* or the destination isn't local, use a default, hopefully conservative
* size (usually 512 or the default IP max size, but no more than the mtu
* of the interface), as we can't discover anything about intervening
@@ -3331,10 +3407,10 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt)
*/
void
tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
- struct hc_metrics_lite *metricptr, int *mtuflags)
+ struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap)
{
- int mss;
- u_long maxmtu;
+ int mss = 0;
+ u_long maxmtu = 0;
struct inpcb *inp = tp->t_inpcb;
struct hc_metrics_lite metrics;
int origoffer;
@@ -3358,14 +3434,19 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
/* Initialize. */
#ifdef INET6
if (isipv6) {
- maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags);
+ maxmtu = tcp_maxmtu6(&inp->inp_inc, cap);
tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
- } else
+ }
+#endif
+#if defined(INET) && defined(INET6)
+ else
#endif
+#ifdef INET
{
- maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags);
+ maxmtu = tcp_maxmtu(&inp->inp_inc, cap);
tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
}
+#endif
/*
* No route to sender, stay with default mss and return.
@@ -3426,14 +3507,19 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
if (!V_path_mtu_discovery &&
!in6_localaddr(&inp->in6p_faddr))
mss = min(mss, V_tcp_v6mssdflt);
- } else
+ }
+#endif
+#if defined(INET) && defined(INET6)
+ else
#endif
+#ifdef INET
{
mss = maxmtu - min_protoh;
if (!V_path_mtu_discovery &&
!in_localaddr(inp->inp_faddr))
mss = min(mss, V_tcp_mssdflt);
}
+#endif
/*
* XXX - The above conditional (mss = maxmtu - min_protoh)
* probably violates the TCP spec.
@@ -3481,13 +3567,6 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
(tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
mss -= TCPOLEN_TSTAMP_APPA;
-#if (MCLBYTES & (MCLBYTES - 1)) == 0
- if (mss > MCLBYTES)
- mss &= ~(MCLBYTES-1);
-#else
- if (mss > MCLBYTES)
- mss = mss / MCLBYTES * MCLBYTES;
-#endif
tp->t_maxseg = mss;
}
@@ -3499,11 +3578,12 @@ tcp_mss(struct tcpcb *tp, int offer)
struct inpcb *inp;
struct socket *so;
struct hc_metrics_lite metrics;
- int mtuflags = 0;
+ struct tcp_ifcap cap;
KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
-
- tcp_mss_update(tp, offer, -1, &metrics, &mtuflags);
+
+ bzero(&cap, sizeof(cap));
+ tcp_mss_update(tp, offer, -1, &metrics, &cap);
mss = tp->t_maxseg;
inp = tp->t_inpcb;
@@ -3517,7 +3597,7 @@ tcp_mss(struct tcpcb *tp, int offer)
*/
so = inp->inp_socket;
SOCKBUF_LOCK(&so->so_snd);
- if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
+ if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe)
bufsize = metrics.rmx_sendpipe;
else
bufsize = so->so_snd.sb_hiwat;
@@ -3534,7 +3614,7 @@ tcp_mss(struct tcpcb *tp, int offer)
tp->t_maxseg = mss;
SOCKBUF_LOCK(&so->so_rcv);
- if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
+ if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe)
bufsize = metrics.rmx_recvpipe;
else
bufsize = so->so_rcv.sb_hiwat;
@@ -3548,8 +3628,10 @@ tcp_mss(struct tcpcb *tp, int offer)
SOCKBUF_UNLOCK(&so->so_rcv);
/* Check the interface for TSO capabilities. */
- if (mtuflags & CSUM_TSO)
+ if (cap.ifcap & CSUM_TSO) {
tp->t_flags |= TF_TSO;
+ tp->t_tsomax = cap.tsomax;
+ }
}
/*
@@ -3569,16 +3651,23 @@ tcp_mssopt(struct in_conninfo *inc)
if (inc->inc_flags & INC_ISIPV6) {
mss = V_tcp_v6mssdflt;
maxmtu = tcp_maxmtu6(inc, NULL);
- thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
- } else
+ }
+#endif
+#if defined(INET) && defined(INET6)
+ else
#endif
+#ifdef INET
{
mss = V_tcp_mssdflt;
maxmtu = tcp_maxmtu(inc, NULL);
- thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
min_protoh = sizeof(struct tcpiphdr);
}
+#endif
+#if defined(INET6) || defined(INET)
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+#endif
+
if (maxmtu && thcmtu)
mss = min(maxmtu, thcmtu) - min_protoh;
else if (maxmtu || thcmtu)
diff --git a/freebsd/sys/netinet/tcp_lro.c b/freebsd/sys/netinet/tcp_lro.c
index 9f1d13c3..52d92aa0 100644
--- a/freebsd/sys/netinet/tcp_lro.c
+++ b/freebsd/sys/netinet/tcp_lro.c
@@ -3,8 +3,12 @@
/*-
* Copyright (c) 2007, Myricom Inc.
* Copyright (c) 2008, Intel Corporation.
+ * Copyright (c) 2012 The FreeBSD Foundation
* All rights reserved.
*
+ * Portions of this software were developed by Bjoern Zeeb
+ * under sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -25,359 +29,589 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
- *
- * $FreeBSD$
*/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <rtems/bsd/local/opt_inet.h>
+#include <rtems/bsd/local/opt_inet6.h>
+
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
-#include <sys/endian.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/ethernet.h>
-#include <net/if_media.h>
+#include <net/vnet.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
+#include <netinet/ip6.h>
#include <netinet/ip.h>
+#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
-#include <machine/bus.h>
+#include <netinet6/ip6_var.h>
+
#include <machine/in_cksum.h>
+#ifndef LRO_ENTRIES
+#define LRO_ENTRIES 8 /* # of LRO entries per RX queue. */
+#endif
-static uint16_t do_csum_data(uint16_t *raw, int len)
-{
- uint32_t csum;
- csum = 0;
- while (len > 0) {
- csum += *raw;
- raw++;
- csum += *raw;
- raw++;
- len -= 4;
- }
- csum = (csum >> 16) + (csum & 0xffff);
- csum = (csum >> 16) + (csum & 0xffff);
- return (uint16_t)csum;
-}
+#define TCP_LRO_UPDATE_CSUM 1
+#ifndef TCP_LRO_UPDATE_CSUM
+#define TCP_LRO_INVALID_CSUM 0x0000
+#endif
-/*
- * Allocate and init the LRO data structures
- */
int
-tcp_lro_init(struct lro_ctrl *cntl)
+tcp_lro_init(struct lro_ctrl *lc)
{
- struct lro_entry *lro;
- int i, error = 0;
+ struct lro_entry *le;
+ int error, i;
- SLIST_INIT(&cntl->lro_free);
- SLIST_INIT(&cntl->lro_active);
-
- cntl->lro_bad_csum = 0;
- cntl->lro_queued = 0;
- cntl->lro_flushed = 0;
+ lc->lro_bad_csum = 0;
+ lc->lro_queued = 0;
+ lc->lro_flushed = 0;
+ lc->lro_cnt = 0;
+ SLIST_INIT(&lc->lro_free);
+ SLIST_INIT(&lc->lro_active);
+ error = 0;
for (i = 0; i < LRO_ENTRIES; i++) {
- lro = (struct lro_entry *) malloc(sizeof (struct lro_entry),
- M_DEVBUF, M_NOWAIT | M_ZERO);
- if (lro == NULL) {
+ le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF,
+ M_NOWAIT | M_ZERO);
+ if (le == NULL) {
if (i == 0)
error = ENOMEM;
break;
}
- cntl->lro_cnt = i;
- SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
+ lc->lro_cnt = i + 1;
+ SLIST_INSERT_HEAD(&lc->lro_free, le, next);
}
return (error);
}
void
-tcp_lro_free(struct lro_ctrl *cntl)
+tcp_lro_free(struct lro_ctrl *lc)
{
- struct lro_entry *entry;
+ struct lro_entry *le;
- while (!SLIST_EMPTY(&cntl->lro_free)) {
- entry = SLIST_FIRST(&cntl->lro_free);
- SLIST_REMOVE_HEAD(&cntl->lro_free, next);
- free(entry, M_DEVBUF);
+ while (!SLIST_EMPTY(&lc->lro_free)) {
+ le = SLIST_FIRST(&lc->lro_free);
+ SLIST_REMOVE_HEAD(&lc->lro_free, next);
+ free(le, M_DEVBUF);
}
}
+#ifdef TCP_LRO_UPDATE_CSUM
+static uint16_t
+tcp_lro_csum_th(struct tcphdr *th)
+{
+ uint32_t ch;
+ uint16_t *p, l;
+
+ ch = th->th_sum = 0x0000;
+ l = th->th_off;
+ p = (uint16_t *)th;
+ while (l > 0) {
+ ch += *p;
+ p++;
+ ch += *p;
+ p++;
+ l--;
+ }
+ while (ch > 0xffff)
+ ch = (ch >> 16) + (ch & 0xffff);
+
+ return (ch & 0xffff);
+}
+
+static uint16_t
+tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
+ uint16_t tcp_data_len, uint16_t csum)
+{
+ uint32_t c;
+ uint16_t cs;
+
+ c = csum;
+
+ /* Remove length from checksum. */
+ switch (le->eh_type) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ {
+ struct ip6_hdr *ip6;
+
+ ip6 = (struct ip6_hdr *)l3hdr;
+ if (le->append_cnt == 0)
+ cs = ip6->ip6_plen;
+ else {
+ uint32_t cx;
+
+ cx = ntohs(ip6->ip6_plen);
+ cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
+ }
+ break;
+ }
+#endif
+#ifdef INET
+ case ETHERTYPE_IP:
+ {
+ struct ip *ip4;
+
+ ip4 = (struct ip *)l3hdr;
+ if (le->append_cnt == 0)
+ cs = ip4->ip_len;
+ else {
+ cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
+ IPPROTO_TCP);
+ cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
+ htons(cs));
+ }
+ break;
+ }
+#endif
+ default:
+ cs = 0; /* Keep compiler happy. */
+ }
+
+ cs = ~cs;
+ c += cs;
+
+ /* Remove TCP header csum. */
+ cs = ~tcp_lro_csum_th(th);
+ c += cs;
+ while (c > 0xffff)
+ c = (c >> 16) + (c & 0xffff);
+
+ return (c & 0xffff);
+}
+#endif
+
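Editorial note: the two helpers above do all of their arithmetic in 16-bit one's complement (the Internet checksum of RFC 1071/1624): partial sums are accumulated in a 32-bit variable, folded back into 16 bits, and a field's contribution is removed by adding its one's complement. A minimal userland sketch of that folding trick, illustrative only and not part of this patch:

	#include <stdint.h>
	#include <stdio.h>

	/* Fold a 32-bit accumulator into a 16-bit one's complement sum. */
	static uint16_t
	csum_fold(uint32_t sum)
	{

		while (sum > 0xffff)
			sum = (sum >> 16) + (sum & 0xffff);
		return ((uint16_t)sum);
	}

	int
	main(void)
	{
		/* Three arbitrary 16-bit words standing in for header fields. */
		uint32_t sum = 0x1a2b + 0x3c4d + 0x0e0f;
		uint16_t folded = csum_fold(sum);

		/* "Subtract" the middle word by adding its complement, refold. */
		uint16_t without = csum_fold((uint32_t)folded + (uint16_t)~0x3c4d);

		printf("all=0x%04x without 0x3c4d=0x%04x expect=0x%04x\n",
		    folded, without, csum_fold(0x1a2b + 0x0e0f));
		return (0);
	}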
void
-tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro)
+tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
{
- struct ifnet *ifp;
- struct ip *ip;
- struct tcphdr *tcp;
- uint32_t *ts_ptr;
- uint32_t tcplen, tcp_csum;
-
-
- if (lro->append_cnt) {
- /* incorporate the new len into the ip header and
- * re-calculate the checksum */
- ip = lro->ip;
- ip->ip_len = htons(lro->len - ETHER_HDR_LEN);
- ip->ip_sum = 0;
- ip->ip_sum = 0xffff ^
- do_csum_data((uint16_t*)ip,
- sizeof (*ip));
-
- lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
- CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
- lro->m_head->m_pkthdr.csum_data = 0xffff;
- lro->m_head->m_pkthdr.len = lro->len;
-
- /* incorporate the latest ack into the tcp header */
- tcp = (struct tcphdr *) (ip + 1);
- tcp->th_ack = lro->ack_seq;
- tcp->th_win = lro->window;
- /* incorporate latest timestamp into the tcp header */
- if (lro->timestamp) {
- ts_ptr = (uint32_t *)(tcp + 1);
- ts_ptr[1] = htonl(lro->tsval);
- ts_ptr[2] = lro->tsecr;
+
+ if (le->append_cnt > 0) {
+ struct tcphdr *th;
+ uint16_t p_len;
+
+ p_len = htons(le->p_len);
+ switch (le->eh_type) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ {
+ struct ip6_hdr *ip6;
+
+ ip6 = le->le_ip6;
+ ip6->ip6_plen = p_len;
+ th = (struct tcphdr *)(ip6 + 1);
+ le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
+ CSUM_PSEUDO_HDR;
+ le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
+ break;
}
- /*
- * update checksum in tcp header by re-calculating the
- * tcp pseudoheader checksum, and adding it to the checksum
- * of the tcp payload data
- */
- tcp->th_sum = 0;
- tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN;
- tcp_csum = lro->data_csum;
- tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
- htons(tcplen + IPPROTO_TCP));
- tcp_csum += do_csum_data((uint16_t*)tcp,
- tcp->th_off << 2);
- tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
- tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16);
- tcp->th_sum = 0xffff ^ tcp_csum;
+#endif
+#ifdef INET
+ case ETHERTYPE_IP:
+ {
+ struct ip *ip4;
+#ifdef TCP_LRO_UPDATE_CSUM
+ uint32_t cl;
+ uint16_t c;
+#endif
+
+ ip4 = le->le_ip4;
+#ifdef TCP_LRO_UPDATE_CSUM
+ /* Fix IP header checksum for new length. */
+ c = ~ip4->ip_sum;
+ cl = c;
+ c = ~ip4->ip_len;
+ cl += c + p_len;
+ while (cl > 0xffff)
+ cl = (cl >> 16) + (cl & 0xffff);
+ c = cl;
+ ip4->ip_sum = ~c;
+#else
+ ip4->ip_sum = TCP_LRO_INVALID_CSUM;
+#endif
+ ip4->ip_len = p_len;
+ th = (struct tcphdr *)(ip4 + 1);
+ le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
+ CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
+ le->p_len += ETHER_HDR_LEN;
+ break;
+ }
+#endif
+ default:
+ th = NULL; /* Keep compiler happy. */
+ }
+ le->m_head->m_pkthdr.csum_data = 0xffff;
+ le->m_head->m_pkthdr.len = le->p_len;
+
+ /* Incorporate the latest ACK into the TCP header. */
+ th->th_ack = le->ack_seq;
+ th->th_win = le->window;
+ /* Incorporate latest timestamp into the TCP header. */
+ if (le->timestamp != 0) {
+ uint32_t *ts_ptr;
+
+ ts_ptr = (uint32_t *)(th + 1);
+ ts_ptr[1] = htonl(le->tsval);
+ ts_ptr[2] = le->tsecr;
+ }
+#ifdef TCP_LRO_UPDATE_CSUM
+ /* Update the TCP header checksum. */
+ le->ulp_csum += p_len;
+ le->ulp_csum += tcp_lro_csum_th(th);
+ while (le->ulp_csum > 0xffff)
+ le->ulp_csum = (le->ulp_csum >> 16) +
+ (le->ulp_csum & 0xffff);
+ th->th_sum = (le->ulp_csum & 0xffff);
+ th->th_sum = ~th->th_sum;
+#else
+ th->th_sum = TCP_LRO_INVALID_CSUM;
+#endif
}
- ifp = cntl->ifp;
- (*ifp->if_input)(cntl->ifp, lro->m_head);
- cntl->lro_queued += lro->append_cnt + 1;
- cntl->lro_flushed++;
- lro->m_head = NULL;
- lro->timestamp = 0;
- lro->append_cnt = 0;
- SLIST_INSERT_HEAD(&cntl->lro_free, lro, next);
+
+ (*lc->ifp->if_input)(lc->ifp, le->m_head);
+ lc->lro_queued += le->append_cnt + 1;
+ lc->lro_flushed++;
+ bzero(le, sizeof(*le));
+ SLIST_INSERT_HEAD(&lc->lro_free, le, next);
}
-int
-tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum)
+#ifdef INET6
+static int
+tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
+ struct tcphdr **th)
{
- struct ether_header *eh;
- struct ip *ip;
- struct tcphdr *tcp;
- uint32_t *ts_ptr;
- struct mbuf *m_nxt, *m_tail;
- struct lro_entry *lro;
- int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len;
- int opt_bytes, trim, csum_flags;
- uint32_t seq, tmp_csum, device_mtu;
-
-
- eh = mtod(m_head, struct ether_header *);
- if (eh->ether_type != htons(ETHERTYPE_IP))
- return 1;
- ip = (struct ip *) (eh + 1);
- if (ip->ip_p != IPPROTO_TCP)
- return 1;
-
- /* ensure there are no options */
- if ((ip->ip_hl << 2) != sizeof (*ip))
- return -1;
-
- /* .. and the packet is not fragmented */
- if (ip->ip_off & htons(IP_MF|IP_OFFMASK))
- return -1;
-
- /* verify that the IP header checksum is correct */
- csum_flags = m_head->m_pkthdr.csum_flags;
+
+ /* XXX-BZ we should check the flow-label. */
+
+ /* XXX-BZ We do not yet support ext. hdrs. */
+ if (ip6->ip6_nxt != IPPROTO_TCP)
+ return (TCP_LRO_NOT_SUPPORTED);
+
+ /* Find the TCP header. */
+ *th = (struct tcphdr *)(ip6 + 1);
+
+ return (0);
+}
+#endif
+
+#ifdef INET
+static int
+tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
+ struct tcphdr **th)
+{
+ int csum_flags;
+ uint16_t csum;
+
+ if (ip4->ip_p != IPPROTO_TCP)
+ return (TCP_LRO_NOT_SUPPORTED);
+
+ /* Ensure there are no options. */
+ if ((ip4->ip_hl << 2) != sizeof (*ip4))
+ return (TCP_LRO_CANNOT);
+
+ /* .. and the packet is not fragmented. */
+ if (ip4->ip_off & htons(IP_MF|IP_OFFMASK))
+ return (TCP_LRO_CANNOT);
+
+ /* Legacy IP has a header checksum that needs to be correct. */
+ csum_flags = m->m_pkthdr.csum_flags;
if (csum_flags & CSUM_IP_CHECKED) {
if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
- cntl->lro_bad_csum++;
- return -1;
+ lc->lro_bad_csum++;
+ return (TCP_LRO_CANNOT);
}
} else {
- tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip));
- if (__predict_false((tmp_csum ^ 0xffff) != 0)) {
- cntl->lro_bad_csum++;
- return -1;
+ csum = in_cksum_hdr(ip4);
+ if (__predict_false((csum) != 0)) {
+ lc->lro_bad_csum++;
+ return (TCP_LRO_CANNOT);
}
}
-
- /* find the TCP header */
- tcp = (struct tcphdr *) (ip + 1);
-
- /* Get the TCP checksum if we dont have it */
- if (!csum)
- csum = tcp->th_sum;
-
- /* ensure no bits set besides ack or psh */
- if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
- return -1;
-
- /* check for timestamps. Since the only option we handle are
- timestamps, we only have to handle the simple case of
- aligned timestamps */
-
- opt_bytes = (tcp->th_off << 2) - sizeof (*tcp);
- tcp_hdr_len = sizeof (*tcp) + opt_bytes;
- ts_ptr = (uint32_t *)(tcp + 1);
- if (opt_bytes != 0) {
- if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) ||
- (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
- TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))
- return -1;
- }
- ip_len = ntohs(ip->ip_len);
- tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip);
-
+ /* Find the TCP header (we assured there are no IP options). */
+ *th = (struct tcphdr *)(ip4 + 1);
- /*
- * If frame is padded beyond the end of the IP packet,
- * then we must trim the extra bytes off the end.
- */
- tot_len = m_head->m_pkthdr.len;
- trim = tot_len - (ip_len + ETHER_HDR_LEN);
- if (trim != 0) {
- if (trim < 0) {
- /* truncated packet */
- return -1;
+ return (0);
+}
+#endif
+
+int
+tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
+{
+ struct lro_entry *le;
+ struct ether_header *eh;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
+#endif
+#ifdef INET
+ struct ip *ip4 = NULL; /* Keep compiler happy. */
+#endif
+ struct tcphdr *th;
+ void *l3hdr = NULL; /* Keep compiler happy. */
+ uint32_t *ts_ptr;
+ tcp_seq seq;
+ int error, ip_len, l;
+ uint16_t eh_type, tcp_data_len;
+
+ /* We expect a contiguous header [eh, ip, tcp]. */
+
+ eh = mtod(m, struct ether_header *);
+ eh_type = ntohs(eh->ether_type);
+ switch (eh_type) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ {
+ CURVNET_SET(lc->ifp->if_vnet);
+ if (V_ip6_forwarding != 0) {
+ /* XXX-BZ stats but changing lro_ctrl is a problem. */
+ CURVNET_RESTORE();
+ return (TCP_LRO_CANNOT);
+ }
+ CURVNET_RESTORE();
+ l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
+ error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
+ if (error != 0)
+ return (error);
+ tcp_data_len = ntohs(ip6->ip6_plen);
+ ip_len = sizeof(*ip6) + tcp_data_len;
+ break;
+ }
+#endif
+#ifdef INET
+ case ETHERTYPE_IP:
+ {
+ CURVNET_SET(lc->ifp->if_vnet);
+ if (V_ipforwarding != 0) {
+ /* XXX-BZ stats but changing lro_ctrl is a problem. */
+ CURVNET_RESTORE();
+ return (TCP_LRO_CANNOT);
}
- m_adj(m_head, -trim);
- tot_len = m_head->m_pkthdr.len;
+ CURVNET_RESTORE();
+ l3hdr = ip4 = (struct ip *)(eh + 1);
+ error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
+ if (error != 0)
+ return (error);
+ ip_len = ntohs(ip4->ip_len);
+ tcp_data_len = ip_len - sizeof(*ip4);
+ break;
}
+#endif
+ /* XXX-BZ what happens in case of VLAN(s)? */
+ default:
+ return (TCP_LRO_NOT_SUPPORTED);
+ }
+
+ /*
+ * If the frame is padded beyond the end of the IP packet, then we must
+ * trim the extra bytes off.
+ */
+ l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
+ if (l != 0) {
+ if (l < 0)
+ /* Truncated packet. */
+ return (TCP_LRO_CANNOT);
- m_nxt = m_head;
- m_tail = NULL; /* -Wuninitialized */
- while (m_nxt != NULL) {
- m_tail = m_nxt;
- m_nxt = m_tail->m_next;
+ m_adj(m, -l);
}
- hlen = ip_len + ETHER_HDR_LEN - tcp_data_len;
- seq = ntohl(tcp->th_seq);
-
- SLIST_FOREACH(lro, &cntl->lro_active, next) {
- if (lro->source_port == tcp->th_sport &&
- lro->dest_port == tcp->th_dport &&
- lro->source_ip == ip->ip_src.s_addr &&
- lro->dest_ip == ip->ip_dst.s_addr) {
- /* Try to append it */
-
- if (__predict_false(seq != lro->next_seq)) {
- /* out of order packet */
- SLIST_REMOVE(&cntl->lro_active, lro,
- lro_entry, next);
- tcp_lro_flush(cntl, lro);
- return -1;
- }
-
- if (opt_bytes) {
- uint32_t tsval = ntohl(*(ts_ptr + 1));
- /* make sure timestamp values are increasing */
- if (__predict_false(lro->tsval > tsval ||
- *(ts_ptr + 2) == 0)) {
- return -1;
- }
- lro->tsval = tsval;
- lro->tsecr = *(ts_ptr + 2);
- }
-
- lro->next_seq += tcp_data_len;
- lro->ack_seq = tcp->th_ack;
- lro->window = tcp->th_win;
- lro->append_cnt++;
- if (tcp_data_len == 0) {
- m_freem(m_head);
- return 0;
- }
- /* subtract off the checksum of the tcp header
- * from the hardware checksum, and add it to the
- * stored tcp data checksum. Byteswap the checksum
- * if the total length so far is odd
- */
- tmp_csum = do_csum_data((uint16_t*)tcp,
- tcp_hdr_len);
- csum = csum + (tmp_csum ^ 0xffff);
- csum = (csum & 0xffff) + (csum >> 16);
- csum = (csum & 0xffff) + (csum >> 16);
- if (lro->len & 0x1) {
- /* Odd number of bytes so far, flip bytes */
- csum = ((csum << 8) | (csum >> 8)) & 0xffff;
- }
- csum = csum + lro->data_csum;
- csum = (csum & 0xffff) + (csum >> 16);
- csum = (csum & 0xffff) + (csum >> 16);
- lro->data_csum = csum;
-
- lro->len += tcp_data_len;
-
- /* adjust mbuf so that m->m_data points to
- the first byte of the payload */
- m_adj(m_head, hlen);
- /* append mbuf chain */
- lro->m_tail->m_next = m_head;
- /* advance the last pointer */
- lro->m_tail = m_tail;
- /* flush packet if required */
- device_mtu = cntl->ifp->if_mtu;
- if (lro->len > (65535 - device_mtu)) {
- SLIST_REMOVE(&cntl->lro_active, lro,
- lro_entry, next);
- tcp_lro_flush(cntl, lro);
- }
- return 0;
+ /*
+ * Check TCP header constraints.
+ */
+ /* Ensure no bits set besides ACK or PSH. */
+ if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
+ return (TCP_LRO_CANNOT);
+
+	/* XXX-BZ We lose an ACK|PUSH flag concatenating multiple segments. */
+ /* XXX-BZ Ideally we'd flush on PUSH? */
+
+ /*
+ * Check for timestamps.
+ * Since the only option we handle are timestamps, we only have to
+ * handle the simple case of aligned timestamps.
+ */
+ l = (th->th_off << 2);
+ tcp_data_len -= l;
+ l -= sizeof(*th);
+ ts_ptr = (uint32_t *)(th + 1);
+ if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
+ (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
+ TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))))
+ return (TCP_LRO_CANNOT);
+
+ /* If the driver did not pass in the checksum, set it now. */
+ if (csum == 0x0000)
+ csum = th->th_sum;
+
+ seq = ntohl(th->th_seq);
+
+ /* Try to find a matching previous segment. */
+ SLIST_FOREACH(le, &lc->lro_active, next) {
+ if (le->eh_type != eh_type)
+ continue;
+ if (le->source_port != th->th_sport ||
+ le->dest_port != th->th_dport)
+ continue;
+ switch (eh_type) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ if (bcmp(&le->source_ip6, &ip6->ip6_src,
+ sizeof(struct in6_addr)) != 0 ||
+ bcmp(&le->dest_ip6, &ip6->ip6_dst,
+ sizeof(struct in6_addr)) != 0)
+ continue;
+ break;
+#endif
+#ifdef INET
+ case ETHERTYPE_IP:
+ if (le->source_ip4 != ip4->ip_src.s_addr ||
+ le->dest_ip4 != ip4->ip_dst.s_addr)
+ continue;
+ break;
+#endif
+ }
+
+ /* Flush now if appending will result in overflow. */
+ if (le->p_len > (65535 - tcp_data_len)) {
+ SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+ tcp_lro_flush(lc, le);
+ break;
+ }
+
+ /* Try to append the new segment. */
+ if (__predict_false(seq != le->next_seq ||
+ (tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
+ /* Out of order packet or duplicate ACK. */
+ SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+ tcp_lro_flush(lc, le);
+ return (TCP_LRO_CANNOT);
}
+
+ if (l != 0) {
+ uint32_t tsval = ntohl(*(ts_ptr + 1));
+ /* Make sure timestamp values are increasing. */
+ /* XXX-BZ flip and use TSTMP_GEQ macro for this? */
+ if (__predict_false(le->tsval > tsval ||
+ *(ts_ptr + 2) == 0))
+ return (TCP_LRO_CANNOT);
+ le->tsval = tsval;
+ le->tsecr = *(ts_ptr + 2);
+ }
+
+ le->next_seq += tcp_data_len;
+ le->ack_seq = th->th_ack;
+ le->window = th->th_win;
+ le->append_cnt++;
+
+#ifdef TCP_LRO_UPDATE_CSUM
+ le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th,
+ tcp_data_len, ~csum);
+#endif
+
+ if (tcp_data_len == 0) {
+ m_freem(m);
+ return (0);
+ }
+
+ le->p_len += tcp_data_len;
+
+ /*
+ * Adjust the mbuf so that m_data points to the first byte of
+ * the ULP payload. Adjust the mbuf to avoid complications and
+ * append new segment to existing mbuf chain.
+ */
+ m_adj(m, m->m_pkthdr.len - tcp_data_len);
+ m->m_flags &= ~M_PKTHDR;
+
+ le->m_tail->m_next = m;
+ le->m_tail = m_last(m);
+
+ /*
+ * If a possible next full length packet would cause an
+ * overflow, pro-actively flush now.
+ */
+ if (le->p_len > (65535 - lc->ifp->if_mtu)) {
+ SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+ tcp_lro_flush(lc, le);
+ }
+
+ return (0);
}
- if (SLIST_EMPTY(&cntl->lro_free))
- return -1;
-
- /* start a new chain */
- lro = SLIST_FIRST(&cntl->lro_free);
- SLIST_REMOVE_HEAD(&cntl->lro_free, next);
- SLIST_INSERT_HEAD(&cntl->lro_active, lro, next);
- lro->source_port = tcp->th_sport;
- lro->dest_port = tcp->th_dport;
- lro->source_ip = ip->ip_src.s_addr;
- lro->dest_ip = ip->ip_dst.s_addr;
- lro->next_seq = seq + tcp_data_len;
- lro->mss = tcp_data_len;
- lro->ack_seq = tcp->th_ack;
- lro->window = tcp->th_win;
-
- /* save the checksum of just the TCP payload by
- * subtracting off the checksum of the TCP header from
- * the entire hardware checksum
- * Since IP header checksum is correct, checksum over
- * the IP header is -0. Substracting -0 is unnecessary.
- */
- tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len);
- csum = csum + (tmp_csum ^ 0xffff);
- csum = (csum & 0xffff) + (csum >> 16);
- csum = (csum & 0xffff) + (csum >> 16);
- lro->data_csum = csum;
-
- lro->ip = ip;
- /* record timestamp if it is present */
- if (opt_bytes) {
- lro->timestamp = 1;
- lro->tsval = ntohl(*(ts_ptr + 1));
- lro->tsecr = *(ts_ptr + 2);
+ /* Try to find an empty slot. */
+ if (SLIST_EMPTY(&lc->lro_free))
+ return (TCP_LRO_CANNOT);
+
+ /* Start a new segment chain. */
+ le = SLIST_FIRST(&lc->lro_free);
+ SLIST_REMOVE_HEAD(&lc->lro_free, next);
+ SLIST_INSERT_HEAD(&lc->lro_active, le, next);
+
+ /* Start filling in details. */
+ switch (eh_type) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ le->le_ip6 = ip6;
+ le->source_ip6 = ip6->ip6_src;
+ le->dest_ip6 = ip6->ip6_dst;
+ le->eh_type = eh_type;
+ le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
+ break;
+#endif
+#ifdef INET
+ case ETHERTYPE_IP:
+ le->le_ip4 = ip4;
+ le->source_ip4 = ip4->ip_src.s_addr;
+ le->dest_ip4 = ip4->ip_dst.s_addr;
+ le->eh_type = eh_type;
+ le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
+ break;
+#endif
+ }
+ le->source_port = th->th_sport;
+ le->dest_port = th->th_dport;
+
+ le->next_seq = seq + tcp_data_len;
+ le->ack_seq = th->th_ack;
+ le->window = th->th_win;
+ if (l != 0) {
+ le->timestamp = 1;
+ le->tsval = ntohl(*(ts_ptr + 1));
+ le->tsecr = *(ts_ptr + 2);
}
- lro->len = tot_len;
- lro->m_head = m_head;
- lro->m_tail = m_tail;
- return 0;
+
+#ifdef TCP_LRO_UPDATE_CSUM
+ /*
+ * Do not touch the csum of the first packet. However save the
+ * "adjusted" checksum of just the source and destination addresses,
+ * the next header and the TCP payload. The length and TCP header
+ * parts may change, so we remove those from the saved checksum and
+ * re-add with final values on tcp_lro_flush() if needed.
+ */
+ KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
+ __func__, le, le->ulp_csum));
+
+ le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
+ ~csum);
+ th->th_sum = csum; /* Restore checksum on first packet. */
+#endif
+
+ le->m_head = m;
+ le->m_tail = m_last(m);
+
+ return (0);
}
+
+/* end */
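Editorial note: a receive path consumes this interface roughly as sketched below; the function names are hypothetical, the usual kernel headers (sys/param.h, sys/mbuf.h, net/if.h, net/if_var.h, netinet/tcp_lro.h) are assumed, and in-tree NIC drivers remain the authoritative reference.

	/* Hypothetical driver-side sketch, not part of this patch. */
	static void
	example_lro_input(struct lro_ctrl *lc, struct mbuf *m)
	{

		/* Try to aggregate; on any failure hand the frame to the stack. */
		if (tcp_lro_rx(lc, m, 0) != 0)
			(*lc->ifp->if_input)(lc->ifp, m);
	}

	static void
	example_lro_flush_all(struct lro_ctrl *lc)
	{
		struct lro_entry *le;

		/* At the end of an RX burst, push out everything still queued. */
		while ((le = SLIST_FIRST(&lc->lro_active)) != NULL) {
			SLIST_REMOVE_HEAD(&lc->lro_active, next);
			tcp_lro_flush(lc, le);
		}
	}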
diff --git a/freebsd/sys/netinet/tcp_lro.h b/freebsd/sys/netinet/tcp_lro.h
index 7e498871..b3a50179 100644
--- a/freebsd/sys/netinet/tcp_lro.h
+++ b/freebsd/sys/netinet/tcp_lro.h
@@ -30,31 +30,46 @@
#ifndef _TCP_LRO_H_
#define _TCP_LRO_H_
-struct lro_entry;
struct lro_entry
{
- SLIST_ENTRY(lro_entry) next;
- struct mbuf *m_head;
- struct mbuf *m_tail;
- int timestamp;
- struct ip *ip;
- uint32_t tsval;
- uint32_t tsecr;
- uint32_t source_ip;
- uint32_t dest_ip;
- uint32_t next_seq;
- uint32_t ack_seq;
- uint32_t len;
- uint32_t data_csum;
- uint16_t window;
- uint16_t source_port;
- uint16_t dest_port;
- uint16_t append_cnt;
- uint16_t mss;
-
+ SLIST_ENTRY(lro_entry) next;
+ struct mbuf *m_head;
+ struct mbuf *m_tail;
+ union {
+ struct ip *ip4;
+ struct ip6_hdr *ip6;
+ } leip;
+ union {
+ in_addr_t s_ip4;
+ struct in6_addr s_ip6;
+ } lesource;
+ union {
+ in_addr_t d_ip4;
+ struct in6_addr d_ip6;
+ } ledest;
+ uint16_t source_port;
+ uint16_t dest_port;
+ uint16_t eh_type; /* EthernetHeader type. */
+ uint16_t append_cnt;
+ uint32_t p_len; /* IP header payload length. */
+ uint32_t ulp_csum; /* TCP, etc. checksum. */
+ uint32_t next_seq; /* tcp_seq */
+ uint32_t ack_seq; /* tcp_seq */
+ uint32_t tsval;
+ uint32_t tsecr;
+ uint16_t window;
+ uint16_t timestamp; /* flag, not a TCP hdr field. */
};
SLIST_HEAD(lro_head, lro_entry);
+#define le_ip4 leip.ip4
+#define le_ip6 leip.ip6
+#define source_ip4 lesource.s_ip4
+#define dest_ip4 ledest.d_ip4
+#define source_ip6 lesource.s_ip6
+#define dest_ip6 ledest.d_ip6
+
+/* NB: This is part of driver structs. */
struct lro_ctrl {
struct ifnet *ifp;
int lro_queued;
@@ -66,13 +81,12 @@ struct lro_ctrl {
struct lro_head lro_free;
};
-
int tcp_lro_init(struct lro_ctrl *);
void tcp_lro_free(struct lro_ctrl *);
void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *);
int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t);
-/* Number of LRO entries - these are per rx queue */
-#define LRO_ENTRIES 8
+#define TCP_LRO_CANNOT -1
+#define TCP_LRO_NOT_SUPPORTED 1
#endif /* _TCP_LRO_H_ */
diff --git a/freebsd/sys/netinet/tcp_offload.c b/freebsd/sys/netinet/tcp_offload.c
index 93b7d8de..cd41edab 100644
--- a/freebsd/sys/netinet/tcp_offload.c
+++ b/freebsd/sys/netinet/tcp_offload.c
@@ -1,147 +1,178 @@
#include <machine/rtems-bsd-kernel-space.h>
/*-
- * Copyright (c) 2007, Chelsio Inc.
+ * Copyright (c) 2012 Chelsio Communications, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
*
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * 2. Neither the name of the Chelsio Corporation nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include <rtems/bsd/local/opt_inet.h>
+
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
#include <rtems/bsd/sys/types.h>
-#include <sys/malloc.h>
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
-
+#include <sys/sockopt.h>
#include <net/if.h>
-#include <net/if_types.h>
-#include <net/if_var.h>
#include <net/route.h>
-#include <net/vnet.h>
-
#include <netinet/in.h>
-#include <netinet/in_systm.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_offload.h>
-#include <netinet/toedev.h>
+#define TCPOUTFLAGS
+#include <netinet/tcp_fsm.h>
+#include <netinet/toecore.h>
-uint32_t toedev_registration_count;
+int registered_toedevs;
+/*
+ * Provide an opportunity for a TOE driver to offload.
+ */
int
tcp_offload_connect(struct socket *so, struct sockaddr *nam)
{
struct ifnet *ifp;
- struct toedev *tdev;
+ struct toedev *tod;
struct rtentry *rt;
- int error;
-
- if (toedev_registration_count == 0)
- return (EINVAL);
-
- /*
- * Look up the route used for the connection to
- * determine if it uses an interface capable of
- * offloading the connection.
- */
- rt = rtalloc1(nam, 0 /*report*/, 0 /*ignflags*/);
- if (rt)
+ int error = EOPNOTSUPP;
+
+ INP_WLOCK_ASSERT(sotoinpcb(so));
+ KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6,
+ ("%s: called with sa_family %d", __func__, nam->sa_family));
+
+ if (registered_toedevs == 0)
+ return (error);
+
+ rt = rtalloc1(nam, 0, 0);
+ if (rt)
RT_UNLOCK(rt);
- else
+ else
return (EHOSTUNREACH);
ifp = rt->rt_ifp;
- if ((ifp->if_capenable & IFCAP_TOE) == 0) {
- error = EINVAL;
- goto fail;
- }
-
- tdev = TOEDEV(ifp);
- if (tdev == NULL) {
- error = EPERM;
- goto fail;
- }
-
- if (tdev->tod_can_offload(tdev, so) == 0) {
- error = EPERM;
- goto fail;
- }
-
- return (tdev->tod_connect(tdev, so, rt, nam));
-fail:
+
+ if (nam->sa_family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4))
+ goto done;
+ if (nam->sa_family == AF_INET6 && !(ifp->if_capenable & IFCAP_TOE6))
+ goto done;
+
+ tod = TOEDEV(ifp);
+ if (tod != NULL)
+ error = tod->tod_connect(tod, so, rt, nam);
+done:
RTFREE(rt);
return (error);
}
+void
+tcp_offload_listen_start(struct tcpcb *tp)
+{
-/*
- * This file contains code as a short-term staging area before it is moved in
- * to sys/netinet/tcp_offload.c
- */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
+}
void
-tcp_offload_twstart(struct tcpcb *tp)
+tcp_offload_listen_stop(struct tcpcb *tp)
{
- INP_INFO_WLOCK(&V_tcbinfo);
- INP_WLOCK(tp->t_inpcb);
- tcp_twstart(tp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
}
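Editorial note: these two wrappers only publish events; a subscriber uses the generic event-handler mechanism with the handler signature from the old tcp_offload.h typedefs (the EVENTHANDLER_DECLAREs presumably live in toecore.h after this update). A hedged sketch of what a consumer's registration could look like; my_listen_start and my_register are hypothetical names:

	static void
	my_listen_start(void *arg, struct tcpcb *tp)
	{

		/* Program hardware listeners for tp's bound port here. */
	}

	static eventhandler_tag my_listen_start_tag;

	static void
	my_register(void)
	{

		my_listen_start_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_start,
		    my_listen_start, NULL, EVENTHANDLER_PRI_ANY);
	}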
-struct tcpcb *
-tcp_offload_close(struct tcpcb *tp)
+void
+tcp_offload_input(struct tcpcb *tp, struct mbuf *m)
{
+ struct toedev *tod = tp->tod;
- INP_INFO_WLOCK(&V_tcbinfo);
- INP_WLOCK(tp->t_inpcb);
- tp = tcp_close(tp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
- if (tp)
- INP_WUNLOCK(tp->t_inpcb);
+ KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
+ INP_WLOCK_ASSERT(tp->t_inpcb);
- return (tp);
+ tod->tod_input(tod, tp, m);
}
-struct tcpcb *
-tcp_offload_drop(struct tcpcb *tp, int error)
+int
+tcp_offload_output(struct tcpcb *tp)
{
+ struct toedev *tod = tp->tod;
+ int error, flags;
- INP_INFO_WLOCK(&V_tcbinfo);
- INP_WLOCK(tp->t_inpcb);
- tp = tcp_drop(tp, error);
- INP_INFO_WUNLOCK(&V_tcbinfo);
- if (tp)
- INP_WUNLOCK(tp->t_inpcb);
+ KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
+ INP_WLOCK_ASSERT(tp->t_inpcb);
- return (tp);
+ flags = tcp_outflags[tp->t_state];
+
+ if (flags & TH_RST) {
+ /* XXX: avoid repeated calls like we do for FIN */
+ error = tod->tod_send_rst(tod, tp);
+ } else if ((flags & TH_FIN || tp->t_flags & TF_NEEDFIN) &&
+ (tp->t_flags & TF_SENTFIN) == 0) {
+ error = tod->tod_send_fin(tod, tp);
+ if (error == 0)
+ tp->t_flags |= TF_SENTFIN;
+ } else
+ error = tod->tod_output(tod, tp);
+
+ return (error);
+}
+
+void
+tcp_offload_rcvd(struct tcpcb *tp)
+{
+ struct toedev *tod = tp->tod;
+
+ KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ tod->tod_rcvd(tod, tp);
+}
+
+void
+tcp_offload_ctloutput(struct tcpcb *tp, int sopt_dir, int sopt_name)
+{
+ struct toedev *tod = tp->tod;
+
+ KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ tod->tod_ctloutput(tod, tp, sopt_dir, sopt_name);
}
+void
+tcp_offload_detach(struct tcpcb *tp)
+{
+ struct toedev *tod = tp->tod;
+
+ KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ tod->tod_pcb_detach(tod, tp);
+}
diff --git a/freebsd/sys/netinet/tcp_offload.h b/freebsd/sys/netinet/tcp_offload.h
index 313185f6..a0523665 100644
--- a/freebsd/sys/netinet/tcp_offload.h
+++ b/freebsd/sys/netinet/tcp_offload.h
@@ -1,30 +1,30 @@
/*-
- * Copyright (c) 2007, Chelsio Inc.
+ * Copyright (c) 2012 Chelsio Communications, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * 2. Neither the name of the Chelsio Corporation nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
*
* $FreeBSD$
+ *
*/
#ifndef _NETINET_TCP_OFFLOAD_H_
@@ -34,321 +34,15 @@
#error "no user-serviceable parts inside"
#endif
-/*
- * A driver publishes that it provides offload services
- * by setting IFCAP_TOE in the ifnet. The offload connect
- * will bypass any further work if the interface that a
- * connection would use does not support TCP offload.
- *
- * The TOE API assumes that the tcp offload engine can offload the
- * the entire connection from set up to teardown, with some provision
- * being made to allowing the software stack to handle time wait. If
- * the device does not meet these criteria, it is the driver's responsibility
- * to overload the functions that it needs to in tcp_usrreqs and make
- * its own calls to tcp_output if it needs to do so.
- *
- * There is currently no provision for the device advertising the congestion
- * control algorithms it supports as there is currently no API for querying
- * an operating system for the protocols that it has loaded. This is a desirable
- * future extension.
- *
- *
- *
- * It is assumed that individuals deploying TOE will want connections
- * to be offloaded without software changes so all connections on an
- * interface providing TOE are offloaded unless the SO_NO_OFFLOAD
- * flag is set on the socket.
- *
- *
- * The toe_usrreqs structure constitutes the TOE driver's
- * interface to the TCP stack for functionality that doesn't
- * interact directly with userspace. If one wants to provide
- * (optional) functionality to do zero-copy to/from
- * userspace one still needs to override soreceive/sosend
- * with functions that fault in and pin the user buffers.
- *
- * + tu_send
- * - tells the driver that new data may have been added to the
- * socket's send buffer - the driver should not fail if the
- * buffer is in fact unchanged
- * - the driver is responsible for providing credits (bytes in the send window)
- * back to the socket by calling sbdrop() as segments are acknowledged.
- * - The driver expects the inpcb lock to be held - the driver is expected
- * not to drop the lock. Hence the driver is not allowed to acquire the
- * pcbinfo lock during this call.
- *
- * + tu_rcvd
- * - returns credits to the driver and triggers window updates
- * to the peer (a credit as used here is a byte in the peer's receive window)
- * - the driver is expected to determine how many bytes have been
- * consumed and credit that back to the card so that it can grow
- * the window again by maintaining its own state between invocations.
- * - In principle this could be used to shrink the window as well as
- * grow the window, although it is not used for that now.
- * - this function needs to correctly handle being called any number of
- * times without any bytes being consumed from the receive buffer.
- * - The driver expects the inpcb lock to be held - the driver is expected
- * not to drop the lock. Hence the driver is not allowed to acquire the
- * pcbinfo lock during this call.
- *
- * + tu_disconnect
- * - tells the driver to send FIN to peer
- * - driver is expected to send the remaining data and then do a clean half close
- * - disconnect implies at least half-close so only send, reset, and detach
- * are legal
- * - the driver is expected to handle transition through the shutdown
- * state machine and allow the stack to support SO_LINGER.
- * - The driver expects the inpcb lock to be held - the driver is expected
- * not to drop the lock. Hence the driver is not allowed to acquire the
- * pcbinfo lock during this call.
- *
- * + tu_reset
- * - closes the connection and sends a RST to peer
- * - driver is expectd to trigger an RST and detach the toepcb
- * - no further calls are legal after reset
- * - The driver expects the inpcb lock to be held - the driver is expected
- * not to drop the lock. Hence the driver is not allowed to acquire the
- * pcbinfo lock during this call.
- *
- * The following fields in the tcpcb are expected to be referenced by the driver:
- * + iss
- * + rcv_nxt
- * + rcv_wnd
- * + snd_isn
- * + snd_max
- * + snd_nxt
- * + snd_una
- * + t_flags
- * + t_inpcb
- * + t_maxseg
- * + t_toe
- *
- * The following fields in the inpcb are expected to be referenced by the driver:
- * + inp_lport
- * + inp_fport
- * + inp_laddr
- * + inp_fport
- * + inp_socket
- * + inp_ip_tos
- *
- * The following fields in the socket are expected to be referenced by the
- * driver:
- * + so_comp
- * + so_error
- * + so_linger
- * + so_options
- * + so_rcv
- * + so_snd
- * + so_state
- * + so_timeo
- *
- * These functions all return 0 on success and can return the following errors
- * as appropriate:
- * + EPERM:
- * + ENOBUFS: memory allocation failed
- * + EMSGSIZE: MTU changed during the call
- * + EHOSTDOWN:
- * + EHOSTUNREACH:
- * + ENETDOWN:
- * * ENETUNREACH: the peer is no longer reachable
- *
- * + tu_detach
- * - tells driver that the socket is going away so disconnect
- * the toepcb and free appropriate resources
- * - allows the driver to cleanly handle the case of connection state
- * outliving the socket
- * - no further calls are legal after detach
- * - the driver is expected to provide its own synchronization between
- * detach and receiving new data.
- *
- * + tu_syncache_event
- * - even if it is not actually needed, the driver is expected to
- * call syncache_add for the initial SYN and then syncache_expand
- * for the SYN,ACK
- * - tells driver that a connection either has not been added or has
- * been dropped from the syncache
- * - the driver is expected to maintain state that lives outside the
- * software stack so the syncache needs to be able to notify the
- * toe driver that the software stack is not going to create a connection
- * for a received SYN
- * - The driver is responsible for any synchronization required between
- * the syncache dropping an entry and the driver processing the SYN,ACK.
- *
- */
-struct toe_usrreqs {
- int (*tu_send)(struct tcpcb *tp);
- int (*tu_rcvd)(struct tcpcb *tp);
- int (*tu_disconnect)(struct tcpcb *tp);
- int (*tu_reset)(struct tcpcb *tp);
- void (*tu_detach)(struct tcpcb *tp);
- void (*tu_syncache_event)(int event, void *toep);
-};
-
-/*
- * Proxy for struct tcpopt between TOE drivers and TCP functions.
- */
-struct toeopt {
- u_int64_t to_flags; /* see tcpopt in tcp_var.h */
- u_int16_t to_mss; /* maximum segment size */
- u_int8_t to_wscale; /* window scaling */
+extern int registered_toedevs;
- u_int8_t _pad1; /* explicit pad for 64bit alignment */
- u_int32_t _pad2; /* explicit pad for 64bit alignment */
- u_int64_t _pad3[4]; /* TBD */
-};
+int tcp_offload_connect(struct socket *, struct sockaddr *);
+void tcp_offload_listen_start(struct tcpcb *);
+void tcp_offload_listen_stop(struct tcpcb *);
+void tcp_offload_input(struct tcpcb *, struct mbuf *);
+int tcp_offload_output(struct tcpcb *);
+void tcp_offload_rcvd(struct tcpcb *);
+void tcp_offload_ctloutput(struct tcpcb *, int, int);
+void tcp_offload_detach(struct tcpcb *);
-#define TOE_SC_ENTRY_PRESENT 1 /* 4-tuple already present */
-#define TOE_SC_DROP 2 /* connection was timed out */
-
-/*
- * Because listen is a one-to-many relationship (a socket can be listening
- * on all interfaces on a machine some of which may be using different TCP
- * offload devices), listen uses a publish/subscribe mechanism. The TCP
- * offload driver registers a listen notification function with the stack.
- * When a listen socket is created all TCP offload devices are notified
- * so that they can do the appropriate set up to offload connections on the
- * port to which the socket is bound. When the listen socket is closed,
- * the offload devices are notified so that they will stop listening on that
- * port and free any associated resources as well as sending RSTs on any
- * connections in the SYN_RCVD state.
- *
- */
-
-typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
-typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
-
-EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
-EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
-
-/*
- * Check if the socket can be offloaded by the following steps:
- * - determine the egress interface
- * - check the interface for TOE capability and TOE is enabled
- * - check if the device has resources to offload the connection
- */
-int tcp_offload_connect(struct socket *so, struct sockaddr *nam);
-
-/*
- * The tcp_output_* routines are wrappers around the toe_usrreqs calls
- * which trigger packet transmission. In the non-offloaded case they
- * translate to tcp_output. The tcp_offload_* routines notify TOE
- * of specific events. I the non-offloaded case they are no-ops.
- *
- * Listen is a special case because it is a 1 to many relationship
- * and there can be more than one offload driver in the system.
- */
-
-/*
- * Connection is offloaded
- */
-#define tp_offload(tp) ((tp)->t_flags & TF_TOE)
-
-/*
- * hackish way of allowing this file to also be included by TOE
- * which needs to be kept ignorant of socket implementation details
- */
-#ifdef _SYS_SOCKETVAR_H_
-/*
- * The socket has not been marked as "do not offload"
- */
-#define SO_OFFLOADABLE(so) ((so->so_options & SO_NO_OFFLOAD) == 0)
-
-static __inline int
-tcp_output_connect(struct socket *so, struct sockaddr *nam)
-{
- struct tcpcb *tp = sototcpcb(so);
- int error;
-
- /*
- * If offload has been disabled for this socket or the
- * connection cannot be offloaded just call tcp_output
- * to start the TCP state machine.
- */
-#ifndef TCP_OFFLOAD_DISABLE
- if (!SO_OFFLOADABLE(so) || (error = tcp_offload_connect(so, nam)) != 0)
-#endif
- error = tcp_output(tp);
- return (error);
-}
-
-static __inline int
-tcp_output_send(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
- if (tp_offload(tp))
- return (tp->t_tu->tu_send(tp));
-#endif
- return (tcp_output(tp));
-}
-
-static __inline int
-tcp_output_rcvd(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
- if (tp_offload(tp))
- return (tp->t_tu->tu_rcvd(tp));
#endif
- return (tcp_output(tp));
-}
-
-static __inline int
-tcp_output_disconnect(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
- if (tp_offload(tp))
- return (tp->t_tu->tu_disconnect(tp));
-#endif
- return (tcp_output(tp));
-}
-
-static __inline int
-tcp_output_reset(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
- if (tp_offload(tp))
- return (tp->t_tu->tu_reset(tp));
-#endif
- return (tcp_output(tp));
-}
-
-static __inline void
-tcp_offload_detach(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
- if (tp_offload(tp))
- tp->t_tu->tu_detach(tp);
-#endif
-}
-
-static __inline void
-tcp_offload_listen_open(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
- if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket))
- EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
-#endif
-}
-
-static __inline void
-tcp_offload_listen_close(struct tcpcb *tp)
-{
-
-#ifndef TCP_OFFLOAD_DISABLE
- EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
-#endif
-}
-#undef SO_OFFLOADABLE
-#endif /* _SYS_SOCKETVAR_H_ */
-#undef tp_offload
-
-void tcp_offload_twstart(struct tcpcb *tp);
-struct tcpcb *tcp_offload_close(struct tcpcb *tp);
-struct tcpcb *tcp_offload_drop(struct tcpcb *tp, int error);
-
-#endif /* _NETINET_TCP_OFFLOAD_H_ */
diff --git a/freebsd/sys/netinet/tcp_output.c b/freebsd/sys/netinet/tcp_output.c
index c73fe099..6215c4e2 100644
--- a/freebsd/sys/netinet/tcp_output.c
+++ b/freebsd/sys/netinet/tcp_output.c
@@ -77,6 +77,9 @@ __FBSDID("$FreeBSD$");
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
@@ -86,31 +89,22 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
-#ifdef notyet
-extern struct mbuf *m_copypack();
-#endif
-
VNET_DEFINE(int, path_mtu_discovery) = 1;
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
&VNET_NAME(path_mtu_discovery), 1,
"Enable Path MTU Discovery");
-VNET_DEFINE(int, ss_fltsz) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
- &VNET_NAME(ss_fltsz), 1,
- "Slow start flight size");
-
-VNET_DEFINE(int, ss_fltsz_local) = 4;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize,
- CTLFLAG_RW, &VNET_NAME(ss_fltsz_local), 1,
- "Slow start flight size for local networks");
-
VNET_DEFINE(int, tcp_do_tso) = 1;
#define V_tcp_do_tso VNET(tcp_do_tso)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
&VNET_NAME(tcp_do_tso), 0,
"Enable TCP Segmentation Offload");
+VNET_DEFINE(int, tcp_sendspace) = 1024*32;
+#define V_tcp_sendspace VNET(tcp_sendspace)
+SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
+ &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size");
+
VNET_DEFINE(int, tcp_do_autosndbuf) = 1;
#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
@@ -123,7 +117,7 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
&VNET_NAME(tcp_autosndbuf_inc), 0,
"Incrementor step size of automatic send buffer");
-VNET_DEFINE(int, tcp_autosndbuf_max) = 256*1024;
+VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
&VNET_NAME(tcp_autosndbuf_max), 0,
@@ -175,7 +169,7 @@ tcp_output(struct tcpcb *tp)
{
struct socket *so = tp->t_inpcb->inp_socket;
long len, recwin, sendwin;
- int off, flags, error;
+ int off, flags, error = 0; /* Keep compiler happy */
struct mbuf *m;
struct ip *ip = NULL;
struct ipovly *ipov = NULL;
@@ -188,7 +182,7 @@ tcp_output(struct tcpcb *tp)
int idle, sendalot;
int sack_rxmit, sack_bytes_rxmt;
struct sackhole *p;
- int tso;
+ int tso, mtu;
struct tcpopt to;
#if 0
int maxburst = TCP_MAXBURST;
@@ -202,6 +196,11 @@ tcp_output(struct tcpcb *tp)
INP_WLOCK_ASSERT(tp->t_inpcb);
+#ifdef TCP_OFFLOAD
+ if (tp->t_flags & TF_TOE)
+ return (tcp_offload_output(tp));
+#endif
+
/*
* Determine length of data that should be transmitted,
* and flags that will be used.
@@ -229,9 +228,9 @@ again:
tcp_sack_adjust(tp);
sendalot = 0;
tso = 0;
+ mtu = 0;
off = tp->snd_nxt - tp->snd_una;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
- sendwin = min(sendwin, tp->snd_bwnd);
flags = tcp_outflags[tp->t_state];
/*
@@ -472,9 +471,8 @@ after_sack_rexmit:
}
/*
- * Truncate to the maximum segment length or enable TCP Segmentation
- * Offloading (if supported by hardware) and ensure that FIN is removed
- * if the length no longer contains the last data byte.
+ * Decide if we can use TCP Segmentation Offloading (if supported by
+ * hardware).
*
* TSO may only be used if we are in a pure bulk sending state. The
* presence of TCP-MD5, SACK retransmits, SACK advertizements and
@@ -482,10 +480,6 @@ after_sack_rexmit:
* (except for the sequence number) for all generated packets. This
* makes it impossible to transmit any options which vary per generated
* segment or packet.
- *
- * The length of TSO bursts is limited to TCP_MAXWIN. That limit and
- * removal of FIN (if not already catched here) are handled later after
- * the exact length of the TCP options are known.
*/
#ifdef IPSEC
/*
@@ -494,22 +488,15 @@ after_sack_rexmit:
*/
ipsec_optlen = ipsec_hdrsiz_tcp(tp);
#endif
- if (len > tp->t_maxseg) {
- if ((tp->t_flags & TF_TSO) && V_tcp_do_tso &&
- ((tp->t_flags & TF_SIGNATURE) == 0) &&
- tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
- tp->t_inpcb->inp_options == NULL &&
- tp->t_inpcb->in6p_options == NULL
+ if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
+ ((tp->t_flags & TF_SIGNATURE) == 0) &&
+ tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
#ifdef IPSEC
- && ipsec_optlen == 0
+ ipsec_optlen == 0 &&
#endif
- ) {
- tso = 1;
- } else {
- len = tp->t_maxseg;
- sendalot = 1;
- }
- }
+ tp->t_inpcb->inp_options == NULL &&
+ tp->t_inpcb->in6p_options == NULL)
+ tso = 1;
if (sack_rxmit) {
if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
@@ -560,19 +547,39 @@ after_sack_rexmit:
}
/*
- * Compare available window to amount of window
- * known to peer (as advertised window less
- * next expected input). If the difference is at least two
- * max size segments, or at least 50% of the maximum possible
- * window, then want to send a window update to peer.
- * Skip this if the connection is in T/TCP half-open state.
- * Don't send pure window updates when the peer has closed
- * the connection and won't ever send more data.
+ * Sending of standalone window updates.
+ *
+ * Window updates are important when we close our window due to a
+ * full socket buffer and are opening it again after the application
+ * reads data from it. Once the window has opened again and the
+ * remote end starts to send again the ACK clock takes over and
+ * provides the most current window information.
+ *
+	 * We must avoid the silly window syndrome, where every read
+ * from the receive buffer, no matter how small, causes a window
+ * update to be sent. We also should avoid sending a flurry of
+ * window updates when the socket buffer had queued a lot of data
+ * and the application is doing small reads.
+ *
+ * Prevent a flurry of pointless window updates by only sending
+ * an update when we can increase the advertized window by more
+ * than 1/4th of the socket buffer capacity. When the buffer is
+ * getting full or is very small be more aggressive and send an
+ * update whenever we can increase by two mss sized segments.
+	 * In all other situations the ACKs to new incoming data will
+ * carry further window increases.
+ *
+ * Don't send an independent window update if a delayed
+ * ACK is pending (it will get piggy-backed on it) or the
+ * remote side already has done a half-close and won't send
+ * more data. Skip this if the connection is in T/TCP
+ * half-open state.
*/
if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
+ !(tp->t_flags & TF_DELACK) &&
!TCPS_HAVERCVDFIN(tp->t_state)) {
/*
- * "adv" is the amount we can increase the window,
+ * "adv" is the amount we could increase the window,
* taking into account that we are limited by
* TCP_MAXWIN << tp->rcv_scale.
*/
@@ -592,9 +599,11 @@ after_sack_rexmit:
*/
if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
goto dontupdate;
- if (adv >= (long) (2 * tp->t_maxseg))
- goto send;
- if (2 * adv >= (long) so->so_rcv.sb_hiwat)
+
+ if (adv >= (long)(2 * tp->t_maxseg) &&
+ (adv >= (long)(so->so_rcv.sb_hiwat / 4) ||
+ recwin <= (long)(so->so_rcv.sb_hiwat / 8) ||
+ so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg))
goto send;
}
dontupdate:
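Editorial note: to make the thresholds in the window-update test above concrete, a worked example with hypothetical values, written as a comment so it reads alongside the code:

	/*
	 * Hypothetical numbers: t_maxseg = 1460, so_rcv.sb_hiwat = 65536.
	 * An update requires adv >= 2 * 1460 = 2920 bytes plus one of:
	 *   adv >= 65536 / 4 = 16384 (window can grow by a quarter buffer),
	 *   recwin <= 65536 / 8 = 8192 (the buffer is nearly full), or
	 *   sb_hiwat <= 8 * 1460 = 11680 (the buffer is small to begin with).
	 * Draining 4 KB from a mostly empty 64 KB buffer therefore sends no
	 * standalone update; draining 16 KB, or reading from a nearly full
	 * buffer, triggers one.
	 */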
@@ -680,7 +689,7 @@ send:
hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
else
#endif
- hdrlen = sizeof (struct tcpiphdr);
+ hdrlen = sizeof (struct tcpiphdr);
/*
* Compute options for segment.
@@ -753,28 +762,54 @@ send:
* bump the packet length beyond the t_maxopd length.
* Clear the FIN bit because we cut off the tail of
* the segment.
- *
- * When doing TSO limit a burst to TCP_MAXWIN minus the
- * IP, TCP and Options length to keep ip->ip_len from
- * overflowing. Prevent the last segment from being
- * fractional thus making them all equal sized and set
- * the flag to continue sending. TSO is disabled when
- * IP options or IPSEC are present.
*/
if (len + optlen + ipoptlen > tp->t_maxopd) {
flags &= ~TH_FIN;
+
if (tso) {
- if (len > TCP_MAXWIN - hdrlen - optlen) {
- len = TCP_MAXWIN - hdrlen - optlen;
- len = len - (len % (tp->t_maxopd - optlen));
+ KASSERT(ipoptlen == 0,
+ ("%s: TSO can't do IP options", __func__));
+
+ /*
+ * Limit a burst to t_tsomax minus IP,
+ * TCP and options length to keep ip->ip_len
+ * from overflowing or exceeding the maximum
+ * length allowed by the network interface.
+ */
+ if (len > tp->t_tsomax - hdrlen) {
+ len = tp->t_tsomax - hdrlen;
+ sendalot = 1;
+ }
+
+ /*
+ * Prevent the last segment from being
+ * fractional unless the send sockbuf can
+ * be emptied.
+ */
+ if (sendalot && off + len < so->so_snd.sb_cc) {
+ len -= len % (tp->t_maxopd - optlen);
sendalot = 1;
- } else if (tp->t_flags & TF_NEEDFIN)
+ }
+
+ /*
+ * Send the FIN in a separate segment
+ * after the bulk sending is done.
+ * We don't trust the TSO implementations
+ * to clear the FIN flag on all but the
+ * last segment.
+ */
+ if (tp->t_flags & TF_NEEDFIN)
sendalot = 1;
+
} else {
len = tp->t_maxopd - optlen - ipoptlen;
sendalot = 1;
}
- }
+ } else
+ tso = 0;
+
+ KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
+ ("%s: len > IP_MAXPACKET", __func__));
/*#ifdef DIAGNOSTIC*/
#ifdef INET6
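Editorial note: a worked example of the TSO clamping in the hunk above, with hypothetical values:

	/*
	 * Hypothetical numbers: t_tsomax = 65535, hdrlen = 52 (IPv4 + TCP with
	 * 12 bytes of timestamp options already folded in), t_maxopd = 1460,
	 * optlen = 12, 200000 bytes queued at off = 0, so len = 200000.
	 *   len > 65535 - 52, so len = 65483 and sendalot = 1.
	 *   More data remains queued, so len is rounded down to a multiple of
	 *   t_maxopd - optlen = 1448: 65483 - (65483 % 1448) = 65160, i.e. 45
	 *   full-sized segments; the remainder goes out on the next iteration.
	 */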
@@ -810,19 +845,6 @@ send:
TCPSTAT_INC(tcps_sndpack);
TCPSTAT_ADD(tcps_sndbyte, len);
}
-#ifdef notyet
- if ((m = m_copypack(so->so_snd.sb_mb, off,
- (int)len, max_linkhdr + hdrlen)) == 0) {
- SOCKBUF_UNLOCK(&so->so_snd);
- error = ENOBUFS;
- goto out;
- }
- /*
- * m_copypack left space for our hdr; use it.
- */
- m->m_len += hdrlen;
- m->m_data -= hdrlen;
-#else
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == NULL) {
SOCKBUF_UNLOCK(&so->so_snd);
@@ -862,7 +884,7 @@ send:
goto out;
}
}
-#endif
+
/*
* If we're sending everything we've got, set PUSH.
* (This will keep happy those implementations which only
@@ -1059,19 +1081,24 @@ send:
* checksum extended header and data.
*/
m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
#ifdef INET6
- if (isipv6)
+ if (isipv6) {
/*
* ip6_plen is not need to be filled now, and will be filled
* in ip6_output.
*/
- th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
- sizeof(struct tcphdr) + optlen + len);
+ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
+ th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
+ optlen + len, IPPROTO_TCP, 0);
+ }
+#endif
+#if defined(INET6) && defined(INET)
else
-#endif /* INET6 */
+#endif
+#ifdef INET
{
m->m_pkthdr.csum_flags = CSUM_TCP;
- m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
@@ -1079,6 +1106,7 @@ send:
KASSERT(ip->ip_v == IPVERSION,
("%s: IP version incorrect: %d", __func__, ip->ip_v));
}
+#endif
/*
* Enable TSO and specify the size of the segments.
@@ -1092,6 +1120,16 @@ send:
m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
}
+#ifdef IPSEC
+ KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
+ ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u",
+ __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
+#else
+ KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
+ ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u",
+ __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
+#endif
+
/*
* In transmit state, time the transmission and arrange for
* the retransmit. In persist state, just set snd_max.
@@ -1183,7 +1221,7 @@ timer:
#endif
ipov->ih_len = save;
}
-#endif
+#endif /* TCPDEBUG */
/*
* Fill in IP length and desired time to live and
@@ -1197,6 +1235,9 @@ timer:
*/
#ifdef INET6
if (isipv6) {
+ struct route_in6 ro;
+
+ bzero(&ro, sizeof(ro));
/*
* we separately set hoplimit for every segment, since the
* user might want to change the value via setsockopt.
@@ -1206,13 +1247,23 @@ timer:
ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
/* TODO: IPv6 IP6TOS_ECT bit on */
- error = ip6_output(m,
- tp->t_inpcb->in6p_outputopts, NULL,
- ((so->so_options & SO_DONTROUTE) ?
- IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb);
- } else
+ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro,
+ ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
+ NULL, NULL, tp->t_inpcb);
+
+ if (error == EMSGSIZE && ro.ro_rt != NULL)
+ mtu = ro.ro_rt->rt_rmx.rmx_mtu;
+ RO_RTFREE(&ro);
+ }
#endif /* INET6 */
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
{
+ struct route ro;
+
+ bzero(&ro, sizeof(ro));
ip->ip_len = m->m_pkthdr.len;
#ifdef INET6
if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO)
@@ -1229,10 +1280,15 @@ timer:
if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss)
ip->ip_off |= IP_DF;
- error = ip_output(m, tp->t_inpcb->inp_options, NULL,
+ error = ip_output(m, tp->t_inpcb->inp_options, &ro,
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
tp->t_inpcb);
+
+ if (error == EMSGSIZE && ro.ro_rt != NULL)
+ mtu = ro.ro_rt->rt_rmx.rmx_mtu;
+ RO_RTFREE(&ro);
}
+#endif /* INET */
if (error) {
/*
@@ -1277,21 +1333,18 @@ out:
* For some reason the interface we used initially
* to send segments changed to another or lowered
* its MTU.
- *
- * tcp_mtudisc() will find out the new MTU and as
- * its last action, initiate retransmission, so it
- * is important to not do so here.
- *
* If TSO was active we either got an interface
* without TSO capabilits or TSO was turned off.
- * Disable it for this connection as too and
- * immediatly retry with MSS sized segments generated
- * by this function.
+ * If we obtained mtu from ip_output() then update
+ * it and try again.
*/
if (tso)
tp->t_flags &= ~TF_TSO;
- tcp_mtudisc(tp->t_inpcb, -1);
- return (0);
+ if (mtu != 0) {
+ tcp_mss_update(tp, -1, mtu, NULL, NULL);
+ goto again;
+ }
+ return (error);
case EHOSTDOWN:
case EHOSTUNREACH:
case ENETDOWN:
diff --git a/freebsd/sys/netinet/tcp_reass.c b/freebsd/sys/netinet/tcp_reass.c
index 6b2605ce..aebda9db 100644
--- a/freebsd/sys/netinet/tcp_reass.c
+++ b/freebsd/sys/netinet/tcp_reass.c
@@ -76,24 +76,19 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
-static int tcp_reass_sysctl_maxseg(SYSCTL_HANDLER_ARGS);
static int tcp_reass_sysctl_qsize(SYSCTL_HANDLER_ARGS);
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
+static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
"TCP Segment Reassembly Queue");
static VNET_DEFINE(int, tcp_reass_maxseg) = 0;
#define V_tcp_reass_maxseg VNET(tcp_reass_maxseg)
-SYSCTL_VNET_PROC(_net_inet_tcp_reass, OID_AUTO, maxsegments,
- CTLTYPE_INT | CTLFLAG_RDTUN,
- &VNET_NAME(tcp_reass_maxseg), 0, &tcp_reass_sysctl_maxseg, "I",
+SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
+ &VNET_NAME(tcp_reass_maxseg), 0,
"Global maximum number of TCP Segments in Reassembly Queue");
-static VNET_DEFINE(int, tcp_reass_qsize) = 0;
-#define V_tcp_reass_qsize VNET(tcp_reass_qsize)
SYSCTL_VNET_PROC(_net_inet_tcp_reass, OID_AUTO, cursegments,
- CTLTYPE_INT | CTLFLAG_RD,
- &VNET_NAME(tcp_reass_qsize), 0, &tcp_reass_sysctl_qsize, "I",
+ (CTLTYPE_INT | CTLFLAG_RD), NULL, 0, &tcp_reass_sysctl_qsize, "I",
"Global number of TCP Segments currently in Reassembly Queue");
static VNET_DEFINE(int, tcp_reass_overflows) = 0;
@@ -111,8 +106,10 @@ static void
tcp_reass_zone_change(void *tag)
{
+ /* Set the zone limit and read back the effective value. */
V_tcp_reass_maxseg = nmbclusters / 16;
- uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxseg);
+ V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone,
+ V_tcp_reass_maxseg);
}
void
@@ -124,7 +121,9 @@ tcp_reass_init(void)
&V_tcp_reass_maxseg);
V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
- uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxseg);
+ /* Set the zone limit and read back the effective value. */
+ V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone,
+ V_tcp_reass_maxseg);
EVENTHANDLER_REGISTER(nmbclusters_change,
tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
}
@@ -158,17 +157,12 @@ tcp_reass_flush(struct tcpcb *tp)
}
static int
-tcp_reass_sysctl_maxseg(SYSCTL_HANDLER_ARGS)
-{
- V_tcp_reass_maxseg = uma_zone_get_max(V_tcp_reass_zone);
- return (sysctl_handle_int(oidp, arg1, arg2, req));
-}
-
-static int
tcp_reass_sysctl_qsize(SYSCTL_HANDLER_ARGS)
{
- V_tcp_reass_qsize = uma_zone_get_cur(V_tcp_reass_zone);
- return (sysctl_handle_int(oidp, arg1, arg2, req));
+ int qsize;
+
+ qsize = uma_zone_get_cur(V_tcp_reass_zone);
+ return (sysctl_handle_int(oidp, &qsize, 0, req));
}
int
@@ -299,7 +293,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
th->th_seq += i;
}
}
- tp->t_rcvoopack++;
+ tp->t_rcvoopack++;
TCPSTAT_INC(tcps_rcvoopack);
TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
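
The tcp_reass.c hunks above drop the shadow counters: maxsegments becomes a plain read-only tunable whose value is whatever the UMA zone actually granted, i.e. the return value of uma_zone_set_max() is stored back into V_tcp_reass_maxseg. A small stand-alone sketch of that request-then-read-back pattern; set_limit() is a made-up stand-in for uma_zone_set_max(), which may round the request up to the zone's allocation granularity:

#include <stdio.h>

/* Stand-in for uma_zone_set_max(): pretend limits round up to 64 items. */
static int
set_limit(int requested)
{
	return (requested + 63) & ~63;
}

int
main(void)
{
	int maxseg = 1500;		/* e.g. nmbclusters / 16 */

	/* Set the zone limit and keep the effective value, as the diff does. */
	maxseg = set_limit(maxseg);
	printf("effective maxsegments = %d\n", maxseg);
	return (0);
}
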
diff --git a/freebsd/sys/netinet/tcp_sack.c b/freebsd/sys/netinet/tcp_sack.c
index 449b538f..9cc1d86a 100644
--- a/freebsd/sys/netinet/tcp_sack.c
+++ b/freebsd/sys/netinet/tcp_sack.c
@@ -579,7 +579,7 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
/* Send one or 2 segments based on how much new data was acked. */
- if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) > 2)
+ if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) >= 2)
num_segs = 2;
tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
(tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg);
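
The one-character tcp_sack.c change above moves the partial-ACK boundary: an ACK that covers exactly two full segments now triggers two new segments instead of one. A quick arithmetic check of the old and new conditions, with plain integers rather than kernel types:

#include <stdio.h>

int
main(void)
{
	int maxseg = 1460;
	int acked[] = { 1460, 2920, 4380 };	/* 1, 2 and 3 segments acked */
	int i;

	for (i = 0; i < 3; i++) {
		int old_segs = ((acked[i] / maxseg) > 2) ? 2 : 1;
		int new_segs = ((acked[i] / maxseg) >= 2) ? 2 : 1;

		printf("acked=%d old=%d new=%d\n", acked[i], old_segs,
		    new_segs);
	}
	return (0);
}

Only the exactly-two-segments case changes hands; one- and three-segment ACKs behave as before.
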
diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c
index e23a0997..4c6d14eb 100644
--- a/freebsd/sys/netinet/tcp_subr.c
+++ b/freebsd/sys/netinet/tcp_subr.c
@@ -72,29 +72,25 @@ __FBSDID("$FreeBSD$");
#include <netinet/cc.h>
#include <netinet/in.h>
+#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
-#endif
-#include <netinet/in_pcb.h>
-#ifdef INET6
#include <netinet6/in6_pcb.h>
-#endif
-#include <netinet/in_var.h>
-#include <netinet/ip_var.h>
-#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#endif
-#include <netinet/ip_icmp.h>
+
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
-#include <netinet/tcp_offload.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
@@ -102,7 +98,12 @@ __FBSDID("$FreeBSD$");
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
+#ifdef INET6
#include <netinet6/ip6protosw.h>
+#endif
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
@@ -166,15 +167,7 @@ SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0,
&sysctl_net_inet_tcp_mss_v6_check, "I",
"Default TCP Maximum Segment Size for IPv6");
-#endif
-
-static int
-vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
-{
-
- VNET_SYSCTL_ARG(req, arg1);
- return (sysctl_msec_to_ticks(oidp, arg1, arg2, req));
-}
+#endif /* INET6 */
/*
* Minimum MSS we accept and use. This prevents DoS attacks where
@@ -187,7 +180,7 @@ vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS;
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
&VNET_NAME(tcp_minmss), 0,
- "Minmum TCP Maximum Segment Size");
+ "Minimum TCP Maximum Segment Size");
VNET_DEFINE(int, tcp_do_rfc1323) = 1;
SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
@@ -221,49 +214,9 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
&VNET_NAME(tcp_isn_reseed_interval), 0,
"Seconds between reseeding of ISN secret");
-/*
- * TCP bandwidth limiting sysctls. Note that the default lower bound of
- * 1024 exists only for debugging. A good production default would be
- * something like 6100.
- */
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
- "TCP inflight data limiting");
-
-static VNET_DEFINE(int, tcp_inflight_enable) = 0;
-#define V_tcp_inflight_enable VNET(tcp_inflight_enable)
-SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
- &VNET_NAME(tcp_inflight_enable), 0,
- "Enable automatic TCP inflight data limiting");
-
-static int tcp_inflight_debug = 0;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
- &tcp_inflight_debug, 0,
- "Debug TCP inflight calculations");
-
-static VNET_DEFINE(int, tcp_inflight_rttthresh);
-#define V_tcp_inflight_rttthresh VNET(tcp_inflight_rttthresh)
-SYSCTL_VNET_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh,
- CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_inflight_rttthresh), 0,
- vnet_sysctl_msec_to_ticks, "I",
- "RTT threshold below which inflight will deactivate itself");
-
-static VNET_DEFINE(int, tcp_inflight_min) = 6144;
-#define V_tcp_inflight_min VNET(tcp_inflight_min)
-SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
- &VNET_NAME(tcp_inflight_min), 0,
- "Lower-bound for TCP inflight window");
-
-static VNET_DEFINE(int, tcp_inflight_max) = TCP_MAXWIN << TCP_MAX_WINSHIFT;
-#define V_tcp_inflight_max VNET(tcp_inflight_max)
-SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
- &VNET_NAME(tcp_inflight_max), 0,
- "Upper-bound for TCP inflight window");
-
-static VNET_DEFINE(int, tcp_inflight_stab) = 20;
-#define V_tcp_inflight_stab VNET(tcp_inflight_stab)
-SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
- &VNET_NAME(tcp_inflight_stab), 0,
- "Inflight Algorithm Stabilization 20 = 2 packets");
+static int tcp_soreceive_stream = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
+ &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets");
#ifdef TCP_SIGNATURE
static int tcp_sig_checksigs = 1;
@@ -278,7 +231,6 @@ VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
static struct inpcb *tcp_notify(struct inpcb *, int);
static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
-static void tcp_isn_tick(void *);
static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
void *ip4hdr, const void *ip6hdr);
@@ -309,7 +261,6 @@ static VNET_DEFINE(uma_zone_t, tcpcb_zone);
#define V_tcpcb_zone VNET(tcpcb_zone)
MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
-struct callout isn_callout;
static struct mtx isn_mtx;
#define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
@@ -342,13 +293,6 @@ tcp_init(void)
{
int hashsize;
- INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp");
- LIST_INIT(&V_tcb);
-#ifdef VIMAGE
- V_tcbinfo.ipi_vnet = curvnet;
-#endif
- V_tcbinfo.ipi_listhead = &V_tcb;
-
if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN,
&V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
printf("%s: WARNING: unable to register helper hook\n", __func__);
@@ -362,14 +306,9 @@ tcp_init(void)
printf("WARNING: TCB hash size not a power of 2\n");
hashsize = 512; /* safe default */
}
- V_tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB,
- &V_tcbinfo.ipi_hashmask);
- V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB,
- &V_tcbinfo.ipi_porthashmask);
- V_tcbinfo.ipi_zone = uma_zcreate("tcp_inpcb", sizeof(struct inpcb),
- NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
- uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
- V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
+ in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize,
+ "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE,
+ IPI_HASHFIELDS_4TUPLE);
/*
* These have to be type stable for the benefit of the timers.
@@ -405,6 +344,16 @@ tcp_init(void)
tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
tcp_tcbhashsize = hashsize;
+ TUNABLE_INT_FETCH("net.inet.tcp.soreceive_stream", &tcp_soreceive_stream);
+ if (tcp_soreceive_stream) {
+#ifdef INET
+ tcp_usrreqs.pru_soreceive = soreceive_stream;
+#endif
+#ifdef INET6
+ tcp6_usrreqs.pru_soreceive = soreceive_stream;
+#endif /* INET6 */
+ }
+
#ifdef INET6
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
@@ -417,8 +366,6 @@ tcp_init(void)
#undef TCP_MINPROTOHDR
ISN_LOCK_INIT();
- callout_init(&isn_callout, CALLOUT_MPSAFE);
- callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL);
EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
@@ -434,18 +381,9 @@ tcp_destroy(void)
tcp_hc_destroy();
syncache_destroy();
tcp_tw_destroy();
-
- /* XXX check that hashes are empty! */
- hashdestroy(V_tcbinfo.ipi_hashbase, M_PCB,
- V_tcbinfo.ipi_hashmask);
- hashdestroy(V_tcbinfo.ipi_porthashbase, M_PCB,
- V_tcbinfo.ipi_porthashmask);
-
+ in_pcbinfo_destroy(&V_tcbinfo);
uma_zdestroy(V_sack_hole_zone);
uma_zdestroy(V_tcpcb_zone);
- uma_zdestroy(V_tcbinfo.ipi_zone);
-
- INP_INFO_LOCK_DESTROY(&V_tcbinfo);
}
#endif
@@ -453,7 +391,6 @@ void
tcp_fini(void *xtp)
{
- callout_stop(&isn_callout);
}
/*
@@ -481,8 +418,12 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
ip6->ip6_plen = htons(sizeof(struct tcphdr));
ip6->ip6_src = inp->in6p_laddr;
ip6->ip6_dst = inp->in6p_faddr;
- } else
+ }
+#endif /* INET6 */
+#if defined(INET6) && defined(INET)
+ else
#endif
+#ifdef INET
{
struct ip *ip;
@@ -499,6 +440,7 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
ip->ip_src = inp->inp_laddr;
ip->ip_dst = inp->inp_faddr;
}
+#endif /* INET */
th->th_sport = inp->inp_lport;
th->th_dport = inp->inp_fport;
th->th_seq = 0;
@@ -560,7 +502,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
#ifdef INET6
- isipv6 = ((struct ip *)ipgen)->ip_v == 6;
+ isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4);
ip6 = ipgen;
#endif /* INET6 */
ip = ipgen;
@@ -608,6 +550,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
m_freem(m->m_next);
m->m_next = NULL;
m->m_data = (caddr_t)ipgen;
+ m_addr_changed(m);
/* m_len is set later */
tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
@@ -638,11 +581,14 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
ip6->ip6_flow = 0;
ip6->ip6_vfc = IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_TCP;
- ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
- tlen));
+ ip6->ip6_plen = 0; /* Set in ip6_output(). */
tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
- } else
+ }
+#endif
+#if defined(INET) && defined(INET6)
+ else
#endif
+#ifdef INET
{
tlen += sizeof (struct tcpiphdr);
ip->ip_len = tlen;
@@ -650,6 +596,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
if (V_path_mtu_discovery)
ip->ip_off |= IP_DF;
}
+#endif
m->m_len = tlen;
m->m_pkthdr.len = tlen;
m->m_pkthdr.rcvif = NULL;
@@ -679,22 +626,27 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
else
nth->th_win = htons((u_short)win);
nth->th_urp = 0;
+
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
#ifdef INET6
if (isipv6) {
- nth->th_sum = 0;
- nth->th_sum = in6_cksum(m, IPPROTO_TCP,
- sizeof(struct ip6_hdr),
- tlen - sizeof(struct ip6_hdr));
+ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
+ nth->th_sum = in6_cksum_pseudo(ip6,
+ tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0);
ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
NULL, NULL);
- } else
+ }
#endif /* INET6 */
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
{
+ m->m_pkthdr.csum_flags = CSUM_TCP;
nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
- m->m_pkthdr.csum_flags = CSUM_TCP;
- m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
}
+#endif /* INET */
#ifdef TCPDEBUG
if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
@@ -702,9 +654,13 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
#ifdef INET6
if (isipv6)
(void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
- else
#endif /* INET6 */
- (void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
+#endif
}
/*
@@ -786,10 +742,8 @@ tcp_newtcpcb(struct inpcb *inp)
tp->t_rttmin = tcp_rexmit_min;
tp->t_rxtcur = TCPTV_RTOBASE;
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
- tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->t_rcvtime = ticks;
- tp->t_bw_rtttime = ticks;
/*
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
@@ -878,7 +832,7 @@ tcp_drop(struct tcpcb *tp, int errno)
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_state = TCPS_CLOSED;
- (void) tcp_output_reset(tp);
+ (void) tcp_output(tp);
TCPSTAT_INC(tcps_drops);
} else
TCPSTAT_INC(tcps_conndrops);
@@ -900,8 +854,19 @@ tcp_discardcb(struct tcpcb *tp)
INP_WLOCK_ASSERT(inp);
/*
- * Make sure that all of our timers are stopped before we
- * delete the PCB.
+ * Make sure that all of our timers are stopped before we delete the
+ * PCB.
+ *
+ * XXXRW: Really, we would like to use callout_drain() here in order
+ * to avoid races experienced in tcp_timer.c where a timer is already
+ * executing at this point. However, we can't, both because we're
+ * running in a context where we can't sleep, and also because we
+ * hold locks required by the timers. What we instead need to do is
+ * test to see if callout_drain() is required, and if so, defer some
+ * portion of the remainder of tcp_discardcb() to an asynchronous
+ * context that can callout_drain() and then continue. Some care
+ * will be required to ensure that no further processing takes place
+ * on the tcpcb, even though it hasn't been freed (a flag?).
*/
callout_stop(&tp->t_timers->tt_rexmt);
callout_stop(&tp->t_timers->tt_persist);
@@ -958,8 +923,6 @@ tcp_discardcb(struct tcpcb *tp)
metrics.rmx_rtt = tp->t_srtt;
metrics.rmx_rttvar = tp->t_rttvar;
- /* XXX: This wraps if the pipe is more than 4 Gbit per second */
- metrics.rmx_bandwidth = tp->snd_bandwidth;
metrics.rmx_cwnd = tp->snd_cwnd;
metrics.rmx_sendpipe = 0;
metrics.rmx_recvpipe = 0;
@@ -969,8 +932,12 @@ tcp_discardcb(struct tcpcb *tp)
/* free the reassembly queue, if any */
tcp_reass_flush(tp);
+
+#ifdef TCP_OFFLOAD
/* Disconnect offload device, if any. */
- tcp_offload_detach(tp);
+ if (tp->t_flags & TF_TOE)
+ tcp_offload_detach(tp);
+#endif
tcp_free_sackholes(tp);
@@ -999,9 +966,10 @@ tcp_close(struct tcpcb *tp)
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
- /* Notify any offload devices of listener close */
+#ifdef TCP_OFFLOAD
if (tp->t_state == TCPS_LISTEN)
- tcp_offload_listen_close(tp);
+ tcp_offload_listen_stop(tp);
+#endif
in_pcbdrop(inp);
TCPSTAT_INC(tcps_closed);
KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
@@ -1211,8 +1179,11 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
else if (inp->inp_flags & INP_TIMEWAIT) {
bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
xt.xt_tp.t_state = TCPS_TIME_WAIT;
- } else
+ } else {
bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
+ if (xt.xt_tp.t_timers)
+ tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer);
+ }
if (inp->inp_socket != NULL)
sotoxsocket(inp->inp_socket, &xt.xt_socket);
else {
@@ -1228,9 +1199,9 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
INP_INFO_WLOCK(&V_tcbinfo);
for (i = 0; i < n; i++) {
inp = inp_list[i];
- INP_WLOCK(inp);
- if (!in_pcbrele(inp))
- INP_WUNLOCK(inp);
+ INP_RLOCK(inp);
+ if (!in_pcbrele_rlocked(inp))
+ INP_RUNLOCK(inp);
}
INP_INFO_WUNLOCK(&V_tcbinfo);
@@ -1257,6 +1228,7 @@ SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
+#ifdef INET
static int
tcp_getcred(SYSCTL_HANDLER_ARGS)
{
@@ -1271,12 +1243,9 @@ tcp_getcred(SYSCTL_HANDLER_ARGS)
error = SYSCTL_IN(req, addrs, sizeof(addrs));
if (error)
return (error);
- INP_INFO_RLOCK(&V_tcbinfo);
- inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr,
- addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
+ inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
+ addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL);
if (inp != NULL) {
- INP_RLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
if (inp->inp_socket == NULL)
error = ENOENT;
if (error == 0)
@@ -1284,10 +1253,8 @@ tcp_getcred(SYSCTL_HANDLER_ARGS)
if (error == 0)
cru2x(inp->inp_cred, &xuc);
INP_RUNLOCK(inp);
- } else {
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ } else
error = ENOENT;
- }
if (error == 0)
error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
return (error);
@@ -1296,6 +1263,7 @@ tcp_getcred(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
+#endif /* INET */
#ifdef INET6
static int
@@ -1304,7 +1272,10 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS)
struct xucred xuc;
struct sockaddr_in6 addrs[2];
struct inpcb *inp;
- int error, mapped = 0;
+ int error;
+#ifdef INET
+ int mapped = 0;
+#endif
error = priv_check(req->td, PRIV_NETINET_GETCRED);
if (error)
@@ -1317,27 +1288,28 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS)
return (error);
}
if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
+#ifdef INET
if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
mapped = 1;
else
+#endif
return (EINVAL);
}
- INP_INFO_RLOCK(&V_tcbinfo);
+#ifdef INET
if (mapped == 1)
- inp = in_pcblookup_hash(&V_tcbinfo,
+ inp = in_pcblookup(&V_tcbinfo,
*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
addrs[1].sin6_port,
*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
- addrs[0].sin6_port,
- 0, NULL);
+ addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL);
else
- inp = in6_pcblookup_hash(&V_tcbinfo,
+#endif
+ inp = in6_pcblookup(&V_tcbinfo,
&addrs[1].sin6_addr, addrs[1].sin6_port,
- &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL);
+ &addrs[0].sin6_addr, addrs[0].sin6_port,
+ INPLOOKUP_RLOCKPCB, NULL);
if (inp != NULL) {
- INP_RLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
if (inp->inp_socket == NULL)
error = ENOENT;
if (error == 0)
@@ -1345,10 +1317,8 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS)
if (error == 0)
cru2x(inp->inp_cred, &xuc);
INP_RUNLOCK(inp);
- } else {
- INP_INFO_RUNLOCK(&V_tcbinfo);
+ } else
error = ENOENT;
- }
if (error == 0)
error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
return (error);
@@ -1357,9 +1327,10 @@ tcp6_getcred(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
-#endif
+#endif /* INET6 */
+#ifdef INET
void
tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
@@ -1408,10 +1379,9 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
th = (struct tcphdr *)((caddr_t)ip
+ (ip->ip_hl << 2));
INP_INFO_WLOCK(&V_tcbinfo);
- inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport,
- ip->ip_src, th->th_sport, 0, NULL);
+ inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport,
+ ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
if (inp != NULL) {
- INP_WLOCK(inp);
if (!(inp->inp_flags & INP_TIMEWAIT) &&
!(inp->inp_flags & INP_DROPPED) &&
!(inp->inp_socket == NULL)) {
@@ -1473,6 +1443,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
} else
in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
}
+#endif /* INET */
#ifdef INET6
void
@@ -1600,11 +1571,13 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
#define ISN_RANDOM_INCREMENT (4096 - 1)
static VNET_DEFINE(u_char, isn_secret[32]);
+static VNET_DEFINE(int, isn_last);
static VNET_DEFINE(int, isn_last_reseed);
static VNET_DEFINE(u_int32_t, isn_offset);
static VNET_DEFINE(u_int32_t, isn_offset_old);
#define V_isn_secret VNET(isn_secret)
+#define V_isn_last VNET(isn_last)
#define V_isn_last_reseed VNET(isn_last_reseed)
#define V_isn_offset VNET(isn_offset)
#define V_isn_offset_old VNET(isn_offset_old)
@@ -1615,6 +1588,7 @@ tcp_new_isn(struct tcpcb *tp)
MD5_CTX isn_ctx;
u_int32_t md5_buffer[4];
tcp_seq new_isn;
+ u_int32_t projected_offset;
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -1650,38 +1624,17 @@ tcp_new_isn(struct tcpcb *tp)
new_isn = (tcp_seq) md5_buffer[0];
V_isn_offset += ISN_STATIC_INCREMENT +
(arc4random() & ISN_RANDOM_INCREMENT);
- new_isn += V_isn_offset;
- ISN_UNLOCK();
- return (new_isn);
-}
-
-/*
- * Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary
- * to keep time flowing at a relatively constant rate. If the random
- * increments have already pushed us past the projected offset, do nothing.
- */
-static void
-tcp_isn_tick(void *xtp)
-{
- VNET_ITERATOR_DECL(vnet_iter);
- u_int32_t projected_offset;
-
- VNET_LIST_RLOCK_NOSLEEP();
- ISN_LOCK();
- VNET_FOREACH(vnet_iter) {
- CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */
- projected_offset =
- V_isn_offset_old + ISN_BYTES_PER_SECOND / 100;
-
+ if (ticks != V_isn_last) {
+ projected_offset = V_isn_offset_old +
+ ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last);
if (SEQ_GT(projected_offset, V_isn_offset))
V_isn_offset = projected_offset;
-
V_isn_offset_old = V_isn_offset;
- CURVNET_RESTORE();
+ V_isn_last = ticks;
}
+ new_isn += V_isn_offset;
ISN_UNLOCK();
- VNET_LIST_RUNLOCK_NOSLEEP();
- callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL);
+ return (new_isn);
}
/*
@@ -1755,10 +1708,11 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer)
tp->snd_recover = tp->snd_max;
if (tp->t_flags & TF_SACK_PERMIT)
EXIT_FASTRECOVERY(tp->t_flags);
- tcp_output_send(tp);
+ tcp_output(tp);
return (inp);
}
+#ifdef INET
/*
* Look-up the routing entry to the peer of this inpcb. If no route
* is found and it cannot be allocated, then return 0. This routine
@@ -1766,7 +1720,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer)
* tcp_mss_update to get the peer/interface MTU.
*/
u_long
-tcp_maxmtu(struct in_conninfo *inc, int *flags)
+tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap)
{
struct route sro;
struct sockaddr_in *dst;
@@ -1791,19 +1745,21 @@ tcp_maxmtu(struct in_conninfo *inc, int *flags)
maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
/* Report additional interface capabilities. */
- if (flags != NULL) {
+ if (cap != NULL) {
if (ifp->if_capenable & IFCAP_TSO4 &&
ifp->if_hwassist & CSUM_TSO)
- *flags |= CSUM_TSO;
+ cap->ifcap |= CSUM_TSO;
+ cap->tsomax = ifp->if_hw_tsomax;
}
RTFREE(sro.ro_rt);
}
return (maxmtu);
}
+#endif /* INET */
#ifdef INET6
u_long
-tcp_maxmtu6(struct in_conninfo *inc, int *flags)
+tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap)
{
struct route_in6 sro6;
struct ifnet *ifp;
@@ -1827,10 +1783,11 @@ tcp_maxmtu6(struct in_conninfo *inc, int *flags)
IN6_LINKMTU(sro6.ro_rt->rt_ifp));
/* Report additional interface capabilities. */
- if (flags != NULL) {
+ if (cap != NULL) {
if (ifp->if_capenable & IFCAP_TSO6 &&
ifp->if_hwassist & CSUM_TSO)
- *flags |= CSUM_TSO;
+ cap->ifcap |= CSUM_TSO;
+ cap->tsomax = ifp->if_hw_tsomax;
}
RTFREE(sro6.ro_rt);
}
@@ -1882,154 +1839,6 @@ ipsec_hdrsiz_tcp(struct tcpcb *tp)
}
#endif /* IPSEC */
-/*
- * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
- *
- * This code attempts to calculate the bandwidth-delay product as a
- * means of determining the optimal window size to maximize bandwidth,
- * minimize RTT, and avoid the over-allocation of buffers on interfaces and
- * routers. This code also does a fairly good job keeping RTTs in check
- * across slow links like modems. We implement an algorithm which is very
- * similar (but not meant to be) TCP/Vegas. The code operates on the
- * transmitter side of a TCP connection and so only effects the transmit
- * side of the connection.
- *
- * BACKGROUND: TCP makes no provision for the management of buffer space
- * at the end points or at the intermediate routers and switches. A TCP
- * stream, whether using NewReno or not, will eventually buffer as
- * many packets as it is able and the only reason this typically works is
- * due to the fairly small default buffers made available for a connection
- * (typicaly 16K or 32K). As machines use larger windows and/or window
- * scaling it is now fairly easy for even a single TCP connection to blow-out
- * all available buffer space not only on the local interface, but on
- * intermediate routers and switches as well. NewReno makes a misguided
- * attempt to 'solve' this problem by waiting for an actual failure to occur,
- * then backing off, then steadily increasing the window again until another
- * failure occurs, ad-infinitum. This results in terrible oscillation that
- * is only made worse as network loads increase and the idea of intentionally
- * blowing out network buffers is, frankly, a terrible way to manage network
- * resources.
- *
- * It is far better to limit the transmit window prior to the failure
- * condition being achieved. There are two general ways to do this: First
- * you can 'scan' through different transmit window sizes and locate the
- * point where the RTT stops increasing, indicating that you have filled the
- * pipe, then scan backwards until you note that RTT stops decreasing, then
- * repeat ad-infinitum. This method works in principle but has severe
- * implementation issues due to RTT variances, timer granularity, and
- * instability in the algorithm which can lead to many false positives and
- * create oscillations as well as interact badly with other TCP streams
- * implementing the same algorithm.
- *
- * The second method is to limit the window to the bandwidth delay product
- * of the link. This is the method we implement. RTT variances and our
- * own manipulation of the congestion window, bwnd, can potentially
- * destabilize the algorithm. For this reason we have to stabilize the
- * elements used to calculate the window. We do this by using the minimum
- * observed RTT, the long term average of the observed bandwidth, and
- * by adding two segments worth of slop. It isn't perfect but it is able
- * to react to changing conditions and gives us a very stable basis on
- * which to extend the algorithm.
- */
-void
-tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
-{
- u_long bw;
- u_long bwnd;
- int save_ticks;
-
- INP_WLOCK_ASSERT(tp->t_inpcb);
-
- /*
- * If inflight_enable is disabled in the middle of a tcp connection,
- * make sure snd_bwnd is effectively disabled.
- */
- if (V_tcp_inflight_enable == 0 ||
- tp->t_rttlow < V_tcp_inflight_rttthresh) {
- tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
- tp->snd_bandwidth = 0;
- return;
- }
-
- /*
- * Figure out the bandwidth. Due to the tick granularity this
- * is a very rough number and it MUST be averaged over a fairly
- * long period of time. XXX we need to take into account a link
- * that is not using all available bandwidth, but for now our
- * slop will ramp us up if this case occurs and the bandwidth later
- * increases.
- *
- * Note: if ticks rollover 'bw' may wind up negative. We must
- * effectively reset t_bw_rtttime for this case.
- */
- save_ticks = ticks;
- if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
- return;
-
- bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
- (save_ticks - tp->t_bw_rtttime);
- tp->t_bw_rtttime = save_ticks;
- tp->t_bw_rtseq = ack_seq;
- if (tp->t_bw_rtttime == 0 || (int)bw < 0)
- return;
- bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
-
- tp->snd_bandwidth = bw;
-
- /*
- * Calculate the semi-static bandwidth delay product, plus two maximal
- * segments. The additional slop puts us squarely in the sweet
- * spot and also handles the bandwidth run-up case and stabilization.
- * Without the slop we could be locking ourselves into a lower
- * bandwidth.
- *
- * Situations Handled:
- * (1) Prevents over-queueing of packets on LANs, especially on
- * high speed LANs, allowing larger TCP buffers to be
- * specified, and also does a good job preventing
- * over-queueing of packets over choke points like modems
- * (at least for the transmit side).
- *
- * (2) Is able to handle changing network loads (bandwidth
- * drops so bwnd drops, bandwidth increases so bwnd
- * increases).
- *
- * (3) Theoretically should stabilize in the face of multiple
- * connections implementing the same algorithm (this may need
- * a little work).
- *
- * (4) Stability value (defaults to 20 = 2 maximal packets) can
- * be adjusted with a sysctl but typically only needs to be
- * on very slow connections. A value no smaller then 5
- * should be used, but only reduce this default if you have
- * no other choice.
- */
-#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2)
- bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10;
-#undef USERTT
-
- if (tcp_inflight_debug > 0) {
- static int ltime;
- if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
- ltime = ticks;
- printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
- tp,
- bw,
- tp->t_rttbest,
- tp->t_srtt,
- bwnd
- );
- }
- }
- if ((long)bwnd < V_tcp_inflight_min)
- bwnd = V_tcp_inflight_min;
- if (bwnd > V_tcp_inflight_max)
- bwnd = V_tcp_inflight_max;
- if ((long)bwnd < tp->t_maxseg * 2)
- bwnd = tp->t_maxseg * 2;
- tp->snd_bwnd = bwnd;
-}
-
#ifdef TCP_SIGNATURE
/*
* Callback function invoked by m_apply() to digest TCP segment data
@@ -2071,11 +1880,15 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
u_char *buf, u_int direction)
{
union sockaddr_union dst;
+#ifdef INET
struct ippseudo ippseudo;
+#endif
MD5_CTX ctx;
int doff;
struct ip *ip;
+#ifdef INET
struct ipovly *ipovly;
+#endif
struct secasvar *sav;
struct tcphdr *th;
#ifdef INET6
@@ -2097,12 +1910,14 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
ip6 = NULL; /* Make the compiler happy. */
#endif
switch (ip->ip_v) {
+#ifdef INET
case IPVERSION:
dst.sa.sa_len = sizeof(struct sockaddr_in);
dst.sa.sa_family = AF_INET;
dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ?
ip->ip_src : ip->ip_dst;
break;
+#endif
#ifdef INET6
case (IPV6_VERSION >> 4):
ip6 = mtod(m, struct ip6_hdr *);
@@ -2142,6 +1957,7 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
* tcp_output(), the underlying ip_len member has not yet been set.
*/
switch (ip->ip_v) {
+#ifdef INET
case IPVERSION:
ipovly = (struct ipovly *)ip;
ippseudo.ippseudo_src = ipovly->ih_src;
@@ -2155,6 +1971,7 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip));
doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen;
break;
+#endif
#ifdef INET6
/*
* RFC 2385, 2.0 Proposal
@@ -2335,6 +2152,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
return (error);
break;
#endif
+#ifdef INET
case AF_INET:
fin = (struct sockaddr_in *)&addrs[0];
lin = (struct sockaddr_in *)&addrs[1];
@@ -2342,6 +2160,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
lin->sin_len != sizeof(struct sockaddr_in))
return (EINVAL);
break;
+#endif
default:
return (EINVAL);
}
@@ -2349,18 +2168,19 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
switch (addrs[0].ss_family) {
#ifdef INET6
case AF_INET6:
- inp = in6_pcblookup_hash(&V_tcbinfo, &fin6->sin6_addr,
- fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 0,
- NULL);
+ inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr,
+ fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
+ INPLOOKUP_WLOCKPCB, NULL);
break;
#endif
+#ifdef INET
case AF_INET:
- inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr,
- fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL);
+ inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port,
+ lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL);
break;
+#endif
}
if (inp != NULL) {
- INP_WLOCK(inp);
if (inp->inp_flags & INP_TIMEWAIT) {
/*
* XXXRW: There currently exists a state where an
@@ -2387,7 +2207,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
return (error);
}
-SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
+SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
0, sysctl_drop, "", "Drop TCP connection");
@@ -2485,6 +2305,7 @@ tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
sp = s + strlen(s);
sprintf(sp, "]:%i", ntohs(th->th_dport));
#endif /* INET6 */
+#ifdef INET
} else if (ip && th) {
inet_ntoa_r(ip->ip_src, sp);
sp = s + strlen(s);
@@ -2493,6 +2314,7 @@ tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
inet_ntoa_r(ip->ip_dst, sp);
sp = s + strlen(s);
sprintf(sp, "]:%i", ntohs(th->th_dport));
+#endif /* INET */
} else {
free(s, M_TCPLOG);
return (NULL);
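
Among the tcp_subr.c changes above, the ISN generator loses its periodic tcp_isn_tick() callout: tcp_new_isn() now advances the offset lazily by ISN_BYTES_PER_SECOND / hz for each tick elapsed since the previous call. A minimal stand-alone model of that arithmetic; the constants, the random_inc parameter, and the signed-difference test standing in for SEQ_GT() are illustrative only:

#include <stdint.h>
#include <stdio.h>

#define ISN_BYTES_PER_SECOND	1048576
#define HZ			1000

static uint32_t isn_offset, isn_offset_old;
static int isn_last;

static uint32_t
advance_isn(int now_ticks, uint32_t random_inc)
{
	uint32_t projected;

	isn_offset += random_inc;
	if (now_ticks != isn_last) {
		projected = isn_offset_old +
		    ISN_BYTES_PER_SECOND / HZ * (uint32_t)(now_ticks - isn_last);
		if ((int32_t)(projected - isn_offset) > 0)	/* SEQ_GT() */
			isn_offset = projected;
		isn_offset_old = isn_offset;
		isn_last = now_ticks;
	}
	return (isn_offset);
}

int
main(void)
{
	/* Calls within the same tick only accumulate the random increment. */
	printf("offset at tick 0:   %u\n", (unsigned)advance_isn(0, 100));
	/* A later call catches the offset up to the projected byte rate. */
	printf("offset at tick 250: %u\n", (unsigned)advance_isn(250, 100));
	return (0);
}
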
diff --git a/freebsd/sys/netinet/tcp_syncache.c b/freebsd/sys/netinet/tcp_syncache.c
index 80da0349..10bd00ae 100644
--- a/freebsd/sys/netinet/tcp_syncache.c
+++ b/freebsd/sys/netinet/tcp_syncache.c
@@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_ipsec.h>
+#include <rtems/bsd/local/opt_pcbgroup.h>
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
@@ -82,10 +83,12 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
-#include <netinet/tcp_offload.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
+#ifdef TCP_OFFLOAD
+#include <netinet/toecore.h>
+#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
@@ -111,10 +114,8 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW,
&VNET_NAME(tcp_syncookiesonly), 0,
"Use only TCP SYN cookies");
-#ifdef TCP_OFFLOAD_DISABLE
-#define TOEPCB_ISSET(sc) (0)
-#else
-#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL)
+#ifdef TCP_OFFLOAD
+#define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL)
#endif
static void syncache_drop(struct syncache *, struct syncache_head *);
@@ -124,6 +125,7 @@ struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
static int syncache_respond(struct syncache *);
static struct socket *syncache_socket(struct syncache *, struct socket *,
struct mbuf *m);
+static int syncache_sysctl_count(SYSCTL_HANDLER_ARGS);
static void syncache_timeout(struct syncache *sc, struct syncache_head *sch,
int docallout);
static void syncache_timer(void *);
@@ -148,7 +150,8 @@ static struct syncache
static VNET_DEFINE(struct tcp_syncache, tcp_syncache);
#define V_tcp_syncache VNET(tcp_syncache)
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache");
+static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0,
+ "TCP SYN cache");
SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
&VNET_NAME(tcp_syncache.bucket_limit), 0,
@@ -158,8 +161,8 @@ SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
&VNET_NAME(tcp_syncache.cache_limit), 0,
"Overall entry limit for syncache");
-SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD,
- &VNET_NAME(tcp_syncache.cache_count), 0,
+SYSCTL_VNET_PROC(_net_inet_tcp_syncache, OID_AUTO, count, (CTLTYPE_UINT|CTLFLAG_RD),
+ NULL, 0, &syncache_sysctl_count, "IU",
"Current number of entries in syncache");
SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
@@ -225,7 +228,6 @@ syncache_init(void)
{
int i;
- V_tcp_syncache.cache_count = 0;
V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
@@ -268,7 +270,8 @@ syncache_init(void)
/* Create the syncache entry zone. */
V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
- uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit);
+ V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone,
+ V_tcp_syncache.cache_limit);
}
#ifdef VIMAGE
@@ -296,8 +299,8 @@ syncache_destroy(void)
mtx_destroy(&sch->sch_mtx);
}
- KASSERT(V_tcp_syncache.cache_count == 0, ("%s: cache_count %d not 0",
- __func__, V_tcp_syncache.cache_count));
+ KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0,
+ ("%s: cache_count not 0", __func__));
/* Free the allocated global resources. */
uma_zdestroy(V_tcp_syncache.zone);
@@ -305,6 +308,15 @@ syncache_destroy(void)
}
#endif
+static int
+syncache_sysctl_count(SYSCTL_HANDLER_ARGS)
+{
+ int count;
+
+ count = uma_zone_get_cur(V_tcp_syncache.zone);
+ return (sysctl_handle_int(oidp, &count, 0, req));
+}
+
/*
* Inserts a syncache entry into the specified bucket row.
* Locks and unlocks the syncache_head autonomously.
@@ -332,6 +344,14 @@ syncache_insert(struct syncache *sc, struct syncache_head *sch)
TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash);
sch->sch_length++;
+#ifdef TCP_OFFLOAD
+ if (ADDED_BY_TOE(sc)) {
+ struct toedev *tod = sc->sc_tod;
+
+ tod->tod_syncache_added(tod, sc->sc_todctx);
+ }
+#endif
+
/* Reinitialize the bucket row's timer. */
if (sch->sch_length == 1)
sch->sch_nextc = ticks + INT_MAX;
@@ -339,7 +359,6 @@ syncache_insert(struct syncache *sc, struct syncache_head *sch)
SCH_UNLOCK(sch);
- V_tcp_syncache.cache_count++;
TCPSTAT_INC(tcps_sc_added);
}
@@ -356,12 +375,15 @@ syncache_drop(struct syncache *sc, struct syncache_head *sch)
TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
sch->sch_length--;
-#ifndef TCP_OFFLOAD_DISABLE
- if (sc->sc_tu)
- sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb);
-#endif
+#ifdef TCP_OFFLOAD
+ if (ADDED_BY_TOE(sc)) {
+ struct toedev *tod = sc->sc_tod;
+
+ tod->tod_syncache_removed(tod, sc->sc_todctx);
+ }
+#endif
+
syncache_free(sc);
- V_tcp_syncache.cache_count--;
}
/*
@@ -629,7 +651,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
struct inpcb *inp = NULL;
struct socket *so;
struct tcpcb *tp;
- int error = 0;
+ int error;
char *s;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
@@ -663,6 +685,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
inp = sotoinpcb(so);
inp->inp_inc.inc_fibnum = so->so_fibnum;
INP_WLOCK(inp);
+ INP_HASH_WLOCK(&V_tcbinfo);
/* Insert new socket into PCB hash list. */
inp->inp_inc.inc_flags = sc->sc_inc.inc_flags;
@@ -677,8 +700,14 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
#ifdef INET6
}
#endif
+
+ /*
+ * Install in the reservation hash table for now, but don't yet
+ * install a connection group since the full 4-tuple isn't yet
+ * configured.
+ */
inp->inp_lport = sc->sc_inc.inc_lport;
- if ((error = in_pcbinshash(inp)) != 0) {
+ if ((error = in_pcbinshash_nopcbgroup(inp)) != 0) {
/*
* Undo the assignments above if we failed to
* put the PCB on the hash lists.
@@ -696,6 +725,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
s, __func__, error);
free(s, M_TCPLOG);
}
+ INP_HASH_WUNLOCK(&V_tcbinfo);
goto abort;
}
#ifdef IPSEC
@@ -730,13 +760,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
laddr6 = inp->in6p_laddr;
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
inp->in6p_laddr = sc->sc_inc.inc6_laddr;
+ if ((error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6,
#ifndef __rtems__
- if ((error = in6_pcbconnect(inp, (struct sockaddr *)&sin6,
- thread0.td_ucred)) != 0) {
-#else /* __rtems__ */
- if ((error = in6_pcbconnect(inp, (struct sockaddr *)&sin6,
- NULL)) != 0) {
-#endif /* __rtems__ */
+ thread0.td_ucred, m)) != 0) {
+#else /* __rtems__ */
+ NULL, m)) != 0) {
+#endif /* __rtems__ */
inp->in6p_laddr = laddr6;
if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed "
@@ -744,13 +773,18 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
s, __func__, error);
free(s, M_TCPLOG);
}
+ INP_HASH_WUNLOCK(&V_tcbinfo);
goto abort;
}
/* Override flowlabel from in6_pcbconnect. */
inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
inp->inp_flow |= sc->sc_flowlabel;
- } else
+ }
+#endif /* INET6 */
+#if defined(INET) && defined(INET6)
+ else
#endif
+#ifdef INET
{
struct in_addr laddr;
struct sockaddr_in sin;
@@ -770,14 +804,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
laddr = inp->inp_laddr;
if (inp->inp_laddr.s_addr == INADDR_ANY)
inp->inp_laddr = sc->sc_inc.inc_laddr;
+ if ((error = in_pcbconnect_mbuf(inp, (struct sockaddr *)&sin,
#ifndef __rtems__
- if ((error = in_pcbconnect(inp, (struct sockaddr *)&sin,
- thread0.td_ucred)) != 0) {
-#else /* __rtems__ */
- if ((error = in_pcbconnect(inp, (struct sockaddr *)&sin,
- NULL)) != 0) {
-#endif /* __rtems__ */
-
+ thread0.td_ucred, m)) != 0) {
+#else /* __rtems__ */
+ NULL, m)) != 0) {
+#endif /* __rtems__ */
inp->inp_laddr = laddr;
if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
log(LOG_DEBUG, "%s; %s: in_pcbconnect failed "
@@ -785,9 +817,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
s, __func__, error);
free(s, M_TCPLOG);
}
+ INP_HASH_WUNLOCK(&V_tcbinfo);
goto abort;
}
}
+#endif /* INET */
+ INP_HASH_WUNLOCK(&V_tcbinfo);
tp = intotcpcb(inp);
tp->t_state = TCPS_SYN_RECEIVED;
tp->iss = sc->sc_iss;
@@ -835,12 +870,33 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
tcp_mss(tp, sc->sc_peer_mss);
/*
- * If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
+	 * If the SYN,ACK was retransmitted, indicate that the CWND should be
+	 * limited to one segment in cc_conn_init().
* NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits.
*/
if (sc->sc_rxmits > 1)
- tp->snd_cwnd = tp->t_maxseg;
- tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
+ tp->snd_cwnd = 1;
+
+#ifdef TCP_OFFLOAD
+ /*
+ * Allow a TOE driver to install its hooks. Note that we hold the
+ * pcbinfo lock too and that prevents tcp_usr_accept from accepting a
+ * new connection before the TOE driver has done its thing.
+ */
+ if (ADDED_BY_TOE(sc)) {
+ struct toedev *tod = sc->sc_tod;
+
+ tod->tod_offload_socket(tod, sc->sc_todctx, so);
+ }
+#endif
+ /*
+ * Copy and activate timers.
+ */
+ tp->t_keepinit = sototcpcb(lso)->t_keepinit;
+ tp->t_keepidle = sototcpcb(lso)->t_keepidle;
+ tp->t_keepintvl = sototcpcb(lso)->t_keepintvl;
+ tp->t_keepcnt = sototcpcb(lso)->t_keepcnt;
+ tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
INP_WUNLOCK(inp);
@@ -913,7 +969,13 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
/* Pull out the entry to unlock the bucket row. */
TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
sch->sch_length--;
- V_tcp_syncache.cache_count--;
+#ifdef TCP_OFFLOAD
+ if (ADDED_BY_TOE(sc)) {
+ struct toedev *tod = sc->sc_tod;
+
+ tod->tod_syncache_removed(tod, sc->sc_todctx);
+ }
+#endif
SCH_UNLOCK(sch);
}
@@ -921,7 +983,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
* Segment validation:
* ACK must match our initial sequence number + 1 (the SYN|ACK).
*/
- if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) {
+ if (th->th_ack != sc->sc_iss + 1) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment "
"rejected\n", s, __func__, th->th_ack, sc->sc_iss);
@@ -932,9 +994,8 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
* The SEQ must fall in the window starting at the received
* initial receive sequence number + 1 (the SYN).
*/
- if ((SEQ_LEQ(th->th_seq, sc->sc_irs) ||
- SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) &&
- !TOEPCB_ISSET(sc)) {
+ if (SEQ_LEQ(th->th_seq, sc->sc_irs) ||
+ SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment "
"rejected\n", s, __func__, th->th_seq, sc->sc_irs);
@@ -951,8 +1012,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
* If timestamps were negotiated the reflected timestamp
* must be equal to what we actually sent in the SYN|ACK.
*/
- if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts &&
- !TOEPCB_ISSET(sc)) {
+ if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, "
"segment rejected\n",
@@ -980,25 +1040,6 @@ failed:
return (0);
}
-int
-tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo,
- struct tcphdr *th, struct socket **lsop, struct mbuf *m)
-{
- struct tcpopt to;
- int rc;
-
- bzero(&to, sizeof(struct tcpopt));
- to.to_mss = toeo->to_mss;
- to.to_wscale = toeo->to_wscale;
- to.to_flags = toeo->to_flags;
-
- INP_INFO_WLOCK(&V_tcbinfo);
- rc = syncache_expand(inc, &to, th, lsop, m);
- INP_INFO_WUNLOCK(&V_tcbinfo);
-
- return (rc);
-}
-
/*
* Given a LISTEN socket and an inbound SYN request, add
* this to the syn cache, and send back a segment:
@@ -1014,8 +1055,8 @@ tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo,
*/
static void
_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
- struct inpcb *inp, struct socket **lsop, struct mbuf *m,
- struct toe_usrreqs *tu, void *toepcb)
+ struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod,
+ void *todctx)
{
struct tcpcb *tp;
struct socket *so;
@@ -1080,7 +1121,11 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
#ifdef INET6
if (!(inc->inc_flags & INC_ISIPV6))
#endif
+#ifdef INET
ipopts = (m) ? ip_srcroute(m) : NULL;
+#else
+ ipopts = NULL;
+#endif
/*
* See if we already have an entry for this connection.
@@ -1097,11 +1142,6 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
sc = syncache_lookup(inc, &sch); /* returns locked entry */
SCH_LOCK_ASSERT(sch);
if (sc != NULL) {
-#ifndef TCP_OFFLOAD_DISABLE
- if (sc->sc_tu)
- sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT,
- sc->sc_toepcb);
-#endif
TCPSTAT_INC(tcps_sc_dupsyn);
if (ipopts) {
/*
@@ -1134,7 +1174,7 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
s, __func__);
free(s, M_TCPLOG);
}
- if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) {
+ if (syncache_respond(sc) == 0) {
sc->sc_rxmits = 0;
syncache_timeout(sc, sch, 1);
TCPSTAT_INC(tcps_sndacks);
@@ -1185,9 +1225,9 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
sc->sc_ip_tos = ip_tos;
sc->sc_ip_ttl = ip_ttl;
}
-#ifndef TCP_OFFLOAD_DISABLE
- sc->sc_tu = tu;
- sc->sc_toepcb = toepcb;
+#ifdef TCP_OFFLOAD
+ sc->sc_tod = tod;
+ sc->sc_todctx = todctx;
#endif
sc->sc_irs = th->th_seq;
sc->sc_iss = arc4random();
@@ -1282,7 +1322,7 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
/*
* Do a standard 3-way handshake.
*/
- if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) {
+ if (syncache_respond(sc) == 0) {
if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs)
syncache_free(sc);
else if (sc != &scs)
@@ -1314,8 +1354,8 @@ syncache_respond(struct syncache *sc)
{
struct ip *ip = NULL;
struct mbuf *m;
- struct tcphdr *th;
- int optlen, error;
+ struct tcphdr *th = NULL;
+ int optlen, error = 0; /* Make compiler happy */
u_int16_t hlen, tlen, mssopt;
struct tcpopt to;
#ifdef INET6
@@ -1363,8 +1403,12 @@ syncache_respond(struct syncache *sc)
ip6->ip6_flow |= sc->sc_flowlabel;
th = (struct tcphdr *)(ip6 + 1);
- } else
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
#endif
+#ifdef INET
{
ip = mtod(m, struct ip *);
ip->ip_v = IPVERSION;
@@ -1391,6 +1435,7 @@ syncache_respond(struct syncache *sc)
th = (struct tcphdr *)(ip + 1);
}
+#endif /* INET */
th->th_sport = sc->sc_inc.inc_lport;
th->th_dport = sc->sc_inc.inc_fport;
@@ -1451,22 +1496,45 @@ syncache_respond(struct syncache *sc)
optlen = 0;
M_SETFIB(m, sc->sc_inc.inc_fibnum);
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
#ifdef INET6
if (sc->sc_inc.inc_flags & INC_ISIPV6) {
- th->th_sum = 0;
- th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen,
- tlen + optlen - hlen);
+ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
+ th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen,
+ IPPROTO_TCP, 0);
ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
+#ifdef TCP_OFFLOAD
+ if (ADDED_BY_TOE(sc)) {
+ struct toedev *tod = sc->sc_tod;
+
+ error = tod->tod_syncache_respond(tod, sc->sc_todctx, m);
+
+ return (error);
+ }
+#endif
error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
- } else
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
#endif
+#ifdef INET
{
+ m->m_pkthdr.csum_flags = CSUM_TCP;
th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(tlen + optlen - hlen + IPPROTO_TCP));
- m->m_pkthdr.csum_flags = CSUM_TCP;
- m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+#ifdef TCP_OFFLOAD
+ if (ADDED_BY_TOE(sc)) {
+ struct toedev *tod = sc->sc_tod;
+
+ error = tod->tod_syncache_respond(tod, sc->sc_todctx, m);
+
+ return (error);
+ }
+#endif
error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL);
}
+#endif
return (error);
}
@@ -1478,23 +1546,12 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
}
void
-tcp_offload_syncache_add(struct in_conninfo *inc, struct toeopt *toeo,
- struct tcphdr *th, struct inpcb *inp, struct socket **lsop,
- struct toe_usrreqs *tu, void *toepcb)
+tcp_offload_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
+ struct inpcb *inp, struct socket **lsop, void *tod, void *todctx)
{
- struct tcpopt to;
- bzero(&to, sizeof(struct tcpopt));
- to.to_mss = toeo->to_mss;
- to.to_wscale = toeo->to_wscale;
- to.to_flags = toeo->to_flags;
-
- INP_INFO_WLOCK(&V_tcbinfo);
- INP_WLOCK(inp);
-
- _syncache_add(inc, &to, th, inp, lsop, NULL, tu, toepcb);
+ _syncache_add(inc, to, th, inp, lsop, NULL, tod, todctx);
}
-
/*
* The purpose of SYN cookies is to avoid keeping track of all SYN's we
* receive and to be able to handle SYN floods from bogus source addresses
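
The syncache changes above also drop the unprotected cache_count field; net.inet.tcp.syncache.count is now computed on demand by asking the UMA zone how many entries it currently holds. A tiny user-space analogue of that handler shape, with made-up stand-ins (zone_get_cur() for uma_zone_get_cur(), report_count() for the sysctl proc):

#include <stdio.h>

static int zone_items = 3;		/* entries currently in the zone */

/* Stand-in for uma_zone_get_cur(). */
static int
zone_get_cur(void)
{
	return (zone_items);
}

/* Stand-in for syncache_sysctl_count(): snapshot at query time and report. */
static int
report_count(int *out)
{
	int count;

	count = zone_get_cur();
	*out = count;
	return (0);
}

int
main(void)
{
	int v;

	report_count(&v);
	printf("net.inet.tcp.syncache.count = %d\n", v);
	return (0);
}
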
diff --git a/freebsd/sys/netinet/tcp_syncache.h b/freebsd/sys/netinet/tcp_syncache.h
index 93c7aaa2..c55bfbcd 100644
--- a/freebsd/sys/netinet/tcp_syncache.h
+++ b/freebsd/sys/netinet/tcp_syncache.h
@@ -34,8 +34,6 @@
#define _NETINET_TCP_SYNCACHE_H_
#ifdef _KERNEL
-struct toeopt;
-
void syncache_init(void);
#ifdef VIMAGE
void syncache_destroy(void);
@@ -43,14 +41,10 @@ void syncache_destroy(void);
void syncache_unreach(struct in_conninfo *, struct tcphdr *);
int syncache_expand(struct in_conninfo *, struct tcpopt *,
struct tcphdr *, struct socket **, struct mbuf *);
-int tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo,
- struct tcphdr *th, struct socket **lsop, struct mbuf *m);
void syncache_add(struct in_conninfo *, struct tcpopt *,
struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *);
-void tcp_offload_syncache_add(struct in_conninfo *, struct toeopt *,
- struct tcphdr *, struct inpcb *, struct socket **,
- struct toe_usrreqs *tu, void *toepcb);
-
+void tcp_offload_syncache_add(struct in_conninfo *, struct tcpopt *,
+ struct tcphdr *, struct inpcb *, struct socket **, void *, void *);
void syncache_chkrst(struct in_conninfo *, struct tcphdr *);
void syncache_badack(struct in_conninfo *);
int syncache_pcbcount(void);
@@ -75,12 +69,14 @@ struct syncache {
u_int8_t sc_requested_s_scale:4,
sc_requested_r_scale:4;
u_int16_t sc_flags;
-#ifndef TCP_OFFLOAD_DISABLE
- struct toe_usrreqs *sc_tu; /* TOE operations */
- void *sc_toepcb; /* TOE protocol block */
-#endif
+#if defined(TCP_OFFLOAD) || !defined(TCP_OFFLOAD_DISABLE)
+ struct toedev *sc_tod; /* entry added by this TOE */
+ void *sc_todctx; /* TOE driver context */
+#endif
struct label *sc_label; /* MAC label reference */
struct ucred *sc_cred; /* cred cache for jail checks */
+
+ u_int32_t sc_spare[2]; /* UTO */
};
/*
@@ -117,7 +113,6 @@ struct tcp_syncache {
u_int hashsize;
u_int hashmask;
u_int bucket_limit;
- u_int cache_count; /* XXX: unprotected */
u_int cache_limit;
u_int rexmt_limit;
u_int hash_secret;
diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c
index 77cc1feb..db952e42 100644
--- a/freebsd/sys/netinet/tcp_timer.c
+++ b/freebsd/sys/netinet/tcp_timer.c
@@ -34,6 +34,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_tcpdebug.h>
@@ -43,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
+#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
@@ -112,18 +114,25 @@ int tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
&tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
+int tcp_keepcnt = TCPTV_KEEPCNT;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
+ "Number of keepalive probes to send");
-static int tcp_keepcnt = TCPTV_KEEPCNT;
/* max idle probes */
int tcp_maxpersistidle;
- /* max idle time in persist */
-int tcp_maxidle;
static int tcp_rexmit_drop_options = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
&tcp_rexmit_drop_options, 0,
"Drop TCP options from 3rd and later retransmitted SYN");
+static int per_cpu_timers = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
+ &per_cpu_timers , 0, "run tcp timers on all cpus");
+
+#define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
+ ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
+
/*
* Tcp protocol timeout routine called every 500 ms.
* Updates timestamps used for TCP
@@ -137,7 +146,6 @@ tcp_slowtimo(void)
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
INP_INFO_WLOCK(&V_tcbinfo);
(void) tcp_tw_2msl_scan(0);
INP_INFO_WUNLOCK(&V_tcbinfo);
@@ -265,9 +273,9 @@ tcp_timer_2msl(void *xtp)
tp = tcp_close(tp);
} else {
if (tp->t_state != TCPS_TIME_WAIT &&
- ticks - tp->t_rcvtime <= tcp_maxidle)
- callout_reset(&tp->t_timers->tt_2msl, tcp_keepintvl,
- tcp_timer_2msl, tp);
+ ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
+ callout_reset_on(&tp->t_timers->tt_2msl,
+ TP_KEEPINTVL(tp), tcp_timer_2msl, tp, INP_CPU(inp));
else
tp = tcp_close(tp);
}
@@ -334,7 +342,7 @@ tcp_timer_keep(void *xtp)
goto dropit;
if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
tp->t_state <= TCPS_CLOSING) {
- if (ticks - tp->t_rcvtime >= tcp_keepidle + tcp_maxidle)
+ if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
goto dropit;
/*
* Send a packet designed to force a response
@@ -356,9 +364,11 @@ tcp_timer_keep(void *xtp)
tp->rcv_nxt, tp->snd_una - 1, 0);
free(t_template, M_TEMP);
}
- callout_reset(&tp->t_timers->tt_keep, tcp_keepintvl, tcp_timer_keep, tp);
+ callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
+ tcp_timer_keep, tp, INP_CPU(inp));
} else
- callout_reset(&tp->t_timers->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
+ callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
+ tcp_timer_keep, tp, INP_CPU(inp));
#ifdef TCPDEBUG
if (inp->inp_socket->so_options & SO_DEBUG)
@@ -445,6 +455,16 @@ tcp_timer_persist(void *xtp)
tp = tcp_drop(tp, ETIMEDOUT);
goto out;
}
+ /*
+ * If the user has closed the socket then drop a persisting
+ * connection after a much reduced timeout.
+ */
+ if (tp->t_state > TCPS_CLOSE_WAIT &&
+ (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
+ TCPSTAT_INC(tcps_persistdrop);
+ tp = tcp_drop(tp, ETIMEDOUT);
+ goto out;
+ }
tcp_setpersist(tp);
tp->t_flags |= TF_FORCEDATA;
(void) tcp_output(tp);
@@ -474,8 +494,7 @@ tcp_timer_rexmt(void * xtp)
ostate = tp->t_state;
#endif
- INP_INFO_WLOCK(&V_tcbinfo);
- headlocked = 1;
+ INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
/*
* XXXRW: While this assert is in fact correct, bugs in the tcpcb
@@ -486,7 +505,7 @@ tcp_timer_rexmt(void * xtp)
*/
if (inp == NULL) {
tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -494,14 +513,14 @@ tcp_timer_rexmt(void * xtp)
if (callout_pending(&tp->t_timers->tt_rexmt) ||
!callout_active(&tp->t_timers->tt_rexmt)) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_rexmt);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -514,13 +533,37 @@ tcp_timer_rexmt(void * xtp)
if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
tp->t_rxtshift = TCP_MAXRXTSHIFT;
TCPSTAT_INC(tcps_timeoutdrop);
+ in_pcbref(inp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_WUNLOCK(inp);
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
+ if (inp->inp_flags & INP_DROPPED) {
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
+
tp = tcp_drop(tp, tp->t_softerror ?
tp->t_softerror : ETIMEDOUT);
+ headlocked = 1;
goto out;
}
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
headlocked = 0;
- if (tp->t_rxtshift == 1) {
+ if (tp->t_state == TCPS_SYN_SENT) {
+ /*
+ * If the SYN was retransmitted, indicate CWND to be
+ * limited to 1 segment in cc_conn_init().
+ */
+ tp->snd_cwnd = 1;
+ } else if (tp->t_rxtshift == 1) {
/*
* first retransmit; record ssthresh and cwnd so they can
* be recovered if this turns out to be a "bad" retransmit.
@@ -547,13 +590,13 @@ tcp_timer_rexmt(void * xtp)
tp->t_flags &= ~TF_PREVVALID;
TCPSTAT_INC(tcps_rexmttimeo);
if (tp->t_state == TCPS_SYN_SENT)
- rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
+ rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
else
rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
TCPT_RANGESET(tp->t_rxtcur, rexmt,
tp->t_rttmin, TCPTV_REXMTMAX);
/*
- * Disable rfc1323 if we haven't got any response to
+ * Disable RFC1323 and SACK if we haven't got any response to
* our third SYN to work-around some broken terminal servers
* (most of which have hopefully been retired) that have bad VJ
* header compression code which trashes TCP segments containing
@@ -561,7 +604,7 @@ tcp_timer_rexmt(void * xtp)
*/
if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
(tp->t_rxtshift == 3))
- tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP);
+ tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
/*
* If we backed off this far, our srtt estimate is probably bogus.
* Clobber it so we'll take the next rtt measurement as our srtt;
@@ -572,7 +615,6 @@ tcp_timer_rexmt(void * xtp)
#ifdef INET6
if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
in6_losing(tp->t_inpcb);
- else
#endif
tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
tp->t_srtt = 0;
@@ -610,6 +652,13 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
{
struct callout *t_callout;
void *f_callout;
+ struct inpcb *inp = tp->t_inpcb;
+ int cpu = INP_CPU(inp);
+
+#ifdef TCP_OFFLOAD
+ if (tp->t_flags & TF_TOE)
+ return;
+#endif
switch (timer_type) {
case TT_DELACK:
@@ -638,7 +687,7 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
if (delta == 0) {
callout_stop(t_callout);
} else {
- callout_reset(t_callout, delta, f_callout, tp);
+ callout_reset_on(t_callout, delta, f_callout, tp, cpu);
}
}
@@ -668,3 +717,24 @@ tcp_timer_active(struct tcpcb *tp, int timer_type)
}
return callout_active(t_callout);
}
+
+#define ticks_to_msecs(t) (1000*(t) / hz)
+
+void
+tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer)
+{
+ bzero(xtimer, sizeof(struct xtcp_timer));
+ if (timer == NULL)
+ return;
+ if (callout_active(&timer->tt_delack))
+ xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks);
+ if (callout_active(&timer->tt_rexmt))
+ xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks);
+ if (callout_active(&timer->tt_persist))
+ xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks);
+ if (callout_active(&timer->tt_keep))
+ xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks);
+ if (callout_active(&timer->tt_2msl))
+ xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks);
+ xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
+}
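
Note: the hunks above switch the TCP timers from callout_reset() to callout_reset_on(), using INP_CPU() to pin each connection's callouts to a CPU derived from its flow id when the per_cpu_timers sysctl is enabled. A compilable, userland-style sketch of that CPU selection follows; timer_cpu(), ncpus and flowid are illustrative stand-ins for the macro, mp_maxid + 1 and inp->inp_flowid, and the CPU_ABSENT()/curcpu fallback is left out.

#include <stdint.h>

/*
 * Illustrative only: mirror of the INP_CPU() selection above.  With
 * per-CPU timers disabled everything stays on CPU 0; otherwise the
 * flow id spreads connections across the available CPUs.
 */
static int
timer_cpu(uint32_t flowid, int ncpus, int per_cpu_timers)
{
        if (!per_cpu_timers || ncpus <= 0)
                return (0);
        return ((int)(flowid % (uint32_t)ncpus));
}
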
diff --git a/freebsd/sys/netinet/tcp_timer.h b/freebsd/sys/netinet/tcp_timer.h
index ff455b6b..0da58fd8 100644
--- a/freebsd/sys/netinet/tcp_timer.h
+++ b/freebsd/sys/netinet/tcp_timer.h
@@ -86,9 +86,6 @@
#define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */
#define TCPTV_KEEPCNT 8 /* max probes before drop */
-#define TCPTV_INFLIGHT_RTTTHRESH (10*hz/1000) /* below which inflight
- disengages, in msec */
-
#define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */
/*
@@ -121,7 +118,7 @@
#define TCP_MAXRXTSHIFT 12 /* maximum retransmits */
-#define TCPTV_DELACK (hz / PR_FASTHZ / 2) /* 100ms timeout */
+#define TCPTV_DELACK ( hz/10 ) /* 100ms timeout */
#ifdef TCPTIMERS
static const char *tcptimers[] =
@@ -141,6 +138,8 @@ static const char *tcptimers[] =
#ifdef _KERNEL
+struct xtcp_timer;
+
struct tcp_timer {
struct callout tt_rexmt; /* retransmit timer */
struct callout tt_persist; /* retransmit persistence */
@@ -154,10 +153,16 @@ struct tcp_timer {
#define TT_KEEP 0x08
#define TT_2MSL 0x10
+#define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit)
+#define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle)
+#define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl)
+#define TP_KEEPCNT(tp) ((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt)
+#define TP_MAXIDLE(tp) (TP_KEEPCNT(tp) * TP_KEEPINTVL(tp))
+
extern int tcp_keepinit; /* time to establish connection */
extern int tcp_keepidle; /* time before keepalive probes begin */
extern int tcp_keepintvl; /* time between keepalive probes */
-extern int tcp_maxidle; /* time to drop after starting probes */
+extern int tcp_keepcnt; /* number of keepalives */
extern int tcp_delacktime; /* time before sending a delayed ACK */
extern int tcp_maxpersistidle;
extern int tcp_rexmit_min;
@@ -177,6 +182,8 @@ void tcp_timer_keep(void *xtp);
void tcp_timer_persist(void *xtp);
void tcp_timer_rexmt(void *xtp);
void tcp_timer_delack(void *xtp);
+void tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
+ struct xtcp_timer *xtimer);
#endif /* _KERNEL */
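
Note: the TP_KEEPINIT()/TP_KEEPIDLE()/TP_KEEPINTVL()/TP_KEEPCNT() macros added above implement a simple override rule: a non-zero per-connection value wins, otherwise the global sysctl default applies, and TP_MAXIDLE() (how long probing may go unanswered) is keepcnt * keepintvl. A small self-contained sketch of the same rule; keep_cfg and keep_or_default() are made-up names, not tcpcb members.

#include <stdio.h>

/* Placeholder for the per-connection values; 0 means "use the default". */
struct keep_cfg {
        unsigned idle, intvl, cnt;
};

static unsigned
keep_or_default(unsigned per_conn, unsigned global_default)
{
        return (per_conn != 0 ? per_conn : global_default);
}

int
main(void)
{
        struct keep_cfg c = { 0, 0, 4 };        /* only the probe count overridden */
        unsigned idle = keep_or_default(c.idle, 7200);
        unsigned intvl = keep_or_default(c.intvl, 75);
        unsigned cnt = keep_or_default(c.cnt, 8);

        /* Mirrors TP_MAXIDLE(): drop deadline once probing has started. */
        printf("probe after %us idle, drop after %us of probing\n",
            idle, cnt * intvl);
        return (0);
}
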
diff --git a/freebsd/sys/netinet/tcp_timewait.c b/freebsd/sys/netinet/tcp_timewait.c
index f9b613a7..9034fab4 100644
--- a/freebsd/sys/netinet/tcp_timewait.c
+++ b/freebsd/sys/netinet/tcp_timewait.c
@@ -59,23 +59,19 @@ __FBSDID("$FreeBSD$");
#include <net/vnet.h>
#include <netinet/in.h>
+#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
-#endif
-#include <netinet/in_pcb.h>
-#ifdef INET6
#include <netinet6/in6_pcb.h>
-#endif
-#include <netinet/in_var.h>
-#include <netinet/ip_var.h>
-#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#endif
-#include <netinet/ip_icmp.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
@@ -88,7 +84,9 @@ __FBSDID("$FreeBSD$");
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
+#ifdef INET6
#include <netinet6/ip6protosw.h>
+#endif
#include <machine/in_cksum.h>
@@ -204,15 +202,31 @@ tcp_twstart(struct tcpcb *tp)
struct inpcb *inp = tp->t_inpcb;
int acknow;
struct socket *so;
+#ifdef INET6
+ int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
+#endif
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */
INP_WLOCK_ASSERT(inp);
- if (V_nolocaltimewait && in_localip(inp->inp_faddr)) {
- tp = tcp_close(tp);
- if (tp != NULL)
- INP_WUNLOCK(inp);
- return;
+ if (V_nolocaltimewait) {
+ int error = 0;
+#ifdef INET6
+ if (isipv6)
+ error = in6_localaddr(&inp->in6p_faddr);
+#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ error = in_localip(inp->inp_faddr);
+#endif
+ if (error) {
+ tp = tcp_close(tp);
+ if (tp != NULL)
+ INP_WUNLOCK(inp);
+ return;
+ }
}
tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
@@ -493,16 +507,21 @@ int
tcp_twrespond(struct tcptw *tw, int flags)
{
struct inpcb *inp = tw->tw_inpcb;
- struct tcphdr *th;
+#if defined(INET6) || defined(INET)
+ struct tcphdr *th = NULL;
+#endif
struct mbuf *m;
+#ifdef INET
struct ip *ip = NULL;
+#endif
u_int hdrlen, optlen;
- int error;
+ int error = 0; /* Keep compiler happy */
struct tcpopt to;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
#endif
+ hdrlen = 0; /* Keep compiler happy */
INP_WLOCK_ASSERT(inp);
@@ -521,14 +540,19 @@ tcp_twrespond(struct tcptw *tw, int flags)
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
tcpip_fillheaders(inp, ip6, th);
- } else
+ }
#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
{
hdrlen = sizeof(struct tcpiphdr);
ip = mtod(m, struct ip *);
th = (struct tcphdr *)(ip + 1);
tcpip_fillheaders(inp, ip, th);
}
+#endif
to.to_flags = 0;
/*
@@ -553,20 +577,25 @@ tcp_twrespond(struct tcptw *tw, int flags)
th->th_flags = flags;
th->th_win = htons(tw->last_win);
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
#ifdef INET6
if (isipv6) {
- th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
- sizeof(struct tcphdr) + optlen);
+ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
+ th->th_sum = in6_cksum_pseudo(ip6,
+ sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0);
ip6->ip6_hlim = in6_selecthlim(inp, NULL);
error = ip6_output(m, inp->in6p_outputopts, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
- } else
+ }
#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
{
+ m->m_pkthdr.csum_flags = CSUM_TCP;
th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
- m->m_pkthdr.csum_flags = CSUM_TCP;
- m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
ip->ip_len = m->m_pkthdr.len;
if (V_path_mtu_discovery)
ip->ip_off |= IP_DF;
@@ -574,6 +603,7 @@ tcp_twrespond(struct tcptw *tw, int flags)
((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
NULL, inp);
}
+#endif
if (flags & TH_ACK)
TCPSTAT_INC(tcps_sndacks);
else
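
Note: tcp_twrespond() above (and many other functions touched by this update) uses a conditional-compilation idiom so the code builds with INET, INET6, or both: the IPv6 branch, a bare "else" that is emitted only when both families are configured, and the IPv4 branch. A small compilable illustration of the same idiom, with the kernel options faked as plain defines and respond_sketch() a made-up name:

#include <stdio.h>

/* Stand-ins for the kernel configuration options; both enabled here. */
#define INET
#define INET6

static void
respond_sketch(int isipv6)
{
#ifdef INET6
        if (isipv6) {
                printf("IPv6 path: ip6_hdr, CSUM_TCP_IPV6 checksum offload\n");
        }
#endif
#if defined(INET6) && defined(INET)
        else
#endif
#ifdef INET
        {
                printf("IPv4 path: ip header, CSUM_TCP checksum offload\n");
        }
#endif
}

int
main(void)
{
        respond_sketch(0);
        respond_sketch(1);
        return (0);
}

When only one family is compiled in, the other branch and the dangling "else" disappear entirely, which is why the removed "else" lines in the hunks above are replaced by the guarded form.
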
diff --git a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c
index 2c1cd615..61711a6e 100644
--- a/freebsd/sys/netinet/tcp_usrreq.c
+++ b/freebsd/sys/netinet/tcp_usrreq.c
@@ -4,8 +4,12 @@
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California.
* Copyright (c) 2006-2007 Robert N. M. Watson
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
* All rights reserved.
*
+ * Portions of this software were developed by Robert N. M. Watson under
+ * contract to Juniper Networks, Inc.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -43,6 +47,7 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
+#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
@@ -66,17 +71,13 @@ __FBSDID("$FreeBSD$");
#include <netinet/cc.h>
#include <netinet/in.h>
-#include <netinet/in_systm.h>
-#ifdef INET6
-#include <netinet/ip6.h>
-#endif
#include <netinet/in_pcb.h>
-#ifdef INET6
-#include <netinet6/in6_pcb.h>
-#endif
+#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#endif
@@ -88,14 +89,18 @@ __FBSDID("$FreeBSD$");
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
+#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
+#endif
/*
* TCP protocol interface to socket abstraction.
*/
static int tcp_attach(struct socket *);
+#ifdef INET
static int tcp_connect(struct tcpcb *, struct sockaddr *,
struct thread *td);
+#endif /* INET */
#ifdef INET6
static int tcp6_connect(struct tcpcb *, struct sockaddr *,
struct thread *td);
@@ -233,6 +238,7 @@ tcp_usr_detach(struct socket *so)
INP_INFO_WUNLOCK(&V_tcbinfo);
}
+#ifdef INET
/*
* Give the socket an address.
*/
@@ -256,7 +262,6 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
return (EAFNOSUPPORT);
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
INP_WLOCK(inp);
@@ -266,14 +271,16 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
}
tp = intotcpcb(inp);
TCPDEBUG1();
+ INP_HASH_WLOCK(&V_tcbinfo);
error = in_pcbbind(inp, nam, td->td_ucred);
+ INP_HASH_WUNLOCK(&V_tcbinfo);
out:
TCPDEBUG2(PRU_BIND);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
+#endif /* INET */
#ifdef INET6
static int
@@ -296,7 +303,6 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
return (EAFNOSUPPORT);
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
INP_WLOCK(inp);
@@ -306,8 +312,10 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
}
tp = intotcpcb(inp);
TCPDEBUG1();
+ INP_HASH_WLOCK(&V_tcbinfo);
inp->inp_vflag &= ~INP_IPV4;
inp->inp_vflag |= INP_IPV6;
+#ifdef INET
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
inp->inp_vflag |= INP_IPV4;
@@ -319,18 +327,21 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
inp->inp_vflag &= ~INP_IPV6;
error = in_pcbbind(inp, (struct sockaddr *)&sin,
td->td_ucred);
+ INP_HASH_WUNLOCK(&V_tcbinfo);
goto out;
}
}
+#endif
error = in6_pcbbind(inp, nam, td->td_ucred);
+ INP_HASH_WUNLOCK(&V_tcbinfo);
out:
TCPDEBUG2(PRU_BIND);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
#endif /* INET6 */
+#ifdef INET
/*
* Prepare to accept connections.
*/
@@ -342,7 +353,6 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
struct tcpcb *tp = NULL;
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
INP_WLOCK(inp);
@@ -354,21 +364,26 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
TCPDEBUG1();
SOCK_LOCK(so);
error = solisten_proto_check(so);
+ INP_HASH_WLOCK(&V_tcbinfo);
if (error == 0 && inp->inp_lport == 0)
error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ INP_HASH_WUNLOCK(&V_tcbinfo);
if (error == 0) {
tp->t_state = TCPS_LISTEN;
solisten_proto(so, backlog);
- tcp_offload_listen_open(tp);
+#ifdef TCP_OFFLOAD
+ if ((so->so_options & SO_NO_OFFLOAD) == 0)
+ tcp_offload_listen_start(tp);
+#endif
}
SOCK_UNLOCK(so);
out:
TCPDEBUG2(PRU_LISTEN);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
+#endif /* INET */
#ifdef INET6
static int
@@ -379,7 +394,6 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
struct tcpcb *tp = NULL;
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
INP_WLOCK(inp);
@@ -391,26 +405,32 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
TCPDEBUG1();
SOCK_LOCK(so);
error = solisten_proto_check(so);
+ INP_HASH_WLOCK(&V_tcbinfo);
if (error == 0 && inp->inp_lport == 0) {
inp->inp_vflag &= ~INP_IPV4;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
inp->inp_vflag |= INP_IPV4;
error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
}
+ INP_HASH_WUNLOCK(&V_tcbinfo);
if (error == 0) {
tp->t_state = TCPS_LISTEN;
solisten_proto(so, backlog);
+#ifdef TCP_OFFLOAD
+ if ((so->so_options & SO_NO_OFFLOAD) == 0)
+ tcp_offload_listen_start(tp);
+#endif
}
SOCK_UNLOCK(so);
out:
TCPDEBUG2(PRU_LISTEN);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
#endif /* INET6 */
+#ifdef INET
/*
* Initiate connection to peer.
* Create a template for use in transmissions on this connection.
@@ -439,7 +459,6 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
return (error);
TCPDEBUG0;
- INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
INP_WLOCK(inp);
@@ -451,13 +470,20 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
TCPDEBUG1();
if ((error = tcp_connect(tp, nam, td)) != 0)
goto out;
- error = tcp_output_connect(so, nam);
+#ifdef TCP_OFFLOAD
+ if (registered_toedevs > 0 &&
+ (so->so_options & SO_NO_OFFLOAD) == 0 &&
+ (error = tcp_offload_connect(so, nam)) == 0)
+ goto out;
+#endif
+ tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
+ error = tcp_output(tp);
out:
TCPDEBUG2(PRU_CONNECT);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
+#endif /* INET */
#ifdef INET6
static int
@@ -480,7 +506,6 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
&& IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
return (EAFNOSUPPORT);
- INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
INP_WLOCK(inp);
@@ -490,6 +515,12 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
}
tp = intotcpcb(inp);
TCPDEBUG1();
+#ifdef INET
+ /*
+ * XXXRW: Some confusion: V4/V6 flags relate to binding, and
+ * therefore probably require the hash lock, which isn't held here.
+ * Is this a significant problem?
+ */
if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
struct sockaddr_in sin;
@@ -506,9 +537,16 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
goto out;
if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
goto out;
- error = tcp_output_connect(so, nam);
+#ifdef TCP_OFFLOAD
+ if (registered_toedevs > 0 &&
+ (so->so_options & SO_NO_OFFLOAD) == 0 &&
+ (error = tcp_offload_connect(so, nam)) == 0)
+ goto out;
+#endif
+ error = tcp_output(tp);
goto out;
}
+#endif
inp->inp_vflag &= ~INP_IPV4;
inp->inp_vflag |= INP_IPV6;
inp->inp_inc.inc_flags |= INC_ISIPV6;
@@ -516,12 +554,18 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
goto out;
if ((error = tcp6_connect(tp, nam, td)) != 0)
goto out;
- error = tcp_output_connect(so, nam);
+#ifdef TCP_OFFLOAD
+ if (registered_toedevs > 0 &&
+ (so->so_options & SO_NO_OFFLOAD) == 0 &&
+ (error = tcp_offload_connect(so, nam)) == 0)
+ goto out;
+#endif
+ tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
+ error = tcp_output(tp);
out:
TCPDEBUG2(PRU_CONNECT);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
#endif /* INET6 */
@@ -563,6 +607,7 @@ out:
return (error);
}
+#ifdef INET
/*
* Accept a connection. Essentially all the work is done at higher levels;
* just return the address of the peer, storing through addr.
@@ -614,6 +659,7 @@ out:
*nam = in_sockaddr(port, &addr);
return error;
}
+#endif /* INET */
#ifdef INET6
static int
@@ -633,6 +679,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
+ INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
error = ECONNABORTED;
@@ -658,6 +705,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
out:
TCPDEBUG2(PRU_ACCEPT);
INP_WUNLOCK(inp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
if (error == 0) {
if (v4)
*nam = in6_v4mapsin6_sockaddr(port, &addr);
@@ -692,7 +740,7 @@ tcp_usr_shutdown(struct socket *so)
socantsendmore(so);
tcp_usrclosed(tp);
if (!(inp->inp_flags & INP_DROPPED))
- error = tcp_output_disconnect(tp);
+ error = tcp_output(tp);
out:
TCPDEBUG2(PRU_SHUTDOWN);
@@ -722,7 +770,12 @@ tcp_usr_rcvd(struct socket *so, int flags)
}
tp = intotcpcb(inp);
TCPDEBUG1();
- tcp_output_rcvd(tp);
+#ifdef TCP_OFFLOAD
+ if (tp->t_flags & TF_TOE)
+ tcp_offload_rcvd(tp);
+ else
+#endif
+ tcp_output(tp);
out:
TCPDEBUG2(PRU_RCVD);
@@ -744,25 +797,17 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
int error = 0;
struct inpcb *inp;
struct tcpcb *tp = NULL;
- int headlocked = 0;
#ifdef INET6
int isipv6;
#endif
TCPDEBUG0;
/*
- * We require the pcbinfo lock in two cases:
- *
- * (1) An implied connect is taking place, which can result in
- * binding IPs and ports and hence modification of the pcb hash
- * chains.
- *
- * (2) PRUS_EOF is set, resulting in explicit close on the send.
+ * We require the pcbinfo lock if we will close the socket as part of
+ * this call.
*/
- if ((nam != NULL) || (flags & PRUS_EOF)) {
+ if (flags & PRUS_EOF)
INP_INFO_WLOCK(&V_tcbinfo);
- headlocked = 1;
- }
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
INP_WLOCK(inp);
@@ -799,13 +844,16 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* initialize maxseg/maxopd using peer's cached
* MSS.
*/
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
#ifdef INET6
if (isipv6)
error = tcp6_connect(tp, nam, td);
- else
#endif /* INET6 */
- error = tcp_connect(tp, nam, td);
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ error = tcp_connect(tp, nam, td);
+#endif
if (error)
goto out;
tp->snd_wnd = TTCP_CLIENT_SND_WND;
@@ -820,14 +868,10 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
socantsendmore(so);
tcp_usrclosed(tp);
}
- if (headlocked) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
- headlocked = 0;
- }
if (!(inp->inp_flags & INP_DROPPED)) {
if (flags & PRUS_MORETOCOME)
tp->t_flags |= TF_MORETOCOME;
- error = tcp_output_send(tp);
+ error = tcp_output(tp);
if (flags & PRUS_MORETOCOME)
tp->t_flags &= ~TF_MORETOCOME;
}
@@ -859,33 +903,31 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
* initialize maxseg/maxopd using peer's cached
* MSS.
*/
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
#ifdef INET6
if (isipv6)
error = tcp6_connect(tp, nam, td);
- else
#endif /* INET6 */
- error = tcp_connect(tp, nam, td);
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ error = tcp_connect(tp, nam, td);
+#endif
if (error)
goto out;
tp->snd_wnd = TTCP_CLIENT_SND_WND;
tcp_mss(tp, -1);
- INP_INFO_WUNLOCK(&V_tcbinfo);
- headlocked = 0;
- } else if (nam) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
- headlocked = 0;
}
tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
tp->t_flags |= TF_FORCEDATA;
- error = tcp_output_send(tp);
+ error = tcp_output(tp);
tp->t_flags &= ~TF_FORCEDATA;
}
out:
TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
INP_WUNLOCK(inp);
- if (headlocked)
+ if (flags & PRUS_EOF)
INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
@@ -1009,6 +1051,7 @@ out:
return (error);
}
+#ifdef INET
struct pr_usrreqs tcp_usrreqs = {
.pru_abort = tcp_usr_abort,
.pru_accept = tcp_usr_accept,
@@ -1025,12 +1068,10 @@ struct pr_usrreqs tcp_usrreqs = {
.pru_send = tcp_usr_send,
.pru_shutdown = tcp_usr_shutdown,
.pru_sockaddr = in_getsockaddr,
-#if 0
- .pru_soreceive = soreceive_stream,
-#endif
.pru_sosetlabel = in_pcbsosetlabel,
.pru_close = tcp_usr_close,
};
+#endif /* INET */
#ifdef INET6
struct pr_usrreqs tcp6_usrreqs = {
@@ -1049,14 +1090,12 @@ struct pr_usrreqs tcp6_usrreqs = {
.pru_send = tcp_usr_send,
.pru_shutdown = tcp_usr_shutdown,
.pru_sockaddr = in6_mapped_sockaddr,
-#if 0
- .pru_soreceive = soreceive_stream,
-#endif
- .pru_sosetlabel = in_pcbsosetlabel,
+ .pru_sosetlabel = in_pcbsosetlabel,
.pru_close = tcp_usr_close,
};
#endif /* INET6 */
+#ifdef INET
/*
* Common subroutine to open a TCP connection to remote host specified
* by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local
@@ -1076,13 +1115,13 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
u_short lport;
int error;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK(&V_tcbinfo);
if (inp->inp_lport == 0) {
error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
if (error)
- return error;
+ goto out;
}
/*
@@ -1095,11 +1134,14 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
&inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
if (error && oinp == NULL)
- return error;
- if (oinp)
- return EADDRINUSE;
+ goto out;
+ if (oinp) {
+ error = EADDRINUSE;
+ goto out;
+ }
inp->inp_laddr = laddr;
in_pcbrehash(inp);
+ INP_HASH_WUNLOCK(&V_tcbinfo);
/*
* Compute window scaling to request:
@@ -1113,13 +1155,16 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
soisconnecting(so);
TCPSTAT_INC(tcps_connattempt);
tp->t_state = TCPS_SYN_SENT;
- tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
tp->iss = tcp_new_isn(tp);
- tp->t_bw_rtseq = tp->iss;
tcp_sendseqinit(tp);
return 0;
+
+out:
+ INP_HASH_WUNLOCK(&V_tcbinfo);
+ return (error);
}
+#endif /* INET */
#ifdef INET6
static int
@@ -1131,13 +1176,13 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
struct in6_addr addr6;
int error;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK(&V_tcbinfo);
if (inp->inp_lport == 0) {
error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
if (error)
- return error;
+ goto out;
}
/*
@@ -1145,18 +1190,23 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
* earlier incarnation of this same connection still in
* TIME_WAIT state, creating an ADDRINUSE error.
* in6_pcbladdr() also handles scope zone IDs.
+ *
+ * XXXRW: We wouldn't need to expose in6_pcblookup_hash_locked()
+ * outside of in6_pcb.c if there were an in6_pcbconnect_setup().
*/
error = in6_pcbladdr(inp, nam, &addr6);
if (error)
- return error;
- oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
+ goto out;
+ oinp = in6_pcblookup_hash_locked(inp->inp_pcbinfo,
&sin6->sin6_addr, sin6->sin6_port,
IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
? &addr6
: &inp->in6p_laddr,
inp->inp_lport, 0, NULL);
- if (oinp)
- return EADDRINUSE;
+ if (oinp) {
+ error = EADDRINUSE;
+ goto out;
+ }
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
inp->in6p_laddr = addr6;
inp->in6p_faddr = sin6->sin6_addr;
@@ -1167,6 +1217,7 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
inp->inp_flow |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
in_pcbrehash(inp);
+ INP_HASH_WUNLOCK(&V_tcbinfo);
/* Compute window scaling to request. */
while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
@@ -1176,12 +1227,14 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
soisconnecting(so);
TCPSTAT_INC(tcps_connattempt);
tp->t_state = TCPS_SYN_SENT;
- tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
tp->iss = tcp_new_isn(tp);
- tp->t_bw_rtseq = tp->iss;
tcp_sendseqinit(tp);
return 0;
+
+out:
+ INP_HASH_WUNLOCK(&V_tcbinfo);
+ return error;
}
#endif /* INET6 */
@@ -1224,7 +1277,7 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
ti->tcpi_rcv_space = tp->rcv_wnd;
ti->tcpi_rcv_nxt = tp->rcv_nxt;
ti->tcpi_snd_wnd = tp->snd_wnd;
- ti->tcpi_snd_bwnd = tp->snd_bwnd;
+ ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */
ti->tcpi_snd_nxt = tp->snd_nxt;
ti->tcpi_snd_mss = tp->t_maxseg;
ti->tcpi_rcv_mss = tp->t_maxseg;
@@ -1254,6 +1307,7 @@ int
tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
int error, opt, optval;
+ u_int ui;
struct inpcb *inp;
struct tcpcb *tp;
struct tcp_info ti;
@@ -1269,11 +1323,15 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
if (inp->inp_vflag & INP_IPV6PROTO) {
INP_WUNLOCK(inp);
error = ip6_ctloutput(so, sopt);
- } else {
+ }
#endif /* INET6 */
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ {
INP_WUNLOCK(inp);
error = ip_ctloutput(so, sopt);
-#ifdef INET6
}
#endif
return (error);
@@ -1299,9 +1357,9 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
tp->t_flags |= TF_SIGNATURE;
else
tp->t_flags &= ~TF_SIGNATURE;
- INP_WUNLOCK(inp);
- break;
+ goto unlock_and_done;
#endif /* TCP_SIGNATURE */
+
case TCP_NODELAY:
case TCP_NOOPT:
INP_WUNLOCK(inp);
@@ -1327,6 +1385,13 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
tp->t_flags |= opt;
else
tp->t_flags &= ~opt;
+unlock_and_done:
+#ifdef TCP_OFFLOAD
+ if (tp->t_flags & TF_TOE) {
+ tcp_offload_ctloutput(tp, sopt->sopt_dir,
+ sopt->sopt_name);
+ }
+#endif
INP_WUNLOCK(inp);
break;
@@ -1345,8 +1410,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
if (TCPS_HAVEESTABLISHED(tp->t_state))
error = tcp_output(tp);
}
- INP_WUNLOCK(inp);
- break;
+ goto unlock_and_done;
case TCP_MAXSEG:
INP_WUNLOCK(inp);
@@ -1361,8 +1425,7 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
tp->t_maxseg = optval;
else
error = EINVAL;
- INP_WUNLOCK(inp);
- break;
+ goto unlock_and_done;
case TCP_INFO:
INP_WUNLOCK(inp);
@@ -1414,6 +1477,64 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
}
}
CC_LIST_RUNLOCK();
+ goto unlock_and_done;
+
+ case TCP_KEEPIDLE:
+ case TCP_KEEPINTVL:
+ case TCP_KEEPINIT:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
+ if (error)
+ return (error);
+
+ if (ui > (UINT_MAX / hz)) {
+ error = EINVAL;
+ break;
+ }
+ ui *= hz;
+
+ INP_WLOCK_RECHECK(inp);
+ switch (sopt->sopt_name) {
+ case TCP_KEEPIDLE:
+ tp->t_keepidle = ui;
+ /*
+ * XXX: better check current remaining
+ * timeout and "merge" it with new value.
+ */
+ if ((tp->t_state > TCPS_LISTEN) &&
+ (tp->t_state <= TCPS_CLOSING))
+ tcp_timer_activate(tp, TT_KEEP,
+ TP_KEEPIDLE(tp));
+ break;
+ case TCP_KEEPINTVL:
+ tp->t_keepintvl = ui;
+ if ((tp->t_state == TCPS_FIN_WAIT_2) &&
+ (TP_MAXIDLE(tp) > 0))
+ tcp_timer_activate(tp, TT_2MSL,
+ TP_MAXIDLE(tp));
+ break;
+ case TCP_KEEPINIT:
+ tp->t_keepinit = ui;
+ if (tp->t_state == TCPS_SYN_RECEIVED ||
+ tp->t_state == TCPS_SYN_SENT)
+ tcp_timer_activate(tp, TT_KEEP,
+ TP_KEEPINIT(tp));
+ break;
+ }
+ goto unlock_and_done;
+
+ case TCP_KEEPCNT:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
+ if (error)
+ return (error);
+
+ INP_WLOCK_RECHECK(inp);
+ tp->t_keepcnt = ui;
+ if ((tp->t_state == TCPS_FIN_WAIT_2) &&
+ (TP_MAXIDLE(tp) > 0))
+ tcp_timer_activate(tp, TT_2MSL,
+ TP_MAXIDLE(tp));
INP_WUNLOCK(inp);
break;
@@ -1478,18 +1599,6 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
#undef INP_WLOCK_RECHECK
/*
- * tcp_sendspace and tcp_recvspace are the default send and receive window
- * sizes, respectively. These are obsolescent (this information should
- * be set by the route).
- */
-u_long tcp_sendspace = 1024*32;
-SYSCTL_ULONG(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
- &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
-u_long tcp_recvspace = 1024*64;
-SYSCTL_ULONG(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
- &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
-
-/*
* Attach TCP protocol to socket, allocating
* internet protocol control block, tcp control block,
* bufer space, and entering LISTEN state if to accept connections.
* buffer space, and entering LISTEN state if to accept connections.
@@ -1502,7 +1611,7 @@ tcp_attach(struct socket *so)
int error;
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
- error = soreserve(so, tcp_sendspace, tcp_recvspace);
+ error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace);
if (error)
return (error);
}
@@ -1570,7 +1679,7 @@ tcp_disconnect(struct tcpcb *tp)
sbflush(&so->so_rcv);
tcp_usrclosed(tp);
if (!(inp->inp_flags & INP_DROPPED))
- tcp_output_disconnect(tp);
+ tcp_output(tp);
}
}
@@ -1593,7 +1702,9 @@ tcp_usrclosed(struct tcpcb *tp)
switch (tp->t_state) {
case TCPS_LISTEN:
- tcp_offload_listen_close(tp);
+#ifdef TCP_OFFLOAD
+ tcp_offload_listen_stop(tp);
+#endif
/* FALLTHROUGH */
case TCPS_CLOSED:
tp->t_state = TCPS_CLOSED;
@@ -1626,7 +1737,7 @@ tcp_usrclosed(struct tcpcb *tp)
int timeout;
timeout = (tcp_fast_finwait2_recycle) ?
- tcp_finwait2_timeout : tcp_maxidle;
+ tcp_finwait2_timeout : TP_MAXIDLE(tp);
tcp_timer_activate(tp, TT_2MSL, timeout);
}
}
@@ -1865,26 +1976,24 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
db_print_indent(indent);
- db_printf("snd_wnd: %lu snd_cwnd: %lu snd_bwnd: %lu\n",
- tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd);
+ db_printf("snd_wnd: %lu snd_cwnd: %lu\n",
+ tp->snd_wnd, tp->snd_cwnd);
db_print_indent(indent);
- db_printf("snd_ssthresh: %lu snd_bandwidth: %lu snd_recover: "
- "0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth,
- tp->snd_recover);
+ db_printf("snd_ssthresh: %lu snd_recover: "
+ "0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
db_print_indent(indent);
db_printf("t_maxopd: %u t_rcvtime: %u t_startime: %u\n",
tp->t_maxopd, tp->t_rcvtime, tp->t_starttime);
db_print_indent(indent);
- db_printf("t_rttime: %u t_rtsq: 0x%08x t_bw_rtttime: %u\n",
- tp->t_rtttime, tp->t_rtseq, tp->t_bw_rtttime);
+ db_printf("t_rttime: %u t_rtsq: 0x%08x\n",
+ tp->t_rtttime, tp->t_rtseq);
db_print_indent(indent);
- db_printf("t_bw_rtseq: 0x%08x t_rxtcur: %d t_maxseg: %u "
- "t_srtt: %d\n", tp->t_bw_rtseq, tp->t_rxtcur, tp->t_maxseg,
- tp->t_srtt);
+ db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n",
+ tp->t_rxtcur, tp->t_maxseg, tp->t_srtt);
db_print_indent(indent);
db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u "
diff --git a/freebsd/sys/netinet/tcp_var.h b/freebsd/sys/netinet/tcp_var.h
index 618250cd..171eafb6 100644
--- a/freebsd/sys/netinet/tcp_var.h
+++ b/freebsd/sys/netinet/tcp_var.h
@@ -72,6 +72,7 @@ struct sackhint {
int sack_bytes_rexmit;
tcp_seq last_sack_ack; /* Most recent/largest sacked ack */
+ int ispare; /* explicit pad for 64bit alignment */
uint64_t _pad[2]; /* 1 sacked_bytes, 1 TBD */
};
@@ -131,12 +132,12 @@ struct tcpcb {
u_long snd_wnd; /* send window */
u_long snd_cwnd; /* congestion-controlled window */
- u_long snd_bwnd; /* bandwidth-controlled window */
+ u_long snd_spare1; /* unused */
u_long snd_ssthresh; /* snd_cwnd size threshold for
* for slow start exponential to
* linear switch
*/
- u_long snd_bandwidth; /* calculated bandwidth or 0 */
+ u_long snd_spare2; /* unused */
tcp_seq snd_recover; /* for use in NewReno Fast Recovery */
u_int t_maxopd; /* mss plus options */
@@ -146,8 +147,8 @@ struct tcpcb {
u_int t_rtttime; /* RTT measurement start time */
tcp_seq t_rtseq; /* sequence number being timed */
- u_int t_bw_rtttime; /* used for bandwidth calculation */
- tcp_seq t_bw_rtseq; /* used for bandwidth calculation */
+ u_int t_bw_spare1; /* unused */
+ tcp_seq t_bw_spare2; /* unused */
int t_rxtcur; /* current retransmit value (ticks) */
u_int t_maxseg; /* maximum segment size */
@@ -177,6 +178,7 @@ struct tcpcb {
u_long snd_cwnd_prev; /* cwnd prior to retransmit */
u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */
tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */
+ int t_sndzerowin; /* zero-window updates sent */
u_int t_badrxtwin; /* window for retransmit recovery */
u_char snd_limited; /* segments limited transmitted */
/* SACK related state */
@@ -192,21 +194,25 @@ struct tcpcb {
int t_rttlow; /* smallest observerved RTT */
int t_rttlow; /* smallest observed RTT */
u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
int rfbuf_cnt; /* recv buffer autoscaling byte count */
- struct toe_usrreqs *t_tu; /* offload operations vector */
+ struct toedev *tod; /* toedev handling this connection */
+ int t_sndrexmitpack; /* retransmit packets sent */
+ int t_rcvoopack; /* out-of-order packets received */
void *t_toe; /* TOE pcb pointer */
int t_bytes_acked; /* # bytes acked during current RTT */
-
- int t_sndzerowin; /* zero-window updates sent */
-
struct cc_algo *cc_algo; /* congestion control algorithm */
struct cc_var *ccv; /* congestion control specific vars */
struct osd *osd; /* storage for Khelp module data */
- void *t_pspare2[3]; /* 3 TBD */
- uint64_t _pad[10]; /* 7 UTO, 3 TBD (1-2 CC/RTT?) */
+ u_int t_keepinit; /* time to establish connection */
+ u_int t_keepidle; /* time before keepalive probes begin */
+ u_int t_keepintvl; /* interval between keepalives */
+ u_int t_keepcnt; /* number of keepalives before close */
- uint64_t t_sndrexmitpack;/* retransmit packets sent */
- uint64_t t_rcvoopack; /* out-of-order packets received */
+ u_int t_tsomax; /* tso burst length limit */
+
+ uint32_t t_ispare[7]; /* 5 UTO, 2 TBD */
+ void *t_pspare2[4]; /* 4 TBD */
+ uint64_t _pad[6]; /* 6 TBD (1-2 CC/RTT?) */
};
/*
@@ -301,6 +307,7 @@ struct tcpopt {
u_int16_t to_mss; /* maximum segment size */
u_int8_t to_wscale; /* window scaling */
u_int8_t to_nsacks; /* number of SACK blocks */
+ u_int32_t to_spare; /* UTO */
};
/*
@@ -319,6 +326,15 @@ struct hc_metrics_lite { /* must stay in sync with hc_metrics */
u_long rmx_recvpipe; /* inbound delay-bandwidth product */
};
+/*
+ * Used by tcp_maxmtu() to communicate interface specific features
+ * and limits at the time of connection setup.
+ */
+struct tcp_ifcap {
+ int ifcap;
+ u_int tsomax;
+};
+
#ifndef _NETINET_IN_PCB_H_
struct in_conninfo;
#endif /* _NETINET_IN_PCB_H_ */
@@ -478,7 +494,7 @@ struct tcpstat {
u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */
u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */
u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */
- u_long tcps_sack_sboverflow; /* times scoreboard overflowed */
+ u_long tcps_sack_sboverflow; /* times scoreboard overflowed */
/* ECN related stats */
u_long tcps_ecn_ce; /* ECN Congestion Experienced */
@@ -494,7 +510,7 @@ struct tcpstat {
u_long tcps_sig_err_sigopt; /* No signature expected by socket */
u_long tcps_sig_err_nosigopt; /* No signature provided by segment */
- u_long _pad[7]; /* 6 UTO, 1 TBD */
+ u_long _pad[12]; /* 6 UTO, 6 TBD */
};
#ifdef _KERNEL
@@ -535,11 +551,20 @@ struct tcp_hhook_data {
* included. Not all of our clients do.
*/
#if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_)
+struct xtcp_timer {
+ int tt_rexmt; /* retransmit timer */
+ int tt_persist; /* retransmit persistence */
+ int tt_keep; /* keepalive */
+ int tt_2msl; /* 2*msl TIME_WAIT timer */
+ int tt_delack; /* delayed ACK timer */
+ int t_rcvtime; /* Time since last packet received */
+};
struct xtcpcb {
size_t xt_len;
struct inpcb xt_inp;
struct tcpcb xt_tp;
struct xsocket xt_socket;
+ struct xtcp_timer xt_timer;
u_quad_t xt_alignment_hack;
};
#endif
@@ -597,9 +622,10 @@ VNET_DECLARE(int, tcp_mssdflt); /* XXX */
VNET_DECLARE(int, tcp_minmss);
VNET_DECLARE(int, tcp_delack_enabled);
VNET_DECLARE(int, tcp_do_rfc3390);
+VNET_DECLARE(int, tcp_do_initcwnd10);
+VNET_DECLARE(int, tcp_sendspace);
+VNET_DECLARE(int, tcp_recvspace);
VNET_DECLARE(int, path_mtu_discovery);
-VNET_DECLARE(int, ss_fltsz);
-VNET_DECLARE(int, ss_fltsz_local);
VNET_DECLARE(int, tcp_do_rfc3465);
VNET_DECLARE(int, tcp_abc_l_var);
#define V_tcb VNET(tcb)
@@ -609,9 +635,10 @@ VNET_DECLARE(int, tcp_abc_l_var);
#define V_tcp_minmss VNET(tcp_minmss)
#define V_tcp_delack_enabled VNET(tcp_delack_enabled)
#define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390)
+#define V_tcp_do_initcwnd10 VNET(tcp_do_initcwnd10)
+#define V_tcp_sendspace VNET(tcp_sendspace)
+#define V_tcp_recvspace VNET(tcp_recvspace)
#define V_path_mtu_discovery VNET(path_mtu_discovery)
-#define V_ss_fltsz VNET(ss_fltsz)
-#define V_ss_fltsz_local VNET(ss_fltsz_local)
#define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465)
#define V_tcp_abc_l_var VNET(tcp_abc_l_var)
@@ -648,7 +675,7 @@ void tcp_init(void);
void tcp_destroy(void);
#endif
void tcp_fini(void *);
-char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *,
+char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *,
const void *);
char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
const void *);
@@ -659,10 +686,10 @@ void tcp_reass_flush(struct tcpcb *);
void tcp_reass_destroy(void);
#endif
void tcp_input(struct mbuf *, int);
-u_long tcp_maxmtu(struct in_conninfo *, int *);
-u_long tcp_maxmtu6(struct in_conninfo *, int *);
+u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
+u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
- int *);
+ struct tcp_ifcap *);
void tcp_mss(struct tcpcb *, int);
int tcp_mssopt(struct in_conninfo *);
struct inpcb *
@@ -695,7 +722,6 @@ void tcpip_fillheaders(struct inpcb *, void *, void *);
void tcp_timer_activate(struct tcpcb *, int, u_int);
int tcp_timer_active(struct tcpcb *, int);
void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
-void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq);
/*
* All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
*/
@@ -709,8 +735,6 @@ void tcp_hc_updatemtu(struct in_conninfo *, u_long);
void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *);
extern struct pr_usrreqs tcp_usrreqs;
-extern u_long tcp_sendspace;
-extern u_long tcp_recvspace;
tcp_seq tcp_new_isn(struct tcpcb *);
void tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
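
Note: struct xtcpcb above grows an xt_timer member, which tcp_timer_to_xtimer() fills with the remaining time of each TCP timer in milliseconds, so tools reading the pcblist sysctl can show per-connection timer state. A minimal FreeBSD userland sketch that fetches the raw blob; walking the xinpgen-framed xtcpcb records the way netstat(1) does is omitted.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        size_t len = 0;
        void *buf;

        /* First call sizes the buffer, second call fills it. */
        if (sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0) == -1)
                return (1);
        if ((buf = malloc(len)) == NULL)
                return (1);
        if (sysctlbyname("net.inet.tcp.pcblist", buf, &len, NULL, 0) == -1) {
                free(buf);
                return (1);
        }
        /* Each struct xtcpcb record in the blob now carries xt_timer. */
        printf("pcblist: %zu bytes of xtcpcb records\n", len);
        free(buf);
        return (0);
}
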
diff --git a/freebsd/sys/netinet/toecore.h b/freebsd/sys/netinet/toecore.h
new file mode 100644
index 00000000..6ea98518
--- /dev/null
+++ b/freebsd/sys/netinet/toecore.h
@@ -0,0 +1,130 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TOE_H_
+#define _NETINET_TOE_H_
+
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+struct tcpopt;
+struct tcphdr;
+struct in_conninfo;
+
+struct toedev {
+ TAILQ_ENTRY(toedev) link; /* glue for toedev_list */
+ void *tod_softc; /* TOE driver private data */
+
+ /*
+ * Active open. If a failure occurs, it is reported back by the driver
+ * via toe_connect_failed.
+ */
+ int (*tod_connect)(struct toedev *, struct socket *, struct rtentry *,
+ struct sockaddr *);
+
+ /* Passive open. */
+ int (*tod_listen_start)(struct toedev *, struct tcpcb *);
+ int (*tod_listen_stop)(struct toedev *, struct tcpcb *);
+
+ /*
+ * The kernel uses this routine to pass on any frame it receives for an
+ * offloaded connection to the TOE driver. This is an unusual event.
+ */
+ void (*tod_input)(struct toedev *, struct tcpcb *, struct mbuf *);
+
+ /*
+ * This is called by the kernel during pru_rcvd for an offloaded TCP
+ * connection and provides an opportunity for the TOE driver to manage
+ * its rx window and credits.
+ */
+ void (*tod_rcvd)(struct toedev *, struct tcpcb *);
+
+ /*
+ * Transmit routine. The kernel calls this to have the TOE driver
+ * evaluate whether there is data to be transmitted, and transmit it.
+ */
+ int (*tod_output)(struct toedev *, struct tcpcb *);
+
+ /* Immediate teardown: send RST to peer. */
+ int (*tod_send_rst)(struct toedev *, struct tcpcb *);
+
+ /* Initiate orderly disconnect by sending FIN to the peer. */
+ int (*tod_send_fin)(struct toedev *, struct tcpcb *);
+
+ /* Called to indicate that the kernel is done with this TCP PCB. */
+ void (*tod_pcb_detach)(struct toedev *, struct tcpcb *);
+
+ /*
+ * The kernel calls this once it has information about an L2 entry that
+ * the TOE driver enquired about previously (via toe_l2_resolve).
+ */
+ void (*tod_l2_update)(struct toedev *, struct ifnet *,
+ struct sockaddr *, uint8_t *, uint16_t);
+
+ /* XXX. Route has been redirected. */
+ void (*tod_route_redirect)(struct toedev *, struct ifnet *,
+ struct rtentry *, struct rtentry *);
+
+ /* Syncache interaction. */
+ void (*tod_syncache_added)(struct toedev *, void *);
+ void (*tod_syncache_removed)(struct toedev *, void *);
+ int (*tod_syncache_respond)(struct toedev *, void *, struct mbuf *);
+ void (*tod_offload_socket)(struct toedev *, void *, struct socket *);
+
+ /* TCP socket option */
+ void (*tod_ctloutput)(struct toedev *, struct tcpcb *, int, int);
+};
+
+#include <sys/eventhandler.h>
+typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *);
+typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *);
+EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn);
+EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn);
+
+void init_toedev(struct toedev *);
+int register_toedev(struct toedev *);
+int unregister_toedev(struct toedev *);
+
+/*
+ * General interface for looking up L2 information for an IP address. If an
+ * answer is not available right away then the TOE driver's tod_l2_update will
+ * be called later.
+ */
+int toe_l2_resolve(struct toedev *, struct ifnet *, struct sockaddr *,
+ uint8_t *, uint16_t *);
+
+void toe_connect_failed(struct toedev *, struct inpcb *, int);
+
+void toe_syncache_add(struct in_conninfo *, struct tcpopt *, struct tcphdr *,
+ struct inpcb *, void *, void *);
+int toe_syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *,
+ struct socket **);
+
+int toe_4tuple_check(struct in_conninfo *, struct tcphdr *, struct ifnet *);
+#endif
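
Note: toecore.h above defines the driver-facing TOE interface: a struct toedev holding the driver's callbacks plus init_toedev()/register_toedev()/unregister_toedev(). A hedged kernel-side sketch of how a TOE-capable driver might hook in; the xdrv_* names, callback bodies and softc are placeholders, and only the structure members and registration functions come from this header.

/* Hypothetical TOE driver glue; not part of this change. */
static int
xdrv_listen_start(struct toedev *tod, struct tcpcb *tp)
{
        /* Program the NIC to start offloading connections for this listener. */
        return (0);
}

static int
xdrv_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
        /* Stop offloading connections for this listener. */
        return (0);
}

static struct toedev xdrv_tod;

static void
xdrv_attach_toe(void *softc)
{
        init_toedev(&xdrv_tod);         /* initialize before overriding methods */
        xdrv_tod.tod_softc = softc;
        xdrv_tod.tod_listen_start = xdrv_listen_start;
        xdrv_tod.tod_listen_stop = xdrv_listen_stop;
        if (register_toedev(&xdrv_tod) != 0)
                printf("TOE registration failed\n");
}
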
diff --git a/freebsd/sys/netinet/udp.h b/freebsd/sys/netinet/udp.h
index 5ec55970..c2d638dd 100644
--- a/freebsd/sys/netinet/udp.h
+++ b/freebsd/sys/netinet/udp.h
@@ -57,7 +57,7 @@ struct udphdr {
* UDP Encapsulation of IPsec Packets options.
*/
/* Encapsulation types. */
-#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
+#define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
#define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-02+ */
/* Default ESP in UDP encapsulation port. */
diff --git a/freebsd/sys/netinet/udp_usrreq.c b/freebsd/sys/netinet/udp_usrreq.c
index 6c0e61c1..bf95e954 100644
--- a/freebsd/sys/netinet/udp_usrreq.c
+++ b/freebsd/sys/netinet/udp_usrreq.c
@@ -4,8 +4,12 @@
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California.
* Copyright (c) 2008 Robert N. M. Watson
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
* All rights reserved.
*
+ * Portions of this software were developed by Robert N. M. Watson under
+ * contract to Juniper Networks, Inc.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -41,6 +45,7 @@
__FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_ipfw.h>
+#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_ipsec.h>
@@ -149,9 +154,12 @@ SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW,
&VNET_NAME(udpstat), udpstat,
"UDP statistics (struct udpstat, netinet/udp_var.h)");
+#ifdef INET
static void udp_detach(struct socket *so);
static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
struct mbuf *, struct thread *);
+#endif
+
#ifdef IPSEC
#ifdef IPSEC_NAT_T
#define UF_ESPINUDP_ALL (UF_ESPINUDP_NON_IKE|UF_ESPINUDP)
@@ -183,25 +191,12 @@ void
udp_init(void)
{
-
- INP_INFO_LOCK_INIT(&V_udbinfo, "udp");
- LIST_INIT(&V_udb);
-#ifdef VIMAGE
- V_udbinfo.ipi_vnet = curvnet;
-#endif
- V_udbinfo.ipi_listhead = &V_udb;
- V_udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB,
- &V_udbinfo.ipi_hashmask);
- V_udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB,
- &V_udbinfo.ipi_porthashmask);
- V_udbinfo.ipi_zone = uma_zcreate("udp_inpcb", sizeof(struct inpcb),
- NULL, NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
- uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
-
+ in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
+ "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE,
+ IPI_HASHFIELDS_2TUPLE);
V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uma_zone_set_max(V_udpcb_zone, maxsockets);
-
EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
}
@@ -244,17 +239,12 @@ void
udp_destroy(void)
{
- hashdestroy(V_udbinfo.ipi_hashbase, M_PCB,
- V_udbinfo.ipi_hashmask);
- hashdestroy(V_udbinfo.ipi_porthashbase, M_PCB,
- V_udbinfo.ipi_porthashmask);
-
+ in_pcbinfo_destroy(&V_udbinfo);
uma_zdestroy(V_udpcb_zone);
- uma_zdestroy(V_udbinfo.ipi_zone);
- INP_INFO_LOCK_DESTROY(&V_udbinfo);
}
#endif
+#ifdef INET
/*
* Subroutine of udp_input(), which appends the provided mbuf chain to the
* passed pcb/socket. The caller must provide a sockaddr_in via udp_in that
@@ -272,25 +262,32 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
#ifdef INET6
struct sockaddr_in6 udp_in6;
#endif
-#ifdef IPSEC
-#ifdef IPSEC_NAT_T
-#ifdef INET
struct udpcb *up;
-#endif
-#endif
-#endif
- INP_RLOCK_ASSERT(inp);
+ INP_LOCK_ASSERT(inp);
+
+ /*
+ * Engage the tunneling protocol.
+ */
+ up = intoudpcb(inp);
+ if (up->u_tun_func != NULL) {
+ (*up->u_tun_func)(n, off, inp);
+ return;
+ }
+
+ if (n == NULL)
+ return;
+
+ off += sizeof(struct udphdr);
#ifdef IPSEC
/* Check AH/ESP integrity. */
if (ipsec4_in_reject(n, inp)) {
m_freem(n);
- V_ipsec4stat.in_polvio++;
+ IPSECSTAT_INC(in_polvio);
return;
}
#ifdef IPSEC_NAT_T
-#ifdef INET
up = intoudpcb(inp);
KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
if (up->u_flags & UF_ESPINUDP_ALL) { /* IPSec UDP encaps. */
@@ -298,7 +295,6 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
if (n == NULL) /* Consumed. */
return;
}
-#endif /* INET */
#endif /* IPSEC_NAT_T */
#endif /* IPSEC */
#ifdef MAC
@@ -306,14 +302,14 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
m_freem(n);
return;
}
-#endif
+#endif /* MAC */
if (inp->inp_flags & INP_CONTROLOPTS ||
inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
else
-#endif
+#endif /* INET6 */
ip_savecontrol(inp, &opts, ip, n);
}
#ifdef INET6
@@ -324,7 +320,7 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
in6_sin_2_v4mapsin6(udp_in, &udp_in6);
append_sa = (struct sockaddr *)&udp_in6;
} else
-#endif
+#endif /* INET6 */
append_sa = (struct sockaddr *)udp_in;
m_adj(n, off);
@@ -348,13 +344,10 @@ udp_input(struct mbuf *m, int off)
struct udphdr *uh;
struct ifnet *ifp;
struct inpcb *inp;
- struct udpcb *up;
int len;
struct ip save_ip;
struct sockaddr_in udp_in;
-#ifdef IPFIREWALL_FORWARD
struct m_tag *fwd_tag;
-#endif
ifp = m->m_pkthdr.rcvif;
UDPSTAT_INC(udps_ipackets);
@@ -452,34 +445,12 @@ udp_input(struct mbuf *m, int off)
} else
UDPSTAT_INC(udps_nosum);
-#ifdef IPFIREWALL_FORWARD
- /*
- * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
- */
- fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
- if (fwd_tag != NULL) {
- struct sockaddr_in *next_hop;
-
- /*
- * Do the hack.
- */
- next_hop = (struct sockaddr_in *)(fwd_tag + 1);
- ip->ip_dst = next_hop->sin_addr;
- uh->uh_dport = ntohs(next_hop->sin_port);
-
- /*
- * Remove the tag from the packet. We don't need it anymore.
- */
- m_tag_delete(m, fwd_tag);
- }
-#endif
-
- INP_INFO_RLOCK(&V_udbinfo);
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
in_broadcast(ip->ip_dst, ifp)) {
struct inpcb *last;
struct ip_moptions *imo;
+ INP_INFO_RLOCK(&V_udbinfo);
last = NULL;
LIST_FOREACH(inp, &V_udb, inp_list) {
if (inp->inp_lport != uh->uh_dport)
@@ -501,24 +472,24 @@ udp_input(struct mbuf *m, int off)
INP_RLOCK(inp);
/*
- * Detached PCBs can linger in the list if someone
- * holds a reference. (e.g. udp_pcblist)
+ * XXXRW: Because we weren't holding either the inpcb
+ * or the hash lock when we checked for a match
+ * before, we should probably recheck now that the
+ * inpcb lock is held.
*/
- if (inp->inp_socket == NULL) {
- INP_RUNLOCK(inp);
- continue;
- }
/*
* Handle socket delivery policy for any-source
* and source-specific multicast. [RFC3678]
*/
imo = inp->inp_moptions;
- if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
- imo != NULL) {
+ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
struct sockaddr_in group;
int blocked;
-
+ if (imo == NULL) {
+ INP_RUNLOCK(inp);
+ continue;
+ }
bzero(&group, sizeof(struct sockaddr_in));
group.sin_len = sizeof(struct sockaddr_in);
group.sin_family = AF_INET;
@@ -541,24 +512,7 @@ udp_input(struct mbuf *m, int off)
struct mbuf *n;
n = m_copy(m, 0, M_COPYALL);
- up = intoudpcb(last);
- if (up->u_tun_func == NULL) {
- if (n != NULL)
- udp_append(last,
- ip, n,
- iphlen +
- sizeof(struct udphdr),
- &udp_in);
- } else {
- /*
- * Engage the tunneling protocol we
- * will have to leave the info_lock
- * up, since we are hunting through
- * multiple UDP's.
- */
-
- (*up->u_tun_func)(n, iphlen, last);
- }
+ udp_append(last, ip, n, iphlen, &udp_in);
INP_RUNLOCK(last);
}
last = inp;
@@ -582,18 +536,12 @@ udp_input(struct mbuf *m, int off)
* or multicast datgram.)
* or multicast datagram.)
*/
UDPSTAT_INC(udps_noportbcast);
- goto badheadlocked;
- }
- up = intoudpcb(last);
- if (up->u_tun_func == NULL) {
- udp_append(last, ip, m, iphlen + sizeof(struct udphdr),
- &udp_in);
- } else {
- /*
- * Engage the tunneling protocol.
- */
- (*up->u_tun_func)(m, iphlen, last);
+ if (inp)
+ INP_RUNLOCK(inp);
+ INP_INFO_RUNLOCK(&V_udbinfo);
+ goto badunlocked;
}
+ udp_append(last, ip, m, iphlen, &udp_in);
INP_RUNLOCK(last);
INP_INFO_RUNLOCK(&V_udbinfo);
return;
@@ -602,8 +550,41 @@ udp_input(struct mbuf *m, int off)
/*
* Locate pcb for datagram.
*/
- inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport,
- ip->ip_dst, uh->uh_dport, 1, ifp);
+
+ /*
+ * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
+ */
+ if ((m->m_flags & M_IP_NEXTHOP) &&
+ (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
+ struct sockaddr_in *next_hop;
+
+ next_hop = (struct sockaddr_in *)(fwd_tag + 1);
+
+ /*
+ * Transparently forwarded. Pretend to be the destination.
+ * Already got one like this?
+ */
+ inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport,
+ ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
+ if (!inp) {
+ /*
+ * It's new. Try to find the ambushing socket.
+ * Because we've rewritten the destination address,
+ * any hardware-generated hash is ignored.
+ */
+ inp = in_pcblookup(&V_udbinfo, ip->ip_src,
+ uh->uh_sport, next_hop->sin_addr,
+ next_hop->sin_port ? htons(next_hop->sin_port) :
+ uh->uh_dport, INPLOOKUP_WILDCARD |
+ INPLOOKUP_RLOCKPCB, ifp);
+ }
+ /* Remove the tag from the packet. We don't need it anymore. */
+ m_tag_delete(m, fwd_tag);
+ m->m_flags &= ~M_IP_NEXTHOP;
+ } else
+ inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport,
+ ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
+ INPLOOKUP_RLOCKPCB, ifp, m);
if (inp == NULL) {
if (udp_log_in_vain) {
char buf[4*sizeof "123"];
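
Note: the hunk above replaces in_pcblookup_hash() under the global pcbinfo lock with in_pcblookup()/in_pcblookup_mbuf(), which take INPLOOKUP_* flags and return the PCB already locked in the requested mode, so the caller no longer brackets the lookup with INP_INFO_RLOCK(). The caller pattern, shown as an isolated fragment (kernel context assumed, names as in the code above):

        struct inpcb *inp;

        inp = in_pcblookup(&V_udbinfo, faddr, fport, laddr, lport,
            INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp);
        if (inp != NULL) {
                INP_RLOCK_ASSERT(inp);  /* returned read-locked */
                /* ... deliver the datagram ... */
                INP_RUNLOCK(inp);
        }
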
@@ -617,57 +598,35 @@ udp_input(struct mbuf *m, int off)
UDPSTAT_INC(udps_noport);
if (m->m_flags & (M_BCAST | M_MCAST)) {
UDPSTAT_INC(udps_noportbcast);
- goto badheadlocked;
+ goto badunlocked;
}
if (V_udp_blackhole)
- goto badheadlocked;
+ goto badunlocked;
if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
- goto badheadlocked;
+ goto badunlocked;
*ip = save_ip;
ip->ip_len += iphlen;
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
- INP_INFO_RUNLOCK(&V_udbinfo);
return;
}
/*
* Check the minimum TTL for socket.
*/
- INP_RLOCK(inp);
- INP_INFO_RUNLOCK(&V_udbinfo);
-
- /*
- * Detached PCBs can linger in the hash table if someone holds a
- * reference. (e.g. udp_pcblist)
- */
- if (inp->inp_socket == NULL) {
- INP_RUNLOCK(inp);
- goto badunlocked;
- }
+ INP_RLOCK_ASSERT(inp);
if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
INP_RUNLOCK(inp);
- goto badunlocked;
- }
- up = intoudpcb(inp);
- if (up->u_tun_func == NULL) {
- udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in);
- } else {
- /*
- * Engage the tunneling protocol.
- */
-
- (*up->u_tun_func)(m, iphlen, inp);
+ m_freem(m);
+ return;
}
+ udp_append(inp, ip, m, iphlen, &udp_in);
INP_RUNLOCK(inp);
return;
-badheadlocked:
- if (inp)
- INP_RUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_udbinfo);
badunlocked:
m_freem(m);
}
+#endif /* INET */
/*
* Notify a udp user of an asynchronous error; just wake up so that they can
@@ -691,6 +650,7 @@ udp_notify(struct inpcb *inp, int errno)
return (inp);
}
+#ifdef INET
void
udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
@@ -721,21 +681,20 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
return;
if (ip != NULL) {
uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
- INP_INFO_RLOCK(&V_udbinfo);
- inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport,
- ip->ip_src, uh->uh_sport, 0, NULL);
+ inp = in_pcblookup(&V_udbinfo, faddr, uh->uh_dport,
+ ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL);
if (inp != NULL) {
- INP_RLOCK(inp);
+ INP_RLOCK_ASSERT(inp);
if (inp->inp_socket != NULL) {
udp_notify(inp, inetctlerrmap[cmd]);
}
INP_RUNLOCK(inp);
}
- INP_INFO_RUNLOCK(&V_udbinfo);
} else
in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd],
udp_notify);
}
+#endif /* INET */
static int
udp_pcblist(SYSCTL_HANDLER_ARGS)
@@ -820,9 +779,9 @@ udp_pcblist(SYSCTL_HANDLER_ARGS)
INP_INFO_WLOCK(&V_udbinfo);
for (i = 0; i < n; i++) {
inp = inp_list[i];
- INP_WLOCK(inp);
- if (!in_pcbrele(inp))
- INP_WUNLOCK(inp);
+ INP_RLOCK(inp);
+ if (!in_pcbrele_rlocked(inp))
+ INP_RUNLOCK(inp);
}
INP_INFO_WUNLOCK(&V_udbinfo);
@@ -848,6 +807,7 @@ SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0,
udp_pcblist, "S,xinpcb", "List of active UDP sockets");
+#ifdef INET
static int
udp_getcred(SYSCTL_HANDLER_ARGS)
{
@@ -862,12 +822,11 @@ udp_getcred(SYSCTL_HANDLER_ARGS)
error = SYSCTL_IN(req, addrs, sizeof(addrs));
if (error)
return (error);
- INP_INFO_RLOCK(&V_udbinfo);
- inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
- addrs[0].sin_addr, addrs[0].sin_port, 1, NULL);
+ inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
+ addrs[0].sin_addr, addrs[0].sin_port,
+ INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
if (inp != NULL) {
- INP_RLOCK(inp);
- INP_INFO_RUNLOCK(&V_udbinfo);
+ INP_RLOCK_ASSERT(inp);
if (inp->inp_socket == NULL)
error = ENOENT;
if (error == 0)
@@ -875,10 +834,8 @@ udp_getcred(SYSCTL_HANDLER_ARGS)
if (error == 0)
cru2x(inp->inp_cred, &xuc);
INP_RUNLOCK(inp);
- } else {
- INP_INFO_RUNLOCK(&V_udbinfo);
+ } else
error = ENOENT;
- }
if (error == 0)
error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
return (error);
@@ -887,6 +844,7 @@ udp_getcred(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
+#endif /* INET */
int
udp_ctloutput(struct socket *so, struct sockopt *sopt)
@@ -905,11 +863,15 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt)
if (INP_CHECK_SOCKAF(so, AF_INET6)) {
INP_WUNLOCK(inp);
error = ip6_ctloutput(so, sopt);
- } else {
+ }
#endif
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ {
INP_WUNLOCK(inp);
error = ip_ctloutput(so, sopt);
-#ifdef INET6
}
#endif
return (error);
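The dispatch above is arranged so that the function still compiles with INET only, INET6 only, or both: the else is emitted only when the two families coexist. A stripped-down sketch of the same preprocessor shape; handle_v4()/handle_v6() merely stand in for ip_ctloutput()/ip6_ctloutput().

/* Stand-ins for ip_ctloutput() / ip6_ctloutput(). */
static int
handle_v4(void)
{
	return (0);
}

static int
handle_v6(void)
{
	return (0);
}

static int
dispatch_ctloutput(int is_v6)
{
	int error = 0;

	(void)is_v6;
#ifdef INET6
	if (is_v6)
		error = handle_v6();
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		error = handle_v4();
#endif
	return (error);
}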
@@ -981,6 +943,10 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt)
return (error);
}
+#ifdef INET
+#define UH_WLOCKED 2
+#define UH_RLOCKED 1
+#define UH_UNLOCKED 0
static int
udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
struct mbuf *control, struct thread *td)
@@ -1010,6 +976,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
}
src.sin_family = 0;
+ INP_RLOCK(inp);
tos = inp->inp_ip_tos;
if (control != NULL) {
/*
@@ -1017,6 +984,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
* stored in a single mbuf.
*/
if (control->m_next) {
+ INP_RUNLOCK(inp);
m_freem(control);
m_freem(m);
return (EINVAL);
@@ -1066,6 +1034,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
m_freem(control);
}
if (error) {
+ INP_RUNLOCK(inp);
m_freem(m);
return (error);
}
@@ -1083,29 +1052,26 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
* conservative locks than required the second time around, so later
* assertions have to accept that. Further analysis of the number of
* misses under contention is required.
+ *
+ * XXXRW: Check that hash locking update here is correct.
*/
sin = (struct sockaddr_in *)addr;
- INP_RLOCK(inp);
if (sin != NULL &&
(inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
INP_RUNLOCK(inp);
- INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
- unlock_udbinfo = 2;
+ INP_HASH_WLOCK(&V_udbinfo);
+ unlock_udbinfo = UH_WLOCKED;
} else if ((sin != NULL && (
(sin->sin_addr.s_addr == INADDR_ANY) ||
(sin->sin_addr.s_addr == INADDR_BROADCAST) ||
(inp->inp_laddr.s_addr == INADDR_ANY) ||
(inp->inp_lport == 0))) ||
(src.sin_family == AF_INET)) {
- if (!INP_INFO_TRY_RLOCK(&V_udbinfo)) {
- INP_RUNLOCK(inp);
- INP_INFO_RLOCK(&V_udbinfo);
- INP_RLOCK(inp);
- }
- unlock_udbinfo = 1;
+ INP_HASH_RLOCK(&V_udbinfo);
+ unlock_udbinfo = UH_RLOCKED;
} else
- unlock_udbinfo = 0;
+ unlock_udbinfo = UH_UNLOCKED;
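The chain above picks one of three udbinfo lock states for the rest of udp_output(): UH_WLOCKED when an implicit bind may rewrite the hash tables, UH_RLOCKED when source-address or port selection only needs to read them, and UH_UNLOCKED for a fully bound, connected send. A hedged sketch of that choice; the boolean parameters are informal summaries of the conditions tested above.

/* Models UH_UNLOCKED / UH_RLOCKED / UH_WLOCKED above. */
enum hash_lock { UNLOCKED, RLOCKED, WLOCKED };

/*
 * Pick the udbinfo hash-lock flavour a send needs: an implicit bind
 * writes the hash tables; wildcard or broadcast destinations,
 * partially bound sockets and IP_SENDSRCADDR only read them; a bound,
 * connected send skips the hash lock entirely.
 */
static enum hash_lock
choose_hash_lock(int has_dest, int fully_unbound, int partially_unbound,
    int dest_wildcard_or_bcast, int sendsrcaddr)
{
	if (has_dest && fully_unbound)
		return (WLOCKED);	/* implicit bind rewrites the hash */
	if ((has_dest && (dest_wildcard_or_bcast || partially_unbound)) ||
	    sendsrcaddr)
		return (RLOCKED);	/* source selection reads hash state */
	return (UNLOCKED);
}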
/*
* If the IP_SENDSRCADDR control message was specified, override the
@@ -1115,7 +1081,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
laddr = inp->inp_laddr;
lport = inp->inp_lport;
if (src.sin_family == AF_INET) {
- INP_INFO_LOCK_ASSERT(&V_udbinfo);
+ INP_HASH_LOCK_ASSERT(&V_udbinfo);
if ((lport == 0) ||
(laddr.s_addr == INADDR_ANY &&
src.sin_addr.s_addr == INADDR_ANY)) {
@@ -1166,7 +1132,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
inp->inp_lport == 0 ||
sin->sin_addr.s_addr == INADDR_ANY ||
sin->sin_addr.s_addr == INADDR_BROADCAST) {
- INP_INFO_LOCK_ASSERT(&V_udbinfo);
+ INP_HASH_LOCK_ASSERT(&V_udbinfo);
error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
&lport, &faddr.s_addr, &fport, NULL,
td->td_ucred);
@@ -1180,8 +1146,8 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
/* Commit the local port if newly assigned. */
if (inp->inp_laddr.s_addr == INADDR_ANY &&
inp->inp_lport == 0) {
- INP_INFO_WLOCK_ASSERT(&V_udbinfo);
INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK_ASSERT(&V_udbinfo);
/*
* Remember addr if jailed, to prevent
* rebinding.
@@ -1276,25 +1242,25 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
((struct ip *)ui)->ip_tos = tos; /* XXX */
UDPSTAT_INC(udps_opackets);
- if (unlock_udbinfo == 2)
- INP_INFO_WUNLOCK(&V_udbinfo);
- else if (unlock_udbinfo == 1)
- INP_INFO_RUNLOCK(&V_udbinfo);
+ if (unlock_udbinfo == UH_WLOCKED)
+ INP_HASH_WUNLOCK(&V_udbinfo);
+ else if (unlock_udbinfo == UH_RLOCKED)
+ INP_HASH_RUNLOCK(&V_udbinfo);
error = ip_output(m, inp->inp_options, NULL, ipflags,
inp->inp_moptions, inp);
- if (unlock_udbinfo == 2)
+ if (unlock_udbinfo == UH_WLOCKED)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
return (error);
release:
- if (unlock_udbinfo == 2) {
+ if (unlock_udbinfo == UH_WLOCKED) {
+ INP_HASH_WUNLOCK(&V_udbinfo);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_udbinfo);
- } else if (unlock_udbinfo == 1) {
+ } else if (unlock_udbinfo == UH_RLOCKED) {
+ INP_HASH_RUNLOCK(&V_udbinfo);
INP_RUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_udbinfo);
} else
INP_RUNLOCK(inp);
m_freem(m);
@@ -1303,7 +1269,6 @@ release:
#if defined(IPSEC) && defined(IPSEC_NAT_T)
-#ifdef INET
/*
* Potentially decap ESP in UDP frame. Check for an ESP header
* and optional marker; if present, strip the UDP header and
@@ -1332,7 +1297,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
if (minlen > m->m_pkthdr.len)
minlen = m->m_pkthdr.len;
if ((m = m_pullup(m, minlen)) == NULL) {
- V_ipsec4stat.in_inval++;
+ IPSECSTAT_INC(in_inval);
return (NULL); /* Bypass caller processing. */
}
data = mtod(m, caddr_t); /* Points to ip header. */
@@ -1372,7 +1337,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
uint32_t spi;
if (payload <= sizeof(struct esp)) {
- V_ipsec4stat.in_inval++;
+ IPSECSTAT_INC(in_inval);
m_freem(m);
return (NULL); /* Discard. */
}
@@ -1393,7 +1358,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
2 * sizeof(uint16_t), M_NOWAIT);
if (tag == NULL) {
- V_ipsec4stat.in_nomem++;
+ IPSECSTAT_INC(in_nomem);
m_freem(m);
return (NULL); /* Discard. */
}
@@ -1435,7 +1400,6 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off)
(void) ipsec4_common_input(m, iphlen, ip->ip_p);
return (NULL); /* NB: consumed, bypass processing. */
}
-#endif /* INET */
#endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */
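udp4_espdecap() has to separate IKE and keepalive traffic from ESP arriving on the NAT-T port before handing anything to ipsec4_common_input(). Following the RFC 3948 framing (a lone 0xFF byte is a keepalive, a zero 32-bit word is the non-ESP marker in front of IKE, anything else begins with a genuine SPI), here is a hedged classification sketch; classify_natt_payload() and its enum are illustrative names only.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

enum natt_kind { NATT_KEEPALIVE, NATT_IKE, NATT_ESP, NATT_RUNT };

/*
 * Classify the UDP payload of a NAT-T packet: 0xFF keepalives and the
 * all-zero non-ESP marker bypass IPsec input, everything else is
 * handed to ESP processing (the SPI sits in the first 32 bits).
 */
static enum natt_kind
classify_natt_payload(const uint8_t *payload, size_t len)
{
	uint32_t word;

	if (len == 1 && payload[0] == 0xff)
		return (NATT_KEEPALIVE);
	if (len < sizeof(word))
		return (NATT_RUNT);
	memcpy(&word, payload, sizeof(word));
	if (word == 0)
		return (NATT_IKE);	/* non-ESP marker precedes IKE */
	return (NATT_ESP);		/* first word is the ESP SPI */
}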
static void
@@ -1445,15 +1409,15 @@ udp_abort(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
- INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
if (inp->inp_faddr.s_addr != INADDR_ANY) {
+ INP_HASH_WLOCK(&V_udbinfo);
in_pcbdisconnect(inp);
inp->inp_laddr.s_addr = INADDR_ANY;
+ INP_HASH_WUNLOCK(&V_udbinfo);
soisdisconnected(so);
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_udbinfo);
}
static int
@@ -1490,6 +1454,7 @@ udp_attach(struct socket *so, int proto, struct thread *td)
INP_INFO_WUNLOCK(&V_udbinfo);
return (0);
}
+#endif /* INET */
int
udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f)
@@ -1512,6 +1477,7 @@ udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f)
return (0);
}
+#ifdef INET
static int
udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
@@ -1520,11 +1486,11 @@ udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
- INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
+ INP_HASH_WLOCK(&V_udbinfo);
error = in_pcbbind(inp, nam, td->td_ucred);
+ INP_HASH_WUNLOCK(&V_udbinfo);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_udbinfo);
return (error);
}
@@ -1535,15 +1501,15 @@ udp_close(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_close: inp == NULL"));
- INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
if (inp->inp_faddr.s_addr != INADDR_ANY) {
+ INP_HASH_WLOCK(&V_udbinfo);
in_pcbdisconnect(inp);
inp->inp_laddr.s_addr = INADDR_ANY;
+ INP_HASH_WUNLOCK(&V_udbinfo);
soisdisconnected(so);
}
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_udbinfo);
}
static int
@@ -1555,25 +1521,23 @@ udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
- INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
if (inp->inp_faddr.s_addr != INADDR_ANY) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_udbinfo);
return (EISCONN);
}
sin = (struct sockaddr_in *)nam;
error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
if (error != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_udbinfo);
return (error);
}
+ INP_HASH_WLOCK(&V_udbinfo);
error = in_pcbconnect(inp, nam, td->td_ucred);
+ INP_HASH_WUNLOCK(&V_udbinfo);
if (error == 0)
soisconnected(so);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_udbinfo);
return (error);
}
@@ -1605,21 +1569,19 @@ udp_disconnect(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
- INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
if (inp->inp_faddr.s_addr == INADDR_ANY) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_udbinfo);
return (ENOTCONN);
}
-
+ INP_HASH_WLOCK(&V_udbinfo);
in_pcbdisconnect(inp);
inp->inp_laddr.s_addr = INADDR_ANY;
+ INP_HASH_WUNLOCK(&V_udbinfo);
SOCK_LOCK(so);
so->so_state &= ~SS_ISCONNECTED; /* XXX */
SOCK_UNLOCK(so);
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_udbinfo);
return (0);
}
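udp_abort(), udp_bind(), udp_close(), udp_connect() and udp_disconnect() all follow the same narrowed discipline now: the inpcb stays write-locked for the whole operation, while the udbinfo hash write lock brackets only the in_pcb*() call that edits the hash chains. A sketch of that nesting for the connect case; the lock and pcb helpers are illustrative stand-ins.

/* Illustrative stand-ins for the inpcb and udbinfo hash locks. */
static void inp_wlock(void) { }
static void inp_wunlock(void) { }
static void hash_wlock(void) { }
static void hash_wunlock(void) { }

static int
pcb_connect(void)		/* models in_pcbconnect() */
{
	return (0);
}

/*
 * Lock nesting used by the reworked connect path: the pcb is
 * write-locked throughout, while the hash lock is held only for the
 * call that inserts the pcb into the connected-hash chains.
 */
static int
connect_locked(void)
{
	int error;

	inp_wlock();
	hash_wlock();
	error = pcb_connect();
	hash_wunlock();
	inp_wunlock();
	return (error);
}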
@@ -1633,6 +1595,7 @@ udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
KASSERT(inp != NULL, ("udp_send: inp == NULL"));
return (udp_output(inp, m, addr, control, td));
}
+#endif /* INET */
int
udp_shutdown(struct socket *so)
@@ -1647,6 +1610,7 @@ udp_shutdown(struct socket *so)
return (0);
}
+#ifdef INET
struct pr_usrreqs udp_usrreqs = {
.pru_abort = udp_abort,
.pru_attach = udp_attach,
@@ -1664,3 +1628,4 @@ struct pr_usrreqs udp_usrreqs = {
.pru_sosetlabel = in_pcbsosetlabel,
.pru_close = udp_close,
};
+#endif /* INET */
diff --git a/freebsd/sys/netinet/udp_var.h b/freebsd/sys/netinet/udp_var.h
index 5cf7dc9f..6b9b5362 100644
--- a/freebsd/sys/netinet/udp_var.h
+++ b/freebsd/sys/netinet/udp_var.h
@@ -152,7 +152,7 @@ int udp_newudpcb(struct inpcb *);
void udp_discardcb(struct udpcb *);
void udp_ctlinput(int, struct sockaddr *, void *);
-int udp_ctloutput(struct socket *, struct sockopt *);
+int udp_ctloutput(struct socket *, struct sockopt *);
void udp_init(void);
#ifdef VIMAGE
void udp_destroy(void);