summaryrefslogtreecommitdiffstats
path: root/freebsd/sys/netinet/ip_output.c
diff options
context:
space:
mode:
Diffstat (limited to 'freebsd/sys/netinet/ip_output.c')
-rw-r--r--freebsd/sys/netinet/ip_output.c614
1 files changed, 361 insertions, 253 deletions
diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c
index a06fed68..81e7b123 100644
--- a/freebsd/sys/netinet/ip_output.c
+++ b/freebsd/sys/netinet/ip_output.c
@@ -34,27 +34,32 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <rtems/bsd/local/opt_ipfw.h>
+#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_ipsec.h>
-#include <rtems/bsd/local/opt_route.h>
#include <rtems/bsd/local/opt_mbuf_stress_test.h>
#include <rtems/bsd/local/opt_mpath.h>
+#include <rtems/bsd/local/opt_route.h>
#include <rtems/bsd/local/opt_sctp.h>
+#include <rtems/bsd/local/opt_rss.h>
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
+#include <rtems/bsd/sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
+#include <sys/rmlock.h>
+#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/ucred.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/if_llatbl.h>
#include <net/netisr.h>
#include <net/pfil.h>
@@ -63,12 +68,15 @@ __FBSDID("$FreeBSD$");
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
+#include <net/rss_config.h>
#include <net/vnet.h>
#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
+#include <netinet/in_rss.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
@@ -86,25 +94,112 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
-VNET_DEFINE(u_short, ip_id);
-
#ifdef MBUF_STRESS_TEST
static int mbuf_frag_size = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
#endif
-static void ip_mloopback
- (struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
+static void ip_mloopback(struct ifnet *, const struct mbuf *, int);
extern int in_mcast_loop;
extern struct protosw inetsw[];
+static inline int
+ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp,
+ struct sockaddr_in *dst, int *fibnum, int *error)
+{
+ struct m_tag *fwd_tag = NULL;
+ struct mbuf *m;
+ struct in_addr odst;
+ struct ip *ip;
+
+ m = *mp;
+ ip = mtod(m, struct ip *);
+
+ /* Run through list of hooks for output packets. */
+ odst.s_addr = ip->ip_dst.s_addr;
+ *error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, PFIL_OUT, inp);
+ m = *mp;
+ if ((*error) != 0 || m == NULL)
+ return 1; /* Finished */
+
+ ip = mtod(m, struct ip *);
+
+ /* See if destination IP address was changed by packet filter. */
+ if (odst.s_addr != ip->ip_dst.s_addr) {
+ m->m_flags |= M_SKIP_FIREWALL;
+ /* If destination is now ourself drop to ip_input(). */
+ if (in_localip(ip->ip_dst)) {
+ m->m_flags |= M_FASTFWD_OURS;
+ if (m->m_pkthdr.rcvif == NULL)
+ m->m_pkthdr.rcvif = V_loif;
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ m->m_pkthdr.csum_flags |=
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ m->m_pkthdr.csum_data = 0xffff;
+ }
+ m->m_pkthdr.csum_flags |=
+ CSUM_IP_CHECKED | CSUM_IP_VALID;
+#ifdef SCTP
+ if (m->m_pkthdr.csum_flags & CSUM_SCTP)
+ m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+#endif
+ *error = netisr_queue(NETISR_IP, m);
+ return 1; /* Finished */
+ }
+
+ bzero(dst, sizeof(*dst));
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = ip->ip_dst;
+
+ return -1; /* Reloop */
+ }
+ /* See if fib was changed by packet filter. */
+ if ((*fibnum) != M_GETFIB(m)) {
+ m->m_flags |= M_SKIP_FIREWALL;
+ *fibnum = M_GETFIB(m);
+ return -1; /* Reloop for FIB change */
+ }
+
+ /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
+ if (m->m_flags & M_FASTFWD_OURS) {
+ if (m->m_pkthdr.rcvif == NULL)
+ m->m_pkthdr.rcvif = V_loif;
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ m->m_pkthdr.csum_flags |=
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ m->m_pkthdr.csum_data = 0xffff;
+ }
+#ifdef SCTP
+ if (m->m_pkthdr.csum_flags & CSUM_SCTP)
+ m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+#endif
+ m->m_pkthdr.csum_flags |=
+ CSUM_IP_CHECKED | CSUM_IP_VALID;
+
+ *error = netisr_queue(NETISR_IP, m);
+ return 1; /* Finished */
+ }
+ /* Or forward to some other address? */
+ if ((m->m_flags & M_IP_NEXTHOP) &&
+ ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
+ bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
+ m->m_flags |= M_SKIP_FIREWALL;
+ m->m_flags &= ~M_IP_NEXTHOP;
+ m_tag_delete(m, fwd_tag);
+
+ return -1; /* Reloop for CHANGE of dst */
+ }
+
+ return 0;
+}
+
/*
* IP output. The packet in mbuf chain m contains a skeletal IP
* header (with len, off, ttl, proto, tos, src, dst).
- * ip_len and ip_off are in host format.
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
* If route ro is present and has ro_rt initialized, route lookup would be
@@ -118,20 +213,22 @@ int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
struct ip_moptions *imo, struct inpcb *inp)
{
+ struct rm_priotracker in_ifa_tracker;
struct ip *ip;
struct ifnet *ifp = NULL; /* keep compiler happy */
struct mbuf *m0;
int hlen = sizeof (struct ip);
int mtu;
- int n; /* scratchpad */
int error = 0;
struct sockaddr_in *dst;
+ const struct sockaddr_in *gw;
struct in_ifaddr *ia;
- int isbroadcast, sw_csum;
+ int isbroadcast;
+ uint16_t ip_len, ip_off;
struct route iproute;
struct rtentry *rte; /* cache for ro->ro_rt */
- struct in_addr odst;
- struct m_tag *fwd_tag = NULL;
+ uint32_t fibnum;
+ int have_ia_ref;
#ifdef IPSEC
int no_route_but_check_spd = 0;
#endif
@@ -140,31 +237,21 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
if (inp != NULL) {
INP_LOCK_ASSERT(inp);
M_SETFIB(m, inp->inp_inc.inc_fibnum);
- if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
+ if ((flags & IP_NODEFAULTFLOWID) == 0) {
m->m_pkthdr.flowid = inp->inp_flowid;
- m->m_flags |= M_FLOWID;
+ M_HASHTYPE_SET(m, inp->inp_flowtype);
}
}
if (ro == NULL) {
ro = &iproute;
bzero(ro, sizeof (*ro));
- }
+ } else
+ ro->ro_flags |= RT_LLE_CACHE;
#ifdef FLOWTABLE
- if (ro->ro_rt == NULL) {
- struct flentry *fle;
-
- /*
- * The flow table returns route entries valid for up to 30
- * seconds; we rely on the remainder of ip_output() taking no
- * longer than that long for the stability of ro_rt. The
- * flow ID assignment must have happened before this point.
- */
- fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET);
- if (fle != NULL)
- flow_to_route(fle, ro);
- }
+ if (ro->ro_rt == NULL)
+ (void )flowtable_lookup(AF_INET, m, ro);
#endif
if (opt) {
@@ -174,37 +261,49 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
hlen = len; /* ip->ip_hl is updated above */
}
ip = mtod(m, struct ip *);
+ ip_len = ntohs(ip->ip_len);
+ ip_off = ntohs(ip->ip_off);
- /*
- * Fill in IP header. If we are not allowing fragmentation,
- * then the ip_id field is meaningless, but we don't set it
- * to zero. Doing so causes various problems when devices along
- * the path (routers, load balancers, firewalls, etc.) illegally
- * disable DF on our packet. Note that a 16-bit counter
- * will wrap around in less than 10 seconds at 100 Mbit/s on a
- * medium with MTU 1500. See Steven M. Bellovin, "A Technique
- * for Counting NATted Hosts", Proc. IMW'02, available at
- * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
- */
if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
ip->ip_v = IPVERSION;
ip->ip_hl = hlen >> 2;
- ip->ip_id = ip_newid();
+ ip_fillid(ip);
IPSTAT_INC(ips_localout);
} else {
/* Header already set, fetch hlen from there */
hlen = ip->ip_hl << 2;
}
+ /*
+ * dst/gw handling:
+ *
+ * dst can be rewritten but always points to &ro->ro_dst.
+ * gw is readonly but can point either to dst OR rt_gateway,
+ * therefore we need restore gw if we're redoing lookup.
+ */
+ gw = dst = (struct sockaddr_in *)&ro->ro_dst;
+ fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
+ rte = ro->ro_rt;
+ if (rte == NULL) {
+ bzero(dst, sizeof(*dst));
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = ip->ip_dst;
+ }
again:
- dst = (struct sockaddr_in *)&ro->ro_dst;
- ia = NULL;
+ /*
+ * Validate route against routing table additions;
+ * a better/more specific route might have been added.
+ */
+ if (inp)
+ RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
/*
* If there is a cached route,
* check that it is to the same destination
* and is still up. If not, free it and try again.
* The address family should also be checked in case of sharing the
* cache with IPv6.
+ * Also check whether routing cache needs invalidation.
*/
rte = ro->ro_rt;
if (rte && ((rte->rt_flags & RTF_UP) == 0 ||
@@ -212,16 +311,14 @@ again:
!RT_LINK_IS_UP(rte->rt_ifp) ||
dst->sin_family != AF_INET ||
dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
- RO_RTFREE(ro);
- ro->ro_lle = NULL;
- rte = NULL;
- }
- if (rte == NULL && fwd_tag == NULL) {
- bzero(dst, sizeof(*dst));
- dst->sin_family = AF_INET;
- dst->sin_len = sizeof(*dst);
- dst->sin_addr = ip->ip_dst;
+ RTFREE(rte);
+ rte = ro->ro_rt = (struct rtentry *)NULL;
+ if (ro->ro_lle)
+ LLE_FREE(ro->ro_lle); /* zeros ro_lle */
+ ro->ro_lle = (struct llentry *)NULL;
}
+ ia = NULL;
+ have_ia_ref = 0;
/*
* If routing to interface only, short circuit routing lookup.
* The use of an all-ones broadcast address implies this; an
@@ -229,27 +326,33 @@ again:
* or the destination address of a ptp interface.
*/
if (flags & IP_SENDONES) {
- if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
- (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
+ if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
+ M_GETFIB(m)))) == NULL &&
+ (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
+ M_GETFIB(m)))) == NULL) {
IPSTAT_INC(ips_noroute);
error = ENETUNREACH;
goto bad;
}
+ have_ia_ref = 1;
ip->ip_dst.s_addr = INADDR_BROADCAST;
dst->sin_addr = ip->ip_dst;
ifp = ia->ia_ifp;
ip->ip_ttl = 1;
isbroadcast = 1;
} else if (flags & IP_ROUTETOIF) {
- if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
- (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0))) == NULL) {
+ if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
+ M_GETFIB(m)))) == NULL &&
+ (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
+ M_GETFIB(m)))) == NULL) {
IPSTAT_INC(ips_noroute);
error = ENETUNREACH;
goto bad;
}
+ have_ia_ref = 1;
ifp = ia->ia_ifp;
ip->ip_ttl = 1;
- isbroadcast = in_broadcast(dst->sin_addr, ifp);
+ isbroadcast = in_ifaddr_broadcast(dst->sin_addr, ia);
} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
imo != NULL && imo->imo_multicast_ifp != NULL) {
/*
@@ -257,7 +360,9 @@ again:
* packets if the interface is specified.
*/
ifp = imo->imo_multicast_ifp;
- IFP_TO_IA(ifp, ia);
+ IFP_TO_IA(ifp, ia, &in_ifa_tracker);
+ if (ia)
+ have_ia_ref = 1;
isbroadcast = 0; /* fool gcc */
} else {
/*
@@ -269,14 +374,14 @@ again:
#ifdef RADIX_MPATH
rtalloc_mpath_fib(ro,
ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
- inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+ fibnum);
#else
- in_rtalloc_ign(ro, 0,
- inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+ in_rtalloc_ign(ro, 0, fibnum);
#endif
rte = ro->ro_rt;
}
if (rte == NULL ||
+ (rte->rt_flags & RTF_UP) == 0 ||
rte->rt_ifp == NULL ||
!RT_LINK_IS_UP(rte->rt_ifp)) {
#ifdef IPSEC
@@ -293,45 +398,37 @@ again:
goto bad;
}
ia = ifatoia(rte->rt_ifa);
- ifa_ref(&ia->ia_ifa);
ifp = rte->rt_ifp;
- rte->rt_rmx.rmx_pksent++;
+ counter_u64_add(rte->rt_pksent, 1);
+ rt_update_ro_flags(ro);
if (rte->rt_flags & RTF_GATEWAY)
- dst = (struct sockaddr_in *)rte->rt_gateway;
+ gw = (struct sockaddr_in *)rte->rt_gateway;
if (rte->rt_flags & RTF_HOST)
isbroadcast = (rte->rt_flags & RTF_BROADCAST);
else
- isbroadcast = in_broadcast(dst->sin_addr, ifp);
+ isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia);
}
+
/*
* Calculate MTU. If we have a route that is up, use that,
* otherwise use the interface's MTU.
*/
- if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) {
- /*
- * This case can happen if the user changed the MTU
- * of an interface after enabling IP on it. Because
- * most netifs don't keep track of routes pointing to
- * them, there is no way for one to update all its
- * routes when the MTU is changed.
- */
- if (rte->rt_rmx.rmx_mtu > ifp->if_mtu)
- rte->rt_rmx.rmx_mtu = ifp->if_mtu;
- mtu = rte->rt_rmx.rmx_mtu;
- } else {
+ if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST)))
+ mtu = rte->rt_mtu;
+ else
mtu = ifp->if_mtu;
- }
/* Catch a possible divide by zero later. */
KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p",
__func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp));
+
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
m->m_flags |= M_MCAST;
/*
- * IP destination address is multicast. Make sure "dst"
+ * IP destination address is multicast. Make sure "gw"
* still points to the address in "ro". (It may have been
* changed to point to a gateway address, above.)
*/
- dst = (struct sockaddr_in *)&ro->ro_dst;
+ gw = dst;
/*
* See if the caller provided any multicast options
*/
@@ -373,7 +470,7 @@ again:
* thus deferring a hash lookup and mutex acquisition
* at the expense of a cheap copy using m_copym().
*/
- ip_mloopback(ifp, m, dst, hlen);
+ ip_mloopback(ifp, m, hlen);
} else {
/*
* If we are acting as a multicast router, perform
@@ -433,23 +530,6 @@ again:
}
/*
- * Verify that we have any chance at all of being able to queue the
- * packet or packet fragments, unless ALTQ is enabled on the given
- * interface in which case packetdrop should be done by queueing.
- */
- n = ip->ip_len / mtu + 1; /* how many fragments ? */
- if (
-#ifdef ALTQ
- (!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
-#endif /* ALTQ */
- (ifp->if_snd.ifq_len + n) >= ifp->if_snd.ifq_maxlen ) {
- error = ENOBUFS;
- IPSTAT_INC(ips_odropped);
- ifp->if_snd.ifq_drops += n;
- goto bad;
- }
-
- /*
* Look for broadcast address and
* verify user is allowed to send
* such a packet.
@@ -464,7 +544,7 @@ again:
goto bad;
}
/* don't allow broadcast messages to be fragmented */
- if (ip->ip_len > mtu) {
+ if (ip_len > mtu) {
error = EMSGSIZE;
goto bad;
}
@@ -475,7 +555,7 @@ again:
sendit:
#ifdef IPSEC
- switch(ip_ipsec_output(&m, inp, &flags, &error)) {
+ switch(ip_ipsec_output(&m, inp, &error)) {
case 1:
goto bad;
case -1:
@@ -498,78 +578,29 @@ sendit:
#endif /* IPSEC */
/* Jump over all PFIL processing if hooks are not active. */
- if (!PFIL_HOOKED(&V_inet_pfil_hook))
- goto passout;
-
- /* Run through list of hooks for output packets. */
- odst.s_addr = ip->ip_dst.s_addr;
- error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
- if (error != 0 || m == NULL)
- goto done;
+ if (PFIL_HOOKED(&V_inet_pfil_hook)) {
+ switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) {
+ case 1: /* Finished */
+ goto done;
- ip = mtod(m, struct ip *);
+ case 0: /* Continue normally */
+ ip = mtod(m, struct ip *);
+ break;
- /* See if destination IP address was changed by packet filter. */
- if (odst.s_addr != ip->ip_dst.s_addr) {
- m->m_flags |= M_SKIP_FIREWALL;
- /* If destination is now ourself drop to ip_input(). */
- if (in_localip(ip->ip_dst)) {
- m->m_flags |= M_FASTFWD_OURS;
- if (m->m_pkthdr.rcvif == NULL)
- m->m_pkthdr.rcvif = V_loif;
- if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
- m->m_pkthdr.csum_flags |=
- CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
- m->m_pkthdr.csum_data = 0xffff;
- }
- m->m_pkthdr.csum_flags |=
- CSUM_IP_CHECKED | CSUM_IP_VALID;
-#ifdef SCTP
- if (m->m_pkthdr.csum_flags & CSUM_SCTP)
- m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
-#endif
- error = netisr_queue(NETISR_IP, m);
- goto done;
- } else {
- if (ia != NULL)
+ case -1: /* Need to try again */
+ /* Reset everything for a new round */
+ RO_RTFREE(ro);
+ if (have_ia_ref)
ifa_free(&ia->ia_ifa);
- goto again; /* Redo the routing table lookup. */
- }
- }
+ ro->ro_prepend = NULL;
+ rte = NULL;
+ gw = dst;
+ ip = mtod(m, struct ip *);
+ goto again;
- /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
- if (m->m_flags & M_FASTFWD_OURS) {
- if (m->m_pkthdr.rcvif == NULL)
- m->m_pkthdr.rcvif = V_loif;
- if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
- m->m_pkthdr.csum_flags |=
- CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
- m->m_pkthdr.csum_data = 0xffff;
}
-#ifdef SCTP
- if (m->m_pkthdr.csum_flags & CSUM_SCTP)
- m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
-#endif
- m->m_pkthdr.csum_flags |=
- CSUM_IP_CHECKED | CSUM_IP_VALID;
-
- error = netisr_queue(NETISR_IP, m);
- goto done;
- }
- /* Or forward to some other address? */
- if ((m->m_flags & M_IP_NEXTHOP) &&
- (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
- dst = (struct sockaddr_in *)&ro->ro_dst;
- bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
- m->m_flags |= M_SKIP_FIREWALL;
- m->m_flags &= ~M_IP_NEXTHOP;
- m_tag_delete(m, fwd_tag);
- if (ia != NULL)
- ifa_free(&ia->ia_ifa);
- goto again;
}
-passout:
/* 127/8 must not appear on wire - RFC1122. */
if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
(ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
@@ -581,31 +612,28 @@ passout:
}
m->m_pkthdr.csum_flags |= CSUM_IP;
- sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
- if (sw_csum & CSUM_DELAY_DATA) {
+ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
in_delayed_cksum(m);
- sw_csum &= ~CSUM_DELAY_DATA;
+ m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
#ifdef SCTP
- if (sw_csum & CSUM_SCTP) {
+ if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
- sw_csum &= ~CSUM_SCTP;
+ m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
#endif
- m->m_pkthdr.csum_flags &= ifp->if_hwassist;
/*
* If small enough for interface, or the interface will take
* care of the fragmentation for us, we can just send directly.
*/
- if (ip->ip_len <= mtu ||
- (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
- ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
- ip->ip_len = htons(ip->ip_len);
- ip->ip_off = htons(ip->ip_off);
+ if (ip_len <= mtu ||
+ (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
ip->ip_sum = 0;
- if (sw_csum & CSUM_DELAY_IP)
+ if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
ip->ip_sum = in_cksum(m, hlen);
+ m->m_pkthdr.csum_flags &= ~CSUM_IP;
+ }
/*
* Record statistics for this interface address.
@@ -615,28 +643,30 @@ passout:
*/
if (!(flags & IP_FORWARDING) && ia) {
if (m->m_pkthdr.csum_flags & CSUM_TSO)
- ia->ia_ifa.if_opackets +=
- m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
+ counter_u64_add(ia->ia_ifa.ifa_opackets,
+ m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
else
- ia->ia_ifa.if_opackets++;
- ia->ia_ifa.if_obytes += m->m_pkthdr.len;
+ counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
+
+ counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
}
#ifdef MBUF_STRESS_TEST
if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
- m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
+ m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
#endif
/*
* Reset layer specific mbuf flags
* to avoid confusing lower layers.
*/
- m->m_flags &= ~(M_PROTOFLAGS);
+ m_clrprotoflags(m);
+ IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
error = (*ifp->if_output)(ifp, m,
- (struct sockaddr *)dst, ro);
+ (const struct sockaddr *)gw, ro);
goto done;
}
/* Balk when DF bit is set or the interface didn't support TSO. */
- if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
+ if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
error = EMSGSIZE;
IPSTAT_INC(ips_cantfrag);
goto bad;
@@ -646,7 +676,7 @@ passout:
* Too large for interface; fragment if possible. If successful,
* on return, m will point to a list of packets to be sent.
*/
- error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum);
+ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
if (error)
goto bad;
for (; m; m = m0) {
@@ -655,17 +685,19 @@ passout:
if (error == 0) {
/* Record statistics for this interface address. */
if (ia != NULL) {
- ia->ia_ifa.if_opackets++;
- ia->ia_ifa.if_obytes += m->m_pkthdr.len;
+ counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
+ counter_u64_add(ia->ia_ifa.ifa_obytes,
+ m->m_pkthdr.len);
}
/*
* Reset layer specific mbuf flags
* to avoid confusing upper layers.
*/
- m->m_flags &= ~(M_PROTOFLAGS);
+ m_clrprotoflags(m);
+ IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
error = (*ifp->if_output)(ifp, m,
- (struct sockaddr *)dst, ro);
+ (const struct sockaddr *)gw, ro);
} else
m_freem(m);
}
@@ -674,9 +706,20 @@ passout:
IPSTAT_INC(ips_fragmented);
done:
- if (ro == &iproute)
+ /*
+ * Release the route if using our private route, or if
+ * (with flowtable) we don't have our own reference.
+ */
+ if (ro == &iproute || ro->ro_flags & RT_NORTREF)
RO_RTFREE(ro);
- if (ia != NULL)
+ else if (rte == NULL)
+ /*
+ * If the caller supplied a route but somehow the reference
+ * to it has been released need to prevent the caller
+ * calling RTFREE on it again.
+ */
+ ro->ro_rt = NULL;
+ if (have_ia_ref)
ifa_free(&ia->ia_ifa);
return (error);
bad:
@@ -691,11 +734,10 @@ bad:
* chain of fragments that should be freed by the caller.
*
* if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
- * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
*/
int
ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
- u_long if_hwassist_flags, int sw_csum)
+ u_long if_hwassist_flags)
{
int error = 0;
int hlen = ip->ip_hl << 2;
@@ -705,8 +747,12 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
int firstlen;
struct mbuf **mnext;
int nfrags;
+ uint16_t ip_len, ip_off;
+
+ ip_len = ntohs(ip->ip_len);
+ ip_off = ntohs(ip->ip_off);
- if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */
+ if (ip_off & IP_DF) { /* Fragmentation not allowed */
IPSTAT_INC(ips_cantfrag);
return EMSGSIZE;
}
@@ -732,10 +778,10 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
}
#endif
if (len > PAGE_SIZE) {
- /*
- * Fragment large datagrams such that each segment
- * contains a multiple of PAGE_SIZE amount of data,
- * plus headers. This enables a receiver to perform
+ /*
+ * Fragment large datagrams such that each segment
+ * contains a multiple of PAGE_SIZE amount of data,
+ * plus headers. This enables a receiver to perform
* page-flipping zero-copy optimizations.
*
* XXX When does this help given that sender and receiver
@@ -747,7 +793,7 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
off = MIN(mtu, m0->m_pkthdr.len);
/*
- * firstlen (off - hlen) must be aligned on an
+ * firstlen (off - hlen) must be aligned on an
* 8-byte boundary
*/
if (off < hlen)
@@ -776,22 +822,30 @@ smart_frag_failure:
* The fragments are linked off the m_nextpkt of the original
* packet, which after processing serves as the first fragment.
*/
- for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
+ for (nfrags = 1; off < ip_len; off += len, nfrags++) {
struct ip *mhip; /* ip header on the fragment */
struct mbuf *m;
int mhlen = sizeof (struct ip);
- MGETHDR(m, M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
IPSTAT_INC(ips_odropped);
goto done;
}
- /* copy multicast and flowid flag, if any */
- m->m_flags |= (m0->m_flags & (M_FLOWID | M_MCAST)) | M_FRAG;
- /* make sure the flowid is the same for the fragmented mbufs */
- M_HASHTYPE_SET(m, M_HASHTYPE_GET(m0));
- m->m_pkthdr.flowid = m0->m_pkthdr.flowid;
+ /*
+ * Make sure the complete packet header gets copied
+ * from the originating mbuf to the newly created
+ * mbuf. This also ensures that existing firewall
+ * classification(s), VLAN tags and so on get copied
+ * to the resulting fragmented packet(s):
+ */
+ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
+ m_free(m);
+ error = ENOBUFS;
+ IPSTAT_INC(ips_odropped);
+ goto done;
+ }
/*
* In the first mbuf, leave room for the link header, then
* copy the original IP header including options. The payload
@@ -806,15 +860,14 @@ smart_frag_failure:
mhip->ip_hl = mhlen >> 2;
}
m->m_len = mhlen;
- /* XXX do we need to add ip->ip_off below ? */
- mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
- if (off + len >= ip->ip_len) { /* last fragment */
- len = ip->ip_len - off;
- m->m_flags |= M_LASTFRAG;
- } else
+ /* XXX do we need to add ip_off below ? */
+ mhip->ip_off = ((off - hlen) >> 3) + ip_off;
+ if (off + len >= ip_len)
+ len = ip_len - off;
+ else
mhip->ip_off |= IP_MF;
mhip->ip_len = htons((u_short)(len + mhlen));
- m->m_next = m_copym(m0, off, len, M_DONTWAIT);
+ m->m_next = m_copym(m0, off, len, M_NOWAIT);
if (m->m_next == NULL) { /* copy failed */
m_free(m);
error = ENOBUFS; /* ??? */
@@ -822,36 +875,33 @@ smart_frag_failure:
goto done;
}
m->m_pkthdr.len = mhlen + len;
- m->m_pkthdr.rcvif = NULL;
#ifdef MAC
mac_netinet_fragment(m0, m);
#endif
- m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
mhip->ip_off = htons(mhip->ip_off);
mhip->ip_sum = 0;
- if (sw_csum & CSUM_DELAY_IP)
+ if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
mhip->ip_sum = in_cksum(m, mhlen);
+ m->m_pkthdr.csum_flags &= ~CSUM_IP;
+ }
*mnext = m;
mnext = &m->m_nextpkt;
}
IPSTAT_ADD(ips_ofragments, nfrags);
- /* set first marker for fragment chain */
- m0->m_flags |= M_FIRSTFRAG | M_FRAG;
- m0->m_pkthdr.csum_data = nfrags;
-
/*
* Update first fragment by trimming what's been copied out
* and updating header.
*/
- m_adj(m0, hlen + firstlen - ip->ip_len);
+ m_adj(m0, hlen + firstlen - ip_len);
m0->m_pkthdr.len = hlen + firstlen;
ip->ip_len = htons((u_short)m0->m_pkthdr.len);
- ip->ip_off |= IP_MF;
- ip->ip_off = htons(ip->ip_off);
+ ip->ip_off = htons(ip_off | IP_MF);
ip->ip_sum = 0;
- if (sw_csum & CSUM_DELAY_IP)
+ if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
ip->ip_sum = in_cksum(m0, hlen);
+ m0->m_pkthdr.csum_flags &= ~CSUM_IP;
+ }
done:
*m_frag = m0;
@@ -862,11 +912,12 @@ void
in_delayed_cksum(struct mbuf *m)
{
struct ip *ip;
- u_short csum, offset;
+ uint16_t csum, offset, ip_len;
ip = mtod(m, struct ip *);
offset = ip->ip_hl << 2 ;
- csum = in_cksum_skip(m, ip->ip_len, offset);
+ ip_len = ntohs(ip->ip_len);
+ csum = in_cksum_skip(m, ip_len, offset);
if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
csum = 0xffff;
offset += m->m_pkthdr.csum_data; /* checksum offset */
@@ -889,6 +940,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
struct inpcb *inp = sotoinpcb(so);
int error, optval;
+#ifdef RSS
+ uint32_t rss_bucket;
+ int retval;
+#endif
error = optval = 0;
if (sopt->sopt_level != IPPROTO_IP) {
@@ -941,7 +996,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
error = EMSGSIZE;
break;
}
- MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
+ m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
break;
@@ -967,6 +1022,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
break;
}
/* FALLTHROUGH */
+ case IP_BINDMULTI:
+#ifdef RSS
+ case IP_RSS_LISTEN_BUCKET:
+#endif
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
@@ -975,10 +1034,13 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
case IP_RECVDSTADDR:
case IP_RECVTTL:
case IP_RECVIF:
- case IP_FAITH:
case IP_ONESBCAST:
case IP_DONTFRAG:
case IP_RECVTOS:
+ case IP_RECVFLOWID:
+#ifdef RSS
+ case IP_RECVRSSBUCKETID:
+#endif
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
@@ -1009,6 +1071,15 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
INP_WUNLOCK(inp); \
} while (0)
+#define OPTSET2(bit, val) do { \
+ INP_WLOCK(inp); \
+ if (val) \
+ inp->inp_flags2 |= bit; \
+ else \
+ inp->inp_flags2 &= ~bit; \
+ INP_WUNLOCK(inp); \
+} while (0)
+
case IP_RECVOPTS:
OPTSET(INP_RECVOPTS);
break;
@@ -1029,10 +1100,6 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
OPTSET(INP_RECVIF);
break;
- case IP_FAITH:
- OPTSET(INP_FAITH);
- break;
-
case IP_ONESBCAST:
OPTSET(INP_ONESBCAST);
break;
@@ -1045,9 +1112,30 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
case IP_RECVTOS:
OPTSET(INP_RECVTOS);
break;
+ case IP_BINDMULTI:
+ OPTSET2(INP_BINDMULTI, optval);
+ break;
+ case IP_RECVFLOWID:
+ OPTSET2(INP_RECVFLOWID, optval);
+ break;
+#ifdef RSS
+ case IP_RSS_LISTEN_BUCKET:
+ if ((optval >= 0) &&
+ (optval < rss_getnumbuckets())) {
+ inp->inp_rss_listen_bucket = optval;
+ OPTSET2(INP_RSS_BUCKET_SET, 1);
+ } else {
+ error = EINVAL;
+ }
+ break;
+ case IP_RECVRSSBUCKETID:
+ OPTSET2(INP_RECVRSSBUCKETID, optval);
+ break;
+#endif
}
break;
#undef OPTSET
+#undef OPTSET2
/*
* Multicast socket options are processed by the in_mcast
@@ -1133,7 +1221,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
case IP_OPTIONS:
case IP_RETOPTS:
if (inp->inp_options)
- error = sooptcopyout(sopt,
+ error = sooptcopyout(sopt,
mtod(inp->inp_options,
char *),
inp->inp_options->m_len);
@@ -1150,11 +1238,18 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
case IP_RECVTTL:
case IP_RECVIF:
case IP_PORTRANGE:
- case IP_FAITH:
case IP_ONESBCAST:
case IP_DONTFRAG:
case IP_BINDANY:
case IP_RECVTOS:
+ case IP_BINDMULTI:
+ case IP_FLOWID:
+ case IP_FLOWTYPE:
+ case IP_RECVFLOWID:
+#ifdef RSS
+ case IP_RSSBUCKETID:
+ case IP_RECVRSSBUCKETID:
+#endif
switch (sopt->sopt_name) {
case IP_TOS:
@@ -1170,6 +1265,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
break;
#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
+#define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0)
case IP_RECVOPTS:
optval = OPTBIT(INP_RECVOPTS);
@@ -1200,10 +1296,6 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
optval = 0;
break;
- case IP_FAITH:
- optval = OPTBIT(INP_FAITH);
- break;
-
case IP_ONESBCAST:
optval = OPTBIT(INP_ONESBCAST);
break;
@@ -1216,6 +1308,32 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
case IP_RECVTOS:
optval = OPTBIT(INP_RECVTOS);
break;
+ case IP_FLOWID:
+ optval = inp->inp_flowid;
+ break;
+ case IP_FLOWTYPE:
+ optval = inp->inp_flowtype;
+ break;
+ case IP_RECVFLOWID:
+ optval = OPTBIT2(INP_RECVFLOWID);
+ break;
+#ifdef RSS
+ case IP_RSSBUCKETID:
+ retval = rss_hash2bucket(inp->inp_flowid,
+ inp->inp_flowtype,
+ &rss_bucket);
+ if (retval == 0)
+ optval = rss_bucket;
+ else
+ error = EINVAL;
+ break;
+ case IP_RECVRSSBUCKETID:
+ optval = OPTBIT2(INP_RECVRSSBUCKETID);
+ break;
+#endif
+ case IP_BINDMULTI:
+ optval = OPTBIT2(INP_BINDMULTI);
+ break;
}
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
@@ -1239,7 +1357,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
caddr_t req = NULL;
size_t len = 0;
- if (m != 0) {
+ if (m != NULL) {
req = mtod(m, caddr_t);
len = m->m_len;
}
@@ -1269,18 +1387,17 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
* replicating that code here.
*/
static void
-ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
- int hlen)
+ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
{
- register struct ip *ip;
+ struct ip *ip;
struct mbuf *copym;
/*
* Make a deep copy of the packet because we're going to
* modify the pack in order to generate checksums.
*/
- copym = m_dup(m, M_DONTWAIT);
- if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
+ copym = m_dup(m, M_NOWAIT);
+ if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen))
copym = m_pullup(copym, hlen);
if (copym != NULL) {
/* If needed, compute the checksum and mark it as valid. */
@@ -1296,17 +1413,8 @@ ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
* than the interface's MTU. Can this possibly matter?
*/
ip = mtod(copym, struct ip *);
- ip->ip_len = htons(ip->ip_len);
- ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
ip->ip_sum = in_cksum(copym, hlen);
-#if 1 /* XXX */
- if (dst->sin_family != AF_INET) {
- printf("ip_mloopback: bad address family %d\n",
- dst->sin_family);
- dst->sin_family = AF_INET;
- }
-#endif
- if_simloop(ifp, copym, dst->sin_family, 0);
+ if_simloop(ifp, copym, AF_INET, 0);
}
}