diff options
Diffstat (limited to 'freebsd/sys/netinet/ip_output.c')
-rw-r--r-- | freebsd/sys/netinet/ip_output.c | 614 |
1 files changed, 361 insertions, 253 deletions
diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c index a06fed68..81e7b123 100644 --- a/freebsd/sys/netinet/ip_output.c +++ b/freebsd/sys/netinet/ip_output.c @@ -34,27 +34,32 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include <rtems/bsd/local/opt_ipfw.h> +#include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_ipsec.h> -#include <rtems/bsd/local/opt_route.h> #include <rtems/bsd/local/opt_mbuf_stress_test.h> #include <rtems/bsd/local/opt_mpath.h> +#include <rtems/bsd/local/opt_route.h> #include <rtems/bsd/local/opt_sctp.h> +#include <rtems/bsd/local/opt_rss.h> #include <rtems/bsd/sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/priv.h> #include <sys/proc.h> #include <sys/protosw.h> +#include <sys/rmlock.h> +#include <sys/sdt.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sysctl.h> #include <sys/ucred.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_llatbl.h> #include <net/netisr.h> #include <net/pfil.h> @@ -63,12 +68,15 @@ __FBSDID("$FreeBSD$"); #ifdef RADIX_MPATH #include <net/radix_mpath.h> #endif +#include <net/rss_config.h> #include <net/vnet.h> #include <netinet/in.h> +#include <netinet/in_kdtrace.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> +#include <netinet/in_rss.h> #include <netinet/in_var.h> #include <netinet/ip_var.h> #include <netinet/ip_options.h> @@ -86,25 +94,112 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> -VNET_DEFINE(u_short, ip_id); - #ifdef MBUF_STRESS_TEST static int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif -static void ip_mloopback - (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); +static void ip_mloopback(struct ifnet *, const struct mbuf *, int); extern int in_mcast_loop; extern struct protosw inetsw[]; +static inline int +ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp, + struct sockaddr_in *dst, int *fibnum, int *error) +{ + struct m_tag *fwd_tag = NULL; + struct mbuf *m; + struct in_addr odst; + struct ip *ip; + + m = *mp; + ip = mtod(m, struct ip *); + + /* Run through list of hooks for output packets. */ + odst.s_addr = ip->ip_dst.s_addr; + *error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, PFIL_OUT, inp); + m = *mp; + if ((*error) != 0 || m == NULL) + return 1; /* Finished */ + + ip = mtod(m, struct ip *); + + /* See if destination IP address was changed by packet filter. */ + if (odst.s_addr != ip->ip_dst.s_addr) { + m->m_flags |= M_SKIP_FIREWALL; + /* If destination is now ourself drop to ip_input(). */ + if (in_localip(ip->ip_dst)) { + m->m_flags |= M_FASTFWD_OURS; + if (m->m_pkthdr.rcvif == NULL) + m->m_pkthdr.rcvif = V_loif; + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xffff; + } + m->m_pkthdr.csum_flags |= + CSUM_IP_CHECKED | CSUM_IP_VALID; +#ifdef SCTP + if (m->m_pkthdr.csum_flags & CSUM_SCTP) + m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; +#endif + *error = netisr_queue(NETISR_IP, m); + return 1; /* Finished */ + } + + bzero(dst, sizeof(*dst)); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = ip->ip_dst; + + return -1; /* Reloop */ + } + /* See if fib was changed by packet filter. */ + if ((*fibnum) != M_GETFIB(m)) { + m->m_flags |= M_SKIP_FIREWALL; + *fibnum = M_GETFIB(m); + return -1; /* Reloop for FIB change */ + } + + /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ + if (m->m_flags & M_FASTFWD_OURS) { + if (m->m_pkthdr.rcvif == NULL) + m->m_pkthdr.rcvif = V_loif; + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xffff; + } +#ifdef SCTP + if (m->m_pkthdr.csum_flags & CSUM_SCTP) + m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; +#endif + m->m_pkthdr.csum_flags |= + CSUM_IP_CHECKED | CSUM_IP_VALID; + + *error = netisr_queue(NETISR_IP, m); + return 1; /* Finished */ + } + /* Or forward to some other address? */ + if ((m->m_flags & M_IP_NEXTHOP) && + ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) { + bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); + m->m_flags |= M_SKIP_FIREWALL; + m->m_flags &= ~M_IP_NEXTHOP; + m_tag_delete(m, fwd_tag); + + return -1; /* Reloop for CHANGE of dst */ + } + + return 0; +} + /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). - * ip_len and ip_off are in host format. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route ro is present and has ro_rt initialized, route lookup would be @@ -118,20 +213,22 @@ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { + struct rm_priotracker in_ifa_tracker; struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; - int n; /* scratchpad */ int error = 0; struct sockaddr_in *dst; + const struct sockaddr_in *gw; struct in_ifaddr *ia; - int isbroadcast, sw_csum; + int isbroadcast; + uint16_t ip_len, ip_off; struct route iproute; struct rtentry *rte; /* cache for ro->ro_rt */ - struct in_addr odst; - struct m_tag *fwd_tag = NULL; + uint32_t fibnum; + int have_ia_ref; #ifdef IPSEC int no_route_but_check_spd = 0; #endif @@ -140,31 +237,21 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); - if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { + if ((flags & IP_NODEFAULTFLOWID) == 0) { m->m_pkthdr.flowid = inp->inp_flowid; - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, inp->inp_flowtype); } } if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); - } + } else + ro->ro_flags |= RT_LLE_CACHE; #ifdef FLOWTABLE - if (ro->ro_rt == NULL) { - struct flentry *fle; - - /* - * The flow table returns route entries valid for up to 30 - * seconds; we rely on the remainder of ip_output() taking no - * longer than that long for the stability of ro_rt. The - * flow ID assignment must have happened before this point. - */ - fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET); - if (fle != NULL) - flow_to_route(fle, ro); - } + if (ro->ro_rt == NULL) + (void )flowtable_lookup(AF_INET, m, ro); #endif if (opt) { @@ -174,37 +261,49 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); + ip_len = ntohs(ip->ip_len); + ip_off = ntohs(ip->ip_off); - /* - * Fill in IP header. If we are not allowing fragmentation, - * then the ip_id field is meaningless, but we don't set it - * to zero. Doing so causes various problems when devices along - * the path (routers, load balancers, firewalls, etc.) illegally - * disable DF on our packet. Note that a 16-bit counter - * will wrap around in less than 10 seconds at 100 Mbit/s on a - * medium with MTU 1500. See Steven M. Bellovin, "A Technique - * for Counting NATted Hosts", Proc. IMW'02, available at - * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>. - */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; - ip->ip_id = ip_newid(); + ip_fillid(ip); IPSTAT_INC(ips_localout); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } + /* + * dst/gw handling: + * + * dst can be rewritten but always points to &ro->ro_dst. + * gw is readonly but can point either to dst OR rt_gateway, + * therefore we need restore gw if we're redoing lookup. + */ + gw = dst = (struct sockaddr_in *)&ro->ro_dst; + fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); + rte = ro->ro_rt; + if (rte == NULL) { + bzero(dst, sizeof(*dst)); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = ip->ip_dst; + } again: - dst = (struct sockaddr_in *)&ro->ro_dst; - ia = NULL; + /* + * Validate route against routing table additions; + * a better/more specific route might have been added. + */ + if (inp) + RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. + * Also check whether routing cache needs invalidation. */ rte = ro->ro_rt; if (rte && ((rte->rt_flags & RTF_UP) == 0 || @@ -212,16 +311,14 @@ again: !RT_LINK_IS_UP(rte->rt_ifp) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { - RO_RTFREE(ro); - ro->ro_lle = NULL; - rte = NULL; - } - if (rte == NULL && fwd_tag == NULL) { - bzero(dst, sizeof(*dst)); - dst->sin_family = AF_INET; - dst->sin_len = sizeof(*dst); - dst->sin_addr = ip->ip_dst; + RTFREE(rte); + rte = ro->ro_rt = (struct rtentry *)NULL; + if (ro->ro_lle) + LLE_FREE(ro->ro_lle); /* zeros ro_lle */ + ro->ro_lle = (struct llentry *)NULL; } + ia = NULL; + have_ia_ref = 0; /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an @@ -229,27 +326,33 @@ again: * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { - if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && - (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { + if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst), + M_GETFIB(m)))) == NULL && + (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), + M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } + have_ia_ref = 1; ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = 1; } else if (flags & IP_ROUTETOIF) { - if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && - (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0))) == NULL) { + if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), + M_GETFIB(m)))) == NULL && + (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0, + M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } + have_ia_ref = 1; ifp = ia->ia_ifp; ip->ip_ttl = 1; - isbroadcast = in_broadcast(dst->sin_addr, ifp); + isbroadcast = in_ifaddr_broadcast(dst->sin_addr, ia); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* @@ -257,7 +360,9 @@ again: * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; - IFP_TO_IA(ifp, ia); + IFP_TO_IA(ifp, ia, &in_ifa_tracker); + if (ia) + have_ia_ref = 1; isbroadcast = 0; /* fool gcc */ } else { /* @@ -269,14 +374,14 @@ again: #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), - inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); + fibnum); #else - in_rtalloc_ign(ro, 0, - inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); + in_rtalloc_ign(ro, 0, fibnum); #endif rte = ro->ro_rt; } if (rte == NULL || + (rte->rt_flags & RTF_UP) == 0 || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) { #ifdef IPSEC @@ -293,45 +398,37 @@ again: goto bad; } ia = ifatoia(rte->rt_ifa); - ifa_ref(&ia->ia_ifa); ifp = rte->rt_ifp; - rte->rt_rmx.rmx_pksent++; + counter_u64_add(rte->rt_pksent, 1); + rt_update_ro_flags(ro); if (rte->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in *)rte->rt_gateway; + gw = (struct sockaddr_in *)rte->rt_gateway; if (rte->rt_flags & RTF_HOST) isbroadcast = (rte->rt_flags & RTF_BROADCAST); else - isbroadcast = in_broadcast(dst->sin_addr, ifp); + isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia); } + /* * Calculate MTU. If we have a route that is up, use that, * otherwise use the interface's MTU. */ - if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) { - /* - * This case can happen if the user changed the MTU - * of an interface after enabling IP on it. Because - * most netifs don't keep track of routes pointing to - * them, there is no way for one to update all its - * routes when the MTU is changed. - */ - if (rte->rt_rmx.rmx_mtu > ifp->if_mtu) - rte->rt_rmx.rmx_mtu = ifp->if_mtu; - mtu = rte->rt_rmx.rmx_mtu; - } else { + if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) + mtu = rte->rt_mtu; + else mtu = ifp->if_mtu; - } /* Catch a possible divide by zero later. */ KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p", __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp)); + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* - * IP destination address is multicast. Make sure "dst" + * IP destination address is multicast. Make sure "gw" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ - dst = (struct sockaddr_in *)&ro->ro_dst; + gw = dst; /* * See if the caller provided any multicast options */ @@ -373,7 +470,7 @@ again: * thus deferring a hash lookup and mutex acquisition * at the expense of a cheap copy using m_copym(). */ - ip_mloopback(ifp, m, dst, hlen); + ip_mloopback(ifp, m, hlen); } else { /* * If we are acting as a multicast router, perform @@ -433,23 +530,6 @@ again: } /* - * Verify that we have any chance at all of being able to queue the - * packet or packet fragments, unless ALTQ is enabled on the given - * interface in which case packetdrop should be done by queueing. - */ - n = ip->ip_len / mtu + 1; /* how many fragments ? */ - if ( -#ifdef ALTQ - (!ALTQ_IS_ENABLED(&ifp->if_snd)) && -#endif /* ALTQ */ - (ifp->if_snd.ifq_len + n) >= ifp->if_snd.ifq_maxlen ) { - error = ENOBUFS; - IPSTAT_INC(ips_odropped); - ifp->if_snd.ifq_drops += n; - goto bad; - } - - /* * Look for broadcast address and * verify user is allowed to send * such a packet. @@ -464,7 +544,7 @@ again: goto bad; } /* don't allow broadcast messages to be fragmented */ - if (ip->ip_len > mtu) { + if (ip_len > mtu) { error = EMSGSIZE; goto bad; } @@ -475,7 +555,7 @@ again: sendit: #ifdef IPSEC - switch(ip_ipsec_output(&m, inp, &flags, &error)) { + switch(ip_ipsec_output(&m, inp, &error)) { case 1: goto bad; case -1: @@ -498,78 +578,29 @@ sendit: #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ - if (!PFIL_HOOKED(&V_inet_pfil_hook)) - goto passout; - - /* Run through list of hooks for output packets. */ - odst.s_addr = ip->ip_dst.s_addr; - error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp); - if (error != 0 || m == NULL) - goto done; + if (PFIL_HOOKED(&V_inet_pfil_hook)) { + switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) { + case 1: /* Finished */ + goto done; - ip = mtod(m, struct ip *); + case 0: /* Continue normally */ + ip = mtod(m, struct ip *); + break; - /* See if destination IP address was changed by packet filter. */ - if (odst.s_addr != ip->ip_dst.s_addr) { - m->m_flags |= M_SKIP_FIREWALL; - /* If destination is now ourself drop to ip_input(). */ - if (in_localip(ip->ip_dst)) { - m->m_flags |= M_FASTFWD_OURS; - if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = V_loif; - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - m->m_pkthdr.csum_flags |= - CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - m->m_pkthdr.csum_data = 0xffff; - } - m->m_pkthdr.csum_flags |= - CSUM_IP_CHECKED | CSUM_IP_VALID; -#ifdef SCTP - if (m->m_pkthdr.csum_flags & CSUM_SCTP) - m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; -#endif - error = netisr_queue(NETISR_IP, m); - goto done; - } else { - if (ia != NULL) + case -1: /* Need to try again */ + /* Reset everything for a new round */ + RO_RTFREE(ro); + if (have_ia_ref) ifa_free(&ia->ia_ifa); - goto again; /* Redo the routing table lookup. */ - } - } + ro->ro_prepend = NULL; + rte = NULL; + gw = dst; + ip = mtod(m, struct ip *); + goto again; - /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ - if (m->m_flags & M_FASTFWD_OURS) { - if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = V_loif; - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - m->m_pkthdr.csum_flags |= - CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - m->m_pkthdr.csum_data = 0xffff; } -#ifdef SCTP - if (m->m_pkthdr.csum_flags & CSUM_SCTP) - m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; -#endif - m->m_pkthdr.csum_flags |= - CSUM_IP_CHECKED | CSUM_IP_VALID; - - error = netisr_queue(NETISR_IP, m); - goto done; - } - /* Or forward to some other address? */ - if ((m->m_flags & M_IP_NEXTHOP) && - (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { - dst = (struct sockaddr_in *)&ro->ro_dst; - bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); - m->m_flags |= M_SKIP_FIREWALL; - m->m_flags &= ~M_IP_NEXTHOP; - m_tag_delete(m, fwd_tag); - if (ia != NULL) - ifa_free(&ia->ia_ifa); - goto again; } -passout: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { @@ -581,31 +612,28 @@ passout: } m->m_pkthdr.csum_flags |= CSUM_IP; - sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; - if (sw_csum & CSUM_DELAY_DATA) { + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m); - sw_csum &= ~CSUM_DELAY_DATA; + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP - if (sw_csum & CSUM_SCTP) { + if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); - sw_csum &= ~CSUM_SCTP; + m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif - m->m_pkthdr.csum_flags &= ifp->if_hwassist; /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ - if (ip->ip_len <= mtu || - (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || - ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); + if (ip_len <= mtu || + (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; - if (sw_csum & CSUM_DELAY_IP) + if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); + m->m_pkthdr.csum_flags &= ~CSUM_IP; + } /* * Record statistics for this interface address. @@ -615,28 +643,30 @@ passout: */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) - ia->ia_ifa.if_opackets += - m->m_pkthdr.len / m->m_pkthdr.tso_segsz; + counter_u64_add(ia->ia_ifa.ifa_opackets, + m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else - ia->ia_ifa.if_opackets++; - ia->ia_ifa.if_obytes += m->m_pkthdr.len; + counter_u64_add(ia->ia_ifa.ifa_opackets, 1); + + counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) - m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); + m = m_fragment(m, M_NOWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ - m->m_flags &= ~(M_PROTOFLAGS); + m_clrprotoflags(m); + IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, - (struct sockaddr *)dst, ro); + (const struct sockaddr *)gw, ro); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ - if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { + if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; @@ -646,7 +676,7 @@ passout: * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ - error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); + error = ip_fragment(ip, &m, mtu, ifp->if_hwassist); if (error) goto bad; for (; m; m = m0) { @@ -655,17 +685,19 @@ passout: if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { - ia->ia_ifa.if_opackets++; - ia->ia_ifa.if_obytes += m->m_pkthdr.len; + counter_u64_add(ia->ia_ifa.ifa_opackets, 1); + counter_u64_add(ia->ia_ifa.ifa_obytes, + m->m_pkthdr.len); } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ - m->m_flags &= ~(M_PROTOFLAGS); + m_clrprotoflags(m); + IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, - (struct sockaddr *)dst, ro); + (const struct sockaddr *)gw, ro); } else m_freem(m); } @@ -674,9 +706,20 @@ passout: IPSTAT_INC(ips_fragmented); done: - if (ro == &iproute) + /* + * Release the route if using our private route, or if + * (with flowtable) we don't have our own reference. + */ + if (ro == &iproute || ro->ro_flags & RT_NORTREF) RO_RTFREE(ro); - if (ia != NULL) + else if (rte == NULL) + /* + * If the caller supplied a route but somehow the reference + * to it has been released need to prevent the caller + * calling RTFREE on it again. + */ + ro->ro_rt = NULL; + if (have_ia_ref) ifa_free(&ia->ia_ifa); return (error); bad: @@ -691,11 +734,10 @@ bad: * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) - * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, - u_long if_hwassist_flags, int sw_csum) + u_long if_hwassist_flags) { int error = 0; int hlen = ip->ip_hl << 2; @@ -705,8 +747,12 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, int firstlen; struct mbuf **mnext; int nfrags; + uint16_t ip_len, ip_off; + + ip_len = ntohs(ip->ip_len); + ip_off = ntohs(ip->ip_off); - if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ + if (ip_off & IP_DF) { /* Fragmentation not allowed */ IPSTAT_INC(ips_cantfrag); return EMSGSIZE; } @@ -732,10 +778,10 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, } #endif if (len > PAGE_SIZE) { - /* - * Fragment large datagrams such that each segment - * contains a multiple of PAGE_SIZE amount of data, - * plus headers. This enables a receiver to perform + /* + * Fragment large datagrams such that each segment + * contains a multiple of PAGE_SIZE amount of data, + * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver @@ -747,7 +793,7 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, off = MIN(mtu, m0->m_pkthdr.len); /* - * firstlen (off - hlen) must be aligned on an + * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) @@ -776,22 +822,30 @@ smart_frag_failure: * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ - for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { + for (nfrags = 1; off < ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); - MGETHDR(m, M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } - /* copy multicast and flowid flag, if any */ - m->m_flags |= (m0->m_flags & (M_FLOWID | M_MCAST)) | M_FRAG; - /* make sure the flowid is the same for the fragmented mbufs */ - M_HASHTYPE_SET(m, M_HASHTYPE_GET(m0)); - m->m_pkthdr.flowid = m0->m_pkthdr.flowid; + /* + * Make sure the complete packet header gets copied + * from the originating mbuf to the newly created + * mbuf. This also ensures that existing firewall + * classification(s), VLAN tags and so on get copied + * to the resulting fragmented packet(s): + */ + if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { + m_free(m); + error = ENOBUFS; + IPSTAT_INC(ips_odropped); + goto done; + } /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload @@ -806,15 +860,14 @@ smart_frag_failure: mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; - /* XXX do we need to add ip->ip_off below ? */ - mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; - if (off + len >= ip->ip_len) { /* last fragment */ - len = ip->ip_len - off; - m->m_flags |= M_LASTFRAG; - } else + /* XXX do we need to add ip_off below ? */ + mhip->ip_off = ((off - hlen) >> 3) + ip_off; + if (off + len >= ip_len) + len = ip_len - off; + else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); - m->m_next = m_copym(m0, off, len, M_DONTWAIT); + m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ @@ -822,36 +875,33 @@ smart_frag_failure: goto done; } m->m_pkthdr.len = mhlen + len; - m->m_pkthdr.rcvif = NULL; #ifdef MAC mac_netinet_fragment(m0, m); #endif - m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; - if (sw_csum & CSUM_DELAY_IP) + if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { mhip->ip_sum = in_cksum(m, mhlen); + m->m_pkthdr.csum_flags &= ~CSUM_IP; + } *mnext = m; mnext = &m->m_nextpkt; } IPSTAT_ADD(ips_ofragments, nfrags); - /* set first marker for fragment chain */ - m0->m_flags |= M_FIRSTFRAG | M_FRAG; - m0->m_pkthdr.csum_data = nfrags; - /* * Update first fragment by trimming what's been copied out * and updating header. */ - m_adj(m0, hlen + firstlen - ip->ip_len); + m_adj(m0, hlen + firstlen - ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); - ip->ip_off |= IP_MF; - ip->ip_off = htons(ip->ip_off); + ip->ip_off = htons(ip_off | IP_MF); ip->ip_sum = 0; - if (sw_csum & CSUM_DELAY_IP) + if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { ip->ip_sum = in_cksum(m0, hlen); + m0->m_pkthdr.csum_flags &= ~CSUM_IP; + } done: *m_frag = m0; @@ -862,11 +912,12 @@ void in_delayed_cksum(struct mbuf *m) { struct ip *ip; - u_short csum, offset; + uint16_t csum, offset, ip_len; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; - csum = in_cksum_skip(m, ip->ip_len, offset); + ip_len = ntohs(ip->ip_len); + csum = in_cksum_skip(m, ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ @@ -889,6 +940,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; +#ifdef RSS + uint32_t rss_bucket; + int retval; +#endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { @@ -941,7 +996,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) error = EMSGSIZE; break; } - MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); + m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; @@ -967,6 +1022,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) break; } /* FALLTHROUGH */ + case IP_BINDMULTI: +#ifdef RSS + case IP_RSS_LISTEN_BUCKET: +#endif case IP_TOS: case IP_TTL: case IP_MINTTL: @@ -975,10 +1034,13 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: - case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: + case IP_RECVFLOWID: +#ifdef RSS + case IP_RECVRSSBUCKETID: +#endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) @@ -1009,6 +1071,15 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) INP_WUNLOCK(inp); \ } while (0) +#define OPTSET2(bit, val) do { \ + INP_WLOCK(inp); \ + if (val) \ + inp->inp_flags2 |= bit; \ + else \ + inp->inp_flags2 &= ~bit; \ + INP_WUNLOCK(inp); \ +} while (0) + case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; @@ -1029,10 +1100,6 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) OPTSET(INP_RECVIF); break; - case IP_FAITH: - OPTSET(INP_FAITH); - break; - case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; @@ -1045,9 +1112,30 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVTOS: OPTSET(INP_RECVTOS); break; + case IP_BINDMULTI: + OPTSET2(INP_BINDMULTI, optval); + break; + case IP_RECVFLOWID: + OPTSET2(INP_RECVFLOWID, optval); + break; +#ifdef RSS + case IP_RSS_LISTEN_BUCKET: + if ((optval >= 0) && + (optval < rss_getnumbuckets())) { + inp->inp_rss_listen_bucket = optval; + OPTSET2(INP_RSS_BUCKET_SET, 1); + } else { + error = EINVAL; + } + break; + case IP_RECVRSSBUCKETID: + OPTSET2(INP_RECVRSSBUCKETID, optval); + break; +#endif } break; #undef OPTSET +#undef OPTSET2 /* * Multicast socket options are processed by the in_mcast @@ -1133,7 +1221,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) - error = sooptcopyout(sopt, + error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); @@ -1150,11 +1238,18 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: - case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: + case IP_BINDMULTI: + case IP_FLOWID: + case IP_FLOWTYPE: + case IP_RECVFLOWID: +#ifdef RSS + case IP_RSSBUCKETID: + case IP_RECVRSSBUCKETID: +#endif switch (sopt->sopt_name) { case IP_TOS: @@ -1170,6 +1265,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) +#define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); @@ -1200,10 +1296,6 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) optval = 0; break; - case IP_FAITH: - optval = OPTBIT(INP_FAITH); - break; - case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; @@ -1216,6 +1308,32 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVTOS: optval = OPTBIT(INP_RECVTOS); break; + case IP_FLOWID: + optval = inp->inp_flowid; + break; + case IP_FLOWTYPE: + optval = inp->inp_flowtype; + break; + case IP_RECVFLOWID: + optval = OPTBIT2(INP_RECVFLOWID); + break; +#ifdef RSS + case IP_RSSBUCKETID: + retval = rss_hash2bucket(inp->inp_flowid, + inp->inp_flowtype, + &rss_bucket); + if (retval == 0) + optval = rss_bucket; + else + error = EINVAL; + break; + case IP_RECVRSSBUCKETID: + optval = OPTBIT2(INP_RECVRSSBUCKETID); + break; +#endif + case IP_BINDMULTI: + optval = OPTBIT2(INP_BINDMULTI); + break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; @@ -1239,7 +1357,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) caddr_t req = NULL; size_t len = 0; - if (m != 0) { + if (m != NULL) { req = mtod(m, caddr_t); len = m->m_len; } @@ -1269,18 +1387,17 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) * replicating that code here. */ static void -ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, - int hlen) +ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen) { - register struct ip *ip; + struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the pack in order to generate checksums. */ - copym = m_dup(m, M_DONTWAIT); - if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) + copym = m_dup(m, M_NOWAIT); + if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ @@ -1296,17 +1413,8 @@ ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); -#if 1 /* XXX */ - if (dst->sin_family != AF_INET) { - printf("ip_mloopback: bad address family %d\n", - dst->sin_family); - dst->sin_family = AF_INET; - } -#endif - if_simloop(ifp, copym, dst->sin_family, 0); + if_simloop(ifp, copym, AF_INET, 0); } } |