diff options
Diffstat (limited to 'freebsd/sys/netinet/udp_usrreq.c')
-rw-r--r-- | freebsd/sys/netinet/udp_usrreq.c | 584 |
1 files changed, 459 insertions, 125 deletions
diff --git a/freebsd/sys/netinet/udp_usrreq.c b/freebsd/sys/netinet/udp_usrreq.c index bf95e954..7eb11648 100644 --- a/freebsd/sys/netinet/udp_usrreq.c +++ b/freebsd/sys/netinet/udp_usrreq.c @@ -5,6 +5,7 @@ * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. + * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under @@ -44,10 +45,10 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include <rtems/bsd/local/opt_ipfw.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_ipsec.h> +#include <rtems/bsd/local/opt_rss.h> #include <rtems/bsd/sys/param.h> #include <sys/domain.h> @@ -60,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include <sys/priv.h> #include <sys/proc.h> #include <sys/protosw.h> +#include <sys/sdt.h> #include <sys/signalvar.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -71,9 +73,12 @@ __FBSDID("$FreeBSD$"); #include <vm/uma.h> #include <net/if.h> +#include <net/if_var.h> #include <net/route.h> +#include <net/rss_config.h> #include <netinet/in.h> +#include <netinet/in_kdtrace.h> #include <netinet/in_pcb.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> @@ -90,6 +95,8 @@ __FBSDID("$FreeBSD$"); #endif #include <netinet/udp.h> #include <netinet/udp_var.h> +#include <netinet/udplite.h> +#include <netinet/in_rss.h> #ifdef IPSEC #include <netipsec/ipsec.h> @@ -101,8 +108,9 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> /* - * UDP protocol implementation. + * UDP and UDP-Lite protocols implementation. * Per RFC 768, August, 1980. + * Per RFC 3828, July, 2004. */ /* @@ -112,7 +120,7 @@ __FBSDID("$FreeBSD$"); * cause problems (especially for NFS data blocks). */ VNET_DEFINE(int, udp_cksum) = 1; -SYSCTL_VNET_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_cksum), 0, "compute udp checksum"); int udp_log_in_vain = 0; @@ -120,12 +128,17 @@ SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &udp_log_in_vain, 0, "Log all incoming UDP packets"); VNET_DEFINE(int, udp_blackhole) = 0; -SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_blackhole), 0, "Do not send port unreachables for refused connects"); +static VNET_DEFINE(int, udp_require_l2_bcast) = 0; +#define V_udp_require_l2_bcast VNET(udp_require_l2_bcast) +SYSCTL_INT(_net_inet_udp, OID_AUTO, require_l2_bcast, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(udp_require_l2_bcast), 0, + "Only treat packets sent to an L2 broadcast address as broadcast packets"); + u_long udp_sendspace = 9216; /* really max datagram size */ - /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); @@ -135,13 +148,15 @@ u_long udp_recvspace = 40 * (1024 + #else sizeof(struct sockaddr_in) #endif - ); + ); /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */ VNET_DEFINE(struct inpcbinfo, udbinfo); +VNET_DEFINE(struct inpcbhead, ulitecb); +VNET_DEFINE(struct inpcbinfo, ulitecbinfo); static VNET_DEFINE(uma_zone_t, udpcb_zone); #define V_udpcb_zone VNET(udpcb_zone) @@ -149,11 +164,14 @@ static VNET_DEFINE(uma_zone_t, udpcb_zone); #define UDBHASHSIZE 128 #endif -VNET_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ -SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, - &VNET_NAME(udpstat), udpstat, - "UDP statistics (struct udpstat, netinet/udp_var.h)"); +VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ +VNET_PCPUSTAT_SYSINIT(udpstat); +SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat, + udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); +#ifdef VIMAGE +VNET_PCPUSTAT_SYSUNINIT(udpstat); +#endif /* VIMAGE */ #ifdef INET static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, @@ -187,20 +205,47 @@ udp_inpcb_init(void *mem, int size, int flags) return (0); } +static int +udplite_inpcb_init(void *mem, int size, int flags) +{ + struct inpcb *inp; + + inp = mem; + INP_LOCK_INIT(inp, "inp", "udpliteinp"); + return (0); +} + void udp_init(void) { + /* + * For now default to 2-tuple UDP hashing - until the fragment + * reassembly code can also update the flowid. + * + * Once we can calculate the flowid that way and re-establish + * a 4-tuple, flip this to 4-tuple. + */ in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, - "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, + "udp_inpcb", udp_inpcb_init, NULL, 0, IPI_HASHFIELDS_2TUPLE); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_udpcb_zone, maxsockets); + uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } +void +udplite_init(void) +{ + + in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE, + UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL, + 0, IPI_HASHFIELDS_2TUPLE); +} + /* * Kernel module interface for updating udpstat. The argument is an index * into udpstat treated as an array of u_long. While this encodes the @@ -212,7 +257,7 @@ void kmod_udpstat_inc(int statnum) { - (*((u_long *)&V_udpstat + statnum))++; + counter_u64_add(VNET(udpstat)[statnum], 1); } int @@ -235,13 +280,23 @@ udp_discardcb(struct udpcb *up) } #ifdef VIMAGE -void -udp_destroy(void) +static void +udp_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_udbinfo); uma_zdestroy(V_udpcb_zone); } +VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL); + +static void +udplite_destroy(void *unused __unused) +{ + + in_pcbinfo_destroy(&V_ulitecbinfo); +} +VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udplite_destroy, + NULL); #endif #ifdef INET @@ -251,14 +306,23 @@ udp_destroy(void) * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. + * + * In the normal case udp_append() will return 0, indicating that you + * must unlock the inp. However if a tunneling protocol is in place we increment + * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we + * then decrement the reference count. If the inp_rele returns 1, indicating the + * inp is gone, we return that to the caller to tell them *not* to unlock + * the inp. In the case of multi-cast this will cause the distribution + * to stop (though most tunneling protocols known currently do *not* use + * multicast). */ -static void +static int udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; - struct mbuf *opts = 0; + struct mbuf *opts = NULL; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif @@ -271,21 +335,21 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { - (*up->u_tun_func)(n, off, inp); - return; + in_pcbref(inp); + INP_RUNLOCK(inp); + (*up->u_tun_func)(n, off, inp, (struct sockaddr *)udp_in, + up->u_tun_ctx); + INP_RLOCK(inp); + return (in_pcbrele_rlocked(inp)); } - if (n == NULL) - return; - off += sizeof(struct udphdr); #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec4_in_reject(n, inp)) { m_freem(n); - IPSECSTAT_INC(in_polvio); - return; + return (0); } #ifdef IPSEC_NAT_T up = intoudpcb(inp); @@ -293,14 +357,14 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, if (up->u_flags & UF_ESPINUDP_ALL) { /* IPSec UDP encaps. */ n = udp4_espdecap(inp, n, off); if (n == NULL) /* Consumed. */ - return; + return (0); } #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); - return; + return (0); } #endif /* MAC */ if (inp->inp_flags & INP_CONTROLOPTS || @@ -334,22 +398,28 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); + return (0); } -void -udp_input(struct mbuf *m, int off) +int +udp_input(struct mbuf **mp, int *offp, int proto) { - int iphlen = off; struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; - int len; + uint16_t len, ip_len; + struct inpcbinfo *pcbinfo; struct ip save_ip; struct sockaddr_in udp_in; + struct mbuf *m; struct m_tag *fwd_tag; + int cscov_partial, iphlen; + m = *mp; + iphlen = *offp; ifp = m->m_pkthdr.rcvif; + *mp = NULL; UDPSTAT_INC(udps_ipackets); /* @@ -358,7 +428,7 @@ udp_input(struct mbuf *m, int off) * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { - ip_stripoptions(m, (struct mbuf *)0); + ip_stripoptions(m); iphlen = sizeof(struct ip); } @@ -367,13 +437,14 @@ udp_input(struct mbuf *m, int off) */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { - if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { + if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) { UDPSTAT_INC(udps_hdrops); - return; + return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); + cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0; /* * Destination port of 0 is illegal, based on RFC768. @@ -396,13 +467,20 @@ udp_input(struct mbuf *m, int off) * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); - if (ip->ip_len != len) { - if (len > ip->ip_len || len < sizeof(struct udphdr)) { + ip_len = ntohs(ip->ip_len) - iphlen; + if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) { + /* Zero means checksum over the complete packet. */ + if (len == 0) + len = ip_len; + cscov_partial = 0; + } + if (ip_len != len) { + if (len > ip_len || len < sizeof(struct udphdr)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } - m_adj(m, len - ip->ip_len); - /* ip->ip_len = len; */ + if (proto == IPPROTO_UDP) + m_adj(m, len - ip_len); } /* @@ -420,39 +498,53 @@ udp_input(struct mbuf *m, int off) if (uh->uh_sum) { u_short uh_sum; - if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) && + !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + - m->m_pkthdr.csum_data + IPPROTO_UDP)); + m->m_pkthdr.csum_data + proto)); uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); - ((struct ipovly *)ip)->ih_len = uh->uh_ulen; + ((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ? + uh->uh_ulen : htons(ip_len); uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh_sum) { UDPSTAT_INC(udps_badsum); m_freem(m); - return; + return (IPPROTO_DONE); } - } else - UDPSTAT_INC(udps_nosum); + } else { + if (proto == IPPROTO_UDP) { + UDPSTAT_INC(udps_nosum); + } else { + /* UDPLite requires a checksum */ + /* XXX: What is the right UDPLite MIB counter here? */ + m_freem(m); + return (IPPROTO_DONE); + } + } + pcbinfo = udp_get_inpcbinfo(proto); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || - in_broadcast(ip->ip_dst, ifp)) { + ((!V_udp_require_l2_bcast || m->m_flags & M_BCAST) && + in_broadcast(ip->ip_dst, ifp))) { struct inpcb *last; + struct inpcbhead *pcblist; struct ip_moptions *imo; - INP_INFO_RLOCK(&V_udbinfo); + INP_INFO_RLOCK(pcbinfo); + pcblist = udp_get_pcblist(proto); last = NULL; - LIST_FOREACH(inp, &V_udb, inp_list) { + LIST_FOREACH(inp, pcblist, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 @@ -511,8 +603,14 @@ udp_input(struct mbuf *m, int off) if (last != NULL) { struct mbuf *n; - n = m_copy(m, 0, M_COPYALL); - udp_append(last, ip, n, iphlen, &udp_in); + if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { + UDP_PROBE(receive, NULL, last, ip, + last, uh); + if (udp_append(last, ip, n, iphlen, + &udp_in)) { + goto inp_lost; + } + } INP_RUNLOCK(last); } last = inp; @@ -538,13 +636,15 @@ udp_input(struct mbuf *m, int off) UDPSTAT_INC(udps_noportbcast); if (inp) INP_RUNLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); + INP_INFO_RUNLOCK(pcbinfo); goto badunlocked; } - udp_append(last, ip, m, iphlen, &udp_in); - INP_RUNLOCK(last); - INP_INFO_RUNLOCK(&V_udbinfo); - return; + UDP_PROBE(receive, NULL, last, ip, last, uh); + if (udp_append(last, ip, m, iphlen, &udp_in) == 0) + INP_RUNLOCK(last); + inp_lost: + INP_INFO_RUNLOCK(pcbinfo); + return (IPPROTO_DONE); } /* @@ -564,7 +664,7 @@ udp_input(struct mbuf *m, int off) * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ - inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport, + inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m); if (!inp) { /* @@ -572,7 +672,7 @@ udp_input(struct mbuf *m, int off) * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ - inp = in_pcblookup(&V_udbinfo, ip->ip_src, + inp = in_pcblookup(pcbinfo, ip->ip_src, uh->uh_sport, next_hop->sin_addr, next_hop->sin_port ? htons(next_hop->sin_port) : uh->uh_dport, INPLOOKUP_WILDCARD | @@ -582,7 +682,7 @@ udp_input(struct mbuf *m, int off) m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } else - inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport, + inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp, m); if (inp == NULL) { @@ -605,9 +705,8 @@ udp_input(struct mbuf *m, int off) if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badunlocked; *ip = save_ip; - ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); - return; + return (IPPROTO_DONE); } /* @@ -617,14 +716,27 @@ udp_input(struct mbuf *m, int off) if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); m_freem(m); - return; + return (IPPROTO_DONE); + } + if (cscov_partial) { + struct udpcb *up; + + up = intoudpcb(inp); + if (up->u_rxcslen == 0 || up->u_rxcslen > len) { + INP_RUNLOCK(inp); + m_freem(m); + return (IPPROTO_DONE); + } } - udp_append(inp, ip, m, iphlen, &udp_in); - INP_RUNLOCK(inp); - return; + + UDP_PROBE(receive, NULL, inp, ip, inp, uh); + if (udp_append(inp, ip, m, iphlen, &udp_in) == 0) + INP_RUNLOCK(inp); + return (IPPROTO_DONE); badunlocked: m_freem(m); + return (IPPROTO_DONE); } #endif /* INET */ @@ -643,6 +755,11 @@ udp_notify(struct inpcb *inp, int errno) * or a write lock, but a read lock is sufficient. */ INP_LOCK_ASSERT(inp); + if ((errno == EHOSTUNREACH || errno == ENETUNREACH || + errno == EHOSTDOWN) && inp->inp_route.ro_rt) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)NULL; + } inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); @@ -651,8 +768,9 @@ udp_notify(struct inpcb *inp, int errno) } #ifdef INET -void -udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) +static void +udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip, + struct inpcbinfo *pcbinfo) { struct ip *ip = vip; struct udphdr *uh; @@ -663,11 +781,11 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; - /* - * Redirects don't need to be handled up here. - */ - if (PRC_IS_REDIRECT(cmd)) + if (PRC_IS_REDIRECT(cmd)) { + /* signal EHOSTDOWN, as it flushes the cached route */ + in_pcbnotifyall(&V_udbinfo, faddr, EHOSTDOWN, udp_notify); return; + } /* * Hostdead is ugly because it goes linearly through all PCBs. @@ -681,7 +799,7 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - inp = in_pcblookup(&V_udbinfo, faddr, uh->uh_dport, + inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); @@ -689,11 +807,39 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); + } else { + inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, + ip->ip_src, uh->uh_sport, + INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); + if (inp != NULL) { + struct udpcb *up; + + up = intoudpcb(inp); + if (up->u_icmp_func != NULL) { + INP_RUNLOCK(inp); + (*up->u_icmp_func)(cmd, sa, vip, up->u_tun_ctx); + } else { + INP_RUNLOCK(inp); + } + } } } else - in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd], + in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd], udp_notify); } +void +udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) +{ + + return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo)); +} + +void +udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip) +{ + + return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo)); +} #endif /* INET */ static int @@ -740,7 +886,7 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == 0) + if (inp_list == NULL) return (ENOMEM); INP_INFO_RLOCK(&V_udbinfo); @@ -849,16 +995,16 @@ SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, int udp_ctloutput(struct socket *so, struct sockopt *sopt) { - int error = 0, optval; struct inpcb *inp; -#ifdef IPSEC_NAT_T struct udpcb *up; -#endif + int isudplite, error, optval; + error = 0; + isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); - if (sopt->sopt_level != IPPROTO_UDP) { + if (sopt->sopt_level != so->so_proto->pr_protocol) { #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) { INP_WUNLOCK(inp); @@ -916,6 +1062,34 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt) } INP_WUNLOCK(inp); break; + case UDPLITE_SEND_CSCOV: + case UDPLITE_RECV_CSCOV: + if (!isudplite) { + INP_WUNLOCK(inp); + error = ENOPROTOOPT; + break; + } + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error != 0) + break; + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); + INP_WLOCK(inp); + up = intoudpcb(inp); + KASSERT(up != NULL, ("%s: up == NULL", __func__)); + if ((optval != 0 && optval < 8) || (optval > 65535)) { + INP_WUNLOCK(inp); + error = EINVAL; + break; + } + if (sopt->sopt_name == UDPLITE_SEND_CSCOV) + up->u_txcslen = optval; + else + up->u_rxcslen = optval; + INP_WUNLOCK(inp); + break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; @@ -933,6 +1107,22 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt) error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif + case UDPLITE_SEND_CSCOV: + case UDPLITE_RECV_CSCOV: + if (!isudplite) { + INP_WUNLOCK(inp); + error = ENOPROTOOPT; + break; + } + up = intoudpcb(inp); + KASSERT(up != NULL, ("%s: up == NULL", __func__)); + if (sopt->sopt_name == UDPLITE_SEND_CSCOV) + optval = up->u_txcslen; + else + optval = up->u_rxcslen; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof(optval)); + break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; @@ -955,12 +1145,18 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; + struct inpcbinfo *pcbinfo; struct sockaddr_in *sin, src; + int cscov_partial = 0; int error = 0; int ipflags; u_short fport, lport; - int unlock_udbinfo; + int unlock_udbinfo, unlock_inp; u_char tos; + uint8_t pr; + uint16_t cscov = 0; + uint32_t flowid = 0; + uint8_t flowtype = M_HASHTYPE_NONE; /* * udp_output() may need to temporarily bind or connect the current @@ -976,7 +1172,15 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, } src.sin_family = 0; - INP_RLOCK(inp); + sin = (struct sockaddr_in *)addr; + if (sin == NULL || + (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { + INP_WLOCK(inp); + unlock_inp = UH_WLOCKED; + } else { + INP_RLOCK(inp); + unlock_inp = UH_RLOCKED; + } tos = inp->inp_ip_tos; if (control != NULL) { /* @@ -984,7 +1188,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * stored in a single mbuf. */ if (control->m_next) { - INP_RUNLOCK(inp); + if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); m_freem(control); m_freem(m); return (EINVAL); @@ -1024,6 +1231,31 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, tos = *(u_char *)CMSG_DATA(cm); break; + case IP_FLOWID: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { + error = EINVAL; + break; + } + flowid = *(uint32_t *) CMSG_DATA(cm); + break; + + case IP_FLOWTYPE: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { + error = EINVAL; + break; + } + flowtype = *(uint32_t *) CMSG_DATA(cm); + break; + +#ifdef RSS + case IP_RSSBUCKETID: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { + error = EINVAL; + break; + } + /* This is just a placeholder for now */ + break; +#endif /* RSS */ default: error = ENOPROTOOPT; break; @@ -1034,7 +1266,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, m_freem(control); } if (error) { - INP_RUNLOCK(inp); + if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); m_freem(m); return (error); } @@ -1055,12 +1290,12 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * * XXXRW: Check that hash locking update here is correct. */ + pr = inp->inp_socket->so_proto->pr_protocol; + pcbinfo = udp_get_inpcbinfo(pr); sin = (struct sockaddr_in *)addr; if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { - INP_RUNLOCK(inp); - INP_WLOCK(inp); - INP_HASH_WLOCK(&V_udbinfo); + INP_HASH_WLOCK(pcbinfo); unlock_udbinfo = UH_WLOCKED; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || @@ -1068,7 +1303,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { - INP_HASH_RLOCK(&V_udbinfo); + INP_HASH_RLOCK(pcbinfo); unlock_udbinfo = UH_RLOCKED; } else unlock_udbinfo = UH_UNLOCKED; @@ -1081,7 +1316,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { - INP_HASH_LOCK_ASSERT(&V_udbinfo); + INP_HASH_LOCK_ASSERT(pcbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { @@ -1132,7 +1367,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { - INP_HASH_LOCK_ASSERT(&V_udbinfo); + INP_HASH_LOCK_ASSERT(pcbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); @@ -1147,7 +1382,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_WLOCK_ASSERT(inp); - INP_HASH_WLOCK_ASSERT(&V_udbinfo); + INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Remember addr if jailed, to prevent * rebinding. @@ -1181,7 +1416,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * link-layer headers. Immediate slide the data pointer back forward * since we won't use that space at this layer. */ - M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT); + M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; @@ -1196,12 +1431,30 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ - ui->ui_pr = IPPROTO_UDP; + ui->ui_pr = pr; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); + if (pr == IPPROTO_UDPLITE) { + struct udpcb *up; + uint16_t plen; + + up = intoudpcb(inp); + cscov = up->u_txcslen; + plen = (u_short)len + sizeof(struct udphdr); + if (cscov >= plen) + cscov = 0; + ui->ui_len = htons(plen); + ui->ui_ulen = htons(cscov); + /* + * For UDP-Lite, checksum coverage length of zero means + * the entire UDPLite packet is covered by the checksum. + */ + cscov_partial = (cscov == 0) ? 0 : 1; + } else + ui->ui_v = IPVERSION << 4; /* * Set the Don't Fragment bit in the IP header. @@ -1210,7 +1463,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct ip *ip; ip = (struct ip *)&ui->ui_i; - ip->ip_off |= IP_DF; + ip->ip_off |= htons(IP_DF); } ipflags = 0; @@ -1228,27 +1481,90 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, /* * Set up checksum and output datagram. */ - if (V_udp_cksum) { + ui->ui_sum = 0; + if (pr == IPPROTO_UDPLITE) { + if (inp->inp_flags & INP_ONESBCAST) + faddr.s_addr = INADDR_BROADCAST; + if (cscov_partial) { + if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0) + ui->ui_sum = 0xffff; + } else { + if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0) + ui->ui_sum = 0xffff; + } + } else if (V_udp_cksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, - htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP)); + htons((u_short)len + sizeof(struct udphdr) + pr)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); - } else - ui->ui_sum = 0; - ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; + } + ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len); ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); + /* + * Setup flowid / RSS information for outbound socket. + * + * Once the UDP code decides to set a flowid some other way, + * this allows the flowid to be overridden by userland. + */ + if (flowtype != M_HASHTYPE_NONE) { + m->m_pkthdr.flowid = flowid; + M_HASHTYPE_SET(m, flowtype); +#ifdef RSS + } else { + uint32_t hash_val, hash_type; + /* + * Calculate an appropriate RSS hash for UDP and + * UDP Lite. + * + * The called function will take care of figuring out + * whether a 2-tuple or 4-tuple hash is required based + * on the currently configured scheme. + * + * Later later on connected socket values should be + * cached in the inpcb and reused, rather than constantly + * re-calculating it. + * + * UDP Lite is a different protocol number and will + * likely end up being hashed as a 2-tuple until + * RSS / NICs grow UDP Lite protocol awareness. + */ + if (rss_proto_software_hash_v4(faddr, laddr, fport, lport, + pr, &hash_val, &hash_type) == 0) { + m->m_pkthdr.flowid = hash_val; + M_HASHTYPE_SET(m, hash_type); + } +#endif + } + +#ifdef RSS + /* + * Don't override with the inp cached flowid value. + * + * Depending upon the kind of send being done, the inp + * flowid/flowtype values may actually not be appropriate + * for this particular socket send. + * + * We should either leave the flowid at zero (which is what is + * currently done) or set it to some software generated + * hash value based on the packet contents. + */ + ipflags |= IP_NODEFAULTFLOWID; +#endif /* RSS */ + if (unlock_udbinfo == UH_WLOCKED) - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) - INP_HASH_RUNLOCK(&V_udbinfo); - error = ip_output(m, inp->inp_options, NULL, ipflags, + INP_HASH_RUNLOCK(pcbinfo); + UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); + error = ip_output(m, inp->inp_options, + (unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags, inp->inp_moptions, inp); - if (unlock_udbinfo == UH_WLOCKED) + if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); @@ -1256,10 +1572,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, release: if (unlock_udbinfo == UH_WLOCKED) { - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); } else if (unlock_udbinfo == UH_RLOCKED) { - INP_HASH_RUNLOCK(&V_udbinfo); + INP_HASH_RUNLOCK(pcbinfo); INP_RUNLOCK(inp); } else INP_RUNLOCK(inp); @@ -1297,7 +1613,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) if (minlen > m->m_pkthdr.len) minlen = m->m_pkthdr.len; if ((m = m_pullup(m, minlen)) == NULL) { - IPSECSTAT_INC(in_inval); + IPSECSTAT_INC(ips_in_inval); return (NULL); /* Bypass caller processing. */ } data = mtod(m, caddr_t); /* Points to ip header. */ @@ -1337,7 +1653,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) uint32_t spi; if (payload <= sizeof(struct esp)) { - IPSECSTAT_INC(in_inval); + IPSECSTAT_INC(ips_in_inval); m_freem(m); return (NULL); /* Discard. */ } @@ -1358,7 +1674,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS, 2 * sizeof(uint16_t), M_NOWAIT); if (tag == NULL) { - IPSECSTAT_INC(in_nomem); + IPSECSTAT_INC(ips_in_nomem); m_freem(m); return (NULL); /* Discard. */ } @@ -1387,7 +1703,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) m_adj(m, skip); ip = mtod(m, struct ip *); - ip->ip_len -= skip; + ip->ip_len = htons(ntohs(ip->ip_len) - skip); ip->ip_p = IPPROTO_ESP; /* @@ -1397,7 +1713,8 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR); - (void) ipsec4_common_input(m, iphlen, ip->ip_p); + (void) ipsec_common_input(m, iphlen, offsetof(struct ip, ip_p), + AF_INET, ip->ip_p); return (NULL); /* NB: consumed, bypass processing. */ } #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */ @@ -1406,15 +1723,17 @@ static void udp_abort(struct socket *so) { struct inpcb *inp; + struct inpcbinfo *pcbinfo; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { - INP_HASH_WLOCK(&V_udbinfo); + INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); @@ -1424,17 +1743,19 @@ static int udp_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; + struct inpcbinfo *pcbinfo; int error; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp_attach: inp != NULL")); error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); - INP_INFO_WLOCK(&V_udbinfo); - error = in_pcballoc(so, &V_udbinfo); + INP_INFO_WLOCK(pcbinfo); + error = in_pcballoc(so, pcbinfo); if (error) { - INP_INFO_WUNLOCK(&V_udbinfo); + INP_INFO_WUNLOCK(pcbinfo); return (error); } @@ -1446,18 +1767,18 @@ udp_attach(struct socket *so, int proto, struct thread *td) if (error) { in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_udbinfo); + INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); + INP_INFO_WUNLOCK(pcbinfo); return (0); } #endif /* INET */ int -udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f) +udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx) { struct inpcb *inp; struct udpcb *up; @@ -1468,11 +1789,14 @@ udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f) KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL")); INP_WLOCK(inp); up = intoudpcb(inp); - if (up->u_tun_func != NULL) { + if ((up->u_tun_func != NULL) || + (up->u_icmp_func != NULL)) { INP_WUNLOCK(inp); return (EBUSY); } up->u_tun_func = f; + up->u_icmp_func = i; + up->u_tun_ctx = ctx; INP_WUNLOCK(inp); return (0); } @@ -1482,14 +1806,16 @@ static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; + struct inpcbinfo *pcbinfo; int error; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); INP_WLOCK(inp); - INP_HASH_WLOCK(&V_udbinfo); + INP_HASH_WLOCK(pcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } @@ -1498,15 +1824,17 @@ static void udp_close(struct socket *so) { struct inpcb *inp; + struct inpcbinfo *pcbinfo; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { - INP_HASH_WLOCK(&V_udbinfo); + INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); @@ -1516,9 +1844,11 @@ static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; - int error; + struct inpcbinfo *pcbinfo; struct sockaddr_in *sin; + int error; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); INP_WLOCK(inp); @@ -1532,9 +1862,9 @@ udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) INP_WUNLOCK(inp); return (error); } - INP_HASH_WLOCK(&V_udbinfo); + INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, nam, td->td_ucred); - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); @@ -1545,20 +1875,22 @@ static void udp_detach(struct socket *so) { struct inpcb *inp; + struct inpcbinfo *pcbinfo; struct udpcb *up; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); - INP_INFO_WLOCK(&V_udbinfo); + INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); inp->inp_ppcb = NULL; in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_udbinfo); + INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } @@ -1566,7 +1898,9 @@ static int udp_disconnect(struct socket *so) { struct inpcb *inp; + struct inpcbinfo *pcbinfo; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_WLOCK(inp); @@ -1574,10 +1908,10 @@ udp_disconnect(struct socket *so) INP_WUNLOCK(inp); return (ENOTCONN); } - INP_HASH_WLOCK(&V_udbinfo); + INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); |