Diffstat (limited to 'freebsd/sys/netinet/tcp_lro.c')

-rw-r--r--    freebsd/sys/netinet/tcp_lro.c    458

1 file changed, 411 insertions(+), 47 deletions(-)
diff --git a/freebsd/sys/netinet/tcp_lro.c b/freebsd/sys/netinet/tcp_lro.c
index 52d92aa0..3550ab84 100644
--- a/freebsd/sys/netinet/tcp_lro.c
+++ b/freebsd/sys/netinet/tcp_lro.c
@@ -4,6 +4,7 @@
  * Copyright (c) 2007, Myricom Inc.
  * Copyright (c) 2008, Intel Corporation.
  * Copyright (c) 2012 The FreeBSD Foundation
+ * Copyright (c) 2016 Mellanox Technologies.
  * All rights reserved.
  *
  * Portions of this software were developed by Bjoern Zeeb
@@ -39,9 +40,11 @@ __FBSDID("$FreeBSD$");
 
 #include <rtems/bsd/sys/param.h>
 #include <sys/systm.h>
-#include <sys/mbuf.h>
 #include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
 #include <sys/socket.h>
+#include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
@@ -55,59 +58,139 @@ __FBSDID("$FreeBSD$");
 #include <netinet/ip_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_lro.h>
+#include <netinet/tcp_var.h>
 
 #include <netinet6/ip6_var.h>
 
 #include <machine/in_cksum.h>
 
-#ifndef LRO_ENTRIES
-#define LRO_ENTRIES 8   /* # of LRO entries per RX queue. */
-#endif
+static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
 
 #define TCP_LRO_UPDATE_CSUM     1
 #ifndef TCP_LRO_UPDATE_CSUM
 #define TCP_LRO_INVALID_CSUM    0x0000
 #endif
 
+static void     tcp_lro_rx_done(struct lro_ctrl *lc);
+static int      tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m,
+                    uint32_t csum, int use_hash);
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "TCP LRO");
+
+static unsigned tcp_lro_entries = TCP_LRO_ENTRIES;
+SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
+    CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
+    "default number of LRO entries");
+
+static __inline void
+tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
+    struct lro_entry *le)
+{
+
+        LIST_INSERT_HEAD(&lc->lro_active, le, next);
+        LIST_INSERT_HEAD(bucket, le, hash_next);
+}
+
+static __inline void
+tcp_lro_active_remove(struct lro_entry *le)
+{
+
+        LIST_REMOVE(le, next);          /* active list */
+        LIST_REMOVE(le, hash_next);     /* hash bucket */
+}
+
 int
 tcp_lro_init(struct lro_ctrl *lc)
 {
+        return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0));
+}
+
+int
+tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
+    unsigned lro_entries, unsigned lro_mbufs)
+{
        struct lro_entry *le;
-       int error, i;
+       size_t size;
+       unsigned i, elements;
 
        lc->lro_bad_csum = 0;
        lc->lro_queued = 0;
        lc->lro_flushed = 0;
        lc->lro_cnt = 0;
-       SLIST_INIT(&lc->lro_free);
-       SLIST_INIT(&lc->lro_active);
-
-       error = 0;
-       for (i = 0; i < LRO_ENTRIES; i++) {
-               le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF,
-                   M_NOWAIT | M_ZERO);
-               if (le == NULL) {
-                       if (i == 0)
-                               error = ENOMEM;
-                       break;
-               }
-               lc->lro_cnt = i + 1;
-               SLIST_INSERT_HEAD(&lc->lro_free, le, next);
-       }
-
-       return (error);
+       lc->lro_mbuf_count = 0;
+       lc->lro_mbuf_max = lro_mbufs;
+       lc->lro_cnt = lro_entries;
+       lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
+       lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
+       lc->ifp = ifp;
+       LIST_INIT(&lc->lro_free);
+       LIST_INIT(&lc->lro_active);
+
+       /* create hash table to accelerate entry lookup */
+       if (lro_entries > lro_mbufs)
+               elements = lro_entries;
+       else
+               elements = lro_mbufs;
+       lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz,
+           HASH_NOWAIT);
+       if (lc->lro_hash == NULL) {
+               memset(lc, 0, sizeof(*lc));
+               return (ENOMEM);
+       }
+
+       /* compute size to allocate */
+       size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
+           (lro_entries * sizeof(*le));
+       lc->lro_mbuf_data = (struct lro_mbuf_sort *)
+           malloc(size, M_LRO, M_NOWAIT | M_ZERO);
+
+       /* check for out of memory */
+       if (lc->lro_mbuf_data == NULL) {
+               memset(lc, 0, sizeof(*lc));
+               return (ENOMEM);
+       }
+       /* compute offset for LRO entries */
+       le = (struct lro_entry *)
+           (lc->lro_mbuf_data + lro_mbufs);
+
+       /* setup linked list */
+       for (i = 0; i != lro_entries; i++)
+               LIST_INSERT_HEAD(&lc->lro_free, le + i, next);
+
+       return (0);
 }
 
 void
 tcp_lro_free(struct lro_ctrl *lc)
 {
        struct lro_entry *le;
+       unsigned x;
 
-       while (!SLIST_EMPTY(&lc->lro_free)) {
-               le = SLIST_FIRST(&lc->lro_free);
-               SLIST_REMOVE_HEAD(&lc->lro_free, next);
-               free(le, M_DEVBUF);
+       /* reset LRO free list */
+       LIST_INIT(&lc->lro_free);
+
+       /* free active mbufs, if any */
+       while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
+               tcp_lro_active_remove(le);
+               m_freem(le->m_head);
        }
+
+       /* free hash table */
+       if (lc->lro_hash != NULL) {
+               free(lc->lro_hash, M_LRO);
+               lc->lro_hash = NULL;
+       }
+       lc->lro_hashsz = 0;
+
+       /* free mbuf array, if any */
+       for (x = 0; x != lc->lro_mbuf_count; x++)
+               m_freem(lc->lro_mbuf_data[x].mb);
+       lc->lro_mbuf_count = 0;
+
+       /* free allocated memory, if any */
+       free(lc->lro_mbuf_data, M_LRO);
+       lc->lro_mbuf_data = NULL;
 }
 
 #ifdef TCP_LRO_UPDATE_CSUM
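The hunk above replaces the fixed LRO_ENTRIES pool with caller-sized allocations: one contiguous region holds both the mbuf sort array and the LRO entries, plus a hash table for entry lookup. A minimal sketch of how a driver might adopt the new allocator, assuming a hypothetical receive-queue structure (my_rxq, my_rxq_attach and my_rxq_detach are illustrative names, not part of this change):

    /* Hypothetical driver glue; kernel context assumed. */
    struct my_rxq {
            struct lro_ctrl lro;
            /* ... other receive-queue state ... */
    };

    static int
    my_rxq_attach(struct my_rxq *rxq, struct ifnet *ifp)
    {
            /* 128 LRO entries, up to 1024 mbufs queued per flush cycle */
            return (tcp_lro_init_args(&rxq->lro, ifp, 128, 1024));
    }

    static void
    my_rxq_detach(struct my_rxq *rxq)
    {
            /* returns entries, hash table and any queued mbufs */
            tcp_lro_free(&rxq->lro);
    }

Note that tcp_lro_init() keeps its old signature and simply forwards to tcp_lro_init_args() with the sysctl-tunable entry count and no mbuf queue, so existing drivers keep working unchanged.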
@@ -195,6 +278,36 @@ tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
 }
 #endif
 
+static void
+tcp_lro_rx_done(struct lro_ctrl *lc)
+{
+       struct lro_entry *le;
+
+       while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
+               tcp_lro_active_remove(le);
+               tcp_lro_flush(lc, le);
+       }
+}
+
+void
+tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
+{
+       struct lro_entry *le, *le_tmp;
+       struct timeval tv;
+
+       if (LIST_EMPTY(&lc->lro_active))
+               return;
+
+       getmicrotime(&tv);
+       timevalsub(&tv, timeout);
+       LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
+               if (timevalcmp(&tv, &le->mtime, >=)) {
+                       tcp_lro_active_remove(le);
+                       tcp_lro_flush(lc, le);
+               }
+       }
+}
+
 void
 tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
 {
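tcp_lro_flush_inactive() gives drivers a way to bound how long a segment may sit in an active LRO entry. A minimal sketch of calling it from a periodic tick, assuming the hypothetical my_rxq structure from the earlier note and an illustrative 50 ms idle budget:

    static void
    my_rxq_tick(struct my_rxq *rxq)
    {
            /* flush entries not appended to for at least 50 ms */
            const struct timeval timeout = { .tv_sec = 0, .tv_usec = 50000 };

            tcp_lro_flush_inactive(&rxq->lro, &timeout);
    }

Entries refreshed via getmicrotime(&le->mtime) in the receive path survive the sweep; anything older than the timeout is flushed to the stack.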
@@ -285,7 +398,143 @@ tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
        lc->lro_queued += le->append_cnt + 1;
        lc->lro_flushed++;
        bzero(le, sizeof(*le));
-       SLIST_INSERT_HEAD(&lc->lro_free, le, next);
+       LIST_INSERT_HEAD(&lc->lro_free, le, next);
 }
+
+#ifdef HAVE_INLINE_FLSLL
+#define tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1))
+#else
+static inline uint64_t
+tcp_lro_msb_64(uint64_t x)
+{
+       x |= (x >> 1);
+       x |= (x >> 2);
+       x |= (x >> 4);
+       x |= (x >> 8);
+       x |= (x >> 16);
+       x |= (x >> 32);
+       return (x & ~(x >> 1));
+}
+#endif
+
+/*
+ * The tcp_lro_sort() routine is comparable to qsort(), except it has
+ * a worst case complexity limit of O(MIN(N,64)*N), where N is the
+ * number of elements to sort and 64 is the number of sequence bits
+ * available. The algorithm is bit-slicing the 64-bit sequence number,
+ * sorting one bit at a time from the most significant bit until the
+ * least significant one, skipping the constant bits. This is
+ * typically called a radix sort.
+ */
+static void
+tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
+{
+       struct lro_mbuf_sort temp;
+       uint64_t ones;
+       uint64_t zeros;
+       uint32_t x;
+       uint32_t y;
+
+repeat:
+       /* for small arrays insertion sort is faster */
+       if (size <= 12) {
+               for (x = 1; x < size; x++) {
+                       temp = parray[x];
+                       for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
+                               parray[y] = parray[y - 1];
+                       parray[y] = temp;
+               }
+               return;
+       }
+
+       /* compute sequence bits which are constant */
+       ones = 0;
+       zeros = 0;
+       for (x = 0; x != size; x++) {
+               ones |= parray[x].seq;
+               zeros |= ~parray[x].seq;
+       }
+
+       /* compute bits which are not constant into "ones" */
+       ones &= zeros;
+       if (ones == 0)
+               return;
+
+       /* pick the most significant bit which is not constant */
+       ones = tcp_lro_msb_64(ones);
+
+       /*
+        * Move entries having cleared sequence bits to the beginning
+        * of the array:
+        */
+       for (x = y = 0; y != size; y++) {
+               /* skip set bits */
+               if (parray[y].seq & ones)
+                       continue;
+               /* swap entries */
+               temp = parray[x];
+               parray[x] = parray[y];
+               parray[y] = temp;
+               x++;
+       }
+
+       KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));
+
+       /* sort zeros */
+       tcp_lro_sort(parray, x);
+
+       /* sort ones */
+       parray += x;
+       size -= x;
+       goto repeat;
+}
+
+void
+tcp_lro_flush_all(struct lro_ctrl *lc)
+{
+       uint64_t seq;
+       uint64_t nseq;
+       unsigned x;
+
+       /* check if no mbufs to flush */
+       if (lc->lro_mbuf_count == 0)
+               goto done;
+
+       /* sort all mbufs according to stream */
+       tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);
+
+       /* input data into LRO engine, stream by stream */
+       seq = 0;
+       for (x = 0; x != lc->lro_mbuf_count; x++) {
+               struct mbuf *mb;
+
+               /* get mbuf */
+               mb = lc->lro_mbuf_data[x].mb;
+
+               /* get sequence number, masking away the packet index */
+               nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);
+
+               /* check for new stream */
+               if (seq != nseq) {
+                       seq = nseq;
+
+                       /* flush active streams */
+                       tcp_lro_rx_done(lc);
+               }
+
+               /* add packet to LRO engine */
+               if (tcp_lro_rx2(lc, mb, 0, 0) != 0) {
+                       /* input packet to network layer */
+                       (*lc->ifp->if_input)(lc->ifp, mb);
+                       lc->lro_queued++;
+                       lc->lro_flushed++;
+               }
+       }
+done:
+       /* flush active streams */
+       tcp_lro_rx_done(lc);
+
+       lc->lro_mbuf_count = 0;
+}
 
 #ifdef INET6
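Two details of the code above are easy to miss: tcp_lro_msb_64() isolates the highest set bit by smearing it across all lower positions, and tcp_lro_flush_all() masks away the low 24 bits of the sort key so that packets queued for the same flow compare equal. A small user-space demonstration of both, using a portable copy of the fallback helper (illustration only, not part of the change):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t
    msb_64(uint64_t x)
    {
            x |= (x >> 1); x |= (x >> 2); x |= (x >> 4);
            x |= (x >> 8); x |= (x >> 16); x |= (x >> 32);
            return (x & ~(x >> 1));
    }

    int
    main(void)
    {
            /* 0x13 is 10011b; the OR cascade smears it into 11111b and
             * x & ~(x >> 1) keeps only the top bit, 10000b. */
            printf("msb: 0x%llx\n", (unsigned long long)msb_64(0x13));

            /* Two sort keys for flowid 0x1234, hash type 2, at queue
             * positions 5 and 6; masking the 24 index bits makes them
             * equal, so flush_all treats them as one stream. */
            uint64_t a = (2ULL << 56) | (0x1234ULL << 24) | 5;
            uint64_t b = (2ULL << 56) | (0x1234ULL << 24) | 6;
            printf("same stream: %d\n",
                (int)((a & (-1ULL << 24)) == (b & (-1ULL << 24))));
            return (0);
    }

Because the queue index occupies the low bits, the radix sort also preserves arrival order within a flow, which keeps TCP sequence numbers in order when the batch is fed to tcp_lro_rx2().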
@@ -348,8 +597,8 @@ tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
 }
 #endif
 
-int
-tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
+static int
+tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash)
 {
        struct lro_entry *le;
        struct ether_header *eh;
@@ -365,6 +614,8 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
        tcp_seq seq;
        int error, ip_len, l;
        uint16_t eh_type, tcp_data_len;
+       struct lro_head *bucket;
+       int force_flush = 0;
 
        /* We expect a contiguous header [eh, ip, tcp]. */
 
@@ -431,10 +682,17 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
         * Check TCP header constraints.
         */
        /* Ensure no bits set besides ACK or PSH. */
-       if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
-               return (TCP_LRO_CANNOT);
+       if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
+               if (th->th_flags & TH_SYN)
+                       return (TCP_LRO_CANNOT);
+               /*
+                * Make sure that previously seen segments/ACKs are delivered
+                * before this segment, e.g. FIN.
+                */
+               force_flush = 1;
+       }
 
-       /* XXX-BZ We lose a AKC|PUSH flag concatinating multiple segments. */
+       /* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */
        /* XXX-BZ Ideally we'd flush on PUSH? */
 
        /*
@@ -448,8 +706,13 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
        ts_ptr = (uint32_t *)(th + 1);
        if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
            (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
-           TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))))
-               return (TCP_LRO_CANNOT);
+           TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
+               /*
+                * Make sure that previously seen segments/ACKs are delivered
+                * before this segment.
+                */
+               force_flush = 1;
+       }
 
        /* If the driver did not pass in the checksum, set it now. */
        if (csum == 0x0000)
@@ -457,8 +720,41 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
 
        seq = ntohl(th->th_seq);
 
+       if (!use_hash) {
+               bucket = &lc->lro_hash[0];
+       } else if (M_HASHTYPE_ISHASH(m)) {
+               bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz];
+       } else {
+               uint32_t hash;
+
+               switch (eh_type) {
+#ifdef INET
+               case ETHERTYPE_IP:
+                       hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr;
+                       break;
+#endif
+#ifdef INET6
+               case ETHERTYPE_IPV6:
+                       hash = ip6->ip6_src.s6_addr32[0] +
+                           ip6->ip6_dst.s6_addr32[0];
+                       hash += ip6->ip6_src.s6_addr32[1] +
+                           ip6->ip6_dst.s6_addr32[1];
+                       hash += ip6->ip6_src.s6_addr32[2] +
+                           ip6->ip6_dst.s6_addr32[2];
+                       hash += ip6->ip6_src.s6_addr32[3] +
+                           ip6->ip6_dst.s6_addr32[3];
+                       break;
+#endif
+               default:
+                       hash = 0;
+                       break;
+               }
+               hash += th->th_sport + th->th_dport;
+               bucket = &lc->lro_hash[hash % lc->lro_hashsz];
+       }
+
        /* Try to find a matching previous segment. */
-       SLIST_FOREACH(le, &lc->lro_active, next) {
+       LIST_FOREACH(le, bucket, hash_next) {
                if (le->eh_type != eh_type)
                        continue;
                if (le->source_port != th->th_sport ||
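The bucket selection above prefers the NIC-supplied flowid (M_HASHTYPE_ISHASH) and only falls back to folding addresses and ports when no hardware hash is present. A user-space sketch of the two index computations, with made-up values (251 stands in for the lro_hashsz chosen by phashinit_flags(); the real code sums the raw network-byte-order fields):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint32_t hashsz = 251;          /* example bucket count */

            /* hardware path: RSS flowid from the NIC picks the bucket */
            uint32_t flowid = 0xdeadbeefu;
            printf("hw bucket: %u\n", flowid % hashsz);

            /* software fallback, IPv4: fold addresses, then ports */
            uint32_t hash = 0x0a000001u + 0x0a000002u; /* ip_src + ip_dst */
            hash += 0x1234u + 0x0050u;      /* th_sport + th_dport */
            printf("sw bucket: %u\n", hash % hashsz);
            return (0);
    }

The fold is deliberately cheap rather than uniform; a collision only costs a longer walk of one hash chain, since the per-entry port and address comparisons still reject mismatched flows.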
@@ -483,9 +779,16 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
 #endif
                }
 
+               if (force_flush) {
+                       /* Timestamps mismatch; this is a FIN, etc */
+                       tcp_lro_active_remove(le);
+                       tcp_lro_flush(lc, le);
+                       return (TCP_LRO_CANNOT);
+               }
+
                /* Flush now if appending will result in overflow. */
-               if (le->p_len > (65535 - tcp_data_len)) {
-                       SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+               if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
+                       tcp_lro_active_remove(le);
                        tcp_lro_flush(lc, le);
                        break;
                }
@@ -494,7 +797,7 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
                if (__predict_false(seq != le->next_seq ||
                    (tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
                        /* Out of order packet or duplicate ACK. */
-                       SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+                       tcp_lro_active_remove(le);
                        tcp_lro_flush(lc, le);
                        return (TCP_LRO_CANNOT);
                }
@@ -522,6 +825,14 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
 
                if (tcp_data_len == 0) {
                        m_freem(m);
+                       /*
+                        * Flush this LRO entry, if this ACK should not
+                        * be further delayed.
+                        */
+                       if (le->append_cnt >= lc->lro_ackcnt_lim) {
+                               tcp_lro_active_remove(le);
+                               tcp_lro_flush(lc, le);
+                       }
                        return (0);
                }
 
@@ -533,7 +844,7 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
                 * append new segment to existing mbuf chain.
                 */
                m_adj(m, m->m_pkthdr.len - tcp_data_len);
-               m->m_flags &= ~M_PKTHDR;
+               m_demote_pkthdr(m);
 
                le->m_tail->m_next = m;
                le->m_tail = m_last(m);
@@ -542,22 +853,32 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
                 * If a possible next full length packet would cause an
                 * overflow, pro-actively flush now.
                 */
-               if (le->p_len > (65535 - lc->ifp->if_mtu)) {
-                       SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+               if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) {
+                       tcp_lro_active_remove(le);
                        tcp_lro_flush(lc, le);
-               }
+               } else
+                       getmicrotime(&le->mtime);
 
                return (0);
        }
 
-       /* Try to find an empty slot. */
-       if (SLIST_EMPTY(&lc->lro_free))
+       if (force_flush) {
+               /*
+                * Nothing to flush, but this segment can not be further
+                * aggregated/delayed.
+                */
                return (TCP_LRO_CANNOT);
+       }
+
+       /* Try to find an empty slot. */
+       if (LIST_EMPTY(&lc->lro_free))
+               return (TCP_LRO_NO_ENTRIES);
 
        /* Start a new segment chain. */
-       le = SLIST_FIRST(&lc->lro_free);
-       SLIST_REMOVE_HEAD(&lc->lro_free, next);
-       SLIST_INSERT_HEAD(&lc->lro_active, le, next);
+       le = LIST_FIRST(&lc->lro_free);
+       LIST_REMOVE(le, next);
+       tcp_lro_active_insert(lc, bucket, le);
+       getmicrotime(&le->mtime);
 
        /* Start filling in details. */
        switch (eh_type) {
@@ -614,4 +935,47 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
 
        return (0);
 }
+
+int
+tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
+{
+
+       return tcp_lro_rx2(lc, m, csum, 1);
+}
+
+void
+tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
+{
+       /* sanity checks */
+       if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
+           lc->lro_mbuf_max == 0)) {
+               /* packet drop */
+               m_freem(mb);
+               return;
+       }
+
+       /* check if packet is not LRO capable */
+       if (__predict_false(mb->m_pkthdr.csum_flags == 0 ||
+           (lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
+               lc->lro_flushed++;
+               lc->lro_queued++;
+
+               /* input packet to network layer */
+               (*lc->ifp->if_input) (lc->ifp, mb);
+               return;
+       }
+
+       /* check if array is full */
+       if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max))
+               tcp_lro_flush_all(lc);
+
+       /* create sequence number */
+       lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
+           (((uint64_t)M_HASHTYPE_GET(mb)) << 56) |
+           (((uint64_t)mb->m_pkthdr.flowid) << 24) |
+           ((uint64_t)lc->lro_mbuf_count);
+
+       /* enter mbuf */
+       lc->lro_mbuf_data[lc->lro_mbuf_count++].mb = mb;
+}
 /* end */
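Taken together, the new entry points replace per-packet tcp_lro_rx() calls in a driver's receive loop: packets are queued with tcp_lro_queue_mbuf(), which packs (hash type << 56 | flowid << 24 | queue index) into the 64-bit sort key, and a single tcp_lro_flush_all() then sorts the batch and feeds it to the engine stream by stream. A minimal sketch of that call pattern, assuming the hypothetical my_rxq structure from the earlier notes and an illustrative my_next_mbuf() helper:

    static void
    my_rxq_intr(struct my_rxq *rxq)
    {
            struct mbuf *mb;

            /* queue the whole burst; the array is flushed as one batch */
            while ((mb = my_next_mbuf(rxq)) != NULL)
                    tcp_lro_queue_mbuf(&rxq->lro, mb);

            /* sort by (hash type, flowid) and feed the LRO engine */
            tcp_lro_flush_all(&rxq->lro);
    }

tcp_lro_queue_mbuf() handles the corner cases itself: packets without checksum offload, or an interface with IFCAP_LRO disabled, bypass the queue straight to if_input, and a full array triggers an implicit tcp_lro_flush_all().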