Diffstat (limited to 'freebsd/sys/netinet/tcp_lro.c')
-rw-r--r--   freebsd/sys/netinet/tcp_lro.c   | 458
1 file changed, 411 insertions, 47 deletions
diff --git a/freebsd/sys/netinet/tcp_lro.c b/freebsd/sys/netinet/tcp_lro.c
index 52d92aa0..3550ab84 100644
--- a/freebsd/sys/netinet/tcp_lro.c
+++ b/freebsd/sys/netinet/tcp_lro.c
@@ -4,6 +4,7 @@
* Copyright (c) 2007, Myricom Inc.
* Copyright (c) 2008, Intel Corporation.
* Copyright (c) 2012 The FreeBSD Foundation
+ * Copyright (c) 2016 Mellanox Technologies.
* All rights reserved.
*
* Portions of this software were developed by Bjoern Zeeb
@@ -39,9 +40,11 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
-#include <sys/mbuf.h>
#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
#include <sys/socket.h>
+#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
@@ -55,59 +58,139 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
+#include <netinet/tcp_var.h>
#include <netinet6/ip6_var.h>
#include <machine/in_cksum.h>
-#ifndef LRO_ENTRIES
-#define LRO_ENTRIES 8 /* # of LRO entries per RX queue. */
-#endif
+static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
#define TCP_LRO_UPDATE_CSUM 1
#ifndef TCP_LRO_UPDATE_CSUM
#define TCP_LRO_INVALID_CSUM 0x0000
#endif
+static void tcp_lro_rx_done(struct lro_ctrl *lc);
+static int tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m,
+ uint32_t csum, int use_hash);
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "TCP LRO");
+
+static unsigned tcp_lro_entries = TCP_LRO_ENTRIES;
+SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
+ CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
+ "default number of LRO entries");
+
+static __inline void
+tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
+ struct lro_entry *le)
+{
+
+ LIST_INSERT_HEAD(&lc->lro_active, le, next);
+ LIST_INSERT_HEAD(bucket, le, hash_next);
+}
+
+static __inline void
+tcp_lro_active_remove(struct lro_entry *le)
+{
+
+ LIST_REMOVE(le, next); /* active list */
+ LIST_REMOVE(le, hash_next); /* hash bucket */
+}
+
int
tcp_lro_init(struct lro_ctrl *lc)
{
+ return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0));
+}
+
+int
+tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
+ unsigned lro_entries, unsigned lro_mbufs)
+{
struct lro_entry *le;
- int error, i;
+ size_t size;
+ unsigned i, elements;
lc->lro_bad_csum = 0;
lc->lro_queued = 0;
lc->lro_flushed = 0;
lc->lro_cnt = 0;
- SLIST_INIT(&lc->lro_free);
- SLIST_INIT(&lc->lro_active);
-
- error = 0;
- for (i = 0; i < LRO_ENTRIES; i++) {
- le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF,
- M_NOWAIT | M_ZERO);
- if (le == NULL) {
- if (i == 0)
- error = ENOMEM;
- break;
- }
- lc->lro_cnt = i + 1;
- SLIST_INSERT_HEAD(&lc->lro_free, le, next);
- }
-
- return (error);
+ lc->lro_mbuf_count = 0;
+ lc->lro_mbuf_max = lro_mbufs;
+ lc->lro_cnt = lro_entries;
+ lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
+ lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
+ lc->ifp = ifp;
+ LIST_INIT(&lc->lro_free);
+ LIST_INIT(&lc->lro_active);
+
+ /* create hash table to accelerate entry lookup */
+ if (lro_entries > lro_mbufs)
+ elements = lro_entries;
+ else
+ elements = lro_mbufs;
+ lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz,
+ HASH_NOWAIT);
+ if (lc->lro_hash == NULL) {
+ memset(lc, 0, sizeof(*lc));
+ return (ENOMEM);
+ }
+
+ /* compute size to allocate */
+ size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
+ (lro_entries * sizeof(*le));
+ lc->lro_mbuf_data = (struct lro_mbuf_sort *)
+ malloc(size, M_LRO, M_NOWAIT | M_ZERO);
+
+ /* check for out of memory */
+ if (lc->lro_mbuf_data == NULL) {
+ memset(lc, 0, sizeof(*lc));
+ return (ENOMEM);
+ }
+ /* compute offset for LRO entries */
+ le = (struct lro_entry *)
+ (lc->lro_mbuf_data + lro_mbufs);
+
+ /* setup linked list */
+ for (i = 0; i != lro_entries; i++)
+ LIST_INSERT_HEAD(&lc->lro_free, le + i, next);
+
+ return (0);
}
void
tcp_lro_free(struct lro_ctrl *lc)
{
struct lro_entry *le;
+ unsigned x;
- while (!SLIST_EMPTY(&lc->lro_free)) {
- le = SLIST_FIRST(&lc->lro_free);
- SLIST_REMOVE_HEAD(&lc->lro_free, next);
- free(le, M_DEVBUF);
+ /* reset LRO free list */
+ LIST_INIT(&lc->lro_free);
+
+ /* free active mbufs, if any */
+ while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
+ tcp_lro_active_remove(le);
+ m_freem(le->m_head);
}
+
+ /* free hash table */
+ if (lc->lro_hash != NULL) {
+ free(lc->lro_hash, M_LRO);
+ lc->lro_hash = NULL;
+ }
+ lc->lro_hashsz = 0;
+
+ /* free mbuf array, if any */
+ for (x = 0; x != lc->lro_mbuf_count; x++)
+ m_freem(lc->lro_mbuf_data[x].mb);
+ lc->lro_mbuf_count = 0;
+
+ /* free allocated memory, if any */
+ free(lc->lro_mbuf_data, M_LRO);
+ lc->lro_mbuf_data = NULL;
}
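
For context, a minimal sketch of how a driver's per-queue RX path might use the
allocation scheme introduced above (the ring structure, the sizes and the
function names are illustrative assumptions; only the tcp_lro_*() calls are
taken from this change):

#include <sys/param.h>
#include <sys/mbuf.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/tcp_lro.h>

struct my_rx_ring {
        struct lro_ctrl lro;            /* per-queue LRO state */
        struct ifnet    *ifp;
};

static int
my_rx_ring_attach(struct my_rx_ring *ring, struct ifnet *ifp)
{
        ring->ifp = ifp;
        /* 8 LRO entries, room to defer up to 128 mbufs before sorting. */
        return (tcp_lro_init_args(&ring->lro, ifp, 8, 128));
}

static void
my_rx_ring_input(struct my_rx_ring *ring, struct mbuf *m)
{
        /* Defer the mbuf; it is sorted and merged by tcp_lro_flush_all(). */
        tcp_lro_queue_mbuf(&ring->lro, m);
}

static void
my_rx_ring_batch_done(struct my_rx_ring *ring)
{
        /* End of an interrupt batch: merge and hand everything to the stack. */
        tcp_lro_flush_all(&ring->lro);
}

static void
my_rx_ring_detach(struct my_rx_ring *ring)
{
        tcp_lro_free(&ring->lro);
}
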
#ifdef TCP_LRO_UPDATE_CSUM
@@ -195,6 +278,36 @@ tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
}
#endif
+static void
+tcp_lro_rx_done(struct lro_ctrl *lc)
+{
+ struct lro_entry *le;
+
+ while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
+ tcp_lro_active_remove(le);
+ tcp_lro_flush(lc, le);
+ }
+}
+
+void
+tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
+{
+ struct lro_entry *le, *le_tmp;
+ struct timeval tv;
+
+ if (LIST_EMPTY(&lc->lro_active))
+ return;
+
+ getmicrotime(&tv);
+ timevalsub(&tv, timeout);
+ LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
+ if (timevalcmp(&tv, &le->mtime, >=)) {
+ tcp_lro_active_remove(le);
+ tcp_lro_flush(lc, le);
+ }
+ }
+}
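
A driver that wants to bound how long a partially assembled chain may sit in
the engine can call the new tcp_lro_flush_inactive() from its periodic tick.
A minimal sketch, assuming a 20 ms budget and a caller that already holds the
same lock as the RX path (the tick function itself is hypothetical):

#include <sys/param.h>
#include <sys/time.h>
#include <netinet/tcp_lro.h>

static void
my_rx_ring_tick(struct lro_ctrl *lro)
{
        /* Flush any entry that has not grown for roughly 20 ms. */
        const struct timeval timeout = { .tv_sec = 0, .tv_usec = 20000 };

        tcp_lro_flush_inactive(lro, &timeout);
}
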
+
void
tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
{
@@ -285,7 +398,143 @@ tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
lc->lro_queued += le->append_cnt + 1;
lc->lro_flushed++;
bzero(le, sizeof(*le));
- SLIST_INSERT_HEAD(&lc->lro_free, le, next);
+ LIST_INSERT_HEAD(&lc->lro_free, le, next);
+}
+
+#ifdef HAVE_INLINE_FLSLL
+#define tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1))
+#else
+static inline uint64_t
+tcp_lro_msb_64(uint64_t x)
+{
+ x |= (x >> 1);
+ x |= (x >> 2);
+ x |= (x >> 4);
+ x |= (x >> 8);
+ x |= (x >> 16);
+ x |= (x >> 32);
+ return (x & ~(x >> 1));
+}
+#endif
+
+/*
+ * The tcp_lro_sort() routine is comparable to qsort(), except it has
+ * a worst case complexity limit of O(MIN(N,64)*N), where N is the
+ * number of elements to sort and 64 is the number of sequence bits
+ * available. The algorithm is bit-slicing the 64-bit sequence number,
+ * sorting one bit at a time from the most significant bit until the
+ * least significant one, skipping the constant bits. This is
+ * typically called a radix sort.
+ */
+static void
+tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
+{
+ struct lro_mbuf_sort temp;
+ uint64_t ones;
+ uint64_t zeros;
+ uint32_t x;
+ uint32_t y;
+
+repeat:
+ /* for small arrays insertion sort is faster */
+ if (size <= 12) {
+ for (x = 1; x < size; x++) {
+ temp = parray[x];
+ for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
+ parray[y] = parray[y - 1];
+ parray[y] = temp;
+ }
+ return;
+ }
+
+ /* compute sequence bits which are constant */
+ ones = 0;
+ zeros = 0;
+ for (x = 0; x != size; x++) {
+ ones |= parray[x].seq;
+ zeros |= ~parray[x].seq;
+ }
+
+ /* compute bits which are not constant into "ones" */
+ ones &= zeros;
+ if (ones == 0)
+ return;
+
+ /* pick the most significant bit which is not constant */
+ ones = tcp_lro_msb_64(ones);
+
+ /*
+ * Move entries having cleared sequence bits to the beginning
+ * of the array:
+ */
+ for (x = y = 0; y != size; y++) {
+ /* skip set bits */
+ if (parray[y].seq & ones)
+ continue;
+ /* swap entries */
+ temp = parray[x];
+ parray[x] = parray[y];
+ parray[y] = temp;
+ x++;
+ }
+
+ KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));
+
+ /* sort zeros */
+ tcp_lro_sort(parray, x);
+
+ /* sort ones */
+ parray += x;
+ size -= x;
+ goto repeat;
+}
+
+void
+tcp_lro_flush_all(struct lro_ctrl *lc)
+{
+ uint64_t seq;
+ uint64_t nseq;
+ unsigned x;
+
+ /* check if no mbufs to flush */
+ if (lc->lro_mbuf_count == 0)
+ goto done;
+
+ /* sort all mbufs according to stream */
+ tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);
+
+ /* input data into LRO engine, stream by stream */
+ seq = 0;
+ for (x = 0; x != lc->lro_mbuf_count; x++) {
+ struct mbuf *mb;
+
+ /* get mbuf */
+ mb = lc->lro_mbuf_data[x].mb;
+
+ /* get sequence number, masking away the packet index */
+ nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);
+
+ /* check for new stream */
+ if (seq != nseq) {
+ seq = nseq;
+
+ /* flush active streams */
+ tcp_lro_rx_done(lc);
+ }
+
+ /* add packet to LRO engine */
+ if (tcp_lro_rx2(lc, mb, 0, 0) != 0) {
+ /* input packet to network layer */
+ (*lc->ifp->if_input)(lc->ifp, mb);
+ lc->lro_queued++;
+ lc->lro_flushed++;
+ }
+ }
+done:
+ /* flush active streams */
+ tcp_lro_rx_done(lc);
+
+ lc->lro_mbuf_count = 0;
}
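
The bit-slicing approach used by tcp_lro_sort() above is easy to reproduce in
isolation. Below is a small userland sketch of the same
partition-on-the-most-significant-differing-bit idea (the names and test data
are invented; the insertion-sort cutoff for small arrays is omitted for
brevity):

/* Standalone illustration of the partition step used by tcp_lro_sort(). */
#include <inttypes.h>
#include <stdio.h>

/* Isolate the most significant set bit (the portable tcp_lro_msb_64() path). */
static uint64_t
msb64(uint64_t x)
{
        x |= (x >> 1);
        x |= (x >> 2);
        x |= (x >> 4);
        x |= (x >> 8);
        x |= (x >> 16);
        x |= (x >> 32);
        return (x & ~(x >> 1));
}

static void
radix_sort64(uint64_t *a, uint32_t size)
{
        uint64_t ones, zeros, bit, tmp;
        uint32_t x, y;

        while (size > 1) {
                /* Find the key bits that actually differ across the array. */
                ones = zeros = 0;
                for (x = 0; x != size; x++) {
                        ones |= a[x];
                        zeros |= ~a[x];
                }
                ones &= zeros;
                if (ones == 0)
                        return;         /* all keys are equal */
                bit = msb64(ones);

                /* Partition: keys with the chosen bit cleared go first. */
                for (x = y = 0; y != size; y++) {
                        if (a[y] & bit)
                                continue;
                        tmp = a[x];
                        a[x] = a[y];
                        a[y] = tmp;
                        x++;
                }

                radix_sort64(a, x);     /* recurse into the "zero" half */
                a += x;                 /* iterate on the "one" half */
                size -= x;
        }
}

int
main(void)
{
        uint64_t keys[] = { 9, 2, 7, 2, 40, 1, 33, 5 };
        uint32_t i, n = sizeof(keys) / sizeof(keys[0]);

        radix_sort64(keys, n);
        for (i = 0; i != n; i++)
                printf("%" PRIu64 "\n", keys[i]);
        return (0);
}
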
#ifdef INET6
@@ -348,8 +597,8 @@ tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
}
#endif
-int
-tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
+static int
+tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash)
{
struct lro_entry *le;
struct ether_header *eh;
@@ -365,6 +614,8 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
tcp_seq seq;
int error, ip_len, l;
uint16_t eh_type, tcp_data_len;
+ struct lro_head *bucket;
+ int force_flush = 0;
/* We expect a contiguous header [eh, ip, tcp]. */
@@ -431,10 +682,17 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
* Check TCP header constraints.
*/
/* Ensure no bits set besides ACK or PSH. */
- if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0)
- return (TCP_LRO_CANNOT);
+ if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
+ if (th->th_flags & TH_SYN)
+ return (TCP_LRO_CANNOT);
+ /*
+ * Make sure that previously seen segments/ACKs are delivered
+ * before this segment, e.g. FIN.
+ */
+ force_flush = 1;
+ }
- /* XXX-BZ We lose a AKC|PUSH flag concatinating multiple segments. */
+ /* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */
/* XXX-BZ Ideally we'd flush on PUSH? */
/*
@@ -448,8 +706,13 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
ts_ptr = (uint32_t *)(th + 1);
if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
(*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
- TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))))
- return (TCP_LRO_CANNOT);
+ TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
+ /*
+ * Make sure that previously seen segments/ACKs are delivered
+ * before this segment.
+ */
+ force_flush = 1;
+ }
/* If the driver did not pass in the checksum, set it now. */
if (csum == 0x0000)
@@ -457,8 +720,41 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
seq = ntohl(th->th_seq);
+ if (!use_hash) {
+ bucket = &lc->lro_hash[0];
+ } else if (M_HASHTYPE_ISHASH(m)) {
+ bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz];
+ } else {
+ uint32_t hash;
+
+ switch (eh_type) {
+#ifdef INET
+ case ETHERTYPE_IP:
+ hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr;
+ break;
+#endif
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ hash = ip6->ip6_src.s6_addr32[0] +
+ ip6->ip6_dst.s6_addr32[0];
+ hash += ip6->ip6_src.s6_addr32[1] +
+ ip6->ip6_dst.s6_addr32[1];
+ hash += ip6->ip6_src.s6_addr32[2] +
+ ip6->ip6_dst.s6_addr32[2];
+ hash += ip6->ip6_src.s6_addr32[3] +
+ ip6->ip6_dst.s6_addr32[3];
+ break;
+#endif
+ default:
+ hash = 0;
+ break;
+ }
+ hash += th->th_sport + th->th_dport;
+ bucket = &lc->lro_hash[hash % lc->lro_hashsz];
+ }
+
/* Try to find a matching previous segment. */
- SLIST_FOREACH(le, &lc->lro_active, next) {
+ LIST_FOREACH(le, bucket, hash_next) {
if (le->eh_type != eh_type)
continue;
if (le->source_port != th->th_sport ||
@@ -483,9 +779,16 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
#endif
}
+ if (force_flush) {
+ /* Timestamps mismatch; this is a FIN, etc */
+ tcp_lro_active_remove(le);
+ tcp_lro_flush(lc, le);
+ return (TCP_LRO_CANNOT);
+ }
+
/* Flush now if appending will result in overflow. */
- if (le->p_len > (65535 - tcp_data_len)) {
- SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+ if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
+ tcp_lro_active_remove(le);
tcp_lro_flush(lc, le);
break;
}
@@ -494,7 +797,7 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
if (__predict_false(seq != le->next_seq ||
(tcp_data_len == 0 && le->ack_seq == th->th_ack))) {
/* Out of order packet or duplicate ACK. */
- SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+ tcp_lro_active_remove(le);
tcp_lro_flush(lc, le);
return (TCP_LRO_CANNOT);
}
@@ -522,6 +825,14 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
if (tcp_data_len == 0) {
m_freem(m);
+ /*
+ * Flush this LRO entry, if this ACK should not
+ * be further delayed.
+ */
+ if (le->append_cnt >= lc->lro_ackcnt_lim) {
+ tcp_lro_active_remove(le);
+ tcp_lro_flush(lc, le);
+ }
return (0);
}
@@ -533,7 +844,7 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
* append new segment to existing mbuf chain.
*/
m_adj(m, m->m_pkthdr.len - tcp_data_len);
- m->m_flags &= ~M_PKTHDR;
+ m_demote_pkthdr(m);
le->m_tail->m_next = m;
le->m_tail = m_last(m);
@@ -542,22 +853,32 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
* If a possible next full length packet would cause an
* overflow, pro-actively flush now.
*/
- if (le->p_len > (65535 - lc->ifp->if_mtu)) {
- SLIST_REMOVE(&lc->lro_active, le, lro_entry, next);
+ if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) {
+ tcp_lro_active_remove(le);
tcp_lro_flush(lc, le);
- }
+ } else
+ getmicrotime(&le->mtime);
return (0);
}
- /* Try to find an empty slot. */
- if (SLIST_EMPTY(&lc->lro_free))
+ if (force_flush) {
+ /*
+ * Nothing to flush, but this segment can not be further
+ * aggregated/delayed.
+ */
return (TCP_LRO_CANNOT);
+ }
+
+ /* Try to find an empty slot. */
+ if (LIST_EMPTY(&lc->lro_free))
+ return (TCP_LRO_NO_ENTRIES);
/* Start a new segment chain. */
- le = SLIST_FIRST(&lc->lro_free);
- SLIST_REMOVE_HEAD(&lc->lro_free, next);
- SLIST_INSERT_HEAD(&lc->lro_active, le, next);
+ le = LIST_FIRST(&lc->lro_free);
+ LIST_REMOVE(le, next);
+ tcp_lro_active_insert(lc, bucket, le);
+ getmicrotime(&le->mtime);
/* Start filling in details. */
switch (eh_type) {
@@ -614,4 +935,47 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
return (0);
}
+int
+tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
+{
+
+ return tcp_lro_rx2(lc, m, csum, 1);
+}
+
+void
+tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
+{
+ /* sanity checks */
+ if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
+ lc->lro_mbuf_max == 0)) {
+ /* packet drop */
+ m_freem(mb);
+ return;
+ }
+
+ /* check if packet is not LRO capable */
+ if (__predict_false(mb->m_pkthdr.csum_flags == 0 ||
+ (lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
+ lc->lro_flushed++;
+ lc->lro_queued++;
+
+ /* input packet to network layer */
+ (*lc->ifp->if_input) (lc->ifp, mb);
+ return;
+ }
+
+ /* check if array is full */
+ if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max))
+ tcp_lro_flush_all(lc);
+
+ /* create sequence number */
+ lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
+ (((uint64_t)M_HASHTYPE_GET(mb)) << 56) |
+ (((uint64_t)mb->m_pkthdr.flowid) << 24) |
+ ((uint64_t)lc->lro_mbuf_count);
+
+ /* enter mbuf */
+ lc->lro_mbuf_data[lc->lro_mbuf_count++].mb = mb;
+}
+
/* end */
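
For reference, the 64-bit key built in tcp_lro_queue_mbuf() packs the mbuf
hash type into bits 63..56, the 32-bit flowid into bits 55..24, and the array
index into bits 23..0, so the mask (-1ULL << 24) used in tcp_lro_flush_all()
yields a per-stream identifier while the low bits keep each stream's packets
in arrival order. A small sketch of the same packing (the helper name is
invented, and the explicit index mask stands in for the real code's implicit
assumption that lro_mbuf_max stays well below 2^24):

#include <inttypes.h>
#include <stdio.h>

static uint64_t
lro_sort_key(uint8_t hash_type, uint32_t flowid, uint32_t index)
{
        /* bits 63..56: hash type, bits 55..24: flowid, bits 23..0: index */
        return (((uint64_t)hash_type << 56) |
            ((uint64_t)flowid << 24) |
            ((uint64_t)index & 0xffffffULL));
}

int
main(void)
{
        uint64_t key = lro_sort_key(2 /* arbitrary hash type */, 0xdeadbeef, 5);
        uint64_t stream = key & (-1ULL << 24); /* same mask as tcp_lro_flush_all() */

        printf("key    = 0x%016" PRIx64 "\n", key);    /* 0x02deadbeef000005 */
        printf("stream = 0x%016" PRIx64 "\n", stream); /* 0x02deadbeef000000 */
        return (0);
}
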