diff options
Diffstat (limited to 'freebsd/sys/netinet/ip_reass.c')
-rw-r--r-- | freebsd/sys/netinet/ip_reass.c | 203 |
1 files changed, 157 insertions, 46 deletions
diff --git a/freebsd/sys/netinet/ip_reass.c b/freebsd/sys/netinet/ip_reass.c index 64660228..95603390 100644 --- a/freebsd/sys/netinet/ip_reass.c +++ b/freebsd/sys/netinet/ip_reass.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include <sys/hash.h> #include <sys/mbuf.h> #include <sys/malloc.h> +#include <sys/limits.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/sysctl.h> @@ -65,18 +66,19 @@ SYSCTL_DECL(_net_inet_ip); /* * Reassembly headers are stored in hash buckets. */ -#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH_LOG2 10 #define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) #define IPREASS_HMASK (IPREASS_NHASH - 1) struct ipqbucket { TAILQ_HEAD(ipqhead, ipq) head; struct mtx lock; + int count; }; -static VNET_DEFINE(struct ipqbucket, ipq[IPREASS_NHASH]); +VNET_DEFINE_STATIC(struct ipqbucket, ipq[IPREASS_NHASH]); #define V_ipq VNET(ipq) -static VNET_DEFINE(uint32_t, ipq_hashseed); +VNET_DEFINE_STATIC(uint32_t, ipq_hashseed); #define V_ipq_hashseed VNET(ipq_hashseed) #define IPQ_LOCK(i) mtx_lock(&V_ipq[i].lock) @@ -84,6 +86,9 @@ static VNET_DEFINE(uint32_t, ipq_hashseed); #define IPQ_UNLOCK(i) mtx_unlock(&V_ipq[i].lock) #define IPQ_LOCK_ASSERT(i) mtx_assert(&V_ipq[i].lock, MA_OWNED) +VNET_DEFINE_STATIC(int, ipreass_maxbucketsize); +#define V_ipreass_maxbucketsize VNET(ipreass_maxbucketsize) + void ipreass_init(void); void ipreass_drain(void); void ipreass_slowtimo(void); @@ -91,28 +96,54 @@ void ipreass_slowtimo(void); void ipreass_destroy(void); #endif static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS); +static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS); static void ipreass_zone_change(void *); static void ipreass_drain_tomax(void); -static void ipq_free(struct ipqhead *, struct ipq *); +static void ipq_free(struct ipqbucket *, struct ipq *); static struct ipq * ipq_reuse(int); static inline void -ipq_timeout(struct ipqhead *head, struct ipq *fp) +ipq_timeout(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); - ipq_free(head, fp); + ipq_free(bucket, fp); } static inline void -ipq_drop(struct ipqhead *head, struct ipq *fp) +ipq_drop(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); - ipq_free(head, fp); + ipq_free(bucket, fp); } -static VNET_DEFINE(uma_zone_t, ipq_zone); +/* + * By default, limit the number of IP fragments across all reassembly + * queues to 1/32 of the total number of mbuf clusters. + * + * Limit the total number of reassembly queues per VNET to the + * IP fragment limit, but ensure the limit will not allow any bucket + * to grow above 100 items. (The bucket limit is + * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct + * multiplier to reach a 100-item limit.) + * The 100-item limit was chosen as brief testing seems to show that + * this produces "reasonable" performance on some subset of systems + * under DoS attack. + */ +#define IP_MAXFRAGS (nmbclusters / 32) +#define IP_MAXFRAGPACKETS (imin(IP_MAXFRAGS, IPREASS_NHASH * 50)) + +static int maxfrags; +static volatile u_int nfrags; +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, + &maxfrags, 0, + "Maximum number of IPv4 fragments allowed across all reassembly queues"); +SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD, + __DEVOLATILE(u_int *, &nfrags), 0, + "Current number of IPv4 fragments across all reassembly queues"); + +VNET_DEFINE_STATIC(uma_zone_t, ipq_zone); #define V_ipq_zone VNET(ipq_zone) SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_maxfragpackets, "I", @@ -121,14 +152,18 @@ SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET, &VNET_NAME(ipq_zone), "Current number of IPv4 fragment reassembly queue entries"); -static VNET_DEFINE(int, noreass); +VNET_DEFINE_STATIC(int, noreass); #define V_noreass VNET(noreass) -static VNET_DEFINE(int, maxfragsperpacket); +VNET_DEFINE_STATIC(int, maxfragsperpacket); #define V_maxfragsperpacket VNET(maxfragsperpacket) SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(maxfragsperpacket), 0, "Maximum number of IPv4 fragments allowed per packet"); +SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, + sysctl_maxfragbucketsize, "I", + "Maximum number of IPv4 fragment reassembly queue entries per bucket"); /* * Take incoming datagram fragment and try to reassemble it into @@ -148,9 +183,9 @@ ip_reass(struct mbuf *m) struct mbuf *p, *q, *nq, *t; struct ipq *fp; struct ipqhead *head; - int i, hlen, next; + int i, hlen, next, tmpmax; u_int8_t ecn, ecn0; - uint32_t hash; + uint32_t hash, hashkey[3]; #ifdef RSS uint32_t rss_hash, rss_type; #endif @@ -158,8 +193,12 @@ ip_reass(struct mbuf *m) /* * If no reassembling or maxfragsperpacket are 0, * never accept fragments. + * Also, drop packet if it would exceed the maximum + * number of fragments. */ - if (V_noreass == 1 || V_maxfragsperpacket == 0) { + tmpmax = maxfrags; + if (V_noreass == 1 || V_maxfragsperpacket == 0 || + (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) { IPSTAT_INC(ips_fragments); IPSTAT_INC(ips_fragdropped); m_freem(m); @@ -204,8 +243,12 @@ ip_reass(struct mbuf *m) m->m_data += hlen; m->m_len -= hlen; - hash = ip->ip_src.s_addr ^ ip->ip_id; - hash = jenkins_hash32(&hash, 1, V_ipq_hashseed) & IPREASS_HMASK; + hashkey[0] = ip->ip_src.s_addr; + hashkey[1] = ip->ip_dst.s_addr; + hashkey[2] = (uint32_t)ip->ip_p << 16; + hashkey[2] += ip->ip_id; + hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed); + hash &= IPREASS_HMASK; head = &V_ipq[hash].head; IPQ_LOCK(hash); @@ -226,9 +269,12 @@ ip_reass(struct mbuf *m) * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { - fp = uma_zalloc(V_ipq_zone, M_NOWAIT); + if (V_ipq[hash].count < V_ipreass_maxbucketsize) + fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) fp = ipq_reuse(hash); + if (fp == NULL) + goto dropfrag; #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { uma_zfree(V_ipq_zone, fp); @@ -238,7 +284,9 @@ ip_reass(struct mbuf *m) mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); + V_ipq[hash].count++; fp->ipq_nfrags = 1; + atomic_add_int(&nfrags, 1); fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; @@ -249,6 +297,7 @@ ip_reass(struct mbuf *m) goto done; } else { fp->ipq_nfrags++; + atomic_add_int(&nfrags, 1); #ifdef MAC mac_ipq_update(m, fp); #endif @@ -325,6 +374,7 @@ ip_reass(struct mbuf *m) m->m_nextpkt = nq; IPSTAT_INC(ips_fragdropped); fp->ipq_nfrags--; + atomic_subtract_int(&nfrags, 1); m_freem(q); } @@ -342,7 +392,7 @@ ip_reass(struct mbuf *m) for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (ntohs(GETIP(q)->ip_off) != next) { if (fp->ipq_nfrags > V_maxfragsperpacket) - ipq_drop(head, fp); + ipq_drop(&V_ipq[hash], fp); goto done; } next += ntohs(GETIP(q)->ip_len); @@ -350,7 +400,7 @@ ip_reass(struct mbuf *m) /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_IP_FRAG) { if (fp->ipq_nfrags > V_maxfragsperpacket) - ipq_drop(head, fp); + ipq_drop(&V_ipq[hash], fp); goto done; } @@ -361,7 +411,7 @@ ip_reass(struct mbuf *m) ip = GETIP(q); if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { IPSTAT_INC(ips_toolong); - ipq_drop(head, fp); + ipq_drop(&V_ipq[hash], fp); goto done; } @@ -390,6 +440,7 @@ ip_reass(struct mbuf *m) while (m->m_pkthdr.csum_data & 0xffff0000) m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); + atomic_subtract_int(&nfrags, fp->ipq_nfrags); #ifdef MAC mac_ipq_reassemble(fp, m); mac_ipq_destroy(fp); @@ -404,6 +455,7 @@ ip_reass(struct mbuf *m) ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); + V_ipq[hash].count--; uma_zfree(V_ipq_zone, fp); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); @@ -449,8 +501,10 @@ ip_reass(struct mbuf *m) dropfrag: IPSTAT_INC(ips_fragdropped); - if (fp != NULL) + if (fp != NULL) { fp->ipq_nfrags--; + atomic_subtract_int(&nfrags, 1); + } m_freem(m); done: IPQ_UNLOCK(hash); @@ -465,21 +519,27 @@ done: void ipreass_init(void) { + int max; for (int i = 0; i < IPREASS_NHASH; i++) { TAILQ_INIT(&V_ipq[i].head); mtx_init(&V_ipq[i].lock, "IP reassembly", NULL, MTX_DEF | MTX_DUPOK); + V_ipq[i].count = 0; } V_ipq_hashseed = arc4random(); V_maxfragsperpacket = 16; V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - uma_zone_set_max(V_ipq_zone, nmbclusters / 32); + max = IP_MAXFRAGPACKETS; + max = uma_zone_set_max(V_ipq_zone, max); + V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); - if (IS_DEFAULT_VNET(curvnet)) + if (IS_DEFAULT_VNET(curvnet)) { + maxfrags = IP_MAXFRAGS; EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change, NULL, EVENTHANDLER_PRI_ANY); + } } /* @@ -494,7 +554,7 @@ ipreass_slowtimo(void) IPQ_LOCK(i); TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp) if (--fp->ipq_ttl == 0) - ipq_timeout(&V_ipq[i].head, fp); + ipq_timeout(&V_ipq[i], fp); IPQ_UNLOCK(i); } } @@ -509,7 +569,10 @@ ipreass_drain(void) for (int i = 0; i < IPREASS_NHASH; i++) { IPQ_LOCK(i); while(!TAILQ_EMPTY(&V_ipq[i].head)) - ipq_drop(&V_ipq[i].head, TAILQ_FIRST(&V_ipq[i].head)); + ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head)); + KASSERT(V_ipq[i].count == 0, + ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i, + V_ipq[i].count, V_ipq)); IPQ_UNLOCK(i); } } @@ -537,9 +600,23 @@ ipreass_destroy(void) static void ipreass_drain_tomax(void) { + struct ipq *fp; int target; /* + * Make sure each bucket is under the new limit. If + * necessary, drop enough of the oldest elements from + * each bucket to get under the new limit. + */ + for (int i = 0; i < IPREASS_NHASH; i++) { + IPQ_LOCK(i); + while (V_ipq[i].count > V_ipreass_maxbucketsize && + (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL) + ipq_timeout(&V_ipq[i], fp); + IPQ_UNLOCK(i); + } + + /* * If we are over the maximum number of fragments, * drain off enough to get down to the new limit, * stripping off last elements on queues. Every @@ -547,13 +624,11 @@ ipreass_drain_tomax(void) */ target = uma_zone_get_max(V_ipq_zone); while (uma_zone_get_cur(V_ipq_zone) > target) { - struct ipq *fp; - for (int i = 0; i < IPREASS_NHASH; i++) { IPQ_LOCK(i); fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); if (fp != NULL) - ipq_timeout(&V_ipq[i].head, fp); + ipq_timeout(&V_ipq[i], fp); IPQ_UNLOCK(i); } } @@ -562,9 +637,20 @@ ipreass_drain_tomax(void) static void ipreass_zone_change(void *tag) { - - uma_zone_set_max(V_ipq_zone, nmbclusters / 32); - ipreass_drain_tomax(); + VNET_ITERATOR_DECL(vnet_iter); + int max; + + maxfrags = IP_MAXFRAGS; + max = IP_MAXFRAGPACKETS; + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + max = uma_zone_set_max(V_ipq_zone, max); + V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); + ipreass_drain_tomax(); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); } /* @@ -592,6 +678,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS) * and place an extreme upper bound. */ max = uma_zone_set_max(V_ipq_zone, max); + V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); ipreass_drain_tomax(); V_noreass = 0; } else if (max == 0) { @@ -600,6 +687,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS) } else if (max == -1) { V_noreass = 0; uma_zone_set_max(V_ipq_zone, 0); + V_ipreass_maxbucketsize = INT_MAX; } else return (EINVAL); return (0); @@ -613,49 +701,72 @@ static struct ipq * ipq_reuse(int start) { struct ipq *fp; - int i; + int bucket, i; IPQ_LOCK_ASSERT(start); - for (i = start;; i++) { - if (i == IPREASS_NHASH) - i = 0; - if (i != start && IPQ_TRYLOCK(i) == 0) + for (i = 0; i < IPREASS_NHASH; i++) { + bucket = (start + i) % IPREASS_NHASH; + if (bucket != start && IPQ_TRYLOCK(bucket) == 0) continue; - fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); + fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead); if (fp) { struct mbuf *m; IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); + atomic_subtract_int(&nfrags, fp->ipq_nfrags); while (fp->ipq_frags) { m = fp->ipq_frags; fp->ipq_frags = m->m_nextpkt; m_freem(m); } - TAILQ_REMOVE(&V_ipq[i].head, fp, ipq_list); - if (i != start) - IPQ_UNLOCK(i); - IPQ_LOCK_ASSERT(start); - return (fp); + TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list); + V_ipq[bucket].count--; + if (bucket != start) + IPQ_UNLOCK(bucket); + break; } - if (i != start) - IPQ_UNLOCK(i); + if (bucket != start) + IPQ_UNLOCK(bucket); } + IPQ_LOCK_ASSERT(start); + return (fp); } /* * Free a fragment reassembly header and all associated datagrams. */ static void -ipq_free(struct ipqhead *fhp, struct ipq *fp) +ipq_free(struct ipqbucket *bucket, struct ipq *fp) { struct mbuf *q; + atomic_subtract_int(&nfrags, fp->ipq_nfrags); while (fp->ipq_frags) { q = fp->ipq_frags; fp->ipq_frags = q->m_nextpkt; m_freem(q); } - TAILQ_REMOVE(fhp, fp, ipq_list); + TAILQ_REMOVE(&bucket->head, fp, ipq_list); + bucket->count--; uma_zfree(V_ipq_zone, fp); } + +/* + * Get or set the maximum number of reassembly queues per bucket. + */ +static int +sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS) +{ + int error, max; + + max = V_ipreass_maxbucketsize; + error = sysctl_handle_int(oidp, &max, 0, req); + if (error || !req->newptr) + return (error); + if (max <= 0) + return (EINVAL); + V_ipreass_maxbucketsize = max; + ipreass_drain_tomax(); + return (0); +} |