author     Sebastian Huber <sebastian.huber@embedded-brains.de>  2018-08-09 13:04:41 +0200
committer  Sebastian Huber <sebastian.huber@embedded-brains.de>  2018-09-21 10:29:37 +0200
commit     e4a8065910cd6b2e7e0448cc6431ca2906322389 (patch)
tree       73492991cfa40f994c20d761d476e6bc16304536 /freebsd/sys/kern
parent     Update to FreeBSD head 2017-08-01 (diff)
download   rtems-libbsd-e4a8065910cd6b2e7e0448cc6431ca2906322389.tar.bz2
Update to FreeBSD head 2017-10-01
Git mirror commit b2f0376b45428f13151d229c5ae9d4d8f74acbd1. Update #3472.
Diffstat (limited to 'freebsd/sys/kern')
-rw-r--r--  freebsd/sys/kern/kern_sysctl.c  | 100
-rw-r--r--  freebsd/sys/kern/subr_blist.c   | 547
-rw-r--r--  freebsd/sys/kern/subr_sbuf.c    |  24
-rw-r--r--  freebsd/sys/kern/sys_socket.c   |   4
-rw-r--r--  freebsd/sys/kern/uipc_sockbuf.c |   2
-rw-r--r--  freebsd/sys/kern/uipc_socket.c  |  52
-rw-r--r--  freebsd/sys/kern/uipc_usrreq.c  |  17
7 files changed, 568 insertions(+), 178 deletions(-)
diff --git a/freebsd/sys/kern/kern_sysctl.c b/freebsd/sys/kern/kern_sysctl.c
index 90b15444..e99a8bd6 100644
--- a/freebsd/sys/kern/kern_sysctl.c
+++ b/freebsd/sys/kern/kern_sysctl.c
@@ -90,7 +90,7 @@ static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");
* sysctl requests larger than a single page via an exclusive lock.
*/
static struct rmlock sysctllock;
-static struct sx sysctlmemlock;
+static struct sx __exclusive_cache_line sysctlmemlock;
#define SYSCTL_WLOCK() rm_wlock(&sysctllock)
#define SYSCTL_WUNLOCK() rm_wunlock(&sysctllock)
@@ -327,6 +327,91 @@ sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp)
}
#endif /* __rtems__ */
+static int
+sbuf_printf_drain(void *arg __unused, const char *data, int len)
+{
+
+ return (printf("%.*s", len, data));
+}
+
+/*
+ * Locate the path to a given oid. Returns the length of the resulting path,
+ * or -1 if the oid was not found. nodes must have room for CTL_MAXNAME
+ * elements and be NULL-initialized.
+ */
+static int
+sysctl_search_oid(struct sysctl_oid **nodes, struct sysctl_oid *needle)
+{
+ int indx;
+
+ SYSCTL_ASSERT_LOCKED();
+ indx = 0;
+ while (indx < CTL_MAXNAME && indx >= 0) {
+ if (nodes[indx] == NULL && indx == 0)
+ nodes[indx] = SLIST_FIRST(&sysctl__children);
+ else if (nodes[indx] == NULL)
+ nodes[indx] = SLIST_FIRST(&nodes[indx - 1]->oid_children);
+ else
+ nodes[indx] = SLIST_NEXT(nodes[indx], oid_link);
+
+ if (nodes[indx] == needle)
+ return (indx + 1);
+
+ if (nodes[indx] == NULL) {
+ indx--;
+ continue;
+ }
+
+ if ((nodes[indx]->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ indx++;
+ continue;
+ }
+ }
+ return (-1);
+}
+
+static void
+sysctl_warn_reuse(const char *func, struct sysctl_oid *leaf)
+{
+ struct sysctl_oid *nodes[CTL_MAXNAME];
+ char buf[128];
+ struct sbuf sb;
+ int rc, i;
+
+ (void)sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN | SBUF_INCLUDENUL);
+ sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
+
+ sbuf_printf(&sb, "%s: can't re-use a leaf (", __func__);
+
+ memset(nodes, 0, sizeof(nodes));
+ rc = sysctl_search_oid(nodes, leaf);
+ if (rc > 0) {
+ for (i = 0; i < rc; i++)
+ sbuf_printf(&sb, "%s%.*s", nodes[i]->oid_name,
+ i != (rc - 1), ".");
+ } else {
+ sbuf_printf(&sb, "%s", leaf->oid_name);
+ }
+ sbuf_printf(&sb, ")!\n");
+
+ (void)sbuf_finish(&sb);
+}
+
+#ifdef SYSCTL_DEBUG
+static int
+sysctl_reuse_test(SYSCTL_HANDLER_ARGS)
+{
+ struct rm_priotracker tracker;
+
+ SYSCTL_RLOCK(&tracker);
+ sysctl_warn_reuse(__func__, oidp);
+ SYSCTL_RUNLOCK(&tracker);
+ return (0);
+}
+SYSCTL_PROC(_sysctl, 0, reuse_test, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE,
+ 0, 0, sysctl_reuse_test, "-", "");
+#endif
+
void
sysctl_register_oid(struct sysctl_oid *oidp)
{
@@ -347,7 +432,7 @@ sysctl_register_oid(struct sysctl_oid *oidp)
p->oid_refcnt++;
return;
} else {
- printf("can't re-use a leaf (%s)!\n", p->oid_name);
+ sysctl_warn_reuse(__func__, p);
return;
}
}
@@ -721,8 +806,8 @@ sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent,
SYSCTL_WUNLOCK();
return (oidp);
} else {
+ sysctl_warn_reuse(__func__, oidp);
SYSCTL_WUNLOCK();
- printf("can't re-use a leaf (%s)!\n", name);
return (NULL);
}
}
@@ -2005,16 +2090,9 @@ userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
}
}
req.validlen = req.oldlen;
-
- if (old) {
- if (!useracc(old, req.oldlen, VM_PROT_WRITE))
- return (EFAULT);
- req.oldptr= old;
- }
+ req.oldptr = old;
if (new != NULL) {
- if (!useracc(new, newlen, VM_PROT_READ))
- return (EFAULT);
req.newlen = newlen;
req.newptr = new;
}
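
The sysctl_warn_reuse() machinery above is built on the sbuf drain pattern:
output accumulates in a small fixed buffer, and the drain callback flushes it
whenever the buffer fills or sbuf_finish() is called. Below is a minimal
sketch of the same pattern, using only sbuf(9) calls that appear in this diff
(console_drain and warn_example are illustrative names):

    static int
    console_drain(void *arg __unused, const char *data, int len)
    {

            /* Forward len bytes to the console; report how many were consumed. */
            return (printf("%.*s", len, data));
    }

    static void
    warn_example(void)
    {
            struct sbuf sb;
            char buf[128];          /* small on-stack staging buffer */

            (void)sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN | SBUF_INCLUDENUL);
            sbuf_set_drain(&sb, console_drain, NULL);
            sbuf_printf(&sb, "hello from a drained sbuf\n");
            (void)sbuf_finish(&sb);	/* flushes anything still buffered */
    }

Because the drain bounds memory use, the warning path can format an oid path
of arbitrary depth through a 128-byte stack buffer.
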
diff --git a/freebsd/sys/kern/subr_blist.c b/freebsd/sys/kern/subr_blist.c
index c8e32c5b..7f232e93 100644
--- a/freebsd/sys/kern/subr_blist.c
+++ b/freebsd/sys/kern/subr_blist.c
@@ -34,14 +34,17 @@
* try to interpret the meaning of a 'block' other than to return
* SWAPBLK_NONE on an allocation failure.
*
- * A radix tree is used to maintain the bitmap. Two radix constants are
- * involved: One for the bitmaps contained in the leaf nodes (typically
- * 64), and one for the meta nodes (typically 16). Both meta and leaf
- * nodes have a hint field. This field gives us a hint as to the largest
- * free contiguous range of blocks under the node. It may contain a
- * value that is too high, but will never contain a value that is too
- * low. When the radix tree is searched, allocation failures in subtrees
- * update the hint.
+ * A radix tree controls access to pieces of the bitmap, and includes
+ * auxiliary information at each interior node about the availability of
+ * contiguous free blocks in the subtree rooted at that node. Two radix
+ * constants are involved: one for the size of the bitmaps contained in the
+ * leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
+ * each of the meta (interior) nodes (BLIST_META_RADIX). Each subtree is
+ * associated with a range of blocks. The root of any subtree stores a
+ * hint field that defines an upper bound on the size of the largest
+ * allocation that can begin in the associated block range. A hint is an
+ * upper bound on a potential allocation, but not necessarily a tight upper
+ * bound.
*
* The radix tree also implements two collapsed states for meta nodes:
* the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
@@ -92,6 +95,7 @@ __FBSDID("$FreeBSD$");
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
+#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>
@@ -103,6 +107,7 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/malloc.h>
+#include <sys/sbuf.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
@@ -112,6 +117,7 @@ __FBSDID("$FreeBSD$");
#define bitcount64(x) __bitcount64((uint64_t)(x))
#define malloc(a,b,c) calloc(a, 1)
#define free(a,b) free(a)
+static __inline int imax(int a, int b) { return (a > b ? a : b); }
#include <sys/blist.h>
@@ -122,29 +128,84 @@ void panic(const char *ctl, ...);
/*
* static support functions
*/
-static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count,
- daddr_t cursor);
-static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count,
- daddr_t radix, daddr_t skip, daddr_t cursor);
+static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
+static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
+ u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
- daddr_t radix, daddr_t skip, daddr_t blk);
+ u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
- daddr_t skip, blist_t dest, daddr_t count);
+ blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
- daddr_t radix, daddr_t skip, daddr_t blk);
-static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip,
- daddr_t count);
+ u_daddr_t radix);
+static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count);
#ifndef _KERNEL
static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
- daddr_t skip, int tab);
+ int tab);
#endif
#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif
+_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
+ "radix divisibility error");
+#define BLIST_BMAP_MASK (BLIST_BMAP_RADIX - 1)
+#define BLIST_META_MASK (BLIST_META_RADIX - 1)
+
+/*
+ * For a subtree that can represent the state of up to 'radix' blocks, the
+ * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX. If 'm'
+ * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
+ * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
+ * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip'
+ * in the 'meta' functions that process subtrees. Since integer division
+ * discards remainders, we can express this computation as
+ * skip = (m * m**h) / (m - 1)
+ * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
+ * and since m divides BLIST_BMAP_RADIX, we can simplify further to
+ * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
+ * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
+ * so that simple integer division by a constant can safely be used for the
+ * calculation.
+ */
+static inline daddr_t
+radix_to_skip(daddr_t radix)
+{
+
+ return (radix /
+ ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
+}
+
+/*
+ * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
+ * Assumes that the argument has only one bit set.
+ */
+static inline int
+bitpos(u_daddr_t mask)
+{
+ int hi, lo, mid;
+
+ switch (sizeof(mask)) {
+#ifdef HAVE_INLINE_FFSLL
+ case sizeof(long long):
+ return (ffsll(mask) - 1);
+#endif
+ default:
+ lo = 0;
+ hi = BLIST_BMAP_RADIX;
+ while (lo + 1 < hi) {
+ mid = (lo + hi) >> 1;
+ if ((mask >> mid) != 0)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ return (lo);
+ }
+}
+
/*
* blist_create() - create a blist capable of handling up to the specified
* number of blocks
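
As a sanity check on the closed form above, the following self-contained
program compares radix_to_skip()'s formula against the series it was derived
from, assuming the typical radix constants (64-bit leaf bitmaps, 16-way meta
nodes). For example, radix 1024 is one meta node over 16 leaves, so
skip = 1 + 16 = 17 = 1024 / ((64 / 16) * 15):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BMAP_RADIX 64           /* assumed BLIST_BMAP_RADIX */
    #define META_RADIX 16           /* assumed BLIST_META_RADIX */

    /* Same closed form as radix_to_skip(). */
    static int64_t
    skip_closed(int64_t radix)
    {
            return (radix / ((BMAP_RADIX / META_RADIX) * (META_RADIX - 1)));
    }

    /* The series 1 + m + m^2 + ... + m^h, where m^h = radix / BMAP_RADIX. */
    static int64_t
    skip_series(int64_t radix)
    {
            int64_t leaves = radix / BMAP_RADIX, sum = 0, term = 1;

            while (term <= leaves) {
                    sum += term;
                    term *= META_RADIX;
            }
            return (sum);
    }

    int
    main(void)
    {
            for (int64_t radix = BMAP_RADIX; radix <= (int64_t)1 << 40;
                radix *= META_RADIX)
                    assert(skip_closed(radix) == skip_series(radix));
            printf("skip(1024) = %jd\n", (intmax_t)skip_closed(1024));
            return (0);
    }
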
@@ -159,18 +220,16 @@ blist_t
blist_create(daddr_t blocks, int flags)
{
blist_t bl;
- daddr_t nodes, radix, skip;
+ daddr_t nodes, radix;
/*
- * Calculate radix and skip field used for scanning.
+ * Calculate the radix field used for scanning.
*/
radix = BLIST_BMAP_RADIX;
- skip = 0;
while (radix < blocks) {
radix *= BLIST_META_RADIX;
- skip = (skip + 1) * BLIST_META_RADIX;
}
- nodes = 1 + blst_radix_init(NULL, radix, skip, blocks);
+ nodes = 1 + blst_radix_init(NULL, radix, blocks);
bl = malloc(sizeof(struct blist), M_SWAP, flags);
if (bl == NULL)
@@ -178,14 +237,13 @@ blist_create(daddr_t blocks, int flags)
bl->bl_blocks = blocks;
bl->bl_radix = radix;
- bl->bl_skip = skip;
bl->bl_cursor = 0;
bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
if (bl->bl_root == NULL) {
free(bl, M_SWAP);
return (NULL);
}
- blst_radix_init(bl->bl_root, radix, skip, blocks);
+ blst_radix_init(bl->bl_root, radix, blocks);
#if defined(BLIST_DEBUG)
printf(
@@ -226,10 +284,12 @@ blist_alloc(blist_t bl, daddr_t count)
* reduce the hint, stopping further iterations.
*/
while (count <= bl->bl_root->bm_bighint) {
- blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix,
- bl->bl_skip, bl->bl_cursor);
+ blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
+ bl->bl_radix);
if (blk != SWAPBLK_NONE) {
bl->bl_cursor = blk + count;
+ if (bl->bl_cursor == bl->bl_blocks)
+ bl->bl_cursor = 0;
return (blk);
} else if (bl->bl_cursor != 0)
bl->bl_cursor = 0;
@@ -259,7 +319,7 @@ void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{
- blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
+ blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
}
/*
@@ -272,8 +332,7 @@ daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
- return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix,
- bl->bl_skip, 0));
+ return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
}
/*
@@ -292,7 +351,7 @@ blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
*pbl = newbl;
if (count > save->bl_blocks)
count = save->bl_blocks;
- blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);
+ blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
/*
* If resizing upwards, should we free the new space or not?
@@ -311,13 +370,199 @@ blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
void
blist_print(blist_t bl)
{
- printf("BLIST {\n");
- blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
+ printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
+ blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
printf("}\n");
}
#endif
+static const u_daddr_t fib[] = {
+ 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
+ 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
+ 514229, 832040, 1346269, 2178309, 3524578,
+};
+
+/*
+ * Use 'gap' to describe a maximal range of unallocated blocks/bits.
+ */
+struct gap_stats {
+ daddr_t start; /* current gap start, or SWAPBLK_NONE */
+ daddr_t num; /* number of gaps observed */
+ daddr_t max; /* largest gap size */
+ daddr_t avg; /* average gap size */
+ daddr_t err; /* sum - num * avg */
+ daddr_t histo[nitems(fib)]; /* # gaps in each size range */
+ int max_bucket; /* last histo elt with nonzero val */
+};
+
+/*
+ * gap_stats_counting() - is the state 'counting 1 bits'?
+ * or 'skipping 0 bits'?
+ */
+static inline bool
+gap_stats_counting(const struct gap_stats *stats)
+{
+
+ return (stats->start != SWAPBLK_NONE);
+}
+
+/*
+ * init_gap_stats() - initialize stats on gap sizes
+ */
+static inline void
+init_gap_stats(struct gap_stats *stats)
+{
+
+ bzero(stats, sizeof(*stats));
+ stats->start = SWAPBLK_NONE;
+}
+
+/*
+ * update_gap_stats() - update stats on gap sizes
+ */
+static void
+update_gap_stats(struct gap_stats *stats, daddr_t posn)
+{
+ daddr_t size;
+ int hi, lo, mid;
+
+ if (!gap_stats_counting(stats)) {
+ stats->start = posn;
+ return;
+ }
+ size = posn - stats->start;
+ stats->start = SWAPBLK_NONE;
+ if (size > stats->max)
+ stats->max = size;
+
+ /*
+ * Find the Fibonacci range that contains size,
+ * expecting to find it in an early range.
+ */
+ lo = 0;
+ hi = 1;
+ while (hi < nitems(fib) && fib[hi] <= size) {
+ lo = hi;
+ hi *= 2;
+ }
+ if (hi >= nitems(fib))
+ hi = nitems(fib);
+ while (lo + 1 != hi) {
+ mid = (lo + hi) >> 1;
+ if (fib[mid] <= size)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ stats->histo[lo]++;
+ if (lo > stats->max_bucket)
+ stats->max_bucket = lo;
+ stats->err += size - stats->avg;
+ stats->num++;
+ stats->avg += stats->err / stats->num;
+ stats->err %= stats->num;
+}
+
+/*
+ * dump_gap_stats() - print stats on gap sizes
+ */
+static inline void
+dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
+{
+ int i;
+
+ sbuf_printf(s, "number of maximal free ranges: %jd\n",
+ (intmax_t)stats->num);
+ sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
+ sbuf_printf(s, "average maximal free range size: %jd\n",
+ (intmax_t)stats->avg);
+ sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
+ sbuf_printf(s, " count | size range\n");
+ sbuf_printf(s, " ----- | ----------\n");
+ for (i = 0; i < stats->max_bucket; i++) {
+ if (stats->histo[i] != 0) {
+ sbuf_printf(s, "%20jd | ",
+ (intmax_t)stats->histo[i]);
+ if (fib[i] != fib[i + 1] - 1)
+ sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
+ (intmax_t)fib[i + 1] - 1);
+ else
+ sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
+ }
+ }
+ sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]);
+ if (stats->histo[i] > 1)
+ sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
+ (intmax_t)stats->max);
+ else
+ sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
+}
+
+/*
+ * blist_stats() - dump radix tree stats
+ */
+void
+blist_stats(blist_t bl, struct sbuf *s)
+{
+ struct gap_stats gstats;
+ struct gap_stats *stats = &gstats;
+ daddr_t i, nodes, radix;
+ u_daddr_t bit, diff, mask;
+
+ init_gap_stats(stats);
+ nodes = 0;
+ i = bl->bl_radix;
+ while (i < bl->bl_radix + bl->bl_blocks) {
+ /*
+ * Find max size subtree starting at i.
+ */
+ radix = BLIST_BMAP_RADIX;
+ while (((i / radix) & BLIST_META_MASK) == 0)
+ radix *= BLIST_META_RADIX;
+
+ /*
+ * Check for skippable subtrees starting at i.
+ */
+ while (radix > BLIST_BMAP_RADIX) {
+ if (bl->bl_root[nodes].u.bmu_avail == 0) {
+ if (gap_stats_counting(stats))
+ update_gap_stats(stats, i);
+ break;
+ }
+ if (bl->bl_root[nodes].u.bmu_avail == radix) {
+ if (!gap_stats_counting(stats))
+ update_gap_stats(stats, i);
+ break;
+ }
+
+ /*
+ * Skip subtree root.
+ */
+ nodes++;
+ radix /= BLIST_META_RADIX;
+ }
+ if (radix == BLIST_BMAP_RADIX) {
+ /*
+ * Scan leaf.
+ */
+ mask = bl->bl_root[nodes].u.bmu_bitmap;
+ diff = mask ^ (mask << 1);
+ if (gap_stats_counting(stats))
+ diff ^= 1;
+ while (diff != 0) {
+ bit = diff & -diff;
+ update_gap_stats(stats, i + bitpos(bit));
+ diff ^= bit;
+ }
+ }
+ nodes += radix_to_skip(radix);
+ i += radix;
+ }
+ update_gap_stats(stats, i);
+ dump_gap_stats(stats, s);
+}
+
/************************************************************************
* ALLOCATION SUPPORT FUNCTIONS *
************************************************************************
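
update_gap_stats() above keeps an exact integer running average without a
(potentially overflowing) running sum: after each update the invariant
sum = num * avg + err holds, with |err| < num. A standalone sketch of the
same update rule, verified against a true sum:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    struct avg {
            int64_t num, avg, err;  /* invariant: sum == num * avg + err */
    };

    /* The same three-step update used by update_gap_stats(). */
    static void
    avg_add(struct avg *a, int64_t size)
    {
            a->err += size - a->avg;
            a->num++;
            a->avg += a->err / a->num;
            a->err %= a->num;
    }

    int
    main(void)
    {
            struct avg a = { 0, 0, 0 };
            int64_t sizes[] = { 5, 13, 2, 8, 21, 1, 34 };
            int64_t sum = 0;

            for (int i = 0; i < 7; i++) {
                    avg_add(&a, sizes[i]);
                    sum += sizes[i];
                    assert(sum == a.num * a.avg + a.err);
            }
            printf("exact truncated mean: %jd\n", (intmax_t)a.avg);
            return (0);
    }
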
@@ -333,36 +578,19 @@ blist_print(blist_t bl)
*
* This is the core of the allocator and is optimized for the
* BLIST_BMAP_RADIX block allocation case. Otherwise, execution
- * time is proportional to log2(count) + log2(BLIST_BMAP_RADIX).
+ * time is proportional to log2(count) + bitpos time.
*/
static daddr_t
-blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, daddr_t cursor)
+blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
u_daddr_t mask;
- int count1, hi, lo, mid, num_shifts, range1, range_ext;
+ int count1, hi, lo, num_shifts, range1, range_ext;
- if (count == BLIST_BMAP_RADIX) {
- /*
- * Optimize allocation of BLIST_BMAP_RADIX bits. If this wasn't
- * a special case, then forming the final value of 'mask' below
- * would require special handling to avoid an invalid left shift
- * when count equals the number of bits in mask.
- */
- if (~scan->u.bmu_bitmap != 0) {
- scan->bm_bighint = BLIST_BMAP_RADIX - 1;
- return (SWAPBLK_NONE);
- }
- if (cursor != blk)
- return (SWAPBLK_NONE);
- scan->u.bmu_bitmap = 0;
- scan->bm_bighint = 0;
- return (blk);
- }
range1 = 0;
count1 = count - 1;
num_shifts = fls(count1);
mask = scan->u.bmu_bitmap;
- while (mask != 0 && num_shifts > 0) {
+ while ((-mask & ~mask) != 0 && num_shifts > 0) {
/*
* If bit i is set in mask, then bits in [i, i+range1] are set
* in scan->u.bmu_bitmap. The value of range1 is equal to
@@ -370,52 +598,95 @@ blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, daddr_t cursor)
* while preserving these invariants. The updates to mask leave
* fewer bits set, but each bit that remains set represents a
* longer string of consecutive bits set in scan->u.bmu_bitmap.
+ * If more updates to mask cannot clear more bits, because mask
+ * is partitioned with all 0 bits preceding all 1 bits, the loop
+ * terminates immediately.
*/
num_shifts--;
range_ext = range1 + ((count1 >> num_shifts) & 1);
- mask &= mask >> range_ext;
+ /*
+ * mask is a signed quantity for the shift because when it is
+ * shifted right, the sign bit should be copied; when the last
+ * block of the leaf is free, pretend, for a while, that all the
+ * blocks that follow it are also free.
+ */
+ mask &= (daddr_t)mask >> range_ext;
range1 += range_ext;
}
if (mask == 0) {
/*
* Update bighint. There is no allocation bigger than range1
- * available in this leaf.
+ * starting in this leaf.
*/
scan->bm_bighint = range1;
return (SWAPBLK_NONE);
}
- /*
- * Discard any candidates that appear before the cursor.
- */
- lo = cursor - blk;
- mask &= ~(u_daddr_t)0 << lo;
-
+ /* Discard any candidates that appear before blk. */
+ mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
if (mask == 0)
return (SWAPBLK_NONE);
/*
* The least significant set bit in mask marks the start of the first
* available range of sufficient size. Clear all the bits but that one,
- * and then perform a binary search to find its position.
+ * and then find its position.
*/
mask &= -mask;
- hi = BLIST_BMAP_RADIX - count1;
- while (lo + 1 < hi) {
- mid = (lo + hi) >> 1;
- if ((mask >> mid) != 0)
- lo = mid;
- else
- hi = mid;
+ lo = bitpos(mask);
+
+ hi = lo + count;
+ if (hi > BLIST_BMAP_RADIX) {
+ /*
+ * An allocation within this leaf is impossible, so a successful
+ * allocation depends on the next leaf providing some of the blocks.
+ */
+ if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
+ /*
+ * The next leaf has a different meta-node parent, so it
+ * is not necessarily initialized. Update bighint,
+ * comparing the range found at the end of mask to the
+ * largest earlier range that could have been made to
+ * vanish in the initial processing of mask.
+ */
+ scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
+ return (SWAPBLK_NONE);
+ }
+ hi -= BLIST_BMAP_RADIX;
+ if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) {
+ /*
+ * The next leaf doesn't have enough free blocks at the
+ * beginning to complete the spanning allocation. The
+ * hint cannot be updated, because the same allocation
+ * request could be satisfied later, by this leaf, if
+ * the state of the next leaf changes, and without any
+ * changes to this leaf.
+ */
+ return (SWAPBLK_NONE);
+ }
+ /* Clear the first 'hi' bits in the next leaf, allocating them. */
+ scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
+ hi = BLIST_BMAP_RADIX;
}
- /*
- * Set in mask exactly the bits being allocated, and clear them from
- * the set of available bits.
- */
- mask = (mask << count) - mask;
+ /* Set the bits of mask at position 'lo' and higher. */
+ mask = -mask;
+ if (hi == BLIST_BMAP_RADIX) {
+ /*
+ * Update bighint. There is no allocation bigger than range1
+ * available in this leaf after this allocation completes.
+ */
+ scan->bm_bighint = range1;
+ } else {
+ /* Clear the bits of mask at position 'hi' and higher. */
+ mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
+ /* If this allocation uses all the bits, clear the hint. */
+ if (mask == scan->u.bmu_bitmap)
+ scan->bm_bighint = 0;
+ }
+ /* Clear the allocated bits from this leaf. */
scan->u.bmu_bitmap &= ~mask;
- return (blk + lo);
+ return ((blk & ~BLIST_BMAP_MASK) + lo);
}
/*
@@ -427,15 +698,14 @@ blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, daddr_t cursor)
* and we have a few optimizations strewn in as well.
*/
static daddr_t
-blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix,
- daddr_t skip, daddr_t cursor)
+blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
- daddr_t i, next_skip, r;
+ daddr_t blk, i, next_skip, r, skip;
int child;
bool scan_from_start;
if (radix == BLIST_BMAP_RADIX)
- return (blst_leaf_alloc(scan, blk, count, cursor));
+ return (blst_leaf_alloc(scan, cursor, count));
if (scan->u.bmu_avail < count) {
/*
* The meta node's hint must be too large if the allocation
@@ -445,6 +715,8 @@ blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix,
scan->bm_bighint = scan->u.bmu_avail;
return (SWAPBLK_NONE);
}
+ blk = cursor & -radix;
+ skip = radix_to_skip(radix);
next_skip = skip / BLIST_META_RADIX;
/*
@@ -458,7 +730,7 @@ blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix,
* Reinitialize each of the meta node's children. An ALL-FREE
* meta node cannot have a terminator in any subtree.
*/
- for (i = 1; i <= skip; i += next_skip) {
+ for (i = 1; i < skip; i += next_skip) {
if (next_skip == 1)
scan[i].u.bmu_bitmap = (u_daddr_t)-1;
else
@@ -479,13 +751,13 @@ blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix,
scan_from_start = cursor == blk;
child = (cursor - blk) / radix;
blk += child * radix;
- for (i = 1 + child * next_skip; i <= skip; i += next_skip) {
+ for (i = 1 + child * next_skip; i < skip; i += next_skip) {
if (count <= scan[i].bm_bighint) {
/*
- * The allocation might fit in the i'th subtree.
+ * The allocation might fit beginning in the i'th subtree.
*/
- r = blst_meta_alloc(&scan[i], blk, count, radix,
- next_skip - 1, cursor > blk ? cursor : blk);
+ r = blst_meta_alloc(&scan[i],
+ cursor > blk ? cursor : blk, count, radix);
if (r != SWAPBLK_NONE) {
scan->u.bmu_avail -= count;
return (r);
@@ -515,22 +787,20 @@ blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix,
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
+ u_daddr_t mask;
+ int n;
+
/*
* free some data in this bitmap
- *
- * e.g.
- * 0000111111111110000
+ * mask=0000111111111110000
* \_________/\__/
- * v n
+ * count n
*/
- int n = blk & (BLIST_BMAP_RADIX - 1);
- u_daddr_t mask;
-
+ n = blk & BLIST_BMAP_MASK;
mask = ((u_daddr_t)-1 << n) &
((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
-
if (scan->u.bmu_bitmap & mask)
- panic("blst_radix_free: freeing free block");
+ panic("freeing free block");
scan->u.bmu_bitmap |= mask;
/*
@@ -553,16 +823,16 @@ blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
* range).
*/
static void
-blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, daddr_t radix,
- daddr_t skip, daddr_t blk)
+blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
- daddr_t i, next_skip, v;
+ daddr_t blk, i, next_skip, skip, v;
int child;
if (scan->bm_bighint == (daddr_t)-1)
panic("freeing invalid range");
if (radix == BLIST_BMAP_RADIX)
return (blst_leaf_free(scan, freeBlk, count));
+ skip = radix_to_skip(radix);
next_skip = skip / BLIST_META_RADIX;
if (scan->u.bmu_avail == 0) {
@@ -574,7 +844,7 @@ blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, daddr_t radix,
scan->bm_bighint = count;
if (count != radix) {
- for (i = 1; i <= skip; i += next_skip) {
+ for (i = 1; i < skip; i += next_skip) {
if (scan[i].bm_bighint == (daddr_t)-1)
break;
scan[i].bm_bighint = 0;
@@ -606,16 +876,17 @@ blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, daddr_t radix,
* Break the free down into its components
*/
+ blk = freeBlk & -radix;
radix /= BLIST_META_RADIX;
child = (freeBlk - blk) / radix;
blk += child * radix;
i = 1 + child * next_skip;
- while (i <= skip && blk < freeBlk + count) {
+ while (i < skip && blk < freeBlk + count) {
v = blk + radix - freeBlk;
if (v > count)
v = count;
- blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
+ blst_meta_free(&scan[i], freeBlk, v, radix);
if (scan->bm_bighint < scan[i].bm_bighint)
scan->bm_bighint = scan[i].bm_bighint;
count -= v;
@@ -632,10 +903,10 @@ blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, daddr_t radix,
* tree. The space may not already be free in the destination.
*/
static void
-blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
- blist_t dest, daddr_t count)
+blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
+ daddr_t count)
{
- daddr_t i, next_skip;
+ daddr_t i, next_skip, skip;
/*
* Leaf node
@@ -679,21 +950,20 @@ blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
}
- radix /= BLIST_META_RADIX;
+ skip = radix_to_skip(radix);
next_skip = skip / BLIST_META_RADIX;
+ radix /= BLIST_META_RADIX;
- for (i = 1; count && i <= skip; i += next_skip) {
+ for (i = 1; count && i < skip; i += next_skip) {
if (scan[i].bm_bighint == (daddr_t)-1)
break;
if (count >= radix) {
- blst_copy(&scan[i], blk, radix, next_skip - 1, dest,
- radix);
+ blst_copy(&scan[i], blk, radix, dest, radix);
count -= radix;
} else {
if (count) {
- blst_copy(&scan[i], blk, radix, next_skip - 1,
- dest, count);
+ blst_copy(&scan[i], blk, radix, dest, count);
}
count = 0;
}
@@ -711,10 +981,11 @@ blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
- int n = blk & (BLIST_BMAP_RADIX - 1);
daddr_t nblks;
u_daddr_t mask;
+ int n;
+ n = blk & BLIST_BMAP_MASK;
mask = ((u_daddr_t)-1 << n) &
((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
@@ -734,10 +1005,9 @@ blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
* number of blocks allocated by the call.
*/
static daddr_t
-blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, daddr_t radix,
- daddr_t skip, daddr_t blk)
+blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
- daddr_t i, nblks, next_skip, v;
+ daddr_t blk, i, nblks, next_skip, skip, v;
int child;
if (scan->bm_bighint == (daddr_t)-1)
@@ -760,7 +1030,9 @@ blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, daddr_t radix,
scan->bm_bighint = 0;
return (nblks);
}
+ skip = radix_to_skip(radix);
next_skip = skip / BLIST_META_RADIX;
+ blk = allocBlk & -radix;
/*
* An ALL-FREE meta node requires special handling before allocating
@@ -773,7 +1045,7 @@ blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, daddr_t radix,
* Reinitialize each of the meta node's children. An ALL-FREE
* meta node cannot have a terminator in any subtree.
*/
- for (i = 1; i <= skip; i += next_skip) {
+ for (i = 1; i < skip; i += next_skip) {
if (next_skip == 1)
scan[i].u.bmu_bitmap = (u_daddr_t)-1;
else
@@ -788,12 +1060,11 @@ blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, daddr_t radix,
child = (allocBlk - blk) / radix;
blk += child * radix;
i = 1 + child * next_skip;
- while (i <= skip && blk < allocBlk + count) {
+ while (i < skip && blk < allocBlk + count) {
v = blk + radix - allocBlk;
if (v > count)
v = count;
- nblks += blst_meta_fill(&scan[i], allocBlk, v, radix,
- next_skip - 1, blk);
+ nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
count -= v;
allocBlk += v;
blk += radix;
@@ -812,9 +1083,9 @@ blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, daddr_t radix,
* RADIX values we use.
*/
static daddr_t
-blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip, daddr_t count)
+blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count)
{
- daddr_t i, memindex, next_skip;
+ daddr_t i, memindex, next_skip, skip;
memindex = 0;
@@ -841,17 +1112,18 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip, daddr_t count)
scan->u.bmu_avail = 0;
}
- radix /= BLIST_META_RADIX;
+ skip = radix_to_skip(radix);
next_skip = skip / BLIST_META_RADIX;
+ radix /= BLIST_META_RADIX;
- for (i = 1; i <= skip; i += next_skip) {
+ for (i = 1; i < skip; i += next_skip) {
if (count >= radix) {
/*
* Allocate the entire object
*/
memindex = i +
blst_radix_init(((scan) ? &scan[i] : NULL), radix,
- next_skip - 1, radix);
+ radix);
count -= radix;
} else if (count > 0) {
/*
@@ -859,14 +1131,18 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip, daddr_t count)
*/
memindex = i +
blst_radix_init(((scan) ? &scan[i] : NULL), radix,
- next_skip - 1, count);
+ count);
count = 0;
} else {
/*
- * Add terminator and break out
+ * Add terminator and break out. Make terminator bitmap
+ * zero to avoid a spanning leaf allocation that
+ * includes the terminator.
*/
- if (scan)
+ if (scan) {
scan[i].bm_bighint = (daddr_t)-1;
+ scan[i].u.bmu_bitmap = 0;
+ }
break;
}
}
@@ -878,10 +1154,9 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip, daddr_t count)
#ifdef BLIST_DEBUG
static void
-blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
- int tab)
+blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
- daddr_t i, next_skip;
+ daddr_t i, next_skip, skip;
if (radix == BLIST_BMAP_RADIX) {
printf(
@@ -922,11 +1197,12 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
(long long)scan->bm_bighint
);
- radix /= BLIST_META_RADIX;
+ skip = radix_to_skip(radix);
next_skip = skip / BLIST_META_RADIX;
+ radix /= BLIST_META_RADIX;
tab += 4;
- for (i = 1; i <= skip; i += next_skip) {
+ for (i = 1; i < skip; i += next_skip) {
if (scan[i].bm_bighint == (daddr_t)-1) {
printf(
"%*.*s(%08llx,%lld): Terminator\n",
@@ -935,7 +1211,7 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
);
break;
}
- blst_radix_print(&scan[i], blk, radix, next_skip - 1, tab);
+ blst_radix_print(&scan[i], blk, radix, tab);
blk += radix;
}
tab -= 4;
@@ -956,6 +1232,7 @@ main(int ac, char **av)
int size = 1024;
int i;
blist_t bl;
+ struct sbuf *s;
for (i = 1; i < ac; ++i) {
const char *ptr = av[i];
@@ -990,6 +1267,13 @@ main(int ac, char **av)
case 'p':
blist_print(bl);
break;
+ case 's':
+ s = sbuf_new_auto();
+ blist_stats(bl, s);
+ sbuf_finish(s);
+ printf("%s", sbuf_data(s));
+ sbuf_delete(s);
+ break;
case 'a':
if (sscanf(buf + 1, "%lld", &count) == 1) {
daddr_t blk = blist_alloc(bl, count);
@@ -1017,6 +1301,7 @@ main(int ac, char **av)
case 'h':
puts(
"p -print\n"
+ "s -stats\n"
"a %d -allocate\n"
"f %x %d -free\n"
"l %x %d -fill\n"
diff --git a/freebsd/sys/kern/subr_sbuf.c b/freebsd/sys/kern/subr_sbuf.c
index 8dd11b07..951cb50c 100644
--- a/freebsd/sys/kern/subr_sbuf.c
+++ b/freebsd/sys/kern/subr_sbuf.c
@@ -75,6 +75,8 @@ static MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers");
#define SBUF_CANEXTEND(s) ((s)->s_flags & SBUF_AUTOEXTEND)
#define SBUF_ISSECTION(s) ((s)->s_flags & SBUF_INSECTION)
#define SBUF_NULINCLUDED(s) ((s)->s_flags & SBUF_INCLUDENUL)
+#define SBUF_ISDRAINTOEOR(s) ((s)->s_flags & SBUF_DRAINTOEOR)
+#define SBUF_DODRAINTOEOR(s) (SBUF_ISSECTION(s) && SBUF_ISDRAINTOEOR(s))
/*
* Set / clear flags
@@ -310,6 +312,7 @@ sbuf_clear(struct sbuf *s)
SBUF_CLEARFLAG(s, SBUF_FINISHED);
s->s_error = 0;
s->s_len = 0;
+ s->s_rec_off = 0;
s->s_sect_len = 0;
}
@@ -364,14 +367,18 @@ sbuf_drain(struct sbuf *s)
KASSERT(s->s_len > 0, ("Shouldn't drain empty sbuf %p", s));
KASSERT(s->s_error == 0, ("Called %s with error on %p", __func__, s));
- len = s->s_drain_func(s->s_drain_arg, s->s_buf, s->s_len);
- if (len < 0) {
- s->s_error = -len;
+ if (SBUF_DODRAINTOEOR(s) && s->s_rec_off == 0)
+ return (s->s_error = EDEADLK);
+ len = s->s_drain_func(s->s_drain_arg, s->s_buf,
+ SBUF_DODRAINTOEOR(s) ? s->s_rec_off : s->s_len);
+ if (len <= 0) {
+ s->s_error = len ? -len : EDEADLK;
return (s->s_error);
}
KASSERT(len > 0 && len <= s->s_len,
("Bad drain amount %d for sbuf %p", len, s));
s->s_len -= len;
+ s->s_rec_off -= len;
/*
* Fast path for the expected case where all the data was
* drained.
@@ -635,9 +642,9 @@ sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap)
break;
/* Cannot print with the current available space. */
if (s->s_drain_func != NULL && s->s_len > 0)
- error = sbuf_drain(s);
- else
- error = sbuf_extend(s, len - SBUF_FREESPACE(s));
+ error = sbuf_drain(s); /* sbuf_drain() sets s_error. */
+ else if (sbuf_extend(s, len - SBUF_FREESPACE(s)) != 0)
+ s->s_error = error = ENOMEM;
} while (error == 0);
/*
@@ -654,8 +661,6 @@ sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap)
s->s_len += len;
if (SBUF_ISSECTION(s))
s->s_sect_len += len;
- if (!SBUF_HASROOM(s) && !SBUF_CANEXTEND(s))
- s->s_error = ENOMEM;
KASSERT(s->s_len < s->s_size,
("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size));
@@ -837,6 +842,7 @@ sbuf_start_section(struct sbuf *s, ssize_t *old_lenp)
("s_sect_len != 0 when starting a section"));
if (old_lenp != NULL)
*old_lenp = -1;
+ s->s_rec_off = s->s_len;
SBUF_SETFLAG(s, SBUF_INSECTION);
} else {
KASSERT(old_lenp != NULL,
@@ -867,7 +873,7 @@ sbuf_end_section(struct sbuf *s, ssize_t old_len, size_t pad, int c)
}
len = s->s_sect_len;
if (old_len == -1) {
- s->s_sect_len = 0;
+ s->s_rec_off = s->s_sect_len = 0;
SBUF_CLEARFLAG(s, SBUF_INSECTION);
} else {
s->s_sect_len += old_len;
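
The new s_rec_off bookkeeping gives drains record-at-a-time semantics: with
SBUF_DRAINTOEOR set on an sbuf that uses sections, sbuf_drain() passes the
callback only bytes up to the last completed section boundary, never a torn
record. A sketch of how a caller would group output into records
(record_drain and write_log are illustrative names; write_log is a
hypothetical sink):

    static int
    record_drain(void *arg __unused, const char *data, int len)
    {

            /* Invoked only with whole records, never a partial one. */
            return (write_log(data, len));
    }

    static void
    emit_record(struct sbuf *s, int seq)
    {
            ssize_t len;

            sbuf_start_section(s, NULL);            /* start of record */
            sbuf_printf(s, "record %d\n", seq);
            len = sbuf_end_section(s, -1, 1, 0);    /* end record, no padding */
            if (len == -1) {
                    /* Drain failed; the error is latched in the sbuf. */
            }
    }
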
diff --git a/freebsd/sys/kern/sys_socket.c b/freebsd/sys/kern/sys_socket.c
index 9dd458f1..b3c9c56d 100644
--- a/freebsd/sys/kern/sys_socket.c
+++ b/freebsd/sys/kern/sys_socket.c
@@ -976,11 +976,7 @@ sowakeup_aio(struct socket *so, struct sockbuf *sb)
if (sb->sb_flags & SB_AIO_RUNNING)
return;
sb->sb_flags |= SB_AIO_RUNNING;
- if (sb == &so->so_snd)
- SOCK_LOCK(so);
soref(so);
- if (sb == &so->so_snd)
- SOCK_UNLOCK(so);
soaio_enqueue(&sb->sb_aiotask);
}
diff --git a/freebsd/sys/kern/uipc_sockbuf.c b/freebsd/sys/kern/uipc_sockbuf.c
index 4b710a2c..4ec7187c 100644
--- a/freebsd/sys/kern/uipc_sockbuf.c
+++ b/freebsd/sys/kern/uipc_sockbuf.c
@@ -336,7 +336,7 @@ sowakeup(struct socket *so, struct sockbuf *sb)
if (sb->sb_flags & SB_AIO)
sowakeup_aio(so, sb);
SOCKBUF_UNLOCK(sb);
- if (ret == SU_ISCONNECTED && !(so->so_state & SS_ISDISCONNECTED))
+ if (ret == SU_ISCONNECTED)
soisconnected(so);
if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
pgsigio(&so->so_sigio, SIGIO, 0);
diff --git a/freebsd/sys/kern/uipc_socket.c b/freebsd/sys/kern/uipc_socket.c
index 1773606d..5e4fda56 100644
--- a/freebsd/sys/kern/uipc_socket.c
+++ b/freebsd/sys/kern/uipc_socket.c
@@ -3744,24 +3744,41 @@ soisconnecting(struct socket *so)
void
soisconnected(struct socket *so)
{
- struct socket *head;
- int ret;
- /*
- * XXXGL: this is the only place where we acquire socket locks
- * in reverse order: first child, then listening socket. To
- * avoid possible LOR, use try semantics.
- */
-restart:
SOCK_LOCK(so);
- if ((head = so->so_listen) != NULL &&
- __predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
- SOCK_UNLOCK(so);
- goto restart;
- }
so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
so->so_state |= SS_ISCONNECTED;
- if (head != NULL && (so->so_qstate == SQ_INCOMP)) {
+
+ if (so->so_qstate == SQ_INCOMP) {
+ struct socket *head = so->so_listen;
+ int ret;
+
+ KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
+ /*
+ * When promoting a socket from the incomplete queue to the
+ * complete one, we must take the locks in reverse order. We
+ * first trylock, and if that fails we go the hard way: leave
+ * a reference, drop the socket lock, and recheck consistency
+ * after relocking in the proper order.
+ */
+ if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
+ soref(head);
+ SOCK_UNLOCK(so);
+ SOLISTEN_LOCK(head);
+ SOCK_LOCK(so);
+ if (__predict_false(head != so->so_listen)) {
+ /*
+ * The socket went off the listen queue,
+ * probably from losing a race with close(2) on sol.
+ * The socket is about to soabort().
+ */
+ SOCK_UNLOCK(so);
+ sorele(head);
+ return;
+ }
+ /* Not the last one, as so holds a ref. */
+ refcount_release(&head->so_count);
+ }
again:
if ((so->so_options & SO_ACCEPTFILTER) == 0) {
TAILQ_REMOVE(&head->sol_incomp, so, so_list);
@@ -3790,8 +3807,6 @@ again:
}
return;
}
- if (head != NULL)
- SOLISTEN_UNLOCK(head);
SOCK_UNLOCK(so);
wakeup(&so->so_timeo);
sorwakeup(so);
@@ -3825,13 +3840,14 @@ soisdisconnected(struct socket *so)
so->so_state |= SS_ISDISCONNECTED;
if (!SOLISTENING(so)) {
+ SOCK_UNLOCK(so);
SOCKBUF_LOCK(&so->so_rcv);
socantrcvmore_locked(so);
SOCKBUF_LOCK(&so->so_snd);
sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
socantsendmore_locked(so);
- }
- SOCK_UNLOCK(so);
+ } else
+ SOCK_UNLOCK(so);
wakeup(&so->so_timeo);
}
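
The soisconnected() rewrite above replaces the old restart loop with the
standard "trylock, else pin and relock in order" pattern for acquiring two
locks against their documented order. A generic userland sketch of the same
shape, with pthreads and a refcount standing in for the socket machinery
(all names here are illustrative):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stddef.h>

    struct obj {
            pthread_mutex_t lock;
            atomic_int refs;
            struct obj *parent;     /* lock order: parent before child */
    };

    /* Lock child then parent; return the locked parent, or NULL on a race. */
    static struct obj *
    lock_child_then_parent(struct obj *child)
    {
            struct obj *parent;

            pthread_mutex_lock(&child->lock);
            parent = child->parent;
            if (parent == NULL || pthread_mutex_trylock(&parent->lock) == 0)
                    return (parent);        /* fast path, no order violation */

            /* Hard way: pin parent, drop child, relock in proper order. */
            atomic_fetch_add(&parent->refs, 1);
            pthread_mutex_unlock(&child->lock);
            pthread_mutex_lock(&parent->lock);
            pthread_mutex_lock(&child->lock);
            if (child->parent != parent) {
                    /* Lost a race; the link changed while unlocked. */
                    pthread_mutex_unlock(&child->lock);
                    pthread_mutex_unlock(&parent->lock);
                    atomic_fetch_sub(&parent->refs, 1);     /* drop the pin */
                    return (NULL);          /* caller must bail out */
            }
            atomic_fetch_sub(&parent->refs, 1);     /* pin no longer needed */
            return (parent);
    }

As in soisconnected(), the recheck after relocking is what makes dropping the
child lock safe: if the parent link changed in the window, the operation is
abandoned rather than performed against the wrong object.
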
diff --git a/freebsd/sys/kern/uipc_usrreq.c b/freebsd/sys/kern/uipc_usrreq.c
index 7237956a..702ba1e4 100644
--- a/freebsd/sys/kern/uipc_usrreq.c
+++ b/freebsd/sys/kern/uipc_usrreq.c
@@ -698,9 +698,9 @@ uipc_close(struct socket *so)
{
struct unpcb *unp, *unp2;
#ifndef __rtems__
- struct vnode *vp;
+ struct vnode *vp = NULL;
#else /* __rtems__ */
- IMFS_generic_t *vp;
+ IMFS_generic_t *vp = NULL;
#endif /* __rtems__ */
unp = sotounpcb(so);
@@ -1181,7 +1181,11 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
release:
if (control != NULL)
m_freem(control);
- if (m != NULL)
+ /*
+ * In case of PRUS_NOTREADY, uipc_ready() is responsible
+ * for freeing memory.
+ */
+ if (m != NULL && (flags & PRUS_NOTREADY) == 0)
m_freem(m);
return (error);
}
@@ -1196,7 +1200,12 @@ uipc_ready(struct socket *so, struct mbuf *m, int count)
unp = sotounpcb(so);
UNP_LINK_RLOCK();
- unp2 = unp->unp_conn;
+ if ((unp2 = unp->unp_conn) == NULL) {
+ UNP_LINK_RUNLOCK();
+ for (int i = 0; i < count; i++)
+ m = m_free(m);
+ return (ECONNRESET);
+ }
UNP_PCB_LOCK(unp2);
so2 = unp2->unp_socket;
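
The new ECONNRESET path works because m_free(9) frees a single mbuf and
returns the next one in the chain, so freeing exactly 'count' mbufs is a
counted loop. The same idiom as a standalone helper (a sketch; free_first_n
is an illustrative name):

    /* Free the first count mbufs of a chain, as uipc_ready() does. */
    static void
    free_first_n(struct mbuf *m, int count)
    {
            for (int i = 0; i < count && m != NULL; i++)
                    m = m_free(m);  /* frees m, returns m->m_next */
    }
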