Update to FreeBSD head 2017-08-01

Git mirror commit f5002f5e5f78cae9f0269d812dc0aedb0339312c. Update #3472.
author: Sebastian Huber <sebastian.huber@embedded-brains.de> 2018-08-07 14:56:50 +0200
committer: Sebastian Huber <sebastian.huber@embedded-brains.de> 2018-09-21 10:29:37 +0200
commit: c37f9fba70085fedc8eede7559489d2321393005 (patch)
tree: 042455ebf1fa89a277a825f72e1ed805d0b4d296 /freebsd/sys/kern
parent: Update to FreeBSD head 2017-06-01 (diff)
download: rtems-libbsd-c37f9fba70085fedc8eede7559489d2321393005.tar.bz2
16 files changed, 1460 insertions, 1007 deletions
diff --git a/freebsd/sys/kern/init_main.c b/freebsd/sys/kern/init_main.c
index 467888b2..f211b363 100644
--- a/freebsd/sys/kern/init_main.c
+++ b/freebsd/sys/kern/init_main.c
@@ -384,8 +384,7 @@ SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2,
 #endif
 
 static int
-null_fetch_syscall_args(struct thread *td __unused,
-    struct syscall_args *sa __unused)
+null_fetch_syscall_args(struct thread *td __unused)
 {
 
 	panic("null_fetch_syscall_args");
diff --git a/freebsd/sys/kern/kern_event.c b/freebsd/sys/kern/kern_event.c
index 0a64adbe..2428182c 100644
--- a/freebsd/sys/kern/kern_event.c
+++ b/freebsd/sys/kern/kern_event.c
@@ -31,6 +31,7 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include <rtems/bsd/local/opt_compat.h>
 #include <rtems/bsd/local/opt_ktrace.h>
 #include <rtems/bsd/local/opt_kqueue.h>
 
@@ -119,6 +120,10 @@ static int	kqueue_scan(struct kqueue *kq, int maxevents,
 static void 	kqueue_wakeup(struct kqueue *kq);
 static struct filterops *kqueue_fo_find(int filt);
 static void	kqueue_fo_release(int filt);
+struct g_kevent_args;
+static int	kern_kevent_generic(struct thread *td,
+		    struct g_kevent_args *uap,
+		    struct kevent_copyops *k_ops);
 
 #ifndef __rtems__
 static fo_rdwr_t	kqueue_read;
@@ -640,12 +645,13 @@ knote_fork(struct knlist *list, int pid)
  * interval timer support code.
  */
 
-#define NOTE_TIMER_PRECMASK	(NOTE_SECONDS|NOTE_MSECONDS|NOTE_USECONDS| \
-				NOTE_NSECONDS)
+#define NOTE_TIMER_PRECMASK						\
+    (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
 
 static sbintime_t
 timer2sbintime(intptr_t data, int flags)
 {
+	int64_t secs;
 
         /*
          * Macros for converting to the fractional second portion of an
@@ -664,27 +670,27 @@ timer2sbintime(intptr_t data, int flags)
 	case NOTE_MSECONDS: /* FALLTHROUGH */
 	case 0:
 		if (data >= 1000) {
-			int64_t secs = data / 1000;
+			secs = data / 1000;
 #ifdef __LP64__
 			if (secs > (SBT_MAX / SBT_1S))
 				return (SBT_MAX);
 #endif
 			return (secs << 32 | MS_TO_SBT(data % 1000));
 		}
-		return MS_TO_SBT(data);
+		return (MS_TO_SBT(data));
 	case NOTE_USECONDS:
 		if (data >= 1000000) {
-			int64_t secs = data / 1000000;
+			secs = data / 1000000;
 #ifdef __LP64__
 			if (secs > (SBT_MAX / SBT_1S))
 				return (SBT_MAX);
 #endif
 			return (secs << 32 | US_TO_SBT(data % 1000000));
 		}
-		return US_TO_SBT(data);
+		return (US_TO_SBT(data));
 	case NOTE_NSECONDS:
 		if (data >= 1000000000) {
-			int64_t secs = data / 1000000000;
+			secs = data / 1000000000;
 #ifdef __LP64__
 			if (secs > (SBT_MAX / SBT_1S))
 				return (SBT_MAX);
@@ -701,7 +707,7 @@ timer2sbintime(intptr_t data, int flags)
 struct kq_timer_cb_data {
 	struct callout c;
 	sbintime_t next;	/* next timer event fires at */
-	sbintime_t to;		/* precalculated timer period */
+	sbintime_t to;		/* precalculated timer period, 0 for abs */
 };
 
 static void
@@ -716,8 +722,9 @@ filt_timerexpire(void *knx)
 
 	if ((kn->kn_flags & EV_ONESHOT) != 0)
 		return;
-
 	kc = kn->kn_ptr.p_v;
+	if (kc->to == 0)
+		return;
 	kc->next += kc->to;
 	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
 	    PCPU_GET(cpuid), C_ABSOLUTE);
@@ -730,7 +737,8 @@ static int
 filt_timerattach(struct knote *kn)
 {
 	struct kq_timer_cb_data *kc;
-	sbintime_t to;
+	struct bintime bt;
+	sbintime_t to, sbt;
 	unsigned int ncallouts;
 
 	if (kn->kn_sdata < 0)
@@ -738,10 +746,15 @@ filt_timerattach(struct knote *kn)
 	if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
 		kn->kn_sdata = 1;
 	/* Only precision unit are supported in flags so far */
-	if ((kn->kn_sfflags & ~NOTE_TIMER_PRECMASK) != 0)
+	if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0)
 		return (EINVAL);
 
 	to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
+	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
+		getboottimebin(&bt);
+		sbt = bttosbt(bt);
+		to -= sbt;
+	}
 	if (to < 0)
 		return (EINVAL);
 
@@ -751,12 +764,18 @@ filt_timerattach(struct knote *kn)
 			return (ENOMEM);
 	} while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1));
 
-	kn->kn_flags |= EV_CLEAR;		/* automatically set */
+	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
+		kn->kn_flags |= EV_CLEAR;	/* automatically set */
 	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
 	kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
 	callout_init(&kc->c, 1);
-	kc->next = to + sbinuptime();
-	kc->to = to;
+	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
+		kc->next = to;
+		kc->to = 0;
+	} else {
+		kc->next = to + sbinuptime();
+		kc->to = to;
+	}
 	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
 	    PCPU_GET(cpuid), C_ABSOLUTE);
 
@@ -970,25 +989,24 @@ kqueue(void)
 
 #ifdef KTRACE
 static size_t
-kev_iovlen(int n, u_int kgio)
+kev_iovlen(int n, u_int kgio, size_t kevent_size)
 {
 
-	if (n < 0 || n >= kgio / sizeof(struct kevent))
+	if (n < 0 || n >= kgio / kevent_size)
 		return (kgio);
-	return (n * sizeof(struct kevent));
+	return (n * kevent_size);
 }
 #endif
 
-#ifndef _SYS_SYSPROTO_H_
-struct kevent_args {
+struct g_kevent_args {
 	int	fd;
-	const struct kevent *changelist;
+	void	*changelist;
 	int	nchanges;
-	struct	kevent *eventlist;
+	void	*eventlist;
 	int	nevents;
 	const struct timespec *timeout;
 };
-#endif
+
 #ifdef __rtems__
 static int kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout);
@@ -1001,12 +1019,29 @@ static
 int
 sys_kevent(struct thread *td, struct kevent_args *uap)
 {
-	struct timespec ts, *tsp;
 	struct kevent_copyops k_ops = {
 		.arg = uap,
 		.k_copyout = kevent_copyout,
 		.k_copyin = kevent_copyin,
+		.kevent_size = sizeof(struct kevent),
 	};
+	struct g_kevent_args gk_args = {
+		.fd = uap->fd,
+		.changelist = uap->changelist,
+		.nchanges = uap->nchanges,
+		.eventlist = uap->eventlist,
+		.nevents = uap->nevents,
+		.timeout = uap->timeout,
+	};
+
+	return (kern_kevent_generic(td, &gk_args, &k_ops));
+}
+
+static int
+kern_kevent_generic(struct thread *td, struct g_kevent_args *uap,
+    struct kevent_copyops *k_ops)
+{
+	struct timespec ts, *tsp;
 	int error;
 #ifdef KTRACE
 	struct uio ktruio;
@@ -1028,26 +1063,30 @@ sys_kevent(struct thread *td, struct kevent_args *uap)
 	if (KTRPOINT(td, KTR_GENIO)) {
 		kgio = ktr_geniosize;
 		ktriov.iov_base = uap->changelist;
-		ktriov.iov_len = kev_iovlen(uap->nchanges, kgio);
+		ktriov.iov_len = kev_iovlen(uap->nchanges, kgio,
+		    k_ops->kevent_size);
 		ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
 		    .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
 		    .uio_td = td };
 		ktruioin = cloneuio(&ktruio);
 		ktriov.iov_base = uap->eventlist;
-		ktriov.iov_len = kev_iovlen(uap->nevents, kgio);
-		ktriov.iov_len = uap->nevents * sizeof(struct kevent);
+		ktriov.iov_len = kev_iovlen(uap->nevents, kgio,
+		    k_ops->kevent_size);
+		ktriov.iov_len = uap->nevents * k_ops->kevent_size;
 		ktruioout = cloneuio(&ktruio);
 	}
 #endif
 
 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
-	    &k_ops, tsp);
+	    k_ops, tsp);
 
 #ifdef KTRACE
 	if (ktruioin != NULL) {
-		ktruioin->uio_resid = kev_iovlen(uap->nchanges, kgio);
+		ktruioin->uio_resid = kev_iovlen(uap->nchanges, kgio,
+		    k_ops->kevent_size);
 		ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
-		ktruioout->uio_resid = kev_iovlen(td->td_retval[0], kgio);
+		ktruioout->uio_resid = kev_iovlen(td->td_retval[0], kgio,
+		    k_ops->kevent_size);
 		ktrgenio(uap->fd, UIO_READ, ktruioout, error);
 	}
 #endif
@@ -1123,6 +1162,123 @@ kevent_copyin(void *arg, struct kevent *kevp, int count)
 	return (error);
 }
 
+#ifdef COMPAT_FREEBSD11
+/*
+ * Event layout used by the COMPAT_FREEBSD11 kevent entry point.  It carries
+ * ident, filter, flags, fflags, data and udata but has no 'ext' member; the
+ * copy routines below convert between it and struct kevent.
+ */
+struct kevent_freebsd11 {
+	__uintptr_t	ident;		/* identifier for this event */
+	short		filter;		/* filter for event */
+	unsigned short	flags;
+	unsigned int	fflags;
+	__intptr_t	data;
+	void		*udata;		/* opaque user data identifier */
+};
+
+/*
+ * Copy 'count' events from kevp out to the list pointed to by
+ * uap->eventlist, converting each one to struct kevent_freebsd11.
+ * uap->eventlist is advanced past each event copied out.  Returns 0 on
+ * success, otherwise the error from the first failing copyout().
+ */
+static int
+kevent11_copyout(void *arg, struct kevent *kevp, int count)
+{
+	struct freebsd11_kevent_args *uap;
+	struct kevent_freebsd11 kev11;
+	int error, i;
+
+	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
+	uap = (struct freebsd11_kevent_args *)arg;
+
+	/* Keep the return value defined even if the loop body never runs. */
+	error = 0;
+	for (i = 0; i < count; i++) {
+		kev11.ident = kevp->ident;
+		kev11.filter = kevp->filter;
+		kev11.flags = kevp->flags;
+		kev11.fflags = kevp->fflags;
+		kev11.data = kevp->data;
+		kev11.udata = kevp->udata;
+		error = copyout(&kev11, uap->eventlist, sizeof(kev11));
+		if (error != 0)
+			break;
+		uap->eventlist++;
+		kevp++;
+	}
+	return (error);
+}
+
+/*
+ * Copy 'count' items from the list pointed to by uap->changelist,
+ * converting each struct kevent_freebsd11 to a struct kevent whose 'ext'
+ * member is zeroed.  uap->changelist is advanced past each item copied
+ * in.  Returns 0 on success, otherwise the error from the first failing
+ * copyin().
+ */
+static int
+kevent11_copyin(void *arg, struct kevent *kevp, int count)
+{
+	struct freebsd11_kevent_args *uap;
+	struct kevent_freebsd11 kev11;
+	int error, i;
+
+	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
+	uap = (struct freebsd11_kevent_args *)arg;
+
+	/* Keep the return value defined even if the loop body never runs. */
+	error = 0;
+	for (i = 0; i < count; i++) {
+		error = copyin(uap->changelist, &kev11, sizeof(kev11));
+		if (error != 0)
+			break;
+		kevp->ident = kev11.ident;
+		kevp->filter = kev11.filter;
+		kevp->flags = kev11.flags;
+		kevp->fflags = kev11.fflags;
+		/*
+		 * NOTE(review): the cast is to an unsigned type, so a
+		 * negative kev11.data is not sign-extended if kevp->data
+		 * is wider than a pointer -- confirm this is intended.
+		 */
+		kevp->data = (uintptr_t)kev11.data;
+		kevp->udata = kev11.udata;
+		bzero(&kevp->ext, sizeof(kevp->ext));
+		uap->changelist++;
+		kevp++;
+	}
+	return (error);
+}
+
+/*
+ * Entry point for callers passing struct freebsd11_kevent_args: forwards
+ * to kern_kevent_generic() with copy routines that convert between
+ * struct kevent_freebsd11 and struct kevent.
+ */
+int
+freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap)
+{
+	struct kevent_copyops k_ops = {
+		.arg = uap,
+		.k_copyout = kevent11_copyout,
+		.k_copyin = kevent11_copyin,
+		.kevent_size = sizeof(struct kevent_freebsd11),
+	};
+	struct g_kevent_args gk_args = {
+		.fd = uap->fd,
+		.changelist = uap->changelist,
+		.nchanges = uap->nchanges,
+		.eventlist = uap->eventlist,
+		.nevents = uap->nevents,
+		.timeout = uap->timeout,
+	};
+
+	return (kern_kevent_generic(td, &gk_args, &k_ops));
+}
+#endif
+
 int
 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
diff --git a/freebsd/sys/kern/kern_linker.c b/freebsd/sys/kern/kern_linker.c
index 214554d3..1c81a61c 100644
--- a/freebsd/sys/kern/kern_linker.c
+++ b/freebsd/sys/kern/kern_linker.c
@@ -1259,8 +1259,8 @@ kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat)
 
 	/* Version 1 fields: */
 	namelen = strlen(lf->filename) + 1;
-	if (namelen > MAXPATHLEN)
-		namelen = MAXPATHLEN;
+	if (namelen > sizeof(stat->name))
+		namelen = sizeof(stat->name);
 	bcopy(lf->filename, &stat->name[0], namelen);
 	stat->refs = lf->refs;
 	stat->id = lf->id;
@@ -1268,8 +1268,8 @@ kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat)
 	stat->size = lf->size;
 	/* Version 2 fields: */
 	namelen = strlen(lf->pathname) + 1;
-	if (namelen > MAXPATHLEN)
-		namelen = MAXPATHLEN;
+	if (namelen > sizeof(stat->pathname))
+		namelen = sizeof(stat->pathname);
 	bcopy(lf->pathname, &stat->pathname[0], namelen);
 	sx_xunlock(&kld_sx);
 
diff --git a/freebsd/sys/kern/kern_uuid.c b/freebsd/sys/kern/kern_uuid.c
index b6a8915f..1ac19685 100644
--- a/freebsd/sys/kern/kern_uuid.c
+++ b/freebsd/sys/kern/kern_uuid.c
@@ -60,7 +60,7 @@ CTASSERT(sizeof(struct uuid) == 16);
 /* We use an alternative, more convenient representation in the generator. */
 struct uuid_private {
 	union {
-		uint64_t	ll;		/* internal. */
+		uint64_t	ll;	/* internal, for uuid_last only */
 		struct {
 			uint32_t	low;
 			uint16_t	mid;
@@ -428,3 +428,10 @@ parse_uuid(const char *str, struct uuid *uuid)
 	    (c[3] & 0xc0) != 0x80 &&			/* variant 1? */
 	    (c[3] & 0xe0) != 0xc0) ? EINVAL : 0);	/* variant 2? */
 }
+
+int
+uuidcmp(const struct uuid *uuid1, const struct uuid *uuid2)
+{
+	/* Byte-wise comparison; the sign of the result is memcmp()'s. */
+	return (memcmp(uuid1, uuid2, sizeof(struct uuid)));
+}
diff --git a/freebsd/sys/kern/subr_blist.c b/freebsd/sys/kern/subr_blist.c
index 5af51dd4..c8e32c5b 100644
--- a/freebsd/sys/kern/subr_blist.c
+++ b/freebsd/sys/kern/subr_blist.c
@@ -30,18 +30,18 @@
  * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
  *
  *	This module implements a general bitmap allocator/deallocator.  The
- *	allocator eats around 2 bits per 'block'.  The module does not 
- *	try to interpret the meaning of a 'block' other than to return 
+ *	allocator eats around 2 bits per 'block'.  The module does not
+ *	try to interpret the meaning of a 'block' other than to return
  *	SWAPBLK_NONE on an allocation failure.
  *
  *	A radix tree is used to maintain the bitmap.  Two radix constants are
  *	involved:  One for the bitmaps contained in the leaf nodes (typically
- *	32), and one for the meta nodes (typically 16).  Both meta and leaf
+ *	64), and one for the meta nodes (typically 16).  Both meta and leaf
  *	nodes have a hint field.  This field gives us a hint as to the largest
  *	free contiguous range of blocks under the node.  It may contain a
- *	value that is too high, but will never contain a value that is too 
+ *	value that is too high, but will never contain a value that is too
  *	low.  When the radix tree is searched, allocation failures in subtrees
- *	update the hint. 
+ *	update the hint.
  *
  *	The radix tree also implements two collapsed states for meta nodes:
  *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
@@ -51,7 +51,7 @@
  *
  * 	The hinting greatly increases code efficiency for allocations while
  *	the general radix structure optimizes both allocations and frees.  The
- *	radix tree should be able to operate well no matter how much 
+ *	radix tree should be able to operate well no matter how much
  *	fragmentation there is and no matter how large a bitmap is used.
  *
  *	The blist code wires all necessary memory at creation time.  Neither
@@ -63,18 +63,18 @@
  *	linear array.  Each meta node is immediately followed (laid out
  *	sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
  *	is a recursive structure but one that can be easily scanned through
- *	a very simple 'skip' calculation.  In order to support large radixes, 
- *	portions of the tree may reside outside our memory allocation.  We 
- *	handle this with an early-termination optimization (when bighint is 
- *	set to -1) on the scan.  The memory allocation is only large enough 
+ *	a very simple 'skip' calculation.  In order to support large radixes,
+ *	portions of the tree may reside outside our memory allocation.  We
+ *	handle this with an early-termination optimization (when bighint is
+ *	set to -1) on the scan.  The memory allocation is only large enough
  *	to cover the number of blocks requested at creation time even if it
  *	must be encompassed in larger root-node radix.
  *
- *	NOTE: the allocator cannot currently allocate more than 
- *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too 
- *	large' if you try.  This is an area that could use improvement.  The 
- *	radix is large enough that this restriction does not effect the swap 
- *	system, though.  Currently only the allocation code is effected by
+ *	NOTE: the allocator cannot currently allocate more than
+ *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
+ *	large' if you try.  This is an area that could use improvement.  The
+ *	radix is large enough that this restriction does not affect the swap
+ *	system, though.  Currently only the allocation code is affected by
  *	this algorithmic unfeature.  The freeing code can handle arbitrary
  *	ranges.
  *
@@ -93,7 +93,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/blist.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
-#include <sys/mutex.h> 
+#include <sys/mutex.h>
 
 #else
 
@@ -101,19 +101,18 @@ __FBSDID("$FreeBSD$");
 #define BLIST_DEBUG
 #endif
 
-#define SWAPBLK_NONE ((daddr_t)-1)
-
 #include <sys/types.h>
+#include <sys/malloc.h>
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdarg.h>
+#include <stdbool.h>
 
+#define	bitcount64(x)	__bitcount64((uint64_t)(x))
 #define malloc(a,b,c)	calloc(a, 1)
 #define free(a,b)	free(a)
 
-typedef unsigned int u_daddr_t;
-
 #include <sys/blist.h>
 
 void panic(const char *ctl, ...);
@@ -123,23 +122,23 @@ void panic(const char *ctl, ...);
 /*
  * static support functions
  */
-
-static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
-static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk, 
-				daddr_t count, daddr_t radix, int skip);
+static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count,
+		    daddr_t cursor);
+static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count,
+		    daddr_t radix, daddr_t skip, daddr_t cursor);
 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
-static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, 
-					daddr_t radix, int skip, daddr_t blk);
-static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, 
-				daddr_t skip, blist_t dest, daddr_t count);
-static int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
-static int blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
-				daddr_t radix, int skip, daddr_t blk);
-static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, 
-						int skip, daddr_t count);
+static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
+		    daddr_t radix, daddr_t skip, daddr_t blk);
+static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
+		    daddr_t skip, blist_t dest, daddr_t count);
+static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
+static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
+		    daddr_t radix, daddr_t skip, daddr_t blk);
+static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip,
+		    daddr_t count);
 #ifndef _KERNEL
-static void	blst_radix_print(blmeta_t *scan, daddr_t blk, 
-					daddr_t radix, int skip, int tab);
+static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
+		    daddr_t skip, int tab);
 #endif
 
 #ifdef _KERNEL
@@ -153,35 +152,40 @@ static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
  *	blocks - must be greater than 0
  * 	flags  - malloc flags
  *
- *	The smallest blist consists of a single leaf node capable of 
+ *	The smallest blist consists of a single leaf node capable of
  *	managing BLIST_BMAP_RADIX blocks.
  */
-
-blist_t 
+blist_t
 blist_create(daddr_t blocks, int flags)
 {
 	blist_t bl;
-	int radix;
-	int skip = 0;
+	daddr_t nodes, radix, skip;
 
 	/*
 	 * Calculate radix and skip field used for scanning.
 	 */
 	radix = BLIST_BMAP_RADIX;
-
+	skip = 0;
 	while (radix < blocks) {
 		radix *= BLIST_META_RADIX;
 		skip = (skip + 1) * BLIST_META_RADIX;
 	}
+	nodes = 1 + blst_radix_init(NULL, radix, skip, blocks);
 
-	bl = malloc(sizeof(struct blist), M_SWAP, flags | M_ZERO);
+	bl = malloc(sizeof(struct blist), M_SWAP, flags);
+	if (bl == NULL)
+		return (NULL);
 
 	bl->bl_blocks = blocks;
 	bl->bl_radix = radix;
 	bl->bl_skip = skip;
-	bl->bl_rootblks = 1 +
-	    blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
-	bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, flags);
+	bl->bl_cursor = 0;
+	bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
+	if (bl->bl_root == NULL) {
+		free(bl, M_SWAP);
+		return (NULL);
+	}
+	blst_radix_init(bl->bl_root, radix, skip, blocks);
 
 #if defined(BLIST_DEBUG)
 	printf(
@@ -189,17 +193,16 @@ blist_create(daddr_t blocks, int flags)
 		", requiring %lldK of ram\n",
 		(long long)bl->bl_blocks,
 		(long long)bl->bl_blocks * 4 / 1024,
-		(long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
+		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
 	);
 	printf("BLIST raw radix tree contains %lld records\n",
-	    (long long)bl->bl_rootblks);
+	    (long long)nodes);
 #endif
-	blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
 
-	return(bl);
+	return (bl);
 }
 
-void 
+void
 blist_destroy(blist_t bl)
 {
 	free(bl->bl_root, M_SWAP);
@@ -207,25 +210,44 @@ blist_destroy(blist_t bl)
 }
 
 /*
- * blist_alloc() - reserve space in the block bitmap.  Return the base
+ * blist_alloc() -   reserve space in the block bitmap.  Return the base
  *		     of a contiguous region or SWAPBLK_NONE if space could
  *		     not be allocated.
  */
-
-daddr_t 
+daddr_t
 blist_alloc(blist_t bl, daddr_t count)
 {
-	daddr_t blk = SWAPBLK_NONE;
+	daddr_t blk;
 
-	if (bl) {
-		if (bl->bl_radix == BLIST_BMAP_RADIX)
-			blk = blst_leaf_alloc(bl->bl_root, 0, count);
-		else
-			blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip);
-		if (blk != SWAPBLK_NONE)
-			bl->bl_free -= count;
+	/*
+	 * This loop iterates at most twice.  An allocation failure in the
+	 * first iteration leads to a second iteration only if the cursor was
+	 * non-zero.  When the cursor is zero, an allocation failure will
+	 * reduce the hint, stopping further iterations.
+	 */
+	while (count <= bl->bl_root->bm_bighint) {
+		blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix,
+		    bl->bl_skip, bl->bl_cursor);
+		if (blk != SWAPBLK_NONE) {
+			bl->bl_cursor = blk + count;
+			return (blk);
+		} else if (bl->bl_cursor != 0)
+			bl->bl_cursor = 0;
 	}
-	return(blk);
+	return (SWAPBLK_NONE);
+}
+
+/*
+ * blist_avail() -	return the number of free blocks.
+ */
+daddr_t
+blist_avail(blist_t bl)
+{
+	/* A leaf root holds a bitmap instead of a count: tally its set bits. */
+	if (bl->bl_radix == BLIST_BMAP_RADIX)
+		return (bitcount64(bl->bl_root->u.bmu_bitmap));
+	else
+		return (bl->bl_root->u.bmu_avail);
+}
 
 /*
@@ -233,17 +255,11 @@ blist_alloc(blist_t bl, daddr_t count)
  *		     	of a contiguous region.  Panic if an inconsistancy is
  *			found.
  */
-
-void 
+void
 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
 {
-	if (bl) {
-		if (bl->bl_radix == BLIST_BMAP_RADIX)
-			blst_leaf_free(bl->bl_root, blkno, count);
-		else
-			blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
-		bl->bl_free += count;
-	}
+
+	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
 }
 
 /*
@@ -252,22 +268,12 @@ blist_free(blist_t bl, daddr_t blkno, daddr_t count)
  *			existing allocations.  Return the number of blocks
  *			actually filled that were free before the call.
  */
-
-int
+daddr_t
 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
 {
-	int filled;
 
-	if (bl) {
-		if (bl->bl_radix == BLIST_BMAP_RADIX)
-			filled = blst_leaf_fill(bl->bl_root, blkno, count);
-		else
-			filled = blst_meta_fill(bl->bl_root, blkno, count,
-			    bl->bl_radix, bl->bl_skip, 0);
-		bl->bl_free -= filled;
-		return filled;
-	} else
-		return 0;
+	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix,
+	    bl->bl_skip, 0));
 }
 
 /*
@@ -277,7 +283,6 @@ blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
  *			one.  When extending the tree you can specify whether
  *			the new blocks are to left allocated or freed.
  */
-
 void
 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
 {
@@ -303,7 +308,6 @@ blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
 /*
  * blist_print()    - dump radix tree
  */
-
 void
 blist_print(blist_t bl)
 {
@@ -318,7 +322,7 @@ blist_print(blist_t bl)
  *			  ALLOCATION SUPPORT FUNCTIONS			*
  ************************************************************************
  *
- *	These support functions do all the actual work.  They may seem 
+ *	These support functions do all the actual work.  They may seem
  *	rather longish, but that's because I've commented them up.  The
  *	actual code is straight forward.
  *
@@ -327,77 +331,91 @@ blist_print(blist_t bl)
 /*
  * blist_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
  *
- *	This is the core of the allocator and is optimized for the 1 block
- *	and the BLIST_BMAP_RADIX block allocation cases.  Other cases are
- *	somewhat slower.  The 1 block allocation case is log2 and extremely
- *	quick.
+ *	This is the core of the allocator and is optimized for the
+ *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
+ *	time is proportional to log2(count) + log2(BLIST_BMAP_RADIX).
  */
-
 static daddr_t
-blst_leaf_alloc(
-	blmeta_t *scan,
-	daddr_t blk,
-	int count
-) {
-	u_daddr_t orig = scan->u.bmu_bitmap;
-
-	if (orig == 0) {
+blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, daddr_t cursor)
+{
+	u_daddr_t mask;
+	int count1, hi, lo, mid, num_shifts, range1, range_ext;
+
+	if (count == BLIST_BMAP_RADIX) {
 		/*
-		 * Optimize bitmap all-allocated case.  Also, count = 1
-		 * case assumes at least 1 bit is free in the bitmap, so
-		 * we have to take care of this case here.
+		 * Optimize allocation of BLIST_BMAP_RADIX bits.  If this wasn't
+		 * a special case, then forming the final value of 'mask' below
+		 * would require special handling to avoid an invalid left shift
+		 * when count equals the number of bits in mask.
 		 */
+		if (~scan->u.bmu_bitmap != 0) {
+			scan->bm_bighint = BLIST_BMAP_RADIX - 1;
+			return (SWAPBLK_NONE);
+		}
+		if (cursor != blk)
+			return (SWAPBLK_NONE);
+		scan->u.bmu_bitmap = 0;
 		scan->bm_bighint = 0;
-		return(SWAPBLK_NONE);
+		return (blk);
 	}
-	if (count == 1) {
+	range1 = 0;
+	count1 = count - 1;
+	num_shifts = fls(count1);
+	mask = scan->u.bmu_bitmap;
+	while (mask != 0 && num_shifts > 0) {
 		/*
-		 * Optimized code to allocate one bit out of the bitmap
+		 * If bit i is set in mask, then bits in [i, i+range1] are set
+		 * in scan->u.bmu_bitmap.  The value of range1 is equal to
+		 * count1 >> num_shifts.  Grow range and reduce num_shifts to 0,
+		 * while preserving these invariants.  The updates to mask leave
+		 * fewer bits set, but each bit that remains set represents a
+		 * longer string of consecutive bits set in scan->u.bmu_bitmap.
 		 */
-		u_daddr_t mask;
-		int j = BLIST_BMAP_RADIX/2;
-		int r = 0;
-
-		mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2);
-
-		while (j) {
-			if ((orig & mask) == 0) {
-			    r += j;
-			    orig >>= j;
-			}
-			j >>= 1;
-			mask >>= j;
-		}
-		scan->u.bmu_bitmap &= ~(1 << r);
-		return(blk + r);
+		num_shifts--;
+		range_ext = range1 + ((count1 >> num_shifts) & 1);
+		mask &= mask >> range_ext;
+		range1 += range_ext;
 	}
-	if (count <= BLIST_BMAP_RADIX) {
+	if (mask == 0) {
 		/*
-		 * non-optimized code to allocate N bits out of the bitmap.
-		 * The more bits, the faster the code runs.  It will run
-		 * the slowest allocating 2 bits, but since there aren't any
-		 * memory ops in the core loop (or shouldn't be, anyway),
-		 * you probably won't notice the difference.
+		 * Update bighint.  There is no allocation bigger than range1
+		 * available in this leaf.
 		 */
-		int j;
-		int n = BLIST_BMAP_RADIX - count;
-		u_daddr_t mask;
+		scan->bm_bighint = range1;
+		return (SWAPBLK_NONE);
+	}
 
-		mask = (u_daddr_t)-1 >> n;
+	/*
+	 * Discard any candidates that appear before the cursor.
+	 */
+	lo = cursor - blk;
+	mask &= ~(u_daddr_t)0 << lo;
 
-		for (j = 0; j <= n; ++j) {
-			if ((orig & mask) == mask) {
-				scan->u.bmu_bitmap &= ~mask;
-				return(blk + j);
-			}
-			mask = (mask << 1);
-		}
+	if (mask == 0)
+		return (SWAPBLK_NONE);
+
+	/*
+	 * The least significant set bit in mask marks the start of the first
+	 * available range of sufficient size.  Clear all the bits but that one,
+	 * and then perform a binary search to find its position.
+	 */
+	mask &= -mask;
+	hi = BLIST_BMAP_RADIX - count1;
+	while (lo + 1 < hi) {
+		mid = (lo + hi) >> 1;
+		if ((mask >> mid) != 0)
+			lo = mid;
+		else
+			hi = mid;
 	}
+
 	/*
-	 * We couldn't allocate count in this subtree, update bighint.
+	 * Set in mask exactly the bits being allocated, and clear them from
+	 * the set of available bits.
 	 */
-	scan->bm_bighint = count - 1;
-	return(SWAPBLK_NONE);
+	mask = (mask << count) - mask;
+	scan->u.bmu_bitmap &= ~mask;
+	return (blk + lo);
 }
 
 /*
@@ -408,76 +426,75 @@ blst_leaf_alloc(
  *	calls that hit this node.  We have to check for our collapse cases
  *	and we have a few optimizations strewn in as well.
  */
-
 static daddr_t
-blst_meta_alloc(
-	blmeta_t *scan, 
-	daddr_t blk,
-	daddr_t count,
-	daddr_t radix, 
-	int skip
-) {
-	int i;
-	int next_skip = ((u_int)skip / BLIST_META_RADIX);
+blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix,
+    daddr_t skip, daddr_t cursor)
+{
+	daddr_t i, next_skip, r;
+	int child;
+	bool scan_from_start;
 
-	if (scan->u.bmu_avail == 0)  {
+	if (radix == BLIST_BMAP_RADIX)
+		return (blst_leaf_alloc(scan, blk, count, cursor));
+	if (scan->u.bmu_avail < count) {
 		/*
-		 * ALL-ALLOCATED special case
+		 * The meta node's hint must be too large if the allocation
+		 * exceeds the number of free blocks.  Reduce the hint, and
+		 * return failure.
 		 */
-		scan->bm_bighint = count;
-		return(SWAPBLK_NONE);
+		scan->bm_bighint = scan->u.bmu_avail;
+		return (SWAPBLK_NONE);
 	}
+	next_skip = skip / BLIST_META_RADIX;
 
+	/*
+	 * An ALL-FREE meta node requires special handling before allocating
+	 * any of its blocks.
+	 */
 	if (scan->u.bmu_avail == radix) {
 		radix /= BLIST_META_RADIX;
 
 		/*
-		 * ALL-FREE special case, initialize uninitialize
-		 * sublevel.
+		 * Reinitialize each of the meta node's children.  An ALL-FREE
+		 * meta node cannot have a terminator in any subtree.
 		 */
 		for (i = 1; i <= skip; i += next_skip) {
-			if (scan[i].bm_bighint == (daddr_t)-1)
-				break;
-			if (next_skip == 1) {
+			if (next_skip == 1)
 				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
-				scan[i].bm_bighint = BLIST_BMAP_RADIX;
-			} else {
-				scan[i].bm_bighint = radix;
+			else
 				scan[i].u.bmu_avail = radix;
-			}
+			scan[i].bm_bighint = radix;
 		}
 	} else {
 		radix /= BLIST_META_RADIX;
 	}
 
-	for (i = 1; i <= skip; i += next_skip) {
+	if (count > radix) {
+		/*
+		 * The allocation exceeds the number of blocks that are
+		 * managed by a subtree of this meta node.
+		 */
+		panic("allocation too large");
+	}
+	scan_from_start = cursor == blk;
+	child = (cursor - blk) / radix;
+	blk += child * radix;
+	for (i = 1 + child * next_skip; i <= skip; i += next_skip) {
 		if (count <= scan[i].bm_bighint) {
 			/*
-			 * count fits in object
+			 * The allocation might fit in the i'th subtree.
 			 */
-			daddr_t r;
-			if (next_skip == 1) {
-				r = blst_leaf_alloc(&scan[i], blk, count);
-			} else {
-				r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1);
-			}
+			r = blst_meta_alloc(&scan[i], blk, count, radix,
+			    next_skip - 1, cursor > blk ? cursor : blk);
 			if (r != SWAPBLK_NONE) {
 				scan->u.bmu_avail -= count;
-				if (scan->bm_bighint > scan->u.bmu_avail)
-					scan->bm_bighint = scan->u.bmu_avail;
-				return(r);
+				return (r);
 			}
 		} else if (scan[i].bm_bighint == (daddr_t)-1) {
 			/*
 			 * Terminator
 			 */
 			break;
-		} else if (count > radix) {
-			/*
-			 * count does not fit in object even if it were
-			 * complete free.
-			 */
-			panic("blist_meta_alloc: allocation too large");
 		}
 		blk += radix;
 	}
@@ -485,22 +502,19 @@ blst_meta_alloc(
 	/*
 	 * We couldn't allocate count in this subtree, update bighint.
 	 */
-	if (scan->bm_bighint >= count)
+	if (scan_from_start && scan->bm_bighint >= count)
 		scan->bm_bighint = count - 1;
-	return(SWAPBLK_NONE);
+
+	return (SWAPBLK_NONE);
 }
 
 /*
  * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
  *
  */
-
 static void
-blst_leaf_free(
-	blmeta_t *scan,
-	daddr_t blk,
-	int count
-) {
+blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
+{
 	/*
 	 * free some data in this bitmap
 	 *
@@ -521,7 +535,7 @@ blst_leaf_free(
 
 	/*
 	 * We could probably do a better job here.  We are required to make
-	 * bighint at least as large as the biggest contiguous block of 
+	 * bighint at least as large as the biggest contiguous block of
 	 * data.  If we just shoehorn it, a little extra overhead will
 	 * be incured on the next allocation (but only that one typically).
 	 */
@@ -538,25 +552,18 @@ blst_leaf_free(
  *	range whereas the allocation code cannot allocate an arbitrary
  *	range).
  */
+static void
+blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, daddr_t radix,
+    daddr_t skip, daddr_t blk)
+{
+	daddr_t i, next_skip, v;
+	int child;
 
-static void 
-blst_meta_free(
-	blmeta_t *scan, 
-	daddr_t freeBlk,
-	daddr_t count,
-	daddr_t radix, 
-	int skip,
-	daddr_t blk
-) {
-	int i;
-	int next_skip = ((u_int)skip / BLIST_META_RADIX);
-
-#if 0
-	printf("free (%llx,%lld) FROM (%llx,%lld)\n",
-	    (long long)freeBlk, (long long)count,
-	    (long long)blk, (long long)radix
-	);
-#endif
+	if (scan->bm_bighint == (daddr_t)-1)
+		panic("freeing invalid range");
+	if (radix == BLIST_BMAP_RADIX)
+		return (blst_leaf_free(scan, freeBlk, count));
+	next_skip = skip / BLIST_META_RADIX;
 
 	if (scan->u.bmu_avail == 0) {
 		/*
@@ -601,27 +608,16 @@ blst_meta_free(
 
 	radix /= BLIST_META_RADIX;
 
-	i = (freeBlk - blk) / radix;
-	blk += i * radix;
-	i = i * next_skip + 1;
-
+	child = (freeBlk - blk) / radix;
+	blk += child * radix;
+	i = 1 + child * next_skip;
 	while (i <= skip && blk < freeBlk + count) {
-		daddr_t v;
-
 		v = blk + radix - freeBlk;
 		if (v > count)
 			v = count;
-
-		if (scan->bm_bighint == (daddr_t)-1)
-			panic("blst_meta_free: freeing unexpected range");
-
-		if (next_skip == 1) {
-			blst_leaf_free(&scan[i], freeBlk, v);
-		} else {
-			blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
-		}
+		blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
 		if (scan->bm_bighint < scan[i].bm_bighint)
-		    scan->bm_bighint = scan[i].bm_bighint;
+			scan->bm_bighint = scan[i].bm_bighint;
 		count -= v;
 		freeBlk += v;
 		blk += radix;
@@ -635,17 +631,11 @@ blst_meta_free(
  *	Locates free space in the source tree and frees it in the destination
  *	tree.  The space may not already be free in the destination.
  */
-
-static void blst_copy(
-	blmeta_t *scan, 
-	daddr_t blk,
-	daddr_t radix, 
-	daddr_t skip, 
-	blist_t dest,
-	daddr_t count
-) {
-	int next_skip;
-	int i;
+static void
+blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
+    blist_t dest, daddr_t count)
+{
+	daddr_t i, next_skip;
 
 	/*
 	 * Leaf node
@@ -660,7 +650,7 @@ static void blst_copy(
 			int i;
 
 			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
-				if (v & (1 << i))
+				if (v & ((u_daddr_t)1 << i))
 					blist_free(dest, blk + i, 1);
 			}
 		}
@@ -676,7 +666,7 @@ static void blst_copy(
 		 * Source all allocated, leave dest allocated
 		 */
 		return;
-	} 
+	}
 	if (scan->u.bmu_avail == radix) {
 		/*
 		 * Source all free, free entire dest
@@ -690,32 +680,20 @@ static void blst_copy(
 
 
 	radix /= BLIST_META_RADIX;
-	next_skip = ((u_int)skip / BLIST_META_RADIX);
+	next_skip = skip / BLIST_META_RADIX;
 
 	for (i = 1; count && i <= skip; i += next_skip) {
 		if (scan[i].bm_bighint == (daddr_t)-1)
 			break;
 
 		if (count >= radix) {
-			blst_copy(
-			    &scan[i],
-			    blk,
-			    radix,
-			    next_skip - 1,
-			    dest,
-			    radix
-			);
+			blst_copy(&scan[i], blk, radix, next_skip - 1, dest,
+			    radix);
 			count -= radix;
 		} else {
 			if (count) {
-				blst_copy(
-				    &scan[i],
-				    blk,
-				    radix,
-				    next_skip - 1,
-				    dest,
-				    count
-				);
+				blst_copy(&scan[i], blk, radix, next_skip - 1,
+				    dest, count);
 			}
 			count = 0;
 		}
@@ -730,24 +708,21 @@ static void blst_copy(
  *	regardless of any existing allocations in that range.  Returns
  *	the number of blocks allocated by the call.
  */
-
-static int
+static daddr_t
 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
 {
 	int n = blk & (BLIST_BMAP_RADIX - 1);
-	int nblks;
-	u_daddr_t mask, bitmap;
+	daddr_t nblks;
+	u_daddr_t mask;
 
 	mask = ((u_daddr_t)-1 << n) &
 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
 
-	/* Count the number of blocks we're about to allocate */
-	bitmap = scan->u.bmu_bitmap & mask;
-	for (nblks = 0; bitmap != 0; nblks++)
-		bitmap &= bitmap - 1;
+	/* Count the number of blocks that we are allocating. */
+	nblks = bitcount64(scan->u.bmu_bitmap & mask);
 
 	scan->u.bmu_bitmap &= ~mask;
-	return nblks;
+	return (nblks);
 }
 
 /*
@@ -758,80 +733,74 @@ blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
  *	range must be within the extent of this node.  Returns the
  *	number of blocks allocated by the call.
  */
-static int
-blst_meta_fill(
-	blmeta_t *scan,
-	daddr_t allocBlk,
-	daddr_t count,
-	daddr_t radix, 
-	int skip,
-	daddr_t blk
-) {
-	int i;
-	int next_skip = ((u_int)skip / BLIST_META_RADIX);
-	int nblks = 0;
+static daddr_t
+blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, daddr_t radix,
+    daddr_t skip, daddr_t blk)
+{
+	daddr_t i, nblks, next_skip, v;
+	int child;
 
+	if (scan->bm_bighint == (daddr_t)-1)
+		panic("filling invalid range");
+	if (count > radix) {
+		/*
+		 * The allocation exceeds the number of blocks that are
+		 * managed by this node.
+		 */
+		panic("fill too large");
+	}
+	if (radix == BLIST_BMAP_RADIX)
+		return (blst_leaf_fill(scan, allocBlk, count));
 	if (count == radix || scan->u.bmu_avail == 0)  {
 		/*
 		 * ALL-ALLOCATED special case
 		 */
 		nblks = scan->u.bmu_avail;
 		scan->u.bmu_avail = 0;
-		scan->bm_bighint = count;
-		return nblks;
+		scan->bm_bighint = 0;
+		return (nblks);
 	}
+	next_skip = skip / BLIST_META_RADIX;
 
+	/*
+	 * An ALL-FREE meta node requires special handling before allocating
+	 * any of its blocks.
+	 */
 	if (scan->u.bmu_avail == radix) {
 		radix /= BLIST_META_RADIX;
 
 		/*
-		 * ALL-FREE special case, initialize sublevel
+		 * Reinitialize each of the meta node's children.  An ALL-FREE
+		 * meta node cannot have a terminator in any subtree.
 		 */
 		for (i = 1; i <= skip; i += next_skip) {
-			if (scan[i].bm_bighint == (daddr_t)-1)
-				break;
-			if (next_skip == 1) {
+			if (next_skip == 1)
 				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
-				scan[i].bm_bighint = BLIST_BMAP_RADIX;
-			} else {
-				scan[i].bm_bighint = radix;
+			else
 				scan[i].u.bmu_avail = radix;
-			}
+			scan[i].bm_bighint = radix;
 		}
 	} else {
 		radix /= BLIST_META_RADIX;
 	}
 
-	if (count > radix)
-		panic("blist_meta_fill: allocation too large");
-
-	i = (allocBlk - blk) / radix;
-	blk += i * radix;
-	i = i * next_skip + 1;
-
+	nblks = 0;
+	child = (allocBlk - blk) / radix;
+	blk += child * radix;
+	i = 1 + child * next_skip;
 	while (i <= skip && blk < allocBlk + count) {
-		daddr_t v;
-
 		v = blk + radix - allocBlk;
 		if (v > count)
 			v = count;
-
-		if (scan->bm_bighint == (daddr_t)-1)
-			panic("blst_meta_fill: filling unexpected range");
-
-		if (next_skip == 1) {
-			nblks += blst_leaf_fill(&scan[i], allocBlk, v);
-		} else {
-			nblks += blst_meta_fill(&scan[i], allocBlk, v,
-			    radix, next_skip - 1, blk);
-		}
+		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix,
+		    next_skip - 1, blk);
 		count -= v;
 		allocBlk += v;
 		blk += radix;
 		i += next_skip;
 	}
 	scan->u.bmu_avail -= nblks;
-	return nblks;
+	return (nblks);
 }
 
 /*
@@ -842,13 +811,12 @@ blst_meta_fill(
  *	be considerably less than the calculated radix due to the large
  *	RADIX values we use.
  */
-
-static daddr_t	
-blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
+static daddr_t
+blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip, daddr_t count)
 {
-	int i;
-	int next_skip;
-	daddr_t memindex = 0;
+	daddr_t i, memindex, next_skip;
+
+	memindex = 0;
 
 	/*
 	 * Leaf node
@@ -859,7 +827,7 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
 			scan->bm_bighint = 0;
 			scan->u.bmu_bitmap = 0;
 		}
-		return(memindex);
+		return (memindex);
 	}
 
 	/*
@@ -874,30 +842,24 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
 	}
 
 	radix /= BLIST_META_RADIX;
-	next_skip = ((u_int)skip / BLIST_META_RADIX);
+	next_skip = skip / BLIST_META_RADIX;
 
 	for (i = 1; i <= skip; i += next_skip) {
 		if (count >= radix) {
 			/*
 			 * Allocate the entire object
 			 */
-			memindex = i + blst_radix_init(
-			    ((scan) ? &scan[i] : NULL),
-			    radix,
-			    next_skip - 1,
-			    radix
-			);
+			memindex = i +
+			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
+			    next_skip - 1, radix);
 			count -= radix;
 		} else if (count > 0) {
 			/*
 			 * Allocate a partial object
 			 */
-			memindex = i + blst_radix_init(
-			    ((scan) ? &scan[i] : NULL),
-			    radix,
-			    next_skip - 1,
-			    count
-			);
+			memindex = i +
+			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
+			    next_skip - 1, count);
 			count = 0;
 		} else {
 			/*
@@ -910,21 +872,20 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
 	}
 	if (memindex < i)
 		memindex = i;
-	return(memindex);
+	return (memindex);
 }
 
 #ifdef BLIST_DEBUG
 
-static void	
-blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
+static void
+blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
+    int tab)
 {
-	int i;
-	int next_skip;
-	int lastState = 0;
+	daddr_t i, next_skip;
 
 	if (radix == BLIST_BMAP_RADIX) {
 		printf(
-		    "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n", 
+		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
 		    tab, tab, "",
 		    (long long)blk, (long long)radix,
 		    (long long)scan->u.bmu_bitmap,
@@ -962,7 +923,7 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
 	);
 
 	radix /= BLIST_META_RADIX;
-	next_skip = ((u_int)skip / BLIST_META_RADIX);
+	next_skip = skip / BLIST_META_RADIX;
 	tab += 4;
 
 	for (i = 1; i <= skip; i += next_skip) {
@@ -972,16 +933,9 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
 			    tab, tab, "",
 			    (long long)blk, (long long)radix
 			);
-			lastState = 0;
 			break;
 		}
-		blst_radix_print(
-		    &scan[i],
-		    blk,
-		    radix,
-		    next_skip - 1,
-		    tab
-		);
+		blst_radix_print(&scan[i], blk, radix, next_skip - 1, tab);
 		blk += radix;
 	}
 	tab -= 4;
@@ -1018,11 +972,10 @@ main(int ac, char **av)
 
 	for (;;) {
 		char buf[1024];
-		daddr_t da = 0;
-		daddr_t count = 0;
-
+		long long da = 0;
+		long long count = 0;
 
-		printf("%lld/%lld/%lld> ", (long long)bl->bl_free,
+		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
 		    (long long)size, (long long)bl->bl_radix);
 		fflush(stdout);
 		if (fgets(buf, sizeof(buf), stdin) == NULL)
@@ -1030,7 +983,7 @@ main(int ac, char **av)
 		switch(buf[0]) {
 		case 'r':
 			if (sscanf(buf + 1, "%lld", &count) == 1) {
-				blist_resize(&bl, count, 1);
+				blist_resize(&bl, count, 1, M_WAITOK);
 			} else {
 				printf("?\n");
 			}
@@ -1046,18 +999,16 @@ main(int ac, char **av)
 			}
 			break;
 		case 'f':
-			if (sscanf(buf + 1, "%llx %lld",
-			    (long long *)&da, (long long *)&count) == 2) {
+			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
 				blist_free(bl, da, count);
 			} else {
 				printf("?\n");
 			}
 			break;
 		case 'l':
-			if (sscanf(buf + 1, "%llx %lld",
-			    (long long *)&da, (long long *)&count) == 2) {
-				printf("    n=%d\n",
-				    blist_fill(bl, da, count));
+			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
+				printf("    n=%jd\n",
+				    (intmax_t)blist_fill(bl, da, count));
 			} else {
 				printf("?\n");
 			}
@@ -1094,4 +1045,3 @@ panic(const char *ctl, ...)
 }
 
 #endif
-
diff --git a/freebsd/sys/kern/subr_prf.c b/freebsd/sys/kern/subr_prf.c
index 39f5826d..0380cfec 100644
--- a/freebsd/sys/kern/subr_prf.c
+++ b/freebsd/sys/kern/subr_prf.c
@@ -411,7 +411,6 @@ log_console(struct uio *uio)
 	msgbuftrigger = 1;
 	free(uio, M_IOV);
 	free(consbuffer, M_TEMP);
-	return;
 }
 #endif /* __rtems__ */
 
@@ -678,7 +677,7 @@ kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_lis
 	uintmax_t num;
 	int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
 	int cflag, hflag, jflag, tflag, zflag;
-	int dwidth, upper;
+	int bconv, dwidth, upper;
 	char padc;
 	int stop = 0, retval = 0;
 
@@ -704,7 +703,7 @@ kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_lis
 		}
 		percent = fmt - 1;
 		qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0;
-		sign = 0; dot = 0; dwidth = 0; upper = 0;
+		sign = 0; dot = 0; bconv = 0; dwidth = 0; upper = 0;
 		cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0;
 reswitch:	switch (ch = (u_char)*fmt++) {
 		case '.':
@@ -752,28 +751,9 @@ reswitch:	switch (ch = (u_char)*fmt++) {
 				width = n;
 			goto reswitch;
 		case 'b':
-			num = (u_int)va_arg(ap, int);
-			p = va_arg(ap, char *);
-			for (q = ksprintn(nbuf, num, *p++, NULL, 0); *q;)
-				PCHAR(*q--);
-
-			if (num == 0)
-				break;
-
-			for (tmp = 0; *p;) {
-				n = *p++;
-				if (num & (1 << (n - 1))) {
-					PCHAR(tmp ? ',' : '<');
-					for (; (n = *p) > ' '; ++p)
-						PCHAR(n);
-					tmp = 1;
-				} else
-					for (; *p > ' '; ++p)
-						continue;
-			}
-			if (tmp)
-				PCHAR('>');
-			break;
+			ladjust = 1;
+			bconv = 1;
+			goto handle_nosign;
 		case 'c':
 			width -= 1;
 
@@ -919,6 +899,10 @@ handle_nosign:
 				num = (u_char)va_arg(ap, int);
 			else
 				num = va_arg(ap, u_int);
+			if (bconv) {
+				q = va_arg(ap, char *);
+				base = *q++;
+			}
 			goto number;
 handle_sign:
 			if (jflag)
@@ -976,6 +960,31 @@ number:
 			while (*p)
 				PCHAR(*p--);
 
+			if (bconv && num != 0) {
+				/*
+				 * %b conversion flag format.  num is a
+				 * uintmax_t, so build each bit mask at that
+				 * width; an int-typed shift cannot express
+				 * bit 32 and above.
+				 */
+				tmp = retval;
+				while (*q) {
+					n = *q++;
+					if (num & ((uintmax_t)1 << (n - 1))) {
+						PCHAR(retval != tmp ?
+						    ',' : '<');
+						for (; (n = *q) > ' '; ++q)
+							PCHAR(n);
+					} else
+						for (; *q > ' '; ++q)
+							continue;
+				}
+				if (retval != tmp) {
+					PCHAR('>');
+					width -= retval - tmp;
+				}
+			}
+
 			if (ladjust)
 				while (width-- > 0)
 					PCHAR(' ');
diff --git a/freebsd/sys/kern/subr_sbuf.c b/freebsd/sys/kern/subr_sbuf.c
index 680613b1..8dd11b07 100644
--- a/freebsd/sys/kern/subr_sbuf.c
+++ b/freebsd/sys/kern/subr_sbuf.c
@@ -106,7 +106,7 @@ _assert_sbuf_integrity(const char *fun, struct sbuf *s)
 	    ("%s called with a NULL sbuf pointer", fun));
 	KASSERT(s->s_buf != NULL,
 	    ("%s called with uninitialized or corrupt sbuf", fun));
-        if (SBUF_ISFINISHED(s) && SBUF_NULINCLUDED(s)) {
+	if (SBUF_ISFINISHED(s) && SBUF_NULINCLUDED(s)) {
 		KASSERT(s->s_len <= s->s_size,
 		    ("wrote past end of sbuf (%jd >= %jd)",
 		    (intmax_t)s->s_len, (intmax_t)s->s_size));
diff --git a/freebsd/sys/kern/subr_taskqueue.c b/freebsd/sys/kern/subr_taskqueue.c
index 6f1ba19a..74b9cf59 100644
--- a/freebsd/sys/kern/subr_taskqueue.c
+++ b/freebsd/sys/kern/subr_taskqueue.c
@@ -316,8 +316,8 @@ taskqueue_timeout_func(void *arg)
 }
 
 int
-taskqueue_enqueue_timeout(struct taskqueue *queue,
-    struct timeout_task *timeout_task, int ticks)
+taskqueue_enqueue_timeout_sbt(struct taskqueue *queue,
+    struct timeout_task *timeout_task, sbintime_t sbt, sbintime_t pr, int flags)
 {
 	int res;
 
@@ -333,7 +333,7 @@ taskqueue_enqueue_timeout(struct taskqueue *queue,
 		/* Do nothing */
 		TQ_UNLOCK(queue);
 		res = -1;
-	} else if (ticks == 0) {
+	} else if (sbt == 0) {
 		taskqueue_enqueue_locked(queue, &timeout_task->t);
 		/* The lock is released inside. */
 	} else {
@@ -342,18 +342,27 @@ taskqueue_enqueue_timeout(struct taskqueue *queue,
 		} else {
 			queue->tq_callouts++;
 			timeout_task->f |= DT_CALLOUT_ARMED;
-			if (ticks < 0)
-				ticks = -ticks; /* Ignore overflow. */
+			if (sbt < 0)
+				sbt = -sbt; /* Ignore overflow. */
 		}
-		if (ticks > 0) {
-			callout_reset(&timeout_task->c, ticks,
-			    taskqueue_timeout_func, timeout_task);
+		if (sbt > 0) {
+			callout_reset_sbt(&timeout_task->c, sbt, pr,
+			    taskqueue_timeout_func, timeout_task, flags);
 		}
 		TQ_UNLOCK(queue);
 	}
 	return (res);
 }
 
+int
+taskqueue_enqueue_timeout(struct taskqueue *queue,
+    struct timeout_task *ttask, int ticks)
+{
+
+	return (taskqueue_enqueue_timeout_sbt(queue, ttask, ticks * tick_sbt,
+	    0, 0));
+}
+
 static void
 taskqueue_task_nop_fn(void *context, int pending)
 {
diff --git a/freebsd/sys/kern/subr_uio.c b/freebsd/sys/kern/subr_uio.c
index 5740e667..904ef1f4 100644
--- a/freebsd/sys/kern/subr_uio.c
+++ b/freebsd/sys/kern/subr_uio.c
@@ -212,41 +212,37 @@ uiomove_nofault(void *cp, int n, struct uio *uio)
 static int
 uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
 {
-#ifndef __rtems__
-	struct thread *td;
-#endif /* __rtems__ */
 	struct iovec *iov;
 	size_t cnt;
-	int error, newflags, save;
-
 #ifndef __rtems__
-	td = curthread;
+	int error, newflags, save;
+#else /* __rtems__ */
+	int error;
 #endif /* __rtems__ */
+
 	error = 0;
 
+#ifndef __rtems__
 	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
 	    ("uiomove: mode"));
-#ifndef __rtems__
-	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td,
+	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
 	    ("uiomove proc"));
-#endif /* __rtems__ */
-	if (!nofault)
-		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
-		    "Calling uiomove()");
 
-#ifndef __rtems__
-	/* XXX does it make a sense to set TDP_DEADLKTREAT for UIO_SYSSPACE ? */
-	newflags = TDP_DEADLKTREAT;
-	if (uio->uio_segflg == UIO_USERSPACE && nofault) {
-		/*
-		 * Fail if a non-spurious page fault occurs.
-		 */
-		newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
+	if (uio->uio_segflg == UIO_USERSPACE) {
+		newflags = TDP_DEADLKTREAT;
+		if (nofault) {
+			/*
+			 * Fail if a non-spurious page fault occurs.
+			 */
+			newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
+		} else {
+			WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+			    "Calling uiomove()");
+		}
+		save = curthread_pflags_set(newflags);
+	} else {
+		KASSERT(nofault == 0, ("uiomove: nofault"));
 	}
-	save = curthread_pflags_set(newflags);
-#else /* __rtems__ */
-	(void) newflags;
-	(void) save;
 #endif /* __rtems__ */
 
 	while (n > 0 && uio->uio_resid) {
@@ -292,7 +288,8 @@ uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
 	}
 out:
 #ifndef __rtems__
-	curthread_pflags_restore(save);
+	if (uio->uio_segflg == UIO_USERSPACE) 
+		curthread_pflags_restore(save);
 #endif /* __rtems__ */
 	return (error);
 }
diff --git a/freebsd/sys/kern/sys_socket.c b/freebsd/sys/kern/sys_socket.c
index 8d87c51b..9dd458f1 100644
--- a/freebsd/sys/kern/sys_socket.c
+++ b/freebsd/sys/kern/sys_socket.c
@@ -318,32 +318,36 @@ soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
 		break;
 
 	case FIOASYNC:
-		/*
-		 * XXXRW: This code separately acquires SOCK_LOCK(so) and
-		 * SOCKBUF_LOCK(&so->so_rcv) even though they are the same
-		 * mutex to avoid introducing the assumption that they are
-		 * the same.
-		 */
 		if (*(int *)data) {
 			SOCK_LOCK(so);
 			so->so_state |= SS_ASYNC;
+			if (SOLISTENING(so)) {
+				so->sol_sbrcv_flags |= SB_ASYNC;
+				so->sol_sbsnd_flags |= SB_ASYNC;
+			} else {
+				SOCKBUF_LOCK(&so->so_rcv);
+				so->so_rcv.sb_flags |= SB_ASYNC;
+				SOCKBUF_UNLOCK(&so->so_rcv);
+				SOCKBUF_LOCK(&so->so_snd);
+				so->so_snd.sb_flags |= SB_ASYNC;
+				SOCKBUF_UNLOCK(&so->so_snd);
+			}
 			SOCK_UNLOCK(so);
-			SOCKBUF_LOCK(&so->so_rcv);
-			so->so_rcv.sb_flags |= SB_ASYNC;
-			SOCKBUF_UNLOCK(&so->so_rcv);
-			SOCKBUF_LOCK(&so->so_snd);
-			so->so_snd.sb_flags |= SB_ASYNC;
-			SOCKBUF_UNLOCK(&so->so_snd);
 		} else {
 			SOCK_LOCK(so);
 			so->so_state &= ~SS_ASYNC;
+			if (SOLISTENING(so)) {
+				so->sol_sbrcv_flags &= ~SB_ASYNC;
+				so->sol_sbsnd_flags &= ~SB_ASYNC;
+			} else {
+				SOCKBUF_LOCK(&so->so_rcv);
+				so->so_rcv.sb_flags &= ~SB_ASYNC;
+				SOCKBUF_UNLOCK(&so->so_rcv);
+				SOCKBUF_LOCK(&so->so_snd);
+				so->so_snd.sb_flags &= ~SB_ASYNC;
+				SOCKBUF_UNLOCK(&so->so_snd);
+			}
 			SOCK_UNLOCK(so);
-			SOCKBUF_LOCK(&so->so_rcv);
-			so->so_rcv.sb_flags &= ~SB_ASYNC;
-			SOCKBUF_UNLOCK(&so->so_rcv);
-			SOCKBUF_LOCK(&so->so_snd);
-			so->so_snd.sb_flags &= ~SB_ASYNC;
-			SOCKBUF_UNLOCK(&so->so_snd);
 		}
 		break;
 
@@ -477,7 +481,6 @@ static int
 soo_stat(struct socket *so, struct stat *ub)
 {
 #endif /* __rtems__ */
-	struct sockbuf *sb;
 #ifdef MAC
 	int error;
 #endif
@@ -491,22 +494,26 @@ soo_stat(struct socket *so, struct stat *ub)
 	if (error)
 		return (error);
 #endif
-	/*
-	 * If SBS_CANTRCVMORE is set, but there's still data left in the
-	 * receive buffer, the socket is still readable.
-	 */
-	sb = &so->so_rcv;
-	SOCKBUF_LOCK(sb);
-	if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb))
-		ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
-	ub->st_size = sbavail(sb) - sb->sb_ctl;
-	SOCKBUF_UNLOCK(sb);
+	if (!SOLISTENING(so)) {
+		struct sockbuf *sb;
 
-	sb = &so->so_snd;
-	SOCKBUF_LOCK(sb);
-	if ((sb->sb_state & SBS_CANTSENDMORE) == 0)
-		ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
-	SOCKBUF_UNLOCK(sb);
+		/*
+		 * If SBS_CANTRCVMORE is set, but there's still data left
+		 * in the receive buffer, the socket is still readable.
+		 */
+		sb = &so->so_rcv;
+		SOCKBUF_LOCK(sb);
+		if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb))
+			ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
+		ub->st_size = sbavail(sb) - sb->sb_ctl;
+		SOCKBUF_UNLOCK(sb);
+	
+		sb = &so->so_snd;
+		SOCKBUF_LOCK(sb);
+		if ((sb->sb_state & SBS_CANTSENDMORE) == 0)
+			ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
+		SOCKBUF_UNLOCK(sb);
+	}
 #ifndef __rtems__
 	ub->st_uid = so->so_cred->cr_uid;
 	ub->st_gid = so->so_cred->cr_gid;
@@ -916,6 +923,7 @@ soaio_process_sb(struct socket *so, struct sockbuf *sb)
 {
 	struct kaiocb *job;
 
+	CURVNET_SET(so->so_vnet);
 	SOCKBUF_LOCK(sb);
 	while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) {
 		job = TAILQ_FIRST(&sb->sb_aiojobq);
@@ -936,9 +944,9 @@ soaio_process_sb(struct socket *so, struct sockbuf *sb)
 	sb->sb_flags &= ~SB_AIO_RUNNING;
 	SOCKBUF_UNLOCK(sb);
 
-	ACCEPT_LOCK();
 	SOCK_LOCK(so);
 	sorele(so);
+	CURVNET_RESTORE();
 }
 
 void
diff --git a/freebsd/sys/kern/uipc_accf.c b/freebsd/sys/kern/uipc_accf.c
index a766adf8..8a0e14e3 100644
--- a/freebsd/sys/kern/uipc_accf.c
+++ b/freebsd/sys/kern/uipc_accf.c
@@ -132,8 +132,7 @@ accept_filt_generic_mod_event(module_t mod, int event, void *data)
 
 	switch (event) {
 	case MOD_LOAD:
-		p = malloc(sizeof(*p), M_ACCF,
-		    M_WAITOK);
+		p = malloc(sizeof(*p), M_ACCF, M_WAITOK);
 		bcopy(accfp, p, sizeof(*p));
 		error = accept_filt_add(p);
 		break;
@@ -164,26 +163,25 @@ accept_filt_generic_mod_event(module_t mod, int event, void *data)
 }
 
 int
-do_getopt_accept_filter(struct socket *so, struct sockopt *sopt)
+accept_filt_getopt(struct socket *so, struct sockopt *sopt)
 {
 	struct accept_filter_arg *afap;
 	int error;
 
 	error = 0;
-	afap = malloc(sizeof(*afap), M_TEMP,
-	    M_WAITOK | M_ZERO);
+	afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO);
 	SOCK_LOCK(so);
 	if ((so->so_options & SO_ACCEPTCONN) == 0) {
 		error = EINVAL;
 		goto out;
 	}
-	if ((so->so_options & SO_ACCEPTFILTER) == 0) {
+	if (so->sol_accept_filter == NULL) {
 		error = EINVAL;
 		goto out;
 	}
-	strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
-	if (so->so_accf->so_accept_filter_str != NULL)
-		strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
+	strcpy(afap->af_name, so->sol_accept_filter->accf_name);
+	if (so->sol_accept_filter_str != NULL)
+		strcpy(afap->af_arg, so->sol_accept_filter_str);
 out:
 	SOCK_UNLOCK(so);
 	if (error == 0)
@@ -193,35 +191,61 @@ out:
 }
 
 int
-do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
+accept_filt_setopt(struct socket *so, struct sockopt *sopt)
 {
 	struct accept_filter_arg *afap;
 	struct accept_filter *afp;
-	struct so_accf *newaf;
-	int error = 0;
+	char *accept_filter_str = NULL;
+	void *accept_filter_arg = NULL;
+	int error;
 
 	/*
 	 * Handle the simple delete case first.
 	 */
 	if (sopt == NULL || sopt->sopt_val == NULL) {
+		struct socket *sp, *sp1;
+		int wakeup;
+
 		SOCK_LOCK(so);
 		if ((so->so_options & SO_ACCEPTCONN) == 0) {
 			SOCK_UNLOCK(so);
 			return (EINVAL);
 		}
-		if (so->so_accf != NULL) {
-			struct so_accf *af = so->so_accf;
-			if (af->so_accept_filter != NULL &&
-				af->so_accept_filter->accf_destroy != NULL) {
-				af->so_accept_filter->accf_destroy(so);
-			}
-			if (af->so_accept_filter_str != NULL)
-				free(af->so_accept_filter_str, M_ACCF);
-			free(af, M_ACCF);
-			so->so_accf = NULL;
+		if (so->sol_accept_filter == NULL) {
+			SOCK_UNLOCK(so);
+			return (0);
 		}
+		if (so->sol_accept_filter->accf_destroy != NULL)
+			so->sol_accept_filter->accf_destroy(so);
+		if (so->sol_accept_filter_str != NULL)
+			free(so->sol_accept_filter_str, M_ACCF);
+		so->sol_accept_filter = NULL;
+		so->sol_accept_filter_arg = NULL;
+		so->sol_accept_filter_str = NULL;
 		so->so_options &= ~SO_ACCEPTFILTER;
-		SOCK_UNLOCK(so);
+
+		/*
+		 * Move from incomplete queue to complete only those
+		 * connections, that are blocked by us.
+		 */
+		wakeup = 0;
+		TAILQ_FOREACH_SAFE(sp, &so->sol_incomp, so_list, sp1) {
+			SOCK_LOCK(sp);
+			if (sp->so_options & SO_ACCEPTFILTER) {
+				TAILQ_REMOVE(&so->sol_incomp, sp, so_list);
+				TAILQ_INSERT_TAIL(&so->sol_comp, sp, so_list);
+				sp->so_qstate = SQ_COMP;
+				sp->so_options &= ~SO_ACCEPTFILTER;
+				so->sol_incqlen--;
+				so->sol_qlen++;
+				wakeup = 1;
+			}
+			SOCK_UNLOCK(sp);
+		}
+		if (wakeup)
+			solisten_wakeup(so);  /* unlocks */
+		else
+			SOLISTEN_UNLOCK(so);
 		return (0);
 	}
 
@@ -229,8 +253,7 @@ do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
 	 * Pre-allocate any memory we may need later to avoid blocking at
 	 * untimely moments.  This does not optimize for invalid arguments.
 	 */
-	afap = malloc(sizeof(*afap), M_TEMP,
-	    M_WAITOK);
+	afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK);
 	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
 	afap->af_name[sizeof(afap->af_name)-1] = '\0';
 	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
@@ -243,19 +266,10 @@ do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
 		free(afap, M_TEMP);
 		return (ENOENT);
 	}
-	/*
-	 * Allocate the new accept filter instance storage.  We may
-	 * have to free it again later if we fail to attach it.  If
-	 * attached properly, 'newaf' is NULLed to avoid a free()
-	 * while in use.
-	 */
-	newaf = malloc(sizeof(*newaf), M_ACCF, M_WAITOK |
-	    M_ZERO);
 	if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
 		size_t len = strlen(afap->af_name) + 1;
-		newaf->so_accept_filter_str = malloc(len, M_ACCF,
-		    M_WAITOK);
-		strcpy(newaf->so_accept_filter_str, afap->af_name);
+		accept_filter_str = malloc(len, M_ACCF, M_WAITOK);
+		strcpy(accept_filter_str, afap->af_name);
 	}
 
 	/*
@@ -263,8 +277,8 @@ do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
 	 * without first removing it.
 	 */
 	SOCK_LOCK(so);
-	if (((so->so_options & SO_ACCEPTCONN) == 0) ||
-	    (so->so_accf != NULL)) {
+	if ((so->so_options & SO_ACCEPTCONN) == 0 ||
+	    so->sol_accept_filter != NULL) {
 		error = EINVAL;
 		goto out;
 	}
@@ -275,25 +289,25 @@ do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
 	 * can't block.
 	 */
 	if (afp->accf_create != NULL) {
-		newaf->so_accept_filter_arg =
-		    afp->accf_create(so, afap->af_arg);
-		if (newaf->so_accept_filter_arg == NULL) {
+		accept_filter_arg = afp->accf_create(so, afap->af_arg);
+		if (accept_filter_arg == NULL) {
 			error = EINVAL;
 			goto out;
 		}
 	}
-	newaf->so_accept_filter = afp;
-	so->so_accf = newaf;
+	so->sol_accept_filter = afp;
+	so->sol_accept_filter_arg = accept_filter_arg;
+	so->sol_accept_filter_str = accept_filter_str;
 	so->so_options |= SO_ACCEPTFILTER;
-	newaf = NULL;
+	/*
+	 * The socket owns the name string now.  Clear the local pointer so
+	 * that the cleanup below does not free it while it is still in use.
+	 */
+	accept_filter_str = NULL;
 out:
 	SOCK_UNLOCK(so);
-	if (newaf != NULL) {
-		if (newaf->so_accept_filter_str != NULL)
-			free(newaf->so_accept_filter_str, M_ACCF);
-		free(newaf, M_ACCF);
-	}
-	if (afap != NULL)
-		free(afap, M_TEMP);
+	if (accept_filter_str != NULL)
+		free(accept_filter_str, M_ACCF);
+	free(afap, M_TEMP);
 	return (error);
 }
diff --git a/freebsd/sys/kern/uipc_mbuf.c b/freebsd/sys/kern/uipc_mbuf.c
index ba8a2d48..abc30dd3 100644
--- a/freebsd/sys/kern/uipc_mbuf.c
+++ b/freebsd/sys/kern/uipc_mbuf.c
@@ -1519,7 +1519,7 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
 	 * the total data supplied by the uio.
 	 */
 	if (len > 0)
-		total = min(uio->uio_resid, len);
+		total = (uio->uio_resid < len) ? uio->uio_resid : len;
 	else
 		total = uio->uio_resid;
 
diff --git a/freebsd/sys/kern/uipc_sockbuf.c b/freebsd/sys/kern/uipc_sockbuf.c
index 04193c29..4b710a2c 100644
--- a/freebsd/sys/kern/uipc_sockbuf.c
+++ b/freebsd/sys/kern/uipc_sockbuf.c
@@ -316,14 +316,14 @@ sowakeup(struct socket *so, struct sockbuf *sb)
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
-	selwakeuppri(&sb->sb_sel, PSOCK);
-	if (!SEL_WAITING(&sb->sb_sel))
+	selwakeuppri(sb->sb_sel, PSOCK);
+	if (!SEL_WAITING(sb->sb_sel))
 		sb->sb_flags &= ~SB_SEL;
 	if (sb->sb_flags & SB_WAIT) {
 		sb->sb_flags &= ~SB_WAIT;
 		wakeup(&sb->sb_acc);
 	}
-	KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
+	KNOTE_LOCKED(&sb->sb_sel->si_note, 0);
 	if (sb->sb_upcall != NULL) {
 		ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT);
 		if (ret == SU_ISCONNECTED) {
@@ -336,7 +336,7 @@ sowakeup(struct socket *so, struct sockbuf *sb)
 	if (sb->sb_flags & SB_AIO)
 		sowakeup_aio(so, sb);
 	SOCKBUF_UNLOCK(sb);
-	if (ret == SU_ISCONNECTED)
+	if (ret == SU_ISCONNECTED && !(so->so_state & SS_ISDISCONNECTED))
 		soisconnected(so);
 	if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
 		pgsigio(&so->so_sigio, SIGIO, 0);
@@ -457,14 +457,78 @@ sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
 }
 
 int
-sbreserve(struct sockbuf *sb, u_long cc, struct socket *so, 
-    struct thread *td)
+sbsetopt(struct socket *so, int cmd, u_long cc)
 {
+	struct sockbuf *sb = NULL;	/* assigned only when not listening */
+	short *flags;
+	u_int *hiwat, *lowat;
 	int error;
 
-	SOCKBUF_LOCK(sb);
-	error = sbreserve_locked(sb, cc, so, td);
-	SOCKBUF_UNLOCK(sb);
+	SOCK_LOCK(so);
+	if (SOLISTENING(so)) {
+		switch (cmd) {
+			case SO_SNDLOWAT:
+			case SO_SNDBUF:
+				lowat = &so->sol_sbsnd_lowat;
+				hiwat = &so->sol_sbsnd_hiwat;
+				flags = &so->sol_sbsnd_flags;
+				break;
+			case SO_RCVLOWAT:
+			case SO_RCVBUF:
+				lowat = &so->sol_sbrcv_lowat;
+				hiwat = &so->sol_sbrcv_hiwat;
+				flags = &so->sol_sbrcv_flags;
+				break;
+		}
+	} else {
+		switch (cmd) {
+			case SO_SNDLOWAT:
+			case SO_SNDBUF:
+				sb = &so->so_snd;
+				break;
+			case SO_RCVLOWAT:
+			case SO_RCVBUF:
+				sb = &so->so_rcv;
+				break;
+		}
+		flags = &sb->sb_flags;
+		hiwat = &sb->sb_hiwat;
+		lowat = &sb->sb_lowat;
+		SOCKBUF_LOCK(sb);
+	}
+
+	error = 0;
+	switch (cmd) {
+	case SO_SNDBUF:
+	case SO_RCVBUF:
+		if (SOLISTENING(so)) {
+			if (cc > sb_max_adj) {
+				error = ENOBUFS;
+				break;
+			}
+			*hiwat = cc;
+			if (*lowat > *hiwat)
+				*lowat = *hiwat;
+		} else {
+			if (!sbreserve_locked(sb, cc, so, curthread))
+				error = ENOBUFS;
+		}
+		if (error == 0)
+			*flags &= ~SB_AUTOSIZE;
+		break;
+	case SO_SNDLOWAT:
+	case SO_RCVLOWAT:
+		/*
+		 * Make sure the low-water is never greater than the
+		 * high-water.
+		 */
+		*lowat = (cc > *hiwat) ? *hiwat : cc;
+		break;
+	}
+
+	if (!SOLISTENING(so))
+		SOCKBUF_UNLOCK(sb);
+	SOCK_UNLOCK(so);
 	return (error);
 }
 
diff --git a/freebsd/sys/kern/uipc_socket.c b/freebsd/sys/kern/uipc_socket.c
index c52a543c..1773606d 100644
--- a/freebsd/sys/kern/uipc_socket.c
+++ b/freebsd/sys/kern/uipc_socket.c
@@ -108,6 +108,7 @@ __FBSDID("$FreeBSD$");
 #include <rtems/bsd/local/opt_inet.h>
 #include <rtems/bsd/local/opt_inet6.h>
 #include <rtems/bsd/local/opt_compat.h>
+#include <rtems/bsd/local/opt_sctp.h>
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -160,13 +161,21 @@ __FBSDID("$FreeBSD$");
 
 static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
 		    int flags);
+static void	so_rdknl_lock(void *);
+static void	so_rdknl_unlock(void *);
+static void	so_rdknl_assert_locked(void *);
+static void	so_rdknl_assert_unlocked(void *);
+static void	so_wrknl_lock(void *);
+static void	so_wrknl_unlock(void *);
+static void	so_wrknl_assert_locked(void *);
+static void	so_wrknl_assert_unlocked(void *);
 
 static void	filt_sordetach(struct knote *kn);
 static int	filt_soread(struct knote *kn, long hint);
 static void	filt_sowdetach(struct knote *kn);
 static int	filt_sowrite(struct knote *kn, long hint);
-static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
 static int	filt_soempty(struct knote *kn, long hint);
+static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
 #ifdef __rtems__
 static
 #endif /* __rtems__ */
@@ -412,8 +421,16 @@ soalloc(struct vnet *vnet)
 		return (NULL);
 	}
 
+	/*
+	 * The socket locking protocol allows locking two sockets at a time;
+	 * however, the first one must be a listening socket.  WITNESS lacks
+	 * a feature to change the class of an existing lock, so we use DUPOK.
+	 */
+	mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
 	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
 	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
+	so->so_rcv.sb_sel = &so->so_rdsel;
+	so->so_snd.sb_sel = &so->so_wrsel;
 	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
 	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
 #ifndef __rtems__
@@ -465,15 +482,6 @@ sodealloc(struct socket *so)
 	so->so_vnet->vnet_sockcnt--;
 #endif
 	mtx_unlock(&so_global_mtx);
-	if (so->so_rcv.sb_hiwat)
-		(void)chgsbsize(so->so_cred->cr_uidinfo,
-		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
-	if (so->so_snd.sb_hiwat)
-		(void)chgsbsize(so->so_cred->cr_uidinfo,
-		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
-	/* remove accept filter if one is present. */
-	if (so->so_accf != NULL)
-		do_setopt_accept_filter(so, NULL);
 #ifdef MAC
 	mac_socket_destroy(so);
 #endif
@@ -481,10 +489,22 @@ sodealloc(struct socket *so)
 
 	crfree(so->so_cred);
 	khelp_destroy_osd(&so->osd);
-	sx_destroy(&so->so_snd.sb_sx);
-	sx_destroy(&so->so_rcv.sb_sx);
-	SOCKBUF_LOCK_DESTROY(&so->so_snd);
-	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+	if (SOLISTENING(so)) {
+		if (so->sol_accept_filter != NULL)
+			accept_filt_setopt(so, NULL);
+	} else {
+		if (so->so_rcv.sb_hiwat)
+			(void)chgsbsize(so->so_cred->cr_uidinfo,
+			    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
+		if (so->so_snd.sb_hiwat)
+			(void)chgsbsize(so->so_cred->cr_uidinfo,
+			    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
+		sx_destroy(&so->so_snd.sb_sx);
+		sx_destroy(&so->so_rcv.sb_sx);
+		SOCKBUF_LOCK_DESTROY(&so->so_snd);
+		SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+	}
+	mtx_destroy(&so->so_lock);
 	uma_zfree(socket_zone, so);
 }
 
@@ -527,8 +547,6 @@ socreate(int dom, struct socket **aso, int type, int proto,
 	if (so == NULL)
 		return (ENOBUFS);
 
-	TAILQ_INIT(&so->so_incomp);
-	TAILQ_INIT(&so->so_comp);
 	so->so_type = type;
 	so->so_cred = crhold(cred);
 	if ((prp->pr_domain->dom_family == PF_INET) ||
@@ -545,9 +563,10 @@ socreate(int dom, struct socket **aso, int type, int proto,
 #ifdef MAC
 	mac_socket_create(cred, so);
 #endif
-	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
-	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
-	so->so_count = 1;
+	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
+	    so_rdknl_assert_locked, so_rdknl_assert_unlocked);
+	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
+	    so_wrknl_assert_locked, so_wrknl_assert_unlocked);
 	/*
 	 * Auto-sizing of socket buffers is managed by the protocols and
 	 * the appropriate flags must be set in the pru_attach function.
@@ -556,12 +575,10 @@ socreate(int dom, struct socket **aso, int type, int proto,
 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
 	CURVNET_RESTORE();
 	if (error) {
-		KASSERT(so->so_count == 1, ("socreate: so_count %d",
-		    so->so_count));
-		so->so_count = 0;
 		sodealloc(so);
 		return (error);
 	}
+	soref(so);
 	*aso = so;
 	return (0);
 }
@@ -589,11 +606,11 @@ sonewconn(struct socket *head, int connstatus)
 	static int overcount;
 
 	struct socket *so;
-	int over;
+	u_int over;
 
-	ACCEPT_LOCK();
-	over = (head->so_qlen > 3 * head->so_qlimit / 2);
-	ACCEPT_UNLOCK();
+	SOLISTEN_LOCK(head);
+	over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
+	SOLISTEN_UNLOCK(head);
 #ifdef REGRESSION
 	if (regression_sonewconn_earlytest && over) {
 #else
@@ -605,15 +622,15 @@ sonewconn(struct socket *head, int connstatus)
 			log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
 			    "%i already in queue awaiting acceptance "
 			    "(%d occurrences)\n",
-			    __func__, head->so_pcb, head->so_qlen, overcount);
+			    __func__, head->so_pcb, head->sol_qlen, overcount);
 
 			overcount = 0;
 		}
 
 		return (NULL);
 	}
-	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
-	    __func__, __LINE__, head));
+	VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
+	    __func__, head));
 	so = soalloc(head->so_vnet);
 	if (so == NULL) {
 		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
@@ -621,11 +638,8 @@ sonewconn(struct socket *head, int connstatus)
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
-	if ((head->so_options & SO_ACCEPTFILTER) != 0)
-		connstatus = 0;
-	so->so_head = head;
+	so->so_listen = head;
 	so->so_type = head->so_type;
-	so->so_options = head->so_options &~ SO_ACCEPTCONN;
 	so->so_linger = head->so_linger;
 	so->so_state = head->so_state | SS_NOFDREF;
 	so->so_fibnum = head->so_fibnum;
@@ -634,10 +648,12 @@ sonewconn(struct socket *head, int connstatus)
 #ifdef MAC
 	mac_socket_newconn(head, so);
 #endif
-	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
-	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
+	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
+	    so_rdknl_assert_locked, so_rdknl_assert_unlocked);
+	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
+	    so_wrknl_assert_locked, so_wrknl_assert_unlocked);
 	VNET_SO_ASSERT(head);
-	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
+	if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
 		sodealloc(so);
 		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 		    __func__, head->so_pcb);
@@ -649,32 +665,24 @@ sonewconn(struct socket *head, int connstatus)
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
-	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
-	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
-	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
-	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
-	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
-	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
+	so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
+	so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
+	so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
+	so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
+	so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE;
+	so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE;
+
+	SOLISTEN_LOCK(head);
+	if (head->sol_accept_filter != NULL)
+		connstatus = 0;
 	so->so_state |= connstatus;
-	ACCEPT_LOCK();
-	/*
-	 * The accept socket may be tearing down but we just
-	 * won a race on the ACCEPT_LOCK.
-	 * However, if sctp_peeloff() is called on a 1-to-many
-	 * style socket, the SO_ACCEPTCONN doesn't need to be set.
-	 */
-	if (!(head->so_options & SO_ACCEPTCONN) &&
-	    ((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
-	     (head->so_type != SOCK_SEQPACKET))) {
-		SOCK_LOCK(so);
-		so->so_head = NULL;
-		sofree(so);		/* NB: returns ACCEPT_UNLOCK'ed. */
-		return (NULL);
-	}
+	so->so_options = head->so_options & ~SO_ACCEPTCONN;
+	soref(head); /* A socket on (in)complete queue refs head. */
 	if (connstatus) {
-		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
-		so->so_qstate |= SQ_COMP;
-		head->so_qlen++;
+		TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
+		so->so_qstate = SQ_COMP;
+		head->sol_qlen++;
+		solisten_wakeup(head);	/* unlocks */
 	} else {
 		/*
 		 * Keep removing sockets from the head until there's room for
@@ -683,28 +691,86 @@ sonewconn(struct socket *head, int connstatus)
 		 * threads and soabort() requires dropping locks, we must
 		 * loop waiting for the condition to be true.
 		 */
-		while (head->so_incqlen > head->so_qlimit) {
+		while (head->sol_incqlen > head->sol_qlimit) {
 			struct socket *sp;
-			sp = TAILQ_FIRST(&head->so_incomp);
-			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
-			head->so_incqlen--;
-			sp->so_qstate &= ~SQ_INCOMP;
-			sp->so_head = NULL;
-			ACCEPT_UNLOCK();
+
+			sp = TAILQ_FIRST(&head->sol_incomp);
+			TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
+			head->sol_incqlen--;
+			SOCK_LOCK(sp);
+			sp->so_qstate = SQ_NONE;
+			sp->so_listen = NULL;
+			SOCK_UNLOCK(sp);
+			sorele(head);	/* does SOLISTEN_UNLOCK, head stays */
 			soabort(sp);
-			ACCEPT_LOCK();
+			SOLISTEN_LOCK(head);
 		}
-		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
-		so->so_qstate |= SQ_INCOMP;
-		head->so_incqlen++;
+		TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
+		so->so_qstate = SQ_INCOMP;
+		head->sol_incqlen++;
+		SOLISTEN_UNLOCK(head);
 	}
-	ACCEPT_UNLOCK();
-	if (connstatus) {
-		sorwakeup(head);
-		wakeup_one(&head->so_timeo);
+	return (so);
+}
+
+#ifdef SCTP
+/*
+ * Socket part of sctp_peeloff().  Detach a new socket from an
+ * association.  The new socket is returned with a reference.
+ */
+struct socket *
+sopeeloff(struct socket *head)
+{
+	struct socket *so;
+
+	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
+	    __func__, __LINE__, head));
+	so = soalloc(head->so_vnet);
+	if (so == NULL) {
+		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
+		    "limit reached or out of memory\n",
+		    __func__, head->so_pcb);
+		return (NULL);
 	}
+	so->so_type = head->so_type;
+	so->so_options = head->so_options;
+	so->so_linger = head->so_linger;
+	so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
+	so->so_fibnum = head->so_fibnum;
+	so->so_proto = head->so_proto;
+	so->so_cred = crhold(head->so_cred);
+#ifdef MAC
+	mac_socket_newconn(head, so);
+#endif
+	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
+	    so_rdknl_assert_locked, so_rdknl_assert_unlocked);
+	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
+	    so_wrknl_assert_locked, so_wrknl_assert_unlocked);
+	VNET_SO_ASSERT(head);
+	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
+		sodealloc(so);
+		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
+		    __func__, head->so_pcb);
+		return (NULL);
+	}
+	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+		sodealloc(so);
+		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
+		    __func__, head->so_pcb);
+		return (NULL);
+	}
+	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
+	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
+	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
+	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
+	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
+	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
+
+	soref(so);
+
 	return (so);
 }
+#endif	/* SCTP */
 
 int
 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
@@ -766,13 +832,140 @@ solisten_proto_check(struct socket *so)
 void
 solisten_proto(struct socket *so, int backlog)
 {
+	int sbrcv_lowat, sbsnd_lowat;
+	u_int sbrcv_hiwat, sbsnd_hiwat;
+	short sbrcv_flags, sbsnd_flags;
+	sbintime_t sbrcv_timeo, sbsnd_timeo;
 
 	SOCK_LOCK_ASSERT(so);
 
+	if (SOLISTENING(so))
+		goto listening;
+
+	/*
+	 * Change this socket to listening state.
+	 */
+	sbrcv_lowat = so->so_rcv.sb_lowat;
+	sbsnd_lowat = so->so_snd.sb_lowat;
+	sbrcv_hiwat = so->so_rcv.sb_hiwat;
+	sbsnd_hiwat = so->so_snd.sb_hiwat;
+	sbrcv_flags = so->so_rcv.sb_flags;
+	sbsnd_flags = so->so_snd.sb_flags;
+	sbrcv_timeo = so->so_rcv.sb_timeo;
+	sbsnd_timeo = so->so_snd.sb_timeo;
+
+	sbdestroy(&so->so_snd, so);
+	sbdestroy(&so->so_rcv, so);
+	sx_destroy(&so->so_snd.sb_sx);
+	sx_destroy(&so->so_rcv.sb_sx);
+	SOCKBUF_LOCK_DESTROY(&so->so_snd);
+	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+
+#ifdef INVARIANTS
+	bzero(&so->so_rcv,
+	    sizeof(struct socket) - offsetof(struct socket, so_rcv));
+#endif
+
+	so->sol_sbrcv_lowat = sbrcv_lowat;
+	so->sol_sbsnd_lowat = sbsnd_lowat;
+	so->sol_sbrcv_hiwat = sbrcv_hiwat;
+	so->sol_sbsnd_hiwat = sbsnd_hiwat;
+	so->sol_sbrcv_flags = sbrcv_flags;
+	so->sol_sbsnd_flags = sbsnd_flags;
+	so->sol_sbrcv_timeo = sbrcv_timeo;
+	so->sol_sbsnd_timeo = sbsnd_timeo;
+
+	so->sol_qlen = so->sol_incqlen = 0;
+	TAILQ_INIT(&so->sol_incomp);
+	TAILQ_INIT(&so->sol_comp);
+
+	so->sol_accept_filter = NULL;
+	so->sol_accept_filter_arg = NULL;
+	so->sol_accept_filter_str = NULL;
+
+	so->sol_upcall = NULL;
+	so->sol_upcallarg = NULL;
+
+	so->so_options |= SO_ACCEPTCONN;
+
+listening:
 	if (backlog < 0 || backlog > somaxconn)
 		backlog = somaxconn;
-	so->so_qlimit = backlog;
-	so->so_options |= SO_ACCEPTCONN;
+	so->sol_qlimit = backlog;
+}
+
+/*
+ * Wakeup listeners/subsystems once we have a complete connection.
+ * Enters with lock, returns unlocked.
+ */
+void
+solisten_wakeup(struct socket *sol)
+{
+
+	if (sol->sol_upcall != NULL)
+		(void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
+	else {
+		selwakeuppri(&sol->so_rdsel, PSOCK);
+		KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
+	}
+	SOLISTEN_UNLOCK(sol);
+	wakeup_one(&sol->sol_comp);
+}
+
+/*
+ * Return a single connection off a listening socket queue.  The main consumer
+ * of the function is kern_accept4().  Some modules that do their own accept
+ * management also use the function.
+ *
+ * The listening socket must be locked on entry and is unlocked on
+ * return.
+ * The flags argument is a set of accept4(2) flags and ACCEPT4_INHERIT.
+ */
+int
+solisten_dequeue(struct socket *head, struct socket **ret, int flags)
+{
+	struct socket *so;
+	int error;
+
+	SOLISTEN_LOCK_ASSERT(head);
+
+	while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
+	    head->so_error == 0) {
+		error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH,
+		    "accept", 0);
+		if (error != 0) {
+			SOLISTEN_UNLOCK(head);
+			return (error);
+		}
+	}
+	if (head->so_error) {
+		error = head->so_error;
+		head->so_error = 0;
+		SOLISTEN_UNLOCK(head);
+		return (error);
+        }
+	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) {
+		SOLISTEN_UNLOCK(head);
+		return (EWOULDBLOCK);
+	}
+	so = TAILQ_FIRST(&head->sol_comp);
+	SOCK_LOCK(so);
+	KASSERT(so->so_qstate == SQ_COMP,
+	    ("%s: so %p not SQ_COMP", __func__, so));
+	soref(so);
+	head->sol_qlen--;
+	so->so_qstate = SQ_NONE;
+	so->so_listen = NULL;
+	TAILQ_REMOVE(&head->sol_comp, so, so_list);
+	if (flags & ACCEPT4_INHERIT)
+		so->so_state |= (head->so_state & SS_NBIO);
+	else
+		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
+	SOCK_UNLOCK(so);
+	sorele(head);
+
+	*ret = so;
+	return (0);
 }
 
 /*
@@ -799,44 +992,62 @@ void
 sofree(struct socket *so)
 {
 	struct protosw *pr = so->so_proto;
-	struct socket *head;
 
-	ACCEPT_LOCK_ASSERT();
 	SOCK_LOCK_ASSERT(so);
 
 	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
-	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
+	    (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
 		SOCK_UNLOCK(so);
-		ACCEPT_UNLOCK();
 		return;
 	}
 
-	head = so->so_head;
-	if (head != NULL) {
-		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
-		    (so->so_qstate & SQ_INCOMP) != 0,
-		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
-		    "SQ_INCOMP"));
-		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
-		    (so->so_qstate & SQ_INCOMP) == 0,
-		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
-		TAILQ_REMOVE(&head->so_incomp, so, so_list);
-		head->so_incqlen--;
-		so->so_qstate &= ~SQ_INCOMP;
-		so->so_head = NULL;
-	}
-	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
-	    (so->so_qstate & SQ_INCOMP) == 0,
-	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
-	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
-	if (so->so_options & SO_ACCEPTCONN) {
-		KASSERT((TAILQ_EMPTY(&so->so_comp)),
-		    ("sofree: so_comp populated"));
-		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
-		    ("sofree: so_incomp populated"));
+	if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
+		struct socket *sol;
+
+		sol = so->so_listen;
+		KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));
+
+		/*
+		 * To solve race between close of a listening socket and
+		 * a socket on its incomplete queue, we need to lock both.
+		 * The order is first listening socket, then regular.
+		 * Since so has neither a file descriptor (SS_NOFDREF is set)
+		 * nor a protocol reference (SS_PROTOREF is clear), only this
+		 * function and the listening socket point to so.  To preserve
+		 * so and sol, we reference both and then relock.
+		 * After relock the socket may not move to sol_comp since it
+		 * doesn't have PCB already, but it may be removed from
+		 * sol_incomp.  If that happens, we share responsibility for
+		 * freeing the socket, but soclose() has already removed
+		 * it from the queue.
+		 */
+		soref(sol);
+		soref(so);
+		SOCK_UNLOCK(so);
+		SOLISTEN_LOCK(sol);
+		SOCK_LOCK(so);
+		if (so->so_qstate == SQ_INCOMP) {
+			KASSERT(so->so_listen == sol,
+			    ("%s: so %p migrated out of sol %p",
+			    __func__, so, sol));
+			TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
+			sol->sol_incqlen--;
+			/* This is guaranteed not to be the last. */
+			refcount_release(&sol->so_count);
+			so->so_qstate = SQ_NONE;
+			so->so_listen = NULL;
+		} else
+			KASSERT(so->so_listen == NULL,
+			    ("%s: so %p not on (in)comp with so_listen",
+			    __func__, so));
+		sorele(sol);
+		KASSERT(so->so_count == 1,
+		    ("%s: so %p count %u", __func__, so, so->so_count));
+		so->so_count = 0;
 	}
+	if (SOLISTENING(so))
+		so->so_error = ECONNABORTED;
 	SOCK_UNLOCK(so);
-	ACCEPT_UNLOCK();
 
 	VNET_SO_ASSERT(so);
 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
@@ -858,12 +1069,14 @@ sofree(struct socket *so)
 	 * before calling pru_detach.  This means that protocols shold not
 	 * assume they can perform socket wakeups, etc, in their detach code.
 	 */
-	sbdestroy(&so->so_snd, so);
-	sbdestroy(&so->so_rcv, so);
-	seldrain(&so->so_snd.sb_sel);
-	seldrain(&so->so_rcv.sb_sel);
-	knlist_destroy(&so->so_rcv.sb_sel.si_note);
-	knlist_destroy(&so->so_snd.sb_sel.si_note);
+	if (!SOLISTENING(so)) {
+		sbdestroy(&so->so_snd, so);
+		sbdestroy(&so->so_rcv, so);
+	}
+	seldrain(&so->so_rdsel);
+	seldrain(&so->so_wrsel);
+	knlist_destroy(&so->so_rdsel.si_note);
+	knlist_destroy(&so->so_wrsel.si_note);
 	sodealloc(so);
 }
 
@@ -878,6 +1091,8 @@ sofree(struct socket *so)
 int
 soclose(struct socket *so)
 {
+	struct accept_queue lqueue;
+	bool listening;
 	int error = 0;
 
 	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
@@ -910,41 +1125,42 @@ soclose(struct socket *so)
 drop:
 	if (so->so_proto->pr_usrreqs->pru_close != NULL)
 		(*so->so_proto->pr_usrreqs->pru_close)(so);
-	ACCEPT_LOCK();
-	if (so->so_options & SO_ACCEPTCONN) {
+
+	SOCK_LOCK(so);
+	if ((listening = (so->so_options & SO_ACCEPTCONN))) {
 		struct socket *sp;
-		/*
-		 * Prevent new additions to the accept queues due
-		 * to ACCEPT_LOCK races while we are draining them.
-		 */
-		so->so_options &= ~SO_ACCEPTCONN;
-		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
-			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
-			so->so_incqlen--;
-			sp->so_qstate &= ~SQ_INCOMP;
-			sp->so_head = NULL;
-			ACCEPT_UNLOCK();
-			soabort(sp);
-			ACCEPT_LOCK();
-		}
-		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
-			TAILQ_REMOVE(&so->so_comp, sp, so_list);
-			so->so_qlen--;
-			sp->so_qstate &= ~SQ_COMP;
-			sp->so_head = NULL;
-			ACCEPT_UNLOCK();
-			soabort(sp);
-			ACCEPT_LOCK();
+
+		TAILQ_INIT(&lqueue);
+		TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
+		TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);
+
+		so->sol_qlen = so->sol_incqlen = 0;
+
+		TAILQ_FOREACH(sp, &lqueue, so_list) {
+			SOCK_LOCK(sp);
+			sp->so_qstate = SQ_NONE;
+			sp->so_listen = NULL;
+			SOCK_UNLOCK(sp);
+			/* Guaranteed not to be the last. */
+			refcount_release(&so->so_count);
 		}
-		KASSERT((TAILQ_EMPTY(&so->so_comp)),
-		    ("%s: so_comp populated", __func__));
-		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
-		    ("%s: so_incomp populated", __func__));
 	}
-	SOCK_LOCK(so);
 	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
 	so->so_state |= SS_NOFDREF;
-	sorele(so);			/* NB: Returns with ACCEPT_UNLOCK(). */
+	sorele(so);
+	if (listening) {
+		struct socket *sp;
+
+		TAILQ_FOREACH(sp, &lqueue, so_list) {
+			SOCK_LOCK(sp);
+			if (sp->so_count == 0) {
+				SOCK_UNLOCK(sp);
+				soabort(sp);
+			} else
+				/* sp is now in sofree() */
+				SOCK_UNLOCK(sp);
+		}
+	}
 	CURVNET_RESTORE();
 	return (error);
 }
@@ -976,13 +1192,11 @@ soabort(struct socket *so)
 	KASSERT(so->so_count == 0, ("soabort: so_count"));
 	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
 	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
-	KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
-	KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
+	KASSERT(so->so_qstate == SQ_NONE, ("soabort: !SQ_NONE"));
 	VNET_SO_ASSERT(so);
 
 	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
 		(*so->so_proto->pr_usrreqs->pru_abort)(so);
-	ACCEPT_LOCK();
 	SOCK_LOCK(so);
 	sofree(so);
 }
@@ -1431,8 +1645,14 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
 	int error;
 
 	CURVNET_SET(so->so_vnet);
-	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
-	    control, flags, td);
+	if (!SOLISTENING(so))
+		error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio,
+		    top, control, flags, td);
+	else {
+		m_freem(top);
+		m_freem(control);
+		error = ENOTCONN;
+	}
 	CURVNET_RESTORE();
 	return (error);
 }
@@ -2368,8 +2588,11 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
 	int error;
 
 	CURVNET_SET(so->so_vnet);
-	error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
-	    controlp, flagsp));
+	if (!SOLISTENING(so))
+		error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio,
+		    mp0, controlp, flagsp));
+	else
+		error = ENOTCONN;
 	CURVNET_RESTORE();
 	return (error);
 }
@@ -2565,7 +2788,7 @@ sosetopt(struct socket *so, struct sockopt *sopt)
 	} else {
 		switch (sopt->sopt_name) {
 		case SO_ACCEPTFILTER:
-			error = do_setopt_accept_filter(so, sopt);
+			error = accept_filt_setopt(so, sopt);
 			if (error)
 				goto bad;
 			break;
@@ -2653,38 +2876,7 @@ sosetopt(struct socket *so, struct sockopt *sopt)
 				goto bad;
 			}
 
-			switch (sopt->sopt_name) {
-			case SO_SNDBUF:
-			case SO_RCVBUF:
-				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
-				    &so->so_snd : &so->so_rcv, (u_long)optval,
-				    so, curthread) == 0) {
-					error = ENOBUFS;
-					goto bad;
-				}
-				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
-				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
-				break;
-
-			/*
-			 * Make sure the low-water is never greater than the
-			 * high-water.
-			 */
-			case SO_SNDLOWAT:
-				SOCKBUF_LOCK(&so->so_snd);
-				so->so_snd.sb_lowat =
-				    (optval > so->so_snd.sb_hiwat) ?
-				    so->so_snd.sb_hiwat : optval;
-				SOCKBUF_UNLOCK(&so->so_snd);
-				break;
-			case SO_RCVLOWAT:
-				SOCKBUF_LOCK(&so->so_rcv);
-				so->so_rcv.sb_lowat =
-				    (optval > so->so_rcv.sb_hiwat) ?
-				    so->so_rcv.sb_hiwat : optval;
-				SOCKBUF_UNLOCK(&so->so_rcv);
-				break;
-			}
+			error = sbsetopt(so, sopt->sopt_name, optval);
 			break;
 
 		case SO_SNDTIMEO:
@@ -2825,7 +3017,7 @@ sogetopt(struct socket *so, struct sockopt *sopt)
 	} else {
 		switch (sopt->sopt_name) {
 		case SO_ACCEPTFILTER:
-			error = do_getopt_accept_filter(so, sopt);
+			error = accept_filt_getopt(so, sopt);
 			break;
 
 		case SO_LINGER:
@@ -2869,19 +3061,23 @@ integer:
 			goto integer;
 
 		case SO_SNDBUF:
-			optval = so->so_snd.sb_hiwat;
+			optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
+			    so->so_snd.sb_hiwat;
 			goto integer;
 
 		case SO_RCVBUF:
-			optval = so->so_rcv.sb_hiwat;
+			optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
+			    so->so_rcv.sb_hiwat;
 			goto integer;
 
 		case SO_SNDLOWAT:
-			optval = so->so_snd.sb_lowat;
+			optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
+			    so->so_snd.sb_lowat;
 			goto integer;
 
 		case SO_RCVLOWAT:
-			optval = so->so_rcv.sb_lowat;
+			optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
+			    so->so_rcv.sb_lowat;
 			goto integer;
 
 		case SO_SNDTIMEO:
@@ -2933,15 +3129,15 @@ integer:
 			break;
 
 		case SO_LISTENQLIMIT:
-			optval = so->so_qlimit;
+			optval = SOLISTENING(so) ? so->sol_qlimit : 0;
 			goto integer;
 
 		case SO_LISTENQLEN:
-			optval = so->so_qlen;
+			optval = SOLISTENING(so) ? so->sol_qlen : 0;
 			goto integer;
 
 		case SO_LISTENINCQLEN:
-			optval = so->so_incqlen;
+			optval = SOLISTENING(so) ? so->sol_incqlen : 0;
 			goto integer;
 
 		case SO_TS_CLOCK:
@@ -3092,7 +3288,7 @@ sohasoutofband(struct socket *so)
 	if (so->so_sigio != NULL)
 		pgsigio(&so->so_sigio, SIGURG, 0);
 #endif /* __rtems__ */
-	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
+	selwakeuppri(&so->so_rdsel, PSOCK);
 }
 
 int
@@ -3112,44 +3308,54 @@ int
 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
     struct thread *td)
 {
-	int revents = 0;
-
-	SOCKBUF_LOCK(&so->so_snd);
-	SOCKBUF_LOCK(&so->so_rcv);
-	if (events & (POLLIN | POLLRDNORM))
-		if (soreadabledata(so))
-			revents |= events & (POLLIN | POLLRDNORM);
+	int revents;
 
-	if (events & (POLLOUT | POLLWRNORM))
-		if (sowriteable(so))
-			revents |= events & (POLLOUT | POLLWRNORM);
-
-	if (events & (POLLPRI | POLLRDBAND))
-		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
-			revents |= events & (POLLPRI | POLLRDBAND);
-
-	if ((events & POLLINIGNEOF) == 0) {
-		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
-			revents |= events & (POLLIN | POLLRDNORM);
-			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
-				revents |= POLLHUP;
+	SOCK_LOCK(so);
+	if (SOLISTENING(so)) {
+		if (!(events & (POLLIN | POLLRDNORM)))
+			revents = 0;
+		else if (!TAILQ_EMPTY(&so->sol_comp))
+			revents = events & (POLLIN | POLLRDNORM);
+		else {
+			selrecord(td, &so->so_rdsel);
+			revents = 0;
 		}
-	}
-
-	if (revents == 0) {
-		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
-			selrecord(td, &so->so_rcv.sb_sel);
-			so->so_rcv.sb_flags |= SB_SEL;
+	} else {
+		revents = 0;
+		SOCKBUF_LOCK(&so->so_snd);
+		SOCKBUF_LOCK(&so->so_rcv);
+		if (events & (POLLIN | POLLRDNORM))
+			if (soreadabledata(so))
+				revents |= events & (POLLIN | POLLRDNORM);
+		if (events & (POLLOUT | POLLWRNORM))
+			if (sowriteable(so))
+				revents |= events & (POLLOUT | POLLWRNORM);
+		if (events & (POLLPRI | POLLRDBAND))
+			if (so->so_oobmark ||
+			    (so->so_rcv.sb_state & SBS_RCVATMARK))
+				revents |= events & (POLLPRI | POLLRDBAND);
+		if ((events & POLLINIGNEOF) == 0) {
+			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+				revents |= events & (POLLIN | POLLRDNORM);
+				if (so->so_snd.sb_state & SBS_CANTSENDMORE)
+					revents |= POLLHUP;
+			}
 		}
-
-		if (events & (POLLOUT | POLLWRNORM)) {
-			selrecord(td, &so->so_snd.sb_sel);
-			so->so_snd.sb_flags |= SB_SEL;
+		if (revents == 0) {
+			if (events &
+			    (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
+				selrecord(td, &so->so_rdsel);
+				so->so_rcv.sb_flags |= SB_SEL;
+			}
+			if (events & (POLLOUT | POLLWRNORM)) {
+				selrecord(td, &so->so_wrsel);
+				so->so_snd.sb_flags |= SB_SEL;
+			}
 		}
+		SOCKBUF_UNLOCK(&so->so_rcv);
+		SOCKBUF_UNLOCK(&so->so_snd);
 	}
-
-	SOCKBUF_UNLOCK(&so->so_rcv);
-	SOCKBUF_UNLOCK(&so->so_snd);
+	SOCK_UNLOCK(so);
 	return (revents);
 }
 
@@ -3158,28 +3364,38 @@ soo_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
 	struct sockbuf *sb;
+	struct knlist *knl;
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &soread_filtops;
+		knl = &so->so_rdsel.si_note;
 		sb = &so->so_rcv;
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &sowrite_filtops;
+		knl = &so->so_wrsel.si_note;
 		sb = &so->so_snd;
 		break;
 	case EVFILT_EMPTY:
 		kn->kn_fop = &soempty_filtops;
+		knl = &so->so_wrsel.si_note;
 		sb = &so->so_snd;
 		break;
 	default:
 		return (EINVAL);
 	}
 
-	SOCKBUF_LOCK(sb);
-	knlist_add(&sb->sb_sel.si_note, kn, 1);
-	sb->sb_flags |= SB_KNOTE;
-	SOCKBUF_UNLOCK(sb);
+	SOCK_LOCK(so);
+	if (SOLISTENING(so)) {
+		knlist_add(knl, kn, 1);
+	} else {
+		SOCKBUF_LOCK(sb);
+		knlist_add(knl, kn, 1);
+		sb->sb_flags |= SB_KNOTE;
+		SOCKBUF_UNLOCK(sb);
+	}
+	SOCK_UNLOCK(so);
 	return (0);
 }
 #ifdef __rtems__
@@ -3367,11 +3583,11 @@ filt_sordetach(struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
 
-	SOCKBUF_LOCK(&so->so_rcv);
-	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
-	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
+	so_rdknl_lock(so);
+	knlist_remove(&so->so_rdsel.si_note, kn, 1);
+	if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
 		so->so_rcv.sb_flags &= ~SB_KNOTE;
-	SOCKBUF_UNLOCK(&so->so_rcv);
+	so_rdknl_unlock(so);
 }
 
 /*ARGSUSED*/
@@ -3381,11 +3597,13 @@ filt_soread(struct knote *kn, long hint)
 	struct socket *so;
 
 	so = kn->kn_fp->f_data;
-	if (so->so_options & SO_ACCEPTCONN) {
-		kn->kn_data = so->so_qlen;
-		return (!TAILQ_EMPTY(&so->so_comp));
 
+	if (SOLISTENING(so)) {
+		SOCK_LOCK_ASSERT(so);
+		kn->kn_data = so->sol_qlen;
+		return (!TAILQ_EMPTY(&so->sol_comp));
 	}
+
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
@@ -3411,11 +3629,11 @@ filt_sowdetach(struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
 
-	SOCKBUF_LOCK(&so->so_snd);
-	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
-	if (knlist_empty(&so->so_snd.sb_sel.si_note))
+	so_wrknl_lock(so);
+	knlist_remove(&so->so_wrsel.si_note, kn, 1);
+	if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
 		so->so_snd.sb_flags &= ~SB_KNOTE;
-	SOCKBUF_UNLOCK(&so->so_snd);
+	so_wrknl_unlock(so);
 }
 
 /*ARGSUSED*/
@@ -3425,6 +3643,10 @@ filt_sowrite(struct knote *kn, long hint)
 	struct socket *so;
 
 	so = kn->kn_fp->f_data;
+
+	if (SOLISTENING(so))
+		return (0);
+
 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 	kn->kn_data = sbspace(&so->so_snd);
 
@@ -3451,6 +3673,10 @@ filt_soempty(struct knote *kn, long hint)
 	struct socket *so;
 
 	so = kn->kn_fp->f_data;
+
+	if (SOLISTENING(so))
+		return (1);
+
 	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 	kn->kn_data = sbused(&so->so_snd);
 
@@ -3521,42 +3747,52 @@ soisconnected(struct socket *so)
 	struct socket *head;
 	int ret;
 
+	/*
+	 * XXXGL: this is the only place where we acquire socket locks
+	 * in reverse order: first child, then listening socket.  To
+	 * avoid possible LOR, use try semantics.
+	 */
 restart:
-	ACCEPT_LOCK();
 	SOCK_LOCK(so);
+	if ((head = so->so_listen) != NULL &&
+	    __predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
+		SOCK_UNLOCK(so);
+		goto restart;
+	}
 	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
 	so->so_state |= SS_ISCONNECTED;
-	head = so->so_head;
-	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
+	if (head != NULL && (so->so_qstate == SQ_INCOMP)) {
+again:
 		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
+			TAILQ_REMOVE(&head->sol_incomp, so, so_list);
+			head->sol_incqlen--;
+			TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
+			head->sol_qlen++;
+			so->so_qstate = SQ_COMP;
 			SOCK_UNLOCK(so);
-			TAILQ_REMOVE(&head->so_incomp, so, so_list);
-			head->so_incqlen--;
-			so->so_qstate &= ~SQ_INCOMP;
-			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
-			head->so_qlen++;
-			so->so_qstate |= SQ_COMP;
-			ACCEPT_UNLOCK();
-			sorwakeup(head);
-			wakeup_one(&head->so_timeo);
+			solisten_wakeup(head);	/* unlocks */
 		} else {
-			ACCEPT_UNLOCK();
+			SOCKBUF_LOCK(&so->so_rcv);
 			soupcall_set(so, SO_RCV,
-			    head->so_accf->so_accept_filter->accf_callback,
-			    head->so_accf->so_accept_filter_arg);
+			    head->sol_accept_filter->accf_callback,
+			    head->sol_accept_filter_arg);
 			so->so_options &= ~SO_ACCEPTFILTER;
-			ret = head->so_accf->so_accept_filter->accf_callback(so,
-			    head->so_accf->so_accept_filter_arg, M_NOWAIT);
-			if (ret == SU_ISCONNECTED)
+			ret = head->sol_accept_filter->accf_callback(so,
+			    head->sol_accept_filter_arg, M_NOWAIT);
+			if (ret == SU_ISCONNECTED) {
 				soupcall_clear(so, SO_RCV);
+				SOCKBUF_UNLOCK(&so->so_rcv);
+				goto again;
+			}
+			SOCKBUF_UNLOCK(&so->so_rcv);
 			SOCK_UNLOCK(so);
-			if (ret == SU_ISCONNECTED)
-				goto restart;
+			SOLISTEN_UNLOCK(head);
 		}
 		return;
 	}
+	if (head != NULL)
+		SOLISTEN_UNLOCK(head);
 	SOCK_UNLOCK(so);
-	ACCEPT_UNLOCK();
 	wakeup(&so->so_timeo);
 	sorwakeup(so);
 	sowwakeup(so);
@@ -3566,16 +3802,17 @@ void
 soisdisconnecting(struct socket *so)
 {
 
-	/*
-	 * Note: This code assumes that SOCK_LOCK(so) and
-	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
-	 */
-	SOCKBUF_LOCK(&so->so_rcv);
+	SOCK_LOCK(so);
 	so->so_state &= ~SS_ISCONNECTING;
 	so->so_state |= SS_ISDISCONNECTING;
-	socantrcvmore_locked(so);
-	SOCKBUF_LOCK(&so->so_snd);
-	socantsendmore_locked(so);
+
+	if (!SOLISTENING(so)) {
+		SOCKBUF_LOCK(&so->so_rcv);
+		socantrcvmore_locked(so);
+		SOCKBUF_LOCK(&so->so_snd);
+		socantsendmore_locked(so);
+	}
+	SOCK_UNLOCK(so);
 	wakeup(&so->so_timeo);
 }
 
@@ -3583,17 +3820,18 @@ void
 soisdisconnected(struct socket *so)
 {
 
-	/*
-	 * Note: This code assumes that SOCK_LOCK(so) and
-	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
-	 */
-	SOCKBUF_LOCK(&so->so_rcv);
+	SOCK_LOCK(so);
 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
 	so->so_state |= SS_ISDISCONNECTED;
-	socantrcvmore_locked(so);
-	SOCKBUF_LOCK(&so->so_snd);
-	sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
-	socantsendmore_locked(so);
+
+	if (!SOLISTENING(so)) {
+		SOCKBUF_LOCK(&so->so_rcv);
+		socantrcvmore_locked(so);
+		SOCKBUF_LOCK(&so->so_snd);
+		sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
+		socantsendmore_locked(so);
+	}
+	SOCK_UNLOCK(so);
 	wakeup(&so->so_timeo);
 }
 
@@ -3615,11 +3853,12 @@ sodupsockaddr(const struct sockaddr *sa, int mflags)
  * Register per-socket buffer upcalls.
  */
 void
-soupcall_set(struct socket *so, int which,
-    int (*func)(struct socket *, void *, int), void *arg)
+soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg)
 {
 	struct sockbuf *sb;
 
+	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
+
 	switch (which) {
 	case SO_RCV:
 		sb = &so->so_rcv;
@@ -3631,10 +3870,6 @@ soupcall_set(struct socket *so, int which,
 		panic("soupcall_set: bad which");
 	}
 	SOCKBUF_LOCK_ASSERT(sb);
-#if 0
-	/* XXX: accf_http actually wants to do this on purpose. */
-	KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
-#endif
 	sb->sb_upcall = func;
 	sb->sb_upcallarg = arg;
 	sb->sb_flags |= SB_UPCALL;
@@ -3645,6 +3880,8 @@ soupcall_clear(struct socket *so, int which)
 {
 	struct sockbuf *sb;
 
+	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
+
 	switch (which) {
 	case SO_RCV:
 		sb = &so->so_rcv;
@@ -3656,12 +3893,110 @@ soupcall_clear(struct socket *so, int which)
 		panic("soupcall_clear: bad which");
 	}
 	SOCKBUF_LOCK_ASSERT(sb);
-	KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
+	KASSERT(sb->sb_upcall != NULL,
+	    ("%s: so %p no upcall to clear", __func__, so));
 	sb->sb_upcall = NULL;
 	sb->sb_upcallarg = NULL;
 	sb->sb_flags &= ~SB_UPCALL;
 }
 
+void
+solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
+{
+
+	SOLISTEN_LOCK_ASSERT(so);	/* lock assertion precedes the stores */
+	so->sol_upcall = func;		/* upcall kept in the sol_* fields */
+	so->sol_upcallarg = arg;	/* argument saved alongside func */
+}
+
+static void
+so_rdknl_lock(void *arg)	/* taken around so_rdsel knote list updates */
+{
+	struct socket *so = arg;	/* opaque arg is the socket */
+
+	if (SOLISTENING(so))
+		SOCK_LOCK(so);		/* listening: socket lock */
+	else
+		SOCKBUF_LOCK(&so->so_rcv);	/* else: so_rcv lock */
+}
+
+static void
+so_rdknl_unlock(void *arg)	/* releases what so_rdknl_lock() took */
+{
+	struct socket *so = arg;	/* opaque arg is the socket */
+
+	if (SOLISTENING(so))
+		SOCK_UNLOCK(so);	/* listening: socket lock */
+	else
+		SOCKBUF_UNLOCK(&so->so_rcv);	/* else: so_rcv lock */
+}
+
+static void
+so_rdknl_assert_locked(void *arg)	/* same split as so_rdknl_lock() */
+{
+	struct socket *so = arg;	/* opaque arg is the socket */
+
+	if (SOLISTENING(so))
+		SOCK_LOCK_ASSERT(so);	/* listening: socket lock */
+	else
+		SOCKBUF_LOCK_ASSERT(&so->so_rcv);	/* else: so_rcv lock */
+}
+
+static void
+so_rdknl_assert_unlocked(void *arg)	/* same split as so_rdknl_lock() */
+{
+	struct socket *so = arg;	/* opaque arg is the socket */
+
+	if (SOLISTENING(so))
+		SOCK_UNLOCK_ASSERT(so);	/* listening: socket lock */
+	else
+		SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);	/* else: so_rcv lock */
+}
+
+static void
+so_wrknl_lock(void *arg)	/* taken around so_wrsel knote list updates */
+{
+	struct socket *so = arg;	/* opaque arg is the socket */
+
+	if (SOLISTENING(so))
+		SOCK_LOCK(so);		/* listening: socket lock */
+	else
+		SOCKBUF_LOCK(&so->so_snd);	/* else: so_snd lock */
+}
+
+static void
+so_wrknl_unlock(void *arg)	/* releases what so_wrknl_lock() took */
+{
+	struct socket *so = arg;	/* opaque arg is the socket */
+
+	if (SOLISTENING(so))
+		SOCK_UNLOCK(so);	/* listening: socket lock */
+	else
+		SOCKBUF_UNLOCK(&so->so_snd);	/* else: so_snd lock */
+}
+
+static void
+so_wrknl_assert_locked(void *arg)	/* same split as so_wrknl_lock() */
+{
+	struct socket *so = arg;	/* opaque arg is the socket */
+
+	if (SOLISTENING(so))
+		SOCK_LOCK_ASSERT(so);	/* listening: socket lock */
+	else
+		SOCKBUF_LOCK_ASSERT(&so->so_snd);	/* else: so_snd lock */
+}
+
+static void
+so_wrknl_assert_unlocked(void *arg)	/* same split as so_wrknl_lock() */
+{
+	struct socket *so = arg;	/* opaque arg is the socket */
+
+	if (SOLISTENING(so))
+		SOCK_UNLOCK_ASSERT(so);	/* listening: socket lock */
+	else
+		SOCKBUF_UNLOCK_ASSERT(&so->so_snd);	/* else: so_snd lock */
+}
+
 /*
  * Create an external-format (``xsocket'') structure using the information in
  * the kernel-format socket structure pointed to by so.  This is done to
@@ -3683,36 +4018,28 @@ sotoxsocket(struct socket *so, struct xsocket *xso)
 	xso->so_pcb = so->so_pcb;
 	xso->xso_protocol = so->so_proto->pr_protocol;
 	xso->xso_family = so->so_proto->pr_domain->dom_family;
-	xso->so_qlen = so->so_qlen;
-	xso->so_incqlen = so->so_incqlen;
-	xso->so_qlimit = so->so_qlimit;
 	xso->so_timeo = so->so_timeo;
 	xso->so_error = so->so_error;
-	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
-	xso->so_oobmark = so->so_oobmark;
-	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
-	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
 #ifndef __rtems__
 	xso->so_uid = so->so_cred->cr_uid;
 #else /* __rtems__ */
 	xso->so_uid = BSD_DEFAULT_UID;
 #endif /* __rtems__ */
-}
-
-
-/*
- * Socket accessor functions to provide external consumers with
- * a safe interface to socket state
- *
- */
-
-void
-so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *),
-    void *arg)
-{
-
-	TAILQ_FOREACH(so, &so->so_comp, so_list)
-		func(so, arg);
+	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
+	if (SOLISTENING(so)) {
+		xso->so_qlen = so->sol_qlen;
+		xso->so_incqlen = so->sol_incqlen;
+		xso->so_qlimit = so->sol_qlimit;
+		xso->so_oobmark = 0;
+		bzero(&xso->so_snd, sizeof(xso->so_snd));
+		bzero(&xso->so_rcv, sizeof(xso->so_rcv));
+	} else {
+		xso->so_state |= so->so_qstate;
+		xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
+		xso->so_oobmark = so->so_oobmark;
+		sbtoxsockbuf(&so->so_snd, &xso->so_snd);
+		sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
+	}
 }
 
 struct sockbuf *
diff --git a/freebsd/sys/kern/uipc_syscalls.c b/freebsd/sys/kern/uipc_syscalls.c
index f301c12c..5a9a381f 100644
--- a/freebsd/sys/kern/uipc_syscalls.c
+++ b/freebsd/sys/kern/uipc_syscalls.c
@@ -70,13 +70,6 @@ __FBSDID("$FreeBSD$");
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
-/*
- * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
- * and SOCK_NONBLOCK.
- */
-#define	ACCEPT4_INHERIT	0x1
-#define	ACCEPT4_COMPAT	0x2
-
 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
 
@@ -524,59 +517,22 @@ kern_accept4(struct thread *td, int s, struct sockaddr **name,
 	    (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps);
 	if (error != 0)
 		goto done;
-	ACCEPT_LOCK();
-	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
-		ACCEPT_UNLOCK();
-		error = EWOULDBLOCK;
-		goto noconnection;
-	}
-	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
-		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
-			head->so_error = ECONNABORTED;
-			break;
-		}
-		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
-		    "accept", 0);
-		if (error != 0) {
-			ACCEPT_UNLOCK();
-			goto noconnection;
-		}
-	}
-	if (head->so_error) {
-		error = head->so_error;
-		head->so_error = 0;
-		ACCEPT_UNLOCK();
+	SOCK_LOCK(head);
+	if (!SOLISTENING(head)) {
+		SOCK_UNLOCK(head);
+		error = EINVAL;
 		goto noconnection;
 	}
-	so = TAILQ_FIRST(&head->so_comp);
-	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
-	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
 
-	/*
-	 * Before changing the flags on the socket, we have to bump the
-	 * reference count.  Otherwise, if the protocol calls sofree(),
-	 * the socket will be released due to a zero refcount.
-	 */
-	SOCK_LOCK(so);			/* soref() and so_state update */
-	soref(so);			/* file descriptor reference */
-
-	TAILQ_REMOVE(&head->so_comp, so, so_list);
-	head->so_qlen--;
-	if (flags & ACCEPT4_INHERIT)
-		so->so_state |= (head->so_state & SS_NBIO);
-	else
-		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
-	so->so_qstate &= ~SQ_COMP;
-	so->so_head = NULL;
-
-	SOCK_UNLOCK(so);
-	ACCEPT_UNLOCK();
+	error = solisten_dequeue(head, &so, flags);
+	if (error != 0)
+		goto noconnection;
 
 	/* An extra reference on `nfp' has been held for us by falloc(). */
 	td->td_retval[0] = fd;
 
-	/* connection has been removed from the listen queue */
-	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
+	/* Connection has been removed from the listen queue. */
+	KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0);
 
 	if (flags & ACCEPT4_INHERIT) {
 		pgid = fgetown(&head->so_sigio);
@@ -594,7 +550,6 @@ kern_accept4(struct thread *td, int s, struct sockaddr **name,
 	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
 	tmp = fflag & FASYNC;
 	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
-	sa = NULL;
 	error = soaccept(so, &sa);
 	if (error != 0)
 		goto noconnection;
@@ -769,7 +724,7 @@ kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
 	}
 	SOCK_LOCK(so);
 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
-		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
+		error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH,
 		    "connec", 0);
 		if (error != 0) {
 			if (error == EINTR || error == ERESTART)
diff --git a/freebsd/sys/kern/uipc_usrreq.c b/freebsd/sys/kern/uipc_usrreq.c
index 8e60f227..7237956a 100644
--- a/freebsd/sys/kern/uipc_usrreq.c
+++ b/freebsd/sys/kern/uipc_usrreq.c
@@ -202,10 +202,9 @@ SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
 /*
  * Locking and synchronization:
  *
- * Three types of locks exit in the local domain socket implementation: a
- * global list mutex, a global linkage rwlock, and per-unpcb mutexes.  Of the
- * global locks, the list lock protects the socket count, global generation
- * number, and stream/datagram global lists.  The linkage lock protects the
+ * Two types of locks exist in the local domain socket implementation: a
+ * global linkage rwlock and per-unpcb mutexes.  The linkage lock protects
+ * the socket count, global generation number, stream/datagram global lists and
  * interconnection of unpcbs, the v_socket and unp_vnode pointers, and can be
  * held exclusively over the acquisition of multiple unpcb locks to prevent
  * deadlock.
@@ -246,7 +245,6 @@ SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
  * to perform namei() and other file system operations.
  */
 static struct rwlock	unp_link_rwlock;
-static struct mtx	unp_list_lock;
 static struct mtx	unp_defers_lock;
 
 #define	UNP_LINK_LOCK_INIT()		rw_init(&unp_link_rwlock,	\
@@ -263,11 +261,7 @@ static struct mtx	unp_defers_lock;
 #define	UNP_LINK_WUNLOCK()		rw_wunlock(&unp_link_rwlock)
 #define	UNP_LINK_WLOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
 					    RA_WLOCKED)
-
-#define	UNP_LIST_LOCK_INIT()		mtx_init(&unp_list_lock,	\
-					    "unp_list_lock", NULL, MTX_DEF)
-#define	UNP_LIST_LOCK()			mtx_lock(&unp_list_lock)
-#define	UNP_LIST_UNLOCK()		mtx_unlock(&unp_list_lock)
+#define	UNP_LINK_WOWNED()		rw_wowned(&unp_link_rwlock)
 
 #define	UNP_DEFERRED_LOCK_INIT()	mtx_init(&unp_defers_lock, \
 					    "unp_defer", NULL, MTX_DEF)
@@ -417,6 +411,7 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
 	u_long sendspace, recvspace;
 	struct unpcb *unp;
 	int error;
+	bool locked;
 
 	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
@@ -451,10 +446,12 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
 	unp->unp_socket = so;
 	so->so_pcb = unp;
 	unp->unp_refcount = 1;
-	if (so->so_head != NULL)
+	if (so->so_listen != NULL)
 		unp->unp_flags |= UNP_NASCENT;
 
-	UNP_LIST_LOCK();
+	if ((locked = UNP_LINK_WOWNED()) == false)
+		UNP_LINK_WLOCK();
+
 	unp->unp_gencnt = ++unp_gencnt;
 	unp_count++;
 	switch (so->so_type) {
@@ -473,7 +470,9 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
 	default:
 		panic("uipc_attach");
 	}
-	UNP_LIST_UNLOCK();
+
+	if (locked == false)
+		UNP_LINK_WUNLOCK();
 
 	return (0);
 }
@@ -516,6 +515,14 @@ static const IMFS_node_control rtems_uipc_imfs_control =
 static const IMFS_node_control rtems_uipc_imfs_zombi_control =
     IMFS_GENERIC_INITIALIZER(&rtems_filesystem_handlers_default, NULL,
     IMFS_node_destroy_default);
+
+static void
+VOP_UNP_DETACH(IMFS_generic_t *vp)	/* RTEMS: operates on an IMFS node */
+{
+
+	vp->Node.control = &rtems_uipc_imfs_zombi_control; /* zombie control */
+	vp->context = NULL;	/* drop the node's context pointer */
+}
 #endif /* __rtems__ */
 static int
 uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
@@ -630,7 +637,7 @@ restart:
 	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
 #ifndef __rtems__
-	VOP_UNP_BIND(vp, unp->unp_socket);
+	VOP_UNP_BIND(vp, unp);
 	unp->unp_vnode = vp;
 #endif /* __rtems__ */
 	unp->unp_addr = soun;
@@ -690,6 +697,11 @@ static void
 uipc_close(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
+#ifndef __rtems__
+	struct vnode *vp;
+#else /* __rtems__ */
+	IMFS_generic_t *vp;
+#endif /* __rtems__ */
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
@@ -702,8 +714,16 @@ uipc_close(struct socket *so)
 		unp_disconnect(unp, unp2);
 		UNP_PCB_UNLOCK(unp2);
 	}
+	if (SOLISTENING(so) && ((vp = unp->unp_vnode) != NULL)) {
+		VOP_UNP_DETACH(vp);
+		unp->unp_vnode = NULL;
+	}
 	UNP_PCB_UNLOCK(unp);
 	UNP_LINK_WUNLOCK();
+#ifndef __rtems__
+	if (vp)
+		vrele(vp);
+#endif /* __rtems__ */
 }
 
 static int
@@ -747,29 +767,16 @@ uipc_detach(struct socket *so)
 	local_unp_rights = 0;
 #endif /* __rtems__ */
 
-	UNP_LIST_LOCK();
+	UNP_LINK_WLOCK();
 	LIST_REMOVE(unp, unp_link);
 	unp->unp_gencnt = ++unp_gencnt;
 	--unp_count;
-	UNP_LIST_UNLOCK();
-
-	if ((unp->unp_flags & UNP_NASCENT) != 0) {
-		UNP_PCB_LOCK(unp);
-		goto teardown;
-	}
-	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
+	if ((unp->unp_flags & UNP_NASCENT) != 0)
+		goto teardown;
 
-	/*
-	 * XXXRW: Should assert vp->v_socket == so.
-	 */
 	if ((vp = unp->unp_vnode) != NULL) {
-#ifndef __rtems__
 		VOP_UNP_DETACH(vp);
-#else /* __rtems__ */
-		vp->Node.control = &rtems_uipc_imfs_zombi_control;
-		vp->context = NULL;
-#endif /* __rtems__ */
 		unp->unp_vnode = NULL;
 	}
 	unp2 = unp->unp_conn;
@@ -793,8 +800,8 @@ uipc_detach(struct socket *so)
 #ifndef __rtems__
 	local_unp_rights = unp_rights;
 #endif /* __rtems__ */
-	UNP_LINK_WUNLOCK();
 teardown:
+	UNP_LINK_WUNLOCK();
 	unp->unp_socket->so_pcb = NULL;
 	saved_unp_addr = unp->unp_addr;
 	unp->unp_addr = NULL;
@@ -860,7 +867,6 @@ uipc_listen(struct socket *so, int backlog, struct thread *td)
 	error = solisten_proto_check(so);
 	if (error == 0) {
 		cru2x(td->td_ucred, &unp->unp_peercred);
-		unp->unp_flags |= UNP_HAVEPCCACHED;
 		solisten_proto(so, backlog);
 	}
 	SOCK_UNLOCK(so);
@@ -1439,7 +1445,7 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
 #else /* __rtems__ */
 	struct IMFS_jnode_tt *vp;
 #endif /* __rtems__ */
-	struct socket *so2, *so3;
+	struct socket *so2;
 	struct unpcb *unp, *unp2, *unp3;
 #ifndef __rtems__
 	struct nameidata nd;
@@ -1450,7 +1456,9 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
 	const rtems_filesystem_location_info_t *currentloc;
 #endif /* __rtems__ */
 	struct sockaddr *sa;
+#ifndef __rtems__
 	cap_rights_t rights;
+#endif /* __rtems__ */
 	int error, len;
 
 	if (nam->sa_family != AF_UNIX)
@@ -1535,34 +1543,38 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
 	 */
 	UNP_LINK_WLOCK();
 #ifndef __rtems__
-	VOP_UNP_CONNECT(vp, &so2);
+	VOP_UNP_CONNECT(vp, &unp2);
+	if (unp2 == NULL) {
+		error = ECONNREFUSED;
+		goto bad2;
+	}
+	so2 = unp2->unp_socket;
 #else /* __rtems__ */
 	so2 = IMFS_generic_get_context_by_node(vp);
-#endif /* __rtems__ */
 	if (so2 == NULL) {
 		error = ECONNREFUSED;
 		goto bad2;
 	}
+	unp2 = sotounpcb(so2);
+#endif /* __rtems__ */
 	if (so->so_type != so2->so_type) {
 		error = EPROTOTYPE;
 		goto bad2;
 	}
+	UNP_PCB_LOCK(unp);
+	UNP_PCB_LOCK(unp2);
 	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
 		if (so2->so_options & SO_ACCEPTCONN) {
 			CURVNET_SET(so2->so_vnet);
-			so3 = sonewconn(so2, 0);
+			so2 = sonewconn(so2, 0);
 			CURVNET_RESTORE();
 		} else
-			so3 = NULL;
-		if (so3 == NULL) {
+			so2 = NULL;
+		if (so2 == NULL) {
 			error = ECONNREFUSED;
-			goto bad2;
+			goto bad3;
 		}
-		unp = sotounpcb(so);
-		unp2 = sotounpcb(so2);
-		unp3 = sotounpcb(so3);
-		UNP_PCB_LOCK(unp);
-		UNP_PCB_LOCK(unp2);
+		unp3 = sotounpcb(so2);
 		UNP_PCB_LOCK(unp3);
 		if (unp2->unp_addr != NULL) {
 			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
@@ -1583,30 +1595,24 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
 		 * listen(); uipc_listen() cached that process's credentials
 		 * at that time so we can use them now.
 		 */
-		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
-		    ("unp_connect: listener without cached peercred"));
 		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
 		    sizeof(unp->unp_peercred));
 		unp->unp_flags |= UNP_HAVEPC;
 		if (unp2->unp_flags & UNP_WANTCRED)
 			unp3->unp_flags |= UNP_WANTCRED;
-		UNP_PCB_UNLOCK(unp3);
 		UNP_PCB_UNLOCK(unp2);
-		UNP_PCB_UNLOCK(unp);
+		unp2 = unp3;
 #ifdef MAC
-		mac_socketpeer_set_from_socket(so, so3);
-		mac_socketpeer_set_from_socket(so3, so);
+		mac_socketpeer_set_from_socket(so, so2);
+		mac_socketpeer_set_from_socket(so2, so);
 #endif
-
-		so2 = so3;
 	}
-	unp = sotounpcb(so);
-	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
-	unp2 = sotounpcb(so2);
-	KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL"));
-	UNP_PCB_LOCK(unp);
-	UNP_PCB_LOCK(unp2);
+
+	KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 &&
+	    sotounpcb(so2) == unp2,
+	    ("%s: unp2 %p so2 %p", __func__, unp2, so2));
 	error = unp_connect2(so, so2, PRU_CONNECT);
+bad3:
 	UNP_PCB_UNLOCK(unp2);
 	UNP_PCB_UNLOCK(unp);
 bad2:
@@ -1750,10 +1756,10 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
 	 * OK, now we're committed to doing something.
 	 */
 	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
-	UNP_LIST_LOCK();
+	UNP_LINK_RLOCK();
 	gencnt = unp_gencnt;
 	n = unp_count;
-	UNP_LIST_UNLOCK();
+	UNP_LINK_RUNLOCK();
 
 	xug->xug_len = sizeof *xug;
 	xug->xug_count = n;
@@ -1767,7 +1773,7 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
 
 	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
 
-	UNP_LIST_LOCK();
+	UNP_LINK_RLOCK();
 	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
 	     unp = LIST_NEXT(unp, unp_link)) {
 		UNP_PCB_LOCK(unp);
@@ -1782,7 +1788,7 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
 		}
 		UNP_PCB_UNLOCK(unp);
 	}
-	UNP_LIST_UNLOCK();
+	UNP_LINK_RUNLOCK();
 	n = i;			/* In case we lost some during malloc. */
 
 	error = 0;
@@ -2044,7 +2050,6 @@ unp_init(void)
 	TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL);
 #endif /* __rtems__ */
 	UNP_LINK_LOCK_INIT();
-	UNP_LIST_LOCK_INIT();
 	UNP_DEFERRED_LOCK_INIT();
 }
 
@@ -2396,8 +2401,7 @@ unp_accessable(struct filedescent **fdep, int fdcount)
 static void
 unp_gc_process(struct unpcb *unp)
 {
-	struct socket *soa;
-	struct socket *so;
+	struct socket *so, *soa;
 	struct file *fp;
 
 	/* Already processed. */
@@ -2417,28 +2421,30 @@ unp_gc_process(struct unpcb *unp)
 		return;
 	}
 
-	/*
-	 * Mark all sockets we reference with RIGHTS.
-	 */
 	so = unp->unp_socket;
-	if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) {
-		SOCKBUF_LOCK(&so->so_rcv);
-		unp_scan(so->so_rcv.sb_mb, unp_accessable);
-		SOCKBUF_UNLOCK(&so->so_rcv);
-	}
-
-	/*
-	 * Mark all sockets in our accept queue.
-	 */
-	ACCEPT_LOCK();
-	TAILQ_FOREACH(soa, &so->so_comp, so_list) {
-		if ((sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) != 0)
-			continue;
-		SOCKBUF_LOCK(&soa->so_rcv);
-		unp_scan(soa->so_rcv.sb_mb, unp_accessable);
-		SOCKBUF_UNLOCK(&soa->so_rcv);
+	SOCK_LOCK(so);
+	if (SOLISTENING(so)) {
+		/*
+		 * Mark all sockets in our accept queue.
+		 */
+		TAILQ_FOREACH(soa, &so->sol_comp, so_list) {
+			if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS)
+				continue;
+			SOCKBUF_LOCK(&soa->so_rcv);
+			unp_scan(soa->so_rcv.sb_mb, unp_accessable);
+			SOCKBUF_UNLOCK(&soa->so_rcv);
+		}
+	} else {
+		/*
+		 * Mark all sockets we reference with RIGHTS.
+		 */
+		if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) {
+			SOCKBUF_LOCK(&so->so_rcv);
+			unp_scan(so->so_rcv.sb_mb, unp_accessable);
+			SOCKBUF_UNLOCK(&so->so_rcv);
+		}
 	}
-	ACCEPT_UNLOCK();
+	SOCK_UNLOCK(so);
 	unp->unp_gcflag |= UNPGC_SCANNED;
 }
 
@@ -2461,7 +2467,7 @@ unp_gc(__unused void *arg, int pending)
 	int i, total;
 
 	unp_taskcount++;
-	UNP_LIST_LOCK();
+	UNP_LINK_RLOCK();
 	/*
 	 * First clear all gc flags from previous runs, apart from
 	 * UNPGC_IGNORE_RIGHTS.
@@ -2484,7 +2490,7 @@ unp_gc(__unused void *arg, int pending)
 			LIST_FOREACH(unp, *head, unp_link)
 				unp_gc_process(unp);
 	} while (unp_marked);
-	UNP_LIST_UNLOCK();
+	UNP_LINK_RUNLOCK();
 	if (unp_unreachable == 0)
 		return;
 
@@ -2499,7 +2505,6 @@ unp_gc(__unused void *arg, int pending)
 	 * as as unreachable and store them locally.
 	 */
 	UNP_LINK_RLOCK();
-	UNP_LIST_LOCK();
 	for (total = 0, head = heads; *head != NULL; head++)
 		LIST_FOREACH(unp, *head, unp_link)
 			if ((unp->unp_gcflag & UNPGC_DEAD) != 0) {
@@ -2512,7 +2517,6 @@ unp_gc(__unused void *arg, int pending)
 				KASSERT(total <= unp_unreachable,
 				    ("unp_gc: incorrect unreachable count."));
 			}
-	UNP_LIST_UNLOCK();
 	UNP_LINK_RUNLOCK();
 
 	/*
@@ -2555,10 +2559,11 @@ unp_dispose(struct socket *so)
 	struct unpcb *unp;
 
 	unp = sotounpcb(so);
-	UNP_LIST_LOCK();
+	UNP_LINK_WLOCK();
 	unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS;
-	UNP_LIST_UNLOCK();
-	unp_dispose_mbuf(so->so_rcv.sb_mb);
+	UNP_LINK_WUNLOCK();
+	if (!SOLISTENING(so))
+		unp_dispose_mbuf(so->so_rcv.sb_mb);
 }
 
 static void
@@ -2613,7 +2618,6 @@ unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
 void
 vfs_unp_reclaim(struct vnode *vp)
 {
-	struct socket *so;
 	struct unpcb *unp;
 	int active;
 
@@ -2623,10 +2627,7 @@ vfs_unp_reclaim(struct vnode *vp)
 
 	active = 0;
 	UNP_LINK_WLOCK();
-	VOP_UNP_CONNECT(vp, &so);
-	if (so == NULL)
-		goto done;
-	unp = sotounpcb(so);
+	VOP_UNP_CONNECT(vp, &unp);
 	if (unp == NULL)
 		goto done;
 	UNP_PCB_LOCK(unp);
@@ -2663,10 +2664,6 @@ db_print_unpflags(int unp_flags)
 		db_printf("%sUNP_HAVEPC", comma ? ", " : "");
 		comma = 1;
 	}
-	if (unp_flags & UNP_HAVEPCCACHED) {
-		db_printf("%sUNP_HAVEPCCACHED", comma ? ", " : "");
-		comma = 1;
-	}
 	if (unp_flags & UNP_WANTCRED) {
 		db_printf("%sUNP_WANTCRED", comma ? ", " : "");
 		comma = 1;
author	Sebastian Huber <sebastian.huber@embedded-brains.de>	2018-08-07 14:56:50 +0200
committer	Sebastian Huber <sebastian.huber@embedded-brains.de>	2018-09-21 10:29:37 +0200
commit	c37f9fba70085fedc8eede7559489d2321393005 (patch)
tree	042455ebf1fa89a277a825f72e1ed805d0b4d296 /freebsd/sys/kern
parent	Update to FreeBSD head 2017-06-01 (diff)
download	rtems-libbsd-c37f9fba70085fedc8eede7559489d2321393005.tar.bz2