Diffstat (limited to 'freebsd/sys/kern/kern_event.c')
-rw-r--r--  freebsd/sys/kern/kern_event.c  831
1 file changed, 546 insertions(+), 285 deletions(-)
diff --git a/freebsd/sys/kern/kern_event.c b/freebsd/sys/kern/kern_event.c
index 50f4c696..ca9c1de7 100644
--- a/freebsd/sys/kern/kern_event.c
+++ b/freebsd/sys/kern/kern_event.c
@@ -32,13 +32,15 @@
__FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_ktrace.h>
+#include <rtems/bsd/local/opt_kqueue.h>
#include <rtems/bsd/sys/param.h>
#include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <rtems/bsd/sys/lock.h>
#include <sys/mutex.h>
+#include <sys/rwlock.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <rtems/bsd/sys/unistd.h>
@@ -48,11 +50,13 @@ __FBSDID("$FreeBSD$");
#include <sys/fcntl.h>
#include <sys/kthread.h>
#include <sys/selinfo.h>
+#include <sys/stdatomic.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/protosw.h>
+#include <sys/resourcevar.h>
#include <sys/sigio.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
@@ -63,6 +67,7 @@ __FBSDID("$FreeBSD$");
#include <sys/syscallsubr.h>
#include <sys/taskqueue.h>
#include <sys/uio.h>
+#include <sys/user.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
@@ -96,7 +101,7 @@ MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
haslck = 0; \
} while (0)
-TASKQUEUE_DEFINE_THREAD(kqueue);
+TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
static int kevent_copyout(void *arg, struct kevent *kevp, int count);
static int kevent_copyin(void *arg, struct kevent *kevp, int count);
@@ -104,6 +109,8 @@ static int kqueue_register(struct kqueue *kq, struct kevent *kev,
struct thread *td, int waitok);
static int kqueue_acquire(struct file *fp, struct kqueue **kqp);
static void kqueue_release(struct kqueue *kq, int locked);
+static void kqueue_destroy(struct kqueue *kq);
+static void kqueue_drain(struct kqueue *kq, struct thread *td);
static int kqueue_expand(struct kqueue *kq, struct filterops *fops,
uintptr_t ident, int waitok);
static void kqueue_task(void *arg, int pending);
@@ -124,11 +131,12 @@ static fo_poll_t kqueue_poll;
static fo_kqfilter_t kqueue_kqfilter;
static fo_stat_t kqueue_stat;
static fo_close_t kqueue_close;
+static fo_fill_kinfo_t kqueue_fill_kinfo;
static struct fileops kqueueops = {
- .fo_read = kqueue_read,
- .fo_write = kqueue_write,
- .fo_truncate = kqueue_truncate,
+ .fo_read = invfo_rdwr,
+ .fo_write = invfo_rdwr,
+ .fo_truncate = invfo_truncate,
.fo_ioctl = kqueue_ioctl,
.fo_poll = kqueue_poll,
.fo_kqfilter = kqueue_kqfilter,
@@ -136,6 +144,8 @@ static struct fileops kqueueops = {
.fo_close = kqueue_close,
.fo_chmod = invfo_chmod,
.fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_fill_kinfo = kqueue_fill_kinfo,
};
#else /* __rtems__ */
static const rtems_filesystem_file_handlers_r kqueueops;
@@ -199,9 +209,9 @@ static struct filterops user_filtops = {
};
static uma_zone_t knote_zone;
-static int kq_ncallouts = 0;
-static int kq_calloutmax = (4 * 1024);
-SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
+static atomic_uint kq_ncallouts = ATOMIC_VAR_INIT(0);
+static unsigned int kq_calloutmax = 4 * 1024;
+SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
&kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
/* XXX - ensure not KN_INFLUX?? */
@@ -238,14 +248,33 @@ SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
#define KQ_NOTOWNED(kq) do { \
mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \
} while (0)
-#define KN_LIST_LOCK(kn) do { \
- if (kn->kn_knlist != NULL) \
- kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg); \
-} while (0)
-#define KN_LIST_UNLOCK(kn) do { \
- if (kn->kn_knlist != NULL) \
- kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \
-} while (0)
+
+static struct knlist *
+kn_list_lock(struct knote *kn)
+{
+ struct knlist *knl;
+
+ knl = kn->kn_knlist;
+ if (knl != NULL)
+ knl->kl_lock(knl->kl_lockarg);
+ return (knl);
+}
+
+static void
+kn_list_unlock(struct knlist *knl)
+{
+ bool do_free;
+
+ if (knl == NULL)
+ return;
+ do_free = knl->kl_autodestroy && knlist_empty(knl);
+ knl->kl_unlock(knl->kl_lockarg);
+ if (do_free) {
+ knlist_destroy(knl);
+ free(knl, M_KQUEUE);
+ }
+}
+
#define KNL_ASSERT_LOCK(knl, islocked) do { \
if (islocked) \
KNL_ASSERT_LOCKED(knl); \
@@ -264,7 +293,10 @@ SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
#define KNL_ASSERT_UNLOCKED(knl) do {} while (0)
#endif /* INVARIANTS */
+#ifndef KN_HASHSIZE
#define KN_HASHSIZE 64 /* XXX should be tunable */
+#endif
+
#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
static int
@@ -291,28 +323,30 @@ MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
MTX_DEF);
static struct {
struct filterops *for_fop;
+ int for_nolock;
int for_refcnt;
} sysfilt_ops[EVFILT_SYSCOUNT] = {
- { &file_filtops }, /* EVFILT_READ */
- { &file_filtops }, /* EVFILT_WRITE */
+ { &file_filtops, 1 }, /* EVFILT_READ */
+ { &file_filtops, 1 }, /* EVFILT_WRITE */
{ &null_filtops }, /* EVFILT_AIO */
- { &file_filtops }, /* EVFILT_VNODE */
+ { &file_filtops, 1 }, /* EVFILT_VNODE */
#ifndef __rtems__
- { &proc_filtops }, /* EVFILT_PROC */
- { &sig_filtops }, /* EVFILT_SIGNAL */
+ { &proc_filtops, 1 }, /* EVFILT_PROC */
+ { &sig_filtops, 1 }, /* EVFILT_SIGNAL */
#else /* __rtems__ */
{ &null_filtops }, /* EVFILT_PROC */
{ &null_filtops }, /* EVFILT_SIGNAL */
#endif /* __rtems__ */
- { &timer_filtops }, /* EVFILT_TIMER */
+ { &timer_filtops, 1 }, /* EVFILT_TIMER */
{ &null_filtops }, /* former EVFILT_NETDEV */
#ifndef __rtems__
- { &fs_filtops }, /* EVFILT_FS */
+ { &fs_filtops, 1 }, /* EVFILT_FS */
#else /* __rtems__ */
{ &null_filtops }, /* EVFILT_FS */
#endif /* __rtems__ */
{ &null_filtops }, /* EVFILT_LIO */
- { &user_filtops }, /* EVFILT_USER */
+ { &user_filtops, 1 }, /* EVFILT_USER */
+ { &null_filtops }, /* EVFILT_SENDFILE */
};
/*
@@ -375,16 +409,16 @@ static int
filt_procattach(struct knote *kn)
{
struct proc *p;
- int immediate;
int error;
+ bool exiting, immediate;
- immediate = 0;
+ exiting = immediate = false;
p = pfind(kn->kn_id);
if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
p = zpfind(kn->kn_id);
- immediate = 1;
+ exiting = true;
} else if (p != NULL && (p->p_flag & P_WEXIT)) {
- immediate = 1;
+ exiting = true;
}
if (p == NULL)
@@ -398,23 +432,33 @@ filt_procattach(struct knote *kn)
kn->kn_flags |= EV_CLEAR; /* automatically set */
/*
- * internal flag indicating registration done by kernel
+ * Internal flag indicating registration done by kernel for the
+ * purposes of getting a NOTE_CHILD notification.
*/
- if (kn->kn_flags & EV_FLAG1) {
+ if (kn->kn_flags & EV_FLAG2) {
+ kn->kn_flags &= ~EV_FLAG2;
kn->kn_data = kn->kn_sdata; /* ppid */
kn->kn_fflags = NOTE_CHILD;
+ kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
+ immediate = true; /* Force immediate activation of child note. */
+ }
+ /*
+ * Internal flag indicating registration done by kernel (for other than
+ * NOTE_CHILD).
+ */
+ if (kn->kn_flags & EV_FLAG1) {
kn->kn_flags &= ~EV_FLAG1;
}
- if (immediate == 0)
- knlist_add(&p->p_klist, kn, 1);
+ knlist_add(p->p_klist, kn, 1);
/*
- * Immediately activate any exit notes if the target process is a
- * zombie. This is necessary to handle the case where the target
- * process, e.g. a child, dies before the kevent is registered.
+ * Immediately activate any child notes or, in the case of a zombie
+ * target process, exit notes. The latter is necessary to handle the
+ * case where the target process, e.g. a child, dies before the kevent
+ * is registered.
*/
- if (immediate && filt_proc(kn, NOTE_EXIT))
+ if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
KNOTE_ACTIVATE(kn, 0);
PROC_UNLOCK(p);
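
The hunk above is what makes a NOTE_EXIT registration still fire when the target is already a zombie: filt_procattach() finds it via zpfind() and activates the note immediately. A minimal userspace sketch of that situation (illustrative only; assumes a FreeBSD libc environment, error handling abbreviated):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct kevent kev;
    pid_t pid;
    int kq;

    pid = fork();
    if (pid == -1)
        err(1, "fork");
    if (pid == 0)
        _exit(7);               /* child exits right away */

    sleep(1);                   /* child is very likely a zombie by now */

    kq = kqueue();
    if (kq == -1)
        err(1, "kqueue");

    /* Register after the fact; the attach path activates the note anyway. */
    EV_SET(&kev, pid, EVFILT_PROC, EV_ADD | EV_ONESHOT, NOTE_EXIT, 0, NULL);
    if (kevent(kq, &kev, 1, &kev, 1, NULL) != 1)
        err(1, "kevent");

    /* For NOTE_EXIT the data field carries the exit status. */
    printf("pid %d exited, status %#lx\n", (int)kev.ident, (long)kev.data);
    waitpid(pid, NULL, 0);
    return (0);
}

Because the note is already active, the combined register-and-wait kevent() call above returns without blocking.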
@@ -434,10 +478,8 @@ filt_procattach(struct knote *kn)
static void
filt_procdetach(struct knote *kn)
{
- struct proc *p;
- p = kn->kn_ptr.p_proc;
- knlist_remove(&p->p_klist, kn, 0);
+ knlist_remove(kn->kn_knlist, kn, 0);
kn->kn_ptr.p_proc = NULL;
}
@@ -445,30 +487,26 @@ filt_procdetach(struct knote *kn)
static int
filt_proc(struct knote *kn, long hint)
{
- struct proc *p = kn->kn_ptr.p_proc;
+ struct proc *p;
u_int event;
- /*
- * mask off extra data
- */
+ p = kn->kn_ptr.p_proc;
+ if (p == NULL) /* already activated, from attach filter */
+ return (0);
+
+ /* Mask off extra data. */
event = (u_int)hint & NOTE_PCTRLMASK;
- /*
- * if the user is interested in this event, record it.
- */
+ /* If the user is interested in this event, record it. */
if (kn->kn_sfflags & event)
kn->kn_fflags |= event;
- /*
- * process is gone, so flag the event as finished.
- */
+ /* Process is gone, so flag the event as finished. */
if (event == NOTE_EXIT) {
- if (!(kn->kn_status & KN_DETACHED))
- knlist_remove_inevent(&p->p_klist, kn);
- kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ kn->kn_flags |= EV_EOF | EV_ONESHOT;
kn->kn_ptr.p_proc = NULL;
if (kn->kn_fflags & NOTE_EXIT)
- kn->kn_data = p->p_xstat;
+ kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
if (kn->kn_fflags == 0)
kn->kn_flags |= EV_DROP;
return (1);
@@ -498,8 +536,6 @@ knote_fork(struct knlist *list, int pid)
list->kl_lock(list->kl_lockarg);
SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
- if ((kn->kn_status & KN_INFLUX) == KN_INFLUX)
- continue;
kq = kn->kn_kq;
KQ_LOCK(kq);
if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
@@ -521,7 +557,7 @@ knote_fork(struct knlist *list, int pid)
/*
* The NOTE_TRACK case. In addition to the activation
- * of the event, we need to register new event to
+ * of the event, we need to register new events to
* track the child. Drop the locks in preparation for
* the call to kqueue_register().
*/
@@ -530,8 +566,28 @@ knote_fork(struct knlist *list, int pid)
list->kl_unlock(list->kl_lockarg);
/*
- * Activate existing knote and register a knote with
+ * Activate existing knote and register tracking knotes with
* new process.
+ *
+ * First register a knote to get just the child notice. This
+ * must be a separate note from a potential NOTE_EXIT
+ * notification since both NOTE_CHILD and NOTE_EXIT are defined
+ * to use the data field (in conflicting ways).
+ */
+ kev.ident = pid;
+ kev.filter = kn->kn_filter;
+ kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
+ EV_FLAG2;
+ kev.fflags = kn->kn_sfflags;
+ kev.data = kn->kn_id; /* parent */
+ kev.udata = kn->kn_kevent.udata;/* preserve udata */
+ error = kqueue_register(kq, &kev, NULL, 0);
+ if (error)
+ kn->kn_fflags |= NOTE_TRACKERR;
+
+ /*
+ * Then register another knote to track other potential events
+ * from the new process.
*/
kev.ident = pid;
kev.filter = kn->kn_filter;
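
For reference, a hedged userspace sketch of what NOTE_TRACK delivers after this change: the consumer sees a NOTE_CHILD event on the new pid whose data is the parent pid, kept separate from any later NOTE_EXIT event whose data is the exit status. Event coalescing and ordering can vary, and the sleep-based synchronization below is deliberately crude:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct timespec ts = { 2, 0 };      /* give up after 2 s of silence */
    struct kevent kev;
    pid_t pid;
    int kq;

    kq = kqueue();
    if (kq == -1)
        err(1, "kqueue");

    pid = fork();
    if (pid == -1)
        err(1, "fork");
    if (pid == 0) {                     /* child: fork a grandchild, then exit */
        sleep(1);                       /* crude: let the parent register first */
        if (fork() == 0)
            _exit(2);                   /* grandchild */
        _exit(1);
    }

    EV_SET(&kev, pid, EVFILT_PROC, EV_ADD,
        NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
    if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
        err(1, "kevent register");

    for (;;) {
        int n;

        n = kevent(kq, NULL, 0, &kev, 1, &ts);
        if (n == -1)
            err(1, "kevent wait");
        if (n == 0)
            break;                      /* no more events */
        if (kev.fflags & NOTE_TRACKERR)
            printf("pid %d: could not attach to its child\n", (int)kev.ident);
        if (kev.fflags & NOTE_CHILD)    /* data = parent pid */
            printf("pid %d: child of %d\n", (int)kev.ident, (int)kev.data);
        if (kev.fflags & NOTE_FORK)
            printf("pid %d: forked\n", (int)kev.ident);
        if (kev.fflags & NOTE_EXIT)     /* data = exit status */
            printf("pid %d: exited, status %#lx\n", (int)kev.ident,
                (long)kev.data);
    }
    while (waitpid(-1, NULL, 0) > 0)
        ;
    return (0);
}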
@@ -557,64 +613,123 @@ knote_fork(struct knlist *list, int pid)
* XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
* interval timer support code.
*/
-static int
-timertoticks(intptr_t data)
-{
- struct timeval tv;
- int tticks;
-
- tv.tv_sec = data / 1000;
- tv.tv_usec = (data % 1000) * 1000;
- tticks = tvtohz(&tv);
- return tticks;
+#define NOTE_TIMER_PRECMASK (NOTE_SECONDS|NOTE_MSECONDS|NOTE_USECONDS| \
+ NOTE_NSECONDS)
+
+static sbintime_t
+timer2sbintime(intptr_t data, int flags)
+{
+
+ /*
+ * Macros for converting to the fractional second portion of an
+ * sbintime_t using 64bit multiplication to improve precision.
+ */
+#define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
+#define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
+#define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
+ switch (flags & NOTE_TIMER_PRECMASK) {
+ case NOTE_SECONDS:
+#ifdef __LP64__
+ if (data > (SBT_MAX / SBT_1S))
+ return SBT_MAX;
+#endif
+ return ((sbintime_t)data << 32);
+ case NOTE_MSECONDS: /* FALLTHROUGH */
+ case 0:
+ if (data >= 1000) {
+ int64_t secs = data / 1000;
+#ifdef __LP64__
+ if (secs > (SBT_MAX / SBT_1S))
+ return SBT_MAX;
+#endif
+ return (secs << 32 | MS_TO_SBT(data % 1000));
+ }
+ return MS_TO_SBT(data);
+ case NOTE_USECONDS:
+ if (data >= 1000000) {
+ int64_t secs = data / 1000000;
+#ifdef __LP64__
+ if (secs > (SBT_MAX / SBT_1S))
+ return SBT_MAX;
+#endif
+ return (secs << 32 | US_TO_SBT(data % 1000000));
+ }
+ return US_TO_SBT(data);
+ case NOTE_NSECONDS:
+ if (data >= 1000000000) {
+ int64_t secs = data / 1000000000;
+#ifdef __LP64__
+ if (secs > (SBT_MAX / SBT_1S))
+ return SBT_MAX;
+#endif
+ return (secs << 32 | US_TO_SBT(data % 1000000000));
+ }
+ return NS_TO_SBT(data);
+ default:
+ break;
+ }
+ return (-1);
}
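
The macros above fold the unit conversion into a single 64-bit multiply: NS_TO_SBT(ns), for example, computes ns * 2^32 / 10^9 without overflow by pre-dividing 2^63 by half the per-second unit count. A standalone userspace illustration of the resulting 32.32 fixed-point values; the macros are copied from this hunk, the back-conversion is only for display:

#include <stdint.h>
#include <stdio.h>

#define MS_TO_SBT(ms)   (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
#define US_TO_SBT(us)   (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
#define NS_TO_SBT(ns)   (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)

int
main(void)
{
    int64_t data = 1500;                /* 1500 ms, as in kn_sdata */
    int64_t sbt = (data / 1000) << 32 | MS_TO_SBT(data % 1000);

    /* Upper 32 bits are whole seconds, lower 32 bits are the fraction. */
    printf("sbt          = %#jx\n", (uintmax_t)sbt);   /* 0x17fffffff */
    printf("seconds      = %jd\n", (intmax_t)(sbt >> 32));
    printf("microseconds = %jd\n",
        (intmax_t)(((sbt & 0xffffffff) * 1000000) >> 32));
    return (0);
}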
static void
filt_timerexpire(void *knx)
{
- struct knote *kn = knx;
struct callout *calloutp;
+ struct knote *kn;
+ kn = knx;
kn->kn_data++;
KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */
- /*
- * timertoticks() uses tvtohz() which always adds 1 to allow
- * for the time until the next clock interrupt being strictly
- * less than 1 clock tick. We don't want that here since we
- * want to appear to be in sync with the clock interrupt even
- * when we're delayed.
- */
if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
calloutp = (struct callout *)kn->kn_hook;
- callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata) - 1,
- filt_timerexpire, kn);
+ *kn->kn_ptr.p_nexttime += timer2sbintime(kn->kn_sdata,
+ kn->kn_sfflags);
+ callout_reset_sbt_on(calloutp, *kn->kn_ptr.p_nexttime, 0,
+ filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE);
}
}
/*
- * data contains amount of time to sleep, in milliseconds
+ * data contains amount of time to sleep
*/
static int
filt_timerattach(struct knote *kn)
{
struct callout *calloutp;
+ sbintime_t to;
+ unsigned int ncallouts;
- atomic_add_int(&kq_ncallouts, 1);
+ if ((intptr_t)kn->kn_sdata < 0)
+ return (EINVAL);
+ if ((intptr_t)kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
+ kn->kn_sdata = 1;
+ /* Only precision units are supported in flags so far */
+ if (kn->kn_sfflags & ~NOTE_TIMER_PRECMASK)
+ return (EINVAL);
- if (kq_ncallouts >= kq_calloutmax) {
- atomic_add_int(&kq_ncallouts, -1);
- return (ENOMEM);
- }
+ to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
+ if (to < 0)
+ return (EINVAL);
+
+ ncallouts = atomic_load_explicit(&kq_ncallouts, memory_order_relaxed);
+ do {
+ if (ncallouts >= kq_calloutmax)
+ return (ENOMEM);
+ } while (!atomic_compare_exchange_weak_explicit(&kq_ncallouts,
+ &ncallouts, ncallouts + 1, memory_order_relaxed,
+ memory_order_relaxed));
kn->kn_flags |= EV_CLEAR; /* automatically set */
- kn->kn_status &= ~KN_DETACHED; /* knlist_add usually sets it */
+ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */
+ kn->kn_ptr.p_nexttime = malloc(sizeof(sbintime_t), M_KQUEUE, M_WAITOK);
calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
- callout_init(calloutp, CALLOUT_MPSAFE);
+ callout_init(calloutp, 1);
kn->kn_hook = calloutp;
- callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
- filt_timerexpire, kn);
+ *kn->kn_ptr.p_nexttime = to + sbinuptime();
+ callout_reset_sbt_on(calloutp, *kn->kn_ptr.p_nexttime, 0,
+ filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE);
return (0);
}
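
From userspace, the net effect is that EVFILT_TIMER now takes an optional precision unit in fflags (NOTE_SECONDS, NOTE_MSECONDS, NOTE_USECONDS, NOTE_NSECONDS; plain milliseconds remain the default) and rejects negative periods with EINVAL. A small sketch, assuming a FreeBSD environment:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
    struct kevent kev;
    int i, kq;

    kq = kqueue();
    if (kq == -1)
        err(1, "kqueue");

    /* Periodic 250 ms timer; the unit is now given explicitly in fflags. */
    EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_MSECONDS, 250, NULL);
    if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
        err(1, "kevent register");

    for (i = 0; i < 4; i++) {
        if (kevent(kq, NULL, 0, &kev, 1, NULL) != 1)
            err(1, "kevent wait");
        /* data counts expirations since the event was last retrieved. */
        printf("timer %d fired %ld time(s)\n", (int)kev.ident, (long)kev.data);
    }
    return (0);
}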
@@ -623,12 +738,15 @@ static void
filt_timerdetach(struct knote *kn)
{
struct callout *calloutp;
+ unsigned int old;
calloutp = (struct callout *)kn->kn_hook;
callout_drain(calloutp);
free(calloutp, M_KQUEUE);
- atomic_add_int(&kq_ncallouts, -1);
- kn->kn_status |= KN_DETACHED; /* knlist_remove usually clears it */
+ free(kn->kn_ptr.p_nexttime, M_KQUEUE);
+ old = atomic_fetch_sub_explicit(&kq_ncallouts, 1, memory_order_relaxed);
+ KASSERT(old > 0, ("Number of callouts cannot become negative"));
+ kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */
}
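
The attach/detach pair above replaces the old atomic_add_int() accounting with a C11 compare-and-swap loop, so the global callout count can never overshoot kq_calloutmax. The same pattern in self-contained userspace form (names are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint ncallouts = ATOMIC_VAR_INIT(0);
static unsigned int calloutmax = 4 * 1024;

static bool
callout_reserve(void)
{
    unsigned int n;

    n = atomic_load_explicit(&ncallouts, memory_order_relaxed);
    do {
        if (n >= calloutmax)
            return (false);     /* limit reached, like ENOMEM above */
    } while (!atomic_compare_exchange_weak_explicit(&ncallouts, &n, n + 1,
        memory_order_relaxed, memory_order_relaxed));
    return (true);
}

static void
callout_release(void)
{

    atomic_fetch_sub_explicit(&ncallouts, 1, memory_order_relaxed);
}

int
main(void)
{

    if (callout_reserve()) {
        printf("reserved, in use: %u\n",
            atomic_load_explicit(&ncallouts, memory_order_relaxed));
        callout_release();
    }
    return (0);
}

The failed compare-exchange reloads the current value into n, so a slot is claimed only when the observed count is still below the limit.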
static int
@@ -727,34 +845,60 @@ filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
}
#ifdef __rtems__
+static int
+kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps);
+
static
#endif /* __rtems__ */
int
sys_kqueue(struct thread *td, struct kqueue_args *uap)
{
+
+ return (kern_kqueue(td, 0, NULL));
+}
+
+static void
+kqueue_init(struct kqueue *kq)
+{
+
+ mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
+ TAILQ_INIT(&kq->kq_head);
+ knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
+ TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
+}
+
+int
+kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
+{
struct filedesc *fdp;
struct kqueue *kq;
struct file *fp;
+ struct ucred *cred;
int fd, error;
#ifndef __rtems__
fdp = td->td_proc->p_fd;
+ cred = td->td_ucred;
+ if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
+ return (ENOMEM);
#else /* __rtems__ */
- (void) fdp;
+ (void)fdp;
+ (void)cred;
#endif /* __rtems__ */
- error = falloc(td, &fp, &fd, 0);
- if (error)
- goto done2;
- /* An extra reference on `nfp' has been held for us by falloc(). */
+ error = falloc_caps(td, &fp, &fd, flags, fcaps);
+ if (error != 0) {
+ chgkqcnt(cred->cr_ruidinfo, -1, 0);
+ return (error);
+ }
+
+ /* An extra reference on `fp' has been held for us by falloc(). */
kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
- mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
- TAILQ_INIT(&kq->kq_head);
+ kqueue_init(kq);
#ifndef __rtems__
kq->kq_fdp = fdp;
+ kq->kq_cred = crhold(cred);
#endif /* __rtems__ */
- knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
- TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
#ifndef __rtems__
FILEDESC_XLOCK(fdp);
@@ -770,8 +914,7 @@ sys_kqueue(struct thread *td, struct kqueue_args *uap)
fdrop(fp, td);
td->td_retval[0] = fd;
-done2:
- return (error);
+ return (0);
}
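
The chgkqcnt() accounting above ties kqueue creation to the new per-uid RLIMIT_KQUEUES resource limit, so kqueue() can now fail with ENOMEM. A hedged userspace check, assuming a FreeBSD version that defines RLIMIT_KQUEUES:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    struct rlimit rl;
    int kq;

    if (getrlimit(RLIMIT_KQUEUES, &rl) == -1)
        err(1, "getrlimit");
    printf("per-uid kqueue limit: cur %ju, max %ju\n",
        (uintmax_t)rl.rlim_cur, (uintmax_t)rl.rlim_max);

    /* Once this uid already holds rl.rlim_cur kqueues, this fails (ENOMEM). */
    kq = kqueue();
    if (kq == -1)
        err(1, "kqueue");
    return (0);
}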
#ifdef __rtems__
int
@@ -806,9 +949,11 @@ struct kevent_args {
};
#endif
#ifdef __rtems__
-static int
-kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct
- kevent_copyops *k_ops, const struct timespec *timeout);
+static int kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
+ struct kevent_copyops *k_ops, const struct timespec *timeout);
+
+static int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges,
+ int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout);
static
#endif /* __rtems__ */
@@ -936,24 +1081,39 @@ int
kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
struct kevent_copyops *k_ops, const struct timespec *timeout)
{
- struct kevent keva[KQ_NEVENTS];
- struct kevent *kevp, *changes;
- struct kqueue *kq;
+ cap_rights_t rights;
struct file *fp;
- int i, n, nerrors, error;
+ int error;
- if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
+ cap_rights_init(&rights);
+ if (nchanges > 0)
+ cap_rights_set(&rights, CAP_KQUEUE_CHANGE);
+ if (nevents > 0)
+ cap_rights_set(&rights, CAP_KQUEUE_EVENT);
+ error = fget(td, fd, &rights, &fp);
+ if (error != 0)
return (error);
- if ((error = kqueue_acquire(fp, &kq)) != 0)
- goto done_norel;
- nerrors = 0;
+ error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
+ fdrop(fp, td);
+ return (error);
+}
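
kern_kevent() now asks for CAP_KQUEUE_CHANGE only when changes are submitted and CAP_KQUEUE_EVENT only when events are requested, so a Capsicum sandbox can hold a kqueue restricted to one direction. A sketch, assuming Capsicum support is available; ENOTCAPABLE is the conventional errno for a rights violation:

#include <sys/types.h>
#include <sys/capsicum.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
    cap_rights_t rights;
    struct timespec ts;
    struct kevent kev;
    int kq;

    kq = kqueue();
    if (kq == -1)
        err(1, "kqueue");

    /* Allow waiting for events but not registering new ones. */
    cap_rights_init(&rights, CAP_KQUEUE_EVENT);
    if (cap_rights_limit(kq, &rights) == -1)
        err(1, "cap_rights_limit");
    if (cap_enter() == -1)
        err(1, "cap_enter");

    /* Waiting is still permitted (non-blocking poll here) ... */
    ts.tv_sec = 0;
    ts.tv_nsec = 0;
    (void)kevent(kq, NULL, 0, &kev, 1, &ts);

    /* ... but submitting a change now fails, typically with ENOTCAPABLE. */
    EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 100, NULL);
    if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
        warn("kevent(EV_ADD)");
    return (0);
}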
+
+static int
+kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
+ struct kevent_copyops *k_ops, const struct timespec *timeout)
+{
+ struct kevent keva[KQ_NEVENTS];
+ struct kevent *kevp, *changes;
+ int i, n, nerrors, error;
+
+ nerrors = 0;
while (nchanges > 0) {
n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
error = k_ops->k_copyin(k_ops->arg, keva, n);
if (error)
- goto done;
+ return (error);
changes = keva;
for (i = 0; i < n; i++) {
kevp = &changes[i];
@@ -962,31 +1122,56 @@ kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
kevp->flags &= ~EV_SYSFLAGS;
error = kqueue_register(kq, kevp, td, 1);
if (error || (kevp->flags & EV_RECEIPT)) {
- if (nevents != 0) {
- kevp->flags = EV_ERROR;
- kevp->data = error;
- (void) k_ops->k_copyout(k_ops->arg,
- kevp, 1);
- nevents--;
- nerrors++;
- } else {
- goto done;
- }
+ if (nevents == 0)
+ return (error);
+ kevp->flags = EV_ERROR;
+ kevp->data = error;
+ (void)k_ops->k_copyout(k_ops->arg, kevp, 1);
+ nevents--;
+ nerrors++;
}
}
nchanges -= n;
}
if (nerrors) {
td->td_retval[0] = nerrors;
- error = 0;
- goto done;
+ return (0);
}
- error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td);
-done:
+ return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
+}
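
The loop above reports per-change failures as EV_ERROR events (and, with EV_RECEIPT, a receipt for every change) instead of aborting the whole call, so a caller can submit a batch and inspect each change's status. A small userspace example of reading those receipts:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
    struct kevent chg[2], res[2];
    int i, kq, n;

    kq = kqueue();
    if (kq == -1)
        err(1, "kqueue");

    /* One valid change and one that must fail (bad file descriptor). */
    EV_SET(&chg[0], 1, EVFILT_TIMER, EV_ADD | EV_RECEIPT, 0, 100, NULL);
    EV_SET(&chg[1], 12345, EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);

    n = kevent(kq, chg, 2, res, 2, NULL);
    if (n == -1)
        err(1, "kevent");
    for (i = 0; i < n; i++) {
        /*
         * With EV_RECEIPT each change yields an event with EV_ERROR set
         * in flags; data is 0 on success or the errno for that change.
         */
        printf("change %d: flags %#x, error %ld\n",
            i, (unsigned)res[i].flags, (long)res[i].data);
    }
    return (0);
}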
+
+int
+kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
+ struct kevent_copyops *k_ops, const struct timespec *timeout)
+{
+ struct kqueue *kq;
+ int error;
+
+ error = kqueue_acquire(fp, &kq);
+ if (error != 0)
+ return (error);
+ error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout);
kqueue_release(kq, 0);
-done_norel:
- fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Performs a kevent() call on a temporarily created kqueue. This can be
+ * used to perform one-shot polling, similar to poll() and select().
+ */
+int
+kern_kevent_anonymous(struct thread *td, int nevents,
+ struct kevent_copyops *k_ops)
+{
+ struct kqueue kq = {};
+ int error;
+
+ kqueue_init(&kq);
+ kq.kq_refcnt = 1;
+ error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
+ kqueue_drain(&kq, td);
+ kqueue_destroy(&kq);
return (error);
}
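
kern_kevent_anonymous() gives in-kernel callers one-shot polling on a throwaway, descriptor-less kqueue. A rough userspace analogue of the same create/use-once/tear-down pattern (illustrative only):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <stdbool.h>
#include <unistd.h>

/*
 * Wait once for `fd' to become readable, with a millisecond timeout,
 * using a temporary kqueue that is discarded afterwards.
 */
static bool
wait_readable_once(int fd, int timeout_ms)
{
    struct timespec ts;
    struct kevent kev;
    int kq, n;

    ts.tv_sec = timeout_ms / 1000;
    ts.tv_nsec = (timeout_ms % 1000) * 1000000L;

    kq = kqueue();
    if (kq == -1)
        err(1, "kqueue");
    EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, NULL);
    n = kevent(kq, &kev, 1, &kev, 1, &ts);  /* register and wait in one call */
    if (n == -1)
        err(1, "kevent");
    close(kq);
    return (n == 1);
}

int
main(void)
{

    return (wait_readable_once(STDIN_FILENO, 1000) ? 0 : 1);
}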
@@ -1046,6 +1231,9 @@ kqueue_fo_find(int filt)
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
return NULL;
+ if (sysfilt_ops[~filt].for_nolock)
+ return sysfilt_ops[~filt].for_fop;
+
mtx_lock(&filterops_lock);
sysfilt_ops[~filt].for_refcnt++;
if (sysfilt_ops[~filt].for_fop == NULL)
@@ -1062,6 +1250,9 @@ kqueue_fo_release(int filt)
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
return;
+ if (sysfilt_ops[~filt].for_nolock)
+ return;
+
mtx_lock(&filterops_lock);
KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
("filter object refcount not valid on release"));
@@ -1080,11 +1271,17 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa
struct filterops *fops;
struct file *fp;
struct knote *kn, *tkn;
+ struct knlist *knl;
+ cap_rights_t rights;
int error, filt, event;
int haskqglobal, filedesc_unlock;
+ if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
+ return (EINVAL);
+
fp = NULL;
kn = NULL;
+ knl = NULL;
error = 0;
haskqglobal = 0;
filedesc_unlock = 0;
@@ -1094,12 +1291,25 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int wa
if (fops == NULL)
return EINVAL;
- tkn = knote_alloc(waitok); /* prevent waiting with locks */
+ if (kev->flags & EV_ADD) {
+ /*
+ * Prevent waiting with locks. Non-sleepable
+ * allocation failures are handled in the loop, only
+ * if the spare knote appears to be actually required.
+ */
+ tkn = knote_alloc(waitok);
+ } else {
+ tkn = NULL;
+ }
findkn:
if (fops->f_isfd) {
KASSERT(td != NULL, ("td is NULL"));
- error = fget(td, kev->ident, CAP_POLL_EVENT, &fp);
+ if (kev->ident > INT_MAX)
+ error = EBADF;
+ else
+ error = fget(td, kev->ident,
+ cap_rights_init(&rights, CAP_EVENT), &fp);
if (error)
goto done;
@@ -1120,7 +1330,7 @@ findkn:
if (fp->f_io.pathinfo.handlers == &kqueueops) {
#endif /* __rtems__ */
/*
- * if we add some inteligence about what we are doing,
+ * If we add some intelligence about what we are doing,
* we should be able to support events on ourselves.
* We need to know when we are doing this to prevent
* getting both the knlist lock and the kq lock since
@@ -1152,7 +1362,18 @@ findkn:
kqueue_expand(kq, fops, kev->ident, waitok);
KQ_LOCK(kq);
- if (kq->kq_knhashmask != 0) {
+
+ /*
+ * If possible, find an existing knote to use for this kevent.
+ */
+ if (kev->filter == EVFILT_PROC &&
+ (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
+ /* This is an internal creation of a process tracking
+ * note. Don't attempt to coalesce this with an
+ * existing note.
+ */
+ ;
+ } else if (kq->kq_knhashmask != 0) {
struct klist *list;
list = &kq->kq_knhash[
@@ -1164,7 +1385,7 @@ findkn:
}
}
- /* knote is in the process of changing, wait for it to stablize. */
+ /* knote is in the process of changing, wait for it to stabilize. */
if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
if (filedesc_unlock) {
@@ -1208,7 +1429,7 @@ findkn:
kev->data = 0;
kn->kn_kevent = *kev;
kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
- EV_ENABLE | EV_DISABLE);
+ EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
kn->kn_status = KN_INFLUX|KN_DETACHED;
error = knote_attach(kn, kq);
@@ -1222,7 +1443,7 @@ findkn:
knote_drop(kn, td);
goto done;
}
- KN_LIST_LOCK(kn);
+ knl = kn_list_lock(kn);
goto done_ev_add;
} else {
/* No matching knote and the EV_ADD flag is not set. */
@@ -1241,6 +1462,11 @@ findkn:
goto done;
}
+ if (kev->flags & EV_FORCEONESHOT) {
+ kn->kn_flags |= EV_ONESHOT;
+ KNOTE_ACTIVATE(kn, 1);
+ }
+
/*
* The user may change some filter values after the initial EV_ADD,
* but doing so will not reset any filter which has already been
@@ -1248,7 +1474,7 @@ findkn:
*/
kn->kn_status |= KN_INFLUX | KN_SCAN;
KQ_UNLOCK(kq);
- KN_LIST_LOCK(kn);
+ knl = kn_list_lock(kn);
kn->kn_kevent.udata = kev->udata;
if (!fops->f_isfd && fops->f_touch != NULL) {
fops->f_touch(kn, kev, EVENT_REGISTER);
@@ -1265,24 +1491,24 @@ findkn:
* kn_knlist.
*/
done_ev_add:
- event = kn->kn_fop->f_event(kn, 0);
+ if ((kev->flags & EV_ENABLE) != 0)
+ kn->kn_status &= ~KN_DISABLED;
+ else if ((kev->flags & EV_DISABLE) != 0)
+ kn->kn_status |= KN_DISABLED;
+
+ if ((kn->kn_status & KN_DISABLED) == 0)
+ event = kn->kn_fop->f_event(kn, 0);
+ else
+ event = 0;
+
KQ_LOCK(kq);
if (event)
- KNOTE_ACTIVATE(kn, 1);
+ kn->kn_status |= KN_ACTIVE;
+ if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
+ KN_ACTIVE)
+ knote_enqueue(kn);
kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
- KN_LIST_UNLOCK(kn);
-
- if ((kev->flags & EV_DISABLE) &&
- ((kn->kn_status & KN_DISABLED) == 0)) {
- kn->kn_status |= KN_DISABLED;
- }
-
- if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
- kn->kn_status &= ~KN_DISABLED;
- if ((kn->kn_status & KN_ACTIVE) &&
- ((kn->kn_status & KN_QUEUED) == 0))
- knote_enqueue(kn);
- }
+ kn_list_unlock(knl);
KQ_UNLOCK_FLUX(kq);
done:
@@ -1291,8 +1517,7 @@ done:
FILEDESC_XUNLOCK(td->td_proc->p_fd);
if (fp != NULL)
fdrop(fp, td);
- if (tkn != NULL)
- knote_free(tkn);
+ knote_free(tkn);
if (fops != NULL)
kqueue_fo_release(filt);
return (error);
@@ -1348,7 +1573,7 @@ kqueue_schedtask(struct kqueue *kq)
("scheduling kqueue task while draining"));
if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
- taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
+ taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
kq->kq_state |= KQ_TASKSCHED;
}
}
@@ -1456,10 +1681,10 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
const struct timespec *tsp, struct kevent *keva, struct thread *td)
{
struct kevent *kevp;
- struct timeval atv, rtv, ttv;
struct knote *kn, *marker;
- int count, timeout, nkev, error, influx;
- int haskqglobal, touch;
+ struct knlist *knl;
+ sbintime_t asbt, rsbt;
+ int count, error, haskqglobal, influx, nkev, touch;
count = maxevents;
nkev = 0;
@@ -1469,53 +1694,42 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
if (maxevents == 0)
goto done_nl;
+ rsbt = 0;
if (tsp != NULL) {
- TIMESPEC_TO_TIMEVAL(&atv, tsp);
- if (itimerfix(&atv)) {
+ if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
+ tsp->tv_nsec >= 1000000000) {
error = EINVAL;
goto done_nl;
}
- if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
- timeout = -1;
- else
- timeout = atv.tv_sec > 24 * 60 * 60 ?
- 24 * 60 * 60 * hz : tvtohz(&atv);
- getmicrouptime(&rtv);
- timevaladd(&atv, &rtv);
- } else {
- atv.tv_sec = 0;
- atv.tv_usec = 0;
- timeout = 0;
- }
+ if (timespecisset(tsp)) {
+ if (tsp->tv_sec <= INT32_MAX) {
+ rsbt = tstosbt(*tsp);
+ if (TIMESEL(&asbt, rsbt))
+ asbt += tc_tick_sbt;
+ if (asbt <= SBT_MAX - rsbt)
+ asbt += rsbt;
+ else
+ asbt = 0;
+ rsbt >>= tc_precexp;
+ } else
+ asbt = 0;
+ } else
+ asbt = -1;
+ } else
+ asbt = 0;
marker = knote_alloc(1);
- if (marker == NULL) {
- error = ENOMEM;
- goto done_nl;
- }
marker->kn_status = KN_MARKER;
KQ_LOCK(kq);
- goto start;
retry:
- if (atv.tv_sec || atv.tv_usec) {
- getmicrouptime(&rtv);
- if (timevalcmp(&rtv, &atv, >=))
- goto done;
- ttv = atv;
- timevalsub(&ttv, &rtv);
- timeout = ttv.tv_sec > 24 * 60 * 60 ?
- 24 * 60 * 60 * hz : tvtohz(&ttv);
- }
-
-start:
kevp = keva;
if (kq->kq_count == 0) {
- if (timeout < 0) {
+ if (asbt == -1) {
error = EWOULDBLOCK;
} else {
kq->kq_state |= KQ_SLEEP;
- error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
- "kqread", timeout);
+ error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
+ "kqread", asbt, rsbt, C_ABSOLUTE);
}
if (error == 0)
goto retry;
@@ -1594,7 +1808,7 @@ start:
KQ_UNLOCK(kq);
if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
- KN_LIST_LOCK(kn);
+ knl = kn_list_lock(kn);
if (kn->kn_fop->f_event(kn, 0) == 0) {
KQ_LOCK(kq);
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
@@ -1602,7 +1816,7 @@ start:
~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
KN_SCAN);
kq->kq_count--;
- KN_LIST_UNLOCK(kn);
+ kn_list_unlock(knl);
influx = 1;
continue;
}
@@ -1614,7 +1828,7 @@ start:
*kevp = kn->kn_kevent;
KQ_LOCK(kq);
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
- if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
+ if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
/*
* Manually clear knotes who weren't
* 'touch'ed.
@@ -1631,7 +1845,7 @@ start:
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
- KN_LIST_UNLOCK(kn);
+ kn_list_unlock(knl);
influx = 1;
}
@@ -1665,35 +1879,6 @@ done_nl:
}
#ifndef __rtems__
-/*
- * XXX
- * This could be expanded to call kqueue_scan, if desired.
- */
-/*ARGSUSED*/
-static int
-kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
- int flags, struct thread *td)
-{
- return (ENXIO);
-}
-
-/*ARGSUSED*/
-static int
-kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
- int flags, struct thread *td)
-{
- return (ENXIO);
-}
-
-/*ARGSUSED*/
-static int
-kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
- struct thread *td)
-{
-
- return (EINVAL);
-}
-
/*ARGSUSED*/
static int
kqueue_ioctl(struct file *fp, u_long cmd, void *data,
@@ -1813,26 +1998,12 @@ rtems_bsd_kqueue_stat(const rtems_filesystem_location_info_t *loc,
return (0);
}
-/*ARGSUSED*/
-static int
-kqueue_close(struct file *fp, struct thread *td)
+static void
+kqueue_drain(struct kqueue *kq, struct thread *td)
{
- struct kqueue *kq = fp->f_data;
- struct filedesc *fdp;
struct knote *kn;
int i;
- int error;
- int filedesc_unlock;
-
-#ifdef __rtems__
- /* FIXME: Move this to the RTEMS close() function */
- knote_fdclose(td, rtems_bsd_fp_to_fd(fp));
-#endif /* __rtems__ */
- if ((error = kqueue_acquire(fp, &kq)))
- return error;
-
- filedesc_unlock = 0;
KQ_LOCK(kq);
KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
@@ -1842,11 +2013,6 @@ kqueue_close(struct file *fp, struct thread *td)
msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
-#ifndef __rtems__
- fdp = kq->kq_fdp;
-#else /* __rtems__ */
- (void) fdp;
-#endif /* __rtems__ */
KASSERT(knlist_empty(&kq->kq_sel.si_note),
("kqueue's knlist not empty"));
@@ -1897,6 +2063,38 @@ kqueue_close(struct file *fp, struct thread *td)
}
KQ_UNLOCK(kq);
+}
+
+static void
+kqueue_destroy(struct kqueue *kq)
+{
+
+ KASSERT(kq->kq_fdp == NULL,
+ ("kqueue still attached to a file descriptor"));
+ seldrain(&kq->kq_sel);
+ knlist_destroy(&kq->kq_sel.si_note);
+ mtx_destroy(&kq->kq_lock);
+
+ if (kq->kq_knhash != NULL)
+ free(kq->kq_knhash, M_KQUEUE);
+ if (kq->kq_knlist != NULL)
+ free(kq->kq_knlist, M_KQUEUE);
+
+ funsetown(&kq->kq_sigio);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_close(struct file *fp, struct thread *td)
+{
+ struct kqueue *kq = fp->f_data;
+ struct filedesc *fdp;
+ int error;
+ int filedesc_unlock;
+
+ if ((error = kqueue_acquire(fp, &kq)))
+ return error;
+ kqueue_drain(kq, td);
#ifndef __rtems__
/*
@@ -1905,6 +2103,8 @@ kqueue_close(struct file *fp, struct thread *td)
* lock is owned, and filedesc sx is locked before, to not
* take the sleepable lock after non-sleepable.
*/
+ fdp = kq->kq_fdp;
+ kq->kq_fdp = NULL;
if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
FILEDESC_XLOCK(fdp);
filedesc_unlock = 1;
@@ -1920,19 +2120,9 @@ kqueue_close(struct file *fp, struct thread *td)
rtems_libio_unlock();
#endif /* __rtems__ */
- seldrain(&kq->kq_sel);
- knlist_destroy(&kq->kq_sel.si_note);
- mtx_destroy(&kq->kq_lock);
-#ifndef __rtems__
- kq->kq_fdp = NULL;
-#endif /* __rtems__ */
-
- if (kq->kq_knhash != NULL)
- free(kq->kq_knhash, M_KQUEUE);
- if (kq->kq_knlist != NULL)
- free(kq->kq_knlist, M_KQUEUE);
-
- funsetown(&kq->kq_sigio);
+ kqueue_destroy(kq);
+ chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
+ crfree(kq->kq_cred);
free(kq, M_KQUEUE);
fp->f_data = NULL;
@@ -1956,6 +2146,16 @@ rtems_bsd_kqueue_close(rtems_libio_t *iop)
}
#endif /* __rtems__ */
+#ifndef __rtems__
+static int
+kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
+{
+
+ kif->kf_type = KF_TYPE_KQUEUE;
+ return (0);
+}
+#endif /* __rtems__ */
+
static void
kqueue_wakeup(struct kqueue *kq)
{
@@ -1993,7 +2193,7 @@ void
knote(struct knlist *list, long hint, int lockflags)
{
struct kqueue *kq;
- struct knote *kn;
+ struct knote *kn, *tkn;
int error;
if (list == NULL)
@@ -2005,14 +2205,13 @@ knote(struct knlist *list, long hint, int lockflags)
list->kl_lock(list->kl_lockarg);
/*
- * If we unlock the list lock (and set KN_INFLUX), we can eliminate
- * the kqueue scheduling, but this will introduce four
- * lock/unlock's for each knote to test. If we do, continue to use
- * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is
- * only safe if you want to remove the current item, which we are
- * not doing.
+ * If we unlock the list lock (and set KN_INFLUX), we can
+ * eliminate the kqueue scheduling, but this will introduce
+ * four lock/unlock's for each knote to test. Also, marker
+ * would be needed to keep iteration position, since filters
+ * or other threads could remove events.
*/
- SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
+ SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
kq = kn->kn_kq;
KQ_LOCK(kq);
if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
@@ -2068,7 +2267,8 @@ knlist_add(struct knlist *knl, struct knote *kn, int islocked)
}
static void
-knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
+knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
+ int kqislocked)
{
KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
KNL_ASSERT_LOCK(knl, knlislocked);
@@ -2081,7 +2281,7 @@ knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqis
SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
kn->kn_knlist = NULL;
if (!knlislocked)
- knl->kl_unlock(knl->kl_lockarg);
+ kn_list_unlock(knl);
if (!kqislocked)
KQ_LOCK(kn->kn_kq);
kn->kn_status |= KN_DETACHED;
@@ -2090,7 +2290,7 @@ knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqis
}
/*
- * remove all knotes from a specified klist
+ * remove knote from the specified knlist
*/
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
@@ -2099,20 +2299,10 @@ knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
knlist_remove_kq(knl, kn, islocked, 0);
}
-/*
- * remove knote from a specified klist while in f_event handler.
- */
-void
-knlist_remove_inevent(struct knlist *knl, struct knote *kn)
-{
-
- knlist_remove_kq(knl, kn, 1,
- (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
-}
-
int
knlist_empty(struct knlist *knl)
{
+
KNL_ASSERT_LOCKED(knl);
return SLIST_EMPTY(&knl->kl_list);
}
@@ -2126,27 +2316,61 @@ static void knlist_mtx_unlock(void *arg);
static void
knlist_mtx_lock(void *arg)
{
+
mtx_lock((struct mtx *)arg);
}
static void
knlist_mtx_unlock(void *arg)
{
+
mtx_unlock((struct mtx *)arg);
}
static void
knlist_mtx_assert_locked(void *arg)
{
+
mtx_assert((struct mtx *)arg, MA_OWNED);
}
static void
knlist_mtx_assert_unlocked(void *arg)
{
+
mtx_assert((struct mtx *)arg, MA_NOTOWNED);
}
+#ifndef __rtems__
+static void
+knlist_rw_rlock(void *arg)
+{
+
+ rw_rlock((struct rwlock *)arg);
+}
+
+static void
+knlist_rw_runlock(void *arg)
+{
+
+ rw_runlock((struct rwlock *)arg);
+}
+
+static void
+knlist_rw_assert_locked(void *arg)
+{
+
+ rw_assert((struct rwlock *)arg, RA_LOCKED);
+}
+
+static void
+knlist_rw_assert_unlocked(void *arg)
+{
+
+ rw_assert((struct rwlock *)arg, RA_UNLOCKED);
+}
+#endif /* __rtems__ */
+
void
knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
void (*kl_unlock)(void *),
@@ -2175,6 +2399,7 @@ knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
else
knl->kl_assert_unlocked = kl_assert_unlocked;
+ knl->kl_autodestroy = 0;
SLIST_INIT(&knl->kl_list);
}
@@ -2185,6 +2410,26 @@ knlist_init_mtx(struct knlist *knl, struct mtx *lock)
knlist_init(knl, lock, NULL, NULL, NULL, NULL);
}
+struct knlist *
+knlist_alloc(struct mtx *lock)
+{
+ struct knlist *knl;
+
+ knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
+ knlist_init_mtx(knl, lock);
+ return (knl);
+}
+
+#ifndef __rtems__
+void
+knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
+{
+
+ knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
+ knlist_rw_assert_locked, knlist_rw_assert_unlocked);
+}
+#endif /* __rtems__ */
+
void
knlist_destroy(struct knlist *knl)
{
@@ -2192,7 +2437,7 @@ knlist_destroy(struct knlist *knl)
#ifdef INVARIANTS
/*
* if we run across this error, we need to find the offending
- * driver and have it call knlist_clear.
+ * driver and have it call knlist_clear or knlist_delete.
*/
if (!SLIST_EMPTY(&knl->kl_list))
printf("WARNING: destroying knlist w/ knotes on it!\n");
@@ -2202,6 +2447,18 @@ knlist_destroy(struct knlist *knl)
SLIST_INIT(&knl->kl_list);
}
+void
+knlist_detach(struct knlist *knl)
+{
+
+ KNL_ASSERT_LOCKED(knl);
+ knl->kl_autodestroy = 1;
+ if (knlist_empty(knl)) {
+ knlist_destroy(knl);
+ free(knl, M_KQUEUE);
+ }
+}
+
/*
* Even if we are locked, we may need to drop the lock to allow any influx
* knotes time to "settle".
@@ -2212,6 +2469,7 @@ knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
struct knote *kn, *kn2;
struct kqueue *kq;
+ KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
if (islocked)
KNL_ASSERT_LOCKED(knl);
else {
@@ -2421,15 +2679,16 @@ SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
static struct knote *
knote_alloc(int waitok)
{
- return ((struct knote *)uma_zalloc(knote_zone,
- (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
+
+ return (uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT) |
+ M_ZERO));
}
static void
knote_free(struct knote *kn)
{
- if (kn != NULL)
- uma_zfree(knote_zone, kn);
+
+ uma_zfree(knote_zone, kn);
}
/*
@@ -2440,9 +2699,11 @@ kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
{
struct kqueue *kq;
struct file *fp;
+ cap_rights_t rights;
int error;
- if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
+ error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
+ if (error != 0)
return (error);
if ((error = kqueue_acquire(fp, &kq)) != 0)
goto noacquire;