From 1739d74f7dc53232fe20ed3ea9d8b4b0730b4025 Mon Sep 17 00:00:00 2001 From: Chris Johns Date: Thu, 22 Jul 2021 11:50:13 +1000 Subject: freebsd/sys: Import VFS support Update #4475 --- freebsd/sys/fs/deadfs/dead_vnops.c | 159 + freebsd/sys/fs/pseudofs/pseudofs.c | 491 +++ freebsd/sys/fs/pseudofs/pseudofs.h | 312 ++ freebsd/sys/fs/pseudofs/pseudofs_fileno.c | 159 + freebsd/sys/fs/pseudofs/pseudofs_internal.h | 213 + freebsd/sys/fs/pseudofs/pseudofs_vncache.c | 333 ++ freebsd/sys/fs/pseudofs/pseudofs_vnops.c | 1060 +++++ freebsd/sys/kern/kern_descrip.c | 4283 ++++++++++++++++++++ freebsd/sys/kern/kern_lock.c | 1719 ++++++++ freebsd/sys/kern/subr_pctrie.c | 695 ++++ freebsd/sys/kern/vfs_acl.c | 600 +++ freebsd/sys/kern/vfs_aio.c | 2987 ++++++++++++++ freebsd/sys/kern/vfs_bio.c | 5474 +++++++++++++++++++++++++ freebsd/sys/kern/vfs_cache.c | 2604 ++++++++++++ freebsd/sys/kern/vfs_cluster.c | 1086 +++++ freebsd/sys/kern/vfs_default.c | 1286 ++++++ freebsd/sys/kern/vfs_export.c | 528 +++ freebsd/sys/kern/vfs_extattr.c | 757 ++++ freebsd/sys/kern/vfs_hash.c | 234 ++ freebsd/sys/kern/vfs_init.c | 376 ++ freebsd/sys/kern/vfs_lookup.c | 1450 +++++++ freebsd/sys/kern/vfs_mount.c | 2052 ++++++++++ freebsd/sys/kern/vfs_subr.c | 5719 +++++++++++++++++++++++++++ freebsd/sys/kern/vfs_syscalls.c | 4748 ++++++++++++++++++++++ freebsd/sys/kern/vfs_vnops.c | 2607 ++++++++++++ freebsd/sys/sys/bio.h | 184 + freebsd/sys/sys/namei.h | 226 ++ freebsd/sys/sys/pctrie.h | 152 + freebsd/sys/sys/syscallsubr.h | 317 ++ freebsd/sys/sys/sysent.h | 327 ++ freebsd/sys/sys/vmem.h | 145 + freebsd/sys/vm/vm_meter.c | 561 +++ 32 files changed, 43844 insertions(+) create mode 100644 freebsd/sys/fs/deadfs/dead_vnops.c create mode 100644 freebsd/sys/fs/pseudofs/pseudofs.c create mode 100644 freebsd/sys/fs/pseudofs/pseudofs.h create mode 100644 freebsd/sys/fs/pseudofs/pseudofs_fileno.c create mode 100644 freebsd/sys/fs/pseudofs/pseudofs_internal.h create mode 100644 freebsd/sys/fs/pseudofs/pseudofs_vncache.c create mode 100644 freebsd/sys/fs/pseudofs/pseudofs_vnops.c create mode 100644 freebsd/sys/kern/kern_descrip.c create mode 100644 freebsd/sys/kern/kern_lock.c create mode 100644 freebsd/sys/kern/subr_pctrie.c create mode 100644 freebsd/sys/kern/vfs_acl.c create mode 100644 freebsd/sys/kern/vfs_aio.c create mode 100644 freebsd/sys/kern/vfs_bio.c create mode 100644 freebsd/sys/kern/vfs_cache.c create mode 100644 freebsd/sys/kern/vfs_cluster.c create mode 100644 freebsd/sys/kern/vfs_default.c create mode 100644 freebsd/sys/kern/vfs_export.c create mode 100644 freebsd/sys/kern/vfs_extattr.c create mode 100644 freebsd/sys/kern/vfs_hash.c create mode 100644 freebsd/sys/kern/vfs_init.c create mode 100644 freebsd/sys/kern/vfs_lookup.c create mode 100644 freebsd/sys/kern/vfs_mount.c create mode 100644 freebsd/sys/kern/vfs_subr.c create mode 100644 freebsd/sys/kern/vfs_syscalls.c create mode 100644 freebsd/sys/kern/vfs_vnops.c create mode 100644 freebsd/sys/sys/bio.h create mode 100644 freebsd/sys/sys/namei.h create mode 100644 freebsd/sys/sys/pctrie.h create mode 100644 freebsd/sys/sys/syscallsubr.h create mode 100644 freebsd/sys/sys/sysent.h create mode 100644 freebsd/sys/sys/vmem.h create mode 100644 freebsd/sys/vm/vm_meter.c diff --git a/freebsd/sys/fs/deadfs/dead_vnops.c b/freebsd/sys/fs/deadfs/dead_vnops.c new file mode 100644 index 00000000..a3153aed --- /dev/null +++ b/freebsd/sys/fs/deadfs/dead_vnops.c @@ -0,0 +1,159 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993 + * The Regents of the 
University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dead_vnops.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Prototypes for dead operations on vnodes. + */ +static vop_lookup_t dead_lookup; +static vop_open_t dead_open; +static vop_getwritemount_t dead_getwritemount; +static vop_rename_t dead_rename; +static vop_unset_text_t dead_unset_text; + +struct vop_vector dead_vnodeops = { + .vop_default = &default_vnodeops, + + .vop_access = VOP_EBADF, + .vop_advlock = VOP_EBADF, + .vop_bmap = VOP_EBADF, + .vop_create = VOP_PANIC, + .vop_getattr = VOP_EBADF, + .vop_getwritemount = dead_getwritemount, + .vop_inactive = VOP_NULL, + .vop_ioctl = VOP_EBADF, + .vop_link = VOP_PANIC, + .vop_lookup = dead_lookup, + .vop_mkdir = VOP_PANIC, + .vop_mknod = VOP_PANIC, + .vop_open = dead_open, + .vop_pathconf = VOP_EBADF, /* per pathconf(2) */ + .vop_poll = dead_poll, + .vop_read = dead_read, + .vop_readdir = VOP_EBADF, + .vop_readlink = VOP_EBADF, + .vop_reclaim = VOP_NULL, + .vop_remove = VOP_PANIC, + .vop_rename = dead_rename, + .vop_rmdir = VOP_PANIC, + .vop_setattr = VOP_EBADF, + .vop_symlink = VOP_PANIC, + .vop_vptocnp = VOP_EBADF, + .vop_unset_text = dead_unset_text, + .vop_write = dead_write, +}; + +static int +dead_getwritemount(struct vop_getwritemount_args *ap) +{ + + *(ap->a_mpp) = NULL; + return (0); +} + +/* + * Trivial lookup routine that always fails. + */ +static int +dead_lookup(struct vop_lookup_args *ap) +{ + + *ap->a_vpp = NULL; + return (ENOTDIR); +} + +/* + * Open always fails as if device did not exist. 
+ */ +static int +dead_open(struct vop_open_args *ap) +{ + + return (ENXIO); +} + +int +dead_read(struct vop_read_args *ap) +{ + + /* + * Return EOF for tty devices, EIO for others + */ + if ((ap->a_vp->v_vflag & VV_ISTTY) == 0) + return (EIO); + return (0); +} + +int +dead_write(struct vop_write_args *ap) +{ + + return (EIO); +} + +int +dead_poll(struct vop_poll_args *ap) +{ + + if (ap->a_events & ~POLLSTANDARD) + return (POLLNVAL); + + /* + * Let the user find out that the descriptor is gone. + */ + return (POLLHUP | ((POLLIN | POLLRDNORM) & ap->a_events)); + +} + +static int +dead_rename(struct vop_rename_args *ap) +{ + + vop_rename_fail(ap); + return (EXDEV); +} + +static int +dead_unset_text(struct vop_unset_text_args *ap) +{ + + return (0); +} diff --git a/freebsd/sys/fs/pseudofs/pseudofs.c b/freebsd/sys/fs/pseudofs/pseudofs.c new file mode 100644 index 00000000..73d3c7cb --- /dev/null +++ b/freebsd/sys/fs/pseudofs/pseudofs.c @@ -0,0 +1,491 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
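For context on the deadfs operations imported above: deadfs is never mounted. The VFS installs dead_vnodeops on a vnode when it is revoked or reclaimed, so that file descriptors still referencing it fail with EBADF/EIO/ENXIO instead of dereferencing freed per-filesystem state. A rough sketch of that hand-off, loosely modelled on vgonel() in vfs_subr.c (imported later in this patch); the helper name is illustrative only and not part of the import:

/*
 * Illustrative only: the real transition is performed by vgonel(),
 * which also tears down buffers, the VM object and locks first.
 */
static void
example_mark_vnode_dead(struct vnode *vp)
{

	/* From here on, every VOP_*() on vp resolves through deadfs. */
	vp->v_op = &dead_vnodeops;
	vp->v_type = VBAD;
}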
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_pseudofs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static MALLOC_DEFINE(M_PFSNODES, "pfs_nodes", "pseudofs nodes"); + +SYSCTL_NODE(_vfs, OID_AUTO, pfs, CTLFLAG_RW, 0, + "pseudofs"); + +#ifdef PSEUDOFS_TRACE +int pfs_trace; +SYSCTL_INT(_vfs_pfs, OID_AUTO, trace, CTLFLAG_RW, &pfs_trace, 0, + "enable tracing of pseudofs vnode operations"); +#endif + +#if PFS_FSNAMELEN != MFSNAMELEN +#error "PFS_FSNAMELEN is not equal to MFSNAMELEN" +#endif + +/* + * Allocate and initialize a node + */ +static struct pfs_node * +pfs_alloc_node_flags(struct pfs_info *pi, const char *name, pfs_type_t type, int flags) +{ + struct pfs_node *pn; + int malloc_flags; + + KASSERT(strlen(name) < PFS_NAMELEN, + ("%s(): node name is too long", __func__)); + if (flags & PFS_NOWAIT) + malloc_flags = M_NOWAIT | M_ZERO; + else + malloc_flags = M_WAITOK | M_ZERO; + pn = malloc(sizeof *pn, M_PFSNODES, malloc_flags); + if (pn == NULL) + return (NULL); + mtx_init(&pn->pn_mutex, "pfs_node", NULL, MTX_DEF | MTX_DUPOK); + strlcpy(pn->pn_name, name, sizeof pn->pn_name); + pn->pn_type = type; + pn->pn_info = pi; + return (pn); +} + +static struct pfs_node * +pfs_alloc_node(struct pfs_info *pi, const char *name, pfs_type_t type) +{ + return (pfs_alloc_node_flags(pi, name, type, 0)); +} + +/* + * Add a node to a directory + */ +static void +pfs_add_node(struct pfs_node *parent, struct pfs_node *pn) +{ +#ifdef INVARIANTS + struct pfs_node *iter; +#endif + + KASSERT(parent != NULL, + ("%s(): parent is NULL", __func__)); + KASSERT(pn->pn_parent == NULL, + ("%s(): node already has a parent", __func__)); + KASSERT(parent->pn_info != NULL, + ("%s(): parent has no pn_info", __func__)); + KASSERT(parent->pn_type == pfstype_dir || + parent->pn_type == pfstype_procdir || + parent->pn_type == pfstype_root, + ("%s(): parent is not a directory", __func__)); + +#ifdef INVARIANTS + /* XXX no locking! */ + if (pn->pn_type == pfstype_procdir) + for (iter = parent; iter != NULL; iter = iter->pn_parent) + KASSERT(iter->pn_type != pfstype_procdir, + ("%s(): nested process directories", __func__)); + for (iter = parent->pn_nodes; iter != NULL; iter = iter->pn_next) { + KASSERT(strcmp(pn->pn_name, iter->pn_name) != 0, + ("%s(): homonymous siblings", __func__)); + if (pn->pn_type == pfstype_procdir) + KASSERT(iter->pn_type != pfstype_procdir, + ("%s(): sibling process directories", __func__)); + } +#endif + + pn->pn_parent = parent; + pfs_fileno_alloc(pn); + + pfs_lock(parent); + pn->pn_next = parent->pn_nodes; + if ((parent->pn_flags & PFS_PROCDEP) != 0) + pn->pn_flags |= PFS_PROCDEP; + parent->pn_nodes = pn; + pfs_unlock(parent); +} + +/* + * Detach a node from its aprent + */ +static void +pfs_detach_node(struct pfs_node *pn) +{ + struct pfs_node *parent = pn->pn_parent; + struct pfs_node **iter; + + KASSERT(parent != NULL, ("%s(): node has no parent", __func__)); + KASSERT(parent->pn_info == pn->pn_info, + ("%s(): parent has different pn_info", __func__)); + + pfs_lock(parent); + iter = &parent->pn_nodes; + while (*iter != NULL) { + if (*iter == pn) { + *iter = pn->pn_next; + break; + } + iter = &(*iter)->pn_next; + } + pn->pn_parent = NULL; + pfs_unlock(parent); +} + +/* + * Add . and .. 
to a directory + */ +static int +pfs_fixup_dir_flags(struct pfs_node *parent, int flags) +{ + struct pfs_node *dot, *dotdot; + + dot = pfs_alloc_node_flags(parent->pn_info, ".", pfstype_this, flags); + if (dot == NULL) + return (ENOMEM); + dotdot = pfs_alloc_node_flags(parent->pn_info, "..", pfstype_parent, flags); + if (dotdot == NULL) { + pfs_destroy(dot); + return (ENOMEM); + } + pfs_add_node(parent, dot); + pfs_add_node(parent, dotdot); + return (0); +} + +static void +pfs_fixup_dir(struct pfs_node *parent) +{ + + pfs_fixup_dir_flags(parent, 0); +} + +/* + * Create a directory + */ +struct pfs_node * +pfs_create_dir(struct pfs_node *parent, const char *name, + pfs_attr_t attr, pfs_vis_t vis, pfs_destroy_t destroy, + int flags) +{ + struct pfs_node *pn; + int rc; + + pn = pfs_alloc_node_flags(parent->pn_info, name, + (flags & PFS_PROCDEP) ? pfstype_procdir : pfstype_dir, flags); + if (pn == NULL) + return (NULL); + pn->pn_attr = attr; + pn->pn_vis = vis; + pn->pn_destroy = destroy; + pn->pn_flags = flags; + pfs_add_node(parent, pn); + rc = pfs_fixup_dir_flags(pn, flags); + if (rc) { + pfs_destroy(pn); + return (NULL); + } + return (pn); +} + +/* + * Create a file + */ +struct pfs_node * +pfs_create_file(struct pfs_node *parent, const char *name, pfs_fill_t fill, + pfs_attr_t attr, pfs_vis_t vis, pfs_destroy_t destroy, + int flags) +{ + struct pfs_node *pn; + + pn = pfs_alloc_node_flags(parent->pn_info, name, pfstype_file, flags); + if (pn == NULL) + return (NULL); + pn->pn_fill = fill; + pn->pn_attr = attr; + pn->pn_vis = vis; + pn->pn_destroy = destroy; + pn->pn_flags = flags; + pfs_add_node(parent, pn); + + return (pn); +} + +/* + * Create a symlink + */ +struct pfs_node * +pfs_create_link(struct pfs_node *parent, const char *name, pfs_fill_t fill, + pfs_attr_t attr, pfs_vis_t vis, pfs_destroy_t destroy, + int flags) +{ + struct pfs_node *pn; + + pn = pfs_alloc_node_flags(parent->pn_info, name, pfstype_symlink, flags); + if (pn == NULL) + return (NULL); + pn->pn_fill = fill; + pn->pn_attr = attr; + pn->pn_vis = vis; + pn->pn_destroy = destroy; + pn->pn_flags = flags; + pfs_add_node(parent, pn); + + return (pn); +} + +/* + * Locate a node by name + */ +struct pfs_node * +pfs_find_node(struct pfs_node *parent, const char *name) +{ + struct pfs_node *pn; + + pfs_lock(parent); + for (pn = parent->pn_nodes; pn != NULL; pn = pn->pn_next) + if (strcmp(pn->pn_name, name) == 0) + break; + pfs_unlock(parent); + return (pn); +} + +/* + * Destroy a node and all its descendants. If the node to be destroyed + * has a parent, the parent's mutex must be held. 
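To make the constructors above (pfs_create_dir(), pfs_create_file(), pfs_create_link()) concrete, here is a minimal consumer sketch. The names examplefs_init and examplefs_version_fill are hypothetical and not part of this import; the fill callback itself is sketched further down where the fill interface is discussed. The init callback receives the instance's root node in pi->pi_root and simply hangs new nodes off it:

static int examplefs_version_fill(PFS_FILL_ARGS);

static int
examplefs_init(PFS_INIT_ARGS)
{
	struct pfs_node *dir;

	/* create a "status" directory and a read-only "version" file in it */
	dir = pfs_create_dir(pi->pi_root, "status", NULL, NULL, NULL, 0);
	pfs_create_file(dir, "version", examplefs_version_fill,
	    NULL, NULL, NULL, PFS_RD);
	return (0);
}

The NULL arguments are the optional attr, vis and destroy callbacks; flags besides PFS_RD (e.g. PFS_PROCDEP or PFS_NOWAIT) are defined in pseudofs.h below.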
+ */ +int +pfs_destroy(struct pfs_node *pn) +{ + struct pfs_node *iter; + + KASSERT(pn != NULL, + ("%s(): node is NULL", __func__)); + KASSERT(pn->pn_info != NULL, + ("%s(): node has no pn_info", __func__)); + + if (pn->pn_parent) + pfs_detach_node(pn); + + /* destroy children */ + if (pn->pn_type == pfstype_dir || + pn->pn_type == pfstype_procdir || + pn->pn_type == pfstype_root) { + pfs_lock(pn); + while (pn->pn_nodes != NULL) { + iter = pn->pn_nodes; + pn->pn_nodes = iter->pn_next; + iter->pn_parent = NULL; + pfs_unlock(pn); + pfs_destroy(iter); + pfs_lock(pn); + } + pfs_unlock(pn); + } + + /* revoke vnodes and fileno */ + pfs_purge(pn); + + /* callback to free any private resources */ + if (pn->pn_destroy != NULL) + pn_destroy(pn); + + /* destroy the node */ + pfs_fileno_free(pn); + mtx_destroy(&pn->pn_mutex); + free(pn, M_PFSNODES); + + return (0); +} + +/* + * Mount a pseudofs instance + */ +int +pfs_mount(struct pfs_info *pi, struct mount *mp) +{ + struct statfs *sbp; + + if (mp->mnt_flag & MNT_UPDATE) + return (EOPNOTSUPP); + + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_LOCAL; + MNT_IUNLOCK(mp); + mp->mnt_data = pi; + vfs_getnewfsid(mp); + + sbp = &mp->mnt_stat; + vfs_mountedfrom(mp, pi->pi_name); + sbp->f_bsize = PAGE_SIZE; + sbp->f_iosize = PAGE_SIZE; + sbp->f_blocks = 1; + sbp->f_bfree = 0; + sbp->f_bavail = 0; + sbp->f_files = 1; + sbp->f_ffree = 0; + + return (0); +} + +/* + * Compatibility shim for old mount(2) system call + */ +int +pfs_cmount(struct mntarg *ma, void *data, uint64_t flags) +{ + int error; + + error = kernel_mount(ma, flags); + return (error); +} + +/* + * Unmount a pseudofs instance + */ +int +pfs_unmount(struct mount *mp, int mntflags) +{ + int error; + + error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, + curthread); + return (error); +} + +/* + * Return a root vnode + */ +int +pfs_root(struct mount *mp, int flags, struct vnode **vpp) +{ + struct pfs_info *pi; + + pi = (struct pfs_info *)mp->mnt_data; + return (pfs_vncache_alloc(mp, vpp, pi->pi_root, NO_PID)); +} + +/* + * Return filesystem stats + */ +int +pfs_statfs(struct mount *mp, struct statfs *sbp) +{ + /* no-op: always called with mp->mnt_stat */ + return (0); +} + +/* + * Initialize a pseudofs instance + */ +int +pfs_init(struct pfs_info *pi, struct vfsconf *vfc) +{ + struct pfs_node *root; + int error; + + pfs_fileno_init(pi); + + /* set up the root directory */ + root = pfs_alloc_node(pi, "/", pfstype_root); + pi->pi_root = root; + pfs_fileno_alloc(root); + pfs_fixup_dir(root); + + /* construct file hierarchy */ + error = (pi->pi_init)(pi, vfc); + if (error) { + pfs_destroy(root); + pi->pi_root = NULL; + return (error); + } + + if (bootverbose) + printf("%s registered\n", pi->pi_name); + return (0); +} + +/* + * Destroy a pseudofs instance + */ +int +pfs_uninit(struct pfs_info *pi, struct vfsconf *vfc) +{ + int error; + + pfs_destroy(pi->pi_root); + pi->pi_root = NULL; + pfs_fileno_uninit(pi); + if (bootverbose) + printf("%s unregistered\n", pi->pi_name); + error = (pi->pi_uninit)(pi, vfc); + return (error); +} + +/* + * Handle load / unload events + */ +static int +pfs_modevent(module_t mod, int evt, void *arg) +{ + switch (evt) { + case MOD_LOAD: + pfs_vncache_load(); + break; + case MOD_UNLOAD: + case MOD_SHUTDOWN: + pfs_vncache_unload(); + break; + default: + return EOPNOTSUPP; + break; + } + return 0; +} + +/* + * Module declaration + */ +static moduledata_t pseudofs_data = { + "pseudofs", + pfs_modevent, + NULL +}; +DECLARE_MODULE(pseudofs, pseudofs_data, SI_SUB_EXEC, 
SI_ORDER_FIRST); +MODULE_VERSION(pseudofs, 1); diff --git a/freebsd/sys/fs/pseudofs/pseudofs.h b/freebsd/sys/fs/pseudofs/pseudofs.h new file mode 100644 index 00000000..602e1fbf --- /dev/null +++ b/freebsd/sys/fs/pseudofs/pseudofs.h @@ -0,0 +1,312 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _PSEUDOFS_H_INCLUDED +#define _PSEUDOFS_H_INCLUDED + +#include + +/* + * Opaque structures + */ +struct mntarg; +struct mount; +struct nameidata; +struct proc; +struct sbuf; +struct statfs; +struct thread; +struct uio; +struct vfsconf; +struct vnode; + +/* + * Limits and constants + */ +#define PFS_NAMELEN 128 +#define PFS_FSNAMELEN 16 /* equal to MFSNAMELEN */ +#define PFS_DELEN (offsetof(struct dirent, d_name) + PFS_NAMELEN) + +typedef enum { + pfstype_none = 0, + pfstype_root, + pfstype_dir, + pfstype_this, + pfstype_parent, + pfstype_file, + pfstype_symlink, + pfstype_procdir +} pfs_type_t; + +/* + * Flags + */ +#define PFS_RD 0x0001 /* readable */ +#define PFS_WR 0x0002 /* writeable */ +#define PFS_RDWR (PFS_RD|PFS_WR) +#define PFS_RAWRD 0x0004 /* raw reader */ +#define PFS_RAWWR 0x0008 /* raw writer */ +#define PFS_RAW (PFS_RAWRD|PFS_RAWWR) +#define PFS_PROCDEP 0x0010 /* process-dependent */ +#define PFS_NOWAIT 0x0020 /* allow malloc to fail */ + +/* + * Data structures + */ +struct pfs_info; +struct pfs_node; + +/* + * Init / uninit callback + */ +#define PFS_INIT_ARGS \ + struct pfs_info *pi, struct vfsconf *vfc +#define PFS_INIT_ARGNAMES \ + pi, vfc +#define PFS_INIT_PROTO(name) \ + int name(PFS_INIT_ARGS); +typedef int (*pfs_init_t)(PFS_INIT_ARGS); + +/* + * Filler callback + * Called with proc held but unlocked + */ +#define PFS_FILL_ARGS \ + struct thread *td, struct proc *p, struct pfs_node *pn, \ + struct sbuf *sb, struct uio *uio +#define PFS_FILL_ARGNAMES \ + td, p, pn, sb, uio +#define PFS_FILL_PROTO(name) \ + int name(PFS_FILL_ARGS); +typedef int (*pfs_fill_t)(PFS_FILL_ARGS); + +/* + * Attribute callback + * Called with proc locked + */ +struct vattr; +#define PFS_ATTR_ARGS \ + struct thread *td, struct proc *p, struct pfs_node *pn, \ + struct vattr *vap +#define PFS_ATTR_ARGNAMES \ + td, p, pn, vap +#define PFS_ATTR_PROTO(name) \ + int name(PFS_ATTR_ARGS); +typedef int (*pfs_attr_t)(PFS_ATTR_ARGS); + +/* + * Visibility callback + * Called with proc locked + */ +#define PFS_VIS_ARGS \ + struct thread *td, struct proc *p, struct pfs_node *pn +#define PFS_VIS_ARGNAMES \ + td, p, pn +#define PFS_VIS_PROTO(name) \ + int name(PFS_VIS_ARGS); +typedef int (*pfs_vis_t)(PFS_VIS_ARGS); + +/* + * Ioctl callback + * Called with proc locked + */ +#define PFS_IOCTL_ARGS \ + struct thread *td, struct proc *p, struct pfs_node *pn, \ + unsigned long cmd, void *data +#define PFS_IOCTL_ARGNAMES \ + td, p, pn, cmd, data +#define PFS_IOCTL_PROTO(name) \ + int name(PFS_IOCTL_ARGS); +typedef int (*pfs_ioctl_t)(PFS_IOCTL_ARGS); + +/* + * Getextattr callback + * Called with proc locked + */ +#define PFS_GETEXTATTR_ARGS \ + struct thread *td, struct proc *p, struct pfs_node *pn, \ + int attrnamespace, const char *name, struct uio *uio, \ + size_t *size, struct ucred *cred +#define PFS_GETEXTATTR_ARGNAMES \ + td, p, pn, attrnamespace, name, uio, size, cred +#define PFS_GETEXTATTR_PROTO(name) \ + int name(PFS_GETEXTATTR_ARGS); +struct ucred; +typedef int (*pfs_getextattr_t)(PFS_GETEXTATTR_ARGS); + +/* + * Last-close callback + * Called with proc locked + */ +#define PFS_CLOSE_ARGS \ + struct thread *td, struct proc *p, struct pfs_node *pn +#define PFS_CLOSE_ARGNAMES \ + td, p, pn +#define PFS_CLOSE_PROTO(name) \ + int name(PFS_CLOSE_ARGS); +typedef int (*pfs_close_t)(PFS_CLOSE_ARGS); + +/* + * Destroy callback + */ +#define PFS_DESTROY_ARGS \ + struct pfs_node *pn +#define PFS_DESTROY_ARGNAMES \ + pn +#define PFS_DESTROY_PROTO(name) \ + int name(PFS_DESTROY_ARGS); 
+typedef int (*pfs_destroy_t)(PFS_DESTROY_ARGS); + +/* + * pfs_info: describes a pseudofs instance + * + * The pi_mutex is only used to avoid using the global subr_unit lock + * for unrhdr. The rest of struct pfs_info is only modified during + * vfs_init() and vfs_uninit() of the consumer filesystem. + */ +struct pfs_info { + char pi_name[PFS_FSNAMELEN]; + pfs_init_t pi_init; + pfs_init_t pi_uninit; + + /* members below this line are initialized at run time */ + struct pfs_node *pi_root; + struct mtx pi_mutex; + struct unrhdr *pi_unrhdr; +}; + +/* + * pfs_node: describes a node (file or directory) within a pseudofs + * + * - Fields marked (o) are protected by the node's own mutex. + * - Fields marked (p) are protected by the node's parent's mutex. + * - Remaining fields are not protected by any lock and are assumed to be + * immutable once the node has been created. + * + * To prevent deadlocks, if a node's mutex is to be held at the same time + * as its parent's (e.g. when adding or removing nodes to a directory), + * the parent's mutex must always be acquired first. Unfortunately, this + * is not enforcable by WITNESS. + */ +struct pfs_node { + char pn_name[PFS_NAMELEN]; + pfs_type_t pn_type; + int pn_flags; + struct mtx pn_mutex; + void *pn_data; /* (o) */ + + pfs_fill_t pn_fill; + pfs_ioctl_t pn_ioctl; + pfs_close_t pn_close; + pfs_attr_t pn_attr; + pfs_vis_t pn_vis; + pfs_getextattr_t pn_getextattr; + pfs_destroy_t pn_destroy; + + struct pfs_info *pn_info; + u_int32_t pn_fileno; /* (o) */ + + struct pfs_node *pn_parent; /* (o) */ + struct pfs_node *pn_nodes; /* (o) */ + struct pfs_node *pn_next; /* (p) */ +}; + +/* + * VFS interface + */ +int pfs_mount (struct pfs_info *pi, struct mount *mp); +int pfs_cmount (struct mntarg *ma, void *data, uint64_t flags); +int pfs_unmount (struct mount *mp, int mntflags); +int pfs_root (struct mount *mp, int flags, + struct vnode **vpp); +int pfs_statfs (struct mount *mp, struct statfs *sbp); +int pfs_init (struct pfs_info *pi, struct vfsconf *vfc); +int pfs_uninit (struct pfs_info *pi, struct vfsconf *vfc); + +/* + * Directory structure construction and manipulation + */ +struct pfs_node *pfs_create_dir (struct pfs_node *parent, const char *name, + pfs_attr_t attr, pfs_vis_t vis, + pfs_destroy_t destroy, int flags); +struct pfs_node *pfs_create_file(struct pfs_node *parent, const char *name, + pfs_fill_t fill, pfs_attr_t attr, + pfs_vis_t vis, pfs_destroy_t destroy, + int flags); +struct pfs_node *pfs_create_link(struct pfs_node *parent, const char *name, + pfs_fill_t fill, pfs_attr_t attr, + pfs_vis_t vis, pfs_destroy_t destroy, + int flags); +struct pfs_node *pfs_find_node (struct pfs_node *parent, const char *name); +void pfs_purge (struct pfs_node *pn); +int pfs_destroy (struct pfs_node *pn); + +/* + * Now for some initialization magic... 
+ */ +#define PSEUDOFS(name, version, flags) \ + \ +static struct pfs_info name##_info = { \ + #name, \ + name##_init, \ + name##_uninit, \ +}; \ + \ +static int \ +_##name##_mount(struct mount *mp) { \ + return (pfs_mount(&name##_info, mp)); \ +} \ + \ +static int \ +_##name##_init(struct vfsconf *vfc) { \ + return (pfs_init(&name##_info, vfc)); \ +} \ + \ +static int \ +_##name##_uninit(struct vfsconf *vfc) { \ + return (pfs_uninit(&name##_info, vfc)); \ +} \ + \ +static struct vfsops name##_vfsops = { \ + .vfs_cmount = pfs_cmount, \ + .vfs_init = _##name##_init, \ + .vfs_mount = _##name##_mount, \ + .vfs_root = pfs_root, \ + .vfs_statfs = pfs_statfs, \ + .vfs_uninit = _##name##_uninit, \ + .vfs_unmount = pfs_unmount, \ +}; \ +VFS_SET(name##_vfsops, name, VFCF_SYNTHETIC | flags); \ +MODULE_VERSION(name, version); \ +MODULE_DEPEND(name, pseudofs, 1, 1, 1); + +#endif diff --git a/freebsd/sys/fs/pseudofs/pseudofs_fileno.c b/freebsd/sys/fs/pseudofs/pseudofs_fileno.c new file mode 100644 index 00000000..2c6b2d1f --- /dev/null +++ b/freebsd/sys/fs/pseudofs/pseudofs_fileno.c @@ -0,0 +1,159 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
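The PSEUDOFS() macro above is the piece that turns an init/uninit pair into a registered synthetic filesystem: it generates the vfsops glue (pfs_mount, pfs_root, pfs_statfs, ...) plus the VFS_SET() and MODULE_VERSION() boilerplate. Continuing the hypothetical examplefs sketch started earlier:

static int
examplefs_uninit(PFS_INIT_ARGS)
{

	/* nothing was allocated outside the node tree, so nothing to do */
	return (0);
}

PSEUDOFS(examplefs, 1, 0);

The third macro argument is OR'd with VFCF_SYNTHETIC by VFS_SET(), so 0 is a sensible default unless the consumer needs additional VFCF_* flags.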
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_pseudofs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Initialize fileno bitmap + */ +void +pfs_fileno_init(struct pfs_info *pi) +{ + + mtx_init(&pi->pi_mutex, "pfs_fileno", NULL, MTX_DEF); + pi->pi_unrhdr = new_unrhdr(3, INT_MAX / NO_PID, &pi->pi_mutex); +} + +/* + * Tear down fileno bitmap + */ +void +pfs_fileno_uninit(struct pfs_info *pi) +{ + + delete_unrhdr(pi->pi_unrhdr); + pi->pi_unrhdr = NULL; + mtx_destroy(&pi->pi_mutex); +} + +/* + * Allocate a file number + */ +void +pfs_fileno_alloc(struct pfs_node *pn) +{ + + if (pn->pn_parent) + PFS_TRACE(("%s/%s", pn->pn_parent->pn_name, pn->pn_name)); + else + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + switch (pn->pn_type) { + case pfstype_root: + /* root must always be 2 */ + pn->pn_fileno = 2; + break; + case pfstype_dir: + case pfstype_file: + case pfstype_symlink: + case pfstype_procdir: + pn->pn_fileno = alloc_unr(pn->pn_info->pi_unrhdr); + break; + case pfstype_this: + KASSERT(pn->pn_parent != NULL, + ("%s(): pfstype_this node has no parent", __func__)); + pn->pn_fileno = pn->pn_parent->pn_fileno; + break; + case pfstype_parent: + KASSERT(pn->pn_parent != NULL, + ("%s(): pfstype_parent node has no parent", __func__)); + if (pn->pn_parent->pn_type == pfstype_root) { + pn->pn_fileno = pn->pn_parent->pn_fileno; + break; + } + KASSERT(pn->pn_parent->pn_parent != NULL, + ("%s(): pfstype_parent node has no grandparent", __func__)); + pn->pn_fileno = pn->pn_parent->pn_parent->pn_fileno; + break; + case pfstype_none: + KASSERT(0, + ("%s(): pfstype_none node", __func__)); + break; + } + +#if 0 + printf("%s(): %s: ", __func__, pn->pn_info->pi_name); + if (pn->pn_parent) { + if (pn->pn_parent->pn_parent) { + printf("%s/", pn->pn_parent->pn_parent->pn_name); + } + printf("%s/", pn->pn_parent->pn_name); + } + printf("%s -> %d\n", pn->pn_name, pn->pn_fileno); +#endif +} + +/* + * Release a file number + */ +void +pfs_fileno_free(struct pfs_node *pn) +{ + + pfs_assert_not_owned(pn); + + switch (pn->pn_type) { + case pfstype_root: + /* not allocated from unrhdr */ + return; + case pfstype_dir: + case pfstype_file: + case pfstype_symlink: + case pfstype_procdir: + free_unr(pn->pn_info->pi_unrhdr, pn->pn_fileno); + break; + case pfstype_this: + case pfstype_parent: + /* ignore these, as they don't "own" their file number */ + break; + case pfstype_none: + KASSERT(0, + ("pfs_fileno_free() called for pfstype_none node")); + break; + } +} diff --git a/freebsd/sys/fs/pseudofs/pseudofs_internal.h b/freebsd/sys/fs/pseudofs/pseudofs_internal.h new file mode 100644 index 00000000..3ec49e71 --- /dev/null +++ b/freebsd/sys/fs/pseudofs/pseudofs_internal.h @@ -0,0 +1,213 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PSEUDOFS_INTERNAL_H_INCLUDED +#define _PSEUDOFS_INTERNAL_H_INCLUDED + +/* + * Sysctl subtree + */ +SYSCTL_DECL(_vfs_pfs); + +/* + * Vnode data + */ +struct pfs_vdata { + struct pfs_node *pvd_pn; + pid_t pvd_pid; + struct vnode *pvd_vnode; + struct pfs_vdata*pvd_prev, *pvd_next; + int pvd_dead:1; +}; + +/* + * Vnode cache + */ +void pfs_vncache_load (void); +void pfs_vncache_unload (void); +int pfs_vncache_alloc (struct mount *, struct vnode **, + struct pfs_node *, pid_t pid); +int pfs_vncache_free (struct vnode *); + +/* + * File number bitmap + */ +void pfs_fileno_init (struct pfs_info *); +void pfs_fileno_uninit (struct pfs_info *); +void pfs_fileno_alloc (struct pfs_node *); +void pfs_fileno_free (struct pfs_node *); + +/* + * Debugging + */ +#ifdef PSEUDOFS_TRACE +extern int pfs_trace; + +#define PFS_TRACE(foo) \ + do { \ + if (pfs_trace) { \ + printf("%s(): line %d: ", __func__, __LINE__); \ + printf foo ; \ + printf("\n"); \ + } \ + } while (0) +#define PFS_RETURN(err) \ + do { \ + if (pfs_trace) { \ + printf("%s(): line %d: returning %d\n", \ + __func__, __LINE__, err); \ + } \ + return (err); \ + } while (0) +#else +#define PFS_TRACE(foo) \ + do { /* nothing */ } while (0) +#define PFS_RETURN(err) \ + return (err) +#endif + +/* + * Inline helpers for locking + */ +static inline void +pfs_lock(struct pfs_node *pn) +{ + + mtx_lock(&pn->pn_mutex); +} + +static inline void +pfs_unlock(struct pfs_node *pn) +{ + + mtx_unlock(&pn->pn_mutex); +} + +static inline void +pfs_assert_owned(struct pfs_node *pn) +{ + + mtx_assert(&pn->pn_mutex, MA_OWNED); +} + +static inline void +pfs_assert_not_owned(struct pfs_node *pn) +{ + + mtx_assert(&pn->pn_mutex, MA_NOTOWNED); +} + +static inline int +pn_fill(PFS_FILL_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_fill != NULL, ("%s(): no callback", __func__)); + if (p != NULL) { + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + PROC_ASSERT_HELD(p); + } + pfs_assert_not_owned(pn); + return ((pn->pn_fill)(PFS_FILL_ARGNAMES)); +} + +static inline int +pn_attr(PFS_ATTR_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_attr != NULL, ("%s(): no callback", __func__)); + if (p != NULL) + PROC_LOCK_ASSERT(p, MA_OWNED); + pfs_assert_not_owned(pn); + return ((pn->pn_attr)(PFS_ATTR_ARGNAMES)); +} + +static inline int +pn_vis(PFS_VIS_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_vis != NULL, ("%s(): no callback", __func__)); + KASSERT(p != NULL, ("%s(): no process", __func__)); + PROC_LOCK_ASSERT(p, MA_OWNED); + pfs_assert_not_owned(pn); + return ((pn->pn_vis)(PFS_VIS_ARGNAMES)); +} + +static inline int 
+pn_ioctl(PFS_IOCTL_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_ioctl != NULL, ("%s(): no callback", __func__)); + if (p != NULL) + PROC_LOCK_ASSERT(p, MA_OWNED); + pfs_assert_not_owned(pn); + return ((pn->pn_ioctl)(PFS_IOCTL_ARGNAMES)); +} + +static inline int +pn_getextattr(PFS_GETEXTATTR_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_getextattr != NULL, ("%s(): no callback", __func__)); + if (p != NULL) + PROC_LOCK_ASSERT(p, MA_OWNED); + pfs_assert_not_owned(pn); + return ((pn->pn_getextattr)(PFS_GETEXTATTR_ARGNAMES)); +} + +static inline int +pn_close(PFS_CLOSE_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_close != NULL, ("%s(): no callback", __func__)); + if (p != NULL) + PROC_LOCK_ASSERT(p, MA_OWNED); + pfs_assert_not_owned(pn); + return ((pn->pn_close)(PFS_CLOSE_ARGNAMES)); +} + +static inline int +pn_destroy(PFS_DESTROY_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_destroy != NULL, ("%s(): no callback", __func__)); + pfs_assert_not_owned(pn); + return ((pn->pn_destroy)(PFS_DESTROY_ARGNAMES)); +} + +#endif diff --git a/freebsd/sys/fs/pseudofs/pseudofs_vncache.c b/freebsd/sys/fs/pseudofs/pseudofs_vncache.c new file mode 100644 index 00000000..05dd6569 --- /dev/null +++ b/freebsd/sys/fs/pseudofs/pseudofs_vncache.c @@ -0,0 +1,333 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
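The pn_fill() wrapper above is where a node's fill callback is ultimately invoked. For an ordinary (non-raw) node the callback only appends text to the supplied sbuf; the uio handling is done by pfs_read()/pfs_write() in pseudofs_vnops.c. A minimal sketch of the hypothetical examplefs_version_fill() referenced earlier:

static int
examplefs_version_fill(PFS_FILL_ARGS)
{

	/* p may be NULL unless the node is process-dependent */
	sbuf_printf(sb, "examplefs 1.0 (pid %d)\n",
	    p != NULL ? (int)p->p_pid : -1);
	return (0);
}

Nodes created with PFS_RAWRD or PFS_RAWWR are instead called with sb == NULL and are expected to operate on the uio directly, as can be seen in the PFS_RAWRD branch of pfs_read() further down.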
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_pseudofs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static MALLOC_DEFINE(M_PFSVNCACHE, "pfs_vncache", "pseudofs vnode cache"); + +static struct mtx pfs_vncache_mutex; +static struct pfs_vdata *pfs_vncache; +static eventhandler_tag pfs_exit_tag; +static void pfs_exit(void *arg, struct proc *p); +static void pfs_purge_locked(struct pfs_node *pn, bool force); + +static SYSCTL_NODE(_vfs_pfs, OID_AUTO, vncache, CTLFLAG_RW, 0, + "pseudofs vnode cache"); + +static int pfs_vncache_entries; +SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, entries, CTLFLAG_RD, + &pfs_vncache_entries, 0, + "number of entries in the vnode cache"); + +static int pfs_vncache_maxentries; +SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, maxentries, CTLFLAG_RD, + &pfs_vncache_maxentries, 0, + "highest number of entries in the vnode cache"); + +static int pfs_vncache_hits; +SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, hits, CTLFLAG_RD, + &pfs_vncache_hits, 0, + "number of cache hits since initialization"); + +static int pfs_vncache_misses; +SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, misses, CTLFLAG_RD, + &pfs_vncache_misses, 0, + "number of cache misses since initialization"); + +extern struct vop_vector pfs_vnodeops; /* XXX -> .h file */ + +/* + * Initialize vnode cache + */ +void +pfs_vncache_load(void) +{ + + mtx_init(&pfs_vncache_mutex, "pfs_vncache", NULL, MTX_DEF); + pfs_exit_tag = EVENTHANDLER_REGISTER(process_exit, pfs_exit, NULL, + EVENTHANDLER_PRI_ANY); +} + +/* + * Tear down vnode cache + */ +void +pfs_vncache_unload(void) +{ + + EVENTHANDLER_DEREGISTER(process_exit, pfs_exit_tag); + mtx_lock(&pfs_vncache_mutex); + pfs_purge_locked(NULL, true); + mtx_unlock(&pfs_vncache_mutex); + KASSERT(pfs_vncache_entries == 0, + ("%d vncache entries remaining", pfs_vncache_entries)); + mtx_destroy(&pfs_vncache_mutex); +} + +/* + * Allocate a vnode + */ +int +pfs_vncache_alloc(struct mount *mp, struct vnode **vpp, + struct pfs_node *pn, pid_t pid) +{ + struct pfs_vdata *pvd, *pvd2; + struct vnode *vp; + int error; + + /* + * See if the vnode is in the cache. + * XXX linear search is not very efficient. + */ +retry: + mtx_lock(&pfs_vncache_mutex); + for (pvd = pfs_vncache; pvd; pvd = pvd->pvd_next) { + if (pvd->pvd_pn == pn && pvd->pvd_pid == pid && + pvd->pvd_vnode->v_mount == mp) { + vp = pvd->pvd_vnode; + VI_LOCK(vp); + mtx_unlock(&pfs_vncache_mutex); + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) { + ++pfs_vncache_hits; + *vpp = vp; + /* + * Some callers cache_enter(vp) later, so + * we have to make sure it's not in the + * VFS cache so it doesn't get entered + * twice. A better solution would be to + * make pfs_vncache_alloc() responsible + * for entering the vnode in the VFS + * cache. 
+ */ + cache_purge(vp); + return (0); + } + goto retry; + } + } + mtx_unlock(&pfs_vncache_mutex); + + /* nope, get a new one */ + pvd = malloc(sizeof *pvd, M_PFSVNCACHE, M_WAITOK); + pvd->pvd_next = pvd->pvd_prev = NULL; + error = getnewvnode("pseudofs", mp, &pfs_vnodeops, vpp); + if (error) { + free(pvd, M_PFSVNCACHE); + return (error); + } + pvd->pvd_pn = pn; + pvd->pvd_pid = pid; + (*vpp)->v_data = pvd; + switch (pn->pn_type) { + case pfstype_root: + (*vpp)->v_vflag = VV_ROOT; +#if 0 + printf("root vnode allocated\n"); +#endif + /* fall through */ + case pfstype_dir: + case pfstype_this: + case pfstype_parent: + case pfstype_procdir: + (*vpp)->v_type = VDIR; + break; + case pfstype_file: + (*vpp)->v_type = VREG; + break; + case pfstype_symlink: + (*vpp)->v_type = VLNK; + break; + case pfstype_none: + KASSERT(0, ("pfs_vncache_alloc called for null node\n")); + default: + panic("%s has unexpected type: %d", pn->pn_name, pn->pn_type); + } + /* + * Propagate flag through to vnode so users know it can change + * if the process changes (i.e. execve) + */ + if ((pn->pn_flags & PFS_PROCDEP) != 0) + (*vpp)->v_vflag |= VV_PROCDEP; + pvd->pvd_vnode = *vpp; + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); + VN_LOCK_AREC(*vpp); + error = insmntque(*vpp, mp); + if (error != 0) { + free(pvd, M_PFSVNCACHE); + *vpp = NULLVP; + return (error); + } +retry2: + mtx_lock(&pfs_vncache_mutex); + /* + * Other thread may race with us, creating the entry we are + * going to insert into the cache. Recheck after + * pfs_vncache_mutex is reacquired. + */ + for (pvd2 = pfs_vncache; pvd2; pvd2 = pvd2->pvd_next) { + if (pvd2->pvd_pn == pn && pvd2->pvd_pid == pid && + pvd2->pvd_vnode->v_mount == mp) { + vp = pvd2->pvd_vnode; + VI_LOCK(vp); + mtx_unlock(&pfs_vncache_mutex); + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) { + ++pfs_vncache_hits; + vgone(*vpp); + vput(*vpp); + *vpp = vp; + cache_purge(vp); + return (0); + } + goto retry2; + } + } + ++pfs_vncache_misses; + if (++pfs_vncache_entries > pfs_vncache_maxentries) + pfs_vncache_maxentries = pfs_vncache_entries; + pvd->pvd_prev = NULL; + pvd->pvd_next = pfs_vncache; + if (pvd->pvd_next) + pvd->pvd_next->pvd_prev = pvd; + pfs_vncache = pvd; + mtx_unlock(&pfs_vncache_mutex); + return (0); +} + +/* + * Free a vnode + */ +int +pfs_vncache_free(struct vnode *vp) +{ + struct pfs_vdata *pvd; + + mtx_lock(&pfs_vncache_mutex); + pvd = (struct pfs_vdata *)vp->v_data; + KASSERT(pvd != NULL, ("pfs_vncache_free(): no vnode data\n")); + if (pvd->pvd_next) + pvd->pvd_next->pvd_prev = pvd->pvd_prev; + if (pvd->pvd_prev) { + pvd->pvd_prev->pvd_next = pvd->pvd_next; + --pfs_vncache_entries; + } else if (pfs_vncache == pvd) { + pfs_vncache = pvd->pvd_next; + --pfs_vncache_entries; + } + mtx_unlock(&pfs_vncache_mutex); + + free(pvd, M_PFSVNCACHE); + vp->v_data = NULL; + return (0); +} + +/* + * Purge the cache of dead entries + * + * This is extremely inefficient due to the fact that vgone() not only + * indirectly modifies the vnode cache, but may also sleep. We can + * neither hold pfs_vncache_mutex across a vgone() call, nor make any + * assumptions about the state of the cache after vgone() returns. In + * consequence, we must start over after every vgone() call, and keep + * trying until we manage to traverse the entire cache. + * + * The only way to improve this situation is to change the data structure + * used to implement the cache. 
+ */ +static void +pfs_purge_locked(struct pfs_node *pn, bool force) +{ + struct pfs_vdata *pvd; + struct vnode *vnp; + + mtx_assert(&pfs_vncache_mutex, MA_OWNED); + pvd = pfs_vncache; + while (pvd != NULL) { + if (force || pvd->pvd_dead || + (pn != NULL && pvd->pvd_pn == pn)) { + vnp = pvd->pvd_vnode; + vhold(vnp); + mtx_unlock(&pfs_vncache_mutex); + VOP_LOCK(vnp, LK_EXCLUSIVE); + vgone(vnp); + VOP_UNLOCK(vnp, 0); + mtx_lock(&pfs_vncache_mutex); + vdrop(vnp); + pvd = pfs_vncache; + } else { + pvd = pvd->pvd_next; + } + } +} + +void +pfs_purge(struct pfs_node *pn) +{ + + mtx_lock(&pfs_vncache_mutex); + pfs_purge_locked(pn, false); + mtx_unlock(&pfs_vncache_mutex); +} + +/* + * Free all vnodes associated with a defunct process + */ +static void +pfs_exit(void *arg, struct proc *p) +{ + struct pfs_vdata *pvd; + int dead; + + if (pfs_vncache == NULL) + return; + mtx_lock(&pfs_vncache_mutex); + for (pvd = pfs_vncache, dead = 0; pvd != NULL; pvd = pvd->pvd_next) + if (pvd->pvd_pid == p->p_pid) + dead = pvd->pvd_dead = 1; + if (dead) + pfs_purge_locked(NULL, false); + mtx_unlock(&pfs_vncache_mutex); +} diff --git a/freebsd/sys/fs/pseudofs/pseudofs_vnops.c b/freebsd/sys/fs/pseudofs/pseudofs_vnops.c new file mode 100644 index 00000000..da35f062 --- /dev/null +++ b/freebsd/sys/fs/pseudofs/pseudofs_vnops.c @@ -0,0 +1,1060 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
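pfs_purge() above is what makes runtime teardown of nodes safe: pfs_destroy() (in pseudofs.c) detaches the node, recursively destroys its children, revokes any cached vnodes via the purge path, and only then frees it. A consumer removing a node it created earlier therefore needs no more than the following (hypothetical helper, continuing the examplefs sketch):

static void
examplefs_remove_status(struct pfs_node *parent)
{
	struct pfs_node *pn;

	pn = pfs_find_node(parent, "status");
	if (pn != NULL)
		pfs_destroy(pn);	/* also takes out ".", ".." and any files */
}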
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_pseudofs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define KASSERT_PN_IS_DIR(pn) \ + KASSERT((pn)->pn_type == pfstype_root || \ + (pn)->pn_type == pfstype_dir || \ + (pn)->pn_type == pfstype_procdir, \ + ("%s(): VDIR vnode refers to non-directory pfs_node", __func__)) + +#define KASSERT_PN_IS_FILE(pn) \ + KASSERT((pn)->pn_type == pfstype_file, \ + ("%s(): VREG vnode refers to non-file pfs_node", __func__)) + +#define KASSERT_PN_IS_LINK(pn) \ + KASSERT((pn)->pn_type == pfstype_symlink, \ + ("%s(): VLNK vnode refers to non-link pfs_node", __func__)) + +/* + * Returns the fileno, adjusted for target pid + */ +static uint32_t +pn_fileno(struct pfs_node *pn, pid_t pid) +{ + + KASSERT(pn->pn_fileno > 0, + ("%s(): no fileno allocated", __func__)); + if (pid != NO_PID) + return (pn->pn_fileno * NO_PID + pid); + return (pn->pn_fileno); +} + +/* + * Returns non-zero if given file is visible to given thread. + */ +static int +pfs_visible_proc(struct thread *td, struct pfs_node *pn, struct proc *proc) +{ + int visible; + + if (proc == NULL) + return (0); + + PROC_LOCK_ASSERT(proc, MA_OWNED); + + visible = ((proc->p_flag & P_WEXIT) == 0); + if (visible) + visible = (p_cansee(td, proc) == 0); + if (visible && pn->pn_vis != NULL) + visible = pn_vis(td, proc, pn); + if (!visible) + return (0); + return (1); +} + +static int +pfs_visible(struct thread *td, struct pfs_node *pn, pid_t pid, + bool allproc_locked, struct proc **p) +{ + struct proc *proc; + + PFS_TRACE(("%s (pid: %d, req: %d)", + pn->pn_name, pid, td->td_proc->p_pid)); + + if (p) + *p = NULL; + if (pid == NO_PID) + PFS_RETURN (1); + proc = allproc_locked ? pfind_locked(pid) : pfind(pid); + if (proc == NULL) + PFS_RETURN (0); + if (pfs_visible_proc(td, pn, proc)) { + if (p) + *p = proc; + else + PROC_UNLOCK(proc); + PFS_RETURN (1); + } + PROC_UNLOCK(proc); + PFS_RETURN (0); +} + +/* + * Verify permissions + */ +static int +pfs_access(struct vop_access_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct vattr vattr; + int error; + + PFS_TRACE(("%s", pvd->pvd_pn->pn_name)); + (void)pvd; + + error = VOP_GETATTR(vn, &vattr, va->a_cred); + if (error) + PFS_RETURN (error); + error = vaccess(vn->v_type, vattr.va_mode, vattr.va_uid, + vattr.va_gid, va->a_accmode, va->a_cred, NULL); + PFS_RETURN (error); +} + +/* + * Close a file or directory + */ +static int +pfs_close(struct vop_close_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + struct proc *proc; + int error; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + /* + * Do nothing unless this is the last close and the node has a + * last-close handler. 
+ */ + if (vrefcnt(vn) > 1 || pn->pn_close == NULL) + PFS_RETURN (0); + + if (pvd->pvd_pid != NO_PID) { + proc = pfind(pvd->pvd_pid); + } else { + proc = NULL; + } + + error = pn_close(va->a_td, proc, pn); + + if (proc != NULL) + PROC_UNLOCK(proc); + + PFS_RETURN (error); +} + +/* + * Get file attributes + */ +static int +pfs_getattr(struct vop_getattr_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + struct vattr *vap = va->a_vap; + struct proc *proc; + int error = 0; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc)) + PFS_RETURN (ENOENT); + + vap->va_type = vn->v_type; + vap->va_fileid = pn_fileno(pn, pvd->pvd_pid); + vap->va_flags = 0; + vap->va_blocksize = PAGE_SIZE; + vap->va_bytes = vap->va_size = 0; + vap->va_filerev = 0; + vap->va_fsid = vn->v_mount->mnt_stat.f_fsid.val[0]; + vap->va_nlink = 1; + nanotime(&vap->va_ctime); + vap->va_atime = vap->va_mtime = vap->va_ctime; + + switch (pn->pn_type) { + case pfstype_procdir: + case pfstype_root: + case pfstype_dir: +#if 0 + pfs_lock(pn); + /* compute link count */ + pfs_unlock(pn); +#endif + vap->va_mode = 0555; + break; + case pfstype_file: + case pfstype_symlink: + vap->va_mode = 0444; + break; + default: + printf("shouldn't be here!\n"); + vap->va_mode = 0; + break; + } + + if (proc != NULL) { + vap->va_uid = proc->p_ucred->cr_ruid; + vap->va_gid = proc->p_ucred->cr_rgid; + } else { + vap->va_uid = 0; + vap->va_gid = 0; + } + + if (pn->pn_attr != NULL) + error = pn_attr(curthread, proc, pn, vap); + + if(proc != NULL) + PROC_UNLOCK(proc); + + PFS_RETURN (error); +} + +/* + * Perform an ioctl + */ +static int +pfs_ioctl(struct vop_ioctl_args *va) +{ + struct vnode *vn; + struct pfs_vdata *pvd; + struct pfs_node *pn; + struct proc *proc; + int error; + + vn = va->a_vp; + vn_lock(vn, LK_SHARED | LK_RETRY); + if (vn->v_iflag & VI_DOOMED) { + VOP_UNLOCK(vn, 0); + return (EBADF); + } + pvd = vn->v_data; + pn = pvd->pvd_pn; + + PFS_TRACE(("%s: %lx", pn->pn_name, va->a_command)); + pfs_assert_not_owned(pn); + + if (vn->v_type != VREG) { + VOP_UNLOCK(vn, 0); + PFS_RETURN (EINVAL); + } + KASSERT_PN_IS_FILE(pn); + + if (pn->pn_ioctl == NULL) { + VOP_UNLOCK(vn, 0); + PFS_RETURN (ENOTTY); + } + + /* + * This is necessary because process' privileges may + * have changed since the open() call. + */ + if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc)) { + VOP_UNLOCK(vn, 0); + PFS_RETURN (EIO); + } + + error = pn_ioctl(curthread, proc, pn, va->a_command, va->a_data); + + if (proc != NULL) + PROC_UNLOCK(proc); + + VOP_UNLOCK(vn, 0); + PFS_RETURN (error); +} + +/* + * Perform getextattr + */ +static int +pfs_getextattr(struct vop_getextattr_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + struct proc *proc; + int error; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + /* + * This is necessary because either process' privileges may + * have changed since the open() call. 
+ */ + if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc)) + PFS_RETURN (EIO); + + if (pn->pn_getextattr == NULL) + error = EOPNOTSUPP; + else + error = pn_getextattr(curthread, proc, pn, + va->a_attrnamespace, va->a_name, va->a_uio, + va->a_size, va->a_cred); + + if (proc != NULL) + PROC_UNLOCK(proc); + + PFS_RETURN (error); +} + +/* + * Convert a vnode to its component name + */ +static int +pfs_vptocnp(struct vop_vptocnp_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct vnode **dvp = ap->a_vpp; + struct pfs_vdata *pvd = vp->v_data; + struct pfs_node *pd = pvd->pvd_pn; + struct pfs_node *pn; + struct mount *mp; + char *buf = ap->a_buf; + int *buflen = ap->a_buflen; + char pidbuf[PFS_NAMELEN]; + pid_t pid = pvd->pvd_pid; + int len, i, error, locked; + + i = *buflen; + error = 0; + + pfs_lock(pd); + + if (vp->v_type == VDIR && pd->pn_type == pfstype_root) { + *dvp = vp; + vhold(*dvp); + pfs_unlock(pd); + PFS_RETURN (0); + } else if (vp->v_type == VDIR && pd->pn_type == pfstype_procdir) { + len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid); + i -= len; + if (i < 0) { + error = ENOMEM; + goto failed; + } + bcopy(pidbuf, buf + i, len); + } else { + len = strlen(pd->pn_name); + i -= len; + if (i < 0) { + error = ENOMEM; + goto failed; + } + bcopy(pd->pn_name, buf + i, len); + } + + pn = pd->pn_parent; + pfs_unlock(pd); + + mp = vp->v_mount; + error = vfs_busy(mp, 0); + if (error) + return (error); + + /* + * vp is held by caller. + */ + locked = VOP_ISLOCKED(vp); + VOP_UNLOCK(vp, 0); + + error = pfs_vncache_alloc(mp, dvp, pn, pid); + if (error) { + vn_lock(vp, locked | LK_RETRY); + vfs_unbusy(mp); + PFS_RETURN(error); + } + + *buflen = i; + VOP_UNLOCK(*dvp, 0); + vn_lock(vp, locked | LK_RETRY); + vfs_unbusy(mp); + + PFS_RETURN (0); +failed: + pfs_unlock(pd); + PFS_RETURN(error); +} + +/* + * Look up a file or directory + */ +static int +pfs_lookup(struct vop_cachedlookup_args *va) +{ + struct vnode *vn = va->a_dvp; + struct vnode **vpp = va->a_vpp; + struct componentname *cnp = va->a_cnp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pd = pvd->pvd_pn; + struct pfs_node *pn, *pdn = NULL; + struct mount *mp; + pid_t pid = pvd->pvd_pid; + char *pname; + int error, i, namelen, visible; + + PFS_TRACE(("%.*s", (int)cnp->cn_namelen, cnp->cn_nameptr)); + pfs_assert_not_owned(pd); + + if (vn->v_type != VDIR) + PFS_RETURN (ENOTDIR); + KASSERT_PN_IS_DIR(pd); + + /* + * Don't support DELETE or RENAME. CREATE is supported so + * that O_CREAT will work, but the lookup will still fail if + * the file does not exist. + */ + if ((cnp->cn_flags & ISLASTCN) && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) + PFS_RETURN (EOPNOTSUPP); + + /* shortcut: check if the name is too long */ + if (cnp->cn_namelen >= PFS_NAMELEN) + PFS_RETURN (ENOENT); + + /* check that parent directory is visible... 
*/ + if (!pfs_visible(curthread, pd, pvd->pvd_pid, false, NULL)) + PFS_RETURN (ENOENT); + + /* self */ + namelen = cnp->cn_namelen; + pname = cnp->cn_nameptr; + if (namelen == 1 && pname[0] == '.') { + pn = pd; + *vpp = vn; + VREF(vn); + PFS_RETURN (0); + } + + mp = vn->v_mount; + + /* parent */ + if (cnp->cn_flags & ISDOTDOT) { + if (pd->pn_type == pfstype_root) + PFS_RETURN (EIO); + error = vfs_busy(mp, MBF_NOWAIT); + if (error != 0) { + vfs_ref(mp); + VOP_UNLOCK(vn, 0); + error = vfs_busy(mp, 0); + vn_lock(vn, LK_EXCLUSIVE | LK_RETRY); + vfs_rel(mp); + if (error != 0) + PFS_RETURN(ENOENT); + if (vn->v_iflag & VI_DOOMED) { + vfs_unbusy(mp); + PFS_RETURN(ENOENT); + } + } + VOP_UNLOCK(vn, 0); + KASSERT(pd->pn_parent != NULL, + ("%s(): non-root directory has no parent", __func__)); + /* + * This one is tricky. Descendents of procdir nodes + * inherit their parent's process affinity, but + * there's no easy reverse mapping. For simplicity, + * we assume that if this node is a procdir, its + * parent isn't (which is correct as long as + * descendents of procdir nodes are never procdir + * nodes themselves) + */ + if (pd->pn_type == pfstype_procdir) + pid = NO_PID; + pfs_lock(pd); + pn = pd->pn_parent; + pfs_unlock(pd); + goto got_pnode; + } + + pfs_lock(pd); + + /* named node */ + for (pn = pd->pn_nodes; pn != NULL; pn = pn->pn_next) + if (pn->pn_type == pfstype_procdir) + pdn = pn; + else if (pn->pn_name[namelen] == '\0' && + bcmp(pname, pn->pn_name, namelen) == 0) { + pfs_unlock(pd); + goto got_pnode; + } + + /* process dependent node */ + if ((pn = pdn) != NULL) { + pid = 0; + for (pid = 0, i = 0; i < namelen && isdigit(pname[i]); ++i) + if ((pid = pid * 10 + pname[i] - '0') > PID_MAX) + break; + if (i == cnp->cn_namelen) { + pfs_unlock(pd); + goto got_pnode; + } + } + + pfs_unlock(pd); + + PFS_RETURN (ENOENT); + + got_pnode: + pfs_assert_not_owned(pd); + pfs_assert_not_owned(pn); + visible = pfs_visible(curthread, pn, pid, false, NULL); + if (!visible) { + error = ENOENT; + goto failed; + } + + error = pfs_vncache_alloc(mp, vpp, pn, pid); + if (error) + goto failed; + + if (cnp->cn_flags & ISDOTDOT) { + vfs_unbusy(mp); + vn_lock(vn, LK_EXCLUSIVE | LK_RETRY); + if (vn->v_iflag & VI_DOOMED) { + vput(*vpp); + *vpp = NULL; + PFS_RETURN(ENOENT); + } + } + if (cnp->cn_flags & MAKEENTRY && !(vn->v_iflag & VI_DOOMED)) + cache_enter(vn, *vpp, cnp); + PFS_RETURN (0); + failed: + if (cnp->cn_flags & ISDOTDOT) { + vfs_unbusy(mp); + vn_lock(vn, LK_EXCLUSIVE | LK_RETRY); + *vpp = NULL; + } + PFS_RETURN(error); +} + +/* + * Open a file or directory. 
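+ *
+ * The PFS_RD and PFS_WR bits tested below are per-node flags supplied
+ * when the node was registered, not open(2) flags.  As a hedged sketch
+ * (the fill callback name is made up for illustration), a read-only
+ * node is typically created with
+ *
+ *	pfs_create_file(parent, "status", example_fill,
+ *	    NULL, NULL, NULL, PFS_RD);
+ *
+ * so an open for writing on such a node fails with EPERM here.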
+ */
+static int
+pfs_open(struct vop_open_args *va)
+{
+	struct vnode *vn = va->a_vp;
+	struct pfs_vdata *pvd = vn->v_data;
+	struct pfs_node *pn = pvd->pvd_pn;
+	int mode = va->a_mode;
+
+	PFS_TRACE(("%s (mode 0x%x)", pn->pn_name, mode));
+	pfs_assert_not_owned(pn);
+
+	/* check if the requested mode is permitted */
+	if (((mode & FREAD) && !(pn->pn_flags & PFS_RD)) ||
+	    ((mode & FWRITE) && !(pn->pn_flags & PFS_WR)))
+		PFS_RETURN (EPERM);
+
+	/* we don't support locking */
+	if ((mode & O_SHLOCK) || (mode & O_EXLOCK))
+		PFS_RETURN (EOPNOTSUPP);
+
+	PFS_RETURN (0);
+}
+
+/*
+ * Read from a file
+ */
+static int
+pfs_read(struct vop_read_args *va)
+{
+	struct vnode *vn = va->a_vp;
+	struct pfs_vdata *pvd = vn->v_data;
+	struct pfs_node *pn = pvd->pvd_pn;
+	struct uio *uio = va->a_uio;
+	struct proc *proc;
+	struct sbuf *sb = NULL;
+	int error, locked;
+	off_t buflen;
+
+	PFS_TRACE(("%s", pn->pn_name));
+	pfs_assert_not_owned(pn);
+
+	if (vn->v_type != VREG)
+		PFS_RETURN (EINVAL);
+	KASSERT_PN_IS_FILE(pn);
+
+	if (!(pn->pn_flags & PFS_RD))
+		PFS_RETURN (EBADF);
+
+	if (pn->pn_fill == NULL)
+		PFS_RETURN (EIO);
+
+	/*
+	 * This is necessary because either process' privileges may
+	 * have changed since the open() call.
+	 */
+	if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc))
+		PFS_RETURN (EIO);
+	if (proc != NULL) {
+		_PHOLD(proc);
+		PROC_UNLOCK(proc);
+	}
+
+	vhold(vn);
+	locked = VOP_ISLOCKED(vn);
+	VOP_UNLOCK(vn, 0);
+
+	if (pn->pn_flags & PFS_RAWRD) {
+		PFS_TRACE(("%zd resid", uio->uio_resid));
+		error = pn_fill(curthread, proc, pn, NULL, uio);
+		PFS_TRACE(("%zd resid", uio->uio_resid));
+		goto ret;
+	}
+
+	if (uio->uio_resid < 0 || uio->uio_offset < 0 ||
+	    uio->uio_resid > OFF_MAX - uio->uio_offset) {
+		error = EINVAL;
+		goto ret;
+	}
+	buflen = uio->uio_offset + uio->uio_resid;
+	if (buflen > MAXPHYS)
+		buflen = MAXPHYS;
+
+	sb = sbuf_new(sb, NULL, buflen + 1, 0);
+	if (sb == NULL) {
+		error = EIO;
+		goto ret;
+	}
+
+	error = pn_fill(curthread, proc, pn, sb, uio);
+
+	if (error) {
+		sbuf_delete(sb);
+		goto ret;
+	}
+
+	/*
+	 * XXX: If the buffer overflowed, sbuf_len() will not return
+	 * the data length. Then just use the full length because an
+	 * overflowed sbuf must be full. 
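+	 * (sbuf_new() above, called with a NULL buffer and flags 0, creates
+	 * a fixed-size sbuf of buflen + 1 bytes, so a fill routine that
+	 * produces more data marks the sbuf as overflowed instead of
+	 * growing it; sbuf_finish() then returns non-zero and buflen keeps
+	 * the clamped value computed earlier.)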
+ */ + if (sbuf_finish(sb) == 0) + buflen = sbuf_len(sb); + error = uiomove_frombuf(sbuf_data(sb), buflen, uio); + sbuf_delete(sb); +ret: + vn_lock(vn, locked | LK_RETRY); + vdrop(vn); + if (proc != NULL) + PRELE(proc); + PFS_RETURN (error); +} + +/* + * Iterate through directory entries + */ +static int +pfs_iterate(struct thread *td, struct proc *proc, struct pfs_node *pd, + struct pfs_node **pn, struct proc **p) +{ + int visible; + + sx_assert(&allproc_lock, SX_SLOCKED); + pfs_assert_owned(pd); + again: + if (*pn == NULL) { + /* first node */ + *pn = pd->pn_nodes; + } else if ((*pn)->pn_type != pfstype_procdir) { + /* next node */ + *pn = (*pn)->pn_next; + } + if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) { + /* next process */ + if (*p == NULL) + *p = LIST_FIRST(&allproc); + else + *p = LIST_NEXT(*p, p_list); + /* out of processes: next node */ + if (*p == NULL) + *pn = (*pn)->pn_next; + else + PROC_LOCK(*p); + } + + if ((*pn) == NULL) + return (-1); + + if (*p != NULL) { + visible = pfs_visible_proc(td, *pn, *p); + PROC_UNLOCK(*p); + } else if (proc != NULL) { + visible = pfs_visible_proc(td, *pn, proc); + } else { + visible = 1; + } + if (!visible) + goto again; + + return (0); +} + +/* Directory entry list */ +struct pfsentry { + STAILQ_ENTRY(pfsentry) link; + struct dirent entry; +}; +STAILQ_HEAD(pfsdirentlist, pfsentry); + +/* + * Return directory entries. + */ +static int +pfs_readdir(struct vop_readdir_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pd = pvd->pvd_pn; + pid_t pid = pvd->pvd_pid; + struct proc *p, *proc; + struct pfs_node *pn; + struct uio *uio; + struct pfsentry *pfsent, *pfsent2; + struct pfsdirentlist lst; + off_t offset; + int error, i, resid; + + STAILQ_INIT(&lst); + error = 0; + KASSERT(pd->pn_info == vn->v_mount->mnt_data, + ("%s(): pn_info does not match mountpoint", __func__)); + PFS_TRACE(("%s pid %lu", pd->pn_name, (unsigned long)pid)); + pfs_assert_not_owned(pd); + + if (vn->v_type != VDIR) + PFS_RETURN (ENOTDIR); + KASSERT_PN_IS_DIR(pd); + uio = va->a_uio; + + /* only allow reading entire entries */ + offset = uio->uio_offset; + resid = uio->uio_resid; + if (offset < 0 || offset % PFS_DELEN != 0 || + (resid && resid < PFS_DELEN)) + PFS_RETURN (EINVAL); + if (resid == 0) + PFS_RETURN (0); + + sx_slock(&allproc_lock); + pfs_lock(pd); + + /* check if the directory is visible to the caller */ + if (!pfs_visible(curthread, pd, pid, true, &proc)) { + sx_sunlock(&allproc_lock); + pfs_unlock(pd); + PFS_RETURN (ENOENT); + } + KASSERT(pid == NO_PID || proc != NULL, + ("%s(): no process for pid %lu", __func__, (unsigned long)pid)); + + /* skip unwanted entries */ + for (pn = NULL, p = NULL; offset > 0; offset -= PFS_DELEN) { + if (pfs_iterate(curthread, proc, pd, &pn, &p) == -1) { + /* nothing left... */ + if (proc != NULL) + PROC_UNLOCK(proc); + pfs_unlock(pd); + sx_sunlock(&allproc_lock); + PFS_RETURN (0); + } + } + + /* fill in entries */ + while (pfs_iterate(curthread, proc, pd, &pn, &p) != -1 && + resid >= PFS_DELEN) { + if ((pfsent = malloc(sizeof(struct pfsentry), M_IOV, + M_NOWAIT | M_ZERO)) == NULL) { + error = ENOMEM; + break; + } + pfsent->entry.d_reclen = PFS_DELEN; + pfsent->entry.d_fileno = pn_fileno(pn, pid); + /* PFS_DELEN was picked to fit PFS_NAMLEN */ + for (i = 0; i < PFS_NAMELEN - 1 && pn->pn_name[i] != '\0'; ++i) + pfsent->entry.d_name[i] = pn->pn_name[i]; + pfsent->entry.d_namlen = i; + /* NOTE: d_off is the offset of the *next* entry. 
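+		 * Every pseudofs entry has the same fixed size, so the next
+		 * entry always starts PFS_DELEN bytes further on; this is
+		 * also why the offset check at the top of this function only
+		 * accepts multiples of PFS_DELEN.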
*/ + pfsent->entry.d_off = offset + PFS_DELEN; + switch (pn->pn_type) { + case pfstype_procdir: + KASSERT(p != NULL, + ("reached procdir node with p == NULL")); + pfsent->entry.d_namlen = snprintf(pfsent->entry.d_name, + PFS_NAMELEN, "%d", p->p_pid); + /* fall through */ + case pfstype_root: + case pfstype_dir: + case pfstype_this: + case pfstype_parent: + pfsent->entry.d_type = DT_DIR; + break; + case pfstype_file: + pfsent->entry.d_type = DT_REG; + break; + case pfstype_symlink: + pfsent->entry.d_type = DT_LNK; + break; + default: + panic("%s has unexpected node type: %d", pn->pn_name, pn->pn_type); + } + PFS_TRACE(("%s", pfsent->entry.d_name)); + dirent_terminate(&pfsent->entry); + STAILQ_INSERT_TAIL(&lst, pfsent, link); + offset += PFS_DELEN; + resid -= PFS_DELEN; + } + if (proc != NULL) + PROC_UNLOCK(proc); + pfs_unlock(pd); + sx_sunlock(&allproc_lock); + i = 0; + STAILQ_FOREACH_SAFE(pfsent, &lst, link, pfsent2) { + if (error == 0) + error = uiomove(&pfsent->entry, PFS_DELEN, uio); + free(pfsent, M_IOV); + i++; + } + PFS_TRACE(("%ju bytes", (uintmax_t)(i * PFS_DELEN))); + PFS_RETURN (error); +} + +/* + * Read a symbolic link + */ +static int +pfs_readlink(struct vop_readlink_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + struct uio *uio = va->a_uio; + struct proc *proc = NULL; + char buf[PATH_MAX]; + struct sbuf sb; + int error, locked; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + if (vn->v_type != VLNK) + PFS_RETURN (EINVAL); + KASSERT_PN_IS_LINK(pn); + + if (pn->pn_fill == NULL) + PFS_RETURN (EIO); + + if (pvd->pvd_pid != NO_PID) { + if ((proc = pfind(pvd->pvd_pid)) == NULL) + PFS_RETURN (EIO); + if (proc->p_flag & P_WEXIT) { + PROC_UNLOCK(proc); + PFS_RETURN (EIO); + } + _PHOLD(proc); + PROC_UNLOCK(proc); + } + vhold(vn); + locked = VOP_ISLOCKED(vn); + VOP_UNLOCK(vn, 0); + + /* sbuf_new() can't fail with a static buffer */ + sbuf_new(&sb, buf, sizeof buf, 0); + + error = pn_fill(curthread, proc, pn, &sb, NULL); + + if (proc != NULL) + PRELE(proc); + vn_lock(vn, locked | LK_RETRY); + vdrop(vn); + + if (error) { + sbuf_delete(&sb); + PFS_RETURN (error); + } + + if (sbuf_finish(&sb) != 0) { + sbuf_delete(&sb); + PFS_RETURN (ENAMETOOLONG); + } + + error = uiomove_frombuf(sbuf_data(&sb), sbuf_len(&sb), uio); + sbuf_delete(&sb); + PFS_RETURN (error); +} + +/* + * Reclaim a vnode + */ +static int +pfs_reclaim(struct vop_reclaim_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + return (pfs_vncache_free(va->a_vp)); +} + +/* + * Set attributes + */ +static int +pfs_setattr(struct vop_setattr_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + /* Silently ignore unchangeable attributes. 
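+	 * Pseudofs attributes are synthesized in pfs_getattr(), so there is
+	 * nothing persistent to update here; returning success avoids
+	 * spurious failures from tools that call chmod(2) or utimes(2) on
+	 * these files.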
*/ + PFS_RETURN (0); +} + +/* + * Write to a file + */ +static int +pfs_write(struct vop_write_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + struct uio *uio = va->a_uio; + struct proc *proc; + struct sbuf sb; + int error; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + if (vn->v_type != VREG) + PFS_RETURN (EINVAL); + KASSERT_PN_IS_FILE(pn); + + if (!(pn->pn_flags & PFS_WR)) + PFS_RETURN (EBADF); + + if (pn->pn_fill == NULL) + PFS_RETURN (EIO); + + /* + * This is necessary because either process' privileges may + * have changed since the open() call. + */ + if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc)) + PFS_RETURN (EIO); + if (proc != NULL) { + _PHOLD(proc); + PROC_UNLOCK(proc); + } + + if (pn->pn_flags & PFS_RAWWR) { + error = pn_fill(curthread, proc, pn, NULL, uio); + if (proc != NULL) + PRELE(proc); + PFS_RETURN (error); + } + + sbuf_uionew(&sb, uio, &error); + if (error) { + if (proc != NULL) + PRELE(proc); + PFS_RETURN (error); + } + + error = pn_fill(curthread, proc, pn, &sb, uio); + + sbuf_delete(&sb); + if (proc != NULL) + PRELE(proc); + PFS_RETURN (error); +} + +/* + * Vnode operations + */ +struct vop_vector pfs_vnodeops = { + .vop_default = &default_vnodeops, + + .vop_access = pfs_access, + .vop_cachedlookup = pfs_lookup, + .vop_close = pfs_close, + .vop_create = VOP_EOPNOTSUPP, + .vop_getattr = pfs_getattr, + .vop_getextattr = pfs_getextattr, + .vop_ioctl = pfs_ioctl, + .vop_link = VOP_EOPNOTSUPP, + .vop_lookup = vfs_cache_lookup, + .vop_mkdir = VOP_EOPNOTSUPP, + .vop_mknod = VOP_EOPNOTSUPP, + .vop_open = pfs_open, + .vop_read = pfs_read, + .vop_readdir = pfs_readdir, + .vop_readlink = pfs_readlink, + .vop_reclaim = pfs_reclaim, + .vop_remove = VOP_EOPNOTSUPP, + .vop_rename = VOP_EOPNOTSUPP, + .vop_rmdir = VOP_EOPNOTSUPP, + .vop_setattr = pfs_setattr, + .vop_symlink = VOP_EOPNOTSUPP, + .vop_vptocnp = pfs_vptocnp, + .vop_write = pfs_write, + /* XXX I've probably forgotten a few that need VOP_EOPNOTSUPP */ +}; diff --git a/freebsd/sys/kern/kern_descrip.c b/freebsd/sys/kern/kern_descrip.c new file mode 100644 index 00000000..423968b2 --- /dev/null +++ b/freebsd/sys/kern/kern_descrip.c @@ -0,0 +1,4283 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_capsicum.h" +#include "opt_ddb.h" +#include "opt_ktrace.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#endif + +#include + +#include + +#include +#include + +#include + +static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); +static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", + "file desc to leader structures"); +static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); +MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); + +MALLOC_DECLARE(M_FADVISE); + +static __read_mostly uma_zone_t file_zone; +static __read_mostly uma_zone_t filedesc0_zone; + +static int closefp(struct filedesc *fdp, int fd, struct file *fp, + struct thread *td, int holdleaders); +static int fd_first_free(struct filedesc *fdp, int low, int size); +static int fd_last_used(struct filedesc *fdp, int size); +static void fdgrowtable(struct filedesc *fdp, int nfd); +static void fdgrowtable_exp(struct filedesc *fdp, int nfd); +static void fdunused(struct filedesc *fdp, int fd); +static void fdused(struct filedesc *fdp, int fd); +static int getmaxfd(struct thread *td); +static u_long *filecaps_copy_prep(const struct filecaps *src); +static void filecaps_copy_finish(const struct filecaps *src, + struct filecaps *dst, u_long *ioctls); +static u_long *filecaps_free_prep(struct filecaps *fcaps); +static void filecaps_free_finish(u_long *ioctls); + +/* + * Each process has: + * + * - An array of open file descriptors (fd_ofiles) + * - An array of file flags (fd_ofileflags) + * - A bitmap recording which descriptors are in use (fd_map) + * + * A process starts out with NDFILE descriptors. The value of NDFILE has + * been selected based the historical limit of 20 open files, and an + * assumption that the majority of processes, especially short-lived + * processes like shells, will never need more. + * + * If this initial allocation is exhausted, a larger descriptor table and + * map are allocated dynamically, and the pointers in the process's struct + * filedesc are updated to point to those. This is repeated every time + * the process runs out of file descriptors (provided it hasn't hit its + * resource limit). + * + * Since threads may hold references to individual descriptor table + * entries, the tables are never freed. 
Instead, they are placed on a + * linked list and freed only when the struct filedesc is released. + */ +#define NDFILE 20 +#define NDSLOTSIZE sizeof(NDSLOTTYPE) +#define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) +#define NDSLOT(x) ((x) / NDENTRIES) +#define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) +#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) + +/* + * SLIST entry used to keep track of ofiles which must be reclaimed when + * the process exits. + */ +struct freetable { + struct fdescenttbl *ft_table; + SLIST_ENTRY(freetable) ft_next; +}; + +/* + * Initial allocation: a filedesc structure + the head of SLIST used to + * keep track of old ofiles + enough space for NDFILE descriptors. + */ + +struct fdescenttbl0 { + int fdt_nfiles; + struct filedescent fdt_ofiles[NDFILE]; +}; + +struct filedesc0 { + struct filedesc fd_fd; + SLIST_HEAD(, freetable) fd_free; + struct fdescenttbl0 fd_dfiles; + NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; +}; + +/* + * Descriptor management. + */ +volatile int __exclusive_cache_line openfiles; /* actual number of open files */ +struct mtx sigio_lock; /* mtx to protect pointers to sigio */ +void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp); + +/* + * If low >= size, just return low. Otherwise find the first zero bit in the + * given bitmap, starting at low and not exceeding size - 1. Return size if + * not found. + */ +static int +fd_first_free(struct filedesc *fdp, int low, int size) +{ + NDSLOTTYPE *map = fdp->fd_map; + NDSLOTTYPE mask; + int off, maxoff; + + if (low >= size) + return (low); + + off = NDSLOT(low); + if (low % NDENTRIES) { + mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); + if ((mask &= ~map[off]) != 0UL) + return (off * NDENTRIES + ffsl(mask) - 1); + ++off; + } + for (maxoff = NDSLOTS(size); off < maxoff; ++off) + if (map[off] != ~0UL) + return (off * NDENTRIES + ffsl(~map[off]) - 1); + return (size); +} + +/* + * Find the highest non-zero bit in the given bitmap, starting at 0 and + * not exceeding size - 1. Return -1 if not found. + */ +static int +fd_last_used(struct filedesc *fdp, int size) +{ + NDSLOTTYPE *map = fdp->fd_map; + NDSLOTTYPE mask; + int off, minoff; + + off = NDSLOT(size); + if (size % NDENTRIES) { + mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); + if ((mask &= map[off]) != 0) + return (off * NDENTRIES + flsl(mask) - 1); + --off; + } + for (minoff = NDSLOT(0); off >= minoff; --off) + if (map[off] != 0) + return (off * NDENTRIES + flsl(map[off]) - 1); + return (-1); +} + +static int +fdisused(struct filedesc *fdp, int fd) +{ + + KASSERT(fd >= 0 && fd < fdp->fd_nfiles, + ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); + + return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); +} + +/* + * Mark a file descriptor as used. + */ +static void +fdused_init(struct filedesc *fdp, int fd) +{ + + KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); + + fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); +} + +static void +fdused(struct filedesc *fdp, int fd) +{ + + FILEDESC_XLOCK_ASSERT(fdp); + + fdused_init(fdp, fd); + if (fd > fdp->fd_lastfile) + fdp->fd_lastfile = fd; + if (fd == fdp->fd_freefile) + fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); +} + +/* + * Mark a file descriptor as unused. 
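+ *
+ * As an illustration of the bitmap arithmetic (assuming a 64-bit
+ * NDSLOTTYPE, so NDENTRIES is 64): descriptor 70 lives in map word
+ * NDSLOT(70) == 1 as bit NDBIT(70) == 1 << 6, which fdused() sets and
+ * this function clears again.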
+ */ +static void +fdunused(struct filedesc *fdp, int fd) +{ + + FILEDESC_XLOCK_ASSERT(fdp); + + KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); + KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, + ("fd=%d is still in use", fd)); + + fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); + if (fd < fdp->fd_freefile) + fdp->fd_freefile = fd; + if (fd == fdp->fd_lastfile) + fdp->fd_lastfile = fd_last_used(fdp, fd); +} + +/* + * Free a file descriptor. + * + * Avoid some work if fdp is about to be destroyed. + */ +static inline void +fdefree_last(struct filedescent *fde) +{ + + filecaps_free(&fde->fde_caps); +} + +static inline void +fdfree(struct filedesc *fdp, int fd) +{ + struct filedescent *fde; + + fde = &fdp->fd_ofiles[fd]; +#ifdef CAPABILITIES + seq_write_begin(&fde->fde_seq); +#endif + fde->fde_file = NULL; +#ifdef CAPABILITIES + seq_write_end(&fde->fde_seq); +#endif + fdefree_last(fde); + fdunused(fdp, fd); +} + +void +pwd_ensure_dirs(void) +{ + struct filedesc *fdp; + + fdp = curproc->p_fd; + FILEDESC_XLOCK(fdp); + if (fdp->fd_cdir == NULL) { + fdp->fd_cdir = rootvnode; + vrefact(rootvnode); + } + if (fdp->fd_rdir == NULL) { + fdp->fd_rdir = rootvnode; + vrefact(rootvnode); + } + FILEDESC_XUNLOCK(fdp); +} + +/* + * System calls on descriptors. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getdtablesize_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) +{ +#ifdef RACCT + uint64_t lim; +#endif + + td->td_retval[0] = + min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc); +#ifdef RACCT + PROC_LOCK(td->td_proc); + lim = racct_get_limit(td->td_proc, RACCT_NOFILE); + PROC_UNLOCK(td->td_proc); + if (lim < td->td_retval[0]) + td->td_retval[0] = lim; +#endif + return (0); +} + +/* + * Duplicate a file descriptor to a particular value. + * + * Note: keep in mind that a potential race condition exists when closing + * descriptors from a shared descriptor table (via rfork). + */ +#ifndef _SYS_SYSPROTO_H_ +struct dup2_args { + u_int from; + u_int to; +}; +#endif +/* ARGSUSED */ +int +sys_dup2(struct thread *td, struct dup2_args *uap) +{ + + return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to)); +} + +/* + * Duplicate a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct dup_args { + u_int fd; +}; +#endif +/* ARGSUSED */ +int +sys_dup(struct thread *td, struct dup_args *uap) +{ + + return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0)); +} + +/* + * The file control system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fcntl_args { + int fd; + int cmd; + long arg; +}; +#endif +/* ARGSUSED */ +int +sys_fcntl(struct thread *td, struct fcntl_args *uap) +{ + + return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); +} + +int +kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) +{ + struct flock fl; + struct __oflock ofl; + intptr_t arg1; + int error, newcmd; + + error = 0; + newcmd = cmd; + switch (cmd) { + case F_OGETLK: + case F_OSETLK: + case F_OSETLKW: + /* + * Convert old flock structure to new. 
+ */ + error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); + fl.l_start = ofl.l_start; + fl.l_len = ofl.l_len; + fl.l_pid = ofl.l_pid; + fl.l_type = ofl.l_type; + fl.l_whence = ofl.l_whence; + fl.l_sysid = 0; + + switch (cmd) { + case F_OGETLK: + newcmd = F_GETLK; + break; + case F_OSETLK: + newcmd = F_SETLK; + break; + case F_OSETLKW: + newcmd = F_SETLKW; + break; + } + arg1 = (intptr_t)&fl; + break; + case F_GETLK: + case F_SETLK: + case F_SETLKW: + case F_SETLK_REMOTE: + error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); + arg1 = (intptr_t)&fl; + break; + default: + arg1 = arg; + break; + } + if (error) + return (error); + error = kern_fcntl(td, fd, newcmd, arg1); + if (error) + return (error); + if (cmd == F_OGETLK) { + ofl.l_start = fl.l_start; + ofl.l_len = fl.l_len; + ofl.l_pid = fl.l_pid; + ofl.l_type = fl.l_type; + ofl.l_whence = fl.l_whence; + error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); + } else if (cmd == F_GETLK) { + error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); + } + return (error); +} + +int +kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) +{ + struct filedesc *fdp; + struct flock *flp; + struct file *fp, *fp2; + struct filedescent *fde; + struct proc *p; + struct vnode *vp; + int error, flg, tmp; + uint64_t bsize; + off_t foffset; + + error = 0; + flg = F_POSIX; + p = td->td_proc; + fdp = p->p_fd; + + AUDIT_ARG_FD(cmd); + AUDIT_ARG_CMD(cmd); + switch (cmd) { + case F_DUPFD: + tmp = arg; + error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp); + break; + + case F_DUPFD_CLOEXEC: + tmp = arg; + error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); + break; + + case F_DUP2FD: + tmp = arg; + error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); + break; + + case F_DUP2FD_CLOEXEC: + tmp = arg; + error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp); + break; + + case F_GETFD: + error = EBADF; + FILEDESC_SLOCK(fdp); + fde = fdeget_locked(fdp, fd); + if (fde != NULL) { + td->td_retval[0] = + (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; + error = 0; + } + FILEDESC_SUNLOCK(fdp); + break; + + case F_SETFD: + error = EBADF; + FILEDESC_XLOCK(fdp); + fde = fdeget_locked(fdp, fd); + if (fde != NULL) { + fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | + (arg & FD_CLOEXEC ? 
UF_EXCLOSE : 0); + error = 0; + } + FILEDESC_XUNLOCK(fdp); + break; + + case F_GETFL: + error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp); + if (error != 0) + break; + td->td_retval[0] = OFLAGS(fp->f_flag); + fdrop(fp, td); + break; + + case F_SETFL: + error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp); + if (error != 0) + break; + do { + tmp = flg = fp->f_flag; + tmp &= ~FCNTLFLAGS; + tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; + } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); + tmp = fp->f_flag & FNONBLOCK; + error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); + if (error != 0) { + fdrop(fp, td); + break; + } + tmp = fp->f_flag & FASYNC; + error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); + if (error == 0) { + fdrop(fp, td); + break; + } + atomic_clear_int(&fp->f_flag, FNONBLOCK); + tmp = 0; + (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); + fdrop(fp, td); + break; + + case F_GETOWN: + error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp); + if (error != 0) + break; + error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); + if (error == 0) + td->td_retval[0] = tmp; + fdrop(fp, td); + break; + + case F_SETOWN: + error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp); + if (error != 0) + break; + tmp = arg; + error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); + fdrop(fp, td); + break; + + case F_SETLK_REMOTE: + error = priv_check(td, PRIV_NFS_LOCKD); + if (error != 0) + return (error); + flg = F_REMOTE; + goto do_setlk; + + case F_SETLKW: + flg |= F_WAIT; + /* FALLTHROUGH F_SETLK */ + + case F_SETLK: + do_setlk: + flp = (struct flock *)arg; + if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) { + error = EINVAL; + break; + } + + error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp, NULL); + if (error != 0) + break; + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + fdrop(fp, td); + break; + } + + if (flp->l_whence == SEEK_CUR) { + foffset = foffset_get(fp); + if (foffset < 0 || + (flp->l_start > 0 && + foffset > OFF_MAX - flp->l_start)) { + error = EOVERFLOW; + fdrop(fp, td); + break; + } + flp->l_start += foffset; + } + + vp = fp->f_vnode; + switch (flp->l_type) { + case F_RDLCK: + if ((fp->f_flag & FREAD) == 0) { + error = EBADF; + break; + } + if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { + PROC_LOCK(p->p_leader); + p->p_leader->p_flag |= P_ADVLOCK; + PROC_UNLOCK(p->p_leader); + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, + flp, flg); + break; + case F_WRLCK: + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + break; + } + if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { + PROC_LOCK(p->p_leader); + p->p_leader->p_flag |= P_ADVLOCK; + PROC_UNLOCK(p->p_leader); + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, + flp, flg); + break; + case F_UNLCK: + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, + flp, flg); + break; + case F_UNLCKSYS: + if (flg != F_REMOTE) { + error = EINVAL; + break; + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_UNLCKSYS, flp, flg); + break; + default: + error = EINVAL; + break; + } + if (error != 0 || flp->l_type == F_UNLCK || + flp->l_type == F_UNLCKSYS) { + fdrop(fp, td); + break; + } + + /* + * Check for a race with close. + * + * The vnode is now advisory locked (or unlocked, but this case + * is not really important) as the caller requested. 
+ * We had to drop the filedesc lock, so we need to recheck if + * the descriptor is still valid, because if it was closed + * in the meantime we need to remove advisory lock from the + * vnode - close on any descriptor leading to an advisory + * locked vnode, removes that lock. + * We will return 0 on purpose in that case, as the result of + * successful advisory lock might have been externally visible + * already. This is fine - effectively we pretend to the caller + * that the closing thread was a bit slower and that the + * advisory lock succeeded before the close. + */ + error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2, NULL); + if (error != 0) { + fdrop(fp, td); + break; + } + if (fp != fp2) { + flp->l_whence = SEEK_SET; + flp->l_start = 0; + flp->l_len = 0; + flp->l_type = F_UNLCK; + (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_UNLCK, flp, F_POSIX); + } + fdrop(fp, td); + fdrop(fp2, td); + break; + + case F_GETLK: + error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp, NULL); + if (error != 0) + break; + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + fdrop(fp, td); + break; + } + flp = (struct flock *)arg; + if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && + flp->l_type != F_UNLCK) { + error = EINVAL; + fdrop(fp, td); + break; + } + if (flp->l_whence == SEEK_CUR) { + foffset = foffset_get(fp); + if ((flp->l_start > 0 && + foffset > OFF_MAX - flp->l_start) || + (flp->l_start < 0 && + foffset < OFF_MIN - flp->l_start)) { + error = EOVERFLOW; + fdrop(fp, td); + break; + } + flp->l_start += foffset; + } + vp = fp->f_vnode; + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, + F_POSIX); + fdrop(fp, td); + break; + + case F_RDAHEAD: + arg = arg ? 128 * 1024: 0; + /* FALLTHROUGH */ + case F_READAHEAD: + error = fget_unlocked(fdp, fd, &cap_no_rights, &fp, NULL); + if (error != 0) + break; + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); + error = EBADF; + break; + } + vp = fp->f_vnode; + if (vp->v_type != VREG) { + fdrop(fp, td); + error = ENOTTY; + break; + } + + /* + * Exclusive lock synchronizes against f_seqcount reads and + * writes in sequential_heuristic(). + */ + error = vn_lock(vp, LK_EXCLUSIVE); + if (error != 0) { + fdrop(fp, td); + break; + } + if (arg >= 0) { + bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; + arg = MIN(arg, INT_MAX - bsize + 1); + fp->f_seqcount = MIN(IO_SEQMAX, + (arg + bsize - 1) / bsize); + atomic_set_int(&fp->f_flag, FRDAHEAD); + } else { + atomic_clear_int(&fp->f_flag, FRDAHEAD); + } + VOP_UNLOCK(vp, 0); + fdrop(fp, td); + break; + + default: + error = EINVAL; + break; + } + return (error); +} + +static int +getmaxfd(struct thread *td) +{ + + return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc)); +} + +/* + * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). + */ +int +kern_dup(struct thread *td, u_int mode, int flags, int old, int new) +{ + struct filedesc *fdp; + struct filedescent *oldfde, *newfde; + struct proc *p; + struct file *delfp; + u_long *oioctls, *nioctls; + int error, maxfd; + + p = td->td_proc; + fdp = p->p_fd; + + MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); + MPASS(mode < FDDUP_LASTMODE); + + AUDIT_ARG_FD(old); + /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */ + + /* + * Verify we have a valid descriptor to dup from and possibly to + * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should + * return EINVAL when the new descriptor is out of bounds. + */ + if (old < 0) + return (EBADF); + if (new < 0) + return (mode == FDDUP_FCNTL ? 
EINVAL : EBADF); + maxfd = getmaxfd(td); + if (new >= maxfd) + return (mode == FDDUP_FCNTL ? EINVAL : EBADF); + + error = EBADF; + FILEDESC_XLOCK(fdp); + if (fget_locked(fdp, old) == NULL) + goto unlock; + if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) { + td->td_retval[0] = new; + if (flags & FDDUP_FLAG_CLOEXEC) + fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; + error = 0; + goto unlock; + } + + oldfde = &fdp->fd_ofiles[old]; + if (!fhold(oldfde->fde_file)) + goto unlock; + + /* + * If the caller specified a file descriptor, make sure the file + * table is large enough to hold it, and grab it. Otherwise, just + * allocate a new descriptor the usual way. + */ + switch (mode) { + case FDDUP_NORMAL: + case FDDUP_FCNTL: + if ((error = fdalloc(td, new, &new)) != 0) { + fdrop(oldfde->fde_file, td); + goto unlock; + } + break; + case FDDUP_MUSTREPLACE: + /* Target file descriptor must exist. */ + if (fget_locked(fdp, new) == NULL) { + fdrop(oldfde->fde_file, td); + goto unlock; + } + break; + case FDDUP_FIXED: + if (new >= fdp->fd_nfiles) { + /* + * The resource limits are here instead of e.g. + * fdalloc(), because the file descriptor table may be + * shared between processes, so we can't really use + * racct_add()/racct_sub(). Instead of counting the + * number of actually allocated descriptors, just put + * the limit on the size of the file descriptor table. + */ +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(p); + error = racct_set(p, RACCT_NOFILE, new + 1); + PROC_UNLOCK(p); + if (error != 0) { + error = EMFILE; + fdrop(oldfde->fde_file, td); + goto unlock; + } + } +#endif + fdgrowtable_exp(fdp, new + 1); + } + if (!fdisused(fdp, new)) + fdused(fdp, new); + break; + default: + KASSERT(0, ("%s unsupported mode %d", __func__, mode)); + } + + KASSERT(old != new, ("new fd is same as old")); + + newfde = &fdp->fd_ofiles[new]; + delfp = newfde->fde_file; + + oioctls = filecaps_free_prep(&newfde->fde_caps); + nioctls = filecaps_copy_prep(&oldfde->fde_caps); + + /* + * Duplicate the source descriptor. + */ +#ifdef CAPABILITIES + seq_write_begin(&newfde->fde_seq); +#endif + memcpy(newfde, oldfde, fde_change_size); + filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, + nioctls); + if ((flags & FDDUP_FLAG_CLOEXEC) != 0) + newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; + else + newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; +#ifdef CAPABILITIES + seq_write_end(&newfde->fde_seq); +#endif + filecaps_free_finish(oioctls); + td->td_retval[0] = new; + + error = 0; + + if (delfp != NULL) { + (void) closefp(fdp, new, delfp, td, 1); + FILEDESC_UNLOCK_ASSERT(fdp); + } else { +unlock: + FILEDESC_XUNLOCK(fdp); + } + + return (error); +} + +/* + * If sigio is on the list associated with a process or process group, + * disable signalling from the device, remove sigio from the list and + * free sigio. + */ +void +funsetown(struct sigio **sigiop) +{ + struct sigio *sigio; + + if (*sigiop == NULL) + return; + SIGIO_LOCK(); + sigio = *sigiop; + if (sigio == NULL) { + SIGIO_UNLOCK(); + return; + } + *(sigio->sio_myref) = NULL; + if ((sigio)->sio_pgid < 0) { + struct pgrp *pg = (sigio)->sio_pgrp; + PGRP_LOCK(pg); + SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, + sigio, sio_pgsigio); + PGRP_UNLOCK(pg); + } else { + struct proc *p = (sigio)->sio_proc; + PROC_LOCK(p); + SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, + sigio, sio_pgsigio); + PROC_UNLOCK(p); + } + SIGIO_UNLOCK(); + crfree(sigio->sio_ucred); + free(sigio, M_SIGIO); +} + +/* + * Free a list of sigio structures. 
+ * We only need to lock the SIGIO_LOCK because we have made ourselves + * inaccessible to callers of fsetown and therefore do not need to lock + * the proc or pgrp struct for the list manipulation. + */ +void +funsetownlst(struct sigiolst *sigiolst) +{ + struct proc *p; + struct pgrp *pg; + struct sigio *sigio; + + sigio = SLIST_FIRST(sigiolst); + if (sigio == NULL) + return; + p = NULL; + pg = NULL; + + /* + * Every entry of the list should belong + * to a single proc or pgrp. + */ + if (sigio->sio_pgid < 0) { + pg = sigio->sio_pgrp; + PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); + } else /* if (sigio->sio_pgid > 0) */ { + p = sigio->sio_proc; + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + } + + SIGIO_LOCK(); + while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { + *(sigio->sio_myref) = NULL; + if (pg != NULL) { + KASSERT(sigio->sio_pgid < 0, + ("Proc sigio in pgrp sigio list")); + KASSERT(sigio->sio_pgrp == pg, + ("Bogus pgrp in sigio list")); + PGRP_LOCK(pg); + SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, + sio_pgsigio); + PGRP_UNLOCK(pg); + } else /* if (p != NULL) */ { + KASSERT(sigio->sio_pgid > 0, + ("Pgrp sigio in proc sigio list")); + KASSERT(sigio->sio_proc == p, + ("Bogus proc in sigio list")); + PROC_LOCK(p); + SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, + sio_pgsigio); + PROC_UNLOCK(p); + } + SIGIO_UNLOCK(); + crfree(sigio->sio_ucred); + free(sigio, M_SIGIO); + SIGIO_LOCK(); + } + SIGIO_UNLOCK(); +} + +/* + * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). + * + * After permission checking, add a sigio structure to the sigio list for + * the process or process group. + */ +int +fsetown(pid_t pgid, struct sigio **sigiop) +{ + struct proc *proc; + struct pgrp *pgrp; + struct sigio *sigio; + int ret; + + if (pgid == 0) { + funsetown(sigiop); + return (0); + } + + ret = 0; + + /* Allocate and fill in the new sigio out of locks. */ + sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); + sigio->sio_pgid = pgid; + sigio->sio_ucred = crhold(curthread->td_ucred); + sigio->sio_myref = sigiop; + + sx_slock(&proctree_lock); + if (pgid > 0) { + proc = pfind(pgid); + if (proc == NULL) { + ret = ESRCH; + goto fail; + } + + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + PROC_UNLOCK(proc); + if (proc->p_session != curthread->td_proc->p_session) { + ret = EPERM; + goto fail; + } + + pgrp = NULL; + } else /* if (pgid < 0) */ { + pgrp = pgfind(-pgid); + if (pgrp == NULL) { + ret = ESRCH; + goto fail; + } + PGRP_UNLOCK(pgrp); + + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + if (pgrp->pg_session != curthread->td_proc->p_session) { + ret = EPERM; + goto fail; + } + + proc = NULL; + } + funsetown(sigiop); + if (pgid > 0) { + PROC_LOCK(proc); + /* + * Since funsetownlst() is called without the proctree + * locked, we need to check for P_WEXIT. + * XXX: is ESRCH correct? 
+ */ + if ((proc->p_flag & P_WEXIT) != 0) { + PROC_UNLOCK(proc); + ret = ESRCH; + goto fail; + } + SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); + sigio->sio_proc = proc; + PROC_UNLOCK(proc); + } else { + PGRP_LOCK(pgrp); + SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); + sigio->sio_pgrp = pgrp; + PGRP_UNLOCK(pgrp); + } + sx_sunlock(&proctree_lock); + SIGIO_LOCK(); + *sigiop = sigio; + SIGIO_UNLOCK(); + return (0); + +fail: + sx_sunlock(&proctree_lock); + crfree(sigio->sio_ucred); + free(sigio, M_SIGIO); + return (ret); +} + +/* + * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). + */ +pid_t +fgetown(struct sigio **sigiop) +{ + pid_t pgid; + + SIGIO_LOCK(); + pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; + SIGIO_UNLOCK(); + return (pgid); +} + +/* + * Function drops the filedesc lock on return. + */ +static int +closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, + int holdleaders) +{ + int error; + + FILEDESC_XLOCK_ASSERT(fdp); + + if (holdleaders) { + if (td->td_proc->p_fdtol != NULL) { + /* + * Ask fdfree() to sleep to ensure that all relevant + * process leaders can be traversed in closef(). + */ + fdp->fd_holdleaderscount++; + } else { + holdleaders = 0; + } + } + + /* + * We now hold the fp reference that used to be owned by the + * descriptor array. We have to unlock the FILEDESC *AFTER* + * knote_fdclose to prevent a race of the fd getting opened, a knote + * added, and deleteing a knote for the new fd. + */ + knote_fdclose(td, fd); + + /* + * We need to notify mqueue if the object is of type mqueue. + */ + if (fp->f_type == DTYPE_MQUEUE) + mq_fdclose(td, fd, fp); + FILEDESC_XUNLOCK(fdp); + + error = closef(fp, td); + if (holdleaders) { + FILEDESC_XLOCK(fdp); + fdp->fd_holdleaderscount--; + if (fdp->fd_holdleaderscount == 0 && + fdp->fd_holdleaderswakeup != 0) { + fdp->fd_holdleaderswakeup = 0; + wakeup(&fdp->fd_holdleaderscount); + } + FILEDESC_XUNLOCK(fdp); + } + return (error); +} + +/* + * Close a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct close_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +sys_close(struct thread *td, struct close_args *uap) +{ + + return (kern_close(td, uap->fd)); +} + +int +kern_close(struct thread *td, int fd) +{ + struct filedesc *fdp; + struct file *fp; + + fdp = td->td_proc->p_fd; + + AUDIT_SYSCLOSE(td, fd); + + FILEDESC_XLOCK(fdp); + if ((fp = fget_locked(fdp, fd)) == NULL) { + FILEDESC_XUNLOCK(fdp); + return (EBADF); + } + fdfree(fdp, fd); + + /* closefp() drops the FILEDESC lock for us. */ + return (closefp(fdp, fd, fp, td, 1)); +} + +/* + * Close open file descriptors. + */ +#ifndef _SYS_SYSPROTO_H_ +struct closefrom_args { + int lowfd; +}; +#endif +/* ARGSUSED */ +int +sys_closefrom(struct thread *td, struct closefrom_args *uap) +{ + struct filedesc *fdp; + int fd; + + fdp = td->td_proc->p_fd; + AUDIT_ARG_FD(uap->lowfd); + + /* + * Treat negative starting file descriptor values identical to + * closefrom(0) which closes all files. + */ + if (uap->lowfd < 0) + uap->lowfd = 0; + FILEDESC_SLOCK(fdp); + for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) { + if (fdp->fd_ofiles[fd].fde_file != NULL) { + FILEDESC_SUNLOCK(fdp); + (void)kern_close(td, fd); + FILEDESC_SLOCK(fdp); + } + } + FILEDESC_SUNLOCK(fdp); + return (0); +} + +#if defined(COMPAT_43) +/* + * Return status information about a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct ofstat_args { + int fd; + struct ostat *sb; +}; +#endif +/* ARGSUSED */ +int +ofstat(struct thread *td, struct ofstat_args *uap) +{ + struct ostat oub; + struct stat ub; + int error; + + error = kern_fstat(td, uap->fd, &ub); + if (error == 0) { + cvtstat(&ub, &oub); + error = copyout(&oub, uap->sb, sizeof(oub)); + } + return (error); +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_FREEBSD11) +int +freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap) +{ + struct stat sb; + struct freebsd11_stat osb; + int error; + + error = kern_fstat(td, uap->fd, &sb); + if (error != 0) + return (error); + error = freebsd11_cvtstat(&sb, &osb); + if (error == 0) + error = copyout(&osb, uap->sb, sizeof(osb)); + return (error); +} +#endif /* COMPAT_FREEBSD11 */ + +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstat_args { + int fd; + struct stat *sb; +}; +#endif +/* ARGSUSED */ +int +sys_fstat(struct thread *td, struct fstat_args *uap) +{ + struct stat ub; + int error; + + error = kern_fstat(td, uap->fd, &ub); + if (error == 0) + error = copyout(&ub, uap->sb, sizeof(ub)); + return (error); +} + +int +kern_fstat(struct thread *td, int fd, struct stat *sbp) +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(fd); + + error = fget(td, fd, &cap_fstat_rights, &fp); + if (error != 0) + return (error); + + AUDIT_ARG_FILE(td->td_proc, fp); + + error = fo_stat(fp, sbp, td->td_ucred, td); + fdrop(fp, td); +#ifdef __STAT_TIME_T_EXT + if (error == 0) { + sbp->st_atim_ext = 0; + sbp->st_mtim_ext = 0; + sbp->st_ctim_ext = 0; + sbp->st_btim_ext = 0; + } +#endif +#ifdef KTRACE + if (error == 0 && KTRPOINT(td, KTR_STRUCT)) + ktrstat(sbp); +#endif + return (error); +} + +#if defined(COMPAT_FREEBSD11) +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd11_nfstat_args { + int fd; + struct nstat *sb; +}; +#endif +/* ARGSUSED */ +int +freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap) +{ + struct nstat nub; + struct stat ub; + int error; + + error = kern_fstat(td, uap->fd, &ub); + if (error == 0) { + freebsd11_cvtnstat(&ub, &nub); + error = copyout(&nub, uap->sb, sizeof(nub)); + } + return (error); +} +#endif /* COMPAT_FREEBSD11 */ + +/* + * Return pathconf information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fpathconf_args { + int fd; + int name; +}; +#endif +/* ARGSUSED */ +int +sys_fpathconf(struct thread *td, struct fpathconf_args *uap) +{ + long value; + int error; + + error = kern_fpathconf(td, uap->fd, uap->name, &value); + if (error == 0) + td->td_retval[0] = value; + return (error); +} + +int +kern_fpathconf(struct thread *td, int fd, int name, long *valuep) +{ + struct file *fp; + struct vnode *vp; + int error; + + error = fget(td, fd, &cap_fpathconf_rights, &fp); + if (error != 0) + return (error); + + if (name == _PC_ASYNC_IO) { + *valuep = _POSIX_ASYNCHRONOUS_IO; + goto out; + } + vp = fp->f_vnode; + if (vp != NULL) { + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_PATHCONF(vp, name, valuep); + VOP_UNLOCK(vp, 0); + } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { + if (name != _PC_PIPE_BUF) { + error = EINVAL; + } else { + *valuep = PIPE_BUF; + error = 0; + } + } else { + error = EOPNOTSUPP; + } +out: + fdrop(fp, td); + return (error); +} + +/* + * Initialize filecaps structure. 
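+ *
+ * Note the sentinel used throughout this file: fc_nioctls == -1, as set
+ * here, means the descriptor carries no ioctl command restriction; a
+ * positive count goes together with a non-NULL fc_ioctls list of allowed
+ * commands, and 0 means no ioctl command is permitted at all.
+ * filecaps_validate() asserts exactly these combinations.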
+ */ +void +filecaps_init(struct filecaps *fcaps) +{ + + bzero(fcaps, sizeof(*fcaps)); + fcaps->fc_nioctls = -1; +} + +/* + * Copy filecaps structure allocating memory for ioctls array if needed. + * + * The last parameter indicates whether the fdtable is locked. If it is not and + * ioctls are encountered, copying fails and the caller must lock the table. + * + * Note that if the table was not locked, the caller has to check the relevant + * sequence counter to determine whether the operation was successful. + */ +bool +filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked) +{ + size_t size; + + if (src->fc_ioctls != NULL && !locked) + return (false); + memcpy(dst, src, sizeof(*src)); + if (src->fc_ioctls == NULL) + return (true); + + KASSERT(src->fc_nioctls > 0, + ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); + + size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; + dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); + memcpy(dst->fc_ioctls, src->fc_ioctls, size); + return (true); +} + +static u_long * +filecaps_copy_prep(const struct filecaps *src) +{ + u_long *ioctls; + size_t size; + + if (src->fc_ioctls == NULL) + return (NULL); + + KASSERT(src->fc_nioctls > 0, + ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); + + size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; + ioctls = malloc(size, M_FILECAPS, M_WAITOK); + return (ioctls); +} + +static void +filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, + u_long *ioctls) +{ + size_t size; + + *dst = *src; + if (src->fc_ioctls == NULL) { + MPASS(ioctls == NULL); + return; + } + + size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; + dst->fc_ioctls = ioctls; + bcopy(src->fc_ioctls, dst->fc_ioctls, size); +} + +/* + * Move filecaps structure to the new place and clear the old place. + */ +void +filecaps_move(struct filecaps *src, struct filecaps *dst) +{ + + *dst = *src; + bzero(src, sizeof(*src)); +} + +/* + * Fill the given filecaps structure with full rights. + */ +static void +filecaps_fill(struct filecaps *fcaps) +{ + + CAP_ALL(&fcaps->fc_rights); + fcaps->fc_ioctls = NULL; + fcaps->fc_nioctls = -1; + fcaps->fc_fcntls = CAP_FCNTL_ALL; +} + +/* + * Free memory allocated within filecaps structure. + */ +void +filecaps_free(struct filecaps *fcaps) +{ + + free(fcaps->fc_ioctls, M_FILECAPS); + bzero(fcaps, sizeof(*fcaps)); +} + +static u_long * +filecaps_free_prep(struct filecaps *fcaps) +{ + u_long *ioctls; + + ioctls = fcaps->fc_ioctls; + bzero(fcaps, sizeof(*fcaps)); + return (ioctls); +} + +static void +filecaps_free_finish(u_long *ioctls) +{ + + free(ioctls, M_FILECAPS); +} + +/* + * Validate the given filecaps structure. + */ +static void +filecaps_validate(const struct filecaps *fcaps, const char *func) +{ + + KASSERT(cap_rights_is_valid(&fcaps->fc_rights), + ("%s: invalid rights", func)); + KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, + ("%s: invalid fcntls", func)); + KASSERT(fcaps->fc_fcntls == 0 || + cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), + ("%s: fcntls without CAP_FCNTL", func)); + KASSERT(fcaps->fc_ioctls != NULL ? 
fcaps->fc_nioctls > 0 : + (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), + ("%s: invalid ioctls", func)); + KASSERT(fcaps->fc_nioctls == 0 || + cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), + ("%s: ioctls without CAP_IOCTL", func)); +} + +static void +fdgrowtable_exp(struct filedesc *fdp, int nfd) +{ + int nfd1; + + FILEDESC_XLOCK_ASSERT(fdp); + + nfd1 = fdp->fd_nfiles * 2; + if (nfd1 < nfd) + nfd1 = nfd; + fdgrowtable(fdp, nfd1); +} + +/* + * Grow the file table to accommodate (at least) nfd descriptors. + */ +static void +fdgrowtable(struct filedesc *fdp, int nfd) +{ + struct filedesc0 *fdp0; + struct freetable *ft; + struct fdescenttbl *ntable; + struct fdescenttbl *otable; + int nnfiles, onfiles; + NDSLOTTYPE *nmap, *omap; + + /* + * If lastfile is -1 this struct filedesc was just allocated and we are + * growing it to accommodate for the one we are going to copy from. There + * is no need to have a lock on this one as it's not visible to anyone. + */ + if (fdp->fd_lastfile != -1) + FILEDESC_XLOCK_ASSERT(fdp); + + KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); + + /* save old values */ + onfiles = fdp->fd_nfiles; + otable = fdp->fd_files; + omap = fdp->fd_map; + + /* compute the size of the new table */ + nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ + if (nnfiles <= onfiles) + /* the table is already large enough */ + return; + + /* + * Allocate a new table. We need enough space for the number of + * entries, file entries themselves and the struct freetable we will use + * when we decommission the table and place it on the freelist. + * We place the struct freetable in the middle so we don't have + * to worry about padding. + */ + ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + + nnfiles * sizeof(ntable->fdt_ofiles[0]) + + sizeof(struct freetable), + M_FILEDESC, M_ZERO | M_WAITOK); + /* copy the old data */ + ntable->fdt_nfiles = nnfiles; + memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, + onfiles * sizeof(ntable->fdt_ofiles[0])); + + /* + * Allocate a new map only if the old is not large enough. It will + * grow at a slower rate than the table as it can map more + * entries than the table can hold. + */ + if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { + nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, + M_ZERO | M_WAITOK); + /* copy over the old data and update the pointer */ + memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); + fdp->fd_map = nmap; + } + + /* + * Make sure that ntable is correctly initialized before we replace + * fd_files poiner. Otherwise fget_unlocked() may see inconsistent + * data. + */ + atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); + + /* + * Do not free the old file table, as some threads may still + * reference entries within it. Instead, place it on a freelist + * which will be processed when the struct filedesc is released. + * + * Note that if onfiles == NDFILE, we're dealing with the original + * static allocation contained within (struct filedesc0 *)fdp, + * which must not be freed. + */ + if (onfiles > NDFILE) { + ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; + fdp0 = (struct filedesc0 *)fdp; + ft->ft_table = otable; + SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); + } + /* + * The map does not have the same possibility of threads still + * holding references to it. So always free it as long as it + * does not reference the original static allocation. 
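+	 * As a worked example (assuming a 64-bit NDSLOTTYPE): the initial
+	 * NDFILE == 20 table needs NDSLOTS(20) == 1 map word, and a later
+	 * growth to 64 descriptors still needs only NDSLOTS(64) == 1, so in
+	 * that case no new map was allocated above and there is nothing to
+	 * free here.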
+ */ + if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) + free(omap, M_FILEDESC); +} + +/* + * Allocate a file descriptor for the process. + */ +int +fdalloc(struct thread *td, int minfd, int *result) +{ + struct proc *p = td->td_proc; + struct filedesc *fdp = p->p_fd; + int fd, maxfd, allocfd; +#ifdef RACCT + int error; +#endif + + FILEDESC_XLOCK_ASSERT(fdp); + + if (fdp->fd_freefile > minfd) + minfd = fdp->fd_freefile; + + maxfd = getmaxfd(td); + + /* + * Search the bitmap for a free descriptor starting at minfd. + * If none is found, grow the file table. + */ + fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); + if (fd >= maxfd) + return (EMFILE); + if (fd >= fdp->fd_nfiles) { + allocfd = min(fd * 2, maxfd); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(p); + error = racct_set(p, RACCT_NOFILE, allocfd); + PROC_UNLOCK(p); + if (error != 0) + return (EMFILE); + } +#endif + /* + * fd is already equal to first free descriptor >= minfd, so + * we only need to grow the table and we are done. + */ + fdgrowtable_exp(fdp, allocfd); + } + + /* + * Perform some sanity checks, then mark the file descriptor as + * used and return it to the caller. + */ + KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), + ("invalid descriptor %d", fd)); + KASSERT(!fdisused(fdp, fd), + ("fd_first_free() returned non-free descriptor")); + KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, + ("file descriptor isn't free")); + fdused(fdp, fd); + *result = fd; + return (0); +} + +/* + * Allocate n file descriptors for the process. + */ +int +fdallocn(struct thread *td, int minfd, int *fds, int n) +{ + struct proc *p = td->td_proc; + struct filedesc *fdp = p->p_fd; + int i; + + FILEDESC_XLOCK_ASSERT(fdp); + + for (i = 0; i < n; i++) + if (fdalloc(td, 0, &fds[i]) != 0) + break; + + if (i < n) { + for (i--; i >= 0; i--) + fdunused(fdp, fds[i]); + return (EMFILE); + } + + return (0); +} + +/* + * Create a new open file structure and allocate a file descriptor for the + * process that refers to it. We add one reference to the file for the + * descriptor table and one reference for resultfp. This is to prevent us + * being preempted and the entry in the descriptor table closed after we + * release the FILEDESC lock. + */ +int +falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, + struct filecaps *fcaps) +{ + struct file *fp; + int error, fd; + + error = falloc_noinstall(td, &fp); + if (error) + return (error); /* no reference held on error */ + + error = finstall(td, fp, &fd, flags, fcaps); + if (error) { + fdrop(fp, td); /* one reference (fp only) */ + return (error); + } + + if (resultfp != NULL) + *resultfp = fp; /* copy out result */ + else + fdrop(fp, td); /* release local reference */ + + if (resultfd != NULL) + *resultfd = fd; + + return (0); +} + +/* + * Create a new open file structure without allocating a file descriptor. 
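+ *
+ * The usual pattern pairs this with finit() and finstall().  A hedged
+ * sketch (DTYPE_FOO, foo_data and foo_ops are placeholders made up for
+ * illustration):
+ *
+ *	error = falloc_noinstall(td, &fp);
+ *	if (error != 0)
+ *		return (error);
+ *	finit(fp, FREAD | FWRITE, DTYPE_FOO, foo_data, &foo_ops);
+ *	error = finstall(td, fp, &fd, 0, NULL);
+ *	fdrop(fp, td);	/* drop the local ref; on success the table has one */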
+ */ +int +falloc_noinstall(struct thread *td, struct file **resultfp) +{ + struct file *fp; + int maxuserfiles = maxfiles - (maxfiles / 20); + int openfiles_new; + static struct timeval lastfail; + static int curfail; + + KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); + + openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1; + if ((openfiles_new >= maxuserfiles && + priv_check(td, PRIV_MAXFILES) != 0) || + openfiles_new >= maxfiles) { + atomic_subtract_int(&openfiles, 1); + if (ppsratecheck(&lastfail, &curfail, 1)) { + printf("kern.maxfiles limit exceeded by uid %i, (%s) " + "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm); + } + return (ENFILE); + } + fp = uma_zalloc(file_zone, M_WAITOK); + bzero(fp, sizeof(*fp)); + refcount_init(&fp->f_count, 1); + fp->f_cred = crhold(td->td_ucred); + fp->f_ops = &badfileops; + *resultfp = fp; + return (0); +} + +/* + * Install a file in a file descriptor table. + */ +void +_finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, + struct filecaps *fcaps) +{ + struct filedescent *fde; + + MPASS(fp != NULL); + if (fcaps != NULL) + filecaps_validate(fcaps, __func__); + FILEDESC_XLOCK_ASSERT(fdp); + + fde = &fdp->fd_ofiles[fd]; +#ifdef CAPABILITIES + seq_write_begin(&fde->fde_seq); +#endif + fde->fde_file = fp; + fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; + if (fcaps != NULL) + filecaps_move(fcaps, &fde->fde_caps); + else + filecaps_fill(&fde->fde_caps); +#ifdef CAPABILITIES + seq_write_end(&fde->fde_seq); +#endif +} + +int +finstall(struct thread *td, struct file *fp, int *fd, int flags, + struct filecaps *fcaps) +{ + struct filedesc *fdp = td->td_proc->p_fd; + int error; + + MPASS(fd != NULL); + + if (!fhold(fp)) + return (EBADF); + FILEDESC_XLOCK(fdp); + if ((error = fdalloc(td, 0, fd))) { + FILEDESC_XUNLOCK(fdp); + fdrop(fp, td); + return (error); + } + _finstall(fdp, fp, *fd, flags, fcaps); + FILEDESC_XUNLOCK(fdp); + return (0); +} + +/* + * Build a new filedesc structure from another. + * Copy the current, root, and jail root vnode references. + * + * If fdp is not NULL, return with it shared locked. + */ +struct filedesc * +fdinit(struct filedesc *fdp, bool prepfiles) +{ + struct filedesc0 *newfdp0; + struct filedesc *newfdp; + + newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); + newfdp = &newfdp0->fd_fd; + + /* Create the file descriptor table. 
*/ + FILEDESC_LOCK_INIT(newfdp); + refcount_init(&newfdp->fd_refcnt, 1); + refcount_init(&newfdp->fd_holdcnt, 1); + newfdp->fd_cmask = CMASK; + newfdp->fd_map = newfdp0->fd_dmap; + newfdp->fd_lastfile = -1; + newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; + newfdp->fd_files->fdt_nfiles = NDFILE; + + if (fdp == NULL) + return (newfdp); + + if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles) + fdgrowtable(newfdp, fdp->fd_lastfile + 1); + + FILEDESC_SLOCK(fdp); + newfdp->fd_cdir = fdp->fd_cdir; + if (newfdp->fd_cdir) + vrefact(newfdp->fd_cdir); + newfdp->fd_rdir = fdp->fd_rdir; + if (newfdp->fd_rdir) + vrefact(newfdp->fd_rdir); + newfdp->fd_jdir = fdp->fd_jdir; + if (newfdp->fd_jdir) + vrefact(newfdp->fd_jdir); + + if (!prepfiles) { + FILEDESC_SUNLOCK(fdp); + } else { + while (fdp->fd_lastfile >= newfdp->fd_nfiles) { + FILEDESC_SUNLOCK(fdp); + fdgrowtable(newfdp, fdp->fd_lastfile + 1); + FILEDESC_SLOCK(fdp); + } + } + + return (newfdp); +} + +static struct filedesc * +fdhold(struct proc *p) +{ + struct filedesc *fdp; + + PROC_LOCK_ASSERT(p, MA_OWNED); + fdp = p->p_fd; + if (fdp != NULL) + refcount_acquire(&fdp->fd_holdcnt); + return (fdp); +} + +static void +fddrop(struct filedesc *fdp) +{ + + if (fdp->fd_holdcnt > 1) { + if (refcount_release(&fdp->fd_holdcnt) == 0) + return; + } + + FILEDESC_LOCK_DESTROY(fdp); + uma_zfree(filedesc0_zone, fdp); +} + +/* + * Share a filedesc structure. + */ +struct filedesc * +fdshare(struct filedesc *fdp) +{ + + refcount_acquire(&fdp->fd_refcnt); + return (fdp); +} + +/* + * Unshare a filedesc structure, if necessary by making a copy + */ +void +fdunshare(struct thread *td) +{ + struct filedesc *tmp; + struct proc *p = td->td_proc; + + if (p->p_fd->fd_refcnt == 1) + return; + + tmp = fdcopy(p->p_fd); + fdescfree(td); + p->p_fd = tmp; +} + +void +fdinstall_remapped(struct thread *td, struct filedesc *fdp) +{ + + fdescfree(td); + td->td_proc->p_fd = fdp; +} + +/* + * Copy a filedesc structure. A NULL pointer in returns a NULL reference, + * this is to ease callers, not catch errors. + */ +struct filedesc * +fdcopy(struct filedesc *fdp) +{ + struct filedesc *newfdp; + struct filedescent *nfde, *ofde; + int i; + + MPASS(fdp != NULL); + + newfdp = fdinit(fdp, true); + /* copy all passable descriptors (i.e. not kqueue) */ + newfdp->fd_freefile = -1; + for (i = 0; i <= fdp->fd_lastfile; ++i) { + ofde = &fdp->fd_ofiles[i]; + if (ofde->fde_file == NULL || + (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || + !fhold(ofde->fde_file)) { + if (newfdp->fd_freefile == -1) + newfdp->fd_freefile = i; + continue; + } + nfde = &newfdp->fd_ofiles[i]; + *nfde = *ofde; + filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); + fdused_init(newfdp, i); + newfdp->fd_lastfile = i; + } + if (newfdp->fd_freefile == -1) + newfdp->fd_freefile = i; + newfdp->fd_cmask = fdp->fd_cmask; + FILEDESC_SUNLOCK(fdp); + return (newfdp); +} + +/* + * Copies a filedesc structure, while remapping all file descriptors + * stored inside using a translation table. + * + * File descriptors are copied over to the new file descriptor table, + * regardless of whether the close-on-exec flag is set. + */ +int +fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds, + struct filedesc **ret) +{ + struct filedesc *newfdp; + struct filedescent *nfde, *ofde; + int error, i; + + MPASS(fdp != NULL); + + newfdp = fdinit(fdp, true); + if (nfds > fdp->fd_lastfile + 1) { + /* New table cannot be larger than the old one. 
*/ + error = E2BIG; + goto bad; + } + /* Copy all passable descriptors (i.e. not kqueue). */ + newfdp->fd_freefile = nfds; + for (i = 0; i < nfds; ++i) { + if (fds[i] < 0 || fds[i] > fdp->fd_lastfile) { + /* File descriptor out of bounds. */ + error = EBADF; + goto bad; + } + ofde = &fdp->fd_ofiles[fds[i]]; + if (ofde->fde_file == NULL) { + /* Unused file descriptor. */ + error = EBADF; + goto bad; + } + if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) { + /* File descriptor cannot be passed. */ + error = EINVAL; + goto bad; + } + if (!fhold(nfde->fde_file)) { + error = EBADF; + goto bad; + } + nfde = &newfdp->fd_ofiles[i]; + *nfde = *ofde; + filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); + fdused_init(newfdp, i); + newfdp->fd_lastfile = i; + } + newfdp->fd_cmask = fdp->fd_cmask; + FILEDESC_SUNLOCK(fdp); + *ret = newfdp; + return (0); +bad: + FILEDESC_SUNLOCK(fdp); + fdescfree_remapped(newfdp); + return (error); +} + +/* + * Clear POSIX style locks. This is only used when fdp looses a reference (i.e. + * one of processes using it exits) and the table used to be shared. + */ +static void +fdclearlocks(struct thread *td) +{ + struct filedesc *fdp; + struct filedesc_to_leader *fdtol; + struct flock lf; + struct file *fp; + struct proc *p; + struct vnode *vp; + int i; + + p = td->td_proc; + fdp = p->p_fd; + fdtol = p->p_fdtol; + MPASS(fdtol != NULL); + + FILEDESC_XLOCK(fdp); + KASSERT(fdtol->fdl_refcount > 0, + ("filedesc_to_refcount botch: fdl_refcount=%d", + fdtol->fdl_refcount)); + if (fdtol->fdl_refcount == 1 && + (p->p_leader->p_flag & P_ADVLOCK) != 0) { + for (i = 0; i <= fdp->fd_lastfile; i++) { + fp = fdp->fd_ofiles[i].fde_file; + if (fp == NULL || fp->f_type != DTYPE_VNODE || + !fhold(fp)) + continue; + FILEDESC_XUNLOCK(fdp); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = fp->f_vnode; + (void) VOP_ADVLOCK(vp, + (caddr_t)p->p_leader, F_UNLCK, + &lf, F_POSIX); + FILEDESC_XLOCK(fdp); + fdrop(fp, td); + } + } +retry: + if (fdtol->fdl_refcount == 1) { + if (fdp->fd_holdleaderscount > 0 && + (p->p_leader->p_flag & P_ADVLOCK) != 0) { + /* + * close() or kern_dup() has cleared a reference + * in a shared file descriptor table. + */ + fdp->fd_holdleaderswakeup = 1; + sx_sleep(&fdp->fd_holdleaderscount, + FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); + goto retry; + } + if (fdtol->fdl_holdcount > 0) { + /* + * Ensure that fdtol->fdl_leader remains + * valid in closef(). + */ + fdtol->fdl_wakeup = 1; + sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, + "fdlhold", 0); + goto retry; + } + } + fdtol->fdl_refcount--; + if (fdtol->fdl_refcount == 0 && + fdtol->fdl_holdcount == 0) { + fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; + fdtol->fdl_prev->fdl_next = fdtol->fdl_next; + } else + fdtol = NULL; + p->p_fdtol = NULL; + FILEDESC_XUNLOCK(fdp); + if (fdtol != NULL) + free(fdtol, M_FILEDESC_TO_LEADER); +} + +/* + * Release a filedesc structure. 
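+ *
+ * If 'needclose' is set the files are genuinely closed via closef()
+ * because the table belonged to a live process; otherwise only the
+ * references held by the table are dropped, as is done for a remapped
+ * table that was never installed into a process.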
+ */ +static void +fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose) +{ + struct filedesc0 *fdp0; + struct freetable *ft, *tft; + struct filedescent *fde; + struct file *fp; + int i; + + for (i = 0; i <= fdp->fd_lastfile; i++) { + fde = &fdp->fd_ofiles[i]; + fp = fde->fde_file; + if (fp != NULL) { + fdefree_last(fde); + if (needclose) + (void) closef(fp, td); + else + fdrop(fp, td); + } + } + + if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) + free(fdp->fd_map, M_FILEDESC); + if (fdp->fd_nfiles > NDFILE) + free(fdp->fd_files, M_FILEDESC); + + fdp0 = (struct filedesc0 *)fdp; + SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) + free(ft->ft_table, M_FILEDESC); + + fddrop(fdp); +} + +void +fdescfree(struct thread *td) +{ + struct proc *p; + struct filedesc *fdp; + struct vnode *cdir, *jdir, *rdir; + + p = td->td_proc; + fdp = p->p_fd; + MPASS(fdp != NULL); + +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(p); + racct_set(p, RACCT_NOFILE, 0); + PROC_UNLOCK(p); + } +#endif + + if (p->p_fdtol != NULL) + fdclearlocks(td); + + PROC_LOCK(p); + p->p_fd = NULL; + PROC_UNLOCK(p); + + if (refcount_release(&fdp->fd_refcnt) == 0) + return; + + FILEDESC_XLOCK(fdp); + cdir = fdp->fd_cdir; + fdp->fd_cdir = NULL; + rdir = fdp->fd_rdir; + fdp->fd_rdir = NULL; + jdir = fdp->fd_jdir; + fdp->fd_jdir = NULL; + FILEDESC_XUNLOCK(fdp); + + if (cdir != NULL) + vrele(cdir); + if (rdir != NULL) + vrele(rdir); + if (jdir != NULL) + vrele(jdir); + + fdescfree_fds(td, fdp, 1); +} + +void +fdescfree_remapped(struct filedesc *fdp) +{ + + if (fdp->fd_cdir != NULL) + vrele(fdp->fd_cdir); + if (fdp->fd_rdir != NULL) + vrele(fdp->fd_rdir); + if (fdp->fd_jdir != NULL) + vrele(fdp->fd_jdir); + + fdescfree_fds(curthread, fdp, 0); +} + +/* + * For setugid programs, we don't want to people to use that setugidness + * to generate error messages which write to a file which otherwise would + * otherwise be off-limits to the process. We check for filesystems where + * the vnode can change out from under us after execve (like [lin]procfs). + * + * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is + * sufficient. We also don't check for setugidness since we know we are. + */ +static bool +is_unsafe(struct file *fp) +{ + struct vnode *vp; + + if (fp->f_type != DTYPE_VNODE) + return (false); + + vp = fp->f_vnode; + return ((vp->v_vflag & VV_PROCDEP) != 0); +} + +/* + * Make this setguid thing safe, if at all possible. + */ +void +fdsetugidsafety(struct thread *td) +{ + struct filedesc *fdp; + struct file *fp; + int i; + + fdp = td->td_proc->p_fd; + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); + MPASS(fdp->fd_nfiles >= 3); + for (i = 0; i <= 2; i++) { + fp = fdp->fd_ofiles[i].fde_file; + if (fp != NULL && is_unsafe(fp)) { + FILEDESC_XLOCK(fdp); + knote_fdclose(td, i); + /* + * NULL-out descriptor prior to close to avoid + * a race while close blocks. + */ + fdfree(fdp, i); + FILEDESC_XUNLOCK(fdp); + (void) closef(fp, td); + } + } +} + +/* + * If a specific file object occupies a specific file descriptor, close the + * file descriptor entry and drop a reference on the file object. This is a + * convenience function to handle a subsequent error in a function that calls + * falloc() that handles the race that another thread might have closed the + * file descriptor out from under the thread creating the file object. 
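+ *
+ * A hedged sketch of the unwind pattern this enables (illustrative only;
+ * my_setup() is a placeholder and the falloc() convenience wrapper is
+ * assumed to take (td, &fp, &fd, flags)):
+ *
+ *	error = falloc(td, &fp, &fd, 0);
+ *	if (error != 0)
+ *		return (error);
+ *	error = my_setup(fp);
+ *	if (error != 0) {
+ *		fdclose(td, fp, fd);
+ *		fdrop(fp, td);
+ *		return (error);
+ *	}
+ *
+ * fdclose() only tears down the table slot if it still refers to fp;
+ * the final fdrop() releases the caller's own reference.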
+ */ +void +fdclose(struct thread *td, struct file *fp, int idx) +{ + struct filedesc *fdp = td->td_proc->p_fd; + + FILEDESC_XLOCK(fdp); + if (fdp->fd_ofiles[idx].fde_file == fp) { + fdfree(fdp, idx); + FILEDESC_XUNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_XUNLOCK(fdp); +} + +/* + * Close any files on exec? + */ +void +fdcloseexec(struct thread *td) +{ + struct filedesc *fdp; + struct filedescent *fde; + struct file *fp; + int i; + + fdp = td->td_proc->p_fd; + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); + for (i = 0; i <= fdp->fd_lastfile; i++) { + fde = &fdp->fd_ofiles[i]; + fp = fde->fde_file; + if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || + (fde->fde_flags & UF_EXCLOSE))) { + FILEDESC_XLOCK(fdp); + fdfree(fdp, i); + (void) closefp(fdp, i, fp, td, 0); + FILEDESC_UNLOCK_ASSERT(fdp); + } + } +} + +/* + * It is unsafe for set[ug]id processes to be started with file + * descriptors 0..2 closed, as these descriptors are given implicit + * significance in the Standard C library. fdcheckstd() will create a + * descriptor referencing /dev/null for each of stdin, stdout, and + * stderr that is not already open. + */ +int +fdcheckstd(struct thread *td) +{ + struct filedesc *fdp; + register_t save; + int i, error, devnull; + + fdp = td->td_proc->p_fd; + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); + MPASS(fdp->fd_nfiles >= 3); + devnull = -1; + for (i = 0; i <= 2; i++) { + if (fdp->fd_ofiles[i].fde_file != NULL) + continue; + + save = td->td_retval[0]; + if (devnull != -1) { + error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); + } else { + error = kern_openat(td, AT_FDCWD, "/dev/null", + UIO_SYSSPACE, O_RDWR, 0); + if (error == 0) { + devnull = td->td_retval[0]; + KASSERT(devnull == i, ("we didn't get our fd")); + } + } + td->td_retval[0] = save; + if (error != 0) + return (error); + } + return (0); +} + +/* + * Internal form of close. Decrement reference count on file structure. + * Note: td may be NULL when closing a file that was being passed in a + * message. + */ +int +closef(struct file *fp, struct thread *td) +{ + struct vnode *vp; + struct flock lf; + struct filedesc_to_leader *fdtol; + struct filedesc *fdp; + + /* + * POSIX record locking dictates that any close releases ALL + * locks owned by this process. This is handled by setting + * a flag in the unlock to free ONLY locks obeying POSIX + * semantics, and not to free BSD-style file locks. + * If the descriptor was in a message, POSIX-style locks + * aren't passed with the descriptor, and the thread pointer + * will be NULL. Callers should be careful only to pass a + * NULL thread pointer when there really is no owning + * context that might have locks, or the locks will be + * leaked. + */ + if (fp->f_type == DTYPE_VNODE && td != NULL) { + vp = fp->f_vnode; + if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, + F_UNLCK, &lf, F_POSIX); + } + fdtol = td->td_proc->p_fdtol; + if (fdtol != NULL) { + /* + * Handle special case where file descriptor table is + * shared between multiple process leaders. 
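+			 * Every other leader on the list that has P_ADVLOCK
+			 * set gets its POSIX locks on this vnode released as
+			 * well; fdl_holdcount pins the fdtol entry while the
+			 * table lock is dropped around VOP_ADVLOCK().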
+ */ + fdp = td->td_proc->p_fd; + FILEDESC_XLOCK(fdp); + for (fdtol = fdtol->fdl_next; + fdtol != td->td_proc->p_fdtol; + fdtol = fdtol->fdl_next) { + if ((fdtol->fdl_leader->p_flag & + P_ADVLOCK) == 0) + continue; + fdtol->fdl_holdcount++; + FILEDESC_XUNLOCK(fdp); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = fp->f_vnode; + (void) VOP_ADVLOCK(vp, + (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, + F_POSIX); + FILEDESC_XLOCK(fdp); + fdtol->fdl_holdcount--; + if (fdtol->fdl_holdcount == 0 && + fdtol->fdl_wakeup != 0) { + fdtol->fdl_wakeup = 0; + wakeup(fdtol); + } + } + FILEDESC_XUNLOCK(fdp); + } + } + return (fdrop(fp, td)); +} + +/* + * Initialize the file pointer with the specified properties. + * + * The ops are set with release semantics to be certain that the flags, type, + * and data are visible when ops is. This is to prevent ops methods from being + * called with bad data. + */ +void +finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) +{ + fp->f_data = data; + fp->f_flag = flag; + fp->f_type = type; + atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); +} + +int +fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, + struct file **fpp, struct filecaps *havecapsp) +{ + struct filedescent *fde; + int error; + + FILEDESC_LOCK_ASSERT(fdp); + + fde = fdeget_locked(fdp, fd); + if (fde == NULL) { + error = EBADF; + goto out; + } + +#ifdef CAPABILITIES + error = cap_check(cap_rights_fde_inline(fde), needrightsp); + if (error != 0) + goto out; +#endif + + if (havecapsp != NULL) + filecaps_copy(&fde->fde_caps, havecapsp, true); + + *fpp = fde->fde_file; + + error = 0; +out: + return (error); +} + +int +fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, + struct file **fpp, struct filecaps *havecapsp) +{ + struct filedesc *fdp = td->td_proc->p_fd; + int error; +#ifndef CAPABILITIES + error = fget_unlocked(fdp, fd, needrightsp, fpp, NULL); + if (error == 0 && havecapsp != NULL) + filecaps_fill(havecapsp); +#else + struct file *fp; + seq_t seq; + + for (;;) { + error = fget_unlocked(fdp, fd, needrightsp, &fp, &seq); + if (error != 0) + return (error); + + if (havecapsp != NULL) { + if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, + havecapsp, false)) { + fdrop(fp, td); + goto get_locked; + } + } + + if (!fd_modified(fdp, fd, seq)) + break; + fdrop(fp, td); + } + + *fpp = fp; + return (0); + +get_locked: + FILEDESC_SLOCK(fdp); + error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp); + if (error == 0 && !fhold(*fpp)) + error = EBADF; + FILEDESC_SUNLOCK(fdp); +#endif + return (error); +} + +int +fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, + struct file **fpp, seq_t *seqp) +{ +#ifdef CAPABILITIES + const struct filedescent *fde; +#endif + const struct fdescenttbl *fdt; + struct file *fp; + u_int count; +#ifdef CAPABILITIES + seq_t seq; + cap_rights_t haverights; + int error; +#endif + + fdt = fdp->fd_files; + if ((u_int)fd >= fdt->fdt_nfiles) + return (EBADF); + /* + * Fetch the descriptor locklessly. We avoid fdrop() races by + * never raising a refcount above 0. To accomplish this we have + * to use a cmpset loop rather than an atomic_add. The descriptor + * must be re-verified once we acquire a reference to be certain + * that the identity is still correct and we did not lose a race + * due to preemption. 
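+	 * In particular, an unconditional atomic_add() could observe
+	 * f_count == 0 and "resurrect" a file that a concurrent fdrop()
+	 * is already tearing down; the loop below instead bails out and
+	 * re-reads the table whenever it sees a zero count.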
+ */ + for (;;) { +#ifdef CAPABILITIES + seq = seq_load(fd_seq(fdt, fd)); + fde = &fdt->fdt_ofiles[fd]; + haverights = *cap_rights_fde_inline(fde); + fp = fde->fde_file; + if (!seq_consistent(fd_seq(fdt, fd), seq)) + continue; +#else + fp = fdt->fdt_ofiles[fd].fde_file; +#endif + if (fp == NULL) + return (EBADF); +#ifdef CAPABILITIES + error = cap_check(&haverights, needrightsp); + if (error != 0) + return (error); +#endif + count = fp->f_count; + retry: + if (count == 0) { + /* + * Force a reload. Other thread could reallocate the + * table before this fd was closed, so it possible that + * there is a stale fp pointer in cached version. + */ + fdt = *(const struct fdescenttbl * const volatile *) + &(fdp->fd_files); + continue; + } + if (__predict_false(count + 1 < count)) + return (EBADF); + + /* + * Use an acquire barrier to force re-reading of fdt so it is + * refreshed for verification. + */ + if (__predict_false(atomic_fcmpset_acq_int(&fp->f_count, + &count, count + 1) == 0)) + goto retry; + fdt = fdp->fd_files; +#ifdef CAPABILITIES + if (seq_consistent_nomb(fd_seq(fdt, fd), seq)) +#else + if (fp == fdt->fdt_ofiles[fd].fde_file) +#endif + break; + fdrop(fp, curthread); + } + *fpp = fp; + if (seqp != NULL) { +#ifdef CAPABILITIES + *seqp = seq; +#endif + } + return (0); +} + +/* + * Extract the file pointer associated with the specified descriptor for the + * current user process. + * + * If the descriptor doesn't exist or doesn't match 'flags', EBADF is + * returned. + * + * File's rights will be checked against the capability rights mask. + * + * If an error occurred the non-zero error is returned and *fpp is set to + * NULL. Otherwise *fpp is held and set and zero is returned. Caller is + * responsible for fdrop(). + */ +static __inline int +_fget(struct thread *td, int fd, struct file **fpp, int flags, + cap_rights_t *needrightsp, seq_t *seqp) +{ + struct filedesc *fdp; + struct file *fp; + int error; + + *fpp = NULL; + fdp = td->td_proc->p_fd; + error = fget_unlocked(fdp, fd, needrightsp, &fp, seqp); + if (error != 0) + return (error); + if (fp->f_ops == &badfileops) { + fdrop(fp, td); + return (EBADF); + } + + /* + * FREAD and FWRITE failure return EBADF as per POSIX. + */ + error = 0; + switch (flags) { + case FREAD: + case FWRITE: + if ((fp->f_flag & flags) == 0) + error = EBADF; + break; + case FEXEC: + if ((fp->f_flag & (FREAD | FEXEC)) == 0 || + ((fp->f_flag & FWRITE) != 0)) + error = EBADF; + break; + case 0: + break; + default: + KASSERT(0, ("wrong flags")); + } + + if (error != 0) { + fdrop(fp, td); + return (error); + } + + *fpp = fp; + return (0); +} + +int +fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) +{ + + return (_fget(td, fd, fpp, 0, rightsp, NULL)); +} + +int +fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp, + struct file **fpp) +{ + int error; +#ifndef CAPABILITIES + error = _fget(td, fd, fpp, 0, rightsp, NULL); + if (maxprotp != NULL) + *maxprotp = VM_PROT_ALL; +#else + cap_rights_t fdrights; + struct filedesc *fdp = td->td_proc->p_fd; + seq_t seq; + + MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); + for (;;) { + error = _fget(td, fd, fpp, 0, rightsp, &seq); + if (error != 0) + return (error); + if (maxprotp != NULL) + fdrights = *cap_rights(fdp, fd); + if (!fd_modified(fdp, fd, seq)) + break; + fdrop(*fpp, td); + } + + /* + * If requested, convert capability rights to access flags. 
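+	 * (By assumption, cap_rights_to_vmprot() maps CAP_MMAP_R/W/X
+	 * onto VM_PROT_READ/WRITE/EXECUTE.)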
+ */ + if (maxprotp != NULL) + *maxprotp = cap_rights_to_vmprot(&fdrights); +#endif + return (error); +} + +int +fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) +{ + + return (_fget(td, fd, fpp, FREAD, rightsp, NULL)); +} + +int +fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) +{ + + return (_fget(td, fd, fpp, FWRITE, rightsp, NULL)); +} + +int +fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, + struct file **fpp) +{ + struct filedesc *fdp = td->td_proc->p_fd; +#ifndef CAPABILITIES + return (fget_unlocked(fdp, fd, rightsp, fpp, NULL)); +#else + int error; + seq_t seq; + + MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); + for (;;) { + error = fget_unlocked(fdp, fd, rightsp, fpp, &seq); + if (error != 0) + return (error); + error = cap_fcntl_check(fdp, fd, needfcntl); + if (!fd_modified(fdp, fd, seq)) + break; + fdrop(*fpp, td); + } + if (error != 0) { + fdrop(*fpp, td); + *fpp = NULL; + } + return (error); +#endif +} + +/* + * Like fget() but loads the underlying vnode, or returns an error if the + * descriptor does not represent a vnode. Note that pipes use vnodes but + * never have VM objects. The returned vnode will be vref()'d. + * + * XXX: what about the unused flags ? + */ +static __inline int +_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, + struct vnode **vpp) +{ + struct file *fp; + int error; + + *vpp = NULL; + error = _fget(td, fd, &fp, flags, needrightsp, NULL); + if (error != 0) + return (error); + if (fp->f_vnode == NULL) { + error = EINVAL; + } else { + *vpp = fp->f_vnode; + vrefact(*vpp); + } + fdrop(fp, td); + + return (error); +} + +int +fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) +{ + + return (_fgetvp(td, fd, 0, rightsp, vpp)); +} + +int +fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, + struct filecaps *havecaps, struct vnode **vpp) +{ + struct filedesc *fdp; + struct filecaps caps; + struct file *fp; + int error; + + fdp = td->td_proc->p_fd; + error = fget_cap_locked(fdp, fd, needrightsp, &fp, &caps); + if (error != 0) + return (error); + if (fp->f_ops == &badfileops) { + error = EBADF; + goto out; + } + if (fp->f_vnode == NULL) { + error = EINVAL; + goto out; + } + + *havecaps = caps; + *vpp = fp->f_vnode; + vrefact(*vpp); + + return (0); +out: + filecaps_free(&caps); + return (error); +} + +int +fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) +{ + + return (_fgetvp(td, fd, FREAD, rightsp, vpp)); +} + +int +fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) +{ + + return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); +} + +#ifdef notyet +int +fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, + struct vnode **vpp) +{ + + return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); +} +#endif + +/* + * Handle the last reference to a file being closed. + * + * Without the noinline attribute clang keeps inlining the func thorough this + * file when fdrop is used. + */ +int __noinline +_fdrop(struct file *fp, struct thread *td) +{ + int error; + + if (fp->f_count != 0) + panic("fdrop: count %d", fp->f_count); + error = fo_close(fp, td); + atomic_subtract_int(&openfiles, 1); + crfree(fp->f_cred); + free(fp->f_advice, M_FADVISE); + uma_zfree(file_zone, fp); + + return (error); +} + +/* + * Apply an advisory lock on a file descriptor. 
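+ * This backs flock(2); from userland the call looks roughly like
+ *
+ *	if (flock(fd, LOCK_EX | LOCK_NB) == -1 && errno == EWOULDBLOCK)
+ *		handle the contended case;
+ *
+ * (illustrative only).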
+ * + * Just attempt to get a record lock of the requested type on the entire file + * (l_whence = SEEK_SET, l_start = 0, l_len = 0). + */ +#ifndef _SYS_SYSPROTO_H_ +struct flock_args { + int fd; + int how; +}; +#endif +/* ARGSUSED */ +int +sys_flock(struct thread *td, struct flock_args *uap) +{ + struct file *fp; + struct vnode *vp; + struct flock lf; + int error; + + error = fget(td, uap->fd, &cap_flock_rights, &fp); + if (error != 0) + return (error); + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); + return (EOPNOTSUPP); + } + + vp = fp->f_vnode; + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (uap->how & LOCK_UN) { + lf.l_type = F_UNLCK; + atomic_clear_int(&fp->f_flag, FHASLOCK); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); + goto done2; + } + if (uap->how & LOCK_EX) + lf.l_type = F_WRLCK; + else if (uap->how & LOCK_SH) + lf.l_type = F_RDLCK; + else { + error = EBADF; + goto done2; + } + atomic_set_int(&fp->f_flag, FHASLOCK); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); +done2: + fdrop(fp, td); + return (error); +} +/* + * Duplicate the specified descriptor to a free descriptor. + */ +int +dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, + int openerror, int *indxp) +{ + struct filedescent *newfde, *oldfde; + struct file *fp; + u_long *ioctls; + int error, indx; + + KASSERT(openerror == ENODEV || openerror == ENXIO, + ("unexpected error %d in %s", openerror, __func__)); + + /* + * If the to-be-dup'd fd number is greater than the allowed number + * of file descriptors, or the fd to be dup'd has already been + * closed, then reject. + */ + FILEDESC_XLOCK(fdp); + if ((fp = fget_locked(fdp, dfd)) == NULL) { + FILEDESC_XUNLOCK(fdp); + return (EBADF); + } + + error = fdalloc(td, 0, &indx); + if (error != 0) { + FILEDESC_XUNLOCK(fdp); + return (error); + } + + /* + * There are two cases of interest here. + * + * For ENODEV simply dup (dfd) to file descriptor (indx) and return. + * + * For ENXIO steal away the file structure from (dfd) and store it in + * (indx). (dfd) is effectively closed by this operation. + */ + switch (openerror) { + case ENODEV: + /* + * Check that the mode the file is being opened for is a + * subset of the mode of the existing descriptor. + */ + if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { + fdunused(fdp, indx); + FILEDESC_XUNLOCK(fdp); + return (EACCES); + } + if (!fhold(fp)) { + fdunused(fdp, indx); + FILEDESC_XUNLOCK(fdp); + return (EBADF); + } + newfde = &fdp->fd_ofiles[indx]; + oldfde = &fdp->fd_ofiles[dfd]; + ioctls = filecaps_copy_prep(&oldfde->fde_caps); +#ifdef CAPABILITIES + seq_write_begin(&newfde->fde_seq); +#endif + memcpy(newfde, oldfde, fde_change_size); + filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, + ioctls); +#ifdef CAPABILITIES + seq_write_end(&newfde->fde_seq); +#endif + break; + case ENXIO: + /* + * Steal away the file pointer from dfd and stuff it into indx. + */ + newfde = &fdp->fd_ofiles[indx]; + oldfde = &fdp->fd_ofiles[dfd]; +#ifdef CAPABILITIES + seq_write_begin(&newfde->fde_seq); +#endif + memcpy(newfde, oldfde, fde_change_size); + oldfde->fde_file = NULL; + fdunused(fdp, dfd); +#ifdef CAPABILITIES + seq_write_end(&newfde->fde_seq); +#endif + break; + } + FILEDESC_XUNLOCK(fdp); + *indxp = indx; + return (0); +} + +/* + * This sysctl determines if we will allow a process to chroot(2) if it + * has a directory open: + * 0: disallowed for all processes. 
+ * 1: allowed for processes that were not already chroot(2)'ed. + * 2: allowed for all processes. + */ + +static int chroot_allow_open_directories = 1; + +SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, + &chroot_allow_open_directories, 0, + "Allow a process to chroot(2) if it has a directory open"); + +/* + * Helper function for raised chroot(2) security function: Refuse if + * any filedescriptors are open directories. + */ +static int +chroot_refuse_vdir_fds(struct filedesc *fdp) +{ + struct vnode *vp; + struct file *fp; + int fd; + + FILEDESC_LOCK_ASSERT(fdp); + + for (fd = 0; fd <= fdp->fd_lastfile; fd++) { + fp = fget_locked(fdp, fd); + if (fp == NULL) + continue; + if (fp->f_type == DTYPE_VNODE) { + vp = fp->f_vnode; + if (vp->v_type == VDIR) + return (EPERM); + } + } + return (0); +} + +/* + * Common routine for kern_chroot() and jail_attach(). The caller is + * responsible for invoking priv_check() and mac_vnode_check_chroot() to + * authorize this operation. + */ +int +pwd_chroot(struct thread *td, struct vnode *vp) +{ + struct filedesc *fdp; + struct vnode *oldvp; + int error; + + fdp = td->td_proc->p_fd; + FILEDESC_XLOCK(fdp); + if (chroot_allow_open_directories == 0 || + (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { + error = chroot_refuse_vdir_fds(fdp); + if (error != 0) { + FILEDESC_XUNLOCK(fdp); + return (error); + } + } + oldvp = fdp->fd_rdir; + vrefact(vp); + fdp->fd_rdir = vp; + if (fdp->fd_jdir == NULL) { + vrefact(vp); + fdp->fd_jdir = vp; + } + FILEDESC_XUNLOCK(fdp); + vrele(oldvp); + return (0); +} + +void +pwd_chdir(struct thread *td, struct vnode *vp) +{ + struct filedesc *fdp; + struct vnode *oldvp; + + fdp = td->td_proc->p_fd; + FILEDESC_XLOCK(fdp); + VNASSERT(vp->v_usecount > 0, vp, + ("chdir to a vnode with zero usecount")); + oldvp = fdp->fd_cdir; + fdp->fd_cdir = vp; + FILEDESC_XUNLOCK(fdp); + vrele(oldvp); +} + +/* + * Scan all active processes and prisons to see if any of them have a current + * or root directory of `olddp'. If so, replace them with the new mount point. 
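+ * Each replaced reference takes a fresh hold on 'newdp'; the matching
+ * vrele() calls on 'olddp' are batched and performed once all tables,
+ * prisons and the global rootvnode have been updated.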
+ */ +void +mountcheckdirs(struct vnode *olddp, struct vnode *newdp) +{ + struct filedesc *fdp; + struct prison *pr; + struct proc *p; + int nrele; + + if (vrefcnt(olddp) == 1) + return; + nrele = 0; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + continue; + FILEDESC_XLOCK(fdp); + if (fdp->fd_cdir == olddp) { + vrefact(newdp); + fdp->fd_cdir = newdp; + nrele++; + } + if (fdp->fd_rdir == olddp) { + vrefact(newdp); + fdp->fd_rdir = newdp; + nrele++; + } + if (fdp->fd_jdir == olddp) { + vrefact(newdp); + fdp->fd_jdir = newdp; + nrele++; + } + FILEDESC_XUNLOCK(fdp); + fddrop(fdp); + } + sx_sunlock(&allproc_lock); + if (rootvnode == olddp) { + vrefact(newdp); + rootvnode = newdp; + nrele++; + } + mtx_lock(&prison0.pr_mtx); + if (prison0.pr_root == olddp) { + vrefact(newdp); + prison0.pr_root = newdp; + nrele++; + } + mtx_unlock(&prison0.pr_mtx); + sx_slock(&allprison_lock); + TAILQ_FOREACH(pr, &allprison, pr_list) { + mtx_lock(&pr->pr_mtx); + if (pr->pr_root == olddp) { + vrefact(newdp); + pr->pr_root = newdp; + nrele++; + } + mtx_unlock(&pr->pr_mtx); + } + sx_sunlock(&allprison_lock); + while (nrele--) + vrele(olddp); +} + +struct filedesc_to_leader * +filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) +{ + struct filedesc_to_leader *fdtol; + + fdtol = malloc(sizeof(struct filedesc_to_leader), + M_FILEDESC_TO_LEADER, M_WAITOK); + fdtol->fdl_refcount = 1; + fdtol->fdl_holdcount = 0; + fdtol->fdl_wakeup = 0; + fdtol->fdl_leader = leader; + if (old != NULL) { + FILEDESC_XLOCK(fdp); + fdtol->fdl_next = old->fdl_next; + fdtol->fdl_prev = old; + old->fdl_next = fdtol; + fdtol->fdl_next->fdl_prev = fdtol; + FILEDESC_XUNLOCK(fdp); + } else { + fdtol->fdl_next = fdtol; + fdtol->fdl_prev = fdtol; + } + return (fdtol); +} + +static int +sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) +{ + struct filedesc *fdp; + int i, count, slots; + + if (*(int *)arg1 != 0) + return (EINVAL); + + fdp = curproc->p_fd; + count = 0; + FILEDESC_SLOCK(fdp); + slots = NDSLOTS(fdp->fd_lastfile + 1); + for (i = 0; i < slots; i++) + count += bitcountl(fdp->fd_map[i]); + FILEDESC_SUNLOCK(fdp); + + return (SYSCTL_OUT(req, &count, sizeof(count))); +} + +static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, + CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, + "Number of open file descriptors"); + +/* + * Get file structures globally. + */ +static int +sysctl_kern_file(SYSCTL_HANDLER_ARGS) +{ + struct xfile xf; + struct filedesc *fdp; + struct file *fp; + struct proc *p; + int error, n; + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + if (req->oldptr == NULL) { + n = 0; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_state == PRS_NEW) { + PROC_UNLOCK(p); + continue; + } + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + continue; + /* overestimates sparse tables. 
*/ + if (fdp->fd_lastfile > 0) + n += fdp->fd_lastfile; + fddrop(fdp); + } + sx_sunlock(&allproc_lock); + return (SYSCTL_OUT(req, 0, n * sizeof(xf))); + } + error = 0; + bzero(&xf, sizeof(xf)); + xf.xf_size = sizeof(xf); + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_state == PRS_NEW) { + PROC_UNLOCK(p); + continue; + } + if (p_cansee(req->td, p) != 0) { + PROC_UNLOCK(p); + continue; + } + xf.xf_pid = p->p_pid; + xf.xf_uid = p->p_ucred->cr_uid; + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + continue; + FILEDESC_SLOCK(fdp); + for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) { + if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) + continue; + xf.xf_fd = n; + xf.xf_file = (uintptr_t)fp; + xf.xf_data = (uintptr_t)fp->f_data; + xf.xf_vnode = (uintptr_t)fp->f_vnode; + xf.xf_type = (uintptr_t)fp->f_type; + xf.xf_count = fp->f_count; + xf.xf_msgcount = 0; + xf.xf_offset = foffset_get(fp); + xf.xf_flag = fp->f_flag; + error = SYSCTL_OUT(req, &xf, sizeof(xf)); + if (error) + break; + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); + if (error) + break; + } + sx_sunlock(&allproc_lock); + return (error); +} + +SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, + 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); + +#ifdef KINFO_FILE_SIZE +CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); +#endif + +static int +xlate_fflags(int fflags) +{ + static const struct { + int fflag; + int kf_fflag; + } fflags_table[] = { + { FAPPEND, KF_FLAG_APPEND }, + { FASYNC, KF_FLAG_ASYNC }, + { FFSYNC, KF_FLAG_FSYNC }, + { FHASLOCK, KF_FLAG_HASLOCK }, + { FNONBLOCK, KF_FLAG_NONBLOCK }, + { FREAD, KF_FLAG_READ }, + { FWRITE, KF_FLAG_WRITE }, + { O_CREAT, KF_FLAG_CREAT }, + { O_DIRECT, KF_FLAG_DIRECT }, + { O_EXCL, KF_FLAG_EXCL }, + { O_EXEC, KF_FLAG_EXEC }, + { O_EXLOCK, KF_FLAG_EXLOCK }, + { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, + { O_SHLOCK, KF_FLAG_SHLOCK }, + { O_TRUNC, KF_FLAG_TRUNC } + }; + unsigned int i; + int kflags; + + kflags = 0; + for (i = 0; i < nitems(fflags_table); i++) + if (fflags & fflags_table[i].fflag) + kflags |= fflags_table[i].kf_fflag; + return (kflags); +} + +/* Trim unused data from kf_path by truncating the structure size. */ +void +pack_kinfo(struct kinfo_file *kif) +{ + + kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + + strlen(kif->kf_path) + 1; + kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); +} + +static void +export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, + struct kinfo_file *kif, struct filedesc *fdp, int flags) +{ + int error; + + bzero(kif, sizeof(*kif)); + + /* Set a default type to allow for empty fill_kinfo() methods. */ + kif->kf_type = KF_TYPE_UNKNOWN; + kif->kf_flags = xlate_fflags(fp->f_flag); + if (rightsp != NULL) + kif->kf_cap_rights = *rightsp; + else + cap_rights_init(&kif->kf_cap_rights); + kif->kf_fd = fd; + kif->kf_ref_count = fp->f_count; + kif->kf_offset = foffset_get(fp); + + /* + * This may drop the filedesc lock, so the 'fp' cannot be + * accessed after this call. 
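+	 * (fo_fill_kinfo() methods may sleep, e.g. while resolving a
+	 * vnode path, which is why they are permitted to drop it.)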
+ */ + error = fo_fill_kinfo(fp, kif, fdp); + if (error == 0) + kif->kf_status |= KF_ATTR_VALID; + if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) + pack_kinfo(kif); + else + kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); +} + +static void +export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, + struct kinfo_file *kif, int flags) +{ + int error; + + bzero(kif, sizeof(*kif)); + + kif->kf_type = KF_TYPE_VNODE; + error = vn_fill_kinfo_vnode(vp, kif); + if (error == 0) + kif->kf_status |= KF_ATTR_VALID; + kif->kf_flags = xlate_fflags(fflags); + cap_rights_init(&kif->kf_cap_rights); + kif->kf_fd = fd; + kif->kf_ref_count = -1; + kif->kf_offset = -1; + if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) + pack_kinfo(kif); + else + kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); + vrele(vp); +} + +struct export_fd_buf { + struct filedesc *fdp; + struct sbuf *sb; + ssize_t remainder; + struct kinfo_file kif; + int flags; +}; + +static int +export_kinfo_to_sb(struct export_fd_buf *efbuf) +{ + struct kinfo_file *kif; + + kif = &efbuf->kif; + if (efbuf->remainder != -1) { + if (efbuf->remainder < kif->kf_structsize) { + /* Terminate export. */ + efbuf->remainder = 0; + return (0); + } + efbuf->remainder -= kif->kf_structsize; + } + return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM); +} + +static int +export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, + struct export_fd_buf *efbuf) +{ + int error; + + if (efbuf->remainder == 0) + return (0); + export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, + efbuf->flags); + FILEDESC_SUNLOCK(efbuf->fdp); + error = export_kinfo_to_sb(efbuf); + FILEDESC_SLOCK(efbuf->fdp); + return (error); +} + +static int +export_vnode_to_sb(struct vnode *vp, int fd, int fflags, + struct export_fd_buf *efbuf) +{ + int error; + + if (efbuf->remainder == 0) + return (0); + if (efbuf->fdp != NULL) + FILEDESC_SUNLOCK(efbuf->fdp); + export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); + error = export_kinfo_to_sb(efbuf); + if (efbuf->fdp != NULL) + FILEDESC_SLOCK(efbuf->fdp); + return (error); +} + +/* + * Store a process file descriptor information to sbuf. + * + * Takes a locked proc as argument, and returns with the proc unlocked. + */ +int +kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, + int flags) +{ + struct file *fp; + struct filedesc *fdp; + struct export_fd_buf *efbuf; + struct vnode *cttyvp, *textvp, *tracevp; + int error, i; + cap_rights_t rights; + + PROC_LOCK_ASSERT(p, MA_OWNED); + + /* ktrace vnode */ + tracevp = p->p_tracevp; + if (tracevp != NULL) + vrefact(tracevp); + /* text vnode */ + textvp = p->p_textvp; + if (textvp != NULL) + vrefact(textvp); + /* Controlling tty. 
*/ + cttyvp = NULL; + if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { + cttyvp = p->p_pgrp->pg_session->s_ttyvp; + if (cttyvp != NULL) + vrefact(cttyvp); + } + fdp = fdhold(p); + PROC_UNLOCK(p); + efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); + efbuf->fdp = NULL; + efbuf->sb = sb; + efbuf->remainder = maxlen; + efbuf->flags = flags; + if (tracevp != NULL) + export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE, + efbuf); + if (textvp != NULL) + export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf); + if (cttyvp != NULL) + export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE, + efbuf); + error = 0; + if (fdp == NULL) + goto fail; + efbuf->fdp = fdp; + FILEDESC_SLOCK(fdp); + /* working directory */ + if (fdp->fd_cdir != NULL) { + vrefact(fdp->fd_cdir); + export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); + } + /* root directory */ + if (fdp->fd_rdir != NULL) { + vrefact(fdp->fd_rdir); + export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf); + } + /* jail directory */ + if (fdp->fd_jdir != NULL) { + vrefact(fdp->fd_jdir); + export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf); + } + for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { + if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) + continue; +#ifdef CAPABILITIES + rights = *cap_rights(fdp, i); +#else /* !CAPABILITIES */ + rights = cap_no_rights; +#endif + /* + * Create sysctl entry. It is OK to drop the filedesc + * lock inside of export_file_to_sb() as we will + * re-validate and re-evaluate its properties when the + * loop continues. + */ + error = export_file_to_sb(fp, i, &rights, efbuf); + if (error != 0 || efbuf->remainder == 0) + break; + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); +fail: + free(efbuf, M_TEMP); + return (error); +} + +#define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) + +/* + * Get per-process file descriptors for use by procstat(1), et al. + */ +static int +sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) +{ + struct sbuf sb; + struct proc *p; + ssize_t maxlen; + int error, error2, *name; + + name = (int *)arg1; + + sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); + sbuf_clear_flags(&sb, SBUF_INCLUDENUL); + error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); + if (error != 0) { + sbuf_delete(&sb); + return (error); + } + maxlen = req->oldptr != NULL ? req->oldlen : -1; + error = kern_proc_filedesc_out(p, &sb, maxlen, + KERN_FILEDESC_PACK_KINFO); + error2 = sbuf_finish(&sb); + sbuf_delete(&sb); + return (error != 0 ? 
error : error2); +} + +#ifdef COMPAT_FREEBSD7 +#ifdef KINFO_OFILE_SIZE +CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); +#endif + +static void +kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) +{ + + okif->kf_structsize = sizeof(*okif); + okif->kf_type = kif->kf_type; + okif->kf_fd = kif->kf_fd; + okif->kf_ref_count = kif->kf_ref_count; + okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | + KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | + KF_FLAG_DIRECT | KF_FLAG_HASLOCK); + okif->kf_offset = kif->kf_offset; + if (kif->kf_type == KF_TYPE_VNODE) + okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type; + else + okif->kf_vnode_type = KF_VTYPE_VNON; + strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); + if (kif->kf_type == KF_TYPE_SOCKET) { + okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0; + okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0; + okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0; + okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local; + okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer; + } else { + okif->kf_sa_local.ss_family = AF_UNSPEC; + okif->kf_sa_peer.ss_family = AF_UNSPEC; + } +} + +static int +export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, + struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req) +{ + int error; + + vrefact(vp); + FILEDESC_SUNLOCK(fdp); + export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO); + kinfo_to_okinfo(kif, okif); + error = SYSCTL_OUT(req, okif, sizeof(*okif)); + FILEDESC_SLOCK(fdp); + return (error); +} + +/* + * Get per-process file descriptors for use by procstat(1), et al. + */ +static int +sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) +{ + struct kinfo_ofile *okif; + struct kinfo_file *kif; + struct filedesc *fdp; + int error, i, *name; + struct file *fp; + struct proc *p; + + name = (int *)arg1; + error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); + if (error != 0) + return (error); + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + return (ENOENT); + kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); + okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); + FILEDESC_SLOCK(fdp); + if (fdp->fd_cdir != NULL) + export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, + okif, fdp, req); + if (fdp->fd_rdir != NULL) + export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, + okif, fdp, req); + if (fdp->fd_jdir != NULL) + export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, + okif, fdp, req); + for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { + if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) + continue; + export_file_to_kinfo(fp, i, NULL, kif, fdp, + KERN_FILEDESC_PACK_KINFO); + FILEDESC_SUNLOCK(fdp); + kinfo_to_okinfo(kif, okif); + error = SYSCTL_OUT(req, okif, sizeof(*okif)); + FILEDESC_SLOCK(fdp); + if (error) + break; + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); + free(kif, M_TEMP); + free(okif, M_TEMP); + return (0); +} + +static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, + CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, + "Process ofiledesc entries"); +#endif /* COMPAT_FREEBSD7 */ + +int +vntype_to_kinfo(int vtype) +{ + struct { + int vtype; + int kf_vtype; + } vtypes_table[] = { + { VBAD, KF_VTYPE_VBAD }, + { VBLK, KF_VTYPE_VBLK }, + { VCHR, KF_VTYPE_VCHR }, + { VDIR, KF_VTYPE_VDIR }, + { VFIFO, KF_VTYPE_VFIFO }, + { VLNK, KF_VTYPE_VLNK }, + { VNON, KF_VTYPE_VNON }, + { VREG, KF_VTYPE_VREG }, + { VSOCK, KF_VTYPE_VSOCK } + }; + 
unsigned int i; + + /* + * Perform vtype translation. + */ + for (i = 0; i < nitems(vtypes_table); i++) + if (vtypes_table[i].vtype == vtype) + return (vtypes_table[i].kf_vtype); + + return (KF_VTYPE_UNKNOWN); +} + +static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, + CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, + "Process filedesc entries"); + +/* + * Store a process current working directory information to sbuf. + * + * Takes a locked proc as argument, and returns with the proc unlocked. + */ +int +kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) +{ + struct filedesc *fdp; + struct export_fd_buf *efbuf; + int error; + + PROC_LOCK_ASSERT(p, MA_OWNED); + + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + return (EINVAL); + + efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); + efbuf->fdp = fdp; + efbuf->sb = sb; + efbuf->remainder = maxlen; + + FILEDESC_SLOCK(fdp); + if (fdp->fd_cdir == NULL) + error = EINVAL; + else { + vrefact(fdp->fd_cdir); + error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, + FREAD, efbuf); + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); + free(efbuf, M_TEMP); + return (error); +} + +/* + * Get per-process current working directory. + */ +static int +sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) +{ + struct sbuf sb; + struct proc *p; + ssize_t maxlen; + int error, error2, *name; + + name = (int *)arg1; + + sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); + sbuf_clear_flags(&sb, SBUF_INCLUDENUL); + error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); + if (error != 0) { + sbuf_delete(&sb); + return (error); + } + maxlen = req->oldptr != NULL ? req->oldlen : -1; + error = kern_proc_cwd_out(p, &sb, maxlen); + error2 = sbuf_finish(&sb); + sbuf_delete(&sb); + return (error != 0 ? error : error2); +} + +static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, + sysctl_kern_proc_cwd, "Process current working directory"); + +#ifdef DDB +/* + * For the purposes of debugging, generate a human-readable string for the + * file type. + */ +static const char * +file_type_to_name(short type) +{ + + switch (type) { + case 0: + return ("zero"); + case DTYPE_VNODE: + return ("vnode"); + case DTYPE_SOCKET: + return ("socket"); + case DTYPE_PIPE: + return ("pipe"); + case DTYPE_FIFO: + return ("fifo"); + case DTYPE_KQUEUE: + return ("kqueue"); + case DTYPE_CRYPTO: + return ("crypto"); + case DTYPE_MQUEUE: + return ("mqueue"); + case DTYPE_SHM: + return ("shm"); + case DTYPE_SEM: + return ("ksem"); + case DTYPE_PTS: + return ("pts"); + case DTYPE_DEV: + return ("dev"); + case DTYPE_PROCDESC: + return ("proc"); + case DTYPE_LINUXEFD: + return ("levent"); + case DTYPE_LINUXTFD: + return ("ltimer"); + default: + return ("unkn"); + } +} + +/* + * For the purposes of debugging, identify a process (if any, perhaps one of + * many) that references the passed file in its file descriptor array. Return + * NULL if none. 
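+ * This is only reached from the DDB commands below, so it deliberately
+ * walks the process list without taking any locks.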
+ */ +static struct proc * +file_to_first_proc(struct file *fp) +{ + struct filedesc *fdp; + struct proc *p; + int n; + + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + fdp = p->p_fd; + if (fdp == NULL) + continue; + for (n = 0; n <= fdp->fd_lastfile; n++) { + if (fp == fdp->fd_ofiles[n].fde_file) + return (p); + } + } + return (NULL); +} + +static void +db_print_file(struct file *fp, int header) +{ +#define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4)) + struct proc *p; + + if (header) + db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n", + XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag", + "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID", + "FCmd"); + p = file_to_first_proc(fp); + db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH, + fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data, + fp->f_flag, 0, fp->f_count, 0, XPTRWIDTH, fp->f_vnode, + p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); + +#undef XPTRWIDTH +} + +DB_SHOW_COMMAND(file, db_show_file) +{ + struct file *fp; + + if (!have_addr) { + db_printf("usage: show file \n"); + return; + } + fp = (struct file *)addr; + db_print_file(fp, 1); +} + +DB_SHOW_COMMAND(files, db_show_files) +{ + struct filedesc *fdp; + struct file *fp; + struct proc *p; + int header; + int n; + + header = 1; + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + if ((fdp = p->p_fd) == NULL) + continue; + for (n = 0; n <= fdp->fd_lastfile; ++n) { + if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) + continue; + db_print_file(fp, header); + header = 0; + } + } +} +#endif + +SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, + &maxfilesperproc, 0, "Maximum files allowed open per process"); + +SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, + &maxfiles, 0, "Maximum number of files"); + +SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, + __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); + +/* ARGSUSED*/ +static void +filelistinit(void *dummy) +{ + + file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); +} +SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); + +/*-------------------------------------------------------------------*/ + +static int +badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) +{ + + return (EINVAL); +} + +static int +badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, + struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + + return (0); +} + +static int +badfo_kqfilter(struct file *fp, struct knote *kn) +{ + + return (EBADF); +} + +static int +badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, + struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_close(struct file *fp, struct thread *td) +{ + + return (0); +} + +static int +badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, + struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred 
*active_cred, + struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, + struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, + struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) +{ + + return (0); +} + +struct fileops badfileops = { + .fo_read = badfo_readwrite, + .fo_write = badfo_readwrite, + .fo_truncate = badfo_truncate, + .fo_ioctl = badfo_ioctl, + .fo_poll = badfo_poll, + .fo_kqfilter = badfo_kqfilter, + .fo_stat = badfo_stat, + .fo_close = badfo_close, + .fo_chmod = badfo_chmod, + .fo_chown = badfo_chown, + .fo_sendfile = badfo_sendfile, + .fo_fill_kinfo = badfo_fill_kinfo, +}; + +int +invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + return (EOPNOTSUPP); +} + +int +invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) +{ + + return (EINVAL); +} + +int +invfo_ioctl(struct file *fp, u_long com, void *data, + struct ucred *active_cred, struct thread *td) +{ + + return (ENOTTY); +} + +int +invfo_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + + return (poll_no_poll(events)); +} + +int +invfo_kqfilter(struct file *fp, struct knote *kn) +{ + + return (EINVAL); +} + +int +invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, + struct thread *td) +{ + + return (EINVAL); +} + +int +invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, + struct thread *td) +{ + + return (EINVAL); +} + +int +invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, + struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, + struct thread *td) +{ + + return (EINVAL); +} + +/*-------------------------------------------------------------------*/ + +/* + * File Descriptor pseudo-device driver (/dev/fd/). + * + * Opening minor device N dup()s the file (if any) connected to file + * descriptor N belonging to the calling process. Note that this driver + * consists of only the ``open()'' routine, because all subsequent + * references to this file will be direct to the other driver. + * + * XXX: we could give this one a cloning event handler if necessary. + */ + +/* ARGSUSED */ +static int +fdopen(struct cdev *dev, int mode, int type, struct thread *td) +{ + + /* + * XXX Kludge: set curthread->td_dupfd to contain the value of the + * the file descriptor being sought for duplication. The error + * return ensures that the vnode for this device will be released + * by vn_open. Open will detect this special error and take the + * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN + * will simply report the error. 
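+	 *
+	 * Seen from userland the net effect is that, for example,
+	 *
+	 *	fd = open("/dev/fd/0", O_RDONLY);
+	 *
+	 * behaves like dup(0), provided the requested access mode is a
+	 * subset of how descriptor 0 is already open (see the ENODEV
+	 * case in dupfdopen() above).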
+ */ + td->td_dupfd = dev2unit(dev); + return (ENODEV); +} + +static struct cdevsw fildesc_cdevsw = { + .d_version = D_VERSION, + .d_open = fdopen, + .d_name = "FD", +}; + +static void +fildesc_drvinit(void *unused) +{ + struct cdev *dev; + + dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, + UID_ROOT, GID_WHEEL, 0666, "fd/0"); + make_dev_alias(dev, "stdin"); + dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, + UID_ROOT, GID_WHEEL, 0666, "fd/1"); + make_dev_alias(dev, "stdout"); + dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, + UID_ROOT, GID_WHEEL, 0666, "fd/2"); + make_dev_alias(dev, "stderr"); +} + +SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); diff --git a/freebsd/sys/kern/kern_lock.c b/freebsd/sys/kern/kern_lock.c new file mode 100644 index 00000000..d769a185 --- /dev/null +++ b/freebsd/sys/kern/kern_lock.c @@ -0,0 +1,1719 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2008 Attilio Rao + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice(s), this list of conditions and the following disclaimer as + * the first lines of this file unmodified other than the possible + * addition of one or more copyright notices. + * 2. Redistributions in binary form must reproduce the above copyright + * notice(s), this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ + +#include "opt_ddb.h" +#include "opt_hwpmc_hooks.h" + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef DEBUG_LOCKS +#include +#endif +#include +#include + +#include + +#ifdef DDB +#include +#endif + +#ifdef HWPMC_HOOKS +#include +PMC_SOFT_DECLARE( , , lock, failed); +#endif + +CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) == + (LK_ADAPTIVE | LK_NOSHARE)); +CTASSERT(LK_UNLOCKED == (LK_UNLOCKED & + ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS))); + +#define SQ_EXCLUSIVE_QUEUE 0 +#define SQ_SHARED_QUEUE 1 + +#ifndef INVARIANTS +#define _lockmgr_assert(lk, what, file, line) +#endif + +#define TD_SLOCKS_INC(td) ((td)->td_lk_slocks++) +#define TD_SLOCKS_DEC(td) ((td)->td_lk_slocks--) + +#ifndef DEBUG_LOCKS +#define STACK_PRINT(lk) +#define STACK_SAVE(lk) +#define STACK_ZERO(lk) +#else +#define STACK_PRINT(lk) stack_print_ddb(&(lk)->lk_stack) +#define STACK_SAVE(lk) stack_save(&(lk)->lk_stack) +#define STACK_ZERO(lk) stack_zero(&(lk)->lk_stack) +#endif + +#define LOCK_LOG2(lk, string, arg1, arg2) \ + if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ + CTR2(KTR_LOCK, (string), (arg1), (arg2)) +#define LOCK_LOG3(lk, string, arg1, arg2, arg3) \ + if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ + CTR3(KTR_LOCK, (string), (arg1), (arg2), (arg3)) + +#define GIANT_DECLARE \ + int _i = 0; \ + WITNESS_SAVE_DECL(Giant) +#define GIANT_RESTORE() do { \ + if (_i > 0) { \ + while (_i--) \ + mtx_lock(&Giant); \ + WITNESS_RESTORE(&Giant.lock_object, Giant); \ + } \ +} while (0) +#define GIANT_SAVE() do { \ + if (mtx_owned(&Giant)) { \ + WITNESS_SAVE(&Giant.lock_object, Giant); \ + while (mtx_owned(&Giant)) { \ + _i++; \ + mtx_unlock(&Giant); \ + } \ + } \ +} while (0) + +static bool __always_inline +LK_CAN_SHARE(uintptr_t x, int flags, bool fp) +{ + + if ((x & (LK_SHARE | LK_EXCLUSIVE_WAITERS | LK_EXCLUSIVE_SPINNERS)) == + LK_SHARE) + return (true); + if (fp || (!(x & LK_SHARE))) + return (false); + if ((curthread->td_lk_slocks != 0 && !(flags & LK_NODDLKTREAT)) || + (curthread->td_pflags & TDP_DEADLKTREAT)) + return (true); + return (false); +} + +#define LK_TRYOP(x) \ + ((x) & LK_NOWAIT) + +#define LK_CAN_WITNESS(x) \ + (((x) & LK_NOWITNESS) == 0 && !LK_TRYOP(x)) +#define LK_TRYWIT(x) \ + (LK_TRYOP(x) ? 
LOP_TRYLOCK : 0) + +#define LK_CAN_ADAPT(lk, f) \ + (((lk)->lock_object.lo_flags & LK_ADAPTIVE) != 0 && \ + ((f) & LK_SLEEPFAIL) == 0) + +#define lockmgr_disowned(lk) \ + (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == LK_KERNPROC) + +#define lockmgr_xlocked_v(v) \ + (((v) & ~(LK_FLAGMASK & ~LK_SHARE)) == (uintptr_t)curthread) + +#define lockmgr_xlocked(lk) lockmgr_xlocked_v((lk)->lk_lock) + +static void assert_lockmgr(const struct lock_object *lock, int how); +#ifdef DDB +static void db_show_lockmgr(const struct lock_object *lock); +#endif +static void lock_lockmgr(struct lock_object *lock, uintptr_t how); +#ifdef KDTRACE_HOOKS +static int owner_lockmgr(const struct lock_object *lock, + struct thread **owner); +#endif +static uintptr_t unlock_lockmgr(struct lock_object *lock); + +struct lock_class lock_class_lockmgr = { + .lc_name = "lockmgr", + .lc_flags = LC_RECURSABLE | LC_SLEEPABLE | LC_SLEEPLOCK | LC_UPGRADABLE, + .lc_assert = assert_lockmgr, +#ifdef DDB + .lc_ddb_show = db_show_lockmgr, +#endif + .lc_lock = lock_lockmgr, + .lc_unlock = unlock_lockmgr, +#ifdef KDTRACE_HOOKS + .lc_owner = owner_lockmgr, +#endif +}; + +struct lockmgr_wait { + const char *iwmesg; + int ipri; + int itimo; +}; + +static bool __always_inline lockmgr_slock_try(struct lock *lk, uintptr_t *xp, + int flags, bool fp); +static bool __always_inline lockmgr_sunlock_try(struct lock *lk, uintptr_t *xp); + +static void +lockmgr_exit(u_int flags, struct lock_object *ilk, int wakeup_swapper) +{ + struct lock_class *class; + + if (flags & LK_INTERLOCK) { + class = LOCK_CLASS(ilk); + class->lc_unlock(ilk); + } + + if (__predict_false(wakeup_swapper)) + kick_proc0(); +} + +static void +lockmgr_note_shared_acquire(struct lock *lk, int contested, + uint64_t waittime, const char *file, int line, int flags) +{ + + LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(lockmgr__acquire, lk, contested, + waittime, file, line, LOCKSTAT_READER); + LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, file, line); + WITNESS_LOCK(&lk->lock_object, LK_TRYWIT(flags), file, line); + TD_LOCKS_INC(curthread); + TD_SLOCKS_INC(curthread); + STACK_SAVE(lk); +} + +static void +lockmgr_note_shared_release(struct lock *lk, const char *file, int line) +{ + + LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk, LOCKSTAT_READER); + WITNESS_UNLOCK(&lk->lock_object, 0, file, line); + LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, file, line); + TD_LOCKS_DEC(curthread); + TD_SLOCKS_DEC(curthread); +} + +static void +lockmgr_note_exclusive_acquire(struct lock *lk, int contested, + uint64_t waittime, const char *file, int line, int flags) +{ + + LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(lockmgr__acquire, lk, contested, + waittime, file, line, LOCKSTAT_WRITER); + LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); + WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, + line); + TD_LOCKS_INC(curthread); + STACK_SAVE(lk); +} + +static void +lockmgr_note_exclusive_release(struct lock *lk, const char *file, int line) +{ + + LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk, LOCKSTAT_WRITER); + LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file, + line); + WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); + TD_LOCKS_DEC(curthread); +} + +static __inline struct thread * +lockmgr_xholder(const struct lock *lk) +{ + uintptr_t x; + + x = lk->lk_lock; + return ((x & LK_SHARE) ? NULL : (struct thread *)LK_HOLDER(x)); +} + +/* + * It assumes sleepq_lock held and returns with this one unheld. 
+ * It also assumes the generic interlock is sane and previously checked. + * If LK_INTERLOCK is specified the interlock is not reacquired after the + * sleep. + */ +static __inline int +sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk, + const char *wmesg, int pri, int timo, int queue) +{ + GIANT_DECLARE; + struct lock_class *class; + int catch, error; + + class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL; + catch = pri & PCATCH; + pri &= PRIMASK; + error = 0; + + LOCK_LOG3(lk, "%s: %p blocking on the %s sleepqueue", __func__, lk, + (queue == SQ_EXCLUSIVE_QUEUE) ? "exclusive" : "shared"); + + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0) + lk->lk_exslpfail++; + GIANT_SAVE(); + sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ? + SLEEPQ_INTERRUPTIBLE : 0), queue); + if ((flags & LK_TIMELOCK) && timo) + sleepq_set_timeout(&lk->lock_object, timo); + + /* + * Decisional switch for real sleeping. + */ + if ((flags & LK_TIMELOCK) && timo && catch) + error = sleepq_timedwait_sig(&lk->lock_object, pri); + else if ((flags & LK_TIMELOCK) && timo) + error = sleepq_timedwait(&lk->lock_object, pri); + else if (catch) + error = sleepq_wait_sig(&lk->lock_object, pri); + else + sleepq_wait(&lk->lock_object, pri); + GIANT_RESTORE(); + if ((flags & LK_SLEEPFAIL) && error == 0) + error = ENOLCK; + + return (error); +} + +static __inline int +wakeupshlk(struct lock *lk, const char *file, int line) +{ + uintptr_t v, x, orig_x; + u_int realexslp; + int queue, wakeup_swapper; + + wakeup_swapper = 0; + for (;;) { + x = lk->lk_lock; + if (lockmgr_sunlock_try(lk, &x)) + break; + + /* + * We should have a sharer with waiters, so enter the hard + * path in order to handle wakeups correctly. + */ + sleepq_lock(&lk->lock_object); + orig_x = lk->lk_lock; +retry_sleepq: + x = orig_x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); + v = LK_UNLOCKED; + + /* + * If the lock has exclusive waiters, give them preference in + * order to avoid deadlock with shared runners up. + * If interruptible sleeps left the exclusive queue empty + * avoid a starvation for the threads sleeping on the shared + * queue by giving them precedence and cleaning up the + * exclusive waiters bit anyway. + * Please note that lk_exslpfail count may be lying about + * the real number of waiters with the LK_SLEEPFAIL flag on + * because they may be used in conjunction with interruptible + * sleeps so lk_exslpfail might be considered an 'upper limit' + * bound, including the edge cases. + */ + realexslp = sleepq_sleepcnt(&lk->lock_object, + SQ_EXCLUSIVE_QUEUE); + if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { + if (lk->lk_exslpfail < realexslp) { + lk->lk_exslpfail = 0; + queue = SQ_EXCLUSIVE_QUEUE; + v |= (x & LK_SHARED_WAITERS); + } else { + lk->lk_exslpfail = 0; + LOCK_LOG2(lk, + "%s: %p has only LK_SLEEPFAIL sleepers", + __func__, lk); + LOCK_LOG2(lk, + "%s: %p waking up threads on the exclusive queue", + __func__, lk); + wakeup_swapper = + sleepq_broadcast(&lk->lock_object, + SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); + queue = SQ_SHARED_QUEUE; + } + + } else { + + /* + * Exclusive waiters sleeping with LK_SLEEPFAIL on + * and using interruptible sleeps/timeout may have + * left spourious lk_exslpfail counts on, so clean + * it up anyway. 
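+ * (A LK_SLEEPFAIL sleeper that is interrupted or times out does not
+ * clear its own lk_exslpfail increment, which is why the counter can
+ * be stale; with the exclusive queue known to be empty it is safe to
+ * simply zero it here.)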
+ */ + lk->lk_exslpfail = 0; + queue = SQ_SHARED_QUEUE; + } + + if (lockmgr_sunlock_try(lk, &orig_x)) { + sleepq_release(&lk->lock_object); + break; + } + + x |= LK_SHARERS_LOCK(1); + if (!atomic_fcmpset_rel_ptr(&lk->lk_lock, &x, v)) { + orig_x = x; + goto retry_sleepq; + } + LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", + __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : + "exclusive"); + wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, + 0, queue); + sleepq_release(&lk->lock_object); + break; + } + + lockmgr_note_shared_release(lk, file, line); + return (wakeup_swapper); +} + +static void +assert_lockmgr(const struct lock_object *lock, int what) +{ + + panic("lockmgr locks do not support assertions"); +} + +static void +lock_lockmgr(struct lock_object *lock, uintptr_t how) +{ + + panic("lockmgr locks do not support sleep interlocking"); +} + +static uintptr_t +unlock_lockmgr(struct lock_object *lock) +{ + + panic("lockmgr locks do not support sleep interlocking"); +} + +#ifdef KDTRACE_HOOKS +static int +owner_lockmgr(const struct lock_object *lock, struct thread **owner) +{ + + panic("lockmgr locks do not support owner inquiring"); +} +#endif + +void +lockinit(struct lock *lk, int pri, const char *wmesg, int timo, int flags) +{ + int iflags; + + MPASS((flags & ~LK_INIT_MASK) == 0); + ASSERT_ATOMIC_LOAD_PTR(lk->lk_lock, + ("%s: lockmgr not aligned for %s: %p", __func__, wmesg, + &lk->lk_lock)); + + iflags = LO_SLEEPABLE | LO_UPGRADABLE; + if (flags & LK_CANRECURSE) + iflags |= LO_RECURSABLE; + if ((flags & LK_NODUP) == 0) + iflags |= LO_DUPOK; + if (flags & LK_NOPROFILE) + iflags |= LO_NOPROFILE; + if ((flags & LK_NOWITNESS) == 0) + iflags |= LO_WITNESS; + if (flags & LK_QUIET) + iflags |= LO_QUIET; + if (flags & LK_IS_VNODE) + iflags |= LO_IS_VNODE; + iflags |= flags & (LK_ADAPTIVE | LK_NOSHARE); + + lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL, iflags); + lk->lk_lock = LK_UNLOCKED; + lk->lk_recurse = 0; + lk->lk_exslpfail = 0; + lk->lk_timo = timo; + lk->lk_pri = pri; + STACK_ZERO(lk); +} + +/* + * XXX: Gross hacks to manipulate external lock flags after + * initialization. Used for certain vnode and buf locks. + */ +void +lockallowshare(struct lock *lk) +{ + + lockmgr_assert(lk, KA_XLOCKED); + lk->lock_object.lo_flags &= ~LK_NOSHARE; +} + +void +lockdisableshare(struct lock *lk) +{ + + lockmgr_assert(lk, KA_XLOCKED); + lk->lock_object.lo_flags |= LK_NOSHARE; +} + +void +lockallowrecurse(struct lock *lk) +{ + + lockmgr_assert(lk, KA_XLOCKED); + lk->lock_object.lo_flags |= LO_RECURSABLE; +} + +void +lockdisablerecurse(struct lock *lk) +{ + + lockmgr_assert(lk, KA_XLOCKED); + lk->lock_object.lo_flags &= ~LO_RECURSABLE; +} + +void +lockdestroy(struct lock *lk) +{ + + KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held")); + KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed")); + KASSERT(lk->lk_exslpfail == 0, ("lockmgr still exclusive waiters")); + lock_destroy(&lk->lock_object); +} + +static bool __always_inline +lockmgr_slock_try(struct lock *lk, uintptr_t *xp, int flags, bool fp) +{ + + /* + * If no other thread has an exclusive lock, or + * no exclusive waiter is present, bump the count of + * sharers. Since we have to preserve the state of + * waiters, if we fail to acquire the shared lock + * loop back and retry. 
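+ * Note that atomic_fcmpset_acq_ptr() reloads *xp with the current
+ * lock word when it fails, so every iteration re-evaluates
+ * LK_CAN_SHARE() against fresh state.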
+ */ + *xp = lk->lk_lock; + while (LK_CAN_SHARE(*xp, flags, fp)) { + if (atomic_fcmpset_acq_ptr(&lk->lk_lock, xp, + *xp + LK_ONE_SHARER)) { + return (true); + } + } + return (false); +} + +static bool __always_inline +lockmgr_sunlock_try(struct lock *lk, uintptr_t *xp) +{ + + for (;;) { + if (LK_SHARERS(*xp) > 1 || !(*xp & LK_ALL_WAITERS)) { + if (atomic_fcmpset_rel_ptr(&lk->lk_lock, xp, + *xp - LK_ONE_SHARER)) + return (true); + continue; + } + break; + } + return (false); +} + +static __noinline int +lockmgr_slock_hard(struct lock *lk, u_int flags, struct lock_object *ilk, + const char *file, int line, struct lockmgr_wait *lwa) +{ + uintptr_t tid, x; + int error = 0; + const char *iwmesg; + int ipri, itimo; + +#ifdef KDTRACE_HOOKS + uint64_t sleep_time = 0; +#endif +#ifdef LOCK_PROFILING + uint64_t waittime = 0; + int contested = 0; +#endif + + if (__predict_false(panicstr != NULL)) + goto out; + + tid = (uintptr_t)curthread; + + if (LK_CAN_WITNESS(flags)) + WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, + file, line, flags & LK_INTERLOCK ? ilk : NULL); + for (;;) { + if (lockmgr_slock_try(lk, &x, flags, false)) + break; +#ifdef HWPMC_HOOKS + PMC_SOFT_CALL( , , lock, failed); +#endif + lock_profile_obtain_lock_failed(&lk->lock_object, + &contested, &waittime); + + /* + * If the lock is already held by curthread in + * exclusive way avoid a deadlock. + */ + if (LK_HOLDER(x) == tid) { + LOCK_LOG2(lk, + "%s: %p already held in exclusive mode", + __func__, lk); + error = EDEADLK; + break; + } + + /* + * If the lock is expected to not sleep just give up + * and return. + */ + if (LK_TRYOP(flags)) { + LOCK_LOG2(lk, "%s: %p fails the try operation", + __func__, lk); + error = EBUSY; + break; + } + + /* + * Acquire the sleepqueue chain lock because we + * probabilly will need to manipulate waiters flags. + */ + sleepq_lock(&lk->lock_object); + x = lk->lk_lock; +retry_sleepq: + + /* + * if the lock can be acquired in shared mode, try + * again. + */ + if (LK_CAN_SHARE(x, flags, false)) { + sleepq_release(&lk->lock_object); + continue; + } + + /* + * Try to set the LK_SHARED_WAITERS flag. If we fail, + * loop back and retry. + */ + if ((x & LK_SHARED_WAITERS) == 0) { + if (!atomic_fcmpset_acq_ptr(&lk->lk_lock, &x, + x | LK_SHARED_WAITERS)) { + goto retry_sleepq; + } + LOCK_LOG2(lk, "%s: %p set shared waiters flag", + __func__, lk); + } + + if (lwa == NULL) { + iwmesg = lk->lock_object.lo_name; + ipri = lk->lk_pri; + itimo = lk->lk_timo; + } else { + iwmesg = lwa->iwmesg; + ipri = lwa->ipri; + itimo = lwa->itimo; + } + + /* + * As far as we have been unable to acquire the + * shared lock and the shared waiters flag is set, + * we will sleep. + */ +#ifdef KDTRACE_HOOKS + sleep_time -= lockstat_nsecs(&lk->lock_object); +#endif + error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, + SQ_SHARED_QUEUE); +#ifdef KDTRACE_HOOKS + sleep_time += lockstat_nsecs(&lk->lock_object); +#endif + flags &= ~LK_INTERLOCK; + if (error) { + LOCK_LOG3(lk, + "%s: interrupted sleep for %p with %d", + __func__, lk, error); + break; + } + LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", + __func__, lk); + } + if (error == 0) { +#ifdef KDTRACE_HOOKS + if (sleep_time != 0) + LOCKSTAT_RECORD4(lockmgr__block, lk, sleep_time, + LOCKSTAT_READER, (x & LK_SHARE) == 0, + (x & LK_SHARE) == 0 ? 
0 : LK_SHARERS(x)); +#endif +#ifdef LOCK_PROFILING + lockmgr_note_shared_acquire(lk, contested, waittime, + file, line, flags); +#else + lockmgr_note_shared_acquire(lk, 0, 0, file, line, + flags); +#endif + } + +out: + lockmgr_exit(flags, ilk, 0); + return (error); +} + +static __noinline int +lockmgr_xlock_hard(struct lock *lk, u_int flags, struct lock_object *ilk, + const char *file, int line, struct lockmgr_wait *lwa) +{ + struct lock_class *class; + uintptr_t tid, x, v; + int error = 0; + const char *iwmesg; + int ipri, itimo; + +#ifdef KDTRACE_HOOKS + uint64_t sleep_time = 0; +#endif +#ifdef LOCK_PROFILING + uint64_t waittime = 0; + int contested = 0; +#endif + + if (__predict_false(panicstr != NULL)) + goto out; + + tid = (uintptr_t)curthread; + + if (LK_CAN_WITNESS(flags)) + WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | + LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? + ilk : NULL); + + /* + * If curthread already holds the lock and this one is + * allowed to recurse, simply recurse on it. + */ + if (lockmgr_xlocked(lk)) { + if ((flags & LK_CANRECURSE) == 0 && + (lk->lock_object.lo_flags & LO_RECURSABLE) == 0) { + /* + * If the lock is expected to not panic just + * give up and return. + */ + if (LK_TRYOP(flags)) { + LOCK_LOG2(lk, + "%s: %p fails the try operation", + __func__, lk); + error = EBUSY; + goto out; + } + if (flags & LK_INTERLOCK) { + class = LOCK_CLASS(ilk); + class->lc_unlock(ilk); + } + panic("%s: recursing on non recursive lockmgr %p " + "@ %s:%d\n", __func__, lk, file, line); + } + lk->lk_recurse++; + LOCK_LOG2(lk, "%s: %p recursing", __func__, lk); + LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, + lk->lk_recurse, file, line); + WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | + LK_TRYWIT(flags), file, line); + TD_LOCKS_INC(curthread); + goto out; + } + + for (;;) { + if (lk->lk_lock == LK_UNLOCKED && + atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) + break; +#ifdef HWPMC_HOOKS + PMC_SOFT_CALL( , , lock, failed); +#endif + lock_profile_obtain_lock_failed(&lk->lock_object, + &contested, &waittime); + + /* + * If the lock is expected to not sleep just give up + * and return. + */ + if (LK_TRYOP(flags)) { + LOCK_LOG2(lk, "%s: %p fails the try operation", + __func__, lk); + error = EBUSY; + break; + } + + /* + * Acquire the sleepqueue chain lock because we + * probabilly will need to manipulate waiters flags. + */ + sleepq_lock(&lk->lock_object); + x = lk->lk_lock; +retry_sleepq: + + /* + * if the lock has been released while we spun on + * the sleepqueue chain lock just try again. + */ + if (x == LK_UNLOCKED) { + sleepq_release(&lk->lock_object); + continue; + } + + /* + * The lock can be in the state where there is a + * pending queue of waiters, but still no owner. + * This happens when the lock is contested and an + * owner is going to claim the lock. + * If curthread is the one successfully acquiring it + * claim lock ownership and return, preserving waiters + * flags. + */ + v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); + if ((x & ~v) == LK_UNLOCKED) { + v &= ~LK_EXCLUSIVE_SPINNERS; + if (atomic_fcmpset_acq_ptr(&lk->lk_lock, &x, + tid | v)) { + sleepq_release(&lk->lock_object); + LOCK_LOG2(lk, + "%s: %p claimed by a new writer", + __func__, lk); + break; + } + goto retry_sleepq; + } + + /* + * Try to set the LK_EXCLUSIVE_WAITERS flag. If we + * fail, loop back and retry. 
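+ * Setting the flag atomically while the sleepqueue chain lock is
+ * held ensures that a concurrent release either observes the flag or
+ * causes the fcmpset to fail, so the wakeup cannot be lost.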
+ */ + if ((x & LK_EXCLUSIVE_WAITERS) == 0) { + if (!atomic_fcmpset_ptr(&lk->lk_lock, &x, + x | LK_EXCLUSIVE_WAITERS)) { + goto retry_sleepq; + } + LOCK_LOG2(lk, "%s: %p set excl waiters flag", + __func__, lk); + } + + if (lwa == NULL) { + iwmesg = lk->lock_object.lo_name; + ipri = lk->lk_pri; + itimo = lk->lk_timo; + } else { + iwmesg = lwa->iwmesg; + ipri = lwa->ipri; + itimo = lwa->itimo; + } + + /* + * As far as we have been unable to acquire the + * exclusive lock and the exclusive waiters flag + * is set, we will sleep. + */ +#ifdef KDTRACE_HOOKS + sleep_time -= lockstat_nsecs(&lk->lock_object); +#endif + error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, + SQ_EXCLUSIVE_QUEUE); +#ifdef KDTRACE_HOOKS + sleep_time += lockstat_nsecs(&lk->lock_object); +#endif + flags &= ~LK_INTERLOCK; + if (error) { + LOCK_LOG3(lk, + "%s: interrupted sleep for %p with %d", + __func__, lk, error); + break; + } + LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", + __func__, lk); + } + if (error == 0) { +#ifdef KDTRACE_HOOKS + if (sleep_time != 0) + LOCKSTAT_RECORD4(lockmgr__block, lk, sleep_time, + LOCKSTAT_WRITER, (x & LK_SHARE) == 0, + (x & LK_SHARE) == 0 ? 0 : LK_SHARERS(x)); +#endif +#ifdef LOCK_PROFILING + lockmgr_note_exclusive_acquire(lk, contested, waittime, + file, line, flags); +#else + lockmgr_note_exclusive_acquire(lk, 0, 0, file, line, + flags); +#endif + } + +out: + lockmgr_exit(flags, ilk, 0); + return (error); +} + +static __noinline int +lockmgr_upgrade(struct lock *lk, u_int flags, struct lock_object *ilk, + const char *file, int line, struct lockmgr_wait *lwa) +{ + uintptr_t tid, x, v; + int error = 0; + int wakeup_swapper = 0; + int op; + + if (__predict_false(panicstr != NULL)) + goto out; + + tid = (uintptr_t)curthread; + + _lockmgr_assert(lk, KA_SLOCKED, file, line); + v = lk->lk_lock; + x = v & LK_ALL_WAITERS; + v &= LK_EXCLUSIVE_SPINNERS; + + /* + * Try to switch from one shared lock to an exclusive one. + * We need to preserve waiters flags during the operation. + */ + if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v, + tid | x)) { + LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file, + line); + WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE | + LK_TRYWIT(flags), file, line); + LOCKSTAT_RECORD0(lockmgr__upgrade, lk); + TD_SLOCKS_DEC(curthread); + goto out; + } + + op = flags & LK_TYPE_MASK; + + /* + * In LK_TRYUPGRADE mode, do not drop the lock, + * returning EBUSY instead. + */ + if (op == LK_TRYUPGRADE) { + LOCK_LOG2(lk, "%s: %p failed the nowait upgrade", + __func__, lk); + error = EBUSY; + goto out; + } + + /* + * We have been unable to succeed in upgrading, so just + * give up the shared lock. + */ + wakeup_swapper |= wakeupshlk(lk, file, line); + error = lockmgr_xlock_hard(lk, flags, ilk, file, line, lwa); + flags &= ~LK_INTERLOCK; +out: + lockmgr_exit(flags, ilk, wakeup_swapper); + return (error); +} + +int +lockmgr_lock_fast_path(struct lock *lk, u_int flags, struct lock_object *ilk, + const char *file, int line) +{ + struct lock_class *class; + uintptr_t x, tid; + u_int op; + bool locked; + + if (__predict_false(panicstr != NULL)) + return (0); + + op = flags & LK_TYPE_MASK; + locked = false; + switch (op) { + case LK_SHARED: + if (LK_CAN_WITNESS(flags)) + WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, + file, line, flags & LK_INTERLOCK ? 
ilk : NULL); + if (__predict_false(lk->lock_object.lo_flags & LK_NOSHARE)) + break; + if (lockmgr_slock_try(lk, &x, flags, true)) { + lockmgr_note_shared_acquire(lk, 0, 0, + file, line, flags); + locked = true; + } else { + return (lockmgr_slock_hard(lk, flags, ilk, file, line, + NULL)); + } + break; + case LK_EXCLUSIVE: + if (LK_CAN_WITNESS(flags)) + WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | + LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? + ilk : NULL); + tid = (uintptr_t)curthread; + if (lk->lk_lock == LK_UNLOCKED && + atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) { + lockmgr_note_exclusive_acquire(lk, 0, 0, file, line, + flags); + locked = true; + } else { + return (lockmgr_xlock_hard(lk, flags, ilk, file, line, + NULL)); + } + break; + case LK_UPGRADE: + case LK_TRYUPGRADE: + return (lockmgr_upgrade(lk, flags, ilk, file, line, NULL)); + default: + break; + } + if (__predict_true(locked)) { + if (__predict_false(flags & LK_INTERLOCK)) { + class = LOCK_CLASS(ilk); + class->lc_unlock(ilk); + } + return (0); + } else { + return (__lockmgr_args(lk, flags, ilk, LK_WMESG_DEFAULT, + LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, file, line)); + } +} + +static __noinline int +lockmgr_sunlock_hard(struct lock *lk, uintptr_t x, u_int flags, struct lock_object *ilk, + const char *file, int line) + +{ + int wakeup_swapper = 0; + + if (__predict_false(panicstr != NULL)) + goto out; + + wakeup_swapper = wakeupshlk(lk, file, line); + +out: + lockmgr_exit(flags, ilk, wakeup_swapper); + return (0); +} + +static __noinline int +lockmgr_xunlock_hard(struct lock *lk, uintptr_t x, u_int flags, struct lock_object *ilk, + const char *file, int line) +{ + uintptr_t tid, v; + int wakeup_swapper = 0; + u_int realexslp; + int queue; + + if (__predict_false(panicstr != NULL)) + goto out; + + tid = (uintptr_t)curthread; + + /* + * As first option, treact the lock as if it has not + * any waiter. + * Fix-up the tid var if the lock has been disowned. + */ + if (LK_HOLDER(x) == LK_KERNPROC) + tid = LK_KERNPROC; + else { + WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); + TD_LOCKS_DEC(curthread); + } + LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); + + /* + * The lock is held in exclusive mode. + * If the lock is recursed also, then unrecurse it. + */ + if (lockmgr_xlocked_v(x) && lockmgr_recursed(lk)) { + LOCK_LOG2(lk, "%s: %p unrecursing", __func__, lk); + lk->lk_recurse--; + goto out; + } + if (tid != LK_KERNPROC) + LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk, + LOCKSTAT_WRITER); + + if (x == tid && atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) + goto out; + + sleepq_lock(&lk->lock_object); + x = lk->lk_lock; + v = LK_UNLOCKED; + + /* + * If the lock has exclusive waiters, give them + * preference in order to avoid deadlock with + * shared runners up. + * If interruptible sleeps left the exclusive queue + * empty avoid a starvation for the threads sleeping + * on the shared queue by giving them precedence + * and cleaning up the exclusive waiters bit anyway. + * Please note that lk_exslpfail count may be lying + * about the real number of waiters with the + * LK_SLEEPFAIL flag on because they may be used in + * conjunction with interruptible sleeps so + * lk_exslpfail might be considered an 'upper limit' + * bound, including the edge cases. 
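+ * In short: wake the exclusive queue only when it holds at least one
+ * sleeper that is not bound to fail with ENOLCK; otherwise flush the
+ * LK_SLEEPFAIL sleepers and hand the lock over to the shared queue.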
+ */ + MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); + realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); + if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { + if (lk->lk_exslpfail < realexslp) { + lk->lk_exslpfail = 0; + queue = SQ_EXCLUSIVE_QUEUE; + v |= (x & LK_SHARED_WAITERS); + } else { + lk->lk_exslpfail = 0; + LOCK_LOG2(lk, + "%s: %p has only LK_SLEEPFAIL sleepers", + __func__, lk); + LOCK_LOG2(lk, + "%s: %p waking up threads on the exclusive queue", + __func__, lk); + wakeup_swapper = sleepq_broadcast(&lk->lock_object, + SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); + queue = SQ_SHARED_QUEUE; + } + } else { + + /* + * Exclusive waiters sleeping with LK_SLEEPFAIL + * on and using interruptible sleeps/timeout + * may have left spourious lk_exslpfail counts + * on, so clean it up anyway. + */ + lk->lk_exslpfail = 0; + queue = SQ_SHARED_QUEUE; + } + + LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", + __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : + "exclusive"); + atomic_store_rel_ptr(&lk->lk_lock, v); + wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); + sleepq_release(&lk->lock_object); + +out: + lockmgr_exit(flags, ilk, wakeup_swapper); + return (0); +} + +int +lockmgr_unlock_fast_path(struct lock *lk, u_int flags, struct lock_object *ilk) +{ + struct lock_class *class; + uintptr_t x, tid; + const char *file; + int line; + + if (__predict_false(panicstr != NULL)) + return (0); + + file = __FILE__; + line = __LINE__; + + _lockmgr_assert(lk, KA_LOCKED, file, line); + x = lk->lk_lock; + if (__predict_true(x & LK_SHARE) != 0) { + if (lockmgr_sunlock_try(lk, &x)) { + lockmgr_note_shared_release(lk, file, line); + } else { + return (lockmgr_sunlock_hard(lk, x, flags, ilk, file, line)); + } + } else { + tid = (uintptr_t)curthread; + if (!lockmgr_recursed(lk) && + atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) { + lockmgr_note_exclusive_release(lk, file, line); + } else { + return (lockmgr_xunlock_hard(lk, x, flags, ilk, file, line)); + } + } + if (__predict_false(flags & LK_INTERLOCK)) { + class = LOCK_CLASS(ilk); + class->lc_unlock(ilk); + } + return (0); +} + +int +__lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk, + const char *wmesg, int pri, int timo, const char *file, int line) +{ + GIANT_DECLARE; + struct lockmgr_wait lwa; + struct lock_class *class; + const char *iwmesg; + uintptr_t tid, v, x; + u_int op, realexslp; + int error, ipri, itimo, queue, wakeup_swapper; +#ifdef LOCK_PROFILING + uint64_t waittime = 0; + int contested = 0; +#endif + + if (panicstr != NULL) + return (0); + + error = 0; + tid = (uintptr_t)curthread; + op = (flags & LK_TYPE_MASK); + iwmesg = (wmesg == LK_WMESG_DEFAULT) ? lk->lock_object.lo_name : wmesg; + ipri = (pri == LK_PRIO_DEFAULT) ? lk->lk_pri : pri; + itimo = (timo == LK_TIMO_DEFAULT) ? 
lk->lk_timo : timo; + + lwa.iwmesg = iwmesg; + lwa.ipri = ipri; + lwa.itimo = itimo; + + MPASS((flags & ~LK_TOTAL_MASK) == 0); + KASSERT((op & (op - 1)) == 0, + ("%s: Invalid requested operation @ %s:%d", __func__, file, line)); + KASSERT((flags & (LK_NOWAIT | LK_SLEEPFAIL)) == 0 || + (op != LK_DOWNGRADE && op != LK_RELEASE), + ("%s: Invalid flags in regard of the operation desired @ %s:%d", + __func__, file, line)); + KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL, + ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d", + __func__, file, line)); + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), + ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread, + lk->lock_object.lo_name, file, line)); + + class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL; + + if (lk->lock_object.lo_flags & LK_NOSHARE) { + switch (op) { + case LK_SHARED: + op = LK_EXCLUSIVE; + break; + case LK_UPGRADE: + case LK_TRYUPGRADE: + case LK_DOWNGRADE: + _lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED, + file, line); + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + return (0); + } + } + + wakeup_swapper = 0; + switch (op) { + case LK_SHARED: + return (lockmgr_slock_hard(lk, flags, ilk, file, line, &lwa)); + break; + case LK_UPGRADE: + case LK_TRYUPGRADE: + return (lockmgr_upgrade(lk, flags, ilk, file, line, &lwa)); + break; + case LK_EXCLUSIVE: + return (lockmgr_xlock_hard(lk, flags, ilk, file, line, &lwa)); + break; + case LK_DOWNGRADE: + _lockmgr_assert(lk, KA_XLOCKED, file, line); + WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line); + + /* + * Panic if the lock is recursed. + */ + if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) { + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n", + __func__, iwmesg, file, line); + } + TD_SLOCKS_INC(curthread); + + /* + * In order to preserve waiters flags, just spin. + */ + for (;;) { + x = lk->lk_lock; + MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); + x &= LK_ALL_WAITERS; + if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, + LK_SHARERS_LOCK(1) | x)) + break; + cpu_spinwait(); + } + LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line); + LOCKSTAT_RECORD0(lockmgr__downgrade, lk); + break; + case LK_RELEASE: + _lockmgr_assert(lk, KA_LOCKED, file, line); + x = lk->lk_lock; + + if (__predict_true(x & LK_SHARE) != 0) { + return (lockmgr_sunlock_hard(lk, x, flags, ilk, file, line)); + } else { + return (lockmgr_xunlock_hard(lk, x, flags, ilk, file, line)); + } + break; + case LK_DRAIN: + if (LK_CAN_WITNESS(flags)) + WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | + LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? + ilk : NULL); + + /* + * Trying to drain a lock we already own will result in a + * deadlock. + */ + if (lockmgr_xlocked(lk)) { + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + panic("%s: draining %s with the lock held @ %s:%d\n", + __func__, iwmesg, file, line); + } + + for (;;) { + if (lk->lk_lock == LK_UNLOCKED && + atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) + break; + +#ifdef HWPMC_HOOKS + PMC_SOFT_CALL( , , lock, failed); +#endif + lock_profile_obtain_lock_failed(&lk->lock_object, + &contested, &waittime); + + /* + * If the lock is expected to not sleep just give up + * and return. + */ + if (LK_TRYOP(flags)) { + LOCK_LOG2(lk, "%s: %p fails the try operation", + __func__, lk); + error = EBUSY; + break; + } + + /* + * Acquire the sleepqueue chain lock because we + * probabilly will need to manipulate waiters flags. 
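+ * Holding the chain lock while re-reading lk_lock below also closes
+ * the race with a concurrent release, which takes this same lock
+ * before broadcasting any wakeup.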
+ */ + sleepq_lock(&lk->lock_object); + x = lk->lk_lock; + + /* + * if the lock has been released while we spun on + * the sleepqueue chain lock just try again. + */ + if (x == LK_UNLOCKED) { + sleepq_release(&lk->lock_object); + continue; + } + + v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); + if ((x & ~v) == LK_UNLOCKED) { + v = (x & ~LK_EXCLUSIVE_SPINNERS); + + /* + * If interruptible sleeps left the exclusive + * queue empty avoid a starvation for the + * threads sleeping on the shared queue by + * giving them precedence and cleaning up the + * exclusive waiters bit anyway. + * Please note that lk_exslpfail count may be + * lying about the real number of waiters with + * the LK_SLEEPFAIL flag on because they may + * be used in conjunction with interruptible + * sleeps so lk_exslpfail might be considered + * an 'upper limit' bound, including the edge + * cases. + */ + if (v & LK_EXCLUSIVE_WAITERS) { + queue = SQ_EXCLUSIVE_QUEUE; + v &= ~LK_EXCLUSIVE_WAITERS; + } else { + + /* + * Exclusive waiters sleeping with + * LK_SLEEPFAIL on and using + * interruptible sleeps/timeout may + * have left spourious lk_exslpfail + * counts on, so clean it up anyway. + */ + MPASS(v & LK_SHARED_WAITERS); + lk->lk_exslpfail = 0; + queue = SQ_SHARED_QUEUE; + v &= ~LK_SHARED_WAITERS; + } + if (queue == SQ_EXCLUSIVE_QUEUE) { + realexslp = + sleepq_sleepcnt(&lk->lock_object, + SQ_EXCLUSIVE_QUEUE); + if (lk->lk_exslpfail >= realexslp) { + lk->lk_exslpfail = 0; + queue = SQ_SHARED_QUEUE; + v &= ~LK_SHARED_WAITERS; + if (realexslp != 0) { + LOCK_LOG2(lk, + "%s: %p has only LK_SLEEPFAIL sleepers", + __func__, lk); + LOCK_LOG2(lk, + "%s: %p waking up threads on the exclusive queue", + __func__, lk); + wakeup_swapper = + sleepq_broadcast( + &lk->lock_object, + SLEEPQ_LK, 0, + SQ_EXCLUSIVE_QUEUE); + } + } else + lk->lk_exslpfail = 0; + } + if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) { + sleepq_release(&lk->lock_object); + continue; + } + LOCK_LOG3(lk, + "%s: %p waking up all threads on the %s queue", + __func__, lk, queue == SQ_SHARED_QUEUE ? + "shared" : "exclusive"); + wakeup_swapper |= sleepq_broadcast( + &lk->lock_object, SLEEPQ_LK, 0, queue); + + /* + * If shared waiters have been woken up we need + * to wait for one of them to acquire the lock + * before to set the exclusive waiters in + * order to avoid a deadlock. + */ + if (queue == SQ_SHARED_QUEUE) { + for (v = lk->lk_lock; + (v & LK_SHARE) && !LK_SHARERS(v); + v = lk->lk_lock) + cpu_spinwait(); + } + } + + /* + * Try to set the LK_EXCLUSIVE_WAITERS flag. If we + * fail, loop back and retry. + */ + if ((x & LK_EXCLUSIVE_WAITERS) == 0) { + if (!atomic_cmpset_ptr(&lk->lk_lock, x, + x | LK_EXCLUSIVE_WAITERS)) { + sleepq_release(&lk->lock_object); + continue; + } + LOCK_LOG2(lk, "%s: %p set drain waiters flag", + __func__, lk); + } + + /* + * As far as we have been unable to acquire the + * exclusive lock and the exclusive waiters flag + * is set, we will sleep. 
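+ * Unlike sleeplk(), a draining thread always sleeps uninterruptibly
+ * and without a timeout; only the priority bits of ipri are used.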
+ */ + if (flags & LK_INTERLOCK) { + class->lc_unlock(ilk); + flags &= ~LK_INTERLOCK; + } + GIANT_SAVE(); + sleepq_add(&lk->lock_object, NULL, iwmesg, SLEEPQ_LK, + SQ_EXCLUSIVE_QUEUE); + sleepq_wait(&lk->lock_object, ipri & PRIMASK); + GIANT_RESTORE(); + LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", + __func__, lk); + } + + if (error == 0) { + lock_profile_obtain_lock_success(&lk->lock_object, + contested, waittime, file, line); + LOCK_LOG_LOCK("DRAIN", &lk->lock_object, 0, + lk->lk_recurse, file, line); + WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | + LK_TRYWIT(flags), file, line); + TD_LOCKS_INC(curthread); + STACK_SAVE(lk); + } + break; + default: + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + panic("%s: unknown lockmgr request 0x%x\n", __func__, op); + } + + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + if (wakeup_swapper) + kick_proc0(); + + return (error); +} + +void +_lockmgr_disown(struct lock *lk, const char *file, int line) +{ + uintptr_t tid, x; + + if (SCHEDULER_STOPPED()) + return; + + tid = (uintptr_t)curthread; + _lockmgr_assert(lk, KA_XLOCKED, file, line); + + /* + * Panic if the lock is recursed. + */ + if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) + panic("%s: disown a recursed lockmgr @ %s:%d\n", + __func__, file, line); + + /* + * If the owner is already LK_KERNPROC just skip the whole operation. + */ + if (LK_HOLDER(lk->lk_lock) != tid) + return; + lock_profile_release_lock(&lk->lock_object); + LOCKSTAT_RECORD1(lockmgr__disown, lk, LOCKSTAT_WRITER); + LOCK_LOG_LOCK("XDISOWN", &lk->lock_object, 0, 0, file, line); + WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); + TD_LOCKS_DEC(curthread); + STACK_SAVE(lk); + + /* + * In order to preserve waiters flags, just spin. + */ + for (;;) { + x = lk->lk_lock; + MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); + x &= LK_ALL_WAITERS; + if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, + LK_KERNPROC | x)) + return; + cpu_spinwait(); + } +} + +void +lockmgr_printinfo(const struct lock *lk) +{ + struct thread *td; + uintptr_t x; + + if (lk->lk_lock == LK_UNLOCKED) + printf("lock type %s: UNLOCKED\n", lk->lock_object.lo_name); + else if (lk->lk_lock & LK_SHARE) + printf("lock type %s: SHARED (count %ju)\n", + lk->lock_object.lo_name, + (uintmax_t)LK_SHARERS(lk->lk_lock)); + else { + td = lockmgr_xholder(lk); + if (td == (struct thread *)LK_KERNPROC) + printf("lock type %s: EXCL by KERNPROC\n", + lk->lock_object.lo_name); + else + printf("lock type %s: EXCL by thread %p " + "(pid %d, %s, tid %d)\n", lk->lock_object.lo_name, + td, td->td_proc->p_pid, td->td_proc->p_comm, + td->td_tid); + } + + x = lk->lk_lock; + if (x & LK_EXCLUSIVE_WAITERS) + printf(" with exclusive waiters pending\n"); + if (x & LK_SHARED_WAITERS) + printf(" with shared waiters pending\n"); + if (x & LK_EXCLUSIVE_SPINNERS) + printf(" with exclusive spinners pending\n"); + + STACK_PRINT(lk); +} + +int +lockstatus(const struct lock *lk) +{ + uintptr_t v, x; + int ret; + + ret = LK_SHARED; + x = lk->lk_lock; + v = LK_HOLDER(x); + + if ((x & LK_SHARE) == 0) { + if (v == (uintptr_t)curthread || v == LK_KERNPROC) + ret = LK_EXCLUSIVE; + else + ret = LK_EXCLOTHER; + } else if (x == LK_UNLOCKED) + ret = 0; + + return (ret); +} + +#ifdef INVARIANT_SUPPORT + +FEATURE(invariant_support, + "Support for modules compiled with INVARIANTS option"); + +#ifndef INVARIANTS +#undef _lockmgr_assert +#endif + +void +_lockmgr_assert(const struct lock *lk, int what, const char *file, int line) +{ + int slocked = 0; + + if (panicstr != NULL) + return; + switch 
(what) { + case KA_SLOCKED: + case KA_SLOCKED | KA_NOTRECURSED: + case KA_SLOCKED | KA_RECURSED: + slocked = 1; + case KA_LOCKED: + case KA_LOCKED | KA_NOTRECURSED: + case KA_LOCKED | KA_RECURSED: +#ifdef WITNESS + + /* + * We cannot trust WITNESS if the lock is held in exclusive + * mode and a call to lockmgr_disown() happened. + * Workaround this skipping the check if the lock is held in + * exclusive mode even for the KA_LOCKED case. + */ + if (slocked || (lk->lk_lock & LK_SHARE)) { + witness_assert(&lk->lock_object, what, file, line); + break; + } +#endif + if (lk->lk_lock == LK_UNLOCKED || + ((lk->lk_lock & LK_SHARE) == 0 && (slocked || + (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk))))) + panic("Lock %s not %slocked @ %s:%d\n", + lk->lock_object.lo_name, slocked ? "share" : "", + file, line); + + if ((lk->lk_lock & LK_SHARE) == 0) { + if (lockmgr_recursed(lk)) { + if (what & KA_NOTRECURSED) + panic("Lock %s recursed @ %s:%d\n", + lk->lock_object.lo_name, file, + line); + } else if (what & KA_RECURSED) + panic("Lock %s not recursed @ %s:%d\n", + lk->lock_object.lo_name, file, line); + } + break; + case KA_XLOCKED: + case KA_XLOCKED | KA_NOTRECURSED: + case KA_XLOCKED | KA_RECURSED: + if (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk)) + panic("Lock %s not exclusively locked @ %s:%d\n", + lk->lock_object.lo_name, file, line); + if (lockmgr_recursed(lk)) { + if (what & KA_NOTRECURSED) + panic("Lock %s recursed @ %s:%d\n", + lk->lock_object.lo_name, file, line); + } else if (what & KA_RECURSED) + panic("Lock %s not recursed @ %s:%d\n", + lk->lock_object.lo_name, file, line); + break; + case KA_UNLOCKED: + if (lockmgr_xlocked(lk) || lockmgr_disowned(lk)) + panic("Lock %s exclusively locked @ %s:%d\n", + lk->lock_object.lo_name, file, line); + break; + default: + panic("Unknown lockmgr assertion: %d @ %s:%d\n", what, file, + line); + } +} +#endif + +#ifdef DDB +int +lockmgr_chain(struct thread *td, struct thread **ownerp) +{ + struct lock *lk; + + lk = td->td_wchan; + + if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr) + return (0); + db_printf("blocked on lockmgr %s", lk->lock_object.lo_name); + if (lk->lk_lock & LK_SHARE) + db_printf("SHARED (count %ju)\n", + (uintmax_t)LK_SHARERS(lk->lk_lock)); + else + db_printf("EXCL\n"); + *ownerp = lockmgr_xholder(lk); + + return (1); +} + +static void +db_show_lockmgr(const struct lock_object *lock) +{ + struct thread *td; + const struct lock *lk; + + lk = (const struct lock *)lock; + + db_printf(" state: "); + if (lk->lk_lock == LK_UNLOCKED) + db_printf("UNLOCKED\n"); + else if (lk->lk_lock & LK_SHARE) + db_printf("SLOCK: %ju\n", (uintmax_t)LK_SHARERS(lk->lk_lock)); + else { + td = lockmgr_xholder(lk); + if (td == (struct thread *)LK_KERNPROC) + db_printf("XLOCK: LK_KERNPROC\n"); + else + db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, + td->td_tid, td->td_proc->p_pid, + td->td_proc->p_comm); + if (lockmgr_recursed(lk)) + db_printf(" recursed: %d\n", lk->lk_recurse); + } + db_printf(" waiters: "); + switch (lk->lk_lock & LK_ALL_WAITERS) { + case LK_SHARED_WAITERS: + db_printf("shared\n"); + break; + case LK_EXCLUSIVE_WAITERS: + db_printf("exclusive\n"); + break; + case LK_ALL_WAITERS: + db_printf("shared and exclusive\n"); + break; + default: + db_printf("none\n"); + } + db_printf(" spinners: "); + if (lk->lk_lock & LK_EXCLUSIVE_SPINNERS) + db_printf("exclusive\n"); + else + db_printf("none\n"); +} +#endif diff --git a/freebsd/sys/kern/subr_pctrie.c b/freebsd/sys/kern/subr_pctrie.c new file mode 100644 index 00000000..c5f2c06e --- 
/dev/null +++ b/freebsd/sys/kern/subr_pctrie.c @@ -0,0 +1,695 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 EMC Corp. + * Copyright (c) 2011 Jeffrey Roberson + * Copyright (c) 2008 Mayur Shardul + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* + * Path-compressed radix trie implementation. + * + * The implementation takes into account the following rationale: + * - Size of the nodes should be as small as possible but still big enough + * to avoid a large maximum depth for the trie. This is a balance + * between the necessity to not wire too much physical memory for the nodes + * and the necessity to avoid too much cache pollution during the trie + * operations. + * - There is not a huge bias toward the number of lookup operations over + * the number of insert and remove operations. This basically implies + * that optimizations supposedly helping one operation but hurting the + * other might be carefully evaluated. + * - On average not many nodes are expected to be fully populated, hence + * level compression may just complicate things. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_ddb.h" + +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +#define PCTRIE_MASK (PCTRIE_COUNT - 1) +#define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1) + +/* Flag bits stored in node pointers. */ +#define PCTRIE_ISLEAF 0x1 +#define PCTRIE_FLAGS 0x1 +#define PCTRIE_PAD PCTRIE_FLAGS + +/* Returns one unit associated with specified level. */ +#define PCTRIE_UNITLEVEL(lev) \ + ((uint64_t)1 << ((lev) * PCTRIE_WIDTH)) + +struct pctrie_node { + uint64_t pn_owner; /* Owner of record. */ + uint16_t pn_count; /* Valid children. */ + uint16_t pn_clev; /* Current level. */ + void *pn_child[PCTRIE_COUNT]; /* Child nodes. */ +}; + +/* + * Allocate a node. Pre-allocation should ensure that the request + * will always be satisfied. 
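+ * The allocfn callback is supplied by the consumer of the trie,
+ * typically backed by a UMA zone set up with pctrie_zone_init(); a
+ * NULL return is propagated to the caller of pctrie_insert() as
+ * ENOMEM.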
+ */ +static __inline struct pctrie_node * +pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner, + uint16_t count, uint16_t clevel) +{ + struct pctrie_node *node; + + node = allocfn(ptree); + if (node == NULL) + return (NULL); + node->pn_owner = owner; + node->pn_count = count; + node->pn_clev = clevel; + + return (node); +} + +/* + * Free radix node. + */ +static __inline void +pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node, + pctrie_free_t freefn) +{ +#ifdef INVARIANTS + int slot; + + KASSERT(node->pn_count == 0, + ("pctrie_node_put: node %p has %d children", node, + node->pn_count)); + for (slot = 0; slot < PCTRIE_COUNT; slot++) + KASSERT(node->pn_child[slot] == NULL, + ("pctrie_node_put: node %p has a child", node)); +#endif + freefn(ptree, node); +} + +/* + * Return the position in the array for a given level. + */ +static __inline int +pctrie_slot(uint64_t index, uint16_t level) +{ + + return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK); +} + +/* Trims the key after the specified level. */ +static __inline uint64_t +pctrie_trimkey(uint64_t index, uint16_t level) +{ + uint64_t ret; + + ret = index; + if (level > 0) { + ret >>= level * PCTRIE_WIDTH; + ret <<= level * PCTRIE_WIDTH; + } + return (ret); +} + +/* + * Get the root node for a tree. + */ +static __inline struct pctrie_node * +pctrie_getroot(struct pctrie *ptree) +{ + + return ((struct pctrie_node *)ptree->pt_root); +} + +/* + * Set the root node for a tree. + */ +static __inline void +pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node) +{ + + ptree->pt_root = (uintptr_t)node; +} + +/* + * Returns TRUE if the specified node is a leaf and FALSE otherwise. + */ +static __inline boolean_t +pctrie_isleaf(struct pctrie_node *node) +{ + + return (((uintptr_t)node & PCTRIE_ISLEAF) != 0); +} + +/* + * Returns the associated val extracted from node. + */ +static __inline uint64_t * +pctrie_toval(struct pctrie_node *node) +{ + + return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS)); +} + +/* + * Adds the val as a child of the provided node. + */ +static __inline void +pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev, + uint64_t *val) +{ + int slot; + + slot = pctrie_slot(index, clev); + node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF); +} + +/* + * Returns the slot where two keys differ. + * It cannot accept 2 equal keys. + */ +static __inline uint16_t +pctrie_keydiff(uint64_t index1, uint64_t index2) +{ + uint16_t clev; + + KASSERT(index1 != index2, ("%s: passing the same key value %jx", + __func__, (uintmax_t)index1)); + + index1 ^= index2; + for (clev = PCTRIE_LIMIT;; clev--) + if (pctrie_slot(index1, clev) != 0) + return (clev); +} + +/* + * Returns TRUE if it can be determined that key does not belong to the + * specified node. Otherwise, returns FALSE. + */ +static __inline boolean_t +pctrie_keybarr(struct pctrie_node *node, uint64_t idx) +{ + + if (node->pn_clev < PCTRIE_LIMIT) { + idx = pctrie_trimkey(idx, node->pn_clev + 1); + return (idx != node->pn_owner); + } + return (FALSE); +} + +/* + * Internal helper for pctrie_reclaim_allnodes(). + * This function is recursive. 
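+ * Recursion is safe here because the depth of the trie is capped at
+ * PCTRIE_LIMIT levels, keeping kernel stack usage small.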
+ */ +static void +pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node, + pctrie_free_t freefn) +{ + int slot; + + KASSERT(node->pn_count <= PCTRIE_COUNT, + ("pctrie_reclaim_allnodes_int: bad count in node %p", node)); + for (slot = 0; node->pn_count != 0; slot++) { + if (node->pn_child[slot] == NULL) + continue; + if (!pctrie_isleaf(node->pn_child[slot])) + pctrie_reclaim_allnodes_int(ptree, + node->pn_child[slot], freefn); + node->pn_child[slot] = NULL; + node->pn_count--; + } + pctrie_node_put(ptree, node, freefn); +} + +/* + * pctrie node zone initializer. + */ +int +pctrie_zone_init(void *mem, int size __unused, int flags __unused) +{ + struct pctrie_node *node; + + node = mem; + memset(node->pn_child, 0, sizeof(node->pn_child)); + return (0); +} + +size_t +pctrie_node_size(void) +{ + + return (sizeof(struct pctrie_node)); +} + +/* + * Inserts the key-value pair into the trie. + * Panics if the key already exists. + */ +int +pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn) +{ + uint64_t index, newind; + void **parentp; + struct pctrie_node *node, *tmp; + uint64_t *m; + int slot; + uint16_t clev; + + index = *val; + + /* + * The owner of record for root is not really important because it + * will never be used. + */ + node = pctrie_getroot(ptree); + if (node == NULL) { + ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF; + return (0); + } + parentp = (void **)&ptree->pt_root; + for (;;) { + if (pctrie_isleaf(node)) { + m = pctrie_toval(node); + if (*m == index) + panic("%s: key %jx is already present", + __func__, (uintmax_t)index); + clev = pctrie_keydiff(*m, index); + tmp = pctrie_node_get(ptree, allocfn, + pctrie_trimkey(index, clev + 1), 2, clev); + if (tmp == NULL) + return (ENOMEM); + *parentp = tmp; + pctrie_addval(tmp, index, clev, val); + pctrie_addval(tmp, *m, clev, m); + return (0); + } else if (pctrie_keybarr(node, index)) + break; + slot = pctrie_slot(index, node->pn_clev); + if (node->pn_child[slot] == NULL) { + node->pn_count++; + pctrie_addval(node, index, node->pn_clev, val); + return (0); + } + parentp = &node->pn_child[slot]; + node = node->pn_child[slot]; + } + + /* + * A new node is needed because the right insertion level is reached. + * Setup the new intermediate node and add the 2 children: the + * new object and the older edge. + */ + newind = node->pn_owner; + clev = pctrie_keydiff(newind, index); + tmp = pctrie_node_get(ptree, allocfn, + pctrie_trimkey(index, clev + 1), 2, clev); + if (tmp == NULL) + return (ENOMEM); + *parentp = tmp; + pctrie_addval(tmp, index, clev, val); + slot = pctrie_slot(newind, clev); + tmp->pn_child[slot] = node; + + return (0); +} + +/* + * Returns the value stored at the index. If the index is not present, + * NULL is returned. + */ +uint64_t * +pctrie_lookup(struct pctrie *ptree, uint64_t index) +{ + struct pctrie_node *node; + uint64_t *m; + int slot; + + node = pctrie_getroot(ptree); + while (node != NULL) { + if (pctrie_isleaf(node)) { + m = pctrie_toval(node); + if (*m == index) + return (m); + else + break; + } else if (pctrie_keybarr(node, index)) + break; + slot = pctrie_slot(index, node->pn_clev); + node = node->pn_child[slot]; + } + return (NULL); +} + +/* + * Look up the nearest entry at a position bigger than or equal to index. 
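+ * Returns a pointer to the stored key word, or NULL when no key
+ * greater than or equal to index is present.  The explicit stack of
+ * at most PCTRIE_LIMIT nodes takes the place of recursion here.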
+ */ +uint64_t * +pctrie_lookup_ge(struct pctrie *ptree, uint64_t index) +{ + struct pctrie_node *stack[PCTRIE_LIMIT]; + uint64_t inc; + uint64_t *m; + struct pctrie_node *child, *node; +#ifdef INVARIANTS + int loops = 0; +#endif + int slot, tos; + + node = pctrie_getroot(ptree); + if (node == NULL) + return (NULL); + else if (pctrie_isleaf(node)) { + m = pctrie_toval(node); + if (*m >= index) + return (m); + else + return (NULL); + } + tos = 0; + for (;;) { + /* + * If the keys differ before the current bisection node, + * then the search key might rollback to the earliest + * available bisection node or to the smallest key + * in the current node (if the owner is bigger than the + * search key). + */ + if (pctrie_keybarr(node, index)) { + if (index > node->pn_owner) { +ascend: + KASSERT(++loops < 1000, + ("pctrie_lookup_ge: too many loops")); + + /* + * Pop nodes from the stack until either the + * stack is empty or a node that could have a + * matching descendant is found. + */ + do { + if (tos == 0) + return (NULL); + node = stack[--tos]; + } while (pctrie_slot(index, + node->pn_clev) == (PCTRIE_COUNT - 1)); + + /* + * The following computation cannot overflow + * because index's slot at the current level + * is less than PCTRIE_COUNT - 1. + */ + index = pctrie_trimkey(index, + node->pn_clev); + index += PCTRIE_UNITLEVEL(node->pn_clev); + } else + index = node->pn_owner; + KASSERT(!pctrie_keybarr(node, index), + ("pctrie_lookup_ge: keybarr failed")); + } + slot = pctrie_slot(index, node->pn_clev); + child = node->pn_child[slot]; + if (pctrie_isleaf(child)) { + m = pctrie_toval(child); + if (*m >= index) + return (m); + } else if (child != NULL) + goto descend; + + /* + * Look for an available edge or val within the current + * bisection node. + */ + if (slot < (PCTRIE_COUNT - 1)) { + inc = PCTRIE_UNITLEVEL(node->pn_clev); + index = pctrie_trimkey(index, node->pn_clev); + do { + index += inc; + slot++; + child = node->pn_child[slot]; + if (pctrie_isleaf(child)) { + m = pctrie_toval(child); + if (*m >= index) + return (m); + } else if (child != NULL) + goto descend; + } while (slot < (PCTRIE_COUNT - 1)); + } + KASSERT(child == NULL || pctrie_isleaf(child), + ("pctrie_lookup_ge: child is radix node")); + + /* + * If a value or edge bigger than the search slot is not found + * in the current node, ascend to the next higher-level node. + */ + goto ascend; +descend: + KASSERT(node->pn_clev > 0, + ("pctrie_lookup_ge: pushing leaf's parent")); + KASSERT(tos < PCTRIE_LIMIT, + ("pctrie_lookup_ge: stack overflow")); + stack[tos++] = node; + node = child; + } +} + +/* + * Look up the nearest entry at a position less than or equal to index. + */ +uint64_t * +pctrie_lookup_le(struct pctrie *ptree, uint64_t index) +{ + struct pctrie_node *stack[PCTRIE_LIMIT]; + uint64_t inc; + uint64_t *m; + struct pctrie_node *child, *node; +#ifdef INVARIANTS + int loops = 0; +#endif + int slot, tos; + + node = pctrie_getroot(ptree); + if (node == NULL) + return (NULL); + else if (pctrie_isleaf(node)) { + m = pctrie_toval(node); + if (*m <= index) + return (m); + else + return (NULL); + } + tos = 0; + for (;;) { + /* + * If the keys differ before the current bisection node, + * then the search key might rollback to the earliest + * available bisection node or to the largest key + * in the current node (if the owner is smaller than the + * search key). 
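+ * (This is the mirror image of the logic in pctrie_lookup_ge() above,
+ * walking towards smaller keys instead of larger ones.)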
+ */ + if (pctrie_keybarr(node, index)) { + if (index > node->pn_owner) { + index = node->pn_owner + PCTRIE_COUNT * + PCTRIE_UNITLEVEL(node->pn_clev); + } else { +ascend: + KASSERT(++loops < 1000, + ("pctrie_lookup_le: too many loops")); + + /* + * Pop nodes from the stack until either the + * stack is empty or a node that could have a + * matching descendant is found. + */ + do { + if (tos == 0) + return (NULL); + node = stack[--tos]; + } while (pctrie_slot(index, + node->pn_clev) == 0); + + /* + * The following computation cannot overflow + * because index's slot at the current level + * is greater than 0. + */ + index = pctrie_trimkey(index, + node->pn_clev); + } + index--; + KASSERT(!pctrie_keybarr(node, index), + ("pctrie_lookup_le: keybarr failed")); + } + slot = pctrie_slot(index, node->pn_clev); + child = node->pn_child[slot]; + if (pctrie_isleaf(child)) { + m = pctrie_toval(child); + if (*m <= index) + return (m); + } else if (child != NULL) + goto descend; + + /* + * Look for an available edge or value within the current + * bisection node. + */ + if (slot > 0) { + inc = PCTRIE_UNITLEVEL(node->pn_clev); + index |= inc - 1; + do { + index -= inc; + slot--; + child = node->pn_child[slot]; + if (pctrie_isleaf(child)) { + m = pctrie_toval(child); + if (*m <= index) + return (m); + } else if (child != NULL) + goto descend; + } while (slot > 0); + } + KASSERT(child == NULL || pctrie_isleaf(child), + ("pctrie_lookup_le: child is radix node")); + + /* + * If a value or edge smaller than the search slot is not found + * in the current node, ascend to the next higher-level node. + */ + goto ascend; +descend: + KASSERT(node->pn_clev > 0, + ("pctrie_lookup_le: pushing leaf's parent")); + KASSERT(tos < PCTRIE_LIMIT, + ("pctrie_lookup_le: stack overflow")); + stack[tos++] = node; + node = child; + } +} + +/* + * Remove the specified index from the tree. + * Panics if the key is not present. + */ +void +pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn) +{ + struct pctrie_node *node, *parent; + uint64_t *m; + int i, slot; + + node = pctrie_getroot(ptree); + if (pctrie_isleaf(node)) { + m = pctrie_toval(node); + if (*m != index) + panic("%s: invalid key found", __func__); + pctrie_setroot(ptree, NULL); + return; + } + parent = NULL; + for (;;) { + if (node == NULL) + panic("pctrie_remove: impossible to locate the key"); + slot = pctrie_slot(index, node->pn_clev); + if (pctrie_isleaf(node->pn_child[slot])) { + m = pctrie_toval(node->pn_child[slot]); + if (*m != index) + panic("%s: invalid key found", __func__); + node->pn_child[slot] = NULL; + node->pn_count--; + if (node->pn_count > 1) + break; + for (i = 0; i < PCTRIE_COUNT; i++) + if (node->pn_child[i] != NULL) + break; + KASSERT(i != PCTRIE_COUNT, + ("%s: invalid node configuration", __func__)); + if (parent == NULL) + pctrie_setroot(ptree, node->pn_child[i]); + else { + slot = pctrie_slot(index, parent->pn_clev); + KASSERT(parent->pn_child[slot] == node, + ("%s: invalid child value", __func__)); + parent->pn_child[slot] = node->pn_child[i]; + } + node->pn_count--; + node->pn_child[i] = NULL; + pctrie_node_put(ptree, node, freefn); + break; + } + parent = node; + node = node->pn_child[slot]; + } +} + +/* + * Remove and free all the nodes from the tree. + * This function is recursive but there is a tight control on it as the + * maximum depth of the tree is fixed. 
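+ * Note that only the interior nodes are handed to freefn; the values
+ * referenced by the leaves are left untouched and remain the caller's
+ * responsibility.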
+ */ +void +pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn) +{ + struct pctrie_node *root; + + root = pctrie_getroot(ptree); + if (root == NULL) + return; + pctrie_setroot(ptree, NULL); + if (!pctrie_isleaf(root)) + pctrie_reclaim_allnodes_int(ptree, root, freefn); +} + +#ifdef DDB +/* + * Show details about the given node. + */ +DB_SHOW_COMMAND(pctrienode, db_show_pctrienode) +{ + struct pctrie_node *node; + int i; + + if (!have_addr) + return; + node = (struct pctrie_node *)addr; + db_printf("node %p, owner %jx, children count %u, level %u:\n", + (void *)node, (uintmax_t)node->pn_owner, node->pn_count, + node->pn_clev); + for (i = 0; i < PCTRIE_COUNT; i++) + if (node->pn_child[i] != NULL) + db_printf("slot: %d, val: %p, value: %p, clev: %d\n", + i, (void *)node->pn_child[i], + pctrie_isleaf(node->pn_child[i]) ? + pctrie_toval(node->pn_child[i]) : NULL, + node->pn_clev); +} +#endif /* DDB */ diff --git a/freebsd/sys/kern/vfs_acl.c b/freebsd/sys/kern/vfs_acl.c new file mode 100644 index 00000000..56192cfb --- /dev/null +++ b/freebsd/sys/kern/vfs_acl.c @@ -0,0 +1,600 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 1999-2006, 2016-2017 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Portions of this software were developed by BAE Systems, the University of + * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL + * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent + * Computing (TC) research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Developed by the TrustedBSD Project. + * + * ACL system calls and other functions common across different ACL types. + * Type-specific routines go into subr_acl_.c. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +CTASSERT(ACL_MAX_ENTRIES >= OLDACL_MAX_ENTRIES); + +MALLOC_DEFINE(M_ACL, "acl", "Access Control Lists"); + + +static int kern___acl_aclcheck_path(struct thread *td, const char *path, + acl_type_t type, struct acl *aclp, int follow); +static int kern___acl_delete_path(struct thread *td, const char *path, + acl_type_t type, int follow); +static int kern___acl_get_path(struct thread *td, const char *path, + acl_type_t type, struct acl *aclp, int follow); +static int kern___acl_set_path(struct thread *td, const char *path, + acl_type_t type, const struct acl *aclp, int follow); +static int vacl_set_acl(struct thread *td, struct vnode *vp, + acl_type_t type, const struct acl *aclp); +static int vacl_get_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_aclcheck(struct thread *td, struct vnode *vp, + acl_type_t type, const struct acl *aclp); + +int +acl_copy_oldacl_into_acl(const struct oldacl *source, struct acl *dest) +{ + int i; + + if (source->acl_cnt < 0 || source->acl_cnt > OLDACL_MAX_ENTRIES) + return (EINVAL); + + bzero(dest, sizeof(*dest)); + + dest->acl_cnt = source->acl_cnt; + dest->acl_maxcnt = ACL_MAX_ENTRIES; + + for (i = 0; i < dest->acl_cnt; i++) { + dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag; + dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id; + dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm; + } + + return (0); +} + +int +acl_copy_acl_into_oldacl(const struct acl *source, struct oldacl *dest) +{ + int i; + + if (source->acl_cnt > OLDACL_MAX_ENTRIES) + return (EINVAL); + + bzero(dest, sizeof(*dest)); + + dest->acl_cnt = source->acl_cnt; + + for (i = 0; i < dest->acl_cnt; i++) { + dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag; + dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id; + dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm; + } + + return (0); +} + +/* + * At one time, "struct ACL" was extended in order to add support for NFSv4 + * ACLs. Instead of creating compatibility versions of all the ACL-related + * syscalls, they were left intact. It's possible to find out what the code + * calling these syscalls (libc) expects basing on "type" argument - if it's + * either ACL_TYPE_ACCESS_OLD or ACL_TYPE_DEFAULT_OLD (which previously were + * known as ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT), then it's the "struct + * oldacl". If it's something else, then it's the new "struct acl". In the + * latter case, the routines below just copyin/copyout the contents. In the + * former case, they copyin the "struct oldacl" and convert it to the new + * format. 
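+ *
+ * For example, a pre-NFSv4 binary calling __acl_get_file(2) passes the
+ * old numeric ACL_TYPE_ACCESS value, which the kernel now sees as
+ * ACL_TYPE_ACCESS_OLD, so acl_copyout() below converts the in-kernel
+ * struct acl back into the smaller struct oldacl before copying it out.
+ * A new binary passes ACL_TYPE_ACCESS and gets the full struct acl
+ * verbatim, with acl_maxcnt checked against ACL_MAX_ENTRIES.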
+ */ +static int +acl_copyin(const void *user_acl, struct acl *kernel_acl, acl_type_t type) +{ + int error; + struct oldacl old; + + switch (type) { + case ACL_TYPE_ACCESS_OLD: + case ACL_TYPE_DEFAULT_OLD: + error = copyin(user_acl, &old, sizeof(old)); + if (error != 0) + break; + acl_copy_oldacl_into_acl(&old, kernel_acl); + break; + + default: + error = copyin(user_acl, kernel_acl, sizeof(*kernel_acl)); + if (kernel_acl->acl_maxcnt != ACL_MAX_ENTRIES) + return (EINVAL); + } + + return (error); +} + +static int +acl_copyout(const struct acl *kernel_acl, void *user_acl, acl_type_t type) +{ + uint32_t am; + int error; + struct oldacl old; + + switch (type) { + case ACL_TYPE_ACCESS_OLD: + case ACL_TYPE_DEFAULT_OLD: + error = acl_copy_acl_into_oldacl(kernel_acl, &old); + if (error != 0) + break; + + error = copyout(&old, user_acl, sizeof(old)); + break; + + default: + error = fueword32((char *)user_acl + + offsetof(struct acl, acl_maxcnt), &am); + if (error == -1) + return (EFAULT); + if (am != ACL_MAX_ENTRIES) + return (EINVAL); + + error = copyout(kernel_acl, user_acl, sizeof(*kernel_acl)); + } + + return (error); +} + +/* + * Convert "old" type - ACL_TYPE_{ACCESS,DEFAULT}_OLD - into its "new" + * counterpart. It's required for old (pre-NFSv4 ACLs) libc to work + * with new kernel. Fixing 'type' for old binaries with new libc + * is being done in lib/libc/posix1e/acl_support.c:_acl_type_unold(). + */ +static int +acl_type_unold(int type) +{ + switch (type) { + case ACL_TYPE_ACCESS_OLD: + return (ACL_TYPE_ACCESS); + + case ACL_TYPE_DEFAULT_OLD: + return (ACL_TYPE_DEFAULT); + + default: + return (type); + } +} + +/* + * These calls wrap the real vnode operations, and are called by the syscall + * code once the syscall has converted the path or file descriptor to a vnode + * (unlocked). The aclp pointer is assumed still to point to userland, so + * this should not be consumed within the kernel except by syscall code. + * Other code should directly invoke VOP_{SET,GET}ACL. + */ + +/* + * Given a vnode, set its ACL. + */ +static int +vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, + const struct acl *aclp) +{ + struct acl *inkernelacl; + struct mount *mp; + int error; + + AUDIT_ARG_VALUE(type); + inkernelacl = acl_alloc(M_WAITOK); + error = acl_copyin(aclp, inkernelacl, type); + if (error != 0) + goto out; + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + goto out; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + AUDIT_ARG_VNODE1(vp); +#ifdef MAC + error = mac_vnode_check_setacl(td->td_ucred, vp, type, inkernelacl); + if (error != 0) + goto out_unlock; +#endif + error = VOP_SETACL(vp, acl_type_unold(type), inkernelacl, + td->td_ucred, td); +#ifdef MAC +out_unlock: +#endif + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); +out: + acl_free(inkernelacl); + return (error); +} + +/* + * Given a vnode, get its ACL. 
+ */ +static int +vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl *inkernelacl; + int error; + + AUDIT_ARG_VALUE(type); + inkernelacl = acl_alloc(M_WAITOK | M_ZERO); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + AUDIT_ARG_VNODE1(vp); +#ifdef MAC + error = mac_vnode_check_getacl(td->td_ucred, vp, type); + if (error != 0) + goto out; +#endif + error = VOP_GETACL(vp, acl_type_unold(type), inkernelacl, + td->td_ucred, td); + +#ifdef MAC +out: +#endif + VOP_UNLOCK(vp, 0); + if (error == 0) + error = acl_copyout(inkernelacl, aclp, type); + acl_free(inkernelacl); + return (error); +} + +/* + * Given a vnode, delete its ACL. + */ +static int +vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) +{ + struct mount *mp; + int error; + + AUDIT_ARG_VALUE(type); + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + AUDIT_ARG_VNODE1(vp); +#ifdef MAC + error = mac_vnode_check_deleteacl(td->td_ucred, vp, type); + if (error != 0) + goto out; +#endif + error = VOP_SETACL(vp, acl_type_unold(type), 0, td->td_ucred, td); +#ifdef MAC +out: +#endif + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +/* + * Given a vnode, check whether an ACL is appropriate for it + * + * XXXRW: No vnode lock held so can't audit vnode state...? + */ +static int +vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, + const struct acl *aclp) +{ + struct acl *inkernelacl; + int error; + + inkernelacl = acl_alloc(M_WAITOK); + error = acl_copyin(aclp, inkernelacl, type); + if (error != 0) + goto out; + error = VOP_ACLCHECK(vp, acl_type_unold(type), inkernelacl, + td->td_ucred, td); +out: + acl_free(inkernelacl); + return (error); +} + +/* + * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't + * need to lock, as the vacl_ code will get/release any locks required. + */ + +/* + * Given a file path, get an ACL for it + */ +int +sys___acl_get_file(struct thread *td, struct __acl_get_file_args *uap) +{ + + return (kern___acl_get_path(td, uap->path, uap->type, uap->aclp, + FOLLOW)); +} + +/* + * Given a file path, get an ACL for it; don't follow links. + */ +int +sys___acl_get_link(struct thread *td, struct __acl_get_link_args *uap) +{ + + return(kern___acl_get_path(td, uap->path, uap->type, uap->aclp, + NOFOLLOW)); +} + +static int +kern___acl_get_path(struct thread *td, const char *path, acl_type_t type, + struct acl *aclp, int follow) +{ + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error == 0) { + error = vacl_get_acl(td, nd.ni_vp, type, aclp); + NDFREE(&nd, 0); + } + return (error); +} + +/* + * Given a file path, set an ACL for it. + */ +int +sys___acl_set_file(struct thread *td, struct __acl_set_file_args *uap) +{ + + return(kern___acl_set_path(td, uap->path, uap->type, uap->aclp, + FOLLOW)); +} + +/* + * Given a file path, set an ACL for it; don't follow links. 
+ */ +int +sys___acl_set_link(struct thread *td, struct __acl_set_link_args *uap) +{ + + return(kern___acl_set_path(td, uap->path, uap->type, uap->aclp, + NOFOLLOW)); +} + +static int +kern___acl_set_path(struct thread *td, const char *path, + acl_type_t type, const struct acl *aclp, int follow) +{ + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error == 0) { + error = vacl_set_acl(td, nd.ni_vp, type, aclp); + NDFREE(&nd, 0); + } + return (error); +} + +/* + * Given a file descriptor, get an ACL for it. + */ +int +sys___acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) +{ + struct file *fp; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->filedes); + error = getvnode(td, uap->filedes, + cap_rights_init(&rights, CAP_ACL_GET), &fp); + if (error == 0) { + error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp); + fdrop(fp, td); + } + return (error); +} + +/* + * Given a file descriptor, set an ACL for it. + */ +int +sys___acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) +{ + struct file *fp; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->filedes); + error = getvnode(td, uap->filedes, + cap_rights_init(&rights, CAP_ACL_SET), &fp); + if (error == 0) { + error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp); + fdrop(fp, td); + } + return (error); +} + +/* + * Given a file path, delete an ACL from it. + */ +int +sys___acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) +{ + + return (kern___acl_delete_path(td, uap->path, uap->type, FOLLOW)); +} + +/* + * Given a file path, delete an ACL from it; don't follow links. + */ +int +sys___acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap) +{ + + return (kern___acl_delete_path(td, uap->path, uap->type, NOFOLLOW)); +} + +static int +kern___acl_delete_path(struct thread *td, const char *path, + acl_type_t type, int follow) +{ + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, follow, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error == 0) { + error = vacl_delete(td, nd.ni_vp, type); + NDFREE(&nd, 0); + } + return (error); +} + +/* + * Given a file path, delete an ACL from it. + */ +int +sys___acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) +{ + struct file *fp; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->filedes); + error = getvnode(td, uap->filedes, + cap_rights_init(&rights, CAP_ACL_DELETE), &fp); + if (error == 0) { + error = vacl_delete(td, fp->f_vnode, uap->type); + fdrop(fp, td); + } + return (error); +} + +/* + * Given a file path, check an ACL for it. + */ +int +sys___acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) +{ + + return (kern___acl_aclcheck_path(td, uap->path, uap->type, uap->aclp, + FOLLOW)); +} + +/* + * Given a file path, check an ACL for it; don't follow links. + */ +int +sys___acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap) +{ + return (kern___acl_aclcheck_path(td, uap->path, uap->type, uap->aclp, + NOFOLLOW)); +} + +static int +kern___acl_aclcheck_path(struct thread *td, const char *path, acl_type_t type, + struct acl *aclp, int follow) +{ + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, follow, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error == 0) { + error = vacl_aclcheck(td, nd.ni_vp, type, aclp); + NDFREE(&nd, 0); + } + return (error); +} + +/* + * Given a file descriptor, check an ACL for it. 
+ */ +int +sys___acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) +{ + struct file *fp; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->filedes); + error = getvnode(td, uap->filedes, + cap_rights_init(&rights, CAP_ACL_CHECK), &fp); + if (error == 0) { + error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp); + fdrop(fp, td); + } + return (error); +} + +struct acl * +acl_alloc(int flags) +{ + struct acl *aclp; + + aclp = malloc(sizeof(*aclp), M_ACL, flags); + if (aclp == NULL) + return (NULL); + + aclp->acl_maxcnt = ACL_MAX_ENTRIES; + + return (aclp); +} + +void +acl_free(struct acl *aclp) +{ + + free(aclp, M_ACL); +} diff --git a/freebsd/sys/kern/vfs_aio.c b/freebsd/sys/kern/vfs_aio.c new file mode 100644 index 00000000..350c51a0 --- /dev/null +++ b/freebsd/sys/kern/vfs_aio.c @@ -0,0 +1,2987 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 1997 John S. Dyson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. John S. Dyson's name may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * DISCLAIMER: This code isn't warranted to do anything useful. Anything + * bad that happens because of using this software isn't the responsibility + * of the author. This software is distributed AS-IS. + */ + +/* + * This file contains support for the POSIX 1003.1B AIO/LIO facility. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Counter for allocating reference ids to new jobs. Wrapped to 1 on + * overflow. (XXX will be removed soon.) + */ +static u_long jobrefid; + +/* + * Counter for aio_fsync. 
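+ * Every queued job is stamped with the next value (job->seqno) so that
+ * an aio_fsync() request only waits for jobs queued before it; see the
+ * seqno comparison in aio_queue_file().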
+ */ +static uint64_t jobseqno; + +#ifndef MAX_AIO_PER_PROC +#define MAX_AIO_PER_PROC 32 +#endif + +#ifndef MAX_AIO_QUEUE_PER_PROC +#define MAX_AIO_QUEUE_PER_PROC 256 +#endif + +#ifndef MAX_AIO_QUEUE +#define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */ +#endif + +#ifndef MAX_BUF_AIO +#define MAX_BUF_AIO 16 +#endif + +FEATURE(aio, "Asynchronous I/O"); +SYSCTL_DECL(_p1003_1b); + +static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); +static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list"); + +static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, + "Async IO management"); + +static int enable_aio_unsafe = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0, + "Permit asynchronous IO on all file types, not just known-safe types"); + +static unsigned int unsafe_warningcnt = 1; +SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW, + &unsafe_warningcnt, 0, + "Warnings that will be triggered upon failed IO requests on unsafe files"); + +static int max_aio_procs = MAX_AIO_PROCS; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, + "Maximum number of kernel processes to use for handling async IO "); + +static int num_aio_procs = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, + "Number of presently active kernel processes for async IO"); + +/* + * The code will adjust the actual number of AIO processes towards this + * number when it gets a chance. + */ +static int target_aio_procs = TARGET_AIO_PROCS; +SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, + 0, + "Preferred number of ready kernel processes for async IO"); + +static int max_queue_count = MAX_AIO_QUEUE; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, + "Maximum number of aio requests to queue, globally"); + +static int num_queue_count = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, + "Number of queued aio requests"); + +static int num_buf_aio = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, + "Number of aio requests presently handled by the buf subsystem"); + +static int num_unmapped_aio = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio, + 0, + "Number of aio requests presently handled by unmapped I/O buffers"); + +/* Number of async I/O processes in the process of being started */ +/* XXX This should be local to aio_aqueue() */ +static int num_aio_resv_start = 0; + +static int aiod_lifetime; +SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, + "Maximum lifetime for idle aiod"); + +static int max_aio_per_proc = MAX_AIO_PER_PROC; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, + 0, + "Maximum active aio requests per process"); + +static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, + &max_aio_queue_per_proc, 0, + "Maximum queued aio requests per process"); + +static int max_buf_aio = MAX_BUF_AIO; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, + "Maximum buf aio requests per process"); + +/* + * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires + * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with + * vfs.aio.aio_listio_max. 
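+ * A userland program is expected to read this limit roughly as
+ *
+ *	long lmax = sysconf(_SC_AIO_LISTIO_MAX);
+ *
+ * which libc answers from the p1003_1b sysctl tree declared below.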
+ */ +SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max, + CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc, + 0, "Maximum aio requests for a single lio_listio call"); + +#ifdef COMPAT_FREEBSD6 +typedef struct oaiocb { + int aio_fildes; /* File descriptor */ + off_t aio_offset; /* File offset for I/O */ + volatile void *aio_buf; /* I/O buffer in process space */ + size_t aio_nbytes; /* Number of bytes for I/O */ + struct osigevent aio_sigevent; /* Signal to deliver */ + int aio_lio_opcode; /* LIO opcode */ + int aio_reqprio; /* Request priority -- ignored */ + struct __aiocb_private _aiocb_private; +} oaiocb_t; +#endif + +/* + * Below is a key of locks used to protect each member of struct kaiocb + * aioliojob and kaioinfo and any backends. + * + * * - need not protected + * a - locked by kaioinfo lock + * b - locked by backend lock, the backend lock can be null in some cases, + * for example, BIO belongs to this type, in this case, proc lock is + * reused. + * c - locked by aio_job_mtx, the lock for the generic file I/O backend. + */ + +/* + * If the routine that services an AIO request blocks while running in an + * AIO kernel process it can starve other I/O requests. BIO requests + * queued via aio_qbio() complete asynchronously and do not use AIO kernel + * processes at all. Socket I/O requests use a separate pool of + * kprocs and also force non-blocking I/O. Other file I/O requests + * use the generic fo_read/fo_write operations which can block. The + * fsync and mlock operations can also block while executing. Ideally + * none of these requests would block while executing. + * + * Note that the service routines cannot toggle O_NONBLOCK in the file + * structure directly while handling a request due to races with + * userland threads. 
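+ *
+ * In short: BIO-capable character devices are driven through aio_qbio(),
+ * file types with their own fo_aio_queue() method (such as sockets) use
+ * that backend, and everything else falls back to the generic kernel
+ * processes run by aio_daemon() below.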
+ */ + +/* jobflags */ +#define KAIOCB_QUEUEING 0x01 +#define KAIOCB_CANCELLED 0x02 +#define KAIOCB_CANCELLING 0x04 +#define KAIOCB_CHECKSYNC 0x08 +#define KAIOCB_CLEARED 0x10 +#define KAIOCB_FINISHED 0x20 + +/* + * AIO process info + */ +#define AIOP_FREE 0x1 /* proc on free queue */ + +struct aioproc { + int aioprocflags; /* (c) AIO proc flags */ + TAILQ_ENTRY(aioproc) list; /* (c) list of processes */ + struct proc *aioproc; /* (*) the AIO proc */ +}; + +/* + * data-structure for lio signal management + */ +struct aioliojob { + int lioj_flags; /* (a) listio flags */ + int lioj_count; /* (a) listio flags */ + int lioj_finished_count; /* (a) listio flags */ + struct sigevent lioj_signal; /* (a) signal on all I/O done */ + TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ + struct knlist klist; /* (a) list of knotes */ + ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ +}; + +#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ +#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ +#define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ + +/* + * per process aio data structure + */ +struct kaioinfo { + struct mtx kaio_mtx; /* the lock to protect this struct */ + int kaio_flags; /* (a) per process kaio flags */ + int kaio_active_count; /* (c) number of currently used AIOs */ + int kaio_count; /* (a) size of AIO queue */ + int kaio_buffer_count; /* (a) number of bio buffers */ + TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */ + TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */ + TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ + TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */ + TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */ + TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */ + struct task kaio_task; /* (*) task to kick aio processes */ + struct task kaio_sync_task; /* (*) task to schedule fsync jobs */ +}; + +#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) +#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) +#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) +#define AIO_MTX(ki) (&(ki)->kaio_mtx) + +#define KAIO_RUNDOWN 0x1 /* process is being run down */ +#define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */ + +/* + * Operations used to interact with userland aio control blocks. + * Different ABIs provide their own operations. 
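+ * The native ABI uses aiocb_ops below; COMPAT_FREEBSD6 supplies
+ * aiocb_ops_osigevent for the old sigevent layout, and other compat
+ * ABIs are expected to provide their own copyin/fetch/store helpers so
+ * the core request path never touches user memory directly.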
+ */ +struct aiocb_ops { + int (*copyin)(struct aiocb *ujob, struct aiocb *kjob); + long (*fetch_status)(struct aiocb *ujob); + long (*fetch_error)(struct aiocb *ujob); + int (*store_status)(struct aiocb *ujob, long status); + int (*store_error)(struct aiocb *ujob, long error); + int (*store_kernelinfo)(struct aiocb *ujob, long jobref); + int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); +}; + +static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */ +static struct sema aio_newproc_sem; +static struct mtx aio_job_mtx; +static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ +static struct unrhdr *aiod_unr; + +void aio_init_aioinfo(struct proc *p); +static int aio_onceonly(void); +static int aio_free_entry(struct kaiocb *job); +static void aio_process_rw(struct kaiocb *job); +static void aio_process_sync(struct kaiocb *job); +static void aio_process_mlock(struct kaiocb *job); +static void aio_schedule_fsync(void *context, int pending); +static int aio_newproc(int *); +int aio_aqueue(struct thread *td, struct aiocb *ujob, + struct aioliojob *lio, int type, struct aiocb_ops *ops); +static int aio_queue_file(struct file *fp, struct kaiocb *job); +static void aio_biowakeup(struct bio *bp); +static void aio_proc_rundown(void *arg, struct proc *p); +static void aio_proc_rundown_exec(void *arg, struct proc *p, + struct image_params *imgp); +static int aio_qbio(struct proc *p, struct kaiocb *job); +static void aio_daemon(void *param); +static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job); +static bool aio_clear_cancel_function_locked(struct kaiocb *job); +static int aio_kick(struct proc *userp); +static void aio_kick_nowait(struct proc *userp); +static void aio_kick_helper(void *context, int pending); +static int filt_aioattach(struct knote *kn); +static void filt_aiodetach(struct knote *kn); +static int filt_aio(struct knote *kn, long hint); +static int filt_lioattach(struct knote *kn); +static void filt_liodetach(struct knote *kn); +static int filt_lio(struct knote *kn, long hint); + +/* + * Zones for: + * kaio Per process async io info + * aiop async io process data + * aiocb async io jobs + * aiolio list io jobs + */ +static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone; + +/* kqueue filters for aio */ +static struct filterops aio_filtops = { + .f_isfd = 0, + .f_attach = filt_aioattach, + .f_detach = filt_aiodetach, + .f_event = filt_aio, +}; +static struct filterops lio_filtops = { + .f_isfd = 0, + .f_attach = filt_lioattach, + .f_detach = filt_liodetach, + .f_event = filt_lio +}; + +static eventhandler_tag exit_tag, exec_tag; + +TASKQUEUE_DEFINE_THREAD(aiod_kick); + +/* + * Main operations function for use as a kernel module. 
+ */ +static int +aio_modload(struct module *module, int cmd, void *arg) +{ + int error = 0; + + switch (cmd) { + case MOD_LOAD: + aio_onceonly(); + break; + case MOD_SHUTDOWN: + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +static moduledata_t aio_mod = { + "aio", + &aio_modload, + NULL +}; + +DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); +MODULE_VERSION(aio, 1); + +/* + * Startup initialization + */ +static int +aio_onceonly(void) +{ + + exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, + EVENTHANDLER_PRI_ANY); + exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, + NULL, EVENTHANDLER_PRI_ANY); + kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); + kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); + TAILQ_INIT(&aio_freeproc); + sema_init(&aio_newproc_sem, 0, "aio_new_proc"); + mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); + TAILQ_INIT(&aio_jobs); + aiod_unr = new_unrhdr(1, INT_MAX, NULL); + kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiod_lifetime = AIOD_LIFETIME_DEFAULT; + jobrefid = 1; + p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO); + p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); + p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); + + return (0); +} + +/* + * Init the per-process aioinfo structure. The aioinfo limits are set + * per-process for user limit (resource) management. + */ +void +aio_init_aioinfo(struct proc *p) +{ + struct kaioinfo *ki; + + ki = uma_zalloc(kaio_zone, M_WAITOK); + mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); + ki->kaio_flags = 0; + ki->kaio_active_count = 0; + ki->kaio_count = 0; + ki->kaio_buffer_count = 0; + TAILQ_INIT(&ki->kaio_all); + TAILQ_INIT(&ki->kaio_done); + TAILQ_INIT(&ki->kaio_jobqueue); + TAILQ_INIT(&ki->kaio_liojoblist); + TAILQ_INIT(&ki->kaio_syncqueue); + TAILQ_INIT(&ki->kaio_syncready); + TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); + TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki); + PROC_LOCK(p); + if (p->p_aioinfo == NULL) { + p->p_aioinfo = ki; + PROC_UNLOCK(p); + } else { + PROC_UNLOCK(p); + mtx_destroy(&ki->kaio_mtx); + uma_zfree(kaio_zone, ki); + } + + while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) + aio_newproc(NULL); +} + +static int +aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) +{ + struct thread *td; + int error; + + error = sigev_findtd(p, sigev, &td); + if (error) + return (error); + if (!KSI_ONQ(ksi)) { + ksiginfo_set_sigev(ksi, sigev); + ksi->ksi_code = SI_ASYNCIO; + ksi->ksi_flags |= KSI_EXT | KSI_INS; + tdsendsignal(p, td, ksi->ksi_signo, ksi); + } + PROC_UNLOCK(p); + return (error); +} + +/* + * Free a job entry. Wait for completion if it is currently active, but don't + * delay forever. If we delay, we return a flag that says that we have to + * restart the queue scan. 
+ */ +static int +aio_free_entry(struct kaiocb *job) +{ + struct kaioinfo *ki; + struct aioliojob *lj; + struct proc *p; + + p = job->userproc; + MPASS(curproc == p); + ki = p->p_aioinfo; + MPASS(ki != NULL); + + AIO_LOCK_ASSERT(ki, MA_OWNED); + MPASS(job->jobflags & KAIOCB_FINISHED); + + atomic_subtract_int(&num_queue_count, 1); + + ki->kaio_count--; + MPASS(ki->kaio_count >= 0); + + TAILQ_REMOVE(&ki->kaio_done, job, plist); + TAILQ_REMOVE(&ki->kaio_all, job, allist); + + lj = job->lio; + if (lj) { + lj->lioj_count--; + lj->lioj_finished_count--; + + if (lj->lioj_count == 0) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + /* lio is going away, we need to destroy any knotes */ + knlist_delete(&lj->klist, curthread, 1); + PROC_LOCK(p); + sigqueue_take(&lj->lioj_ksi); + PROC_UNLOCK(p); + uma_zfree(aiolio_zone, lj); + } + } + + /* job is going away, we need to destroy any knotes */ + knlist_delete(&job->klist, curthread, 1); + PROC_LOCK(p); + sigqueue_take(&job->ksi); + PROC_UNLOCK(p); + + AIO_UNLOCK(ki); + + /* + * The thread argument here is used to find the owning process + * and is also passed to fo_close() which may pass it to various + * places such as devsw close() routines. Because of that, we + * need a thread pointer from the process owning the job that is + * persistent and won't disappear out from under us or move to + * another process. + * + * Currently, all the callers of this function call it to remove + * a kaiocb from the current process' job list either via a + * syscall or due to the current process calling exit() or + * execve(). Thus, we know that p == curproc. We also know that + * curthread can't exit since we are curthread. + * + * Therefore, we use curthread as the thread to pass to + * knlist_delete(). This does mean that it is possible for the + * thread pointer at close time to differ from the thread pointer + * at open time, but this is already true of file descriptors in + * a multithreaded process. + */ + if (job->fd_file) + fdrop(job->fd_file, curthread); + crfree(job->cred); + uma_zfree(aiocb_zone, job); + AIO_LOCK(ki); + + return (0); +} + +static void +aio_proc_rundown_exec(void *arg, struct proc *p, + struct image_params *imgp __unused) +{ + aio_proc_rundown(arg, p); +} + +static int +aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job) +{ + aio_cancel_fn_t *func; + int cancelled; + + AIO_LOCK_ASSERT(ki, MA_OWNED); + if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED)) + return (0); + MPASS((job->jobflags & KAIOCB_CANCELLING) == 0); + job->jobflags |= KAIOCB_CANCELLED; + + func = job->cancel_fn; + + /* + * If there is no cancel routine, just leave the job marked as + * cancelled. The job should be in active use by a caller who + * should complete it normally or when it fails to install a + * cancel routine. + */ + if (func == NULL) + return (0); + + /* + * Set the CANCELLING flag so that aio_complete() will defer + * completions of this job. This prevents the job from being + * freed out from under the cancel callback. After the + * callback any deferred completion (whether from the callback + * or any other source) will be completed. 
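+ * The flag is cleared again once the callback returns; if the job
+ * finished in the meantime, the deferred completion is delivered below
+ * via aio_bio_done_notify().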
+ */ + job->jobflags |= KAIOCB_CANCELLING; + AIO_UNLOCK(ki); + func(job); + AIO_LOCK(ki); + job->jobflags &= ~KAIOCB_CANCELLING; + if (job->jobflags & KAIOCB_FINISHED) { + cancelled = job->uaiocb._aiocb_private.error == ECANCELED; + TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); + aio_bio_done_notify(p, job); + } else { + /* + * The cancel callback might have scheduled an + * operation to cancel this request, but it is + * only counted as cancelled if the request is + * cancelled when the callback returns. + */ + cancelled = 0; + } + return (cancelled); +} + +/* + * Rundown the jobs for a given process. + */ +static void +aio_proc_rundown(void *arg, struct proc *p) +{ + struct kaioinfo *ki; + struct aioliojob *lj; + struct kaiocb *job, *jobn; + + KASSERT(curthread->td_proc == p, + ("%s: called on non-curproc", __func__)); + ki = p->p_aioinfo; + if (ki == NULL) + return; + + AIO_LOCK(ki); + ki->kaio_flags |= KAIO_RUNDOWN; + +restart: + + /* + * Try to cancel all pending requests. This code simulates + * aio_cancel on all pending I/O requests. + */ + TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { + aio_cancel_job(p, ki, job); + } + + /* Wait for all running I/O to be finished */ + if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) { + ki->kaio_flags |= KAIO_WAKEUP; + msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); + goto restart; + } + + /* Free all completed I/O requests. */ + while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL) + aio_free_entry(job); + + while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { + if (lj->lioj_count == 0) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + knlist_delete(&lj->klist, curthread, 1); + PROC_LOCK(p); + sigqueue_take(&lj->lioj_ksi); + PROC_UNLOCK(p); + uma_zfree(aiolio_zone, lj); + } else { + panic("LIO job not cleaned up: C:%d, FC:%d\n", + lj->lioj_count, lj->lioj_finished_count); + } + } + AIO_UNLOCK(ki); + taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task); + taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task); + mtx_destroy(&ki->kaio_mtx); + uma_zfree(kaio_zone, ki); + p->p_aioinfo = NULL; +} + +/* + * Select a job to run (called by an AIO daemon). + */ +static struct kaiocb * +aio_selectjob(struct aioproc *aiop) +{ + struct kaiocb *job; + struct kaioinfo *ki; + struct proc *userp; + + mtx_assert(&aio_job_mtx, MA_OWNED); +restart: + TAILQ_FOREACH(job, &aio_jobs, list) { + userp = job->userproc; + ki = userp->p_aioinfo; + + if (ki->kaio_active_count < max_aio_per_proc) { + TAILQ_REMOVE(&aio_jobs, job, list); + if (!aio_clear_cancel_function(job)) + goto restart; + + /* Account for currently active jobs. */ + ki->kaio_active_count++; + break; + } + } + return (job); +} + +/* + * Move all data to a permanent storage device. This code + * simulates the fsync syscall. + */ +static int +aio_fsync_vnode(struct thread *td, struct vnode *vp) +{ + struct mount *mp; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto drop; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (vp->v_object != NULL) { + VM_OBJECT_WLOCK(vp->v_object); + vm_object_page_clean(vp->v_object, 0, 0, 0); + VM_OBJECT_WUNLOCK(vp->v_object); + } + error = VOP_FSYNC(vp, MNT_WAIT, td); + + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); +drop: + return (error); +} + +/* + * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that + * does the I/O request for the non-bio version of the operations. 
The normal + * vn operations are used, and this code should work in all instances for every + * type of file, including pipes, sockets, fifos, and regular files. + * + * XXX I don't think it works well for socket, pipe, and fifo. + */ +static void +aio_process_rw(struct kaiocb *job) +{ + struct ucred *td_savedcred; + struct thread *td; + struct aiocb *cb; + struct file *fp; + struct uio auio; + struct iovec aiov; + ssize_t cnt; + long msgsnd_st, msgsnd_end; + long msgrcv_st, msgrcv_end; + long oublock_st, oublock_end; + long inblock_st, inblock_end; + int error; + + KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ || + job->uaiocb.aio_lio_opcode == LIO_WRITE, + ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); + + aio_switch_vmspace(job); + td = curthread; + td_savedcred = td->td_ucred; + td->td_ucred = job->cred; + cb = &job->uaiocb; + fp = job->fd_file; + + aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; + aiov.iov_len = cb->aio_nbytes; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = cb->aio_offset; + auio.uio_resid = cb->aio_nbytes; + cnt = cb->aio_nbytes; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + + msgrcv_st = td->td_ru.ru_msgrcv; + msgsnd_st = td->td_ru.ru_msgsnd; + inblock_st = td->td_ru.ru_inblock; + oublock_st = td->td_ru.ru_oublock; + + /* + * aio_aqueue() acquires a reference to the file that is + * released in aio_free_entry(). + */ + if (cb->aio_lio_opcode == LIO_READ) { + auio.uio_rw = UIO_READ; + if (auio.uio_resid == 0) + error = 0; + else + error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); + } else { + if (fp->f_type == DTYPE_VNODE) + bwillwrite(); + auio.uio_rw = UIO_WRITE; + error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); + } + msgrcv_end = td->td_ru.ru_msgrcv; + msgsnd_end = td->td_ru.ru_msgsnd; + inblock_end = td->td_ru.ru_inblock; + oublock_end = td->td_ru.ru_oublock; + + job->msgrcv = msgrcv_end - msgrcv_st; + job->msgsnd = msgsnd_end - msgsnd_st; + job->inblock = inblock_end - inblock_st; + job->outblock = oublock_end - oublock_st; + + if ((error) && (auio.uio_resid != cnt)) { + if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) + error = 0; + if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { + PROC_LOCK(job->userproc); + kern_psignal(job->userproc, SIGPIPE); + PROC_UNLOCK(job->userproc); + } + } + + cnt -= auio.uio_resid; + td->td_ucred = td_savedcred; + if (error) + aio_complete(job, -1, error); + else + aio_complete(job, cnt, 0); +} + +static void +aio_process_sync(struct kaiocb *job) +{ + struct thread *td = curthread; + struct ucred *td_savedcred = td->td_ucred; + struct file *fp = job->fd_file; + int error = 0; + + KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC, + ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); + + td->td_ucred = job->cred; + if (fp->f_vnode != NULL) + error = aio_fsync_vnode(td, fp->f_vnode); + td->td_ucred = td_savedcred; + if (error) + aio_complete(job, -1, error); + else + aio_complete(job, 0, 0); +} + +static void +aio_process_mlock(struct kaiocb *job) +{ + struct aiocb *cb = &job->uaiocb; + int error; + + KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK, + ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); + + aio_switch_vmspace(job); + error = kern_mlock(job->userproc, job->cred, + __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes); + aio_complete(job, error != 0 ? 
-1 : 0, error); +} + +static void +aio_bio_done_notify(struct proc *userp, struct kaiocb *job) +{ + struct aioliojob *lj; + struct kaioinfo *ki; + struct kaiocb *sjob, *sjobn; + int lj_done; + bool schedule_fsync; + + ki = userp->p_aioinfo; + AIO_LOCK_ASSERT(ki, MA_OWNED); + lj = job->lio; + lj_done = 0; + if (lj) { + lj->lioj_finished_count++; + if (lj->lioj_count == lj->lioj_finished_count) + lj_done = 1; + } + TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist); + MPASS(job->jobflags & KAIOCB_FINISHED); + + if (ki->kaio_flags & KAIO_RUNDOWN) + goto notification_done; + + if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || + job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) + aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi); + + KNOTE_LOCKED(&job->klist, 1); + + if (lj_done) { + if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { + lj->lioj_flags |= LIOJ_KEVENT_POSTED; + KNOTE_LOCKED(&lj->klist, 1); + } + if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) + == LIOJ_SIGNAL + && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || + lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { + aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi); + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + } + } + +notification_done: + if (job->jobflags & KAIOCB_CHECKSYNC) { + schedule_fsync = false; + TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) { + if (job->fd_file != sjob->fd_file || + job->seqno >= sjob->seqno) + continue; + if (--sjob->pending > 0) + continue; + TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list); + if (!aio_clear_cancel_function_locked(sjob)) + continue; + TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list); + schedule_fsync = true; + } + if (schedule_fsync) + taskqueue_enqueue(taskqueue_aiod_kick, + &ki->kaio_sync_task); + } + if (ki->kaio_flags & KAIO_WAKEUP) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(&userp->p_aioinfo); + } +} + +static void +aio_schedule_fsync(void *context, int pending) +{ + struct kaioinfo *ki; + struct kaiocb *job; + + ki = context; + AIO_LOCK(ki); + while (!TAILQ_EMPTY(&ki->kaio_syncready)) { + job = TAILQ_FIRST(&ki->kaio_syncready); + TAILQ_REMOVE(&ki->kaio_syncready, job, list); + AIO_UNLOCK(ki); + aio_schedule(job, aio_process_sync); + AIO_LOCK(ki); + } + AIO_UNLOCK(ki); +} + +bool +aio_cancel_cleared(struct kaiocb *job) +{ + + /* + * The caller should hold the same queue lock held when + * aio_clear_cancel_function() was called and set this flag + * ensuring this check sees an up-to-date value. However, + * there is no way to assert that. 
+ */ + return ((job->jobflags & KAIOCB_CLEARED) != 0); +} + +static bool +aio_clear_cancel_function_locked(struct kaiocb *job) +{ + + AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); + MPASS(job->cancel_fn != NULL); + if (job->jobflags & KAIOCB_CANCELLING) { + job->jobflags |= KAIOCB_CLEARED; + return (false); + } + job->cancel_fn = NULL; + return (true); +} + +bool +aio_clear_cancel_function(struct kaiocb *job) +{ + struct kaioinfo *ki; + bool ret; + + ki = job->userproc->p_aioinfo; + AIO_LOCK(ki); + ret = aio_clear_cancel_function_locked(job); + AIO_UNLOCK(ki); + return (ret); +} + +static bool +aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func) +{ + + AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); + if (job->jobflags & KAIOCB_CANCELLED) + return (false); + job->cancel_fn = func; + return (true); +} + +bool +aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func) +{ + struct kaioinfo *ki; + bool ret; + + ki = job->userproc->p_aioinfo; + AIO_LOCK(ki); + ret = aio_set_cancel_function_locked(job, func); + AIO_UNLOCK(ki); + return (ret); +} + +void +aio_complete(struct kaiocb *job, long status, int error) +{ + struct kaioinfo *ki; + struct proc *userp; + + job->uaiocb._aiocb_private.error = error; + job->uaiocb._aiocb_private.status = status; + + userp = job->userproc; + ki = userp->p_aioinfo; + + AIO_LOCK(ki); + KASSERT(!(job->jobflags & KAIOCB_FINISHED), + ("duplicate aio_complete")); + job->jobflags |= KAIOCB_FINISHED; + if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) { + TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); + aio_bio_done_notify(userp, job); + } + AIO_UNLOCK(ki); +} + +void +aio_cancel(struct kaiocb *job) +{ + + aio_complete(job, -1, ECANCELED); +} + +void +aio_switch_vmspace(struct kaiocb *job) +{ + + vmspace_switch_aio(job->userproc->p_vmspace); +} + +/* + * The AIO daemon, most of the actual work is done in aio_process_*, + * but the setup (and address space mgmt) is done in this routine. + */ +static void +aio_daemon(void *_id) +{ + struct kaiocb *job; + struct aioproc *aiop; + struct kaioinfo *ki; + struct proc *p; + struct vmspace *myvm; + struct thread *td = curthread; + int id = (intptr_t)_id; + + /* + * Grab an extra reference on the daemon's vmspace so that it + * doesn't get freed by jobs that switch to a different + * vmspace. + */ + p = td->td_proc; + myvm = vmspace_acquire_ref(p); + + KASSERT(p->p_textvp == NULL, ("kthread has a textvp")); + + /* + * Allocate and ready the aio control info. There is one aiop structure + * per daemon. + */ + aiop = uma_zalloc(aiop_zone, M_WAITOK); + aiop->aioproc = p; + aiop->aioprocflags = 0; + + /* + * Wakeup parent process. (Parent sleeps to keep from blasting away + * and creating too many daemons.) + */ + sema_post(&aio_newproc_sem); + + mtx_lock(&aio_job_mtx); + for (;;) { + /* + * Take daemon off of free queue + */ + if (aiop->aioprocflags & AIOP_FREE) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + aiop->aioprocflags &= ~AIOP_FREE; + } + + /* + * Check for jobs. + */ + while ((job = aio_selectjob(aiop)) != NULL) { + mtx_unlock(&aio_job_mtx); + + ki = job->userproc->p_aioinfo; + job->handle_fn(job); + + mtx_lock(&aio_job_mtx); + /* Decrement the active job count. */ + ki->kaio_active_count--; + } + + /* + * Disconnect from user address space. + */ + if (p->p_vmspace != myvm) { + mtx_unlock(&aio_job_mtx); + vmspace_switch_aio(myvm); + mtx_lock(&aio_job_mtx); + /* + * We have to restart to avoid race, we only sleep if + * no job can be selected. 
+ */ + continue; + } + + mtx_assert(&aio_job_mtx, MA_OWNED); + + TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); + aiop->aioprocflags |= AIOP_FREE; + + /* + * If daemon is inactive for a long time, allow it to exit, + * thereby freeing resources. + */ + if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy", + aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) && + (aiop->aioprocflags & AIOP_FREE) && + num_aio_procs > target_aio_procs) + break; + } + TAILQ_REMOVE(&aio_freeproc, aiop, list); + num_aio_procs--; + mtx_unlock(&aio_job_mtx); + uma_zfree(aiop_zone, aiop); + free_unr(aiod_unr, id); + vmspace_free(myvm); + + KASSERT(p->p_vmspace == myvm, + ("AIOD: bad vmspace for exiting daemon")); + KASSERT(myvm->vm_refcnt > 1, + ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt)); + kproc_exit(0); +} + +/* + * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The + * AIO daemon modifies its environment itself. + */ +static int +aio_newproc(int *start) +{ + int error; + struct proc *p; + int id; + + id = alloc_unr(aiod_unr); + error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, + RFNOWAIT, 0, "aiod%d", id); + if (error == 0) { + /* + * Wait until daemon is started. + */ + sema_wait(&aio_newproc_sem); + mtx_lock(&aio_job_mtx); + num_aio_procs++; + if (start != NULL) + (*start)--; + mtx_unlock(&aio_job_mtx); + } else { + free_unr(aiod_unr, id); + } + return (error); +} + +/* + * Try the high-performance, low-overhead bio method for eligible + * VCHR devices. This method doesn't use an aio helper thread, and + * thus has very low overhead. + * + * Assumes that the caller, aio_aqueue(), has incremented the file + * structure's reference count, preventing its deallocation for the + * duration of this call. + */ +static int +aio_qbio(struct proc *p, struct kaiocb *job) +{ + struct aiocb *cb; + struct file *fp; + struct bio *bp; + struct buf *pbuf; + struct vnode *vp; + struct cdevsw *csw; + struct cdev *dev; + struct kaioinfo *ki; + int error, ref, poff; + vm_prot_t prot; + + cb = &job->uaiocb; + fp = job->fd_file; + + if (!(cb->aio_lio_opcode == LIO_WRITE || + cb->aio_lio_opcode == LIO_READ)) + return (-1); + if (fp == NULL || fp->f_type != DTYPE_VNODE) + return (-1); + + vp = fp->f_vnode; + if (vp->v_type != VCHR) + return (-1); + if (vp->v_bufobj.bo_bsize == 0) + return (-1); + if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) + return (-1); + + ref = 0; + csw = devvn_refthread(vp, &dev, &ref); + if (csw == NULL) + return (ENXIO); + + if ((csw->d_flags & D_DISK) == 0) { + error = -1; + goto unref; + } + if (cb->aio_nbytes > dev->si_iosize_max) { + error = -1; + goto unref; + } + + ki = p->p_aioinfo; + poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; + if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { + if (cb->aio_nbytes > MAXPHYS) { + error = -1; + goto unref; + } + + pbuf = NULL; + } else { + if (cb->aio_nbytes > MAXPHYS - poff) { + error = -1; + goto unref; + } + if (ki->kaio_buffer_count >= max_buf_aio) { + error = EAGAIN; + goto unref; + } + + job->pbuf = pbuf = (struct buf *)getpbuf(NULL); + BUF_KERNPROC(pbuf); + AIO_LOCK(ki); + ki->kaio_buffer_count++; + AIO_UNLOCK(ki); + } + job->bp = bp = g_alloc_bio(); + + bp->bio_length = cb->aio_nbytes; + bp->bio_bcount = cb->aio_nbytes; + bp->bio_done = aio_biowakeup; + bp->bio_data = (void *)(uintptr_t)cb->aio_buf; + bp->bio_offset = cb->aio_offset; + bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? 
BIO_WRITE : BIO_READ; + bp->bio_dev = dev; + bp->bio_caller1 = (void *)job; + + prot = VM_PROT_READ; + if (cb->aio_lio_opcode == LIO_READ) + prot |= VM_PROT_WRITE; /* Less backwards than it looks */ + job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, + (vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages, + nitems(job->pages)); + if (job->npages < 0) { + error = EFAULT; + goto doerror; + } + if (pbuf != NULL) { + pmap_qenter((vm_offset_t)pbuf->b_data, + job->pages, job->npages); + bp->bio_data = pbuf->b_data + poff; + atomic_add_int(&num_buf_aio, 1); + } else { + bp->bio_ma = job->pages; + bp->bio_ma_n = job->npages; + bp->bio_ma_offset = poff; + bp->bio_data = unmapped_buf; + bp->bio_flags |= BIO_UNMAPPED; + atomic_add_int(&num_unmapped_aio, 1); + } + + /* Perform transfer. */ + csw->d_strategy(bp); + dev_relthread(dev, ref); + return (0); + +doerror: + if (pbuf != NULL) { + AIO_LOCK(ki); + ki->kaio_buffer_count--; + AIO_UNLOCK(ki); + relpbuf(pbuf, NULL); + job->pbuf = NULL; + } + g_destroy_bio(bp); + job->bp = NULL; +unref: + dev_relthread(dev, ref); + return (error); +} + +#ifdef COMPAT_FREEBSD6 +static int +convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) +{ + + /* + * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are + * supported by AIO with the old sigevent structure. + */ + nsig->sigev_notify = osig->sigev_notify; + switch (nsig->sigev_notify) { + case SIGEV_NONE: + break; + case SIGEV_SIGNAL: + nsig->sigev_signo = osig->__sigev_u.__sigev_signo; + break; + case SIGEV_KEVENT: + nsig->sigev_notify_kqueue = + osig->__sigev_u.__sigev_notify_kqueue; + nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; + break; + default: + return (EINVAL); + } + return (0); +} + +static int +aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) +{ + struct oaiocb *ojob; + int error; + + bzero(kjob, sizeof(struct aiocb)); + error = copyin(ujob, kjob, sizeof(struct oaiocb)); + if (error) + return (error); + ojob = (struct oaiocb *)kjob; + return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent)); +} +#endif + +static int +aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob) +{ + + return (copyin(ujob, kjob, sizeof(struct aiocb))); +} + +static long +aiocb_fetch_status(struct aiocb *ujob) +{ + + return (fuword(&ujob->_aiocb_private.status)); +} + +static long +aiocb_fetch_error(struct aiocb *ujob) +{ + + return (fuword(&ujob->_aiocb_private.error)); +} + +static int +aiocb_store_status(struct aiocb *ujob, long status) +{ + + return (suword(&ujob->_aiocb_private.status, status)); +} + +static int +aiocb_store_error(struct aiocb *ujob, long error) +{ + + return (suword(&ujob->_aiocb_private.error, error)); +} + +static int +aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) +{ + + return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); +} + +static int +aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) +{ + + return (suword(ujobp, (long)ujob)); +} + +static struct aiocb_ops aiocb_ops = { + .copyin = aiocb_copyin, + .fetch_status = aiocb_fetch_status, + .fetch_error = aiocb_fetch_error, + .store_status = aiocb_store_status, + .store_error = aiocb_store_error, + .store_kernelinfo = aiocb_store_kernelinfo, + .store_aiocb = aiocb_store_aiocb, +}; + +#ifdef COMPAT_FREEBSD6 +static struct aiocb_ops aiocb_ops_osigevent = { + .copyin = aiocb_copyin_old_sigevent, + .fetch_status = aiocb_fetch_status, + .fetch_error = aiocb_fetch_error, + .store_status = aiocb_store_status, + .store_error = aiocb_store_error, + 
.store_kernelinfo = aiocb_store_kernelinfo, + .store_aiocb = aiocb_store_aiocb, +}; +#endif + +/* + * Queue a new AIO request. Choosing either the threaded or direct bio VCHR + * technique is done in this code. + */ +int +aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, + int type, struct aiocb_ops *ops) +{ + struct proc *p = td->td_proc; + struct file *fp; + struct kaiocb *job; + struct kaioinfo *ki; + struct kevent kev; + int opcode; + int error; + int fd, kqfd; + int jid; + u_short evflags; + + if (p->p_aioinfo == NULL) + aio_init_aioinfo(p); + + ki = p->p_aioinfo; + + ops->store_status(ujob, -1); + ops->store_error(ujob, 0); + ops->store_kernelinfo(ujob, -1); + + if (num_queue_count >= max_queue_count || + ki->kaio_count >= max_aio_queue_per_proc) { + ops->store_error(ujob, EAGAIN); + return (EAGAIN); + } + + job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); + knlist_init_mtx(&job->klist, AIO_MTX(ki)); + + error = ops->copyin(ujob, &job->uaiocb); + if (error) { + ops->store_error(ujob, error); + uma_zfree(aiocb_zone, job); + return (error); + } + + if (job->uaiocb.aio_nbytes > IOSIZE_MAX) { + uma_zfree(aiocb_zone, job); + return (EINVAL); + } + + if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && + job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && + job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && + job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { + ops->store_error(ujob, EINVAL); + uma_zfree(aiocb_zone, job); + return (EINVAL); + } + + if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || + job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && + !_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) { + uma_zfree(aiocb_zone, job); + return (EINVAL); + } + + ksiginfo_init(&job->ksi); + + /* Save userspace address of the job info. */ + job->ujob = ujob; + + /* Get the opcode. */ + if (type != LIO_NOP) + job->uaiocb.aio_lio_opcode = type; + opcode = job->uaiocb.aio_lio_opcode; + + /* + * Validate the opcode and fetch the file object for the specified + * file descriptor. + * + * XXXRW: Moved the opcode validation up here so that we don't + * retrieve a file descriptor without knowing what the capabiltity + * should be. 
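+ * Each opcode therefore asks only for the capability rights it needs:
+ * write rights for LIO_WRITE, read rights for LIO_READ and fsync rights
+ * for LIO_SYNC, while LIO_MLOCK does not take a file descriptor at all.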
+ */ + fd = job->uaiocb.aio_fildes; + switch (opcode) { + case LIO_WRITE: + error = fget_write(td, fd, &cap_pwrite_rights, &fp); + break; + case LIO_READ: + error = fget_read(td, fd, &cap_pread_rights, &fp); + break; + case LIO_SYNC: + error = fget(td, fd, &cap_fsync_rights, &fp); + break; + case LIO_MLOCK: + fp = NULL; + break; + case LIO_NOP: + error = fget(td, fd, &cap_no_rights, &fp); + break; + default: + error = EINVAL; + } + if (error) { + uma_zfree(aiocb_zone, job); + ops->store_error(ujob, error); + return (error); + } + + if (opcode == LIO_SYNC && fp->f_vnode == NULL) { + error = EINVAL; + goto aqueue_fail; + } + + if ((opcode == LIO_READ || opcode == LIO_WRITE) && + job->uaiocb.aio_offset < 0 && + (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) { + error = EINVAL; + goto aqueue_fail; + } + + job->fd_file = fp; + + mtx_lock(&aio_job_mtx); + jid = jobrefid++; + job->seqno = jobseqno++; + mtx_unlock(&aio_job_mtx); + error = ops->store_kernelinfo(ujob, jid); + if (error) { + error = EINVAL; + goto aqueue_fail; + } + job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; + + if (opcode == LIO_NOP) { + fdrop(fp, td); + uma_zfree(aiocb_zone, job); + return (0); + } + + if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) + goto no_kqueue; + evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags; + if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) { + error = EINVAL; + goto aqueue_fail; + } + kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue; + memset(&kev, 0, sizeof(kev)); + kev.ident = (uintptr_t)job->ujob; + kev.filter = EVFILT_AIO; + kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags; + kev.data = (intptr_t)job; + kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr; + error = kqfd_register(kqfd, &kev, td, M_WAITOK); + if (error) + goto aqueue_fail; + +no_kqueue: + + ops->store_error(ujob, EINPROGRESS); + job->uaiocb._aiocb_private.error = EINPROGRESS; + job->userproc = p; + job->cred = crhold(td->td_ucred); + job->jobflags = KAIOCB_QUEUEING; + job->lio = lj; + + if (opcode == LIO_MLOCK) { + aio_schedule(job, aio_process_mlock); + error = 0; + } else if (fp->f_ops->fo_aio_queue == NULL) + error = aio_queue_file(fp, job); + else + error = fo_aio_queue(fp, job); + if (error) + goto aqueue_fail; + + AIO_LOCK(ki); + job->jobflags &= ~KAIOCB_QUEUEING; + TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist); + ki->kaio_count++; + if (lj) + lj->lioj_count++; + atomic_add_int(&num_queue_count, 1); + if (job->jobflags & KAIOCB_FINISHED) { + /* + * The queue callback completed the request synchronously. + * The bulk of the completion is deferred in that case + * until this point. 
+ */ + aio_bio_done_notify(p, job); + } else + TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist); + AIO_UNLOCK(ki); + return (0); + +aqueue_fail: + knlist_delete(&job->klist, curthread, 0); + if (fp) + fdrop(fp, td); + uma_zfree(aiocb_zone, job); + ops->store_error(ujob, error); + return (error); +} + +static void +aio_cancel_daemon_job(struct kaiocb *job) +{ + + mtx_lock(&aio_job_mtx); + if (!aio_cancel_cleared(job)) + TAILQ_REMOVE(&aio_jobs, job, list); + mtx_unlock(&aio_job_mtx); + aio_cancel(job); +} + +void +aio_schedule(struct kaiocb *job, aio_handle_fn_t *func) +{ + + mtx_lock(&aio_job_mtx); + if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) { + mtx_unlock(&aio_job_mtx); + aio_cancel(job); + return; + } + job->handle_fn = func; + TAILQ_INSERT_TAIL(&aio_jobs, job, list); + aio_kick_nowait(job->userproc); + mtx_unlock(&aio_job_mtx); +} + +static void +aio_cancel_sync(struct kaiocb *job) +{ + struct kaioinfo *ki; + + ki = job->userproc->p_aioinfo; + AIO_LOCK(ki); + if (!aio_cancel_cleared(job)) + TAILQ_REMOVE(&ki->kaio_syncqueue, job, list); + AIO_UNLOCK(ki); + aio_cancel(job); +} + +int +aio_queue_file(struct file *fp, struct kaiocb *job) +{ + struct kaioinfo *ki; + struct kaiocb *job2; + struct vnode *vp; + struct mount *mp; + int error; + bool safe; + + ki = job->userproc->p_aioinfo; + error = aio_qbio(job->userproc, job); + if (error >= 0) + return (error); + safe = false; + if (fp->f_type == DTYPE_VNODE) { + vp = fp->f_vnode; + if (vp->v_type == VREG || vp->v_type == VDIR) { + mp = fp->f_vnode->v_mount; + if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0) + safe = true; + } + } + if (!(safe || enable_aio_unsafe)) { + counted_warning(&unsafe_warningcnt, + "is attempting to use unsafe AIO requests"); + return (EOPNOTSUPP); + } + + switch (job->uaiocb.aio_lio_opcode) { + case LIO_READ: + case LIO_WRITE: + aio_schedule(job, aio_process_rw); + error = 0; + break; + case LIO_SYNC: + AIO_LOCK(ki); + TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) { + if (job2->fd_file == job->fd_file && + job2->uaiocb.aio_lio_opcode != LIO_SYNC && + job2->seqno < job->seqno) { + job2->jobflags |= KAIOCB_CHECKSYNC; + job->pending++; + } + } + if (job->pending != 0) { + if (!aio_set_cancel_function_locked(job, + aio_cancel_sync)) { + AIO_UNLOCK(ki); + aio_cancel(job); + return (0); + } + TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list); + AIO_UNLOCK(ki); + return (0); + } + AIO_UNLOCK(ki); + aio_schedule(job, aio_process_sync); + error = 0; + break; + default: + error = EINVAL; + } + return (error); +} + +static void +aio_kick_nowait(struct proc *userp) +{ + struct kaioinfo *ki = userp->p_aioinfo; + struct aioproc *aiop; + + mtx_assert(&aio_job_mtx, MA_OWNED); + if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + aiop->aioprocflags &= ~AIOP_FREE; + wakeup(aiop->aioproc); + } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && + ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { + taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task); + } +} + +static int +aio_kick(struct proc *userp) +{ + struct kaioinfo *ki = userp->p_aioinfo; + struct aioproc *aiop; + int error, ret = 0; + + mtx_assert(&aio_job_mtx, MA_OWNED); +retryproc: + if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + aiop->aioprocflags &= ~AIOP_FREE; + wakeup(aiop->aioproc); + } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && + ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { + 
num_aio_resv_start++; + mtx_unlock(&aio_job_mtx); + error = aio_newproc(&num_aio_resv_start); + mtx_lock(&aio_job_mtx); + if (error) { + num_aio_resv_start--; + goto retryproc; + } + } else { + ret = -1; + } + return (ret); +} + +static void +aio_kick_helper(void *context, int pending) +{ + struct proc *userp = context; + + mtx_lock(&aio_job_mtx); + while (--pending >= 0) { + if (aio_kick(userp)) + break; + } + mtx_unlock(&aio_job_mtx); +} + +/* + * Support the aio_return system call, as a side-effect, kernel resources are + * released. + */ +static int +kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) +{ + struct proc *p = td->td_proc; + struct kaiocb *job; + struct kaioinfo *ki; + long status, error; + + ki = p->p_aioinfo; + if (ki == NULL) + return (EINVAL); + AIO_LOCK(ki); + TAILQ_FOREACH(job, &ki->kaio_done, plist) { + if (job->ujob == ujob) + break; + } + if (job != NULL) { + MPASS(job->jobflags & KAIOCB_FINISHED); + status = job->uaiocb._aiocb_private.status; + error = job->uaiocb._aiocb_private.error; + td->td_retval[0] = status; + td->td_ru.ru_oublock += job->outblock; + td->td_ru.ru_inblock += job->inblock; + td->td_ru.ru_msgsnd += job->msgsnd; + td->td_ru.ru_msgrcv += job->msgrcv; + aio_free_entry(job); + AIO_UNLOCK(ki); + ops->store_error(ujob, error); + ops->store_status(ujob, status); + } else { + error = EINVAL; + AIO_UNLOCK(ki); + } + return (error); +} + +int +sys_aio_return(struct thread *td, struct aio_return_args *uap) +{ + + return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); +} + +/* + * Allow a process to wakeup when any of the I/O requests are completed. + */ +static int +kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, + struct timespec *ts) +{ + struct proc *p = td->td_proc; + struct timeval atv; + struct kaioinfo *ki; + struct kaiocb *firstjob, *job; + int error, i, timo; + + timo = 0; + if (ts) { + if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) + return (EINVAL); + + TIMESPEC_TO_TIMEVAL(&atv, ts); + if (itimerfix(&atv)) + return (EINVAL); + timo = tvtohz(&atv); + } + + ki = p->p_aioinfo; + if (ki == NULL) + return (EAGAIN); + + if (njoblist == 0) + return (0); + + AIO_LOCK(ki); + for (;;) { + firstjob = NULL; + error = 0; + TAILQ_FOREACH(job, &ki->kaio_all, allist) { + for (i = 0; i < njoblist; i++) { + if (job->ujob == ujoblist[i]) { + if (firstjob == NULL) + firstjob = job; + if (job->jobflags & KAIOCB_FINISHED) + goto RETURN; + } + } + } + /* All tasks were finished. */ + if (firstjob == NULL) + break; + + ki->kaio_flags |= KAIO_WAKEUP; + error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, + "aiospn", timo); + if (error == ERESTART) + error = EINTR; + if (error) + break; + } +RETURN: + AIO_UNLOCK(ki); + return (error); +} + +int +sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap) +{ + struct timespec ts, *tsp; + struct aiocb **ujoblist; + int error; + + if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) + return (EINVAL); + + if (uap->timeout) { + /* Get timespec struct. */ + if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) + return (error); + tsp = &ts; + } else + tsp = NULL; + + ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); + error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); + if (error == 0) + error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); + free(ujoblist, M_AIOS); + return (error); +} + +/* + * aio_cancel cancels any non-bio aio operations not currently in progress. 
+ */ +int +sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap) +{ + struct proc *p = td->td_proc; + struct kaioinfo *ki; + struct kaiocb *job, *jobn; + struct file *fp; + int error; + int cancelled = 0; + int notcancelled = 0; + struct vnode *vp; + + /* Lookup file object. */ + error = fget(td, uap->fd, &cap_no_rights, &fp); + if (error) + return (error); + + ki = p->p_aioinfo; + if (ki == NULL) + goto done; + + if (fp->f_type == DTYPE_VNODE) { + vp = fp->f_vnode; + if (vn_isdisk(vp, &error)) { + fdrop(fp, td); + td->td_retval[0] = AIO_NOTCANCELED; + return (0); + } + } + + AIO_LOCK(ki); + TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { + if ((uap->fd == job->uaiocb.aio_fildes) && + ((uap->aiocbp == NULL) || + (uap->aiocbp == job->ujob))) { + if (aio_cancel_job(p, ki, job)) { + cancelled++; + } else { + notcancelled++; + } + if (uap->aiocbp != NULL) + break; + } + } + AIO_UNLOCK(ki); + +done: + fdrop(fp, td); + + if (uap->aiocbp != NULL) { + if (cancelled) { + td->td_retval[0] = AIO_CANCELED; + return (0); + } + } + + if (notcancelled) { + td->td_retval[0] = AIO_NOTCANCELED; + return (0); + } + + if (cancelled) { + td->td_retval[0] = AIO_CANCELED; + return (0); + } + + td->td_retval[0] = AIO_ALLDONE; + + return (0); +} + +/* + * aio_error is implemented in the kernel level for compatibility purposes + * only. For a user mode async implementation, it would be best to do it in + * a userland subroutine. + */ +static int +kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) +{ + struct proc *p = td->td_proc; + struct kaiocb *job; + struct kaioinfo *ki; + int status; + + ki = p->p_aioinfo; + if (ki == NULL) { + td->td_retval[0] = EINVAL; + return (0); + } + + AIO_LOCK(ki); + TAILQ_FOREACH(job, &ki->kaio_all, allist) { + if (job->ujob == ujob) { + if (job->jobflags & KAIOCB_FINISHED) + td->td_retval[0] = + job->uaiocb._aiocb_private.error; + else + td->td_retval[0] = EINPROGRESS; + AIO_UNLOCK(ki); + return (0); + } + } + AIO_UNLOCK(ki); + + /* + * Hack for failure of aio_aqueue. 
+ */ + status = ops->fetch_status(ujob); + if (status == -1) { + td->td_retval[0] = ops->fetch_error(ujob); + return (0); + } + + td->td_retval[0] = EINVAL; + return (0); +} + +int +sys_aio_error(struct thread *td, struct aio_error_args *uap) +{ + + return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); +} + +/* syscall - asynchronous read from a file (REALTIME) */ +#ifdef COMPAT_FREEBSD6 +int +freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, + &aiocb_ops_osigevent)); +} +#endif + +int +sys_aio_read(struct thread *td, struct aio_read_args *uap) +{ + + return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); +} + +/* syscall - asynchronous write to a file (REALTIME) */ +#ifdef COMPAT_FREEBSD6 +int +freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, + &aiocb_ops_osigevent)); +} +#endif + +int +sys_aio_write(struct thread *td, struct aio_write_args *uap) +{ + + return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); +} + +int +sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) +{ + + return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); +} + +static int +kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, + struct aiocb **acb_list, int nent, struct sigevent *sig, + struct aiocb_ops *ops) +{ + struct proc *p = td->td_proc; + struct aiocb *job; + struct kaioinfo *ki; + struct aioliojob *lj; + struct kevent kev; + int error; + int nagain, nerror; + int i; + + if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) + return (EINVAL); + + if (nent < 0 || nent > max_aio_queue_per_proc) + return (EINVAL); + + if (p->p_aioinfo == NULL) + aio_init_aioinfo(p); + + ki = p->p_aioinfo; + + lj = uma_zalloc(aiolio_zone, M_WAITOK); + lj->lioj_flags = 0; + lj->lioj_count = 0; + lj->lioj_finished_count = 0; + knlist_init_mtx(&lj->klist, AIO_MTX(ki)); + ksiginfo_init(&lj->lioj_ksi); + + /* + * Setup signal. + */ + if (sig && (mode == LIO_NOWAIT)) { + bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); + if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { + /* Assume only new style KEVENT */ + memset(&kev, 0, sizeof(kev)); + kev.filter = EVFILT_LIO; + kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; + kev.ident = (uintptr_t)uacb_list; /* something unique */ + kev.data = (intptr_t)lj; + /* pass user defined sigval data */ + kev.udata = lj->lioj_signal.sigev_value.sival_ptr; + error = kqfd_register( + lj->lioj_signal.sigev_notify_kqueue, &kev, td, + M_WAITOK); + if (error) { + uma_zfree(aiolio_zone, lj); + return (error); + } + } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { + ; + } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || + lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { + if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { + uma_zfree(aiolio_zone, lj); + return EINVAL; + } + lj->lioj_flags |= LIOJ_SIGNAL; + } else { + uma_zfree(aiolio_zone, lj); + return EINVAL; + } + } + + AIO_LOCK(ki); + TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); + /* + * Add extra aiocb count to avoid the lio to be freed + * by other threads doing aio_waitcomplete or aio_return, + * and prevent event from being sent until we have queued + * all tasks. + */ + lj->lioj_count = 1; + AIO_UNLOCK(ki); + + /* + * Get pointers to the list of I/O requests. 
+ */ + nagain = 0; + nerror = 0; + for (i = 0; i < nent; i++) { + job = acb_list[i]; + if (job != NULL) { + error = aio_aqueue(td, job, lj, LIO_NOP, ops); + if (error == EAGAIN) + nagain++; + else if (error != 0) + nerror++; + } + } + + error = 0; + AIO_LOCK(ki); + if (mode == LIO_WAIT) { + while (lj->lioj_count - 1 != lj->lioj_finished_count) { + ki->kaio_flags |= KAIO_WAKEUP; + error = msleep(&p->p_aioinfo, AIO_MTX(ki), + PRIBIO | PCATCH, "aiospn", 0); + if (error == ERESTART) + error = EINTR; + if (error) + break; + } + } else { + if (lj->lioj_count - 1 == lj->lioj_finished_count) { + if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { + lj->lioj_flags |= LIOJ_KEVENT_POSTED; + KNOTE_LOCKED(&lj->klist, 1); + } + if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) + == LIOJ_SIGNAL + && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || + lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { + aio_sendsig(p, &lj->lioj_signal, + &lj->lioj_ksi); + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + } + } + } + lj->lioj_count--; + if (lj->lioj_count == 0) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + knlist_delete(&lj->klist, curthread, 1); + PROC_LOCK(p); + sigqueue_take(&lj->lioj_ksi); + PROC_UNLOCK(p); + AIO_UNLOCK(ki); + uma_zfree(aiolio_zone, lj); + } else + AIO_UNLOCK(ki); + + if (nerror) + return (EIO); + else if (nagain) + return (EAGAIN); + else + return (error); +} + +/* syscall - list directed I/O (REALTIME) */ +#ifdef COMPAT_FREEBSD6 +int +freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap) +{ + struct aiocb **acb_list; + struct sigevent *sigp, sig; + struct osigevent osig; + int error, nent; + + if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) + return (EINVAL); + + nent = uap->nent; + if (nent < 0 || nent > max_aio_queue_per_proc) + return (EINVAL); + + if (uap->sig && (uap->mode == LIO_NOWAIT)) { + error = copyin(uap->sig, &osig, sizeof(osig)); + if (error) + return (error); + error = convert_old_sigevent(&osig, &sig); + if (error) + return (error); + sigp = &sig; + } else + sigp = NULL; + + acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); + error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); + if (error == 0) + error = kern_lio_listio(td, uap->mode, + (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, + &aiocb_ops_osigevent); + free(acb_list, M_LIO); + return (error); +} +#endif + +/* syscall - list directed I/O (REALTIME) */ +int +sys_lio_listio(struct thread *td, struct lio_listio_args *uap) +{ + struct aiocb **acb_list; + struct sigevent *sigp, sig; + int error, nent; + + if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) + return (EINVAL); + + nent = uap->nent; + if (nent < 0 || nent > max_aio_queue_per_proc) + return (EINVAL); + + if (uap->sig && (uap->mode == LIO_NOWAIT)) { + error = copyin(uap->sig, &sig, sizeof(sig)); + if (error) + return (error); + sigp = &sig; + } else + sigp = NULL; + + acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); + error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); + if (error == 0) + error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, + nent, sigp, &aiocb_ops); + free(acb_list, M_LIO); + return (error); +} + +static void +aio_biowakeup(struct bio *bp) +{ + struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; + struct proc *userp; + struct kaioinfo *ki; + size_t nbytes; + int error, nblks; + + /* Release mapping into kernel space. 
*/ + userp = job->userproc; + ki = userp->p_aioinfo; + if (job->pbuf) { + pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages); + relpbuf(job->pbuf, NULL); + job->pbuf = NULL; + atomic_subtract_int(&num_buf_aio, 1); + AIO_LOCK(ki); + ki->kaio_buffer_count--; + AIO_UNLOCK(ki); + } else + atomic_subtract_int(&num_unmapped_aio, 1); + vm_page_unhold_pages(job->pages, job->npages); + + bp = job->bp; + job->bp = NULL; + nbytes = job->uaiocb.aio_nbytes - bp->bio_resid; + error = 0; + if (bp->bio_flags & BIO_ERROR) + error = bp->bio_error; + nblks = btodb(nbytes); + if (job->uaiocb.aio_lio_opcode == LIO_WRITE) + job->outblock += nblks; + else + job->inblock += nblks; + + if (error) + aio_complete(job, -1, error); + else + aio_complete(job, nbytes, 0); + + g_destroy_bio(bp); +} + +/* syscall - wait for the next completion of an aio request */ +static int +kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, + struct timespec *ts, struct aiocb_ops *ops) +{ + struct proc *p = td->td_proc; + struct timeval atv; + struct kaioinfo *ki; + struct kaiocb *job; + struct aiocb *ujob; + long error, status; + int timo; + + ops->store_aiocb(ujobp, NULL); + + if (ts == NULL) { + timo = 0; + } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) { + timo = -1; + } else { + if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) + return (EINVAL); + + TIMESPEC_TO_TIMEVAL(&atv, ts); + if (itimerfix(&atv)) + return (EINVAL); + timo = tvtohz(&atv); + } + + if (p->p_aioinfo == NULL) + aio_init_aioinfo(p); + ki = p->p_aioinfo; + + error = 0; + job = NULL; + AIO_LOCK(ki); + while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) { + if (timo == -1) { + error = EWOULDBLOCK; + break; + } + ki->kaio_flags |= KAIO_WAKEUP; + error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, + "aiowc", timo); + if (timo && error == ERESTART) + error = EINTR; + if (error) + break; + } + + if (job != NULL) { + MPASS(job->jobflags & KAIOCB_FINISHED); + ujob = job->ujob; + status = job->uaiocb._aiocb_private.status; + error = job->uaiocb._aiocb_private.error; + td->td_retval[0] = status; + td->td_ru.ru_oublock += job->outblock; + td->td_ru.ru_inblock += job->inblock; + td->td_ru.ru_msgsnd += job->msgsnd; + td->td_ru.ru_msgrcv += job->msgrcv; + aio_free_entry(job); + AIO_UNLOCK(ki); + ops->store_aiocb(ujobp, ujob); + ops->store_error(ujob, error); + ops->store_status(ujob, status); + } else + AIO_UNLOCK(ki); + + return (error); +} + +int +sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) +{ + struct timespec ts, *tsp; + int error; + + if (uap->timeout) { + /* Get timespec struct. */ + error = copyin(uap->timeout, &ts, sizeof(ts)); + if (error) + return (error); + tsp = &ts; + } else + tsp = NULL; + + return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); +} + +static int +kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob, + struct aiocb_ops *ops) +{ + + if (op != O_SYNC) /* XXX lack of O_DSYNC */ + return (EINVAL); + return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops)); +} + +int +sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) +{ + + return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); +} + +/* kqueue attach function */ +static int +filt_aioattach(struct knote *kn) +{ + struct kaiocb *job; + + job = (struct kaiocb *)(uintptr_t)kn->kn_sdata; + + /* + * The job pointer must be validated before using it, so + * registration is restricted to the kernel; the user cannot + * set EV_FLAG1. 
+ */ + if ((kn->kn_flags & EV_FLAG1) == 0) + return (EPERM); + kn->kn_ptr.p_aio = job; + kn->kn_flags &= ~EV_FLAG1; + + knlist_add(&job->klist, kn, 0); + + return (0); +} + +/* kqueue detach function */ +static void +filt_aiodetach(struct knote *kn) +{ + struct knlist *knl; + + knl = &kn->kn_ptr.p_aio->klist; + knl->kl_lock(knl->kl_lockarg); + if (!knlist_empty(knl)) + knlist_remove(knl, kn, 1); + knl->kl_unlock(knl->kl_lockarg); +} + +/* kqueue filter function */ +/*ARGSUSED*/ +static int +filt_aio(struct knote *kn, long hint) +{ + struct kaiocb *job = kn->kn_ptr.p_aio; + + kn->kn_data = job->uaiocb._aiocb_private.error; + if (!(job->jobflags & KAIOCB_FINISHED)) + return (0); + kn->kn_flags |= EV_EOF; + return (1); +} + +/* kqueue attach function */ +static int +filt_lioattach(struct knote *kn) +{ + struct aioliojob *lj; + + lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata; + + /* + * The aioliojob pointer must be validated before using it, so + * registration is restricted to the kernel; the user cannot + * set EV_FLAG1. + */ + if ((kn->kn_flags & EV_FLAG1) == 0) + return (EPERM); + kn->kn_ptr.p_lio = lj; + kn->kn_flags &= ~EV_FLAG1; + + knlist_add(&lj->klist, kn, 0); + + return (0); +} + +/* kqueue detach function */ +static void +filt_liodetach(struct knote *kn) +{ + struct knlist *knl; + + knl = &kn->kn_ptr.p_lio->klist; + knl->kl_lock(knl->kl_lockarg); + if (!knlist_empty(knl)) + knlist_remove(knl, kn, 1); + knl->kl_unlock(knl->kl_lockarg); +} + +/* kqueue filter function */ +/*ARGSUSED*/ +static int +filt_lio(struct knote *kn, long hint) +{ + struct aioliojob * lj = kn->kn_ptr.p_lio; + + return (lj->lioj_flags & LIOJ_KEVENT_POSTED); +} + +#ifdef COMPAT_FREEBSD32 +#include +#include +#include +#include +#include +#include +#include + +struct __aiocb_private32 { + int32_t status; + int32_t error; + uint32_t kernelinfo; +}; + +#ifdef COMPAT_FREEBSD6 +typedef struct oaiocb32 { + int aio_fildes; /* File descriptor */ + uint64_t aio_offset __packed; /* File offset for I/O */ + uint32_t aio_buf; /* I/O buffer in process space */ + uint32_t aio_nbytes; /* Number of bytes for I/O */ + struct osigevent32 aio_sigevent; /* Signal to deliver */ + int aio_lio_opcode; /* LIO opcode */ + int aio_reqprio; /* Request priority -- ignored */ + struct __aiocb_private32 _aiocb_private; +} oaiocb32_t; +#endif + +typedef struct aiocb32 { + int32_t aio_fildes; /* File descriptor */ + uint64_t aio_offset __packed; /* File offset for I/O */ + uint32_t aio_buf; /* I/O buffer in process space */ + uint32_t aio_nbytes; /* Number of bytes for I/O */ + int __spare__[2]; + uint32_t __spare2__; + int aio_lio_opcode; /* LIO opcode */ + int aio_reqprio; /* Request priority -- ignored */ + struct __aiocb_private32 _aiocb_private; + struct sigevent32 aio_sigevent; /* Signal to deliver */ +} aiocb32_t; + +#ifdef COMPAT_FREEBSD6 +static int +convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) +{ + + /* + * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are + * supported by AIO with the old sigevent structure. 
+ */ + CP(*osig, *nsig, sigev_notify); + switch (nsig->sigev_notify) { + case SIGEV_NONE: + break; + case SIGEV_SIGNAL: + nsig->sigev_signo = osig->__sigev_u.__sigev_signo; + break; + case SIGEV_KEVENT: + nsig->sigev_notify_kqueue = + osig->__sigev_u.__sigev_notify_kqueue; + PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); + break; + default: + return (EINVAL); + } + return (0); +} + +static int +aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) +{ + struct oaiocb32 job32; + int error; + + bzero(kjob, sizeof(struct aiocb)); + error = copyin(ujob, &job32, sizeof(job32)); + if (error) + return (error); + + CP(job32, *kjob, aio_fildes); + CP(job32, *kjob, aio_offset); + PTRIN_CP(job32, *kjob, aio_buf); + CP(job32, *kjob, aio_nbytes); + CP(job32, *kjob, aio_lio_opcode); + CP(job32, *kjob, aio_reqprio); + CP(job32, *kjob, _aiocb_private.status); + CP(job32, *kjob, _aiocb_private.error); + PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); + return (convert_old_sigevent32(&job32.aio_sigevent, + &kjob->aio_sigevent)); +} +#endif + +static int +aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob) +{ + struct aiocb32 job32; + int error; + + error = copyin(ujob, &job32, sizeof(job32)); + if (error) + return (error); + CP(job32, *kjob, aio_fildes); + CP(job32, *kjob, aio_offset); + PTRIN_CP(job32, *kjob, aio_buf); + CP(job32, *kjob, aio_nbytes); + CP(job32, *kjob, aio_lio_opcode); + CP(job32, *kjob, aio_reqprio); + CP(job32, *kjob, _aiocb_private.status); + CP(job32, *kjob, _aiocb_private.error); + PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); + return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); +} + +static long +aiocb32_fetch_status(struct aiocb *ujob) +{ + struct aiocb32 *ujob32; + + ujob32 = (struct aiocb32 *)ujob; + return (fuword32(&ujob32->_aiocb_private.status)); +} + +static long +aiocb32_fetch_error(struct aiocb *ujob) +{ + struct aiocb32 *ujob32; + + ujob32 = (struct aiocb32 *)ujob; + return (fuword32(&ujob32->_aiocb_private.error)); +} + +static int +aiocb32_store_status(struct aiocb *ujob, long status) +{ + struct aiocb32 *ujob32; + + ujob32 = (struct aiocb32 *)ujob; + return (suword32(&ujob32->_aiocb_private.status, status)); +} + +static int +aiocb32_store_error(struct aiocb *ujob, long error) +{ + struct aiocb32 *ujob32; + + ujob32 = (struct aiocb32 *)ujob; + return (suword32(&ujob32->_aiocb_private.error, error)); +} + +static int +aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) +{ + struct aiocb32 *ujob32; + + ujob32 = (struct aiocb32 *)ujob; + return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); +} + +static int +aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) +{ + + return (suword32(ujobp, (long)ujob)); +} + +static struct aiocb_ops aiocb32_ops = { + .copyin = aiocb32_copyin, + .fetch_status = aiocb32_fetch_status, + .fetch_error = aiocb32_fetch_error, + .store_status = aiocb32_store_status, + .store_error = aiocb32_store_error, + .store_kernelinfo = aiocb32_store_kernelinfo, + .store_aiocb = aiocb32_store_aiocb, +}; + +#ifdef COMPAT_FREEBSD6 +static struct aiocb_ops aiocb32_ops_osigevent = { + .copyin = aiocb32_copyin_old_sigevent, + .fetch_status = aiocb32_fetch_status, + .fetch_error = aiocb32_fetch_error, + .store_status = aiocb32_store_status, + .store_error = aiocb32_store_error, + .store_kernelinfo = aiocb32_store_kernelinfo, + .store_aiocb = aiocb32_store_aiocb, +}; +#endif + +int +freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) +{ + + return (kern_aio_return(td, 
(struct aiocb *)uap->aiocbp, &aiocb32_ops)); +} + +int +freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) +{ + struct timespec32 ts32; + struct timespec ts, *tsp; + struct aiocb **ujoblist; + uint32_t *ujoblist32; + int error, i; + + if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) + return (EINVAL); + + if (uap->timeout) { + /* Get timespec struct. */ + if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) + return (error); + CP(ts32, ts, tv_sec); + CP(ts32, ts, tv_nsec); + tsp = &ts; + } else + tsp = NULL; + + ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); + ujoblist32 = (uint32_t *)ujoblist; + error = copyin(uap->aiocbp, ujoblist32, uap->nent * + sizeof(ujoblist32[0])); + if (error == 0) { + for (i = uap->nent - 1; i >= 0; i--) + ujoblist[i] = PTRIN(ujoblist32[i]); + + error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); + } + free(ujoblist, M_AIOS); + return (error); +} + +int +freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) +{ + + return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); +} + +#ifdef COMPAT_FREEBSD6 +int +freebsd6_freebsd32_aio_read(struct thread *td, + struct freebsd6_freebsd32_aio_read_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, + &aiocb32_ops_osigevent)); +} +#endif + +int +freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, + &aiocb32_ops)); +} + +#ifdef COMPAT_FREEBSD6 +int +freebsd6_freebsd32_aio_write(struct thread *td, + struct freebsd6_freebsd32_aio_write_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, + &aiocb32_ops_osigevent)); +} +#endif + +int +freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, + &aiocb32_ops)); +} + +int +freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, + &aiocb32_ops)); +} + +int +freebsd32_aio_waitcomplete(struct thread *td, + struct freebsd32_aio_waitcomplete_args *uap) +{ + struct timespec32 ts32; + struct timespec ts, *tsp; + int error; + + if (uap->timeout) { + /* Get timespec struct. 
*/ + error = copyin(uap->timeout, &ts32, sizeof(ts32)); + if (error) + return (error); + CP(ts32, ts, tv_sec); + CP(ts32, ts, tv_nsec); + tsp = &ts; + } else + tsp = NULL; + + return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, + &aiocb32_ops)); +} + +int +freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) +{ + + return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, + &aiocb32_ops)); +} + +#ifdef COMPAT_FREEBSD6 +int +freebsd6_freebsd32_lio_listio(struct thread *td, + struct freebsd6_freebsd32_lio_listio_args *uap) +{ + struct aiocb **acb_list; + struct sigevent *sigp, sig; + struct osigevent32 osig; + uint32_t *acb_list32; + int error, i, nent; + + if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) + return (EINVAL); + + nent = uap->nent; + if (nent < 0 || nent > max_aio_queue_per_proc) + return (EINVAL); + + if (uap->sig && (uap->mode == LIO_NOWAIT)) { + error = copyin(uap->sig, &osig, sizeof(osig)); + if (error) + return (error); + error = convert_old_sigevent32(&osig, &sig); + if (error) + return (error); + sigp = &sig; + } else + sigp = NULL; + + acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); + error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); + if (error) { + free(acb_list32, M_LIO); + return (error); + } + acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); + for (i = 0; i < nent; i++) + acb_list[i] = PTRIN(acb_list32[i]); + free(acb_list32, M_LIO); + + error = kern_lio_listio(td, uap->mode, + (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, + &aiocb32_ops_osigevent); + free(acb_list, M_LIO); + return (error); +} +#endif + +int +freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) +{ + struct aiocb **acb_list; + struct sigevent *sigp, sig; + struct sigevent32 sig32; + uint32_t *acb_list32; + int error, i, nent; + + if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) + return (EINVAL); + + nent = uap->nent; + if (nent < 0 || nent > max_aio_queue_per_proc) + return (EINVAL); + + if (uap->sig && (uap->mode == LIO_NOWAIT)) { + error = copyin(uap->sig, &sig32, sizeof(sig32)); + if (error) + return (error); + error = convert_sigevent32(&sig32, &sig); + if (error) + return (error); + sigp = &sig; + } else + sigp = NULL; + + acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); + error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); + if (error) { + free(acb_list32, M_LIO); + return (error); + } + acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); + for (i = 0; i < nent; i++) + acb_list[i] = PTRIN(acb_list32[i]); + free(acb_list32, M_LIO); + + error = kern_lio_listio(td, uap->mode, + (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, + &aiocb32_ops); + free(acb_list, M_LIO); + return (error); +} + +#endif diff --git a/freebsd/sys/kern/vfs_bio.c b/freebsd/sys/kern/vfs_bio.c new file mode 100644 index 00000000..2277bf67 --- /dev/null +++ b/freebsd/sys/kern/vfs_bio.c @@ -0,0 +1,5474 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2004 Poul-Henning Kamp + * Copyright (c) 1994,1997 John S. Dyson + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * this file contains a new buffer I/O scheme implementing a coherent + * VM object and buffer cache scheme. Pains have been taken to make + * sure that the performance degradation associated with schemes such + * as this is not realized. + * + * Author: John S. Dyson + * Significant help during the development and debugging phases + * had been provided by David Greenman, also of the FreeBSD core team. + * + * see man buf(9) for more info. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "opt_swap.h" + +static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); + +struct bio_ops bioops; /* I/O operation notification */ + +struct buf_ops buf_ops_bio = { + .bop_name = "buf_ops_bio", + .bop_write = bufwrite, + .bop_strategy = bufstrategy, + .bop_sync = bufsync, + .bop_bdflush = bufbdflush, +}; + +struct bufqueue { + struct mtx_padalign bq_lock; + TAILQ_HEAD(, buf) bq_queue; + uint8_t bq_index; + uint16_t bq_subqueue; + int bq_len; +} __aligned(CACHE_LINE_SIZE); + +#define BQ_LOCKPTR(bq) (&(bq)->bq_lock) +#define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) +#define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) +#define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) + +struct bufdomain { + struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */ + struct bufqueue bd_dirtyq; + struct bufqueue *bd_cleanq; + struct mtx_padalign bd_run_lock; + /* Constants */ + long bd_maxbufspace; + long bd_hibufspace; + long bd_lobufspace; + long bd_bufspacethresh; + int bd_hifreebuffers; + int bd_lofreebuffers; + int bd_hidirtybuffers; + int bd_lodirtybuffers; + int bd_dirtybufthresh; + int bd_lim; + /* atomics */ + int bd_wanted; + int __aligned(CACHE_LINE_SIZE) bd_numdirtybuffers; + int __aligned(CACHE_LINE_SIZE) bd_running; + long __aligned(CACHE_LINE_SIZE) bd_bufspace; + int __aligned(CACHE_LINE_SIZE) bd_freebuffers; +} __aligned(CACHE_LINE_SIZE); + +#define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock) +#define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) +#define BD_UNLOCK(bd) 
mtx_unlock(BD_LOCKPTR((bd))) +#define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) +#define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock) +#define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd))) +#define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) +#define BD_DOMAIN(bd) (bd - bdomain) + +static struct buf *buf; /* buffer header pool */ +extern struct buf *swbuf; /* Swap buffer header pool. */ +caddr_t __read_mostly unmapped_buf; + +/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ +struct proc *bufdaemonproc; + +static int inmem(struct vnode *vp, daddr_t blkno); +static void vm_hold_free_pages(struct buf *bp, int newbsize); +static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, + vm_offset_t to); +static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); +static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, + vm_page_t m); +static void vfs_clean_pages_dirty_buf(struct buf *bp); +static void vfs_setdirty_locked_object(struct buf *bp); +static void vfs_vmio_invalidate(struct buf *bp); +static void vfs_vmio_truncate(struct buf *bp, int npages); +static void vfs_vmio_extend(struct buf *bp, int npages, int size); +static int vfs_bio_clcheck(struct vnode *vp, int size, + daddr_t lblkno, daddr_t blkno); +static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, + void (*)(struct buf *)); +static int buf_flush(struct vnode *vp, struct bufdomain *, int); +static int flushbufqueues(struct vnode *, struct bufdomain *, int, int); +static void buf_daemon(void); +static __inline void bd_wakeup(void); +static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); +static void bufkva_reclaim(vmem_t *, int); +static void bufkva_free(struct buf *); +static int buf_import(void *, void **, int, int, int); +static void buf_release(void *, void **, int); +static void maxbcachebuf_adjust(void); +static inline struct bufdomain *bufdomain(struct buf *); +static void bq_remove(struct bufqueue *bq, struct buf *bp); +static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); +static int buf_recycle(struct bufdomain *, bool kva); +static void bq_init(struct bufqueue *bq, int qindex, int cpu, + const char *lockname); +static void bd_init(struct bufdomain *bd); +static int bd_flushall(struct bufdomain *bd); +static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS); +static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS); + +static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); +int vmiodirenable = TRUE; +SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, + "Use the VM system for directory writes"); +long runningbufspace; +SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, + "Amount of presently outstanding async buffer io"); +SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, + NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers"); +static counter_u64_t bufkvaspace; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, + "Kernel virtual memory used for buffers"); +static long maxbufspace; +SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace, + __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L", + "Maximum allowed value of bufspace (including metadata)"); +static long bufmallocspace; +SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, + "Amount of malloced memory for buffers"); +static long maxbufmallocspace; 
+SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, + 0, "Maximum amount of malloced memory for buffers"); +static long lobufspace; +SYSCTL_PROC(_vfs, OID_AUTO, lobufspace, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace, + __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L", + "Minimum amount of buffers we want to have"); +long hibufspace; +SYSCTL_PROC(_vfs, OID_AUTO, hibufspace, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace, + __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L", + "Maximum allowed value of bufspace (excluding metadata)"); +long bufspacethresh; +SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh, + __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L", + "Bufspace consumed before waking the daemon to free some"); +static counter_u64_t buffreekvacnt; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, + "Number of times we have freed the KVA space from some buffer"); +static counter_u64_t bufdefragcnt; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, + "Number of times we have had to repeat buffer allocation to defragment"); +static long lorunningspace; +SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | + CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", + "Minimum preferred space used for in-progress I/O"); +static long hirunningspace; +SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | + CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", + "Maximum amount of space to use for in-progress I/O"); +int dirtybufferflushes; +SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, + 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); +int bdwriteskip; +SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, + 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); +int altbufferflushes; +SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, + 0, "Number of fsync flushes to limit dirty buffers"); +static int recursiveflushes; +SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, + 0, "Number of flushes skipped due to being recursive"); +static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers, + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I", + "Number of buffers that are dirty (has unwritten changes) at the moment"); +static int lodirtybuffers; +SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers, + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers, + __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I", + "How many buffers we want to have free before bufdaemon can sleep"); +static int hidirtybuffers; +SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers, + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers, + __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I", + "When the number of dirty buffers is considered severe"); +int dirtybufthresh; +SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh, + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh, + __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I", + "Number of bdwrite to bawrite conversions to clear dirty buffers"); +static int numfreebuffers; +SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 
0, + "Number of free buffers"); +static int lofreebuffers; +SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers, + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers, + __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I", + "Target number of free buffers"); +static int hifreebuffers; +SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers, + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers, + __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I", + "Threshold for clean buffer recycling"); +static counter_u64_t getnewbufcalls; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, + &getnewbufcalls, "Number of calls to getnewbuf"); +static counter_u64_t getnewbufrestarts; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, + &getnewbufrestarts, + "Number of times getnewbuf has had to restart a buffer acquisition"); +static counter_u64_t mappingrestarts; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD, + &mappingrestarts, + "Number of times getblk has had to restart a buffer mapping for " + "unmapped buffer"); +static counter_u64_t numbufallocfails; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, + &numbufallocfails, "Number of times buffer allocations failed"); +static int flushbufqtarget = 100; +SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, + "Amount of work to do in flushbufqueues when helping bufdaemon"); +static counter_u64_t notbufdflushes; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, + "Number of dirty buffer flushes done by the bufdaemon helpers"); +static long barrierwrites; +SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, + "Number of barrier writes"); +SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, + &unmapped_buf_allowed, 0, + "Permit the use of the unmapped i/o"); +int maxbcachebuf = MAXBCACHEBUF; +SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, + "Maximum size of a buffer cache block"); + +/* + * This lock synchronizes access to bd_request. + */ +static struct mtx_padalign __exclusive_cache_line bdlock; + +/* + * This lock protects the runningbufreq and synchronizes runningbufwakeup and + * waitrunningbufspace(). + */ +static struct mtx_padalign __exclusive_cache_line rbreqlock; + +/* + * Lock that protects bdirtywait. + */ +static struct mtx_padalign __exclusive_cache_line bdirtylock; + +/* + * Wakeup point for bufdaemon, as well as indicator of whether it is already + * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it + * is idling. + */ +static int bd_request; + +/* + * Request for the buf daemon to write more buffers than is indicated by + * lodirtybuf. This may be necessary to push out excess dependencies or + * defragment the address space where a simple count of the number of dirty + * buffers is insufficient to characterize the demand for flushing them. + */ +static int bd_speedupreq; + +/* + * Synchronization (sleep/wakeup) variable for active buffer space requests. + * Set when wait starts, cleared prior to wakeup(). + * Used in runningbufwakeup() and waitrunningbufspace(). + */ +static int runningbufreq; + +/* + * Synchronization for bwillwrite() waiters. + */ +static int bdirtywait; + +/* + * Definitions for the buffer free lists. 
+ */ +#define QUEUE_NONE 0 /* on no queue */ +#define QUEUE_EMPTY 1 /* empty buffer headers */ +#define QUEUE_DIRTY 2 /* B_DELWRI buffers */ +#define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ +#define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */ + +/* Maximum number of buffer domains. */ +#define BUF_DOMAINS 8 + +struct bufdomainset bdlodirty; /* Domains > lodirty */ +struct bufdomainset bdhidirty; /* Domains > hidirty */ + +/* Configured number of clean queues. */ +static int __read_mostly buf_domains; + +BITSET_DEFINE(bufdomainset, BUF_DOMAINS); +struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS]; +struct bufqueue __exclusive_cache_line bqempty; + +/* + * per-cpu empty buffer cache. + */ +uma_zone_t buf_zone; + +/* + * Single global constant for BUF_WMESG, to avoid getting multiple references. + * buf_wmesg is referred from macros. + */ +const char *buf_wmesg = BUF_WMESG; + +static int +sysctl_runningspace(SYSCTL_HANDLER_ARGS) +{ + long value; + int error; + + value = *(long *)arg1; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + mtx_lock(&rbreqlock); + if (arg1 == &hirunningspace) { + if (value < lorunningspace) + error = EINVAL; + else + hirunningspace = value; + } else { + KASSERT(arg1 == &lorunningspace, + ("%s: unknown arg1", __func__)); + if (value > hirunningspace) + error = EINVAL; + else + lorunningspace = value; + } + mtx_unlock(&rbreqlock); + return (error); +} + +static int +sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS) +{ + int error; + int value; + int i; + + value = *(int *)arg1; + error = sysctl_handle_int(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + *(int *)arg1 = value; + for (i = 0; i < buf_domains; i++) + *(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = + value / buf_domains; + + return (error); +} + +static int +sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS) +{ + long value; + int error; + int i; + + value = *(long *)arg1; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + *(long *)arg1 = value; + for (i = 0; i < buf_domains; i++) + *(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = + value / buf_domains; + + return (error); +} + +#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) +static int +sysctl_bufspace(SYSCTL_HANDLER_ARGS) +{ + long lvalue; + int ivalue; + int i; + + lvalue = 0; + for (i = 0; i < buf_domains; i++) + lvalue += bdomain[i].bd_bufspace; + if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) + return (sysctl_handle_long(oidp, &lvalue, 0, req)); + if (lvalue > INT_MAX) + /* On overflow, still write out a long to trigger ENOMEM. */ + return (sysctl_handle_long(oidp, &lvalue, 0, req)); + ivalue = lvalue; + return (sysctl_handle_int(oidp, &ivalue, 0, req)); +} +#else +static int +sysctl_bufspace(SYSCTL_HANDLER_ARGS) +{ + long lvalue; + int i; + + lvalue = 0; + for (i = 0; i < buf_domains; i++) + lvalue += bdomain[i].bd_bufspace; + return (sysctl_handle_long(oidp, &lvalue, 0, req)); +} +#endif + +static int +sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS) +{ + int value; + int i; + + value = 0; + for (i = 0; i < buf_domains; i++) + value += bdomain[i].bd_numdirtybuffers; + return (sysctl_handle_int(oidp, &value, 0, req)); +} + +/* + * bdirtywakeup: + * + * Wakeup any bwillwrite() waiters. 
+ */ +static void +bdirtywakeup(void) +{ + mtx_lock(&bdirtylock); + if (bdirtywait) { + bdirtywait = 0; + wakeup(&bdirtywait); + } + mtx_unlock(&bdirtylock); +} + +/* + * bd_clear: + * + * Clear a domain from the appropriate bitsets when dirtybuffers + * is decremented. + */ +static void +bd_clear(struct bufdomain *bd) +{ + + mtx_lock(&bdirtylock); + if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers) + BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); + if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers) + BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); + mtx_unlock(&bdirtylock); +} + +/* + * bd_set: + * + * Set a domain in the appropriate bitsets when dirtybuffers + * is incremented. + */ +static void +bd_set(struct bufdomain *bd) +{ + + mtx_lock(&bdirtylock); + if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers) + BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); + if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers) + BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); + mtx_unlock(&bdirtylock); +} + +/* + * bdirtysub: + * + * Decrement the numdirtybuffers count by one and wakeup any + * threads blocked in bwillwrite(). + */ +static void +bdirtysub(struct buf *bp) +{ + struct bufdomain *bd; + int num; + + bd = bufdomain(bp); + num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1); + if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) + bdirtywakeup(); + if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) + bd_clear(bd); +} + +/* + * bdirtyadd: + * + * Increment the numdirtybuffers count by one and wakeup the buf + * daemon if needed. + */ +static void +bdirtyadd(struct buf *bp) +{ + struct bufdomain *bd; + int num; + + /* + * Only do the wakeup once as we cross the boundary. The + * buf daemon will keep running until the condition clears. + */ + bd = bufdomain(bp); + num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1); + if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) + bd_wakeup(); + if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) + bd_set(bd); +} + +/* + * bufspace_daemon_wakeup: + * + * Wakeup the daemons responsible for freeing clean bufs. + */ +static void +bufspace_daemon_wakeup(struct bufdomain *bd) +{ + + /* + * avoid the lock if the daemon is running. + */ + if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) { + BD_RUN_LOCK(bd); + atomic_store_int(&bd->bd_running, 1); + wakeup(&bd->bd_running); + BD_RUN_UNLOCK(bd); + } +} + +/* + * bufspace_daemon_wait: + * + * Sleep until the domain falls below a limit or one second passes. + */ +static void +bufspace_daemon_wait(struct bufdomain *bd) +{ + /* + * Re-check our limits and sleep. bd_running must be + * cleared prior to checking the limits to avoid missed + * wakeups. The waker will adjust one of bufspace or + * freebuffers prior to checking bd_running. + */ + BD_RUN_LOCK(bd); + atomic_store_int(&bd->bd_running, 0); + if (bd->bd_bufspace < bd->bd_bufspacethresh && + bd->bd_freebuffers > bd->bd_lofreebuffers) { + msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP, + "-", hz); + } else { + /* Avoid spurious wakeups while running. */ + atomic_store_int(&bd->bd_running, 1); + BD_RUN_UNLOCK(bd); + } +} + +/* + * bufspace_adjust: + * + * Adjust the reported bufspace for a KVA managed buffer, possibly + * waking any waiters. 
+ */ +static void +bufspace_adjust(struct buf *bp, int bufsize) +{ + struct bufdomain *bd; + long space; + int diff; + + KASSERT((bp->b_flags & B_MALLOC) == 0, + ("bufspace_adjust: malloc buf %p", bp)); + bd = bufdomain(bp); + diff = bufsize - bp->b_bufsize; + if (diff < 0) { + atomic_subtract_long(&bd->bd_bufspace, -diff); + } else if (diff > 0) { + space = atomic_fetchadd_long(&bd->bd_bufspace, diff); + /* Wake up the daemon on the transition. */ + if (space < bd->bd_bufspacethresh && + space + diff >= bd->bd_bufspacethresh) + bufspace_daemon_wakeup(bd); + } + bp->b_bufsize = bufsize; +} + +/* + * bufspace_reserve: + * + * Reserve bufspace before calling allocbuf(). metadata has a + * different space limit than data. + */ +static int +bufspace_reserve(struct bufdomain *bd, int size, bool metadata) +{ + long limit, new; + long space; + + if (metadata) + limit = bd->bd_maxbufspace; + else + limit = bd->bd_hibufspace; + space = atomic_fetchadd_long(&bd->bd_bufspace, size); + new = space + size; + if (new > limit) { + atomic_subtract_long(&bd->bd_bufspace, size); + return (ENOSPC); + } + + /* Wake up the daemon on the transition. */ + if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh) + bufspace_daemon_wakeup(bd); + + return (0); +} + +/* + * bufspace_release: + * + * Release reserved bufspace after bufspace_adjust() has consumed it. + */ +static void +bufspace_release(struct bufdomain *bd, int size) +{ + + atomic_subtract_long(&bd->bd_bufspace, size); +} + +/* + * bufspace_wait: + * + * Wait for bufspace, acting as the buf daemon if a locked vnode is + * supplied. bd_wanted must be set prior to polling for space. The + * operation must be re-tried on return. + */ +static void +bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags, + int slpflag, int slptimeo) +{ + struct thread *td; + int error, fl, norunbuf; + + if ((gbflags & GB_NOWAIT_BD) != 0) + return; + + td = curthread; + BD_LOCK(bd); + while (bd->bd_wanted) { + if (vp != NULL && vp->v_type != VCHR && + (td->td_pflags & TDP_BUFNEED) == 0) { + BD_UNLOCK(bd); + /* + * getblk() is called with a vnode locked, and + * some majority of the dirty buffers may as + * well belong to the vnode. Flushing the + * buffers there would make a progress that + * cannot be achieved by the buf_daemon, that + * cannot lock the vnode. + */ + norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | + (td->td_pflags & TDP_NORUNNINGBUF); + + /* + * Play bufdaemon. The getnewbuf() function + * may be called while the thread owns lock + * for another dirty buffer for the same + * vnode, which makes it impossible to use + * VOP_FSYNC() there, due to the buffer lock + * recursion. + */ + td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; + fl = buf_flush(vp, bd, flushbufqtarget); + td->td_pflags &= norunbuf; + BD_LOCK(bd); + if (fl != 0) + continue; + if (bd->bd_wanted == 0) + break; + } + error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd), + (PRIBIO + 4) | slpflag, "newbuf", slptimeo); + if (error != 0) + break; + } + BD_UNLOCK(bd); +} + + +/* + * bufspace_daemon: + * + * buffer space management daemon. Tries to maintain some marginal + * amount of free buffer space so that requesting processes neither + * block nor work to reclaim buffers. + */ +static void +bufspace_daemon(void *arg) +{ + struct bufdomain *bd; + + EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread, + SHUTDOWN_PRI_LAST + 100); + + bd = arg; + for (;;) { + kthread_suspend_check(); + + /* + * Free buffers from the clean queue until we meet our + * targets. 
+ * + * Theory of operation: The buffer cache is most efficient + * when some free buffer headers and space are always + * available to getnewbuf(). This daemon attempts to prevent + * the excessive blocking and synchronization associated + * with shortfall. It goes through three phases according + * demand: + * + * 1) The daemon wakes up voluntarily once per-second + * during idle periods when the counters are below + * the wakeup thresholds (bufspacethresh, lofreebuffers). + * + * 2) The daemon wakes up as we cross the thresholds + * ahead of any potential blocking. This may bounce + * slightly according to the rate of consumption and + * release. + * + * 3) The daemon and consumers are starved for working + * clean buffers. This is the 'bufspace' sleep below + * which will inefficiently trade bufs with bqrelse + * until we return to condition 2. + */ + while (bd->bd_bufspace > bd->bd_lobufspace || + bd->bd_freebuffers < bd->bd_hifreebuffers) { + if (buf_recycle(bd, false) != 0) { + if (bd_flushall(bd)) + continue; + /* + * Speedup dirty if we've run out of clean + * buffers. This is possible in particular + * because softdep may held many bufs locked + * pending writes to other bufs which are + * marked for delayed write, exhausting + * clean space until they are written. + */ + bd_speedup(); + BD_LOCK(bd); + if (bd->bd_wanted) { + msleep(&bd->bd_wanted, BD_LOCKPTR(bd), + PRIBIO|PDROP, "bufspace", hz/10); + } else + BD_UNLOCK(bd); + } + maybe_yield(); + } + bufspace_daemon_wait(bd); + } +} + +/* + * bufmallocadjust: + * + * Adjust the reported bufspace for a malloc managed buffer, possibly + * waking any waiters. + */ +static void +bufmallocadjust(struct buf *bp, int bufsize) +{ + int diff; + + KASSERT((bp->b_flags & B_MALLOC) != 0, + ("bufmallocadjust: non-malloc buf %p", bp)); + diff = bufsize - bp->b_bufsize; + if (diff < 0) + atomic_subtract_long(&bufmallocspace, -diff); + else + atomic_add_long(&bufmallocspace, diff); + bp->b_bufsize = bufsize; +} + +/* + * runningwakeup: + * + * Wake up processes that are waiting on asynchronous writes to fall + * below lorunningspace. + */ +static void +runningwakeup(void) +{ + + mtx_lock(&rbreqlock); + if (runningbufreq) { + runningbufreq = 0; + wakeup(&runningbufreq); + } + mtx_unlock(&rbreqlock); +} + +/* + * runningbufwakeup: + * + * Decrement the outstanding write count according. + */ +void +runningbufwakeup(struct buf *bp) +{ + long space, bspace; + + bspace = bp->b_runningbufspace; + if (bspace == 0) + return; + space = atomic_fetchadd_long(&runningbufspace, -bspace); + KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", + space, bspace)); + bp->b_runningbufspace = 0; + /* + * Only acquire the lock and wakeup on the transition from exceeding + * the threshold to falling below it. + */ + if (space < lorunningspace) + return; + if (space - bspace > lorunningspace) + return; + runningwakeup(); +} + +/* + * waitrunningbufspace() + * + * runningbufspace is a measure of the amount of I/O currently + * running. This routine is used in async-write situations to + * prevent creating huge backups of pending writes to a device. + * Only asynchronous writes are governed by this function. + * + * This does NOT turn an async write into a sync write. It waits + * for earlier writes to complete and generally returns before the + * caller's write has reached the device. 
+ */ +void +waitrunningbufspace(void) +{ + + mtx_lock(&rbreqlock); + while (runningbufspace > hirunningspace) { + runningbufreq = 1; + msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); + } + mtx_unlock(&rbreqlock); +} + + +/* + * vfs_buf_test_cache: + * + * Called when a buffer is extended. This function clears the B_CACHE + * bit if the newly extended portion of the buffer does not contain + * valid data. + */ +static __inline void +vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, + vm_offset_t size, vm_page_t m) +{ + + VM_OBJECT_ASSERT_LOCKED(m->object); + if (bp->b_flags & B_CACHE) { + int base = (foff + off) & PAGE_MASK; + if (vm_page_is_valid(m, base, size) == 0) + bp->b_flags &= ~B_CACHE; + } +} + +/* Wake up the buffer daemon if necessary */ +static void +bd_wakeup(void) +{ + + mtx_lock(&bdlock); + if (bd_request == 0) { + bd_request = 1; + wakeup(&bd_request); + } + mtx_unlock(&bdlock); +} + +/* + * Adjust the maxbcachbuf tunable. + */ +static void +maxbcachebuf_adjust(void) +{ + int i; + + /* + * maxbcachebuf must be a power of 2 >= MAXBSIZE. + */ + i = 2; + while (i * 2 <= maxbcachebuf) + i *= 2; + maxbcachebuf = i; + if (maxbcachebuf < MAXBSIZE) + maxbcachebuf = MAXBSIZE; + if (maxbcachebuf > MAXPHYS) + maxbcachebuf = MAXPHYS; + if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF) + printf("maxbcachebuf=%d\n", maxbcachebuf); +} + +/* + * bd_speedup - speedup the buffer cache flushing code + */ +void +bd_speedup(void) +{ + int needwake; + + mtx_lock(&bdlock); + needwake = 0; + if (bd_speedupreq == 0 || bd_request == 0) + needwake = 1; + bd_speedupreq = 1; + bd_request = 1; + if (needwake) + wakeup(&bd_request); + mtx_unlock(&bdlock); +} + +#ifndef NSWBUF_MIN +#define NSWBUF_MIN 16 +#endif + +#ifdef __i386__ +#define TRANSIENT_DENOM 5 +#else +#define TRANSIENT_DENOM 10 +#endif + +/* + * Calculating buffer cache scaling values and reserve space for buffer + * headers. This is called during low level kernel initialization and + * may be called more then once. We CANNOT write to the memory area + * being reserved at this time. + */ +caddr_t +kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) +{ + int tuned_nbuf; + long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; + + /* + * physmem_est is in pages. Convert it to kilobytes (assumes + * PAGE_SIZE is >= 1K) + */ + physmem_est = physmem_est * (PAGE_SIZE / 1024); + + maxbcachebuf_adjust(); + /* + * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. + * For the first 64MB of ram nominally allocate sufficient buffers to + * cover 1/4 of our ram. Beyond the first 64MB allocate additional + * buffers to cover 1/10 of our ram over 64MB. When auto-sizing + * the buffer cache we limit the eventual kva reservation to + * maxbcache bytes. + * + * factor represents the 1/4 x ram conversion. + */ + if (nbuf == 0) { + int factor = 4 * BKVASIZE / 1024; + + nbuf = 50; + if (physmem_est > 4096) + nbuf += min((physmem_est - 4096) / factor, + 65536 / factor); + if (physmem_est > 65536) + nbuf += min((physmem_est - 65536) * 2 / (factor * 5), + 32 * 1024 * 1024 / (factor * 5)); + + if (maxbcache && nbuf > maxbcache / BKVASIZE) + nbuf = maxbcache / BKVASIZE; + tuned_nbuf = 1; + } else + tuned_nbuf = 0; + + /* XXX Avoid unsigned long overflows later on with maxbufspace. 
*/ + maxbuf = (LONG_MAX / 3) / BKVASIZE; + if (nbuf > maxbuf) { + if (!tuned_nbuf) + printf("Warning: nbufs lowered from %d to %ld\n", nbuf, + maxbuf); + nbuf = maxbuf; + } + + /* + * Ideal allocation size for the transient bio submap is 10% + * of the maximal space buffer map. This roughly corresponds + * to the amount of the buffer mapped for typical UFS load. + * + * Clip the buffer map to reserve space for the transient + * BIOs, if its extent is bigger than 90% (80% on i386) of the + * maximum buffer map extent on the platform. + * + * The fall-back to the maxbuf in case of maxbcache unset, + * allows to not trim the buffer KVA for the architectures + * with ample KVA space. + */ + if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { + maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; + buf_sz = (long)nbuf * BKVASIZE; + if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * + (TRANSIENT_DENOM - 1)) { + /* + * There is more KVA than memory. Do not + * adjust buffer map size, and assign the rest + * of maxbuf to transient map. + */ + biotmap_sz = maxbuf_sz - buf_sz; + } else { + /* + * Buffer map spans all KVA we could afford on + * this platform. Give 10% (20% on i386) of + * the buffer map to the transient bio map. + */ + biotmap_sz = buf_sz / TRANSIENT_DENOM; + buf_sz -= biotmap_sz; + } + if (biotmap_sz / INT_MAX > MAXPHYS) + bio_transient_maxcnt = INT_MAX; + else + bio_transient_maxcnt = biotmap_sz / MAXPHYS; + /* + * Artificially limit to 1024 simultaneous in-flight I/Os + * using the transient mapping. + */ + if (bio_transient_maxcnt > 1024) + bio_transient_maxcnt = 1024; + if (tuned_nbuf) + nbuf = buf_sz / BKVASIZE; + } + + /* + * swbufs are used as temporary holders for I/O, such as paging I/O. + * We have no less then 16 and no more then 256. + */ + nswbuf = min(nbuf / 4, 256); + TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); + if (nswbuf < NSWBUF_MIN) + nswbuf = NSWBUF_MIN; + + /* + * Reserve space for the buffer cache buffers + */ + swbuf = (void *)v; + v = (caddr_t)(swbuf + nswbuf); + buf = (void *)v; + v = (caddr_t)(buf + nbuf); + + return(v); +} + +/* Initialize the buffer subsystem. Called before use of any buffers. */ +void +bufinit(void) +{ + struct buf *bp; + int i; + + KASSERT(maxbcachebuf >= MAXBSIZE, + ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, + MAXBSIZE)); + bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock"); + mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); + mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); + mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); + + unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); + + /* finally, initialize each buffer header and stick on empty q */ + for (i = 0; i < nbuf; i++) { + bp = &buf[i]; + bzero(bp, sizeof *bp); + bp->b_flags = B_INVAL; + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + bp->b_qindex = QUEUE_NONE; + bp->b_domain = -1; + bp->b_subqueue = mp_maxid + 1; + bp->b_xflags = 0; + bp->b_data = bp->b_kvabase = unmapped_buf; + LIST_INIT(&bp->b_dep); + BUF_LOCKINIT(bp); + bq_insert(&bqempty, bp, false); + } + + /* + * maxbufspace is the absolute maximum amount of buffer space we are + * allowed to reserve in KVM and in real terms. The absolute maximum + * is nominally used by metadata. hibufspace is the nominal maximum + * used by most other requests. The differential is required to + * ensure that metadata deadlocks don't occur. + * + * maxbufspace is based on BKVASIZE. 
Allocating buffers larger then + * this may result in KVM fragmentation which is not handled optimally + * by the system. XXX This is less true with vmem. We could use + * PAGE_SIZE. + */ + maxbufspace = (long)nbuf * BKVASIZE; + hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); + lobufspace = (hibufspace / 20) * 19; /* 95% */ + bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; + + /* + * Note: The 16 MiB upper limit for hirunningspace was chosen + * arbitrarily and may need further tuning. It corresponds to + * 128 outstanding write IO requests (if IO size is 128 KiB), + * which fits with many RAID controllers' tagged queuing limits. + * The lower 1 MiB limit is the historical upper limit for + * hirunningspace. + */ + hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), + 16 * 1024 * 1024), 1024 * 1024); + lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); + + /* + * Limit the amount of malloc memory since it is wired permanently into + * the kernel space. Even though this is accounted for in the buffer + * allocation, we don't want the malloced region to grow uncontrolled. + * The malloc scheme improves memory utilization significantly on + * average (small) directories. + */ + maxbufmallocspace = hibufspace / 20; + + /* + * Reduce the chance of a deadlock occurring by limiting the number + * of delayed-write dirty buffers we allow to stack up. + */ + hidirtybuffers = nbuf / 4 + 20; + dirtybufthresh = hidirtybuffers * 9 / 10; + /* + * To support extreme low-memory systems, make sure hidirtybuffers + * cannot eat up all available buffer space. This occurs when our + * minimum cannot be met. We try to size hidirtybuffers to 3/4 our + * buffer space assuming BKVASIZE'd buffers. + */ + while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { + hidirtybuffers >>= 1; + } + lodirtybuffers = hidirtybuffers / 2; + + /* + * lofreebuffers should be sufficient to avoid stalling waiting on + * buf headers under heavy utilization. The bufs in per-cpu caches + * are counted as free but will be unavailable to threads executing + * on other cpus. + * + * hifreebuffers is the free target for the bufspace daemon. This + * should be set appropriately to limit work per-iteration. + */ + lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); + hifreebuffers = (3 * lofreebuffers) / 2; + numfreebuffers = nbuf; + + /* Setup the kva and free list allocators. */ + vmem_set_reclaim(buffer_arena, bufkva_reclaim); + buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), + NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); + + /* + * Size the clean queue according to the amount of buffer space. + * One queue per-256mb up to the max. More queues gives better + * concurrency but less accurate LRU. 
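+	 * (Worked example, illustrative figures only: a maxbufspace of about
+	 * 1 GiB gives howmany(1G, 256M) == 4 domains, subject to the
+	 * BUF_DOMAINS cap applied just below.)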
+ */ + buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS); + for (i = 0 ; i < buf_domains; i++) { + struct bufdomain *bd; + + bd = &bdomain[i]; + bd_init(bd); + bd->bd_freebuffers = nbuf / buf_domains; + bd->bd_hifreebuffers = hifreebuffers / buf_domains; + bd->bd_lofreebuffers = lofreebuffers / buf_domains; + bd->bd_bufspace = 0; + bd->bd_maxbufspace = maxbufspace / buf_domains; + bd->bd_hibufspace = hibufspace / buf_domains; + bd->bd_lobufspace = lobufspace / buf_domains; + bd->bd_bufspacethresh = bufspacethresh / buf_domains; + bd->bd_numdirtybuffers = 0; + bd->bd_hidirtybuffers = hidirtybuffers / buf_domains; + bd->bd_lodirtybuffers = lodirtybuffers / buf_domains; + bd->bd_dirtybufthresh = dirtybufthresh / buf_domains; + /* Don't allow more than 2% of bufs in the per-cpu caches. */ + bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus; + } + getnewbufcalls = counter_u64_alloc(M_WAITOK); + getnewbufrestarts = counter_u64_alloc(M_WAITOK); + mappingrestarts = counter_u64_alloc(M_WAITOK); + numbufallocfails = counter_u64_alloc(M_WAITOK); + notbufdflushes = counter_u64_alloc(M_WAITOK); + buffreekvacnt = counter_u64_alloc(M_WAITOK); + bufdefragcnt = counter_u64_alloc(M_WAITOK); + bufkvaspace = counter_u64_alloc(M_WAITOK); +} + +#ifdef INVARIANTS +static inline void +vfs_buf_check_mapped(struct buf *bp) +{ + + KASSERT(bp->b_kvabase != unmapped_buf, + ("mapped buf: b_kvabase was not updated %p", bp)); + KASSERT(bp->b_data != unmapped_buf, + ("mapped buf: b_data was not updated %p", bp)); + KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + + MAXPHYS, ("b_data + b_offset unmapped %p", bp)); +} + +static inline void +vfs_buf_check_unmapped(struct buf *bp) +{ + + KASSERT(bp->b_data == unmapped_buf, + ("unmapped buf: corrupted b_data %p", bp)); +} + +#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) +#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) +#else +#define BUF_CHECK_MAPPED(bp) do {} while (0) +#define BUF_CHECK_UNMAPPED(bp) do {} while (0) +#endif + +static int +isbufbusy(struct buf *bp) +{ + if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || + ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) + return (1); + return (0); +} + +/* + * Shutdown the system cleanly to prepare for reboot, halt, or power off. + */ +void +bufshutdown(int show_busybufs) +{ + static int first_buf_printf = 1; + struct buf *bp; + int iter, nbusy, pbusy; +#ifndef PREEMPTION + int subiter; +#endif + + /* + * Sync filesystems for shutdown + */ + wdog_kern_pat(WD_LASTVAL); + sys_sync(curthread, NULL); + + /* + * With soft updates, some buffers that are + * written will be remarked as dirty until other + * buffers are written. + */ + for (iter = pbusy = 0; iter < 20; iter++) { + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) + if (isbufbusy(bp)) + nbusy++; + if (nbusy == 0) { + if (first_buf_printf) + printf("All buffers synced."); + break; + } + if (first_buf_printf) { + printf("Syncing disks, buffers remaining... "); + first_buf_printf = 0; + } + printf("%d ", nbusy); + if (nbusy < pbusy) + iter = 0; + pbusy = nbusy; + + wdog_kern_pat(WD_LASTVAL); + sys_sync(curthread, NULL); + +#ifdef PREEMPTION + /* + * Spin for a while to allow interrupt threads to run. + */ + DELAY(50000 * iter); +#else + /* + * Context switch several times to allow interrupt + * threads to run. 
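+	 * (With DELAY(1000) each pass takes roughly one millisecond, so
+	 * this spins for up to about 50 * iter ms per outer iteration;
+	 * illustrative timing only.)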
+ */ + for (subiter = 0; subiter < 50 * iter; subiter++) { + thread_lock(curthread); + mi_switch(SW_VOL, NULL); + thread_unlock(curthread); + DELAY(1000); + } +#endif + } + printf("\n"); + /* + * Count only busy local buffers to prevent forcing + * a fsck if we're just a client of a wedged NFS server + */ + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if (isbufbusy(bp)) { +#if 0 +/* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ + if (bp->b_dev == NULL) { + TAILQ_REMOVE(&mountlist, + bp->b_vp->v_mount, mnt_list); + continue; + } +#endif + nbusy++; + if (show_busybufs > 0) { + printf( + "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", + nbusy, bp, bp->b_vp, bp->b_flags, + (intmax_t)bp->b_blkno, + (intmax_t)bp->b_lblkno); + BUF_LOCKPRINTINFO(bp); + if (show_busybufs > 1) + vn_printf(bp->b_vp, + "vnode content: "); + } + } + } + if (nbusy) { + /* + * Failed to sync all blocks. Indicate this and don't + * unmount filesystems (thus forcing an fsck on reboot). + */ + printf("Giving up on %d buffers\n", nbusy); + DELAY(5000000); /* 5 seconds */ + } else { + if (!first_buf_printf) + printf("Final sync complete\n"); + /* + * Unmount filesystems + */ + if (panicstr == NULL) + vfs_unmountall(); + } + swapoff_all(); + DELAY(100000); /* wait for console output to finish */ +} + +static void +bpmap_qenter(struct buf *bp) +{ + + BUF_CHECK_MAPPED(bp); + + /* + * bp->b_data is relative to bp->b_offset, but + * bp->b_offset may be offset into the first page. + */ + bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); + pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); + bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | + (vm_offset_t)(bp->b_offset & PAGE_MASK)); +} + +static inline struct bufdomain * +bufdomain(struct buf *bp) +{ + + return (&bdomain[bp->b_domain]); +} + +static struct bufqueue * +bufqueue(struct buf *bp) +{ + + switch (bp->b_qindex) { + case QUEUE_NONE: + /* FALLTHROUGH */ + case QUEUE_SENTINEL: + return (NULL); + case QUEUE_EMPTY: + return (&bqempty); + case QUEUE_DIRTY: + return (&bufdomain(bp)->bd_dirtyq); + case QUEUE_CLEAN: + return (&bufdomain(bp)->bd_subq[bp->b_subqueue]); + default: + break; + } + panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex); +} + +/* + * Return the locked bufqueue that bp is a member of. + */ +static struct bufqueue * +bufqueue_acquire(struct buf *bp) +{ + struct bufqueue *bq, *nbq; + + /* + * bp can be pushed from a per-cpu queue to the + * cleanq while we're waiting on the lock. Retry + * if the queues don't match. + */ + bq = bufqueue(bp); + BQ_LOCK(bq); + for (;;) { + nbq = bufqueue(bp); + if (bq == nbq) + break; + BQ_UNLOCK(bq); + BQ_LOCK(nbq); + bq = nbq; + } + return (bq); +} + +/* + * binsfree: + * + * Insert the buffer into the appropriate free list. Requires a + * locked buffer on entry and buffer is unlocked before return. + */ +static void +binsfree(struct buf *bp, int qindex) +{ + struct bufdomain *bd; + struct bufqueue *bq; + + KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY, + ("binsfree: Invalid qindex %d", qindex)); + BUF_ASSERT_XLOCKED(bp); + + /* + * Handle delayed bremfree() processing. 
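+	 * (Sketch of the usual sequence seen in this file: a caller does
+	 * bremfree(bp), which merely sets B_REMFREE while the buffer stays
+	 * on its queue, and the deferred removal is completed here when
+	 * brelse() or bqrelse() return the buffer through binsfree().)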
+ */ + if (bp->b_flags & B_REMFREE) { + if (bp->b_qindex == qindex) { + bp->b_flags |= B_REUSE; + bp->b_flags &= ~B_REMFREE; + BUF_UNLOCK(bp); + return; + } + bq = bufqueue_acquire(bp); + bq_remove(bq, bp); + BQ_UNLOCK(bq); + } + bd = bufdomain(bp); + if (qindex == QUEUE_CLEAN) { + if (bd->bd_lim != 0) + bq = &bd->bd_subq[PCPU_GET(cpuid)]; + else + bq = bd->bd_cleanq; + } else + bq = &bd->bd_dirtyq; + bq_insert(bq, bp, true); +} + +/* + * buf_free: + * + * Free a buffer to the buf zone once it no longer has valid contents. + */ +static void +buf_free(struct buf *bp) +{ + + if (bp->b_flags & B_REMFREE) + bremfreef(bp); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("losing buffer 1"); + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + if (!LIST_EMPTY(&bp->b_dep)) + buf_deallocate(bp); + bufkva_free(bp); + atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1); + BUF_UNLOCK(bp); + uma_zfree(buf_zone, bp); +} + +/* + * buf_import: + * + * Import bufs into the uma cache from the buf list. The system still + * expects a static array of bufs and much of the synchronization + * around bufs assumes type stable storage. As a result, UMA is used + * only as a per-cpu cache of bufs still maintained on a global list. + */ +static int +buf_import(void *arg, void **store, int cnt, int domain, int flags) +{ + struct buf *bp; + int i; + + BQ_LOCK(&bqempty); + for (i = 0; i < cnt; i++) { + bp = TAILQ_FIRST(&bqempty.bq_queue); + if (bp == NULL) + break; + bq_remove(&bqempty, bp); + store[i] = bp; + } + BQ_UNLOCK(&bqempty); + + return (i); +} + +/* + * buf_release: + * + * Release bufs from the uma cache back to the buffer queues. + */ +static void +buf_release(void *arg, void **store, int cnt) +{ + struct bufqueue *bq; + struct buf *bp; + int i; + + bq = &bqempty; + BQ_LOCK(bq); + for (i = 0; i < cnt; i++) { + bp = store[i]; + /* Inline bq_insert() to batch locking. */ + TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); + bp->b_flags &= ~(B_AGE | B_REUSE); + bq->bq_len++; + bp->b_qindex = bq->bq_index; + } + BQ_UNLOCK(bq); +} + +/* + * buf_alloc: + * + * Allocate an empty buffer header. + */ +static struct buf * +buf_alloc(struct bufdomain *bd) +{ + struct buf *bp; + int freebufs; + + /* + * We can only run out of bufs in the buf zone if the average buf + * is less than BKVASIZE. In this case the actual wait/block will + * come from buf_reycle() failing to flush one of these small bufs. + */ + bp = NULL; + freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1); + if (freebufs > 0) + bp = uma_zalloc(buf_zone, M_NOWAIT); + if (bp == NULL) { + atomic_add_int(&bd->bd_freebuffers, 1); + bufspace_daemon_wakeup(bd); + counter_u64_add(numbufallocfails, 1); + return (NULL); + } + /* + * Wake-up the bufspace daemon on transition below threshold. + */ + if (freebufs == bd->bd_lofreebuffers) + bufspace_daemon_wakeup(bd); + + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) + panic("getnewbuf_empty: Locked buf %p on free queue.", bp); + + KASSERT(bp->b_vp == NULL, + ("bp: %p still has vnode %p.", bp, bp->b_vp)); + KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, + ("invalid buffer %p flags %#x", bp, bp->b_flags)); + KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, + ("bp: %p still on a buffer list. 
xflags %X", bp, bp->b_xflags)); + KASSERT(bp->b_npages == 0, + ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); + KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); + KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); + + bp->b_domain = BD_DOMAIN(bd); + bp->b_flags = 0; + bp->b_ioflags = 0; + bp->b_xflags = 0; + bp->b_vflags = 0; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_offset = NOOFFSET; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_bufobj = NULL; + bp->b_data = bp->b_kvabase = unmapped_buf; + bp->b_fsprivate1 = NULL; + bp->b_fsprivate2 = NULL; + bp->b_fsprivate3 = NULL; + LIST_INIT(&bp->b_dep); + + return (bp); +} + +/* + * buf_recycle: + * + * Free a buffer from the given bufqueue. kva controls whether the + * freed buf must own some kva resources. This is used for + * defragmenting. + */ +static int +buf_recycle(struct bufdomain *bd, bool kva) +{ + struct bufqueue *bq; + struct buf *bp, *nbp; + + if (kva) + counter_u64_add(bufdefragcnt, 1); + nbp = NULL; + bq = bd->bd_cleanq; + BQ_LOCK(bq); + KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd), + ("buf_recycle: Locks don't match")); + nbp = TAILQ_FIRST(&bq->bq_queue); + + /* + * Run scan, possibly freeing data and/or kva mappings on the fly + * depending. + */ + while ((bp = nbp) != NULL) { + /* + * Calculate next bp (we can only use it if we do not + * release the bqlock). + */ + nbp = TAILQ_NEXT(bp, b_freelist); + + /* + * If we are defragging then we need a buffer with + * some kva to reclaim. + */ + if (kva && bp->b_kvasize == 0) + continue; + + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) + continue; + + /* + * Implement a second chance algorithm for frequently + * accessed buffers. + */ + if ((bp->b_flags & B_REUSE) != 0) { + TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); + TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); + bp->b_flags &= ~B_REUSE; + BUF_UNLOCK(bp); + continue; + } + + /* + * Skip buffers with background writes in progress. + */ + if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { + BUF_UNLOCK(bp); + continue; + } + + KASSERT(bp->b_qindex == QUEUE_CLEAN, + ("buf_recycle: inconsistent queue %d bp %p", + bp->b_qindex, bp)); + KASSERT(bp->b_domain == BD_DOMAIN(bd), + ("getnewbuf: queue domain %d doesn't match request %d", + bp->b_domain, (int)BD_DOMAIN(bd))); + /* + * NOTE: nbp is now entirely invalid. We can only restart + * the scan from this point on. + */ + bq_remove(bq, bp); + BQ_UNLOCK(bq); + + /* + * Requeue the background write buffer with error and + * restart the scan. + */ + if ((bp->b_vflags & BV_BKGRDERR) != 0) { + bqrelse(bp); + BQ_LOCK(bq); + nbp = TAILQ_FIRST(&bq->bq_queue); + continue; + } + bp->b_flags |= B_INVAL; + brelse(bp); + return (0); + } + bd->bd_wanted = 1; + BQ_UNLOCK(bq); + + return (ENOBUFS); +} + +/* + * bremfree: + * + * Mark the buffer for removal from the appropriate free list. + * + */ +void +bremfree(struct buf *bp) +{ + + CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + KASSERT((bp->b_flags & B_REMFREE) == 0, + ("bremfree: buffer %p already marked for delayed removal.", bp)); + KASSERT(bp->b_qindex != QUEUE_NONE, + ("bremfree: buffer %p not on a queue.", bp)); + BUF_ASSERT_XLOCKED(bp); + + bp->b_flags |= B_REMFREE; +} + +/* + * bremfreef: + * + * Force an immediate removal from a free list. Used only in nfs when + * it abuses the b_freelist pointer. 
+ */ +void +bremfreef(struct buf *bp) +{ + struct bufqueue *bq; + + bq = bufqueue_acquire(bp); + bq_remove(bq, bp); + BQ_UNLOCK(bq); +} + +static void +bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname) +{ + + mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF); + TAILQ_INIT(&bq->bq_queue); + bq->bq_len = 0; + bq->bq_index = qindex; + bq->bq_subqueue = subqueue; +} + +static void +bd_init(struct bufdomain *bd) +{ + int i; + + bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1]; + bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock"); + bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock"); + for (i = 0; i <= mp_maxid; i++) + bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i, + "bufq clean subqueue lock"); + mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF); +} + +/* + * bq_remove: + * + * Removes a buffer from the free list, must be called with the + * correct qlock held. + */ +static void +bq_remove(struct bufqueue *bq, struct buf *bp) +{ + + CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X", + bp, bp->b_vp, bp->b_flags); + KASSERT(bp->b_qindex != QUEUE_NONE, + ("bq_remove: buffer %p not on a queue.", bp)); + KASSERT(bufqueue(bp) == bq, + ("bq_remove: Remove buffer %p from wrong queue.", bp)); + + BQ_ASSERT_LOCKED(bq); + if (bp->b_qindex != QUEUE_EMPTY) { + BUF_ASSERT_XLOCKED(bp); + } + KASSERT(bq->bq_len >= 1, + ("queue %d underflow", bp->b_qindex)); + TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); + bq->bq_len--; + bp->b_qindex = QUEUE_NONE; + bp->b_flags &= ~(B_REMFREE | B_REUSE); +} + +static void +bd_flush(struct bufdomain *bd, struct bufqueue *bq) +{ + struct buf *bp; + + BQ_ASSERT_LOCKED(bq); + if (bq != bd->bd_cleanq) { + BD_LOCK(bd); + while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) { + TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); + TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp, + b_freelist); + bp->b_subqueue = bd->bd_cleanq->bq_subqueue; + } + bd->bd_cleanq->bq_len += bq->bq_len; + bq->bq_len = 0; + } + if (bd->bd_wanted) { + bd->bd_wanted = 0; + wakeup(&bd->bd_wanted); + } + if (bq != bd->bd_cleanq) + BD_UNLOCK(bd); +} + +static int +bd_flushall(struct bufdomain *bd) +{ + struct bufqueue *bq; + int flushed; + int i; + + if (bd->bd_lim == 0) + return (0); + flushed = 0; + for (i = 0; i <= mp_maxid; i++) { + bq = &bd->bd_subq[i]; + if (bq->bq_len == 0) + continue; + BQ_LOCK(bq); + bd_flush(bd, bq); + BQ_UNLOCK(bq); + flushed++; + } + + return (flushed); +} + +static void +bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock) +{ + struct bufdomain *bd; + + if (bp->b_qindex != QUEUE_NONE) + panic("bq_insert: free buffer %p onto another queue?", bp); + + bd = bufdomain(bp); + if (bp->b_flags & B_AGE) { + /* Place this buf directly on the real queue. */ + if (bq->bq_index == QUEUE_CLEAN) + bq = bd->bd_cleanq; + BQ_LOCK(bq); + TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist); + } else { + BQ_LOCK(bq); + TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); + } + bp->b_flags &= ~(B_AGE | B_REUSE); + bq->bq_len++; + bp->b_qindex = bq->bq_index; + bp->b_subqueue = bq->bq_subqueue; + + /* + * Unlock before we notify so that we don't wakeup a waiter that + * fails a trylock on the buf and sleeps again. + */ + if (unlock) + BUF_UNLOCK(bp); + + if (bp->b_qindex == QUEUE_CLEAN) { + /* + * Flush the per-cpu queue and notify any waiters. + */ + if (bd->bd_wanted || (bq != bd->bd_cleanq && + bq->bq_len >= bd->bd_lim)) + bd_flush(bd, bq); + } + BQ_UNLOCK(bq); +} + +/* + * bufkva_free: + * + * Free the kva allocation for a buffer. 
+ * + */ +static void +bufkva_free(struct buf *bp) +{ + +#ifdef INVARIANTS + if (bp->b_kvasize == 0) { + KASSERT(bp->b_kvabase == unmapped_buf && + bp->b_data == unmapped_buf, + ("Leaked KVA space on %p", bp)); + } else if (buf_mapped(bp)) + BUF_CHECK_MAPPED(bp); + else + BUF_CHECK_UNMAPPED(bp); +#endif + if (bp->b_kvasize == 0) + return; + + vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); + counter_u64_add(bufkvaspace, -bp->b_kvasize); + counter_u64_add(buffreekvacnt, 1); + bp->b_data = bp->b_kvabase = unmapped_buf; + bp->b_kvasize = 0; +} + +/* + * bufkva_alloc: + * + * Allocate the buffer KVA and set b_kvasize and b_kvabase. + */ +static int +bufkva_alloc(struct buf *bp, int maxsize, int gbflags) +{ + vm_offset_t addr; + int error; + + KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, + ("Invalid gbflags 0x%x in %s", gbflags, __func__)); + + bufkva_free(bp); + + addr = 0; + error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); + if (error != 0) { + /* + * Buffer map is too fragmented. Request the caller + * to defragment the map. + */ + return (error); + } + bp->b_kvabase = (caddr_t)addr; + bp->b_kvasize = maxsize; + counter_u64_add(bufkvaspace, bp->b_kvasize); + if ((gbflags & GB_UNMAPPED) != 0) { + bp->b_data = unmapped_buf; + BUF_CHECK_UNMAPPED(bp); + } else { + bp->b_data = bp->b_kvabase; + BUF_CHECK_MAPPED(bp); + } + return (0); +} + +/* + * bufkva_reclaim: + * + * Reclaim buffer kva by freeing buffers holding kva. This is a vmem + * callback that fires to avoid returning failure. + */ +static void +bufkva_reclaim(vmem_t *vmem, int flags) +{ + bool done; + int q; + int i; + + done = false; + for (i = 0; i < 5; i++) { + for (q = 0; q < buf_domains; q++) + if (buf_recycle(&bdomain[q], true) != 0) + done = true; + if (done) + break; + } + return; +} + +/* + * Attempt to initiate asynchronous I/O on read-ahead blocks. We must + * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, + * the buffer is valid and we do not have to do anything. + */ +static void +breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, + struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *)) +{ + struct buf *rabp; + struct thread *td; + int i; + + td = curthread; + + for (i = 0; i < cnt; i++, rablkno++, rabsize++) { + if (inmem(vp, *rablkno)) + continue; + rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); + if ((rabp->b_flags & B_CACHE) != 0) { + brelse(rabp); + continue; + } +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, rabp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ + td->td_ru.ru_inblock++; + rabp->b_flags |= B_ASYNC; + rabp->b_flags &= ~B_INVAL; + if ((flags & GB_CKHASH) != 0) { + rabp->b_flags |= B_CKHASH; + rabp->b_ckhashcalc = ckhashfunc; + } + rabp->b_ioflags &= ~BIO_ERROR; + rabp->b_iocmd = BIO_READ; + if (rabp->b_rcred == NOCRED && cred != NOCRED) + rabp->b_rcred = crhold(cred); + vfs_busy_pages(rabp, 0); + BUF_KERNPROC(rabp); + rabp->b_iooffset = dbtob(rabp->b_blkno); + bstrategy(rabp); + } +} + +/* + * Entry point for bread() and breadn() via #defines in sys/buf.h. + * + * Get a buffer with the specified data. Look in the cache first. We + * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE + * is set, the buffer is valid and we do not have to do anything, see + * getblk(). Also starts asynchronous I/O on read-ahead blocks. + * + * Always return a NULL buffer pointer (in bpp) when returning an error. 
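+ *
+ * A minimal caller sketch, assuming the bread() wrapper from sys/buf.h
+ * (illustrative only):
+ *
+ *	struct buf *bp;
+ *	int error;
+ *
+ *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
+ *	if (error != 0)
+ *		return (error);		(bp comes back NULL on error)
+ *	... inspect bp->b_data ...
+ *	brelse(bp);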
+ */ +int +breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, + int *rabsize, int cnt, struct ucred *cred, int flags, + void (*ckhashfunc)(struct buf *), struct buf **bpp) +{ + struct buf *bp; + struct thread *td; + int error, readwait, rv; + + CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); + td = curthread; + /* + * Can only return NULL if GB_LOCK_NOWAIT or GB_SPARSE flags + * are specified. + */ + error = getblkx(vp, blkno, size, 0, 0, flags, &bp); + if (error != 0) { + *bpp = NULL; + return (error); + } + flags &= ~GB_NOSPARSE; + *bpp = bp; + + /* + * If not found in cache, do some I/O + */ + readwait = 0; + if ((bp->b_flags & B_CACHE) == 0) { +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_add_buf(td->td_proc, bp, 0); + PROC_UNLOCK(td->td_proc); + } +#endif /* RACCT */ + td->td_ru.ru_inblock++; + bp->b_iocmd = BIO_READ; + bp->b_flags &= ~B_INVAL; + if ((flags & GB_CKHASH) != 0) { + bp->b_flags |= B_CKHASH; + bp->b_ckhashcalc = ckhashfunc; + } + bp->b_ioflags &= ~BIO_ERROR; + if (bp->b_rcred == NOCRED && cred != NOCRED) + bp->b_rcred = crhold(cred); + vfs_busy_pages(bp, 0); + bp->b_iooffset = dbtob(bp->b_blkno); + bstrategy(bp); + ++readwait; + } + + /* + * Attempt to initiate asynchronous I/O on read-ahead blocks. + */ + breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc); + + rv = 0; + if (readwait) { + rv = bufwait(bp); + if (rv != 0) { + brelse(bp); + *bpp = NULL; + } + } + return (rv); +} + +/* + * Write, release buffer on completion. (Done by iodone + * if async). Do not bother writing anything if the buffer + * is invalid. + * + * Note that we set B_CACHE here, indicating that buffer is + * fully valid and thus cacheable. This is true even of NFS + * now so we set it generally. This could be set either here + * or in biodone() since the I/O is synchronous. We put it + * here. + */ +int +bufwrite(struct buf *bp) +{ + int oldflags; + struct vnode *vp; + long space; + int vp_md; + + CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { + bp->b_flags |= B_INVAL | B_RELBUF; + bp->b_flags &= ~B_CACHE; + brelse(bp); + return (ENXIO); + } + if (bp->b_flags & B_INVAL) { + brelse(bp); + return (0); + } + + if (bp->b_flags & B_BARRIER) + atomic_add_long(&barrierwrites, 1); + + oldflags = bp->b_flags; + + BUF_ASSERT_HELD(bp); + + KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), + ("FFS background buffer should not get here %p", bp)); + + vp = bp->b_vp; + if (vp) + vp_md = vp->v_vflag & VV_MD; + else + vp_md = 0; + + /* + * Mark the buffer clean. Increment the bufobj write count + * before bundirty() call, to prevent other thread from seeing + * empty dirty list and zero counter for writes in progress, + * falsely indicating that the bufobj is clean. 
+ */ + bufobj_wref(bp->b_bufobj); + bundirty(bp); + + bp->b_flags &= ~B_DONE; + bp->b_ioflags &= ~BIO_ERROR; + bp->b_flags |= B_CACHE; + bp->b_iocmd = BIO_WRITE; + + vfs_busy_pages(bp, 1); + + /* + * Normal bwrites pipeline writes + */ + bp->b_runningbufspace = bp->b_bufsize; + space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); + +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 1); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ + curthread->td_ru.ru_oublock++; + if (oldflags & B_ASYNC) + BUF_KERNPROC(bp); + bp->b_iooffset = dbtob(bp->b_blkno); + buf_track(bp, __func__); + bstrategy(bp); + + if ((oldflags & B_ASYNC) == 0) { + int rtval = bufwait(bp); + brelse(bp); + return (rtval); + } else if (space > hirunningspace) { + /* + * don't allow the async write to saturate the I/O + * system. We will not deadlock here because + * we are blocking waiting for I/O that is already in-progress + * to complete. We do not block here if it is the update + * or syncer daemon trying to clean up as that can lead + * to deadlock. + */ + if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) + waitrunningbufspace(); + } + + return (0); +} + +void +bufbdflush(struct bufobj *bo, struct buf *bp) +{ + struct buf *nbp; + + if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { + (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); + altbufferflushes++; + } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { + BO_LOCK(bo); + /* + * Try to find a buffer to flush. + */ + TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { + if ((nbp->b_vflags & BV_BKGRDINPROG) || + BUF_LOCK(nbp, + LK_EXCLUSIVE | LK_NOWAIT, NULL)) + continue; + if (bp == nbp) + panic("bdwrite: found ourselves"); + BO_UNLOCK(bo); + /* Don't countdeps with the bo lock held. */ + if (buf_countdeps(nbp, 0)) { + BO_LOCK(bo); + BUF_UNLOCK(nbp); + continue; + } + if (nbp->b_flags & B_CLUSTEROK) { + vfs_bio_awrite(nbp); + } else { + bremfree(nbp); + bawrite(nbp); + } + dirtybufferflushes++; + break; + } + if (nbp == NULL) + BO_UNLOCK(bo); + } +} + +/* + * Delayed write. (Buffer is marked dirty). Do not bother writing + * anything if the buffer is marked invalid. + * + * Note that since the buffer must be completely valid, we can safely + * set B_CACHE. In fact, we have to set B_CACHE here rather then in + * biodone() in order to prevent getblk from writing the buffer + * out synchronously. + */ +void +bdwrite(struct buf *bp) +{ + struct thread *td = curthread; + struct vnode *vp; + struct bufobj *bo; + + CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); + KASSERT((bp->b_flags & B_BARRIER) == 0, + ("Barrier request in delayed write %p", bp)); + BUF_ASSERT_HELD(bp); + + if (bp->b_flags & B_INVAL) { + brelse(bp); + return; + } + + /* + * If we have too many dirty buffers, don't create any more. + * If we are wildly over our limit, then force a complete + * cleanup. Otherwise, just keep the situation from getting + * out of control. Note that we have to avoid a recursive + * disaster and not try to clean up after our own cleanup! + */ + vp = bp->b_vp; + bo = bp->b_bufobj; + if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { + td->td_pflags |= TDP_INBDFLUSH; + BO_BDFLUSH(bo, bp); + td->td_pflags &= ~TDP_INBDFLUSH; + } else + recursiveflushes++; + + bdirty(bp); + /* + * Set B_CACHE, indicating that the buffer is fully valid. This is + * true even of NFS now. 
+ */ + bp->b_flags |= B_CACHE; + + /* + * This bmap keeps the system from needing to do the bmap later, + * perhaps when the system is attempting to do a sync. Since it + * is likely that the indirect block -- or whatever other datastructure + * that the filesystem needs is still in memory now, it is a good + * thing to do this. Note also, that if the pageout daemon is + * requesting a sync -- there might not be enough memory to do + * the bmap then... So, this is important to do. + */ + if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { + VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); + } + + buf_track(bp, __func__); + + /* + * Set the *dirty* buffer range based upon the VM system dirty + * pages. + * + * Mark the buffer pages as clean. We need to do this here to + * satisfy the vnode_pager and the pageout daemon, so that it + * thinks that the pages have been "cleaned". Note that since + * the pages are in a delayed write buffer -- the VFS layer + * "will" see that the pages get written out on the next sync, + * or perhaps the cluster will be completed. + */ + vfs_clean_pages_dirty_buf(bp); + bqrelse(bp); + + /* + * note: we cannot initiate I/O from a bdwrite even if we wanted to, + * due to the softdep code. + */ +} + +/* + * bdirty: + * + * Turn buffer into delayed write request. We must clear BIO_READ and + * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to + * itself to properly update it in the dirty/clean lists. We mark it + * B_DONE to ensure that any asynchronization of the buffer properly + * clears B_DONE ( else a panic will occur later ). + * + * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which + * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() + * should only be called if the buffer is known-good. + * + * Since the buffer is not on a queue, we do not update the numfreebuffers + * count. + * + * The buffer must be on QUEUE_NONE. + */ +void +bdirty(struct buf *bp) +{ + + CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", + bp, bp->b_vp, bp->b_flags); + KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); + KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, + ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); + BUF_ASSERT_HELD(bp); + bp->b_flags &= ~(B_RELBUF); + bp->b_iocmd = BIO_WRITE; + + if ((bp->b_flags & B_DELWRI) == 0) { + bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; + reassignbuf(bp); + bdirtyadd(bp); + } +} + +/* + * bundirty: + * + * Clear B_DELWRI for buffer. + * + * Since the buffer is not on a queue, we do not update the numfreebuffers + * count. + * + * The buffer must be on QUEUE_NONE. + */ + +void +bundirty(struct buf *bp) +{ + + CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); + KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, + ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); + BUF_ASSERT_HELD(bp); + + if (bp->b_flags & B_DELWRI) { + bp->b_flags &= ~B_DELWRI; + reassignbuf(bp); + bdirtysub(bp); + } + /* + * Since it is now being written, we can clear its deferred write flag. + */ + bp->b_flags &= ~B_DEFERRED; +} + +/* + * bawrite: + * + * Asynchronous write. Start output on a buffer, but do not wait for + * it to complete. The buffer is released when the output completes. + * + * bwrite() ( or the VOP routine anyway ) is responsible for handling + * B_INVAL buffers. Not us. 
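+ *
+ *	(For contrast with the other write paths earlier in this file:
+ *	bwrite() waits for the I/O unless B_ASYNC is already set,
+ *	bdwrite() only marks the buffer dirty and requeues it without
+ *	starting I/O, and bawrite() below is simply bwrite() with
+ *	B_ASYNC set.)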
+ */ +void +bawrite(struct buf *bp) +{ + + bp->b_flags |= B_ASYNC; + (void) bwrite(bp); +} + +/* + * babarrierwrite: + * + * Asynchronous barrier write. Start output on a buffer, but do not + * wait for it to complete. Place a write barrier after this write so + * that this buffer and all buffers written before it are committed to + * the disk before any buffers written after this write are committed + * to the disk. The buffer is released when the output completes. + */ +void +babarrierwrite(struct buf *bp) +{ + + bp->b_flags |= B_ASYNC | B_BARRIER; + (void) bwrite(bp); +} + +/* + * bbarrierwrite: + * + * Synchronous barrier write. Start output on a buffer and wait for + * it to complete. Place a write barrier after this write so that + * this buffer and all buffers written before it are committed to + * the disk before any buffers written after this write are committed + * to the disk. The buffer is released when the output completes. + */ +int +bbarrierwrite(struct buf *bp) +{ + + bp->b_flags |= B_BARRIER; + return (bwrite(bp)); +} + +/* + * bwillwrite: + * + * Called prior to the locking of any vnodes when we are expecting to + * write. We do not want to starve the buffer cache with too many + * dirty buffers so we block here. By blocking prior to the locking + * of any vnodes we attempt to avoid the situation where a locked vnode + * prevents the various system daemons from flushing related buffers. + */ +void +bwillwrite(void) +{ + + if (buf_dirty_count_severe()) { + mtx_lock(&bdirtylock); + while (buf_dirty_count_severe()) { + bdirtywait = 1; + msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), + "flswai", 0); + } + mtx_unlock(&bdirtylock); + } +} + +/* + * Return true if we have too many dirty buffers. + */ +int +buf_dirty_count_severe(void) +{ + + return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty)); +} + +/* + * brelse: + * + * Release a busy buffer and, if requested, free its resources. The + * buffer will be stashed in the appropriate bufqueue[] allowing it + * to be accessed later as a cache entity or reused for other purposes. + */ +void +brelse(struct buf *bp) +{ + struct mount *v_mnt; + int qindex; + + /* + * Many functions erroneously call brelse with a NULL bp under rare + * error conditions. Simply return when called with a NULL bp. + */ + if (bp == NULL) + return; + CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", + bp, bp->b_vp, bp->b_flags); + KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), + ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, + ("brelse: non-VMIO buffer marked NOREUSE")); + + if (BUF_LOCKRECURSED(bp)) { + /* + * Do not process, in particular, do not handle the + * B_INVAL/B_RELBUF and do not release to free list. + */ + BUF_UNLOCK(bp); + return; + } + + if (bp->b_flags & B_MANAGED) { + bqrelse(bp); + return; + } + + if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { + BO_LOCK(bp->b_bufobj); + bp->b_vflags &= ~BV_BKGRDERR; + BO_UNLOCK(bp->b_bufobj); + bdirty(bp); + } + if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && + (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && + !(bp->b_flags & B_INVAL)) { + /* + * Failed write, redirty. All errors except ENXIO (which + * means the device is gone) are treated as being + * transient. + * + * XXX Treating EIO as transient is not correct; the + * contract with the local storage device drivers is that + * they will only return EIO once the I/O is no longer + * retriable. 
Network I/O also respects this through the + * guarantees of TCP and/or the internal retries of NFS. + * ENOMEM might be transient, but we also have no way of + * knowing when its ok to retry/reschedule. In general, + * this entire case should be made obsolete through better + * error handling/recovery and resource scheduling. + * + * Do this also for buffers that failed with ENXIO, but have + * non-empty dependencies - the soft updates code might need + * to access the buffer to untangle them. + * + * Must clear BIO_ERROR to prevent pages from being scrapped. + */ + bp->b_ioflags &= ~BIO_ERROR; + bdirty(bp); + } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || + (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { + /* + * Either a failed read I/O, or we were asked to free or not + * cache the buffer, or we failed to write to a device that's + * no longer present. + */ + bp->b_flags |= B_INVAL; + if (!LIST_EMPTY(&bp->b_dep)) + buf_deallocate(bp); + if (bp->b_flags & B_DELWRI) + bdirtysub(bp); + bp->b_flags &= ~(B_DELWRI | B_CACHE); + if ((bp->b_flags & B_VMIO) == 0) { + allocbuf(bp, 0); + if (bp->b_vp) + brelvp(bp); + } + } + + /* + * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() + * is called with B_DELWRI set, the underlying pages may wind up + * getting freed causing a previous write (bdwrite()) to get 'lost' + * because pages associated with a B_DELWRI bp are marked clean. + * + * We still allow the B_INVAL case to call vfs_vmio_truncate(), even + * if B_DELWRI is set. + */ + if (bp->b_flags & B_DELWRI) + bp->b_flags &= ~B_RELBUF; + + /* + * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer + * constituted, not even NFS buffers now. Two flags effect this. If + * B_INVAL, the struct buf is invalidated but the VM object is kept + * around ( i.e. so it is trivial to reconstitute the buffer later ). + * + * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be + * invalidated. BIO_ERROR cannot be set for a failed write unless the + * buffer is also B_INVAL because it hits the re-dirtying code above. + * + * Normally we can do this whether a buffer is B_DELWRI or not. If + * the buffer is an NFS buffer, it is tracking piecemeal writes or + * the commit state and we cannot afford to lose the buffer. If the + * buffer has a background write in progress, we need to keep it + * around to prevent it from being reconstituted and starting a second + * background write. + */ + + v_mnt = bp->b_vp != NULL ? bp->b_vp->v_mount : NULL; + + if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || + (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && + (v_mnt == NULL || (v_mnt->mnt_vfc->vfc_flags & VFCF_NETWORK) == 0 || + vn_isdisk(bp->b_vp, NULL) || (bp->b_flags & B_DELWRI) == 0)) { + vfs_vmio_invalidate(bp); + allocbuf(bp, 0); + } + + if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || + (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { + allocbuf(bp, 0); + bp->b_flags &= ~B_NOREUSE; + if (bp->b_vp != NULL) + brelvp(bp); + } + + /* + * If the buffer has junk contents signal it and eventually + * clean up B_DELWRI and diassociate the vnode so that gbincore() + * doesn't find it. 
+ */ + if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || + (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) + bp->b_flags |= B_INVAL; + if (bp->b_flags & B_INVAL) { + if (bp->b_flags & B_DELWRI) + bundirty(bp); + if (bp->b_vp) + brelvp(bp); + } + + buf_track(bp, __func__); + + /* buffers with no memory */ + if (bp->b_bufsize == 0) { + buf_free(bp); + return; + } + /* buffers with junk contents */ + if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || + (bp->b_ioflags & BIO_ERROR)) { + bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("losing buffer 2"); + qindex = QUEUE_CLEAN; + bp->b_flags |= B_AGE; + /* remaining buffers */ + } else if (bp->b_flags & B_DELWRI) + qindex = QUEUE_DIRTY; + else + qindex = QUEUE_CLEAN; + + if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) + panic("brelse: not dirty"); + + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT); + /* binsfree unlocks bp. */ + binsfree(bp, qindex); +} + +/* + * Release a buffer back to the appropriate queue but do not try to free + * it. The buffer is expected to be used again soon. + * + * bqrelse() is used by bdwrite() to requeue a delayed write, and used by + * biodone() to requeue an async I/O on completion. It is also used when + * known good buffers need to be requeued but we think we may need the data + * again soon. + * + * XXX we should be able to leave the B_RELBUF hint set on completion. + */ +void +bqrelse(struct buf *bp) +{ + int qindex; + + CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), + ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + + qindex = QUEUE_NONE; + if (BUF_LOCKRECURSED(bp)) { + /* do not release to free list */ + BUF_UNLOCK(bp); + return; + } + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + + if (bp->b_flags & B_MANAGED) { + if (bp->b_flags & B_REMFREE) + bremfreef(bp); + goto out; + } + + /* buffers with stale but valid contents */ + if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | + BV_BKGRDERR)) == BV_BKGRDERR) { + BO_LOCK(bp->b_bufobj); + bp->b_vflags &= ~BV_BKGRDERR; + BO_UNLOCK(bp->b_bufobj); + qindex = QUEUE_DIRTY; + } else { + if ((bp->b_flags & B_DELWRI) == 0 && + (bp->b_xflags & BX_VNDIRTY)) + panic("bqrelse: not dirty"); + if ((bp->b_flags & B_NOREUSE) != 0) { + brelse(bp); + return; + } + qindex = QUEUE_CLEAN; + } + buf_track(bp, __func__); + /* binsfree unlocks bp. */ + binsfree(bp, qindex); + return; + +out: + buf_track(bp, __func__); + /* unlock */ + BUF_UNLOCK(bp); +} + +/* + * Complete I/O to a VMIO backed page. Validate the pages as appropriate, + * restore bogus pages. 
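+ * (The bogus_page entries handled here are placeholders that
+ * vfs_busy_pages() may have installed over already-valid pages so that
+ * the read does not clobber their contents; they are swapped back via
+ * vm_page_lookup() and, for mapped buffers, re-entered by the closing
+ * pmap_qenter().)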
+ */ +static void +vfs_vmio_iodone(struct buf *bp) +{ + vm_ooffset_t foff; + vm_page_t m; + vm_object_t obj; + struct vnode *vp __unused; + int i, iosize, resid; + bool bogus; + + obj = bp->b_bufobj->bo_object; + KASSERT(obj->paging_in_progress >= bp->b_npages, + ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", + obj->paging_in_progress, bp->b_npages)); + + vp = bp->b_vp; + KASSERT(vp->v_holdcnt > 0, + ("vfs_vmio_iodone: vnode %p has zero hold count", vp)); + KASSERT(vp->v_object != NULL, + ("vfs_vmio_iodone: vnode %p has no vm_object", vp)); + + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); + + bogus = false; + iosize = bp->b_bcount - bp->b_resid; + VM_OBJECT_WLOCK(obj); + for (i = 0; i < bp->b_npages; i++) { + resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; + if (resid > iosize) + resid = iosize; + + /* + * cleanup bogus pages, restoring the originals + */ + m = bp->b_pages[i]; + if (m == bogus_page) { + bogus = true; + m = vm_page_lookup(obj, OFF_TO_IDX(foff)); + if (m == NULL) + panic("biodone: page disappeared!"); + bp->b_pages[i] = m; + } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { + /* + * In the write case, the valid and clean bits are + * already changed correctly ( see bdwrite() ), so we + * only need to do this here in the read case. + */ + KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, + resid)) == 0, ("vfs_vmio_iodone: page %p " + "has unexpected dirty bits", m)); + vfs_page_set_valid(bp, foff, m); + } + KASSERT(OFF_TO_IDX(foff) == m->pindex, + ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", + (intmax_t)foff, (uintmax_t)m->pindex)); + + vm_page_sunbusy(m); + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + iosize -= resid; + } + vm_object_pip_wakeupn(obj, bp->b_npages); + VM_OBJECT_WUNLOCK(obj); + if (bogus && buf_mapped(bp)) { + BUF_CHECK_MAPPED(bp); + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), + bp->b_pages, bp->b_npages); + } +} + +/* + * Perform page invalidation when a buffer is released. The fully invalid + * pages will be reclaimed later in vfs_vmio_truncate(). + */ +static void +vfs_vmio_invalidate(struct buf *bp) +{ + vm_object_t obj; + vm_page_t m; + int flags, i, resid, poffset, presid; + + if (buf_mapped(bp)) { + BUF_CHECK_MAPPED(bp); + pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); + } else + BUF_CHECK_UNMAPPED(bp); + /* + * Get the base offset and length of the buffer. Note that + * in the VMIO case if the buffer block size is not + * page-aligned then b_data pointer may not be page-aligned. + * But our b_pages[] array *IS* page aligned. + * + * block sizes less then DEV_BSIZE (usually 512) are not + * supported due to the page granularity bits (m->valid, + * m->dirty, etc...). + * + * See man buf(9) for more information + */ + flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; + obj = bp->b_bufobj->bo_object; + resid = bp->b_bufsize; + poffset = bp->b_offset & PAGE_MASK; + VM_OBJECT_WLOCK(obj); + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + if (m == bogus_page) + panic("vfs_vmio_invalidate: Unexpected bogus page."); + bp->b_pages[i] = NULL; + + presid = resid > (PAGE_SIZE - poffset) ? 
+ (PAGE_SIZE - poffset) : resid; + KASSERT(presid >= 0, ("brelse: extra page")); + while (vm_page_xbusied(m)) { + vm_page_lock(m); + VM_OBJECT_WUNLOCK(obj); + vm_page_busy_sleep(m, "mbncsh", true); + VM_OBJECT_WLOCK(obj); + } + if (pmap_page_wired_mappings(m) == 0) + vm_page_set_invalid(m, poffset, presid); + vm_page_release_locked(m, flags); + resid -= presid; + poffset = 0; + } + VM_OBJECT_WUNLOCK(obj); + bp->b_npages = 0; +} + +/* + * Page-granular truncation of an existing VMIO buffer. + */ +static void +vfs_vmio_truncate(struct buf *bp, int desiredpages) +{ + vm_object_t obj; + vm_page_t m; + int flags, i; + + if (bp->b_npages == desiredpages) + return; + + if (buf_mapped(bp)) { + BUF_CHECK_MAPPED(bp); + pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); + } else + BUF_CHECK_UNMAPPED(bp); + + /* + * The object lock is needed only if we will attempt to free pages. + */ + flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; + if ((bp->b_flags & B_DIRECT) != 0) { + flags |= VPR_TRYFREE; + obj = bp->b_bufobj->bo_object; + VM_OBJECT_WLOCK(obj); + } else { + obj = NULL; + } + for (i = desiredpages; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + KASSERT(m != bogus_page, ("allocbuf: bogus page found")); + bp->b_pages[i] = NULL; + if (obj != NULL) + vm_page_release_locked(m, flags); + else + vm_page_release(m, flags); + } + if (obj != NULL) + VM_OBJECT_WUNLOCK(obj); + bp->b_npages = desiredpages; +} + +/* + * Byte granular extension of VMIO buffers. + */ +static void +vfs_vmio_extend(struct buf *bp, int desiredpages, int size) +{ + /* + * We are growing the buffer, possibly in a + * byte-granular fashion. + */ + vm_object_t obj; + vm_offset_t toff; + vm_offset_t tinc; + vm_page_t m; + + /* + * Step 1, bring in the VM pages from the object, allocating + * them if necessary. We must clear B_CACHE if these pages + * are not valid for the range covered by the buffer. + */ + obj = bp->b_bufobj->bo_object; + VM_OBJECT_WLOCK(obj); + if (bp->b_npages < desiredpages) { + /* + * We must allocate system pages since blocking + * here could interfere with paging I/O, no + * matter which process we are. + * + * Only exclusive busy can be tested here. + * Blocking on shared busy might lead to + * deadlocks once allocbuf() is called after + * pages are vfs_busy_pages(). + */ + (void)vm_page_grab_pages(obj, + OFF_TO_IDX(bp->b_offset) + bp->b_npages, + VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY | + VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, + &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages); + bp->b_npages = desiredpages; + } + + /* + * Step 2. We've loaded the pages into the buffer, + * we have to figure out if we can still have B_CACHE + * set. Note that B_CACHE is set according to the + * byte-granular range ( bcount and size ), not the + * aligned range ( newbsize ). + * + * The VM test is against m->valid, which is DEV_BSIZE + * aligned. Needless to say, the validity of the data + * needs to also be DEV_BSIZE aligned. Note that this + * fails with NFS if the server or some other client + * extends the file's EOF. If our buffer is resized, + * B_CACHE may remain set! 
XXX + */ + toff = bp->b_bcount; + tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); + while ((bp->b_flags & B_CACHE) && toff < size) { + vm_pindex_t pi; + + if (tinc > (size - toff)) + tinc = size - toff; + pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; + m = bp->b_pages[pi]; + vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); + toff += tinc; + tinc = PAGE_SIZE; + } + VM_OBJECT_WUNLOCK(obj); + + /* + * Step 3, fixup the KVA pmap. + */ + if (buf_mapped(bp)) + bpmap_qenter(bp); + else + BUF_CHECK_UNMAPPED(bp); +} + +/* + * Check to see if a block at a particular lbn is available for a clustered + * write. + */ +static int +vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) +{ + struct buf *bpa; + int match; + + match = 0; + + /* If the buf isn't in core skip it */ + if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) + return (0); + + /* If the buf is busy we don't want to wait for it */ + if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) + return (0); + + /* Only cluster with valid clusterable delayed write buffers */ + if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != + (B_DELWRI | B_CLUSTEROK)) + goto done; + + if (bpa->b_bufsize != size) + goto done; + + /* + * Check to see if it is in the expected place on disk and that the + * block has been mapped. + */ + if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) + match = 1; +done: + BUF_UNLOCK(bpa); + return (match); +} + +/* + * vfs_bio_awrite: + * + * Implement clustered async writes for clearing out B_DELWRI buffers. + * This is much better then the old way of writing only one buffer at + * a time. Note that we may not be presented with the buffers in the + * correct order, so we search for the cluster in both directions. + */ +int +vfs_bio_awrite(struct buf *bp) +{ + struct bufobj *bo; + int i; + int j; + daddr_t lblkno = bp->b_lblkno; + struct vnode *vp = bp->b_vp; + int ncl; + int nwritten; + int size; + int maxcl; + int gbflags; + + bo = &vp->v_bufobj; + gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; + /* + * right now we support clustered writing only to regular files. If + * we find a clusterable block we could be in the middle of a cluster + * rather then at the beginning. + */ + if ((vp->v_type == VREG) && + (vp->v_mount != 0) && /* Only on nodes that have the size info */ + (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { + + size = vp->v_mount->mnt_stat.f_iosize; + maxcl = MAXPHYS / size; + + BO_RLOCK(bo); + for (i = 1; i < maxcl; i++) + if (vfs_bio_clcheck(vp, size, lblkno + i, + bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) + break; + + for (j = 1; i + j <= maxcl && j <= lblkno; j++) + if (vfs_bio_clcheck(vp, size, lblkno - j, + bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) + break; + BO_RUNLOCK(bo); + --j; + ncl = i + j; + /* + * this is a possible cluster write + */ + if (ncl != 1) { + BUF_UNLOCK(bp); + nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, + gbflags); + return (nwritten); + } + } + bremfree(bp); + bp->b_flags |= B_ASYNC; + /* + * default (old) behavior, writing out only one block + * + * XXX returns b_bufsize instead of b_bcount for nwritten? + */ + nwritten = bp->b_bufsize; + (void) bwrite(bp); + + return (nwritten); +} + +/* + * getnewbuf_kva: + * + * Allocate KVA for an empty buf header according to gbflags. 
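+ *	(From the flag test below: KVA is reserved unless the caller asked
+ *	for GB_UNMAPPED without GB_KVAALLOC, i.e. a buffer that will never
+ *	be mapped; getnewbuf() asserts against the reverse combination.)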
+ */ +static int +getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) +{ + + if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { + /* + * In order to keep fragmentation sane we only allocate kva + * in BKVASIZE chunks. XXX with vmem we can do page size. + */ + maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; + + if (maxsize != bp->b_kvasize && + bufkva_alloc(bp, maxsize, gbflags)) + return (ENOSPC); + } + return (0); +} + +/* + * getnewbuf: + * + * Find and initialize a new buffer header, freeing up existing buffers + * in the bufqueues as necessary. The new buffer is returned locked. + * + * We block if: + * We have insufficient buffer headers + * We have insufficient buffer space + * buffer_arena is too fragmented ( space reservation fails ) + * If we have to flush dirty buffers ( but we try to avoid this ) + * + * The caller is responsible for releasing the reserved bufspace after + * allocbuf() is called. + */ +static struct buf * +getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) +{ + struct bufdomain *bd; + struct buf *bp; + bool metadata, reserved; + + bp = NULL; + KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, + ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); + if (!unmapped_buf_allowed) + gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); + + if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || + vp->v_type == VCHR) + metadata = true; + else + metadata = false; + if (vp == NULL) + bd = &bdomain[0]; + else + bd = &bdomain[vp->v_bufobj.bo_domain]; + + counter_u64_add(getnewbufcalls, 1); + reserved = false; + do { + if (reserved == false && + bufspace_reserve(bd, maxsize, metadata) != 0) { + counter_u64_add(getnewbufrestarts, 1); + continue; + } + reserved = true; + if ((bp = buf_alloc(bd)) == NULL) { + counter_u64_add(getnewbufrestarts, 1); + continue; + } + if (getnewbuf_kva(bp, gbflags, maxsize) == 0) + return (bp); + break; + } while (buf_recycle(bd, false) == 0); + + if (reserved) + bufspace_release(bd, maxsize); + if (bp != NULL) { + bp->b_flags |= B_INVAL; + brelse(bp); + } + bufspace_wait(bd, vp, gbflags, slpflag, slptimeo); + + return (NULL); +} + +/* + * buf_daemon: + * + * buffer flushing daemon. Buffers are normally flushed by the + * update daemon but if it cannot keep up this process starts to + * take the load in an attempt to prevent getnewbuf() from blocking. + */ +static struct kproc_desc buf_kp = { + "bufdaemon", + buf_daemon, + &bufdaemonproc +}; +SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); + +static int +buf_flush(struct vnode *vp, struct bufdomain *bd, int target) +{ + int flushed; + + flushed = flushbufqueues(vp, bd, target, 0); + if (flushed == 0) { + /* + * Could not find any buffers without rollback + * dependencies, so just write the first one + * in the hopes of eventually making progress. + */ + if (vp != NULL && target > 2) + target /= 2; + flushbufqueues(vp, bd, target, 1); + } + return (flushed); +} + +static void +buf_daemon() +{ + struct bufdomain *bd; + int speedupreq; + int lodirty; + int i; + + /* + * This process needs to be suspended prior to shutdown sync. + */ + EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread, + SHUTDOWN_PRI_LAST + 100); + + /* + * Start the buf clean daemons as children threads. 
+ */ + for (i = 0 ; i < buf_domains; i++) { + int error; + + error = kthread_add((void (*)(void *))bufspace_daemon, + &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i); + if (error) + panic("error %d spawning bufspace daemon", error); + } + + /* + * This process is allowed to take the buffer cache to the limit + */ + curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; + mtx_lock(&bdlock); + for (;;) { + bd_request = 0; + mtx_unlock(&bdlock); + + kthread_suspend_check(); + + /* + * Save speedupreq for this pass and reset to capture new + * requests. + */ + speedupreq = bd_speedupreq; + bd_speedupreq = 0; + + /* + * Flush each domain sequentially according to its level and + * the speedup request. + */ + for (i = 0; i < buf_domains; i++) { + bd = &bdomain[i]; + if (speedupreq) + lodirty = bd->bd_numdirtybuffers / 2; + else + lodirty = bd->bd_lodirtybuffers; + while (bd->bd_numdirtybuffers > lodirty) { + if (buf_flush(NULL, bd, + bd->bd_numdirtybuffers - lodirty) == 0) + break; + kern_yield(PRI_USER); + } + } + + /* + * Only clear bd_request if we have reached our low water + * mark. The buf_daemon normally waits 1 second and + * then incrementally flushes any dirty buffers that have + * built up, within reason. + * + * If we were unable to hit our low water mark and couldn't + * find any flushable buffers, we sleep for a short period + * to avoid endless loops on unlockable buffers. + */ + mtx_lock(&bdlock); + if (!BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) { + /* + * We reached our low water mark, reset the + * request and sleep until we are needed again. + * The sleep is just so the suspend code works. + */ + bd_request = 0; + /* + * Do an extra wakeup in case dirty threshold + * changed via sysctl and the explicit transition + * out of shortfall was missed. + */ + bdirtywakeup(); + if (runningbufspace <= lorunningspace) + runningwakeup(); + msleep(&bd_request, &bdlock, PVM, "psleep", hz); + } else { + /* + * We couldn't find any flushable dirty buffers but + * still have too many dirty buffers, we + * have to sleep and try again. (rare) + */ + msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); + } + } +} + +/* + * flushbufqueues: + * + * Try to flush a buffer in the dirty queue. We must be careful to + * free up B_INVAL buffers instead of write them, which NFS is + * particularly sensitive to. + */ +static int flushwithdeps = 0; +SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, + 0, "Number of buffers flushed with dependecies that require rollbacks"); + +static int +flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target, + int flushdeps) +{ + struct bufqueue *bq; + struct buf *sentinel; + struct vnode *vp; + struct mount *mp; + struct buf *bp; + int hasdeps; + int flushed; + int error; + bool unlock; + + flushed = 0; + bq = &bd->bd_dirtyq; + bp = NULL; + sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); + sentinel->b_qindex = QUEUE_SENTINEL; + BQ_LOCK(bq); + TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist); + BQ_UNLOCK(bq); + while (flushed != target) { + maybe_yield(); + BQ_LOCK(bq); + bp = TAILQ_NEXT(sentinel, b_freelist); + if (bp != NULL) { + TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); + TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel, + b_freelist); + } else { + BQ_UNLOCK(bq); + break; + } + /* + * Skip sentinels inserted by other invocations of the + * flushbufqueues(), taking care to not reorder them. + * + * Only flush the buffers that belong to the + * vnode locked by the curthread. 
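The queue walk above is easier to see in isolation: flushbufqueues() parks a private sentinel node in the dirty queue so that it can drop the queue lock while flushing an element and still resume from the right position afterwards. A single-threaded userspace sketch of the same walk using the sys/queue.h TAILQ macros (the lock drop is only indicated by a comment):

#include <sys/queue.h>
#include <stdio.h>

struct node {
        TAILQ_ENTRY(node) link;
        int val;                        /* -1 marks a sentinel */
};
TAILQ_HEAD(nodeq, node);

/*
 * Walk the queue the way flushbufqueues() does: always process the element
 * right after the sentinel, moving the sentinel past that element first.
 * Between iterations the queue lock (not shown) can be dropped, and
 * concurrent insertions or removals do not invalidate the walker's position.
 */
static void
walk_with_sentinel(struct nodeq *q)
{
        struct node sentinel = { .val = -1 };
        struct node *n;

        TAILQ_INSERT_HEAD(q, &sentinel, link);
        for (;;) {
                n = TAILQ_NEXT(&sentinel, link);
                if (n == NULL)
                        break;
                TAILQ_REMOVE(q, &sentinel, link);
                TAILQ_INSERT_AFTER(q, n, &sentinel, link);
                if (n->val == -1)
                        continue;       /* another walker's sentinel: skip */
                /* ... drop the lock and "flush" the element here ... */
                printf("visiting %d\n", n->val);
        }
        TAILQ_REMOVE(q, &sentinel, link);
}

int
main(void)
{
        struct nodeq q = TAILQ_HEAD_INITIALIZER(q);
        struct node n[4];

        for (int i = 0; i < 4; i++) {
                n[i].val = i;
                TAILQ_INSERT_TAIL(&q, &n[i], link);
        }
        walk_with_sentinel(&q);
        return (0);
}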
+ */ + if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && + bp->b_vp != lvp)) { + BQ_UNLOCK(bq); + continue; + } + error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); + BQ_UNLOCK(bq); + if (error != 0) + continue; + + /* + * BKGRDINPROG can only be set with the buf and bufobj + * locks both held. We tolerate a race to clear it here. + */ + if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || + (bp->b_flags & B_DELWRI) == 0) { + BUF_UNLOCK(bp); + continue; + } + if (bp->b_flags & B_INVAL) { + bremfreef(bp); + brelse(bp); + flushed++; + continue; + } + + if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { + if (flushdeps == 0) { + BUF_UNLOCK(bp); + continue; + } + hasdeps = 1; + } else + hasdeps = 0; + /* + * We must hold the lock on a vnode before writing + * one of its buffers. Otherwise we may confuse, or + * in the case of a snapshot vnode, deadlock the + * system. + * + * The lock order here is the reverse of the normal + * of vnode followed by buf lock. This is ok because + * the NOWAIT will prevent deadlock. + */ + vp = bp->b_vp; + if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { + BUF_UNLOCK(bp); + continue; + } + if (lvp == NULL) { + unlock = true; + error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); + } else { + ASSERT_VOP_LOCKED(vp, "getbuf"); + unlock = false; + error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : + vn_lock(vp, LK_TRYUPGRADE); + } + if (error == 0) { + CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", + bp, bp->b_vp, bp->b_flags); + if (curproc == bufdaemonproc) { + vfs_bio_awrite(bp); + } else { + bremfree(bp); + bwrite(bp); + counter_u64_add(notbufdflushes, 1); + } + vn_finished_write(mp); + if (unlock) + VOP_UNLOCK(vp, 0); + flushwithdeps += hasdeps; + flushed++; + + /* + * Sleeping on runningbufspace while holding + * vnode lock leads to deadlock. + */ + if (curproc == bufdaemonproc && + runningbufspace > hirunningspace) + waitrunningbufspace(); + continue; + } + vn_finished_write(mp); + BUF_UNLOCK(bp); + } + BQ_LOCK(bq); + TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); + BQ_UNLOCK(bq); + free(sentinel, M_TEMP); + return (flushed); +} + +/* + * Check to see if a block is currently memory resident. + */ +struct buf * +incore(struct bufobj *bo, daddr_t blkno) +{ + struct buf *bp; + + BO_RLOCK(bo); + bp = gbincore(bo, blkno); + BO_RUNLOCK(bo); + return (bp); +} + +/* + * Returns true if no I/O is needed to access the + * associated VM object. This is like incore except + * it also hunts around in the VM system for the data. 
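inmem(), defined next, probes a file-system block page by page, clamping each probe to the remaining bytes in the current page. The arithmetic is easier to follow standalone; the sketch below just prints the per-page ranges that would be tested, assuming 4 KB pages and made-up offsets.

#include <stdio.h>

#define PAGE_SIZE       4096
#define PAGE_MASK       (PAGE_SIZE - 1)

/*
 * Walk a file-system block of `bsize' bytes starting at file offset `off'
 * and print, for each page it touches, the page index and the in-page
 * range that would be tested for validity.
 */
static void
show_page_ranges(long long off, int bsize)
{
        long long toff;
        int tinc;

        for (toff = 0; toff < bsize; toff += tinc) {
                tinc = bsize - toff;
                if (tinc > PAGE_SIZE - (int)((toff + off) & PAGE_MASK))
                        tinc = PAGE_SIZE - (int)((toff + off) & PAGE_MASK);
                printf("page %lld: offset %lld, length %d\n",
                    (off + toff) / PAGE_SIZE, (off + toff) & PAGE_MASK, tinc);
        }
}

int
main(void)
{
        /* a 16 KB block that starts 1 KB into page 7 (hypothetical numbers) */
        show_page_ranges(7 * 4096 + 1024, 16384);
        return (0);
}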
+ */ + +static int +inmem(struct vnode * vp, daddr_t blkno) +{ + vm_object_t obj; + vm_offset_t toff, tinc, size; + vm_page_t m; + vm_ooffset_t off; + + ASSERT_VOP_LOCKED(vp, "inmem"); + + if (incore(&vp->v_bufobj, blkno)) + return 1; + if (vp->v_mount == NULL) + return 0; + obj = vp->v_object; + if (obj == NULL) + return (0); + + size = PAGE_SIZE; + if (size > vp->v_mount->mnt_stat.f_iosize) + size = vp->v_mount->mnt_stat.f_iosize; + off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; + + VM_OBJECT_RLOCK(obj); + for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { + m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); + if (!m) + goto notinmem; + tinc = size; + if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) + tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); + if (vm_page_is_valid(m, + (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) + goto notinmem; + } + VM_OBJECT_RUNLOCK(obj); + return 1; + +notinmem: + VM_OBJECT_RUNLOCK(obj); + return (0); +} + +/* + * Set the dirty range for a buffer based on the status of the dirty + * bits in the pages comprising the buffer. The range is limited + * to the size of the buffer. + * + * Tell the VM system that the pages associated with this buffer + * are clean. This is used for delayed writes where the data is + * going to go to disk eventually without additional VM intevention. + * + * Note that while we only really need to clean through to b_bcount, we + * just go ahead and clean through to b_bufsize. + */ +static void +vfs_clean_pages_dirty_buf(struct buf *bp) +{ + vm_ooffset_t foff, noff, eoff; + vm_page_t m; + int i; + + if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) + return; + + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_clean_pages_dirty_buf: no buffer offset")); + + VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + vfs_drain_busy_pages(bp); + vfs_setdirty_locked_object(bp); + for (i = 0; i < bp->b_npages; i++) { + noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + eoff = noff; + if (eoff > bp->b_offset + bp->b_bufsize) + eoff = bp->b_offset + bp->b_bufsize; + m = bp->b_pages[i]; + vfs_page_set_validclean(bp, foff, m); + /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ + foff = noff; + } + VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); +} + +static void +vfs_setdirty_locked_object(struct buf *bp) +{ + vm_object_t object; + int i; + + object = bp->b_bufobj->bo_object; + VM_OBJECT_ASSERT_WLOCKED(object); + + /* + * We qualify the scan for modified pages on whether the + * object has been flushed yet. + */ + if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { + vm_offset_t boffset; + vm_offset_t eoffset; + + /* + * test the pages to see if they have been modified directly + * by users through the VM system. + */ + for (i = 0; i < bp->b_npages; i++) + vm_page_test_dirty(bp->b_pages[i]); + + /* + * Calculate the encompassing dirty range, boffset and eoffset, + * (eoffset - boffset) bytes. + */ + + for (i = 0; i < bp->b_npages; i++) { + if (bp->b_pages[i]->dirty) + break; + } + boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + + for (i = bp->b_npages - 1; i >= 0; --i) { + if (bp->b_pages[i]->dirty) { + break; + } + } + eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + + /* + * Fit it to the buffer. + */ + + if (eoffset > bp->b_bcount) + eoffset = bp->b_bcount; + + /* + * If we have a good dirty range, merge with the existing + * dirty range. 
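Outside the kernel context, the computation this function performs is: find the first and last dirty page, convert the page indices to byte offsets relative to the buffer, clip to the buffer size and merge with the previously recorded range. A self-contained sketch with a plain array of per-page dirty flags (4 KB pages assumed; pgoff plays the role of b_offset & PAGE_MASK):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1 << PAGE_SHIFT)

static void
merge_dirty_range(const bool *dirty, int npages, int pgoff, int bcount,
    int *dirtyoff, int *dirtyend)
{
        int i, boffset, eoffset;

        /* First dirty page gives the start of the range. */
        for (i = 0; i < npages; i++)
                if (dirty[i])
                        break;
        boffset = (i << PAGE_SHIFT) - pgoff;

        /* Last dirty page gives the end of the range. */
        for (i = npages - 1; i >= 0; i--)
                if (dirty[i])
                        break;
        eoffset = ((i + 1) << PAGE_SHIFT) - pgoff;

        /* Fit to the buffer and merge with the existing range. */
        if (eoffset > bcount)
                eoffset = bcount;
        if (boffset < eoffset) {
                if (*dirtyoff > boffset)
                        *dirtyoff = boffset;
                if (*dirtyend < eoffset)
                        *dirtyend = eoffset;
        }
}

int
main(void)
{
        /* 4-page buffer starting 512 bytes into its first page */
        bool dirty[4] = { false, true, true, false };
        int dirtyoff = 1 << 30, dirtyend = 0;   /* "empty" existing range */

        merge_dirty_range(dirty, 4, 512, 3 * PAGE_SIZE, &dirtyoff, &dirtyend);
        printf("dirty range [%d, %d)\n", dirtyoff, dirtyend);
        return (0);
}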
+ */ + + if (boffset < eoffset) { + if (bp->b_dirtyoff > boffset) + bp->b_dirtyoff = boffset; + if (bp->b_dirtyend < eoffset) + bp->b_dirtyend = eoffset; + } + } +} + +/* + * Allocate the KVA mapping for an existing buffer. + * If an unmapped buffer is provided but a mapped buffer is requested, take + * also care to properly setup mappings between pages and KVA. + */ +static void +bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) +{ + int bsize, maxsize, need_mapping, need_kva; + off_t offset; + + need_mapping = bp->b_data == unmapped_buf && + (gbflags & GB_UNMAPPED) == 0; + need_kva = bp->b_kvabase == unmapped_buf && + bp->b_data == unmapped_buf && + (gbflags & GB_KVAALLOC) != 0; + if (!need_mapping && !need_kva) + return; + + BUF_CHECK_UNMAPPED(bp); + + if (need_mapping && bp->b_kvabase != unmapped_buf) { + /* + * Buffer is not mapped, but the KVA was already + * reserved at the time of the instantiation. Use the + * allocated space. + */ + goto has_addr; + } + + /* + * Calculate the amount of the address space we would reserve + * if the buffer was mapped. + */ + bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; + KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); + offset = blkno * bsize; + maxsize = size + (offset & PAGE_MASK); + maxsize = imax(maxsize, bsize); + + while (bufkva_alloc(bp, maxsize, gbflags) != 0) { + if ((gbflags & GB_NOWAIT_BD) != 0) { + /* + * XXXKIB: defragmentation cannot + * succeed, not sure what else to do. + */ + panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); + } + counter_u64_add(mappingrestarts, 1); + bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0); + } +has_addr: + if (need_mapping) { + /* b_offset is handled by bpmap_qenter. */ + bp->b_data = bp->b_kvabase; + BUF_CHECK_MAPPED(bp); + bpmap_qenter(bp); + } +} + +struct buf * +getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, + int flags) +{ + struct buf *bp; + int error; + + error = getblkx(vp, blkno, size, slpflag, slptimeo, flags, &bp); + if (error != 0) + return (NULL); + return (bp); +} + +/* + * getblkx: + * + * Get a block given a specified block and offset into a file/device. + * The buffers B_DONE bit will be cleared on return, making it almost + * ready for an I/O initiation. B_INVAL may or may not be set on + * return. The caller should clear B_INVAL prior to initiating a + * READ. + * + * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for + * an existing buffer. + * + * For a VMIO buffer, B_CACHE is modified according to the backing VM. + * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set + * and then cleared based on the backing VM. If the previous buffer is + * non-0-sized but invalid, B_CACHE will be cleared. + * + * If getblk() must create a new buffer, the new buffer is returned with + * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which + * case it is returned with B_INVAL clear and B_CACHE set based on the + * backing VM. + * + * getblk() also forces a bwrite() for any B_DELWRI buffer whos + * B_CACHE bit is clear. + * + * What this means, basically, is that the caller should use B_CACHE to + * determine whether the buffer is fully valid or not and should clear + * B_INVAL prior to issuing a read. If the caller intends to validate + * the buffer by loading its data area with something, the caller needs + * to clear B_INVAL. 
If the caller does this without issuing an I/O, + * the caller should set B_CACHE ( as an optimization ), else the caller + * should issue the I/O and biodone() will set B_CACHE if the I/O was + * a write attempt or if it was a successful read. If the caller + * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR + * prior to issuing the READ. biodone() will *not* clear B_INVAL. + */ +int +getblkx(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, + int flags, struct buf **bpp) +{ + struct buf *bp; + struct bufobj *bo; + daddr_t d_blkno; + int bsize, error, maxsize, vmio; + off_t offset; + + CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); + KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, + ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); + ASSERT_VOP_LOCKED(vp, "getblk"); + if (size > maxbcachebuf) + panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, + maxbcachebuf); + if (!unmapped_buf_allowed) + flags &= ~(GB_UNMAPPED | GB_KVAALLOC); + + bo = &vp->v_bufobj; + d_blkno = blkno; +loop: + BO_RLOCK(bo); + bp = gbincore(bo, blkno); + if (bp != NULL) { + int lockflags; + /* + * Buffer is in-core. If the buffer is not busy nor managed, + * it must be on a queue. + */ + lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; + + if ((flags & GB_LOCK_NOWAIT) != 0) + lockflags |= LK_NOWAIT; + + error = BUF_TIMELOCK(bp, lockflags, + BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); + + /* + * If we slept and got the lock we have to restart in case + * the buffer changed identities. + */ + if (error == ENOLCK) + goto loop; + /* We timed out or were interrupted. */ + else if (error != 0) + return (error); + /* If recursed, assume caller knows the rules. */ + else if (BUF_LOCKRECURSED(bp)) + goto end; + + /* + * The buffer is locked. B_CACHE is cleared if the buffer is + * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set + * and for a VMIO buffer B_CACHE is adjusted according to the + * backing VM cache. + */ + if (bp->b_flags & B_INVAL) + bp->b_flags &= ~B_CACHE; + else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) + bp->b_flags |= B_CACHE; + if (bp->b_flags & B_MANAGED) + MPASS(bp->b_qindex == QUEUE_NONE); + else + bremfree(bp); + + /* + * check for size inconsistencies for non-VMIO case. + */ + if (bp->b_bcount != size) { + if ((bp->b_flags & B_VMIO) == 0 || + (size > bp->b_kvasize)) { + if (bp->b_flags & B_DELWRI) { + bp->b_flags |= B_NOCACHE; + bwrite(bp); + } else { + if (LIST_EMPTY(&bp->b_dep)) { + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + bp->b_flags |= B_NOCACHE; + bwrite(bp); + } + } + goto loop; + } + } + + /* + * Handle the case of unmapped buffer which should + * become mapped, or the buffer for which KVA + * reservation is requested. + */ + bp_unmapped_get_kva(bp, blkno, size, flags); + + /* + * If the size is inconsistent in the VMIO case, we can resize + * the buffer. This might lead to B_CACHE getting set or + * cleared. If the size has not changed, B_CACHE remains + * unchanged from its previous state. + */ + allocbuf(bp, size); + + KASSERT(bp->b_offset != NOOFFSET, + ("getblk: no buffer offset")); + + /* + * A buffer with B_DELWRI set and B_CACHE clear must + * be committed before we can return the buffer in + * order to prevent the caller from issuing a read + * ( due to B_CACHE not being set ) and overwriting + * it. 
+ * + * Most callers, including NFS and FFS, need this to + * operate properly either because they assume they + * can issue a read if B_CACHE is not set, or because + * ( for example ) an uncached B_DELWRI might loop due + * to softupdates re-dirtying the buffer. In the latter + * case, B_CACHE is set after the first write completes, + * preventing further loops. + * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE + * above while extending the buffer, we cannot allow the + * buffer to remain with B_CACHE set after the write + * completes or it will represent a corrupt state. To + * deal with this we set B_NOCACHE to scrap the buffer + * after the write. + * + * We might be able to do something fancy, like setting + * B_CACHE in bwrite() except if B_DELWRI is already set, + * so the below call doesn't set B_CACHE, but that gets real + * confusing. This is much easier. + */ + + if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { + bp->b_flags |= B_NOCACHE; + bwrite(bp); + goto loop; + } + bp->b_flags &= ~B_DONE; + } else { + /* + * Buffer is not in-core, create new buffer. The buffer + * returned by getnewbuf() is locked. Note that the returned + * buffer is also considered valid (not marked B_INVAL). + */ + BO_RUNLOCK(bo); + /* + * If the user does not want us to create the buffer, bail out + * here. + */ + if (flags & GB_NOCREAT) + return (EEXIST); + + bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; + KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); + offset = blkno * bsize; + vmio = vp->v_object != NULL; + if (vmio) { + maxsize = size + (offset & PAGE_MASK); + } else { + maxsize = size; + /* Do not allow non-VMIO notmapped buffers. */ + flags &= ~(GB_UNMAPPED | GB_KVAALLOC); + } + maxsize = imax(maxsize, bsize); + if ((flags & GB_NOSPARSE) != 0 && vmio && + !vn_isdisk(vp, NULL)) { + error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0); + KASSERT(error != EOPNOTSUPP, + ("GB_NOSPARSE from fs not supporting bmap, vp %p", + vp)); + if (error != 0) + return (error); + if (d_blkno == -1) + return (EJUSTRETURN); + } + + bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); + if (bp == NULL) { + if (slpflag || slptimeo) + return (ETIMEDOUT); + /* + * XXX This is here until the sleep path is diagnosed + * enough to work under very low memory conditions. + * + * There's an issue on low memory, 4BSD+non-preempt + * systems (eg MIPS routers with 32MB RAM) where buffer + * exhaustion occurs without sleeping for buffer + * reclaimation. This just sticks in a loop and + * constantly attempts to allocate a buffer, which + * hits exhaustion and tries to wakeup bufdaemon. + * This never happens because we never yield. + * + * The real solution is to identify and fix these cases + * so we aren't effectively busy-waiting in a loop + * until the reclaimation path has cycles to run. + */ + kern_yield(PRI_USER); + goto loop; + } + + /* + * This code is used to make sure that a buffer is not + * created while the getnewbuf routine is blocked. + * This can be a problem whether the vnode is locked or not. + * If the buffer is created out from under us, we have to + * throw away the one we just created. + * + * Note: this must occur before we associate the buffer + * with the vp especially considering limitations in + * the splay tree implementation when dealing with duplicate + * lblkno's. 
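The pattern described here, and implemented just below, is a common one: look up without holding the lock across a potentially sleeping allocation, then re-check under the lock and discard the new object if another thread created one in the meantime. A self-contained userspace sketch, with a toy table standing in for gbincore()/bgetvp():

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NSLOTS  64

/* A toy "is it in core?" table keyed by logical block number. */
static void *slot[NSLOTS];
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * The shape of getblk()'s creation path: look up, allocate without the
 * lock held (the allocation may sleep), then re-check under the lock and
 * throw the new object away if another thread created one meanwhile.
 */
static void *
get_or_create(int lbn)
{
        void *newp, *p;

        pthread_mutex_lock(&slot_lock);
        p = slot[lbn % NSLOTS];
        pthread_mutex_unlock(&slot_lock);
        if (p != NULL)
                return (p);

        newp = malloc(64);              /* may sleep; lock is not held */

        pthread_mutex_lock(&slot_lock);
        p = slot[lbn % NSLOTS];
        if (p != NULL) {
                /* Lost the race: discard ours, use the winner's. */
                pthread_mutex_unlock(&slot_lock);
                free(newp);
                return (p);
        }
        slot[lbn % NSLOTS] = newp;
        pthread_mutex_unlock(&slot_lock);
        return (newp);
}

int
main(void)
{
        printf("%p\n", get_or_create(7));
        printf("%p\n", get_or_create(7));       /* same object both times */
        return (0);
}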
+ */ + BO_LOCK(bo); + if (gbincore(bo, blkno)) { + BO_UNLOCK(bo); + bp->b_flags |= B_INVAL; + bufspace_release(bufdomain(bp), maxsize); + brelse(bp); + goto loop; + } + + /* + * Insert the buffer into the hash, so that it can + * be found by incore. + */ + bp->b_lblkno = blkno; + bp->b_blkno = d_blkno; + bp->b_offset = offset; + bgetvp(vp, bp); + BO_UNLOCK(bo); + + /* + * set B_VMIO bit. allocbuf() the buffer bigger. Since the + * buffer size starts out as 0, B_CACHE will be set by + * allocbuf() for the VMIO case prior to it testing the + * backing store for validity. + */ + + if (vmio) { + bp->b_flags |= B_VMIO; + KASSERT(vp->v_object == bp->b_bufobj->bo_object, + ("ARGH! different b_bufobj->bo_object %p %p %p\n", + bp, vp->v_object, bp->b_bufobj->bo_object)); + } else { + bp->b_flags &= ~B_VMIO; + KASSERT(bp->b_bufobj->bo_object == NULL, + ("ARGH! has b_bufobj->bo_object %p %p\n", + bp, bp->b_bufobj->bo_object)); + BUF_CHECK_MAPPED(bp); + } + + allocbuf(bp, size); + bufspace_release(bufdomain(bp), maxsize); + bp->b_flags &= ~B_DONE; + } + CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); + BUF_ASSERT_HELD(bp); +end: + buf_track(bp, __func__); + KASSERT(bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); + *bpp = bp; + return (0); +} + +/* + * Get an empty, disassociated buffer of given size. The buffer is initially + * set to B_INVAL. + */ +struct buf * +geteblk(int size, int flags) +{ + struct buf *bp; + int maxsize; + + maxsize = (size + BKVAMASK) & ~BKVAMASK; + while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { + if ((flags & GB_NOWAIT_BD) && + (curthread->td_pflags & TDP_BUFNEED) != 0) + return (NULL); + } + allocbuf(bp, size); + bufspace_release(bufdomain(bp), maxsize); + bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ + BUF_ASSERT_HELD(bp); + return (bp); +} + +/* + * Truncate the backing store for a non-vmio buffer. + */ +static void +vfs_nonvmio_truncate(struct buf *bp, int newbsize) +{ + + if (bp->b_flags & B_MALLOC) { + /* + * malloced buffers are not shrunk + */ + if (newbsize == 0) { + bufmallocadjust(bp, 0); + free(bp->b_data, M_BIOBUF); + bp->b_data = bp->b_kvabase; + bp->b_flags &= ~B_MALLOC; + } + return; + } + vm_hold_free_pages(bp, newbsize); + bufspace_adjust(bp, newbsize); +} + +/* + * Extend the backing for a non-VMIO buffer. + */ +static void +vfs_nonvmio_extend(struct buf *bp, int newbsize) +{ + caddr_t origbuf; + int origbufsize; + + /* + * We only use malloced memory on the first allocation. + * and revert to page-allocated memory when the buffer + * grows. + * + * There is a potential smp race here that could lead + * to bufmallocspace slightly passing the max. It + * is probably extremely rare and not worth worrying + * over. + */ + if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && + bufmallocspace < maxbufmallocspace) { + bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); + bp->b_flags |= B_MALLOC; + bufmallocadjust(bp, newbsize); + return; + } + + /* + * If the buffer is growing on its other-than-first + * allocation then we revert to the page-allocation + * scheme. 
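A userspace sketch of the same growth policy: the first, small allocation comes from malloc(), and growing past that switches to page-granular backing, copying the old contents over. (The kernel grows already page-backed buffers in place with vm_hold_load_pages(); the copy below is a toy shortcut, and the types and limits are hypothetical.)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE       4096
#define SMALL_MAX       (PAGE_SIZE / 2)

struct tbuf {
        char    *data;
        size_t   size;
        int      is_malloced;           /* analogue of B_MALLOC */
};

static int
tbuf_extend(struct tbuf *b, size_t newsize)
{
        void *p;

        /* First, small allocation: plain malloc(), like B_MALLOC buffers. */
        if (b->size == 0 && newsize <= SMALL_MAX) {
                if ((b->data = malloc(newsize)) == NULL)
                        return (-1);
                b->size = newsize;
                b->is_malloced = 1;
                return (0);
        }

        /* Growing: switch to page-granular backing, copy the old contents. */
        newsize = (newsize + PAGE_SIZE - 1) & ~(size_t)(PAGE_SIZE - 1);
        if (posix_memalign(&p, PAGE_SIZE, newsize) != 0)
                return (-1);
        if (b->size != 0) {
                memcpy(p, b->data, b->size);
                free(b->data);
        }
        b->data = p;
        b->size = newsize;
        b->is_malloced = 0;
        return (0);
}

int
main(void)
{
        struct tbuf b = { NULL, 0, 0 };

        tbuf_extend(&b, 512);           /* small: malloc-backed */
        printf("after 512: size %zu, malloced %d\n", b.size, b.is_malloced);
        tbuf_extend(&b, 3 * PAGE_SIZE); /* growth: switches to pages */
        printf("after grow: size %zu, malloced %d\n", b.size, b.is_malloced);
        free(b.data);
        return (0);
}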
+ */ + origbuf = NULL; + origbufsize = 0; + if (bp->b_flags & B_MALLOC) { + origbuf = bp->b_data; + origbufsize = bp->b_bufsize; + bp->b_data = bp->b_kvabase; + bufmallocadjust(bp, 0); + bp->b_flags &= ~B_MALLOC; + newbsize = round_page(newbsize); + } + vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, + (vm_offset_t) bp->b_data + newbsize); + if (origbuf != NULL) { + bcopy(origbuf, bp->b_data, origbufsize); + free(origbuf, M_BIOBUF); + } + bufspace_adjust(bp, newbsize); +} + +/* + * This code constitutes the buffer memory from either anonymous system + * memory (in the case of non-VMIO operations) or from an associated + * VM object (in the case of VMIO operations). This code is able to + * resize a buffer up or down. + * + * Note that this code is tricky, and has many complications to resolve + * deadlock or inconsistent data situations. Tread lightly!!! + * There are B_CACHE and B_DELWRI interactions that must be dealt with by + * the caller. Calling this code willy nilly can result in the loss of data. + * + * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with + * B_CACHE for the non-VMIO case. + */ +int +allocbuf(struct buf *bp, int size) +{ + int newbsize; + + BUF_ASSERT_HELD(bp); + + if (bp->b_bcount == size) + return (1); + + if (bp->b_kvasize != 0 && bp->b_kvasize < size) + panic("allocbuf: buffer too small"); + + newbsize = roundup2(size, DEV_BSIZE); + if ((bp->b_flags & B_VMIO) == 0) { + if ((bp->b_flags & B_MALLOC) == 0) + newbsize = round_page(newbsize); + /* + * Just get anonymous memory from the kernel. Don't + * mess with B_CACHE. + */ + if (newbsize < bp->b_bufsize) + vfs_nonvmio_truncate(bp, newbsize); + else if (newbsize > bp->b_bufsize) + vfs_nonvmio_extend(bp, newbsize); + } else { + int desiredpages; + + desiredpages = (size == 0) ? 0 : + num_pages((bp->b_offset & PAGE_MASK) + newbsize); + + if (bp->b_flags & B_MALLOC) + panic("allocbuf: VMIO buffer can't be malloced"); + /* + * Set B_CACHE initially if buffer is 0 length or will become + * 0-length. + */ + if (size == 0 || bp->b_bufsize == 0) + bp->b_flags |= B_CACHE; + + if (newbsize < bp->b_bufsize) + vfs_vmio_truncate(bp, desiredpages); + /* XXX This looks as if it should be newbsize > b_bufsize */ + else if (size > bp->b_bcount) + vfs_vmio_extend(bp, desiredpages, size); + bufspace_adjust(bp, newbsize); + } + bp->b_bcount = size; /* requested buffer size. */ + return (1); +} + +extern int inflight_transient_maps; + +static struct bio_queue nondump_bios; + +void +biodone(struct bio *bp) +{ + struct mtx *mtxp; + void (*done)(struct bio *); + vm_offset_t start, end; + + biotrack(bp, __func__); + + /* + * Avoid completing I/O when dumping after a panic since that may + * result in a deadlock in the filesystem or pager code. Note that + * this doesn't affect dumps that were started manually since we aim + * to keep the system usable after it has been resumed. 
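The completion handshake that biodone() and biowait() implement below, either run the submitter's callback or set a DONE flag and wake any sleeper, can be sketched in userspace with a mutex and condition variable; the kernel instead sleeps and wakes on the bio address using a pooled mutex, and the toy_* names are hypothetical.

#include <pthread.h>
#include <stdio.h>

struct toy_bio {
        int              done;
        void            (*cb)(struct toy_bio *);
        pthread_mutex_t  mtx;
        pthread_cond_t   cv;
};

static void
toy_biodone(struct toy_bio *b)
{
        if (b->cb != NULL) {
                b->cb(b);               /* async consumer: no wakeup needed */
                return;
        }
        pthread_mutex_lock(&b->mtx);
        b->done = 1;
        pthread_cond_broadcast(&b->cv);
        pthread_mutex_unlock(&b->mtx);
}

static void
toy_biowait(struct toy_bio *b)
{
        pthread_mutex_lock(&b->mtx);
        while (!b->done)
                pthread_cond_wait(&b->cv, &b->mtx);
        pthread_mutex_unlock(&b->mtx);
}

static void *
completer(void *arg)
{
        toy_biodone(arg);
        return (NULL);
}

static struct toy_bio req = {
        .mtx = PTHREAD_MUTEX_INITIALIZER,
        .cv = PTHREAD_COND_INITIALIZER,
};

int
main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, completer, &req);
        toy_biowait(&req);              /* blocks until toy_biodone() runs */
        pthread_join(t, NULL);
        printf("request completed\n");
        return (0);
}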
+ */ + if (__predict_false(dumping && SCHEDULER_STOPPED())) { + TAILQ_INSERT_HEAD(&nondump_bios, bp, bio_queue); + return; + } + if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { + bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; + bp->bio_flags |= BIO_UNMAPPED; + start = trunc_page((vm_offset_t)bp->bio_data); + end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); + bp->bio_data = unmapped_buf; + pmap_qremove(start, atop(end - start)); + vmem_free(transient_arena, start, end - start); + atomic_add_int(&inflight_transient_maps, -1); + } + done = bp->bio_done; + if (done == NULL) { + mtxp = mtx_pool_find(mtxpool_sleep, bp); + mtx_lock(mtxp); + bp->bio_flags |= BIO_DONE; + wakeup(bp); + mtx_unlock(mtxp); + } else + done(bp); +} + +/* + * Wait for a BIO to finish. + */ +int +biowait(struct bio *bp, const char *wchan) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, bp); + mtx_lock(mtxp); + while ((bp->bio_flags & BIO_DONE) == 0) + msleep(bp, mtxp, PRIBIO, wchan, 0); + mtx_unlock(mtxp); + if (bp->bio_error != 0) + return (bp->bio_error); + if (!(bp->bio_flags & BIO_ERROR)) + return (0); + return (EIO); +} + +void +biofinish(struct bio *bp, struct devstat *stat, int error) +{ + + if (error) { + bp->bio_error = error; + bp->bio_flags |= BIO_ERROR; + } + if (stat != NULL) + devstat_end_transaction_bio(stat, bp); + biodone(bp); +} + +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) +void +biotrack_buf(struct bio *bp, const char *location) +{ + + buf_track(bp->bio_track_bp, location); +} +#endif + +/* + * bufwait: + * + * Wait for buffer I/O completion, returning error status. The buffer + * is left locked and B_DONE on return. B_EINTR is converted into an EINTR + * error and cleared. + */ +int +bufwait(struct buf *bp) +{ + if (bp->b_iocmd == BIO_READ) + bwait(bp, PRIBIO, "biord"); + else + bwait(bp, PRIBIO, "biowr"); + if (bp->b_flags & B_EINTR) { + bp->b_flags &= ~B_EINTR; + return (EINTR); + } + if (bp->b_ioflags & BIO_ERROR) { + return (bp->b_error ? bp->b_error : EIO); + } else { + return (0); + } +} + +/* + * bufdone: + * + * Finish I/O on a buffer, optionally calling a completion function. + * This is usually called from an interrupt so process blocking is + * not allowed. + * + * biodone is also responsible for setting B_CACHE in a B_VMIO bp. + * In a non-VMIO bp, B_CACHE will be set on the next getblk() + * assuming B_INVAL is clear. + * + * For the VMIO case, we set B_CACHE if the op was a read and no + * read error occurred, or if the op was a write. B_CACHE is never + * set if the buffer is invalid or otherwise uncacheable. + * + * bufdone does not mess with B_INVAL, allowing the I/O routine or the + * initiator to leave B_INVAL set to brelse the buffer out of existence + * in the biodone routine. + */ +void +bufdone(struct buf *bp) +{ + struct bufobj *dropobj; + void (*biodone)(struct buf *); + + buf_track(bp, __func__); + CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + dropobj = NULL; + + KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); + BUF_ASSERT_HELD(bp); + + runningbufwakeup(bp); + if (bp->b_iocmd == BIO_WRITE) + dropobj = bp->b_bufobj; + /* call optional completion function if requested */ + if (bp->b_iodone != NULL) { + biodone = bp->b_iodone; + bp->b_iodone = NULL; + (*biodone) (bp); + if (dropobj) + bufobj_wdrop(dropobj); + return; + } + if (bp->b_flags & B_VMIO) { + /* + * Set B_CACHE if the op was a normal read and no error + * occurred. B_CACHE is set for writes in the b*write() + * routines. 
+ */ + if (bp->b_iocmd == BIO_READ && + !(bp->b_flags & (B_INVAL|B_NOCACHE)) && + !(bp->b_ioflags & BIO_ERROR)) + bp->b_flags |= B_CACHE; + vfs_vmio_iodone(bp); + } + if (!LIST_EMPTY(&bp->b_dep)) + buf_complete(bp); + if ((bp->b_flags & B_CKHASH) != 0) { + KASSERT(bp->b_iocmd == BIO_READ, + ("bufdone: b_iocmd %d not BIO_READ", bp->b_iocmd)); + KASSERT(buf_mapped(bp), ("bufdone: bp %p not mapped", bp)); + (*bp->b_ckhashcalc)(bp); + } + /* + * For asynchronous completions, release the buffer now. The brelse + * will do a wakeup there if necessary - so no need to do a wakeup + * here in the async case. The sync case always needs to do a wakeup. + */ + if (bp->b_flags & B_ASYNC) { + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || + (bp->b_ioflags & BIO_ERROR)) + brelse(bp); + else + bqrelse(bp); + } else + bdone(bp); + if (dropobj) + bufobj_wdrop(dropobj); +} + +/* + * This routine is called in lieu of iodone in the case of + * incomplete I/O. This keeps the busy status for pages + * consistent. + */ +void +vfs_unbusy_pages(struct buf *bp) +{ + int i; + vm_object_t obj; + vm_page_t m; + + runningbufwakeup(bp); + if (!(bp->b_flags & B_VMIO)) + return; + + obj = bp->b_bufobj->bo_object; + VM_OBJECT_WLOCK(obj); + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + if (m == bogus_page) { + m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); + if (!m) + panic("vfs_unbusy_pages: page missing\n"); + bp->b_pages[i] = m; + if (buf_mapped(bp)) { + BUF_CHECK_MAPPED(bp); + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), + bp->b_pages, bp->b_npages); + } else + BUF_CHECK_UNMAPPED(bp); + } + vm_page_sunbusy(m); + } + vm_object_pip_wakeupn(obj, bp->b_npages); + VM_OBJECT_WUNLOCK(obj); +} + +/* + * vfs_page_set_valid: + * + * Set the valid bits in a page based on the supplied offset. The + * range is restricted to the buffer's size. + * + * This routine is typically called after a read completes. + */ +static void +vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) +{ + vm_ooffset_t eoff; + + /* + * Compute the end offset, eoff, such that [off, eoff) does not span a + * page boundary and eoff is not greater than the end of the buffer. + * The end of the buffer, in this case, is our file EOF, not the + * allocation size of the buffer. + */ + eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; + if (eoff > bp->b_offset + bp->b_bcount) + eoff = bp->b_offset + bp->b_bcount; + + /* + * Set valid range. This is typically the entire buffer and thus the + * entire page. + */ + if (eoff > off) + vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); +} + +/* + * vfs_page_set_validclean: + * + * Set the valid bits and clear the dirty bits in a page based on the + * supplied offset. The range is restricted to the buffer's size. + */ +static void +vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) +{ + vm_ooffset_t soff, eoff; + + /* + * Start and end offsets in buffer. eoff - soff may not cross a + * page boundary or cross the end of the buffer. The end of the + * buffer, in this case, is our file EOF, not the allocation size + * of the buffer. + */ + soff = off; + eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; + if (eoff > bp->b_offset + bp->b_bcount) + eoff = bp->b_offset + bp->b_bcount; + + /* + * Set valid range. This is typically the entire buffer and thus the + * entire page. 
+ */ + if (eoff > soff) { + vm_page_set_validclean( + m, + (vm_offset_t) (soff & PAGE_MASK), + (vm_offset_t) (eoff - soff) + ); + } +} + +/* + * Ensure that all buffer pages are not exclusive busied. If any page is + * exclusive busy, drain it. + */ +void +vfs_drain_busy_pages(struct buf *bp) +{ + vm_page_t m; + int i, last_busied; + + VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); + last_busied = 0; + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + if (vm_page_xbusied(m)) { + for (; last_busied < i; last_busied++) + vm_page_sbusy(bp->b_pages[last_busied]); + while (vm_page_xbusied(m)) { + vm_page_lock(m); + VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); + vm_page_busy_sleep(m, "vbpage", true); + VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + } + } + } + for (i = 0; i < last_busied; i++) + vm_page_sunbusy(bp->b_pages[i]); +} + +/* + * This routine is called before a device strategy routine. + * It is used to tell the VM system that paging I/O is in + * progress, and treat the pages associated with the buffer + * almost as being exclusive busy. Also the object paging_in_progress + * flag is handled to make sure that the object doesn't become + * inconsistent. + * + * Since I/O has not been initiated yet, certain buffer flags + * such as BIO_ERROR or B_INVAL may be in an inconsistent state + * and should be ignored. + */ +void +vfs_busy_pages(struct buf *bp, int clear_modify) +{ + vm_object_t obj; + vm_ooffset_t foff; + vm_page_t m; + int i; + bool bogus; + + if (!(bp->b_flags & B_VMIO)) + return; + + obj = bp->b_bufobj->bo_object; + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_busy_pages: no buffer offset")); + VM_OBJECT_WLOCK(obj); + vfs_drain_busy_pages(bp); + if (bp->b_bufsize != 0) + vfs_setdirty_locked_object(bp); + bogus = false; + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + + if ((bp->b_flags & B_CLUSTER) == 0) { + vm_object_pip_add(obj, 1); + vm_page_sbusy(m); + } + /* + * When readying a buffer for a read ( i.e + * clear_modify == 0 ), it is important to do + * bogus_page replacement for valid pages in + * partially instantiated buffers. Partially + * instantiated buffers can, in turn, occur when + * reconstituting a buffer from its VM backing store + * base. We only have to do this if B_CACHE is + * clear ( which causes the I/O to occur in the + * first place ). The replacement prevents the read + * I/O from overwriting potentially dirty VM-backed + * pages. XXX bogus page replacement is, uh, bogus. + * It may not work properly with small-block devices. + * We need to find a better way. + */ + if (clear_modify) { + pmap_remove_write(m); + vfs_page_set_validclean(bp, foff, m); + } else if (m->valid == VM_PAGE_BITS_ALL && + (bp->b_flags & B_CACHE) == 0) { + bp->b_pages[i] = bogus_page; + bogus = true; + } + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + } + VM_OBJECT_WUNLOCK(obj); + if (bogus && buf_mapped(bp)) { + BUF_CHECK_MAPPED(bp); + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), + bp->b_pages, bp->b_npages); + } +} + +/* + * vfs_bio_set_valid: + * + * Set the range within the buffer to valid. The range is + * relative to the beginning of the buffer, b_offset. Note that + * b_offset itself may be offset from the beginning of the first + * page. + */ +void +vfs_bio_set_valid(struct buf *bp, int base, int size) +{ + int i, n; + vm_page_t m; + + if (!(bp->b_flags & B_VMIO)) + return; + + /* + * Fixup base to be relative to beginning of first page. 
+ * Set initial n to be the maximum number of bytes in the + * first page that can be validated. + */ + base += (bp->b_offset & PAGE_MASK); + n = PAGE_SIZE - (base & PAGE_MASK); + + VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { + m = bp->b_pages[i]; + if (n > size) + n = size; + vm_page_set_valid_range(m, base & PAGE_MASK, n); + base += n; + size -= n; + n = PAGE_SIZE; + } + VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); +} + +/* + * vfs_bio_clrbuf: + * + * If the specified buffer is a non-VMIO buffer, clear the entire + * buffer. If the specified buffer is a VMIO buffer, clear and + * validate only the previously invalid portions of the buffer. + * This routine essentially fakes an I/O, so we need to clear + * BIO_ERROR and B_INVAL. + * + * Note that while we only theoretically need to clear through b_bcount, + * we go ahead and clear through b_bufsize. + */ +void +vfs_bio_clrbuf(struct buf *bp) +{ + int i, j, mask, sa, ea, slide; + + if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { + clrbuf(bp); + return; + } + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && + (bp->b_offset & PAGE_MASK) == 0) { + if (bp->b_pages[0] == bogus_page) + goto unlock; + mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; + VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); + if ((bp->b_pages[0]->valid & mask) == mask) + goto unlock; + if ((bp->b_pages[0]->valid & mask) == 0) { + pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); + bp->b_pages[0]->valid |= mask; + goto unlock; + } + } + sa = bp->b_offset & PAGE_MASK; + slide = 0; + for (i = 0; i < bp->b_npages; i++, sa = 0) { + slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); + ea = slide & PAGE_MASK; + if (ea == 0) + ea = PAGE_SIZE; + if (bp->b_pages[i] == bogus_page) + continue; + j = sa / DEV_BSIZE; + mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; + VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); + if ((bp->b_pages[i]->valid & mask) == mask) + continue; + if ((bp->b_pages[i]->valid & mask) == 0) + pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); + else { + for (; sa < ea; sa += DEV_BSIZE, j++) { + if ((bp->b_pages[i]->valid & (1 << j)) == 0) { + pmap_zero_page_area(bp->b_pages[i], + sa, DEV_BSIZE); + } + } + } + bp->b_pages[i]->valid |= mask; + } +unlock: + VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); + bp->b_resid = 0; +} + +void +vfs_bio_bzero_buf(struct buf *bp, int base, int size) +{ + vm_page_t m; + int i, n; + + if (buf_mapped(bp)) { + BUF_CHECK_MAPPED(bp); + bzero(bp->b_data + base, size); + } else { + BUF_CHECK_UNMAPPED(bp); + n = PAGE_SIZE - (base & PAGE_MASK); + for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { + m = bp->b_pages[i]; + if (n > size) + n = size; + pmap_zero_page_area(m, base & PAGE_MASK, n); + base += n; + size -= n; + n = PAGE_SIZE; + } + } +} + +/* + * Update buffer flags based on I/O request parameters, optionally releasing the + * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, + * where they may be placed on a page queue (VMIO) or freed immediately (direct + * I/O). Otherwise the buffer is released to the cache. 
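The per-page valid mask built by vfs_bio_clrbuf() above is one bit per DEV_BSIZE chunk of the page, shifted to the chunk where the buffer's byte range begins within that page. The computation in isolation, assuming 4 KB pages and 512-byte device blocks:

#include <stdio.h>

#define DEV_BSIZE       512
#define PAGE_SIZE       4096

/*
 * Build the per-page valid mask the way vfs_bio_clrbuf() does: one bit per
 * DEV_BSIZE chunk, covering the byte range [sa, ea) that the buffer
 * occupies within the page.
 */
static int
valid_mask(int sa, int ea)
{
        int j = sa / DEV_BSIZE;

        return (((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j);
}

int
main(void)
{
        /* buffer starts 1 KB into the page and covers 2 KB of it */
        printf("mask 0x%02x\n", valid_mask(1024, 3072));
        /* buffer covers the whole page */
        printf("mask 0x%02x\n", valid_mask(0, PAGE_SIZE));
        return (0);
}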
+ */ +static void +b_io_dismiss(struct buf *bp, int ioflag, bool release) +{ + + KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, + ("buf %p non-VMIO noreuse", bp)); + + if ((ioflag & IO_DIRECT) != 0) + bp->b_flags |= B_DIRECT; + if ((ioflag & IO_EXT) != 0) + bp->b_xflags |= BX_ALTDATA; + if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { + bp->b_flags |= B_RELBUF; + if ((ioflag & IO_NOREUSE) != 0) + bp->b_flags |= B_NOREUSE; + if (release) + brelse(bp); + } else if (release) + bqrelse(bp); +} + +void +vfs_bio_brelse(struct buf *bp, int ioflag) +{ + + b_io_dismiss(bp, ioflag, true); +} + +void +vfs_bio_set_flags(struct buf *bp, int ioflag) +{ + + b_io_dismiss(bp, ioflag, false); +} + +/* + * vm_hold_load_pages and vm_hold_free_pages get pages into + * a buffers address space. The pages are anonymous and are + * not associated with a file object. + */ +static void +vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index; + + BUF_CHECK_MAPPED(bp); + + to = round_page(to); + from = round_page(from); + index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { + /* + * note: must allocate system pages since blocking here + * could interfere with paging I/O, no matter which + * process we are. + */ + p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) | + VM_ALLOC_WAITOK); + pmap_qenter(pg, &p, 1); + bp->b_pages[index] = p; + } + bp->b_npages = index; +} + +/* Return pages associated with this buf to the vm system */ +static void +vm_hold_free_pages(struct buf *bp, int newbsize) +{ + vm_offset_t from; + vm_page_t p; + int index, newnpages; + + BUF_CHECK_MAPPED(bp); + + from = round_page((vm_offset_t)bp->b_data + newbsize); + newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; + if (bp->b_npages > newnpages) + pmap_qremove(from, bp->b_npages - newnpages); + for (index = newnpages; index < bp->b_npages; index++) { + p = bp->b_pages[index]; + bp->b_pages[index] = NULL; + p->wire_count--; + vm_page_free(p); + } + vm_wire_sub(bp->b_npages - newnpages); + bp->b_npages = newnpages; +} + +/* + * Map an IO request into kernel virtual address space. + * + * All requests are (re)mapped into kernel VA space. + * Notice that we use b_bufsize for the size of the buffer + * to be mapped. b_bcount might be modified by the driver. + * + * Note that even if the caller determines that the address space should + * be valid, a race or a smaller-file mapped into a larger space may + * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST + * check the return value. + * + * This function only works with pager buffers. + */ +int +vmapbuf(struct buf *bp, int mapbuf) +{ + vm_prot_t prot; + int pidx; + + if (bp->b_bufsize < 0) + return (-1); + prot = VM_PROT_READ; + if (bp->b_iocmd == BIO_READ) + prot |= VM_PROT_WRITE; /* Less backwards than it looks */ + if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, + (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, + btoc(MAXPHYS))) < 0) + return (-1); + bp->b_npages = pidx; + bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; + if (mapbuf || !unmapped_buf_allowed) { + pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); + bp->b_data = bp->b_kvabase + bp->b_offset; + } else + bp->b_data = unmapped_buf; + return(0); +} + +/* + * Free the io map PTEs associated with this IO operation. 
+ * We also invalidate the TLB entries and restore the original b_addr. + * + * This function only works with pager buffers. + */ +void +vunmapbuf(struct buf *bp) +{ + int npages; + + npages = bp->b_npages; + if (buf_mapped(bp)) + pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); + vm_page_unhold_pages(bp->b_pages, npages); + + bp->b_data = unmapped_buf; +} + +void +bdone(struct buf *bp) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, bp); + mtx_lock(mtxp); + bp->b_flags |= B_DONE; + wakeup(bp); + mtx_unlock(mtxp); +} + +void +bwait(struct buf *bp, u_char pri, const char *wchan) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, bp); + mtx_lock(mtxp); + while ((bp->b_flags & B_DONE) == 0) + msleep(bp, mtxp, pri, wchan, 0); + mtx_unlock(mtxp); +} + +int +bufsync(struct bufobj *bo, int waitfor) +{ + + return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); +} + +void +bufstrategy(struct bufobj *bo, struct buf *bp) +{ + int i __unused; + struct vnode *vp; + + vp = bp->b_vp; + KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); + KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, + ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); + i = VOP_STRATEGY(vp, bp); + KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); +} + +/* + * Initialize a struct bufobj before use. Memory is assumed zero filled. + */ +void +bufobj_init(struct bufobj *bo, void *private) +{ + static volatile int bufobj_cleanq; + + bo->bo_domain = + atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains; + rw_init(BO_LOCKPTR(bo), "bufobj interlock"); + bo->bo_private = private; + TAILQ_INIT(&bo->bo_clean.bv_hd); + TAILQ_INIT(&bo->bo_dirty.bv_hd); +} + +void +bufobj_wrefl(struct bufobj *bo) +{ + + KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); + ASSERT_BO_WLOCKED(bo); + bo->bo_numoutput++; +} + +void +bufobj_wref(struct bufobj *bo) +{ + + KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); + BO_LOCK(bo); + bo->bo_numoutput++; + BO_UNLOCK(bo); +} + +void +bufobj_wdrop(struct bufobj *bo) +{ + + KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); + BO_LOCK(bo); + KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); + if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { + bo->bo_flag &= ~BO_WWAIT; + wakeup(&bo->bo_numoutput); + } + BO_UNLOCK(bo); +} + +int +bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) +{ + int error; + + KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); + ASSERT_BO_WLOCKED(bo); + error = 0; + while (bo->bo_numoutput) { + bo->bo_flag |= BO_WWAIT; + error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), + slpflag | (PRIBIO + 1), "bo_wwait", timeo); + if (error) + break; + } + return (error); +} + +/* + * Set bio_data or bio_ma for struct bio from the struct buf. + */ +void +bdata2bio(struct buf *bp, struct bio *bip) +{ + + if (!buf_mapped(bp)) { + KASSERT(unmapped_buf_allowed, ("unmapped")); + bip->bio_ma = bp->b_pages; + bip->bio_ma_n = bp->b_npages; + bip->bio_data = unmapped_buf; + bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; + bip->bio_flags |= BIO_UNMAPPED; + KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / + PAGE_SIZE == bp->b_npages, + ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, + (long long)bip->bio_length, bip->bio_ma_n)); + } else { + bip->bio_data = bp->b_data; + bip->bio_ma = NULL; + } +} + +/* + * The MIPS pmap code currently doesn't handle aliased pages. + * The VIPT caches may not handle page aliasing themselves, leading + * to data corruption. 
+ * + * As such, this code makes a system extremely unhappy if said + * system doesn't support unaliasing the above situation in hardware. + * Some "recent" systems (eg some mips24k/mips74k cores) don't enable + * this feature at build time, so it has to be handled in software. + * + * Once the MIPS pmap/cache code grows to support this function on + * earlier chips, it should be flipped back off. + */ +#ifdef __mips__ +static int buf_pager_relbuf = 1; +#else +static int buf_pager_relbuf = 0; +#endif +SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, + &buf_pager_relbuf, 0, + "Make buffer pager release buffers after reading"); + +/* + * The buffer pager. It uses buffer reads to validate pages. + * + * In contrast to the generic local pager from vm/vnode_pager.c, this + * pager correctly and easily handles volumes where the underlying + * device block size is greater than the machine page size. The + * buffer cache transparently extends the requested page run to be + * aligned at the block boundary, and does the necessary bogus page + * replacements in the addends to avoid obliterating already valid + * pages. + * + * The only non-trivial issue is that the exclusive busy state for + * pages, which is assumed by the vm_pager_getpages() interface, is + * incompatible with the VMIO buffer cache's desire to share-busy the + * pages. This function performs a trivial downgrade of the pages' + * state before reading buffers, and a less trivial upgrade from the + * shared-busy to excl-busy state after the read. + */ +int +vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, + int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, + vbg_get_blksize_t get_blksize) +{ + vm_page_t m; + vm_object_t object; + struct buf *bp; + struct mount *mp; + daddr_t lbn, lbnp; + vm_ooffset_t la, lb, poff, poffe; + long bsize; + int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b; + bool redo, lpart; + + object = vp->v_object; + mp = vp->v_mount; + error = 0; + la = IDX_TO_OFF(ma[count - 1]->pindex); + if (la >= object->un_pager.vnp.vnp_size) + return (VM_PAGER_BAD); + + /* + * Change the meaning of la from where the last requested page starts + * to where it ends, because that's the end of the requested region + * and the start of the potential read-ahead region. + */ + la += PAGE_SIZE; + lpart = la > object->un_pager.vnp.vnp_size; + bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); + + /* + * Calculate read-ahead, behind and total pages. + */ + pgsin = count; + lb = IDX_TO_OFF(ma[0]->pindex); + pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); + pgsin += pgsin_b; + if (rbehind != NULL) + *rbehind = pgsin_b; + pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); + if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) + pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, + PAGE_SIZE) - la); + pgsin += pgsin_a; + if (rahead != NULL) + *rahead = pgsin_a; + VM_CNT_INC(v_vnodein); + VM_CNT_ADD(v_vnodepgsin, pgsin); + + br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) + != 0) ? GB_UNMAPPED : 0; + VM_OBJECT_WLOCK(object); +again: + for (i = 0; i < count; i++) + vm_page_busy_downgrade(ma[i]); + VM_OBJECT_WUNLOCK(object); + + lbnp = -1; + for (i = 0; i < count; i++) { + m = ma[i]; + + /* + * Pages are shared busy and the object lock is not + * owned, which together allow for the pages' + * invalidation. 
The racy test for validity avoids + * useless creation of the buffer for the most typical + * case when invalidation is not used in redo or for + * parallel read. The shared->excl upgrade loop at + * the end of the function catches the race in a + * reliable way (protected by the object lock). + */ + if (m->valid == VM_PAGE_BITS_ALL) + continue; + + poff = IDX_TO_OFF(m->pindex); + poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); + for (; poff < poffe; poff += bsize) { + lbn = get_lblkno(vp, poff); + if (lbn == lbnp) + goto next_page; + lbnp = lbn; + + bsize = get_blksize(vp, lbn); + error = bread_gb(vp, lbn, bsize, curthread->td_ucred, + br_flags, &bp); + if (error != 0) + goto end_pages; + if (LIST_EMPTY(&bp->b_dep)) { + /* + * Invalidation clears m->valid, but + * may leave B_CACHE flag if the + * buffer existed at the invalidation + * time. In this case, recycle the + * buffer to do real read on next + * bread() after redo. + * + * Otherwise B_RELBUF is not strictly + * necessary, enable to reduce buf + * cache pressure. + */ + if (buf_pager_relbuf || + m->valid != VM_PAGE_BITS_ALL) + bp->b_flags |= B_RELBUF; + + bp->b_flags &= ~B_NOCACHE; + brelse(bp); + } else { + bqrelse(bp); + } + } + KASSERT(1 /* racy, enable for debugging */ || + m->valid == VM_PAGE_BITS_ALL || i == count - 1, + ("buf %d %p invalid", i, m)); + if (i == count - 1 && lpart) { + VM_OBJECT_WLOCK(object); + if (m->valid != 0 && + m->valid != VM_PAGE_BITS_ALL) + vm_page_zero_invalid(m, TRUE); + VM_OBJECT_WUNLOCK(object); + } +next_page:; + } +end_pages: + + VM_OBJECT_WLOCK(object); + redo = false; + for (i = 0; i < count; i++) { + vm_page_sunbusy(ma[i]); + ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); + + /* + * Since the pages were only sbusy while neither the + * buffer nor the object lock was held by us, or + * reallocated while vm_page_grab() slept for busy + * relinguish, they could have been invalidated. + * Recheck the valid bits and re-read as needed. + * + * Note that the last page is made fully valid in the + * read loop, and partial validity for the page at + * index count - 1 could mean that the page was + * invalidated or removed, so we must restart for + * safety as well. + */ + if (ma[i]->valid != VM_PAGE_BITS_ALL) + redo = true; + } + if (redo && error == 0) + goto again; + VM_OBJECT_WUNLOCK(object); + return (error != 0 ? 
VM_PAGER_ERROR : VM_PAGER_OK); +} + +#include "opt_ddb.h" +#ifdef DDB +#include + +/* DDB command to show buffer data */ +DB_SHOW_COMMAND(buffer, db_show_buffer) +{ + /* get args */ + struct buf *bp = (struct buf *)addr; +#ifdef FULL_BUF_TRACKING + uint32_t i, j; +#endif + + if (!have_addr) { + db_printf("usage: show buffer \n"); + return; + } + + db_printf("buf at %p\n", bp); + db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", + (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, + PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); + db_printf( + "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" + "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " + "b_dep = %p\n", + bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, + bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, + (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); + db_printf("b_kvabase = %p, b_kvasize = %d\n", + bp->b_kvabase, bp->b_kvasize); + if (bp->b_npages) { + int i; + db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m; + m = bp->b_pages[i]; + if (m != NULL) + db_printf("(%p, 0x%lx, 0x%lx)", m->object, + (u_long)m->pindex, + (u_long)VM_PAGE_TO_PHYS(m)); + else + db_printf("( ??? )"); + if ((i + 1) < bp->b_npages) + db_printf(","); + } + db_printf("\n"); + } + BUF_LOCKPRINTINFO(bp); +#if defined(FULL_BUF_TRACKING) + db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); + + i = bp->b_io_tcnt % BUF_TRACKING_SIZE; + for (j = 1; j <= BUF_TRACKING_SIZE; j++) { + if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) + continue; + db_printf(" %2u: %s\n", j, + bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); + } +#elif defined(BUF_TRACKING) + db_printf("b_io_tracking: %s\n", bp->b_io_tracking); +#endif + db_printf(" "); +} + +DB_SHOW_COMMAND(bufqueues, bufqueues) +{ + struct bufdomain *bd; + struct buf *bp; + long total; + int i, j, cnt; + + db_printf("bqempty: %d\n", bqempty.bq_len); + + for (i = 0; i < buf_domains; i++) { + bd = &bdomain[i]; + db_printf("Buf domain %d\n", i); + db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers); + db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers); + db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers); + db_printf("\n"); + db_printf("\tbufspace\t%ld\n", bd->bd_bufspace); + db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace); + db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace); + db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace); + db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh); + db_printf("\n"); + db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers); + db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers); + db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers); + db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh); + db_printf("\n"); + total = 0; + TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist) + total += bp->b_bufsize; + db_printf("\tcleanq count\t%d (%ld)\n", + bd->bd_cleanq->bq_len, total); + total = 0; + TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist) + total += bp->b_bufsize; + db_printf("\tdirtyq count\t%d (%ld)\n", + bd->bd_dirtyq.bq_len, total); + db_printf("\twakeup\t\t%d\n", bd->bd_wanted); + db_printf("\tlim\t\t%d\n", bd->bd_lim); + db_printf("\tCPU "); + for (j = 0; j <= mp_maxid; j++) + db_printf("%d, ", bd->bd_subq[j].bq_len); + db_printf("\n"); + cnt = 0; + total = 0; + for (j = 0; j < nbuf; j++) + if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) { + cnt++; + total += buf[j].b_bufsize; + } + 
db_printf("\tLocked buffers: %d space %ld\n", cnt, total); + cnt = 0; + total = 0; + for (j = 0; j < nbuf; j++) + if (buf[j].b_domain == i) { + cnt++; + total += buf[j].b_bufsize; + } + db_printf("\tTotal buffers: %d space %ld\n", cnt, total); + } +} + +DB_SHOW_COMMAND(lockedbufs, lockedbufs) +{ + struct buf *bp; + int i; + + for (i = 0; i < nbuf; i++) { + bp = &buf[i]; + if (BUF_ISLOCKED(bp)) { + db_show_buffer((uintptr_t)bp, 1, 0, NULL); + db_printf("\n"); + if (db_pager_quit) + break; + } + } +} + +DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) +{ + struct vnode *vp; + struct buf *bp; + + if (!have_addr) { + db_printf("usage: show vnodebufs \n"); + return; + } + vp = (struct vnode *)addr; + db_printf("Clean buffers:\n"); + TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { + db_show_buffer((uintptr_t)bp, 1, 0, NULL); + db_printf("\n"); + } + db_printf("Dirty buffers:\n"); + TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { + db_show_buffer((uintptr_t)bp, 1, 0, NULL); + db_printf("\n"); + } +} + +DB_COMMAND(countfreebufs, db_coundfreebufs) +{ + struct buf *bp; + int i, used = 0, nfree = 0; + + if (have_addr) { + db_printf("usage: countfreebufs\n"); + return; + } + + for (i = 0; i < nbuf; i++) { + bp = &buf[i]; + if (bp->b_qindex == QUEUE_EMPTY) + nfree++; + else + used++; + } + + db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, + nfree + used); + db_printf("numfreebuffers is %d\n", numfreebuffers); +} +#endif /* DDB */ diff --git a/freebsd/sys/kern/vfs_cache.c b/freebsd/sys/kern/vfs_cache.c new file mode 100644 index 00000000..7c14b080 --- /dev/null +++ b/freebsd/sys/kern/vfs_cache.c @@ -0,0 +1,2604 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Poul-Henning Kamp of the FreeBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_ddb.h" +#include "opt_ktrace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#endif + +#ifdef DDB +#include +#endif + +#include + +SDT_PROVIDER_DECLARE(vfs); +SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", + "struct vnode *"); +SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", + "char *"); +SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); +SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", + "char *", "struct vnode *"); +SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); +SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", + "struct vnode *", "char *"); +SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", + "struct vnode *"); +SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, + "struct vnode *", "char *"); +SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", + "char *"); +SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); +SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); +SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); +SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", + "struct vnode *"); +SDT_PROBE_DEFINE3(vfs, namecache, zap_negative, done, "struct vnode *", + "char *", "int"); +SDT_PROBE_DEFINE3(vfs, namecache, shrink_negative, done, "struct vnode *", + "char *", "int"); + +/* + * This structure describes the elements in the cache of recent + * names looked up by namei. + */ + +struct namecache { + LIST_ENTRY(namecache) nc_hash; /* hash chain */ + LIST_ENTRY(namecache) nc_src; /* source vnode list */ + TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ + struct vnode *nc_dvp; /* vnode of parent of name */ + union { + struct vnode *nu_vp; /* vnode the name refers to */ + u_int nu_neghits; /* negative entry hits */ + } n_un; + u_char nc_flag; /* flag bits */ + u_char nc_nlen; /* length of name */ + char nc_name[0]; /* segment name + nul */ +}; + +/* + * struct namecache_ts repeats struct namecache layout up to the + * nc_nlen member. + * struct namecache_ts is used in place of struct namecache when time(s) need + * to be stored. The nc_dotdottime field is used when a cache entry is mapping + * both a non-dotdot directory name plus dotdot for the directory's + * parent. + */ +struct namecache_ts { + struct timespec nc_time; /* timespec provided by fs */ + struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ + int nc_ticks; /* ticks value when entry was added */ + struct namecache nc_nc; +}; + +#define nc_vp n_un.nu_vp +#define nc_neghits n_un.nu_neghits + +/* + * Flags in namecache.nc_flag + */ +#define NCF_WHITE 0x01 +#define NCF_ISDOTDOT 0x02 +#define NCF_TS 0x04 +#define NCF_DTS 0x08 +#define NCF_DVDROP 0x10 +#define NCF_NEGATIVE 0x20 +#define NCF_HOTNEGATIVE 0x40 + +/* + * Name caching works as follows: + * + * Names found by directory scans are retained in a cache + * for future reference. It is managed LRU, so frequently + * used names will hang around. Cache is indexed by hash value + * obtained from (dvp, name) where dvp refers to the directory + * containing name. + * + * If it is a "negative" entry, (i.e. 
for a name that is known NOT to + * exist) the vnode pointer will be NULL. + * + * Upon reaching the last segment of a path, if the reference + * is for DELETE, or NOCACHE is set (rewrite), and the + * name is located in the cache, it will be dropped. + * + * These locks are used (in the order in which they can be taken): + * NAME TYPE ROLE + * vnodelock mtx vnode lists and v_cache_dd field protection + * bucketlock rwlock for access to given set of hash buckets + * neglist mtx negative entry LRU management + * + * Additionally, ncneg_shrink_lock mtx is used to have at most one thread + * shrinking the LRU list. + * + * It is legal to take multiple vnodelock and bucketlock locks. The locking + * order is lower address first. Both are recursive. + * + * "." lookups are lockless. + * + * ".." and vnode -> name lookups require vnodelock. + * + * name -> vnode lookup requires the relevant bucketlock to be held for reading. + * + * Insertions and removals of entries require involved vnodes and bucketlocks + * to be write-locked to prevent other threads from seeing the entry. + * + * Some lookups result in removal of the found entry (e.g. getting rid of a + * negative entry with the intent to create a positive one), which poses a + * problem when multiple threads reach the state. Similarly, two different + * threads can purge two different vnodes and try to remove the same name. + * + * If the already held vnode lock is lower than the second required lock, we + * can just take the other lock. However, in the opposite case, this could + * deadlock. As such, this is resolved by trylocking and if that fails unlocking + * the first node, locking everything in order and revalidating the state. + */ + +/* + * Structures associated with name caching. + */ +#define NCHHASH(hash) \ + (&nchashtbl[(hash) & nchash]) +static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ +static u_long __read_mostly nchash; /* size of hash table */ +SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, + "Size of namecache hash table"); +static u_long __read_mostly ncnegfactor = 12; /* ratio of negative entries */ +SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, + "Ratio of negative namecache entries"); +static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ +SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, + "Number of negative entries in namecache"); +static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ +SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, + "Number of namecache entries"); +static u_long __exclusive_cache_line numcachehv;/* number of cache entries with vnodes held */ +SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, + "Number of namecache entries with vnodes held"); +u_int __read_mostly ncsizefactor = 2; +SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, + "Size factor for namecache"); +static u_int __read_mostly ncpurgeminvnodes; +SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, + "Number of vnodes below which purgevfs ignores the request"); +static u_int __read_mostly ncneghitsrequeue = 8; +SYSCTL_UINT(_vfs, OID_AUTO, ncneghitsrequeue, CTLFLAG_RW, &ncneghitsrequeue, 0, + "Number of hits to requeue a negative entry in the LRU list"); + +struct nchstats nchstats; /* cache effectiveness statistics */ + +static struct mtx ncneg_shrink_lock; +static int shrink_list_turn; + +struct 
neglist { + struct mtx nl_lock; + TAILQ_HEAD(, namecache) nl_list; +} __aligned(CACHE_LINE_SIZE); + +static struct neglist __read_mostly *neglists; +static struct neglist ncneg_hot; + +#define numneglists (ncneghash + 1) +static u_int __read_mostly ncneghash; +static inline struct neglist * +NCP2NEGLIST(struct namecache *ncp) +{ + + return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); +} + +#define numbucketlocks (ncbuckethash + 1) +static u_int __read_mostly ncbuckethash; +static struct rwlock_padalign __read_mostly *bucketlocks; +#define HASH2BUCKETLOCK(hash) \ + ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)])) + +#define numvnodelocks (ncvnodehash + 1) +static u_int __read_mostly ncvnodehash; +static struct mtx __read_mostly *vnodelocks; +static inline struct mtx * +VP2VNODELOCK(struct vnode *vp) +{ + + return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); +} + +/* + * UMA zones for the VFS cache. + * + * The small cache is used for entries with short names, which are the + * most common. The large cache is used for entries which are too big to + * fit in the small cache. + */ +static uma_zone_t __read_mostly cache_zone_small; +static uma_zone_t __read_mostly cache_zone_small_ts; +static uma_zone_t __read_mostly cache_zone_large; +static uma_zone_t __read_mostly cache_zone_large_ts; + +#define CACHE_PATH_CUTOFF 35 + +static struct namecache * +cache_alloc(int len, int ts) +{ + struct namecache_ts *ncp_ts; + struct namecache *ncp; + + if (__predict_false(ts)) { + if (len <= CACHE_PATH_CUTOFF) + ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK); + else + ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK); + ncp = &ncp_ts->nc_nc; + } else { + if (len <= CACHE_PATH_CUTOFF) + ncp = uma_zalloc(cache_zone_small, M_WAITOK); + else + ncp = uma_zalloc(cache_zone_large, M_WAITOK); + } + return (ncp); +} + +static void +cache_free(struct namecache *ncp) +{ + struct namecache_ts *ncp_ts; + + if (ncp == NULL) + return; + if ((ncp->nc_flag & NCF_DVDROP) != 0) + vdrop(ncp->nc_dvp); + if (__predict_false(ncp->nc_flag & NCF_TS)) { + ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); + if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) + uma_zfree(cache_zone_small_ts, ncp_ts); + else + uma_zfree(cache_zone_large_ts, ncp_ts); + } else { + if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) + uma_zfree(cache_zone_small, ncp); + else + uma_zfree(cache_zone_large, ncp); + } +} + +static void +cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) +{ + struct namecache_ts *ncp_ts; + + KASSERT((ncp->nc_flag & NCF_TS) != 0 || + (tsp == NULL && ticksp == NULL), + ("No NCF_TS")); + + if (tsp == NULL && ticksp == NULL) + return; + + ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); + if (tsp != NULL) + *tsp = ncp_ts->nc_time; + if (ticksp != NULL) + *ticksp = ncp_ts->nc_ticks; +} + +static int __read_mostly doingcache = 1; /* 1 => enable the cache */ +SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, + "VFS namecache enabled"); + +/* Export size information to userland */ +SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, + sizeof(struct namecache), "sizeof(struct namecache)"); + +/* + * The new name cache statistics + */ +static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, + "Name cache statistics"); +#define STATNODE_ULONG(name, descr) \ + SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); +#define STATNODE_COUNTER(name, descr) \ + static counter_u64_t __read_mostly name; \ + SYSCTL_COUNTER_U64(_vfs_cache, 
OID_AUTO, name, CTLFLAG_RD, &name, descr); +STATNODE_ULONG(numneg, "Number of negative cache entries"); +STATNODE_ULONG(numcache, "Number of cache entries"); +STATNODE_COUNTER(numcalls, "Number of cache lookups"); +STATNODE_COUNTER(dothits, "Number of '.' hits"); +STATNODE_COUNTER(dotdothits, "Number of '..' hits"); +STATNODE_COUNTER(numchecks, "Number of checks in lookup"); +STATNODE_COUNTER(nummiss, "Number of cache misses"); +STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); +STATNODE_COUNTER(numposzaps, + "Number of cache hits (positive) we do not want to cache"); +STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); +STATNODE_COUNTER(numnegzaps, + "Number of cache hits (negative) we do not want to cache"); +STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); +/* These count for kern___getcwd(), too. */ +STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); +STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); +STATNODE_COUNTER(numfullpathfail2, + "Number of fullpath search errors (VOP_VPTOCNP failures)"); +STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); +STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); +static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, + "Number of times zap_and_exit failed to lock"); +static long cache_lock_vnodes_cel_3_failures; +STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, + "Number of times 3-way vnode locking failed"); + +static void cache_zap_locked(struct namecache *ncp, bool neg_locked); +static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, + char *buf, char **retbuf, u_int buflen); + +static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); + +static int cache_yield; +SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, + "Number of times cache called yield"); + +static void +cache_maybe_yield(void) +{ + + if (should_yield()) { + cache_yield++; + kern_yield(PRI_USER); + } +} + +static inline void +cache_assert_vlp_locked(struct mtx *vlp) +{ + + if (vlp != NULL) + mtx_assert(vlp, MA_OWNED); +} + +static inline void +cache_assert_vnode_locked(struct vnode *vp) +{ + struct mtx *vlp; + + vlp = VP2VNODELOCK(vp); + cache_assert_vlp_locked(vlp); +} + +static uint32_t +cache_get_hash(char *name, u_char len, struct vnode *dvp) +{ + uint32_t hash; + + hash = fnv_32_buf(name, len, FNV1_32_INIT); + hash = fnv_32_buf(&dvp, sizeof(dvp), hash); + return (hash); +} + +static inline struct rwlock * +NCP2BUCKETLOCK(struct namecache *ncp) +{ + uint32_t hash; + + hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); + return (HASH2BUCKETLOCK(hash)); +} + +#ifdef INVARIANTS +static void +cache_assert_bucket_locked(struct namecache *ncp, int mode) +{ + struct rwlock *blp; + + blp = NCP2BUCKETLOCK(ncp); + rw_assert(blp, mode); +} +#else +#define cache_assert_bucket_locked(x, y) do { } while (0) +#endif + +#define cache_sort(x, y) _cache_sort((void **)(x), (void **)(y)) +static void +_cache_sort(void **p1, void **p2) +{ + void *tmp; + + if (*p1 > *p2) { + tmp = *p2; + *p2 = *p1; + *p1 = tmp; + } +} + +static void +cache_lock_all_buckets(void) +{ + u_int i; + + for (i = 0; i < numbucketlocks; i++) + rw_wlock(&bucketlocks[i]); +} + +static void +cache_unlock_all_buckets(void) +{ + u_int i; + + for (i = 0; i < numbucketlocks; i++) + rw_wunlock(&bucketlocks[i]); +} + +static void +cache_lock_all_vnodes(void) +{ + u_int i; + + for 
(i = 0; i < numvnodelocks; i++) + mtx_lock(&vnodelocks[i]); +} + +static void +cache_unlock_all_vnodes(void) +{ + u_int i; + + for (i = 0; i < numvnodelocks; i++) + mtx_unlock(&vnodelocks[i]); +} + +static int +cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) +{ + + cache_sort(&vlp1, &vlp2); + MPASS(vlp2 != NULL); + + if (vlp1 != NULL) { + if (!mtx_trylock(vlp1)) + return (EAGAIN); + } + if (!mtx_trylock(vlp2)) { + if (vlp1 != NULL) + mtx_unlock(vlp1); + return (EAGAIN); + } + + return (0); +} + +static void +cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) +{ + + MPASS(vlp1 != NULL || vlp2 != NULL); + + if (vlp1 != NULL) + mtx_unlock(vlp1); + if (vlp2 != NULL) + mtx_unlock(vlp2); +} + +static int +sysctl_nchstats(SYSCTL_HANDLER_ARGS) +{ + struct nchstats snap; + + if (req->oldptr == NULL) + return (SYSCTL_OUT(req, 0, sizeof(snap))); + + snap = nchstats; + snap.ncs_goodhits = counter_u64_fetch(numposhits); + snap.ncs_neghits = counter_u64_fetch(numneghits); + snap.ncs_badhits = counter_u64_fetch(numposzaps) + + counter_u64_fetch(numnegzaps); + snap.ncs_miss = counter_u64_fetch(nummisszap) + + counter_u64_fetch(nummiss); + + return (SYSCTL_OUT(req, &snap, sizeof(snap))); +} +SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | + CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", + "VFS cache effectiveness statistics"); + +#ifdef DIAGNOSTIC +/* + * Grab an atomic snapshot of the name cache hash chain lengths + */ +static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, + "hash table stats"); + +static int +sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) +{ + struct nchashhead *ncpp; + struct namecache *ncp; + int i, error, n_nchash, *cntbuf; + +retry: + n_nchash = nchash + 1; /* nchash is max index, not count */ + if (req->oldptr == NULL) + return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); + cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); + cache_lock_all_buckets(); + if (n_nchash != nchash + 1) { + cache_unlock_all_buckets(); + free(cntbuf, M_TEMP); + goto retry; + } + /* Scan hash tables counting entries */ + for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) + LIST_FOREACH(ncp, ncpp, nc_hash) + cntbuf[i]++; + cache_unlock_all_buckets(); + for (error = 0, i = 0; i < n_nchash; i++) + if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) + break; + free(cntbuf, M_TEMP); + return (error); +} +SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| + CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", + "nchash chain lengths"); + +static int +sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) +{ + int error; + struct nchashhead *ncpp; + struct namecache *ncp; + int n_nchash; + int count, maxlength, used, pct; + + if (!req->oldptr) + return SYSCTL_OUT(req, 0, 4 * sizeof(int)); + + cache_lock_all_buckets(); + n_nchash = nchash + 1; /* nchash is max index, not count */ + used = 0; + maxlength = 0; + + /* Scan hash tables for applicable entries */ + for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { + count = 0; + LIST_FOREACH(ncp, ncpp, nc_hash) { + count++; + } + if (count) + used++; + if (maxlength < count) + maxlength = count; + } + n_nchash = nchash + 1; + cache_unlock_all_buckets(); + pct = (used * 100) / (n_nchash / 100); + error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); + if (error) + return (error); + error = SYSCTL_OUT(req, &used, sizeof(used)); + if (error) + return (error); + error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); + if (error) + return (error); + 
error = SYSCTL_OUT(req, &pct, sizeof(pct)); + if (error) + return (error); + return (0); +} +SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| + CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", + "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); +#endif + +/* + * Negative entries management + * + * A variation of LRU scheme is used. New entries are hashed into one of + * numneglists cold lists. Entries get promoted to the hot list on first hit. + * Partial LRU for the hot list is maintained by requeueing them every + * ncneghitsrequeue hits. + * + * The shrinker will demote hot list head and evict from the cold list in a + * round-robin manner. + */ +static void +cache_negative_hit(struct namecache *ncp) +{ + struct neglist *neglist; + u_int hits; + + MPASS(ncp->nc_flag & NCF_NEGATIVE); + hits = atomic_fetchadd_int(&ncp->nc_neghits, 1); + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + if ((hits % ncneghitsrequeue) != 0) + return; + mtx_lock(&ncneg_hot.nl_lock); + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); + TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); + mtx_unlock(&ncneg_hot.nl_lock); + return; + } + /* + * The shrinker cleared the flag and removed the entry from + * the hot list. Put it back. + */ + } else { + mtx_lock(&ncneg_hot.nl_lock); + } + neglist = NCP2NEGLIST(ncp); + mtx_lock(&neglist->nl_lock); + if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { + TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); + TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); + ncp->nc_flag |= NCF_HOTNEGATIVE; + } + mtx_unlock(&neglist->nl_lock); + mtx_unlock(&ncneg_hot.nl_lock); +} + +static void +cache_negative_insert(struct namecache *ncp, bool neg_locked) +{ + struct neglist *neglist; + + MPASS(ncp->nc_flag & NCF_NEGATIVE); + cache_assert_bucket_locked(ncp, RA_WLOCKED); + neglist = NCP2NEGLIST(ncp); + if (!neg_locked) { + mtx_lock(&neglist->nl_lock); + } else { + mtx_assert(&neglist->nl_lock, MA_OWNED); + } + TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); + if (!neg_locked) + mtx_unlock(&neglist->nl_lock); + atomic_add_rel_long(&numneg, 1); +} + +static void +cache_negative_remove(struct namecache *ncp, bool neg_locked) +{ + struct neglist *neglist; + bool hot_locked = false; + bool list_locked = false; + + MPASS(ncp->nc_flag & NCF_NEGATIVE); + cache_assert_bucket_locked(ncp, RA_WLOCKED); + neglist = NCP2NEGLIST(ncp); + if (!neg_locked) { + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + hot_locked = true; + mtx_lock(&ncneg_hot.nl_lock); + if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { + list_locked = true; + mtx_lock(&neglist->nl_lock); + } + } else { + list_locked = true; + mtx_lock(&neglist->nl_lock); + } + } + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); + TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); + } else { + mtx_assert(&neglist->nl_lock, MA_OWNED); + TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); + } + if (list_locked) + mtx_unlock(&neglist->nl_lock); + if (hot_locked) + mtx_unlock(&ncneg_hot.nl_lock); + atomic_subtract_rel_long(&numneg, 1); +} + +static void +cache_negative_shrink_select(int start, struct namecache **ncpp, + struct neglist **neglistpp) +{ + struct neglist *neglist; + struct namecache *ncp; + int i; + + *ncpp = ncp = NULL; + neglist = NULL; + + for (i = start; i < numneglists; i++) { + neglist = &neglists[i]; + if (TAILQ_FIRST(&neglist->nl_list) == NULL) + continue; + mtx_lock(&neglist->nl_lock); + ncp = TAILQ_FIRST(&neglist->nl_list); + if (ncp != 
NULL) + break; + mtx_unlock(&neglist->nl_lock); + } + + *neglistpp = neglist; + *ncpp = ncp; +} + +static void +cache_negative_zap_one(void) +{ + struct namecache *ncp, *ncp2; + struct neglist *neglist; + struct mtx *dvlp; + struct rwlock *blp; + + if (!mtx_trylock(&ncneg_shrink_lock)) + return; + + mtx_lock(&ncneg_hot.nl_lock); + ncp = TAILQ_FIRST(&ncneg_hot.nl_list); + if (ncp != NULL) { + neglist = NCP2NEGLIST(ncp); + mtx_lock(&neglist->nl_lock); + TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); + TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); + ncp->nc_flag &= ~NCF_HOTNEGATIVE; + mtx_unlock(&neglist->nl_lock); + } + + cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); + shrink_list_turn++; + if (shrink_list_turn == numneglists) + shrink_list_turn = 0; + if (ncp == NULL && shrink_list_turn == 0) + cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); + if (ncp == NULL) { + mtx_unlock(&ncneg_hot.nl_lock); + goto out; + } + + MPASS(ncp->nc_flag & NCF_NEGATIVE); + dvlp = VP2VNODELOCK(ncp->nc_dvp); + blp = NCP2BUCKETLOCK(ncp); + mtx_unlock(&neglist->nl_lock); + mtx_unlock(&ncneg_hot.nl_lock); + mtx_lock(dvlp); + rw_wlock(blp); + mtx_lock(&neglist->nl_lock); + ncp2 = TAILQ_FIRST(&neglist->nl_list); + if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || + blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) { + ncp = NULL; + goto out_unlock_all; + } + SDT_PROBE3(vfs, namecache, shrink_negative, done, ncp->nc_dvp, + ncp->nc_name, ncp->nc_neghits); + + cache_zap_locked(ncp, true); +out_unlock_all: + mtx_unlock(&neglist->nl_lock); + rw_wunlock(blp); + mtx_unlock(dvlp); +out: + mtx_unlock(&ncneg_shrink_lock); + cache_free(ncp); +} + +/* + * cache_zap_locked(): + * + * Removes a namecache entry from cache, whether it contains an actual + * pointer to a vnode or if it is just a negative cache entry. + */ +static void +cache_zap_locked(struct namecache *ncp, bool neg_locked) +{ + + if (!(ncp->nc_flag & NCF_NEGATIVE)) + cache_assert_vnode_locked(ncp->nc_vp); + cache_assert_vnode_locked(ncp->nc_dvp); + cache_assert_bucket_locked(ncp, RA_WLOCKED); + + CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, + (ncp->nc_flag & NCF_NEGATIVE) ? 
NULL : ncp->nc_vp); + if (!(ncp->nc_flag & NCF_NEGATIVE)) { + SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, + ncp->nc_name, ncp->nc_vp); + } else { + SDT_PROBE3(vfs, namecache, zap_negative, done, ncp->nc_dvp, + ncp->nc_name, ncp->nc_neghits); + } + LIST_REMOVE(ncp, nc_hash); + if (!(ncp->nc_flag & NCF_NEGATIVE)) { + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); + if (ncp == ncp->nc_vp->v_cache_dd) + ncp->nc_vp->v_cache_dd = NULL; + } else { + cache_negative_remove(ncp, neg_locked); + } + if (ncp->nc_flag & NCF_ISDOTDOT) { + if (ncp == ncp->nc_dvp->v_cache_dd) + ncp->nc_dvp->v_cache_dd = NULL; + } else { + LIST_REMOVE(ncp, nc_src); + if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { + ncp->nc_flag |= NCF_DVDROP; + atomic_subtract_rel_long(&numcachehv, 1); + } + } + atomic_subtract_rel_long(&numcache, 1); +} + +static void +cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) +{ + struct rwlock *blp; + + MPASS(ncp->nc_dvp == vp); + MPASS(ncp->nc_flag & NCF_NEGATIVE); + cache_assert_vnode_locked(vp); + + blp = NCP2BUCKETLOCK(ncp); + rw_wlock(blp); + cache_zap_locked(ncp, false); + rw_wunlock(blp); +} + +static bool +cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, + struct mtx **vlpp) +{ + struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; + struct rwlock *blp; + + MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); + cache_assert_vnode_locked(vp); + + if (ncp->nc_flag & NCF_NEGATIVE) { + if (*vlpp != NULL) { + mtx_unlock(*vlpp); + *vlpp = NULL; + } + cache_zap_negative_locked_vnode_kl(ncp, vp); + return (true); + } + + pvlp = VP2VNODELOCK(vp); + blp = NCP2BUCKETLOCK(ncp); + vlp1 = VP2VNODELOCK(ncp->nc_dvp); + vlp2 = VP2VNODELOCK(ncp->nc_vp); + + if (*vlpp == vlp1 || *vlpp == vlp2) { + to_unlock = *vlpp; + *vlpp = NULL; + } else { + if (*vlpp != NULL) { + mtx_unlock(*vlpp); + *vlpp = NULL; + } + cache_sort(&vlp1, &vlp2); + if (vlp1 == pvlp) { + mtx_lock(vlp2); + to_unlock = vlp2; + } else { + if (!mtx_trylock(vlp1)) + goto out_relock; + to_unlock = vlp1; + } + } + rw_wlock(blp); + cache_zap_locked(ncp, false); + rw_wunlock(blp); + if (to_unlock != NULL) + mtx_unlock(to_unlock); + return (true); + +out_relock: + mtx_unlock(vlp2); + mtx_lock(vlp1); + mtx_lock(vlp2); + MPASS(*vlpp == NULL); + *vlpp = vlp1; + return (false); +} + +static int +cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) +{ + struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; + struct rwlock *blp; + int error = 0; + + MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); + cache_assert_vnode_locked(vp); + + pvlp = VP2VNODELOCK(vp); + if (ncp->nc_flag & NCF_NEGATIVE) { + cache_zap_negative_locked_vnode_kl(ncp, vp); + goto out; + } + + blp = NCP2BUCKETLOCK(ncp); + vlp1 = VP2VNODELOCK(ncp->nc_dvp); + vlp2 = VP2VNODELOCK(ncp->nc_vp); + cache_sort(&vlp1, &vlp2); + if (vlp1 == pvlp) { + mtx_lock(vlp2); + to_unlock = vlp2; + } else { + if (!mtx_trylock(vlp1)) { + error = EAGAIN; + goto out; + } + to_unlock = vlp1; + } + rw_wlock(blp); + cache_zap_locked(ncp, false); + rw_wunlock(blp); + mtx_unlock(to_unlock); +out: + mtx_unlock(pvlp); + return (error); +} + +static int +cache_zap_wlocked_bucket(struct namecache *ncp, struct rwlock *blp) +{ + struct mtx *dvlp, *vlp; + + cache_assert_bucket_locked(ncp, RA_WLOCKED); + + dvlp = VP2VNODELOCK(ncp->nc_dvp); + vlp = NULL; + if (!(ncp->nc_flag & NCF_NEGATIVE)) + vlp = VP2VNODELOCK(ncp->nc_vp); + if (cache_trylock_vnodes(dvlp, vlp) == 0) { + cache_zap_locked(ncp, false); + rw_wunlock(blp); + cache_unlock_vnodes(dvlp, vlp); + return (0); + } + + 
rw_wunlock(blp); + return (EAGAIN); +} + +static int +cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp) +{ + struct mtx *dvlp, *vlp; + + cache_assert_bucket_locked(ncp, RA_RLOCKED); + + dvlp = VP2VNODELOCK(ncp->nc_dvp); + vlp = NULL; + if (!(ncp->nc_flag & NCF_NEGATIVE)) + vlp = VP2VNODELOCK(ncp->nc_vp); + if (cache_trylock_vnodes(dvlp, vlp) == 0) { + rw_runlock(blp); + rw_wlock(blp); + cache_zap_locked(ncp, false); + rw_wunlock(blp); + cache_unlock_vnodes(dvlp, vlp); + return (0); + } + + rw_runlock(blp); + return (EAGAIN); +} + +static int +cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, + struct mtx **vlpp1, struct mtx **vlpp2) +{ + struct mtx *dvlp, *vlp; + + cache_assert_bucket_locked(ncp, RA_WLOCKED); + + dvlp = VP2VNODELOCK(ncp->nc_dvp); + vlp = NULL; + if (!(ncp->nc_flag & NCF_NEGATIVE)) + vlp = VP2VNODELOCK(ncp->nc_vp); + cache_sort(&dvlp, &vlp); + + if (*vlpp1 == dvlp && *vlpp2 == vlp) { + cache_zap_locked(ncp, false); + cache_unlock_vnodes(dvlp, vlp); + *vlpp1 = NULL; + *vlpp2 = NULL; + return (0); + } + + if (*vlpp1 != NULL) + mtx_unlock(*vlpp1); + if (*vlpp2 != NULL) + mtx_unlock(*vlpp2); + *vlpp1 = NULL; + *vlpp2 = NULL; + + if (cache_trylock_vnodes(dvlp, vlp) == 0) { + cache_zap_locked(ncp, false); + cache_unlock_vnodes(dvlp, vlp); + return (0); + } + + rw_wunlock(blp); + *vlpp1 = dvlp; + *vlpp2 = vlp; + if (*vlpp1 != NULL) + mtx_lock(*vlpp1); + mtx_lock(*vlpp2); + rw_wlock(blp); + return (EAGAIN); +} + +static void +cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) +{ + + if (blp != NULL) { + rw_runlock(blp); + } else { + mtx_unlock(vlp); + } +} + +static int __noinline +cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, + struct timespec *tsp, int *ticksp) +{ + int ltype; + + *vpp = dvp; + CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", + dvp, cnp->cn_nameptr); + counter_u64_add(dothits, 1); + SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); + if (tsp != NULL) + timespecclear(tsp); + if (ticksp != NULL) + *ticksp = ticks; + vrefact(*vpp); + /* + * When we lookup "." we still can be asked to lock it + * differently... + */ + ltype = cnp->cn_lkflags & LK_TYPE_MASK; + if (ltype != VOP_ISLOCKED(*vpp)) { + if (ltype == LK_EXCLUSIVE) { + vn_lock(*vpp, LK_UPGRADE | LK_RETRY); + if ((*vpp)->v_iflag & VI_DOOMED) { + /* forced unmount */ + vrele(*vpp); + *vpp = NULL; + return (ENOENT); + } + } else + vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); + } + return (-1); +} + +static __noinline int +cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp, struct timespec *tsp, int *ticksp) +{ + struct namecache *ncp; + struct rwlock *blp; + struct mtx *dvlp, *dvlp2; + uint32_t hash; + int error; + + if (cnp->cn_namelen == 2 && + cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { + counter_u64_add(dotdothits, 1); + dvlp = VP2VNODELOCK(dvp); + dvlp2 = NULL; + mtx_lock(dvlp); +retry_dotdot: + ncp = dvp->v_cache_dd; + if (ncp == NULL) { + SDT_PROBE3(vfs, namecache, lookup, miss, dvp, + "..", NULL); + mtx_unlock(dvlp); + if (dvlp2 != NULL) + mtx_unlock(dvlp2); + return (0); + } + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { + if (ncp->nc_dvp != dvp) + panic("dvp %p v_cache_dd %p\n", dvp, ncp); + if (!cache_zap_locked_vnode_kl2(ncp, + dvp, &dvlp2)) + goto retry_dotdot; + MPASS(dvp->v_cache_dd == NULL); + mtx_unlock(dvlp); + if (dvlp2 != NULL) + mtx_unlock(dvlp2); + cache_free(ncp); + } else { + dvp->v_cache_dd = NULL; + mtx_unlock(dvlp); + if (dvlp2 != NULL) + mtx_unlock(dvlp2); + } + return (0); + } + + hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); + blp = HASH2BUCKETLOCK(hash); +retry: + if (LIST_EMPTY(NCHHASH(hash))) + goto out_no_entry; + + rw_wlock(blp); + + LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { + counter_u64_add(numchecks, 1); + if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && + !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) + break; + } + + /* We failed to find an entry */ + if (ncp == NULL) { + rw_wunlock(blp); + goto out_no_entry; + } + + counter_u64_add(numposzaps, 1); + + error = cache_zap_wlocked_bucket(ncp, blp); + if (error != 0) { + zap_and_exit_bucket_fail++; + cache_maybe_yield(); + goto retry; + } + cache_free(ncp); + return (0); +out_no_entry: + SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); + counter_u64_add(nummisszap, 1); + return (0); +} + +/** + * Lookup a name in the name cache + * + * # Arguments + * + * - dvp: Parent directory in which to search. + * - vpp: Return argument. Will contain desired vnode on cache hit. + * - cnp: Parameters of the name search. The most interesting bits of + * the cn_flags field have the following meanings: + * - MAKEENTRY: If clear, free an entry from the cache rather than look + * it up. + * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." + * - tsp: Return storage for cache timestamp. On a successful (positive + * or negative) lookup, tsp will be filled with any timespec that + * was stored when this cache entry was created. However, it will + * be clear for "." entries. + * - ticks: Return storage for alternate cache timestamp. On a successful + * (positive or negative) lookup, it will contain the ticks value + * that was current when the cache entry was created, unless cnp + * was ".". + * + * # Returns + * + * - -1: A positive cache hit. vpp will contain the desired vnode. + * - ENOENT: A negative cache hit, or dvp was recycled out from under us due + * to a forced unmount. vpp will not be modified. If the entry + * is a whiteout, then the ISWHITEOUT flag will be set in + * cnp->cn_flags. + * - 0: A cache miss. vpp will not be modified. + * + * # Locking + * + * On a cache hit, vpp will be returned locked and ref'd. If we're looking up + * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the + * lock is not recursively acquired. 
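+ *
+ * # Example
+ *
+ * An illustrative sketch only (not part of the imported FreeBSD sources); a
+ * filesystem lookup routine typically consults the cache first and falls
+ * back to a real directory scan on a miss, exactly as vfs_cache_lookup()
+ * below does:
+ *
+ *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
+ *	if (error == 0)		/* miss: do the real lookup */
+ *		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
+ *	if (error == -1)	/* positive hit: *vpp is locked and ref'd */
+ *		return (0);
+ *	return (error);		/* ENOENT (negative hit) or other error */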
+ */ +int +cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, + struct timespec *tsp, int *ticksp) +{ + struct namecache_ts *ncp_ts; + struct namecache *ncp; + struct rwlock *blp; + struct mtx *dvlp; + uint32_t hash; + int error, ltype; + + if (__predict_false(!doingcache)) { + cnp->cn_flags &= ~MAKEENTRY; + return (0); + } + + counter_u64_add(numcalls, 1); + + if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) + return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); + + if ((cnp->cn_flags & MAKEENTRY) == 0) + return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); + +retry: + blp = NULL; + dvlp = NULL; + error = 0; + if (cnp->cn_namelen == 2 && + cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { + counter_u64_add(dotdothits, 1); + dvlp = VP2VNODELOCK(dvp); + mtx_lock(dvlp); + ncp = dvp->v_cache_dd; + if (ncp == NULL) { + SDT_PROBE3(vfs, namecache, lookup, miss, dvp, + "..", NULL); + mtx_unlock(dvlp); + return (0); + } + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { + if (ncp->nc_flag & NCF_NEGATIVE) + *vpp = NULL; + else + *vpp = ncp->nc_vp; + } else + *vpp = ncp->nc_dvp; + /* Return failure if negative entry was found. */ + if (*vpp == NULL) + goto negative_success; + CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", + dvp, cnp->cn_nameptr, *vpp); + SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", + *vpp); + cache_out_ts(ncp, tsp, ticksp); + if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == + NCF_DTS && tsp != NULL) { + ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); + *tsp = ncp_ts->nc_dotdottime; + } + goto success; + } + + hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); + blp = HASH2BUCKETLOCK(hash); + rw_rlock(blp); + + LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { + counter_u64_add(numchecks, 1); + if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && + !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) + break; + } + + /* We failed to find an entry */ + if (ncp == NULL) { + rw_runlock(blp); + SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, + NULL); + counter_u64_add(nummiss, 1); + return (0); + } + + /* We found a "positive" match, return the vnode */ + if (!(ncp->nc_flag & NCF_NEGATIVE)) { + counter_u64_add(numposhits, 1); + *vpp = ncp->nc_vp; + CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", + dvp, cnp->cn_nameptr, *vpp, ncp); + SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, + *vpp); + cache_out_ts(ncp, tsp, ticksp); + goto success; + } + +negative_success: + /* We found a negative match, and want to create it, so purge */ + if (cnp->cn_nameiop == CREATE) { + counter_u64_add(numnegzaps, 1); + goto zap_and_exit; + } + + counter_u64_add(numneghits, 1); + cache_negative_hit(ncp); + if (ncp->nc_flag & NCF_WHITE) + cnp->cn_flags |= ISWHITEOUT; + SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, + ncp->nc_name); + cache_out_ts(ncp, tsp, ticksp); + cache_lookup_unlock(blp, dvlp); + return (ENOENT); + +success: + /* + * On success we return a locked and ref'd vnode as per the lookup + * protocol. 
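+	 *
+	 * For a ".." lookup dvp is the child and *vpp its parent, so dvp is
+	 * unlocked before vget(*vpp) and relocked afterwards to preserve the
+	 * parent-before-child lock order; the VI_DOOMED check below catches a
+	 * forced unmount that doomed dvp while it was unlocked.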
+ */ + MPASS(dvp != *vpp); + ltype = 0; /* silence gcc warning */ + if (cnp->cn_flags & ISDOTDOT) { + ltype = VOP_ISLOCKED(dvp); + VOP_UNLOCK(dvp, 0); + } + vhold(*vpp); + cache_lookup_unlock(blp, dvlp); + error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread); + if (cnp->cn_flags & ISDOTDOT) { + vn_lock(dvp, ltype | LK_RETRY); + if (dvp->v_iflag & VI_DOOMED) { + if (error == 0) + vput(*vpp); + *vpp = NULL; + return (ENOENT); + } + } + if (error) { + *vpp = NULL; + goto retry; + } + if ((cnp->cn_flags & ISLASTCN) && + (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { + ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); + } + return (-1); + +zap_and_exit: + if (blp != NULL) + error = cache_zap_rlocked_bucket(ncp, blp); + else + error = cache_zap_locked_vnode(ncp, dvp); + if (error != 0) { + zap_and_exit_bucket_fail++; + cache_maybe_yield(); + goto retry; + } + cache_free(ncp); + return (0); +} + +struct celockstate { + struct mtx *vlp[3]; + struct rwlock *blp[2]; +}; +CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); +CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); + +static inline void +cache_celockstate_init(struct celockstate *cel) +{ + + bzero(cel, sizeof(*cel)); +} + +static void +cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, + struct vnode *dvp) +{ + struct mtx *vlp1, *vlp2; + + MPASS(cel->vlp[0] == NULL); + MPASS(cel->vlp[1] == NULL); + MPASS(cel->vlp[2] == NULL); + + MPASS(vp != NULL || dvp != NULL); + + vlp1 = VP2VNODELOCK(vp); + vlp2 = VP2VNODELOCK(dvp); + cache_sort(&vlp1, &vlp2); + + if (vlp1 != NULL) { + mtx_lock(vlp1); + cel->vlp[0] = vlp1; + } + mtx_lock(vlp2); + cel->vlp[1] = vlp2; +} + +static void +cache_unlock_vnodes_cel(struct celockstate *cel) +{ + + MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); + + if (cel->vlp[0] != NULL) + mtx_unlock(cel->vlp[0]); + if (cel->vlp[1] != NULL) + mtx_unlock(cel->vlp[1]); + if (cel->vlp[2] != NULL) + mtx_unlock(cel->vlp[2]); +} + +static bool +cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) +{ + struct mtx *vlp; + bool ret; + + cache_assert_vlp_locked(cel->vlp[0]); + cache_assert_vlp_locked(cel->vlp[1]); + MPASS(cel->vlp[2] == NULL); + + MPASS(vp != NULL); + vlp = VP2VNODELOCK(vp); + + ret = true; + if (vlp >= cel->vlp[1]) { + mtx_lock(vlp); + } else { + if (mtx_trylock(vlp)) + goto out; + cache_lock_vnodes_cel_3_failures++; + cache_unlock_vnodes_cel(cel); + if (vlp < cel->vlp[0]) { + mtx_lock(vlp); + mtx_lock(cel->vlp[0]); + mtx_lock(cel->vlp[1]); + } else { + if (cel->vlp[0] != NULL) + mtx_lock(cel->vlp[0]); + mtx_lock(vlp); + mtx_lock(cel->vlp[1]); + } + ret = false; + } +out: + cel->vlp[2] = vlp; + return (ret); +} + +static void +cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, + struct rwlock *blp2) +{ + + MPASS(cel->blp[0] == NULL); + MPASS(cel->blp[1] == NULL); + + cache_sort(&blp1, &blp2); + + if (blp1 != NULL) { + rw_wlock(blp1); + cel->blp[0] = blp1; + } + rw_wlock(blp2); + cel->blp[1] = blp2; +} + +static void +cache_unlock_buckets_cel(struct celockstate *cel) +{ + + if (cel->blp[0] != NULL) + rw_wunlock(cel->blp[0]); + rw_wunlock(cel->blp[1]); +} + +/* + * Lock part of the cache affected by the insertion. + * + * This means vnodelocks for dvp, vp and the relevant bucketlock. + * However, insertion can result in removal of an old entry. In this + * case we have an additional vnode and bucketlock pair to lock. If the + * entry is negative, ncelock is locked instead of the vnode. 
+ * + * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while + * preserving the locking order (smaller address first). + */ +static void +cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, + uint32_t hash) +{ + struct namecache *ncp; + struct rwlock *blps[2]; + + blps[0] = HASH2BUCKETLOCK(hash); + for (;;) { + blps[1] = NULL; + cache_lock_vnodes_cel(cel, dvp, vp); + if (vp == NULL || vp->v_type != VDIR) + break; + ncp = vp->v_cache_dd; + if (ncp == NULL) + break; + if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) + break; + MPASS(ncp->nc_dvp == vp); + blps[1] = NCP2BUCKETLOCK(ncp); + if (ncp->nc_flag & NCF_NEGATIVE) + break; + if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) + break; + /* + * All vnodes got re-locked. Re-validate the state and if + * nothing changed we are done. Otherwise restart. + */ + if (ncp == vp->v_cache_dd && + (ncp->nc_flag & NCF_ISDOTDOT) != 0 && + blps[1] == NCP2BUCKETLOCK(ncp) && + VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) + break; + cache_unlock_vnodes_cel(cel); + cel->vlp[0] = NULL; + cel->vlp[1] = NULL; + cel->vlp[2] = NULL; + } + cache_lock_buckets_cel(cel, blps[0], blps[1]); +} + +static void +cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, + uint32_t hash) +{ + struct namecache *ncp; + struct rwlock *blps[2]; + + blps[0] = HASH2BUCKETLOCK(hash); + for (;;) { + blps[1] = NULL; + cache_lock_vnodes_cel(cel, dvp, vp); + ncp = dvp->v_cache_dd; + if (ncp == NULL) + break; + if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) + break; + MPASS(ncp->nc_dvp == dvp); + blps[1] = NCP2BUCKETLOCK(ncp); + if (ncp->nc_flag & NCF_NEGATIVE) + break; + if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) + break; + if (ncp == dvp->v_cache_dd && + (ncp->nc_flag & NCF_ISDOTDOT) != 0 && + blps[1] == NCP2BUCKETLOCK(ncp) && + VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) + break; + cache_unlock_vnodes_cel(cel); + cel->vlp[0] = NULL; + cel->vlp[1] = NULL; + cel->vlp[2] = NULL; + } + cache_lock_buckets_cel(cel, blps[0], blps[1]); +} + +static void +cache_enter_unlock(struct celockstate *cel) +{ + + cache_unlock_buckets_cel(cel); + cache_unlock_vnodes_cel(cel); +} + +/* + * Add an entry to the cache. + */ +void +cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, + struct timespec *tsp, struct timespec *dtsp) +{ + struct celockstate cel; + struct namecache *ncp, *n2, *ndd; + struct namecache_ts *ncp_ts, *n2_ts; + struct nchashhead *ncpp; + struct neglist *neglist; + uint32_t hash; + int flag; + int len; + bool neg_locked; + int lnumcache; + + CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); + VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, + ("cache_enter: Adding a doomed vnode")); + VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp, + ("cache_enter: Doomed vnode used as src")); + + if (__predict_false(!doingcache)) + return; + + /* + * Avoid blowout in namecache entries. + */ + if (__predict_false(numcache >= desiredvnodes * ncsizefactor)) + return; + + cache_celockstate_init(&cel); + ndd = NULL; + ncp_ts = NULL; + flag = 0; + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) + return; + if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + len = cnp->cn_namelen; + hash = cache_get_hash(cnp->cn_nameptr, len, dvp); + cache_enter_lock_dd(&cel, dvp, vp, hash); + /* + * If dotdot entry already exists, just retarget it + * to new parent vnode, otherwise continue with new + * namecache entry allocation. 
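+			 * Retargeting covers, for instance, a directory that
+			 * has been moved under a different parent: when ".."
+			 * is entered again the existing entry is re-pointed
+			 * at the new parent vnode rather than reallocated.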
+ */ + if ((ncp = dvp->v_cache_dd) != NULL && + ncp->nc_flag & NCF_ISDOTDOT) { + KASSERT(ncp->nc_dvp == dvp, + ("wrong isdotdot parent")); + neg_locked = false; + if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) { + neglist = NCP2NEGLIST(ncp); + mtx_lock(&ncneg_hot.nl_lock); + mtx_lock(&neglist->nl_lock); + neg_locked = true; + } + if (!(ncp->nc_flag & NCF_NEGATIVE)) { + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, + ncp, nc_dst); + } else { + cache_negative_remove(ncp, true); + } + if (vp != NULL) { + TAILQ_INSERT_HEAD(&vp->v_cache_dst, + ncp, nc_dst); + ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE); + } else { + ncp->nc_flag &= ~(NCF_HOTNEGATIVE); + ncp->nc_flag |= NCF_NEGATIVE; + cache_negative_insert(ncp, true); + } + if (neg_locked) { + mtx_unlock(&neglist->nl_lock); + mtx_unlock(&ncneg_hot.nl_lock); + } + ncp->nc_vp = vp; + cache_enter_unlock(&cel); + return; + } + dvp->v_cache_dd = NULL; + cache_enter_unlock(&cel); + cache_celockstate_init(&cel); + SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp); + flag = NCF_ISDOTDOT; + } + } + + /* + * Calculate the hash key and setup as much of the new + * namecache entry as possible before acquiring the lock. + */ + ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); + ncp->nc_flag = flag; + ncp->nc_vp = vp; + if (vp == NULL) + ncp->nc_flag |= NCF_NEGATIVE; + ncp->nc_dvp = dvp; + if (tsp != NULL) { + ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); + ncp_ts->nc_time = *tsp; + ncp_ts->nc_ticks = ticks; + ncp_ts->nc_nc.nc_flag |= NCF_TS; + if (dtsp != NULL) { + ncp_ts->nc_dotdottime = *dtsp; + ncp_ts->nc_nc.nc_flag |= NCF_DTS; + } + } + len = ncp->nc_nlen = cnp->cn_namelen; + hash = cache_get_hash(cnp->cn_nameptr, len, dvp); + strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); + cache_enter_lock(&cel, dvp, vp, hash); + + /* + * See if this vnode or negative entry is already in the cache + * with this name. This can happen with concurrent lookups of + * the same path name. + */ + ncpp = NCHHASH(hash); + LIST_FOREACH(n2, ncpp, nc_hash) { + if (n2->nc_dvp == dvp && + n2->nc_nlen == cnp->cn_namelen && + !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { + if (tsp != NULL) { + KASSERT((n2->nc_flag & NCF_TS) != 0, + ("no NCF_TS")); + n2_ts = __containerof(n2, struct namecache_ts, nc_nc); + n2_ts->nc_time = ncp_ts->nc_time; + n2_ts->nc_ticks = ncp_ts->nc_ticks; + if (dtsp != NULL) { + n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; + if (ncp->nc_flag & NCF_NEGATIVE) + mtx_lock(&ncneg_hot.nl_lock); + n2_ts->nc_nc.nc_flag |= NCF_DTS; + if (ncp->nc_flag & NCF_NEGATIVE) + mtx_unlock(&ncneg_hot.nl_lock); + } + } + goto out_unlock_free; + } + } + + if (flag == NCF_ISDOTDOT) { + /* + * See if we are trying to add .. entry, but some other lookup + * has populated v_cache_dd pointer already. + */ + if (dvp->v_cache_dd != NULL) + goto out_unlock_free; + KASSERT(vp == NULL || vp->v_type == VDIR, + ("wrong vnode type %p", vp)); + dvp->v_cache_dd = ncp; + } + + if (vp != NULL) { + if (vp->v_type == VDIR) { + if (flag != NCF_ISDOTDOT) { + /* + * For this case, the cache entry maps both the + * directory name in it and the name ".." for the + * directory's parent. 
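+				 * For example, the entry created by looking
+				 * up "b" in /a also serves a later ".." lookup
+				 * issued from inside /a/b, resolving it back
+				 * to /a through vp->v_cache_dd.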
+ */ + if ((ndd = vp->v_cache_dd) != NULL) { + if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) + cache_zap_locked(ndd, false); + else + ndd = NULL; + } + vp->v_cache_dd = ncp; + } + } else { + vp->v_cache_dd = NULL; + } + } + + if (flag != NCF_ISDOTDOT) { + if (LIST_EMPTY(&dvp->v_cache_src)) { + vhold(dvp); + atomic_add_rel_long(&numcachehv, 1); + } + LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); + } + + /* + * Insert the new namecache entry into the appropriate chain + * within the cache entries table. + */ + LIST_INSERT_HEAD(ncpp, ncp, nc_hash); + + /* + * If the entry is "negative", we place it into the + * "negative" cache queue, otherwise, we place it into the + * destination vnode's cache entries queue. + */ + if (vp != NULL) { + TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); + SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, + vp); + } else { + if (cnp->cn_flags & ISWHITEOUT) + ncp->nc_flag |= NCF_WHITE; + cache_negative_insert(ncp, false); + SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, + ncp->nc_name); + } + cache_enter_unlock(&cel); + lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; + if (numneg * ncnegfactor > lnumcache) + cache_negative_zap_one(); + cache_free(ndd); + return; +out_unlock_free: + cache_enter_unlock(&cel); + cache_free(ncp); + return; +} + +static u_int +cache_roundup_2(u_int val) +{ + u_int res; + + for (res = 1; res <= val; res <<= 1) + continue; + + return (res); +} + +/* + * Name cache initialization, from vfs_init() when we are booting + */ +static void +nchinit(void *dummy __unused) +{ + u_int i; + + cache_zone_small = uma_zcreate("S VFS Cache", + sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, + NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), + UMA_ZONE_ZINIT); + cache_zone_small_ts = uma_zcreate("STS VFS Cache", + sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, + NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), + UMA_ZONE_ZINIT); + cache_zone_large = uma_zcreate("L VFS Cache", + sizeof(struct namecache) + NAME_MAX + 1, + NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), + UMA_ZONE_ZINIT); + cache_zone_large_ts = uma_zcreate("LTS VFS Cache", + sizeof(struct namecache_ts) + NAME_MAX + 1, + NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), + UMA_ZONE_ZINIT); + + nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); + ncbuckethash = cache_roundup_2(mp_ncpus * 64) - 1; + if (ncbuckethash > nchash) + ncbuckethash = nchash; + bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, + M_WAITOK | M_ZERO); + for (i = 0; i < numbucketlocks; i++) + rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); + ncvnodehash = cache_roundup_2(mp_ncpus * 64) - 1; + vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, + M_WAITOK | M_ZERO); + for (i = 0; i < numvnodelocks; i++) + mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); + ncpurgeminvnodes = numbucketlocks; + + ncneghash = 3; + neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, + M_WAITOK | M_ZERO); + for (i = 0; i < numneglists; i++) { + mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); + TAILQ_INIT(&neglists[i].nl_list); + } + mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); + TAILQ_INIT(&ncneg_hot.nl_list); + + mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); + + numcalls = counter_u64_alloc(M_WAITOK); + dothits = counter_u64_alloc(M_WAITOK); + dotdothits = counter_u64_alloc(M_WAITOK); + numchecks = counter_u64_alloc(M_WAITOK); + nummiss = counter_u64_alloc(M_WAITOK); + 
nummisszap = counter_u64_alloc(M_WAITOK); + numposzaps = counter_u64_alloc(M_WAITOK); + numposhits = counter_u64_alloc(M_WAITOK); + numnegzaps = counter_u64_alloc(M_WAITOK); + numneghits = counter_u64_alloc(M_WAITOK); + numfullpathcalls = counter_u64_alloc(M_WAITOK); + numfullpathfail1 = counter_u64_alloc(M_WAITOK); + numfullpathfail2 = counter_u64_alloc(M_WAITOK); + numfullpathfail4 = counter_u64_alloc(M_WAITOK); + numfullpathfound = counter_u64_alloc(M_WAITOK); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); + +void +cache_changesize(int newmaxvnodes) +{ + struct nchashhead *new_nchashtbl, *old_nchashtbl; + u_long new_nchash, old_nchash; + struct namecache *ncp; + uint32_t hash; + int i; + + newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); + if (newmaxvnodes < numbucketlocks) + newmaxvnodes = numbucketlocks; + + new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash); + /* If same hash table size, nothing to do */ + if (nchash == new_nchash) { + free(new_nchashtbl, M_VFSCACHE); + return; + } + /* + * Move everything from the old hash table to the new table. + * None of the namecache entries in the table can be removed + * because to do so, they have to be removed from the hash table. + */ + cache_lock_all_vnodes(); + cache_lock_all_buckets(); + old_nchashtbl = nchashtbl; + old_nchash = nchash; + nchashtbl = new_nchashtbl; + nchash = new_nchash; + for (i = 0; i <= old_nchash; i++) { + while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) { + hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, + ncp->nc_dvp); + LIST_REMOVE(ncp, nc_hash); + LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); + } + } + cache_unlock_all_buckets(); + cache_unlock_all_vnodes(); + free(old_nchashtbl, M_VFSCACHE); +} + +/* + * Invalidate all entries from and to a particular vnode. + */ +void +cache_purge(struct vnode *vp) +{ + TAILQ_HEAD(, namecache) ncps; + struct namecache *ncp, *nnp; + struct mtx *vlp, *vlp2; + + CTR1(KTR_VFS, "cache_purge(%p)", vp); + SDT_PROBE1(vfs, namecache, purge, done, vp); + if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && + vp->v_cache_dd == NULL) + return; + TAILQ_INIT(&ncps); + vlp = VP2VNODELOCK(vp); + vlp2 = NULL; + mtx_lock(vlp); +retry: + while (!LIST_EMPTY(&vp->v_cache_src)) { + ncp = LIST_FIRST(&vp->v_cache_src); + if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) + goto retry; + TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); + } + while (!TAILQ_EMPTY(&vp->v_cache_dst)) { + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) + goto retry; + TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); + } + ncp = vp->v_cache_dd; + if (ncp != NULL) { + KASSERT(ncp->nc_flag & NCF_ISDOTDOT, + ("lost dotdot link")); + if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) + goto retry; + TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); + } + KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); + mtx_unlock(vlp); + if (vlp2 != NULL) + mtx_unlock(vlp2); + TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { + cache_free(ncp); + } +} + +/* + * Invalidate all negative entries for a particular directory vnode. 
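+ *
+ * A typical caller is a directory-modifying operation (e.g. a create or a
+ * rename into vp); purging here ensures that previously cached ENOENT
+ * answers for names in vp cannot mask an entry that was just added.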
+ */ +void +cache_purge_negative(struct vnode *vp) +{ + TAILQ_HEAD(, namecache) ncps; + struct namecache *ncp, *nnp; + struct mtx *vlp; + + CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); + SDT_PROBE1(vfs, namecache, purge_negative, done, vp); + if (LIST_EMPTY(&vp->v_cache_src)) + return; + TAILQ_INIT(&ncps); + vlp = VP2VNODELOCK(vp); + mtx_lock(vlp); + LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { + if (!(ncp->nc_flag & NCF_NEGATIVE)) + continue; + cache_zap_negative_locked_vnode_kl(ncp, vp); + TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); + } + mtx_unlock(vlp); + TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { + cache_free(ncp); + } +} + +/* + * Flush all entries referencing a particular filesystem. + */ +void +cache_purgevfs(struct mount *mp, bool force) +{ + TAILQ_HEAD(, namecache) ncps; + struct mtx *vlp1, *vlp2; + struct rwlock *blp; + struct nchashhead *bucket; + struct namecache *ncp, *nnp; + u_long i, j, n_nchash; + int error; + + /* Scan hash tables for applicable entries */ + SDT_PROBE1(vfs, namecache, purgevfs, done, mp); + if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) + return; + TAILQ_INIT(&ncps); + n_nchash = nchash + 1; + vlp1 = vlp2 = NULL; + for (i = 0; i < numbucketlocks; i++) { + blp = (struct rwlock *)&bucketlocks[i]; + rw_wlock(blp); + for (j = i; j < n_nchash; j += numbucketlocks) { +retry: + bucket = &nchashtbl[j]; + LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { + cache_assert_bucket_locked(ncp, RA_WLOCKED); + if (ncp->nc_dvp->v_mount != mp) + continue; + error = cache_zap_wlocked_bucket_kl(ncp, blp, + &vlp1, &vlp2); + if (error != 0) + goto retry; + TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); + } + } + rw_wunlock(blp); + if (vlp1 == NULL && vlp2 == NULL) + cache_maybe_yield(); + } + if (vlp1 != NULL) + mtx_unlock(vlp1); + if (vlp2 != NULL) + mtx_unlock(vlp2); + + TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { + cache_free(ncp); + } +} + +/* + * Perform canonical checks and cache lookup and pass on to filesystem + * through the vop_cachedlookup only if needed. + */ + +int +vfs_cache_lookup(struct vop_lookup_args *ap) +{ + struct vnode *dvp; + int error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + int flags = cnp->cn_flags; + + *vpp = NULL; + dvp = ap->a_dvp; + + if (dvp->v_type != VDIR) + return (ENOTDIR); + + if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) + return (EROFS); + + error = vn_dir_check_exec(dvp, cnp); + if (error != 0) + return (error); + + error = cache_lookup(dvp, vpp, cnp, NULL, NULL); + if (error == 0) + return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); + if (error == -1) + return (0); + return (error); +} + +/* + * XXX All of these sysctls would probably be more productive dead. + */ +static int __read_mostly disablecwd; +SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, + "Disable the getcwd syscall"); + +/* Implementation of the getcwd syscall. 
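+ * The bulk of the work is done by kern___getcwd(), which resolves the
+ * calling process's current directory (fd_cdir) against its root (fd_rdir)
+ * through vn_fullpath1() and the name cache.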
*/ +int +sys___getcwd(struct thread *td, struct __getcwd_args *uap) +{ + + return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen, + MAXPATHLEN)); +} + +int +kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen, + size_t path_max) +{ + char *bp, *tmpbuf; + struct filedesc *fdp; + struct vnode *cdir, *rdir; + int error; + + if (__predict_false(disablecwd)) + return (ENODEV); + if (__predict_false(buflen < 2)) + return (EINVAL); + if (buflen > path_max) + buflen = path_max; + + tmpbuf = malloc(buflen, M_TEMP, M_WAITOK); + fdp = td->td_proc->p_fd; + FILEDESC_SLOCK(fdp); + cdir = fdp->fd_cdir; + vrefact(cdir); + rdir = fdp->fd_rdir; + vrefact(rdir); + FILEDESC_SUNLOCK(fdp); + error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen); + vrele(rdir); + vrele(cdir); + + if (!error) { + if (bufseg == UIO_SYSSPACE) + bcopy(bp, buf, strlen(bp) + 1); + else + error = copyout(bp, buf, strlen(bp) + 1); +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_NAMEI)) + ktrnamei(bp); +#endif + } + free(tmpbuf, M_TEMP); + return (error); +} + +/* + * Thus begins the fullpath magic. + */ + +static int __read_mostly disablefullpath; +SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, + "Disable the vn_fullpath function"); + +/* + * Retrieve the full filesystem path that correspond to a vnode from the name + * cache (if available) + */ +int +vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) +{ + char *buf; + struct filedesc *fdp; + struct vnode *rdir; + int error; + + if (__predict_false(disablefullpath)) + return (ENODEV); + if (__predict_false(vn == NULL)) + return (EINVAL); + + buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + fdp = td->td_proc->p_fd; + FILEDESC_SLOCK(fdp); + rdir = fdp->fd_rdir; + vrefact(rdir); + FILEDESC_SUNLOCK(fdp); + error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN); + vrele(rdir); + + if (!error) + *freebuf = buf; + else + free(buf, M_TEMP); + return (error); +} + +/* + * This function is similar to vn_fullpath, but it attempts to lookup the + * pathname relative to the global root mount point. This is required for the + * auditing sub-system, as audited pathnames must be absolute, relative to the + * global root mount point. 
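+ *
+ * Both vn_fullpath() and vn_fullpath_global() share the same calling
+ * convention; an illustrative sketch (not part of the imported sources):
+ *
+ *	char *fullpath, *freebuf;
+ *
+ *	error = vn_fullpath_global(curthread, vp, &fullpath, &freebuf);
+ *	if (error == 0) {
+ *		printf("%s\n", fullpath);
+ *		free(freebuf, M_TEMP);	/* fullpath points into freebuf */
+ *	}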
+ */ +int +vn_fullpath_global(struct thread *td, struct vnode *vn, + char **retbuf, char **freebuf) +{ + char *buf; + int error; + + if (__predict_false(disablefullpath)) + return (ENODEV); + if (__predict_false(vn == NULL)) + return (EINVAL); + buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN); + if (!error) + *freebuf = buf; + else + free(buf, M_TEMP); + return (error); +} + +int +vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen) +{ + struct vnode *dvp; + struct namecache *ncp; + struct mtx *vlp; + int error; + + vlp = VP2VNODELOCK(*vp); + mtx_lock(vlp); + TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { + if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) + break; + } + if (ncp != NULL) { + if (*buflen < ncp->nc_nlen) { + mtx_unlock(vlp); + vrele(*vp); + counter_u64_add(numfullpathfail4, 1); + error = ENOMEM; + SDT_PROBE3(vfs, namecache, fullpath, return, error, + vp, NULL); + return (error); + } + *buflen -= ncp->nc_nlen; + memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); + SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, + ncp->nc_name, vp); + dvp = *vp; + *vp = ncp->nc_dvp; + vref(*vp); + mtx_unlock(vlp); + vrele(dvp); + return (0); + } + SDT_PROBE1(vfs, namecache, fullpath, miss, vp); + + mtx_unlock(vlp); + vn_lock(*vp, LK_SHARED | LK_RETRY); + error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); + vput(*vp); + if (error) { + counter_u64_add(numfullpathfail2, 1); + SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); + return (error); + } + + *vp = dvp; + if (dvp->v_iflag & VI_DOOMED) { + /* forced unmount */ + vrele(dvp); + error = ENOENT; + SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); + return (error); + } + /* + * *vp has its use count incremented still. + */ + + return (0); +} + +/* + * The magic behind kern___getcwd() and vn_fullpath(). + */ +static int +vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, + char *buf, char **retbuf, u_int buflen) +{ + int error, slash_prefixed; +#ifdef KDTRACE_HOOKS + struct vnode *startvp = vp; +#endif + struct vnode *vp1; + + buflen--; + buf[buflen] = '\0'; + error = 0; + slash_prefixed = 0; + + SDT_PROBE1(vfs, namecache, fullpath, entry, vp); + counter_u64_add(numfullpathcalls, 1); + vref(vp); + if (vp->v_type != VDIR) { + error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); + if (error) + return (error); + if (buflen == 0) { + vrele(vp); + return (ENOMEM); + } + buf[--buflen] = '/'; + slash_prefixed = 1; + } + while (vp != rdir && vp != rootvnode) { + /* + * The vp vnode must be already fully constructed, + * since it is either found in namecache or obtained + * from VOP_VPTOCNP(). We may test for VV_ROOT safely + * without obtaining the vnode lock. + */ + if ((vp->v_vflag & VV_ROOT) != 0) { + vn_lock(vp, LK_RETRY | LK_SHARED); + + /* + * With the vnode locked, check for races with + * unmount, forced or not. Note that we + * already verified that vp is not equal to + * the root vnode, which means that + * mnt_vnodecovered can be NULL only for the + * case of unmount. 
+ */ + if ((vp->v_iflag & VI_DOOMED) != 0 || + (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || + vp1->v_mountedhere != vp->v_mount) { + vput(vp); + error = ENOENT; + SDT_PROBE3(vfs, namecache, fullpath, return, + error, vp, NULL); + break; + } + + vref(vp1); + vput(vp); + vp = vp1; + continue; + } + if (vp->v_type != VDIR) { + vrele(vp); + counter_u64_add(numfullpathfail1, 1); + error = ENOTDIR; + SDT_PROBE3(vfs, namecache, fullpath, return, + error, vp, NULL); + break; + } + error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); + if (error) + break; + if (buflen == 0) { + vrele(vp); + error = ENOMEM; + SDT_PROBE3(vfs, namecache, fullpath, return, error, + startvp, NULL); + break; + } + buf[--buflen] = '/'; + slash_prefixed = 1; + } + if (error) + return (error); + if (!slash_prefixed) { + if (buflen == 0) { + vrele(vp); + counter_u64_add(numfullpathfail4, 1); + SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, + startvp, NULL); + return (ENOMEM); + } + buf[--buflen] = '/'; + } + counter_u64_add(numfullpathfound, 1); + vrele(vp); + + SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen); + *retbuf = buf + buflen; + return (0); +} + +struct vnode * +vn_dir_dd_ino(struct vnode *vp) +{ + struct namecache *ncp; + struct vnode *ddvp; + struct mtx *vlp; + + ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); + vlp = VP2VNODELOCK(vp); + mtx_lock(vlp); + TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) + continue; + ddvp = ncp->nc_dvp; + vhold(ddvp); + mtx_unlock(vlp); + if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread)) + return (NULL); + return (ddvp); + } + mtx_unlock(vlp); + return (NULL); +} + +int +vn_commname(struct vnode *vp, char *buf, u_int buflen) +{ + struct namecache *ncp; + struct mtx *vlp; + int l; + + vlp = VP2VNODELOCK(vp); + mtx_lock(vlp); + TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) + if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) + break; + if (ncp == NULL) { + mtx_unlock(vlp); + return (ENOENT); + } + l = min(ncp->nc_nlen, buflen - 1); + memcpy(buf, ncp->nc_name, l); + mtx_unlock(vlp); + buf[l] = '\0'; + return (0); +} + +/* ABI compat shims for old kernel modules. */ +#undef cache_enter + +void cache_enter(struct vnode *dvp, struct vnode *vp, + struct componentname *cnp); + +void +cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) +{ + + cache_enter_time(dvp, vp, cnp, NULL, NULL); +} + +/* + * This function updates path string to vnode's full global path + * and checks the size of the new path string against the pathlen argument. + * + * Requires a locked, referenced vnode. + * Vnode is re-locked on success or ENODEV, otherwise unlocked. + * + * If sysctl debug.disablefullpath is set, ENODEV is returned, + * vnode is left locked and path remain untouched. + * + * If vp is a directory, the call to vn_fullpath_global() always succeeds + * because it falls back to the ".." lookup if the namecache lookup fails. + */ +int +vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, + u_int pathlen) +{ + struct nameidata nd; + struct vnode *vp1; + char *rpath, *fbuf; + int error; + + ASSERT_VOP_ELOCKED(vp, __func__); + + /* Return ENODEV if sysctl debug.disablefullpath==1 */ + if (__predict_false(disablefullpath)) + return (ENODEV); + + /* Construct global filesystem path from vp. 
*/ + VOP_UNLOCK(vp, 0); + error = vn_fullpath_global(td, vp, &rpath, &fbuf); + + if (error != 0) { + vrele(vp); + return (error); + } + + if (strlen(rpath) >= pathlen) { + vrele(vp); + error = ENAMETOOLONG; + goto out; + } + + /* + * Re-lookup the vnode by path to detect a possible rename. + * As a side effect, the vnode is relocked. + * If vnode was renamed, return ENOENT. + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, + UIO_SYSSPACE, path, td); + error = namei(&nd); + if (error != 0) { + vrele(vp); + goto out; + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vp1 = nd.ni_vp; + vrele(vp); + if (vp1 == vp) + strcpy(path, rpath); + else { + vput(vp1); + error = ENOENT; + } + +out: + free(fbuf, M_TEMP); + return (error); +} + +#ifdef DDB +static void +db_print_vpath(struct vnode *vp) +{ + + while (vp != NULL) { + db_printf("%p: ", vp); + if (vp == rootvnode) { + db_printf("/"); + vp = NULL; + } else { + if (vp->v_vflag & VV_ROOT) { + db_printf(""); + vp = vp->v_mount->mnt_vnodecovered; + } else { + struct namecache *ncp; + char *ncn; + int i; + + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (ncp != NULL) { + ncn = ncp->nc_name; + for (i = 0; i < ncp->nc_nlen; i++) + db_printf("%c", *ncn++); + vp = ncp->nc_dvp; + } else { + vp = NULL; + } + } + } + db_printf("\n"); + } + + return; +} + +DB_SHOW_COMMAND(vpath, db_show_vpath) +{ + struct vnode *vp; + + if (!have_addr) { + db_printf("usage: show vpath \n"); + return; + } + + vp = (struct vnode *)addr; + db_print_vpath(vp); +} + +#endif diff --git a/freebsd/sys/kern/vfs_cluster.c b/freebsd/sys/kern/vfs_cluster.c new file mode 100644 index 00000000..1ebe4a56 --- /dev/null +++ b/freebsd/sys/kern/vfs_cluster.c @@ -0,0 +1,1086 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * Modifications/enhancements: + * Copyright (c) 1995 John S. Dyson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_debug_cluster.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CLUSTERDEBUG) +static int rcluster= 0; +SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, + "Debug VFS clustering code"); +#endif + +static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer"); + +static struct cluster_save *cluster_collectbufs(struct vnode *vp, + struct buf *last_bp, int gbflags); +static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize, + daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, + struct buf *fbp); +static void cluster_callback(struct buf *); + +static int write_behind = 1; +SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, + "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); + +static int read_max = 64; +SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0, + "Cluster read-ahead max block count"); + +static int read_min = 1; +SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0, + "Cluster read min block count"); + +/* + * Read data to a buf, including read-ahead if we find this to be beneficial. + * cluster_read replaces bread. + */ +int +cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, + struct ucred *cred, long totread, int seqcount, int gbflags, + struct buf **bpp) +{ + struct buf *bp, *rbp, *reqbp; + struct bufobj *bo; + struct thread *td; + daddr_t blkno, origblkno; + int maxra, racluster; + int error, ncontig; + int i; + + error = 0; + td = curthread; + bo = &vp->v_bufobj; + if (!unmapped_buf_allowed) + gbflags &= ~GB_UNMAPPED; + + /* + * Try to limit the amount of read-ahead by a few + * ad-hoc parameters. This needs work!!! + */ + racluster = vp->v_mount->mnt_iosize_max / size; + maxra = seqcount; + maxra = min(read_max, maxra); + maxra = min(nbuf/8, maxra); + if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) + maxra = (filesize / size) - lblkno; + + /* + * get the requested block + */ + error = getblkx(vp, lblkno, size, 0, 0, gbflags, &bp); + if (error != 0) { + *bpp = NULL; + return (error); + } + gbflags &= ~GB_NOSPARSE; + origblkno = lblkno; + *bpp = reqbp = bp; + + /* + * if it is in the cache, then check to see if the reads have been + * sequential. If they have, then try some read-ahead, otherwise + * back-off on prospective read-aheads. + */ + if (bp->b_flags & B_CACHE) { + if (!seqcount) { + return 0; + } else if ((bp->b_flags & B_RAM) == 0) { + return 0; + } else { + bp->b_flags &= ~B_RAM; + BO_RLOCK(bo); + for (i = 1; i < maxra; i++) { + /* + * Stop if the buffer does not exist or it + * is invalid (about to go away?) + */ + rbp = gbincore(&vp->v_bufobj, lblkno+i); + if (rbp == NULL || (rbp->b_flags & B_INVAL)) + break; + + /* + * Set another read-ahead mark so we know + * to check again. (If we can lock the + * buffer without waiting) + */ + if ((((i % racluster) == (racluster - 1)) || + (i == (maxra - 1))) + && (0 == BUF_LOCK(rbp, + LK_EXCLUSIVE | LK_NOWAIT, NULL))) { + rbp->b_flags |= B_RAM; + BUF_UNLOCK(rbp); + } + } + BO_RUNLOCK(bo); + if (i >= maxra) { + return 0; + } + lblkno += i; + } + reqbp = bp = NULL; + /* + * If it isn't in the cache, then get a chunk from + * disk if sequential, otherwise just get the block. 
+ */ + } else { + off_t firstread = bp->b_offset; + int nblks; + long minread; + + KASSERT(bp->b_offset != NOOFFSET, + ("cluster_read: no buffer offset")); + + ncontig = 0; + + /* + * Adjust totread if needed + */ + minread = read_min * size; + if (minread > totread) + totread = minread; + + /* + * Compute the total number of blocks that we should read + * synchronously. + */ + if (firstread + totread > filesize) + totread = filesize - firstread; + nblks = howmany(totread, size); + if (nblks > racluster) + nblks = racluster; + + /* + * Now compute the number of contiguous blocks. + */ + if (nblks > 1) { + error = VOP_BMAP(vp, lblkno, NULL, + &blkno, &ncontig, NULL); + /* + * If this failed to map just do the original block. + */ + if (error || blkno == -1) + ncontig = 0; + } + + /* + * If we have contiguous data available do a cluster + * otherwise just read the requested block. + */ + if (ncontig) { + /* Account for our first block. */ + ncontig = min(ncontig + 1, nblks); + if (ncontig < nblks) + nblks = ncontig; + bp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, nblks, gbflags, bp); + lblkno += (bp->b_bufsize / size); + } else { + bp->b_flags |= B_RAM; + bp->b_iocmd = BIO_READ; + lblkno += 1; + } + } + + /* + * handle the synchronous read so that it is available ASAP. + */ + if (bp) { + if ((bp->b_flags & B_CLUSTER) == 0) { + vfs_busy_pages(bp, 0); + } + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) + BUF_KERNPROC(bp); + bp->b_iooffset = dbtob(bp->b_blkno); + bstrategy(bp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_add_buf(td->td_proc, bp, 0); + PROC_UNLOCK(td->td_proc); + } +#endif /* RACCT */ + td->td_ru.ru_inblock++; + } + + /* + * If we have been doing sequential I/O, then do some read-ahead. + */ + while (lblkno < (origblkno + maxra)) { + error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); + if (error) + break; + + if (blkno == -1) + break; + + /* + * We could throttle ncontig here by maxra but we might as + * well read the data if it is contiguous. We're throttled + * by racluster anyway. + */ + if (ncontig) { + ncontig = min(ncontig + 1, racluster); + rbp = cluster_rbuild(vp, filesize, lblkno, blkno, + size, ncontig, gbflags, NULL); + lblkno += (rbp->b_bufsize / size); + if (rbp->b_flags & B_DELWRI) { + bqrelse(rbp); + continue; + } + } else { + rbp = getblk(vp, lblkno, size, 0, 0, gbflags); + lblkno += 1; + if (rbp->b_flags & B_DELWRI) { + bqrelse(rbp); + continue; + } + rbp->b_flags |= B_ASYNC | B_RAM; + rbp->b_iocmd = BIO_READ; + rbp->b_blkno = blkno; + } + if (rbp->b_flags & B_CACHE) { + rbp->b_flags &= ~B_ASYNC; + bqrelse(rbp); + continue; + } + if ((rbp->b_flags & B_CLUSTER) == 0) { + vfs_busy_pages(rbp, 0); + } + rbp->b_flags &= ~B_INVAL; + rbp->b_ioflags &= ~BIO_ERROR; + if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) + BUF_KERNPROC(rbp); + rbp->b_iooffset = dbtob(rbp->b_blkno); + bstrategy(rbp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_add_buf(td->td_proc, rbp, 0); + PROC_UNLOCK(td->td_proc); + } +#endif /* RACCT */ + td->td_ru.ru_inblock++; + } + + if (reqbp) { + /* + * Like bread, always brelse() the buffer when + * returning an error. + */ + error = bufwait(reqbp); + if (error != 0) { + brelse(reqbp); + *bpp = NULL; + } + } + return (error); +} + +/* + * If blocks are contiguous on disk, use this to provide clustered + * read ahead. 
We will read as many blocks as possible sequentially + * and then parcel them up into logical blocks in the buffer hash table. + */ +static struct buf * +cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, + daddr_t blkno, long size, int run, int gbflags, struct buf *fbp) +{ + struct buf *bp, *tbp; + daddr_t bn; + off_t off; + long tinc, tsize; + int i, inc, j, k, toff; + + KASSERT(size == vp->v_mount->mnt_stat.f_iosize, + ("cluster_rbuild: size %ld != f_iosize %jd\n", + size, (intmax_t)vp->v_mount->mnt_stat.f_iosize)); + + /* + * avoid a division + */ + while ((u_quad_t) size * (lbn + run) > filesize) { + --run; + } + + if (fbp) { + tbp = fbp; + tbp->b_iocmd = BIO_READ; + } else { + tbp = getblk(vp, lbn, size, 0, 0, gbflags); + if (tbp->b_flags & B_CACHE) + return tbp; + tbp->b_flags |= B_ASYNC | B_RAM; + tbp->b_iocmd = BIO_READ; + } + tbp->b_blkno = blkno; + if( (tbp->b_flags & B_MALLOC) || + ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) + return tbp; + + bp = trypbuf(&cluster_pbuf_freecnt); + if (bp == NULL) + return tbp; + + /* + * We are synthesizing a buffer out of vm_page_t's, but + * if the block size is not page aligned then the starting + * address may not be either. Inherit the b_data offset + * from the original buffer. + */ + bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; + if ((gbflags & GB_UNMAPPED) != 0) { + bp->b_data = unmapped_buf; + } else { + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + } + bp->b_iocmd = BIO_READ; + bp->b_iodone = cluster_callback; + bp->b_blkno = blkno; + bp->b_lblkno = lbn; + bp->b_offset = tbp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); + pbgetvp(vp, bp); + + TAILQ_INIT(&bp->b_cluster.cluster_head); + + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + + inc = btodb(size); + for (bn = blkno, i = 0; i < run; ++i, bn += inc) { + if (i == 0) { + VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); + vfs_drain_busy_pages(tbp); + vm_object_pip_add(tbp->b_bufobj->bo_object, + tbp->b_npages); + for (k = 0; k < tbp->b_npages; k++) + vm_page_sbusy(tbp->b_pages[k]); + VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); + } else { + if ((bp->b_npages * PAGE_SIZE) + + round_page(size) > vp->v_mount->mnt_iosize_max) { + break; + } + + tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT | + (gbflags & GB_UNMAPPED)); + + /* Don't wait around for locked bufs. */ + if (tbp == NULL) + break; + + /* + * Stop scanning if the buffer is fully valid + * (marked B_CACHE), or locked (may be doing a + * background write), or if the buffer is not + * VMIO backed. The clustering code can only deal + * with VMIO-backed buffers. The bo lock is not + * required for the BKGRDINPROG check since it + * can not be set without the buf lock. + */ + if ((tbp->b_vflags & BV_BKGRDINPROG) || + (tbp->b_flags & B_CACHE) || + (tbp->b_flags & B_VMIO) == 0) { + bqrelse(tbp); + break; + } + + /* + * The buffer must be completely invalid in order to + * take part in the cluster. If it is partially valid + * then we stop. 
+ */ + off = tbp->b_offset; + tsize = size; + VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); + for (j = 0; tsize > 0; j++) { + toff = off & PAGE_MASK; + tinc = tsize; + if (toff + tinc > PAGE_SIZE) + tinc = PAGE_SIZE - toff; + VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object); + if ((tbp->b_pages[j]->valid & + vm_page_bits(toff, tinc)) != 0) + break; + if (vm_page_xbusied(tbp->b_pages[j])) + break; + vm_object_pip_add(tbp->b_bufobj->bo_object, 1); + vm_page_sbusy(tbp->b_pages[j]); + off += tinc; + tsize -= tinc; + } + if (tsize > 0) { +clean_sbusy: + vm_object_pip_add(tbp->b_bufobj->bo_object, -j); + for (k = 0; k < j; k++) + vm_page_sunbusy(tbp->b_pages[k]); + VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); + bqrelse(tbp); + break; + } + VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); + + /* + * Set a read-ahead mark as appropriate + */ + if ((fbp && (i == 1)) || (i == (run - 1))) + tbp->b_flags |= B_RAM; + + /* + * Set the buffer up for an async read (XXX should + * we do this only if we do not wind up brelse()ing?). + * Set the block number if it isn't set, otherwise + * if it is make sure it matches the block number we + * expect. + */ + tbp->b_flags |= B_ASYNC; + tbp->b_iocmd = BIO_READ; + if (tbp->b_blkno == tbp->b_lblkno) { + tbp->b_blkno = bn; + } else if (tbp->b_blkno != bn) { + VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); + goto clean_sbusy; + } + } + /* + * XXX fbp from caller may not be B_ASYNC, but we are going + * to biodone() it in cluster_callback() anyway + */ + BUF_KERNPROC(tbp); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); + for (j = 0; j < tbp->b_npages; j += 1) { + vm_page_t m; + m = tbp->b_pages[j]; + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages-1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + if (m->valid == VM_PAGE_BITS_ALL) + tbp->b_pages[j] = bogus_page; + } + VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); + /* + * Don't inherit tbp->b_bufsize as it may be larger due to + * a non-page-aligned size. Instead just aggregate using + * 'size'. + */ + if (tbp->b_bcount != size) + printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); + if (tbp->b_bufsize != size) + printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); + bp->b_bcount += size; + bp->b_bufsize += size; + } + + /* + * Fully valid pages in the cluster are already good and do not need + * to be re-read from disk. Replace the page with bogus_page + */ + VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + for (j = 0; j < bp->b_npages; j++) { + VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object); + if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL) + bp->b_pages[j] = bogus_page; + } + VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); + if (bp->b_bufsize > bp->b_kvasize) + panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + + if (buf_mapped(bp)) { + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + } + return (bp); +} + +/* + * Cleanup after a clustered read or write. + * This is complicated by the fact that any of the buffers might have + * extra memory (if there were no empty buffer headers at allocbuf time) + * that we will need to shift around. + */ +static void +cluster_callback(struct buf *bp) +{ + struct buf *nbp, *tbp; + int error = 0; + + /* + * Must propagate errors to all the components. 
+ */ + if (bp->b_ioflags & BIO_ERROR) + error = bp->b_error; + + if (buf_mapped(bp)) { + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), + bp->b_npages); + } + /* + * Move memory from the large cluster buffer into the component + * buffers and mark IO as done on these. + */ + for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); + tbp; tbp = nbp) { + nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); + if (error) { + tbp->b_ioflags |= BIO_ERROR; + tbp->b_error = error; + } else { + tbp->b_dirtyoff = tbp->b_dirtyend = 0; + tbp->b_flags &= ~B_INVAL; + tbp->b_ioflags &= ~BIO_ERROR; + /* + * XXX the bdwrite()/bqrelse() issued during + * cluster building clears B_RELBUF (see bqrelse() + * comment). If direct I/O was specified, we have + * to restore it here to allow the buffer and VM + * to be freed. + */ + if (tbp->b_flags & B_DIRECT) + tbp->b_flags |= B_RELBUF; + } + bufdone(tbp); + } + pbrelvp(bp); + relpbuf(bp, &cluster_pbuf_freecnt); +} + +/* + * cluster_wbuild_wb: + * + * Implement modified write build for cluster. + * + * write_behind = 0 write behind disabled + * write_behind = 1 write behind normal (default) + * write_behind = 2 write behind backed-off + */ + +static __inline int +cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len, + int gbflags) +{ + int r = 0; + + switch (write_behind) { + case 2: + if (start_lbn < len) + break; + start_lbn -= len; + /* FALLTHROUGH */ + case 1: + r = cluster_wbuild(vp, size, start_lbn, len, gbflags); + /* FALLTHROUGH */ + default: + /* FALLTHROUGH */ + break; + } + return(r); +} + +/* + * Do clustered write for FFS. + * + * Three cases: + * 1. Write is not sequential (write asynchronously) + * Write is sequential: + * 2. beginning of cluster - begin cluster + * 3. middle of a cluster - add to cluster + * 4. end of a cluster - asynchronously write cluster + */ +void +cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount, + int gbflags) +{ + daddr_t lbn; + int maxclen, cursize; + int lblocksize; + int async; + + if (!unmapped_buf_allowed) + gbflags &= ~GB_UNMAPPED; + + if (vp->v_type == VREG) { + async = DOINGASYNC(vp); + lblocksize = vp->v_mount->mnt_stat.f_iosize; + } else { + async = 0; + lblocksize = bp->b_bufsize; + } + lbn = bp->b_lblkno; + KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); + + /* Initialize vnode to beginning of file. */ + if (lbn == 0) + vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + + if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || + (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { + maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; + if (vp->v_clen != 0) { + /* + * Next block is not sequential. + * + * If we are not writing at end of file, the process + * seeked to another point in the file since its last + * write, or we have reached our maximum cluster size, + * then push the previous cluster. Otherwise try + * reallocating to make it sequential. + * + * Change to algorithm: only push previous cluster if + * it was sequential from the point of view of the + * seqcount heuristic, otherwise leave the buffer + * intact so we can potentially optimize the I/O + * later on in the buf_daemon or update daemon + * flush. 
+ */ + cursize = vp->v_lastw - vp->v_cstart + 1; + if (((u_quad_t) bp->b_offset + lblocksize) != filesize || + lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { + if (!async && seqcount > 0) { + cluster_wbuild_wb(vp, lblocksize, + vp->v_cstart, cursize, gbflags); + } + } else { + struct buf **bpp, **endbp; + struct cluster_save *buflist; + + buflist = cluster_collectbufs(vp, bp, gbflags); + if (buflist == NULL) { + /* + * Cluster build failed so just write + * it now. + */ + bawrite(bp); + return; + } + endbp = &buflist->bs_children + [buflist->bs_nchildren - 1]; + if (VOP_REALLOCBLKS(vp, buflist)) { + /* + * Failed, push the previous cluster + * if *really* writing sequentially + * in the logical file (seqcount > 1), + * otherwise delay it in the hopes that + * the low level disk driver can + * optimize the write ordering. + */ + for (bpp = buflist->bs_children; + bpp < endbp; bpp++) + brelse(*bpp); + free(buflist, M_SEGMENT); + if (seqcount > 1) { + cluster_wbuild_wb(vp, + lblocksize, vp->v_cstart, + cursize, gbflags); + } + } else { + /* + * Succeeded, keep building cluster. + */ + for (bpp = buflist->bs_children; + bpp <= endbp; bpp++) + bdwrite(*bpp); + free(buflist, M_SEGMENT); + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; + return; + } + } + } + /* + * Consider beginning a cluster. If at end of file, make + * cluster as large as possible, otherwise find size of + * existing cluster. + */ + if ((vp->v_type == VREG) && + ((u_quad_t) bp->b_offset + lblocksize) != filesize && + (bp->b_blkno == bp->b_lblkno) && + (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || + bp->b_blkno == -1)) { + bawrite(bp); + vp->v_clen = 0; + vp->v_lasta = bp->b_blkno; + vp->v_cstart = lbn + 1; + vp->v_lastw = lbn; + return; + } + vp->v_clen = maxclen; + if (!async && maxclen == 0) { /* I/O not contiguous */ + vp->v_cstart = lbn + 1; + bawrite(bp); + } else { /* Wait for rest of cluster */ + vp->v_cstart = lbn; + bdwrite(bp); + } + } else if (lbn == vp->v_cstart + vp->v_clen) { + /* + * At end of cluster, write it out if seqcount tells us we + * are operating sequentially, otherwise let the buf or + * update daemon handle it. + */ + bdwrite(bp); + if (seqcount > 1) { + cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, + vp->v_clen + 1, gbflags); + } + vp->v_clen = 0; + vp->v_cstart = lbn + 1; + } else if (vm_page_count_severe()) { + /* + * We are low on memory, get it going NOW + */ + bawrite(bp); + } else { + /* + * In the middle of a cluster, so just delay the I/O for now. + */ + bdwrite(bp); + } + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; +} + + +/* + * This is an awful lot like cluster_rbuild...wish they could be combined. + * The last lbn argument is the current block on which I/O is being + * performed. Check to see that it doesn't fall in the middle of + * the current block (if last_bp == NULL). + */ +int +cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, + int gbflags) +{ + struct buf *bp, *tbp; + struct bufobj *bo; + int i, j; + int totalwritten = 0; + int dbsize = btodb(size); + + if (!unmapped_buf_allowed) + gbflags &= ~GB_UNMAPPED; + + bo = &vp->v_bufobj; + while (len > 0) { + /* + * If the buffer is not delayed-write (i.e. dirty), or it + * is delayed-write but either locked or inval, it cannot + * partake in the clustered write. 
+ */ + BO_LOCK(bo); + if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL || + (tbp->b_vflags & BV_BKGRDINPROG)) { + BO_UNLOCK(bo); + ++start_lbn; + --len; + continue; + } + if (BUF_LOCK(tbp, + LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) { + ++start_lbn; + --len; + continue; + } + if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) { + BUF_UNLOCK(tbp); + ++start_lbn; + --len; + continue; + } + bremfree(tbp); + tbp->b_flags &= ~B_DONE; + + /* + * Extra memory in the buffer, punt on this buffer. + * XXX we could handle this in most cases, but we would + * have to push the extra memory down to after our max + * possible cluster size and then potentially pull it back + * up if the cluster was terminated prematurely--too much + * hassle. + */ + if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != + (B_CLUSTEROK | B_VMIO)) || + (tbp->b_bcount != tbp->b_bufsize) || + (tbp->b_bcount != size) || + (len == 1) || + ((bp = (vp->v_vflag & VV_MD) != 0 ? + trypbuf(&cluster_pbuf_freecnt) : + getpbuf(&cluster_pbuf_freecnt)) == NULL)) { + totalwritten += tbp->b_bufsize; + bawrite(tbp); + ++start_lbn; + --len; + continue; + } + + /* + * We got a pbuf to make the cluster in. + * so initialise it. + */ + TAILQ_INIT(&bp->b_cluster.cluster_head); + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + if (tbp->b_wcred != NOCRED) + bp->b_wcred = crhold(tbp->b_wcred); + + bp->b_blkno = tbp->b_blkno; + bp->b_lblkno = tbp->b_lblkno; + bp->b_offset = tbp->b_offset; + + /* + * We are synthesizing a buffer out of vm_page_t's, but + * if the block size is not page aligned then the starting + * address may not be either. Inherit the b_data offset + * from the original buffer. + */ + if ((gbflags & GB_UNMAPPED) == 0 || + (tbp->b_flags & B_VMIO) == 0) { + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + } else { + bp->b_data = unmapped_buf; + } + bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | + B_NEEDCOMMIT)); + bp->b_iodone = cluster_callback; + pbgetvp(vp, bp); + /* + * From this location in the file, scan forward to see + * if there are buffers with adjacent data that need to + * be written as well. + */ + for (i = 0; i < len; ++i, ++start_lbn) { + if (i != 0) { /* If not the first buffer */ + /* + * If the adjacent data is not even in core it + * can't need to be written. + */ + BO_LOCK(bo); + if ((tbp = gbincore(bo, start_lbn)) == NULL || + (tbp->b_vflags & BV_BKGRDINPROG)) { + BO_UNLOCK(bo); + break; + } + + /* + * If it IS in core, but has different + * characteristics, or is locked (which + * means it could be undergoing a background + * I/O or be in a weird state), then don't + * cluster with it. + */ + if (BUF_LOCK(tbp, + LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, + BO_LOCKPTR(bo))) + break; + + if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | + B_INVAL | B_DELWRI | B_NEEDCOMMIT)) + != (B_DELWRI | B_CLUSTEROK | + (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || + tbp->b_wcred != bp->b_wcred) { + BUF_UNLOCK(tbp); + break; + } + + /* + * Check that the combined cluster + * would make sense with regard to pages + * and would not be too large + */ + if ((tbp->b_bcount != size) || + ((bp->b_blkno + (dbsize * i)) != + tbp->b_blkno) || + ((tbp->b_npages + bp->b_npages) > + (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { + BUF_UNLOCK(tbp); + break; + } + + /* + * Ok, it's passed all the tests, + * so remove it from the free list + * and mark it busy. We will use it. 
+ */ + bremfree(tbp); + tbp->b_flags &= ~B_DONE; + } /* end of code for non-first buffers only */ + /* + * If the IO is via the VM then we do some + * special VM hackery (yuck). Since the buffer's + * block size may not be page-aligned it is possible + * for a page to be shared between two buffers. We + * have to get rid of the duplication when building + * the cluster. + */ + if (tbp->b_flags & B_VMIO) { + vm_page_t m; + + VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); + if (i == 0) { + vfs_drain_busy_pages(tbp); + } else { /* if not first buffer */ + for (j = 0; j < tbp->b_npages; j += 1) { + m = tbp->b_pages[j]; + if (vm_page_xbusied(m)) { + VM_OBJECT_WUNLOCK( + tbp->b_object); + bqrelse(tbp); + goto finishcluster; + } + } + } + for (j = 0; j < tbp->b_npages; j += 1) { + m = tbp->b_pages[j]; + vm_page_sbusy(m); + vm_object_pip_add(m->object, 1); + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages - 1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + } + VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); + } + bp->b_bcount += size; + bp->b_bufsize += size; + /* + * If any of the clustered buffers have their + * B_BARRIER flag set, transfer that request to + * the cluster. + */ + bp->b_flags |= (tbp->b_flags & B_BARRIER); + tbp->b_flags &= ~(B_DONE | B_BARRIER); + tbp->b_flags |= B_ASYNC; + tbp->b_ioflags &= ~BIO_ERROR; + tbp->b_iocmd = BIO_WRITE; + bundirty(tbp); + reassignbuf(tbp); /* put on clean list */ + bufobj_wref(tbp->b_bufobj); + BUF_KERNPROC(tbp); + buf_track(tbp, __func__); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + } + finishcluster: + if (buf_mapped(bp)) { + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + } + if (bp->b_bufsize > bp->b_kvasize) + panic( + "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + totalwritten += bp->b_bufsize; + bp->b_dirtyoff = 0; + bp->b_dirtyend = bp->b_bufsize; + bawrite(bp); + + len -= i; + } + return totalwritten; +} + +/* + * Collect together all the buffers in a cluster. + * Plus add one additional buffer. + */ +static struct cluster_save * +cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags) +{ + struct cluster_save *buflist; + struct buf *bp; + daddr_t lbn; + int i, j, len, error; + + len = vp->v_lastw - vp->v_cstart + 1; + buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), + M_SEGMENT, M_WAITOK); + buflist->bs_nchildren = 0; + buflist->bs_children = (struct buf **) (buflist + 1); + for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { + error = bread_gb(vp, lbn, last_bp->b_bcount, NOCRED, + gbflags, &bp); + if (error != 0) { + /* + * If read fails, release collected buffers + * and return failure. + */ + for (j = 0; j < i; j++) + brelse(buflist->bs_children[j]); + free(buflist, M_SEGMENT); + return (NULL); + } + buflist->bs_children[i] = bp; + if (bp->b_blkno == bp->b_lblkno) + VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL, NULL); + } + buflist->bs_children[i] = bp = last_bp; + if (bp->b_blkno == bp->b_lblkno) + VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); + buflist->bs_nchildren = i + 1; + return (buflist); +} diff --git a/freebsd/sys/kern/vfs_default.c b/freebsd/sys/kern/vfs_default.c new file mode 100644 index 00000000..40041c9d --- /dev/null +++ b/freebsd/sys/kern/vfs_default.c @@ -0,0 +1,1286 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993 + * The Regents of the University of California. 
All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +static int vop_nolookup(struct vop_lookup_args *); +static int vop_norename(struct vop_rename_args *); +static int vop_nostrategy(struct vop_strategy_args *); +static int get_next_dirent(struct vnode *vp, struct dirent **dpp, + char *dirbuf, int dirbuflen, off_t *off, + char **cpos, int *len, int *eofflag, + struct thread *td); +static int dirent_exists(struct vnode *vp, const char *dirname, + struct thread *td); + +#define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4) + +static int vop_stdis_text(struct vop_is_text_args *ap); +static int vop_stdunset_text(struct vop_unset_text_args *ap); +static int vop_stdadd_writecount(struct vop_add_writecount_args *ap); +static int vop_stdfdatasync(struct vop_fdatasync_args *ap); +static int vop_stdgetpages_async(struct vop_getpages_async_args *ap); + +/* + * This vnode table stores what we want to do if the filesystem doesn't + * implement a particular VOP. + * + * If there is no specific entry here, we will return EOPNOTSUPP. + * + * Note that every filesystem has to implement either vop_access + * or vop_accessx; failing to do so will result in immediate crash + * due to stack overflow, as vop_stdaccess() calls vop_stdaccessx(), + * which calls vop_stdaccess() etc. 
+ */ + +struct vop_vector default_vnodeops = { + .vop_default = NULL, + .vop_bypass = VOP_EOPNOTSUPP, + + .vop_access = vop_stdaccess, + .vop_accessx = vop_stdaccessx, + .vop_advise = vop_stdadvise, + .vop_advlock = vop_stdadvlock, + .vop_advlockasync = vop_stdadvlockasync, + .vop_advlockpurge = vop_stdadvlockpurge, + .vop_allocate = vop_stdallocate, + .vop_bmap = vop_stdbmap, + .vop_close = VOP_NULL, + .vop_fsync = VOP_NULL, + .vop_fdatasync = vop_stdfdatasync, + .vop_getpages = vop_stdgetpages, + .vop_getpages_async = vop_stdgetpages_async, + .vop_getwritemount = vop_stdgetwritemount, + .vop_inactive = VOP_NULL, + .vop_ioctl = VOP_ENOTTY, + .vop_kqfilter = vop_stdkqfilter, + .vop_islocked = vop_stdislocked, + .vop_lock1 = vop_stdlock, + .vop_lookup = vop_nolookup, + .vop_open = VOP_NULL, + .vop_pathconf = VOP_EINVAL, + .vop_poll = vop_nopoll, + .vop_putpages = vop_stdputpages, + .vop_readlink = VOP_EINVAL, + .vop_rename = vop_norename, + .vop_revoke = VOP_PANIC, + .vop_strategy = vop_nostrategy, + .vop_unlock = vop_stdunlock, + .vop_vptocnp = vop_stdvptocnp, + .vop_vptofh = vop_stdvptofh, + .vop_unp_bind = vop_stdunp_bind, + .vop_unp_connect = vop_stdunp_connect, + .vop_unp_detach = vop_stdunp_detach, + .vop_is_text = vop_stdis_text, + .vop_set_text = vop_stdset_text, + .vop_unset_text = vop_stdunset_text, + .vop_add_writecount = vop_stdadd_writecount, +}; + +/* + * Series of placeholder functions for various error returns for + * VOPs. + */ + +int +vop_eopnotsupp(struct vop_generic_args *ap) +{ + /* + printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); + */ + + return (EOPNOTSUPP); +} + +int +vop_ebadf(struct vop_generic_args *ap) +{ + + return (EBADF); +} + +int +vop_enotty(struct vop_generic_args *ap) +{ + + return (ENOTTY); +} + +int +vop_einval(struct vop_generic_args *ap) +{ + + return (EINVAL); +} + +int +vop_enoent(struct vop_generic_args *ap) +{ + + return (ENOENT); +} + +int +vop_null(struct vop_generic_args *ap) +{ + + return (0); +} + +/* + * Helper function to panic on some bad VOPs in some filesystems. + */ +int +vop_panic(struct vop_generic_args *ap) +{ + + panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name); +} + +/* + * vop_std and vop_no are default functions for use by + * filesystems that need the "default reasonable" implementation for a + * particular operation. + * + * The documentation for the operations they implement exists (if it exists) + * in the VOP_(9) manpage (all uppercase). + */ + +/* + * Default vop for filesystems that do not support name lookup + */ +static int +vop_nolookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + + *ap->a_vpp = NULL; + return (ENOTDIR); +} + +/* + * vop_norename: + * + * Handle unlock and reference counting for arguments of vop_rename + * for filesystems that do not implement rename operation. + */ +static int +vop_norename(struct vop_rename_args *ap) +{ + + vop_rename_fail(ap); + return (EOPNOTSUPP); +} + +/* + * vop_nostrategy: + * + * Strategy routine for VFS devices that have none. + * + * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy + * routine. Typically this is done for a BIO_READ strategy call. + * Typically B_INVAL is assumed to already be clear prior to a write + * and should not be cleared manually unless you just made the buffer + * invalid. BIO_ERROR should be cleared either way. 
+ */ + +static int +vop_nostrategy (struct vop_strategy_args *ap) +{ + printf("No strategy for buffer at %p\n", ap->a_bp); + vn_printf(ap->a_vp, "vnode "); + ap->a_bp->b_ioflags |= BIO_ERROR; + ap->a_bp->b_error = EOPNOTSUPP; + bufdone(ap->a_bp); + return (EOPNOTSUPP); +} + +static int +get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf, + int dirbuflen, off_t *off, char **cpos, int *len, + int *eofflag, struct thread *td) +{ + int error, reclen; + struct uio uio; + struct iovec iov; + struct dirent *dp; + + KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp)); + KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp)); + + if (*len == 0) { + iov.iov_base = dirbuf; + iov.iov_len = dirbuflen; + + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = *off; + uio.uio_resid = dirbuflen; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_rw = UIO_READ; + uio.uio_td = td; + + *eofflag = 0; + +#ifdef MAC + error = mac_vnode_check_readdir(td->td_ucred, vp); + if (error == 0) +#endif + error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag, + NULL, NULL); + if (error) + return (error); + + *off = uio.uio_offset; + + *cpos = dirbuf; + *len = (dirbuflen - uio.uio_resid); + + if (*len == 0) + return (ENOENT); + } + + dp = (struct dirent *)(*cpos); + reclen = dp->d_reclen; + *dpp = dp; + + /* check for malformed directory.. */ + if (reclen < DIRENT_MINSIZE) + return (EINVAL); + + *cpos += reclen; + *len -= reclen; + + return (0); +} + +/* + * Check if a named file exists in a given directory vnode. + */ +static int +dirent_exists(struct vnode *vp, const char *dirname, struct thread *td) +{ + char *dirbuf, *cpos; + int error, eofflag, dirbuflen, len, found; + off_t off; + struct dirent *dp; + struct vattr va; + + KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp)); + KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp)); + + found = 0; + + error = VOP_GETATTR(vp, &va, td->td_ucred); + if (error) + return (found); + + dirbuflen = DEV_BSIZE; + if (dirbuflen < va.va_blocksize) + dirbuflen = va.va_blocksize; + dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); + + off = 0; + len = 0; + do { + error = get_next_dirent(vp, &dp, dirbuf, dirbuflen, &off, + &cpos, &len, &eofflag, td); + if (error) + goto out; + + if (dp->d_type != DT_WHT && dp->d_fileno != 0 && + strcmp(dp->d_name, dirname) == 0) { + found = 1; + goto out; + } + } while (len > 0 || !eofflag); + +out: + free(dirbuf, M_TEMP); + return (found); +} + +int +vop_stdaccess(struct vop_access_args *ap) +{ + + KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | + VAPPEND)) == 0, ("invalid bit in accmode")); + + return (VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred, ap->a_td)); +} + +int +vop_stdaccessx(struct vop_accessx_args *ap) +{ + int error; + accmode_t accmode = ap->a_accmode; + + error = vfs_unixify_accmode(&accmode); + if (error != 0) + return (error); + + if (accmode == 0) + return (0); + + return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td)); +} + +/* + * Advisory record locking support + */ +int +vop_stdadvlock(struct vop_advlock_args *ap) +{ + struct vnode *vp; + struct vattr vattr; + int error; + + vp = ap->a_vp; + if (ap->a_fl->l_whence == SEEK_END) { + /* + * The NFSv4 server must avoid doing a vn_lock() here, since it + * can deadlock the nfsd threads, due to a LOR. Fortunately + * the NFSv4 server always uses SEEK_SET and this code is + * only required for the SEEK_END case. 
+ */ + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &vattr, curthread->td_ucred); + VOP_UNLOCK(vp, 0); + if (error) + return (error); + } else + vattr.va_size = 0; + + return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size)); +} + +int +vop_stdadvlockasync(struct vop_advlockasync_args *ap) +{ + struct vnode *vp; + struct vattr vattr; + int error; + + vp = ap->a_vp; + if (ap->a_fl->l_whence == SEEK_END) { + /* The size argument is only needed for SEEK_END. */ + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &vattr, curthread->td_ucred); + VOP_UNLOCK(vp, 0); + if (error) + return (error); + } else + vattr.va_size = 0; + + return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size)); +} + +int +vop_stdadvlockpurge(struct vop_advlockpurge_args *ap) +{ + struct vnode *vp; + + vp = ap->a_vp; + lf_purgelocks(vp, &vp->v_lockf); + return (0); +} + +/* + * vop_stdpathconf: + * + * Standard implementation of POSIX pathconf, to get information about limits + * for a filesystem. + * Override per filesystem for the case where the filesystem has smaller + * limits. + */ +int +vop_stdpathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_ASYNC_IO: + *ap->a_retval = _POSIX_ASYNCHRONOUS_IO; + return (0); + case _PC_PATH_MAX: + *ap->a_retval = PATH_MAX; + return (0); + case _PC_ACL_EXTENDED: + case _PC_ACL_NFS4: + case _PC_CAP_PRESENT: + case _PC_INF_PRESENT: + case _PC_MAC_PRESENT: + *ap->a_retval = 0; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Standard lock, unlock and islocked functions. + */ +int +vop_stdlock(ap) + struct vop_lock1_args /* { + struct vnode *a_vp; + int a_flags; + char *file; + int line; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct mtx *ilk; + + ilk = VI_MTX(vp); + return (lockmgr_lock_fast_path(vp->v_vnlock, ap->a_flags, + &ilk->lock_object, ap->a_file, ap->a_line)); +} + +/* See above. */ +int +vop_stdunlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + int a_flags; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct mtx *ilk; + + ilk = VI_MTX(vp); + return (lockmgr_unlock_fast_path(vp->v_vnlock, ap->a_flags, + &ilk->lock_object)); +} + +/* See above. */ +int +vop_stdislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (lockstatus(ap->a_vp->v_vnlock)); +} + +/* + * Return true for select/poll. + */ +int +vop_nopoll(ap) + struct vop_poll_args /* { + struct vnode *a_vp; + int a_events; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + + return (poll_no_poll(ap->a_events)); +} + +/* + * Implement poll for local filesystems that support it. + */ +int +vop_stdpoll(ap) + struct vop_poll_args /* { + struct vnode *a_vp; + int a_events; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + if (ap->a_events & ~POLLSTANDARD) + return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events)); + return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +/* + * Return our mount point, as we will take charge of the writes. + */ +int +vop_stdgetwritemount(ap) + struct vop_getwritemount_args /* { + struct vnode *a_vp; + struct mount **a_mpp; + } */ *ap; +{ + struct mount *mp; + + /* + * XXX Since this is called unlocked we may be recycled while + * attempting to ref the mount. If this is the case or mountpoint + * will be set to NULL. We only have to prevent this call from + * returning with a ref to an incorrect mountpoint. 
It is not + * harmful to return with a ref to our previous mountpoint. + */ + mp = ap->a_vp->v_mount; + if (mp != NULL) { + vfs_ref(mp); + if (mp != ap->a_vp->v_mount) { + vfs_rel(mp); + mp = NULL; + } + } + *(ap->a_mpp) = mp; + return (0); +} + +/* + * If the file system doesn't implement VOP_BMAP, then return sensible defaults: + * - Return the vnode's bufobj instead of any underlying device's bufobj + * - Calculate the physical block number as if there were equal size + * consecutive blocks, but + * - Report no contiguous runs of blocks. + */ +int +vop_stdbmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct bufobj **a_bop; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; + } */ *ap; +{ + + if (ap->a_bop != NULL) + *ap->a_bop = &ap->a_vp->v_bufobj; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize); + if (ap->a_runp != NULL) + *ap->a_runp = 0; + if (ap->a_runb != NULL) + *ap->a_runb = 0; + return (0); +} + +int +vop_stdfsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + int a_waitfor; + struct thread *a_td; + } */ *ap; +{ + + return (vn_fsync_buf(ap->a_vp, ap->a_waitfor)); +} + +static int +vop_stdfdatasync(struct vop_fdatasync_args *ap) +{ + + return (VOP_FSYNC(ap->a_vp, MNT_WAIT, ap->a_td)); +} + +int +vop_stdfdatasync_buf(struct vop_fdatasync_args *ap) +{ + + return (vn_fsync_buf(ap->a_vp, MNT_WAIT)); +} + +/* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */ +int +vop_stdgetpages(ap) + struct vop_getpages_args /* { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int *a_rbehind; + int *a_rahead; + } */ *ap; +{ + + return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, + ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL); +} + +static int +vop_stdgetpages_async(struct vop_getpages_async_args *ap) +{ + int error; + + error = VOP_GETPAGES(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, + ap->a_rahead); + ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); + return (error); +} + +int +vop_stdkqfilter(struct vop_kqfilter_args *ap) +{ + return vfs_kqfilter(ap); +} + +/* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). 
*/ +int +vop_stdputpages(ap) + struct vop_putpages_args /* { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int a_sync; + int *a_rtvals; + } */ *ap; +{ + + return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, + ap->a_sync, ap->a_rtvals); +} + +int +vop_stdvptofh(struct vop_vptofh_args *ap) +{ + return (EOPNOTSUPP); +} + +int +vop_stdvptocnp(struct vop_vptocnp_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct vnode **dvp = ap->a_vpp; + struct ucred *cred = ap->a_cred; + char *buf = ap->a_buf; + int *buflen = ap->a_buflen; + char *dirbuf, *cpos; + int i, error, eofflag, dirbuflen, flags, locked, len, covered; + off_t off; + ino_t fileno; + struct vattr va; + struct nameidata nd; + struct thread *td; + struct dirent *dp; + struct vnode *mvp; + + i = *buflen; + error = 0; + covered = 0; + td = curthread; + + if (vp->v_type != VDIR) + return (ENOENT); + + error = VOP_GETATTR(vp, &va, cred); + if (error) + return (error); + + VREF(vp); + locked = VOP_ISLOCKED(vp); + VOP_UNLOCK(vp, 0); + NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, + "..", vp, td); + flags = FREAD; + error = vn_open_cred(&nd, &flags, 0, VN_OPEN_NOAUDIT, cred, NULL); + if (error) { + vn_lock(vp, locked | LK_RETRY); + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + + mvp = *dvp = nd.ni_vp; + + if (vp->v_mount != (*dvp)->v_mount && + ((*dvp)->v_vflag & VV_ROOT) && + ((*dvp)->v_mount->mnt_flag & MNT_UNION)) { + *dvp = (*dvp)->v_mount->mnt_vnodecovered; + VREF(mvp); + VOP_UNLOCK(mvp, 0); + vn_close(mvp, FREAD, cred, td); + VREF(*dvp); + vn_lock(*dvp, LK_SHARED | LK_RETRY); + covered = 1; + } + + fileno = va.va_fileid; + + dirbuflen = DEV_BSIZE; + if (dirbuflen < va.va_blocksize) + dirbuflen = va.va_blocksize; + dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); + + if ((*dvp)->v_type != VDIR) { + error = ENOENT; + goto out; + } + + off = 0; + len = 0; + do { + /* call VOP_READDIR of parent */ + error = get_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off, + &cpos, &len, &eofflag, td); + if (error) + goto out; + + if ((dp->d_type != DT_WHT) && + (dp->d_fileno == fileno)) { + if (covered) { + VOP_UNLOCK(*dvp, 0); + vn_lock(mvp, LK_SHARED | LK_RETRY); + if (dirent_exists(mvp, dp->d_name, td)) { + error = ENOENT; + VOP_UNLOCK(mvp, 0); + vn_lock(*dvp, LK_SHARED | LK_RETRY); + goto out; + } + VOP_UNLOCK(mvp, 0); + vn_lock(*dvp, LK_SHARED | LK_RETRY); + } + i -= dp->d_namlen; + + if (i < 0) { + error = ENOMEM; + goto out; + } + if (dp->d_namlen == 1 && dp->d_name[0] == '.') { + error = ENOENT; + } else { + bcopy(dp->d_name, buf + i, dp->d_namlen); + error = 0; + } + goto out; + } + } while (len > 0 || !eofflag); + error = ENOENT; + +out: + free(dirbuf, M_TEMP); + if (!error) { + *buflen = i; + vref(*dvp); + } + if (covered) { + vput(*dvp); + vrele(mvp); + } else { + VOP_UNLOCK(mvp, 0); + vn_close(mvp, FREAD, cred, td); + } + vn_lock(vp, locked | LK_RETRY); + return (error); +} + +int +vop_stdallocate(struct vop_allocate_args *ap) +{ +#ifdef __notyet__ + struct statfs *sfs; + off_t maxfilesize = 0; +#endif + struct iovec aiov; + struct vattr vattr, *vap; + struct uio auio; + off_t fsize, len, cur, offset; + uint8_t *buf; + struct thread *td; + struct vnode *vp; + size_t iosize; + int error; + + buf = NULL; + error = 0; + td = curthread; + vap = &vattr; + vp = ap->a_vp; + len = *ap->a_len; + offset = *ap->a_offset; + + error = VOP_GETATTR(vp, vap, td->td_ucred); + if (error != 0) + goto out; + fsize = vap->va_size; + iosize = vap->va_blocksize; + if (iosize == 0) + iosize = 
BLKDEV_IOSIZE; + if (iosize > MAXPHYS) + iosize = MAXPHYS; + buf = malloc(iosize, M_TEMP, M_WAITOK); + +#ifdef __notyet__ + /* + * Check if the filesystem sets f_maxfilesize; if not use + * VOP_SETATTR to perform the check. + */ + sfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = VFS_STATFS(vp->v_mount, sfs, td); + if (error == 0) + maxfilesize = sfs->f_maxfilesize; + free(sfs, M_STATFS); + if (error != 0) + goto out; + if (maxfilesize) { + if (offset > maxfilesize || len > maxfilesize || + offset + len > maxfilesize) { + error = EFBIG; + goto out; + } + } else +#endif + if (offset + len > vap->va_size) { + /* + * Test offset + len against the filesystem's maxfilesize. + */ + VATTR_NULL(vap); + vap->va_size = offset + len; + error = VOP_SETATTR(vp, vap, td->td_ucred); + if (error != 0) + goto out; + VATTR_NULL(vap); + vap->va_size = fsize; + error = VOP_SETATTR(vp, vap, td->td_ucred); + if (error != 0) + goto out; + } + + for (;;) { + /* + * Read and write back anything below the nominal file + * size. There's currently no way outside the filesystem + * to know whether this area is sparse or not. + */ + cur = iosize; + if ((offset % iosize) != 0) + cur -= (offset % iosize); + if (cur > len) + cur = len; + if (offset < fsize) { + aiov.iov_base = buf; + aiov.iov_len = cur; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = offset; + auio.uio_resid = cur; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_td = td; + error = VOP_READ(vp, &auio, 0, td->td_ucred); + if (error != 0) + break; + if (auio.uio_resid > 0) { + bzero(buf + cur - auio.uio_resid, + auio.uio_resid); + } + } else { + bzero(buf, cur); + } + + aiov.iov_base = buf; + aiov.iov_len = cur; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = offset; + auio.uio_resid = cur; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_td = td; + + error = VOP_WRITE(vp, &auio, 0, td->td_ucred); + if (error != 0) + break; + + len -= cur; + offset += cur; + if (len == 0) + break; + if (should_yield()) + break; + } + + out: + *ap->a_len = len; + *ap->a_offset = offset; + free(buf, M_TEMP); + return (error); +} + +int +vop_stdadvise(struct vop_advise_args *ap) +{ + struct vnode *vp; + struct bufobj *bo; + daddr_t startn, endn; + off_t start, end; + int bsize, error; + + vp = ap->a_vp; + switch (ap->a_advice) { + case POSIX_FADV_WILLNEED: + /* + * Do nothing for now. Filesystems should provide a + * custom method which starts an asynchronous read of + * the requested region. + */ + error = 0; + break; + case POSIX_FADV_DONTNEED: + error = 0; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (vp->v_iflag & VI_DOOMED) { + VOP_UNLOCK(vp, 0); + break; + } + + /* + * Deactivate pages in the specified range from the backing VM + * object. Pages that are resident in the buffer cache will + * remain wired until their corresponding buffers are released + * below. 
+ */ + if (vp->v_object != NULL) { + start = trunc_page(ap->a_start); + end = round_page(ap->a_end); + VM_OBJECT_RLOCK(vp->v_object); + vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start), + OFF_TO_IDX(end)); + VM_OBJECT_RUNLOCK(vp->v_object); + } + + bo = &vp->v_bufobj; + BO_RLOCK(bo); + bsize = vp->v_bufobj.bo_bsize; + startn = ap->a_start / bsize; + endn = ap->a_end / bsize; + error = bnoreuselist(&bo->bo_clean, bo, startn, endn); + if (error == 0) + error = bnoreuselist(&bo->bo_dirty, bo, startn, endn); + BO_RUNLOCK(bo); + VOP_UNLOCK(vp, 0); + break; + default: + error = EINVAL; + break; + } + return (error); +} + +int +vop_stdunp_bind(struct vop_unp_bind_args *ap) +{ + + ap->a_vp->v_unpcb = ap->a_unpcb; + return (0); +} + +int +vop_stdunp_connect(struct vop_unp_connect_args *ap) +{ + + *ap->a_unpcb = ap->a_vp->v_unpcb; + return (0); +} + +int +vop_stdunp_detach(struct vop_unp_detach_args *ap) +{ + + ap->a_vp->v_unpcb = NULL; + return (0); +} + +static int +vop_stdis_text(struct vop_is_text_args *ap) +{ + + return (ap->a_vp->v_writecount < 0); +} + +int +vop_stdset_text(struct vop_set_text_args *ap) +{ + struct vnode *vp; + struct mount *mp; + int error; + + vp = ap->a_vp; + VI_LOCK(vp); + if (vp->v_writecount > 0) { + error = ETXTBSY; + } else { + /* + * If requested by fs, keep a use reference to the + * vnode until the last text reference is released. + */ + mp = vp->v_mount; + if (mp != NULL && (mp->mnt_kern_flag & MNTK_TEXT_REFS) != 0 && + vp->v_writecount == 0) { + vp->v_iflag |= VI_TEXT_REF; + vrefl(vp); + } + + vp->v_writecount--; + error = 0; + } + VI_UNLOCK(vp); + return (error); +} + +static int +vop_stdunset_text(struct vop_unset_text_args *ap) +{ + struct vnode *vp; + int error; + bool last; + + vp = ap->a_vp; + last = false; + VI_LOCK(vp); + if (vp->v_writecount < 0) { + if ((vp->v_iflag & VI_TEXT_REF) != 0 && + vp->v_writecount == -1) { + last = true; + vp->v_iflag &= ~VI_TEXT_REF; + } + vp->v_writecount++; + error = 0; + } else { + error = EINVAL; + } + VI_UNLOCK(vp); + if (last) + vunref(vp); + return (error); +} + +static int +vop_stdadd_writecount(struct vop_add_writecount_args *ap) +{ + struct vnode *vp; + int error; + + vp = ap->a_vp; + VI_LOCK_FLAGS(vp, MTX_DUPOK); + if (vp->v_writecount < 0) { + error = ETXTBSY; + } else { + VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp, + ("neg writecount increment %d", ap->a_inc)); + vp->v_writecount += ap->a_inc; + error = 0; + } + VI_UNLOCK(vp); + return (error); +} + +/* + * vfs default ops + * used to fill the vfs function table to get reasonable default return values. + */ +int +vfs_stdroot (mp, flags, vpp) + struct mount *mp; + int flags; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +vfs_stdstatfs (mp, sbp) + struct mount *mp; + struct statfs *sbp; +{ + + return (EOPNOTSUPP); +} + +int +vfs_stdquotactl (mp, cmds, uid, arg) + struct mount *mp; + int cmds; + uid_t uid; + void *arg; +{ + + return (EOPNOTSUPP); +} + +int +vfs_stdsync(mp, waitfor) + struct mount *mp; + int waitfor; +{ + struct vnode *vp, *mvp; + struct thread *td; + int error, lockreq, allerror = 0; + + td = curthread; + lockreq = LK_EXCLUSIVE | LK_INTERLOCK; + if (waitfor != MNT_WAIT) + lockreq |= LK_NOWAIT; + /* + * Force stale buffer cache information to be flushed. 
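+ *
+ * Only vnodes that have dirty buffers are visited. With MNT_WAIT
+ * the vget() below blocks for each vnode lock; otherwise LK_NOWAIT
+ * (set above) skips vnodes that are currently busy. The most
+ * recent VOP_FSYNC() error, if any, is what gets returned.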
+ */ +loop: + MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { + if (vp->v_bufobj.bo_dirty.bv_cnt == 0) { + VI_UNLOCK(vp); + continue; + } + if ((error = vget(vp, lockreq, td)) != 0) { + if (error == ENOENT) { + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + goto loop; + } + continue; + } + error = VOP_FSYNC(vp, waitfor, td); + if (error) + allerror = error; + vput(vp); + } + return (allerror); +} + +int +vfs_stdnosync (mp, waitfor) + struct mount *mp; + int waitfor; +{ + + return (0); +} + +int +vfs_stdvget (mp, ino, flags, vpp) + struct mount *mp; + ino_t ino; + int flags; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +vfs_stdfhtovp (mp, fhp, flags, vpp) + struct mount *mp; + struct fid *fhp; + int flags; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +vfs_stdinit (vfsp) + struct vfsconf *vfsp; +{ + + return (0); +} + +int +vfs_stduninit (vfsp) + struct vfsconf *vfsp; +{ + + return(0); +} + +int +vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname) + struct mount *mp; + int cmd; + struct vnode *filename_vp; + int attrnamespace; + const char *attrname; +{ + + if (filename_vp != NULL) + VOP_UNLOCK(filename_vp, 0); + return (EOPNOTSUPP); +} + +int +vfs_stdsysctl(mp, op, req) + struct mount *mp; + fsctlop_t op; + struct sysctl_req *req; +{ + + return (EOPNOTSUPP); +} + +/* end of vfs default ops */ diff --git a/freebsd/sys/kern/vfs_export.c b/freebsd/sys/kern/vfs_export.c new file mode 100644 index 00000000..669d4e9f --- /dev/null +++ b/freebsd/sys/kern/vfs_export.c @@ -0,0 +1,528 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static MALLOC_DEFINE(M_NETADDR, "export_host", "Export host address structure"); + +#if defined(INET) || defined(INET6) +static struct radix_node_head *vfs_create_addrlist_af( + struct radix_node_head **prnh, int off); +#endif +static void vfs_free_addrlist(struct netexport *nep); +static int vfs_free_netcred(struct radix_node *rn, void *w); +static void vfs_free_addrlist_af(struct radix_node_head **prnh); +static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep, + struct export_args *argp); +static struct netcred *vfs_export_lookup(struct mount *, struct sockaddr *); + +/* + * Network address lookup element + */ +struct netcred { + struct radix_node netc_rnodes[2]; + int netc_exflags; + struct ucred *netc_anon; + int netc_numsecflavors; + int netc_secflavors[MAXSECFLAVORS]; +}; + +/* + * Network export information + */ +struct netexport { + struct netcred ne_defexported; /* Default export */ + struct radix_node_head *ne4; + struct radix_node_head *ne6; +}; + +/* + * Build hash lists of net addresses and hang them off the mount point. + * Called by vfs_export() to set up the lists of export addresses. + */ +static int +vfs_hang_addrlist(struct mount *mp, struct netexport *nep, + struct export_args *argp) +{ + struct netcred *np; + struct radix_node_head *rnh; + int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = NULL; +#if defined(INET6) || defined(INET) + int off; +#endif + int error; + + /* + * XXX: This routine converts from a `struct xucred' + * (argp->ex_anon) to a `struct ucred' (np->netc_anon). This + * operation is questionable; for example, what should be done + * with fields like cr_uidinfo and cr_prison? Currently, this + * routine does not touch them (leaves them as NULL). 
+ */ + if (argp->ex_anon.cr_version != XUCRED_VERSION) { + vfs_mount_error(mp, "ex_anon.cr_version: %d != %d", + argp->ex_anon.cr_version, XUCRED_VERSION); + return (EINVAL); + } + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) { + vfs_mount_error(mp, + "MNT_DEFEXPORTED already set for mount %p", mp); + return (EPERM); + } + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + np->netc_anon = crget(); + np->netc_anon->cr_uid = argp->ex_anon.cr_uid; + crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups, + argp->ex_anon.cr_groups); + np->netc_anon->cr_prison = &prison0; + prison_hold(np->netc_anon->cr_prison); + np->netc_numsecflavors = argp->ex_numsecflavors; + bcopy(argp->ex_secflavors, np->netc_secflavors, + sizeof(np->netc_secflavors)); + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_DEFEXPORTED; + MNT_IUNLOCK(mp); + return (0); + } + +#if MSIZE <= 256 + if (argp->ex_addrlen > MLEN) { + vfs_mount_error(mp, "ex_addrlen %d is greater than %d", + argp->ex_addrlen, MLEN); + return (EINVAL); + } +#endif + + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); + saddr = (struct sockaddr *) (np + 1); + if ((error = copyin(argp->ex_addr, saddr, argp->ex_addrlen))) + goto out; + if (saddr->sa_family == AF_UNSPEC || saddr->sa_family > AF_MAX) { + error = EINVAL; + vfs_mount_error(mp, "Invalid saddr->sa_family: %d"); + goto out; + } + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); + error = copyin(argp->ex_mask, smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + rnh = NULL; + switch (saddr->sa_family) { +#ifdef INET + case AF_INET: + if ((rnh = nep->ne4) == NULL) { + off = offsetof(struct sockaddr_in, sin_addr) << 3; + rnh = vfs_create_addrlist_af(&nep->ne4, off); + } + break; +#endif +#ifdef INET6 + case AF_INET6: + if ((rnh = nep->ne6) == NULL) { + off = offsetof(struct sockaddr_in6, sin6_addr) << 3; + rnh = vfs_create_addrlist_af(&nep->ne6, off); + } + break; +#endif + } + if (rnh == NULL) { + error = ENOBUFS; + vfs_mount_error(mp, "%s %s %d", + "Unable to initialize radix node head ", + "for address family", saddr->sa_family); + goto out; + } + RADIX_NODE_HEAD_LOCK(rnh); + rn = (*rnh->rnh_addaddr)(saddr, smask, &rnh->rh, np->netc_rnodes); + RADIX_NODE_HEAD_UNLOCK(rnh); + if (rn == NULL || np != (struct netcred *)rn) { /* already exists */ + error = EPERM; + vfs_mount_error(mp, + "netcred already exists for given addr/mask"); + goto out; + } + np->netc_exflags = argp->ex_flags; + np->netc_anon = crget(); + np->netc_anon->cr_uid = argp->ex_anon.cr_uid; + crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups, + argp->ex_anon.cr_groups); + np->netc_anon->cr_prison = &prison0; + prison_hold(np->netc_anon->cr_prison); + np->netc_numsecflavors = argp->ex_numsecflavors; + bcopy(argp->ex_secflavors, np->netc_secflavors, + sizeof(np->netc_secflavors)); + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* Helper for vfs_free_addrlist. 
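+ * It is invoked through (*rnh->rnh_walktree)() in
+ * vfs_free_addrlist_af(): each radix node is deleted from the tree,
+ * its anonymous credential (if any) is dropped with crfree(), and
+ * the enclosing netcred is freed.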
*/ +/* ARGSUSED */ +static int +vfs_free_netcred(struct radix_node *rn, void *w) +{ + struct radix_node_head *rnh = (struct radix_node_head *) w; + struct ucred *cred; + + (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, &rnh->rh); + cred = ((struct netcred *)rn)->netc_anon; + if (cred != NULL) + crfree(cred); + free(rn, M_NETADDR); + return (0); +} + +#if defined(INET) || defined(INET6) +static struct radix_node_head * +vfs_create_addrlist_af(struct radix_node_head **prnh, int off) +{ + + if (rn_inithead((void **)prnh, off) == 0) + return (NULL); + RADIX_NODE_HEAD_LOCK_INIT(*prnh); + return (*prnh); +} +#endif + +static void +vfs_free_addrlist_af(struct radix_node_head **prnh) +{ + struct radix_node_head *rnh; + + rnh = *prnh; + RADIX_NODE_HEAD_LOCK(rnh); + (*rnh->rnh_walktree)(&rnh->rh, vfs_free_netcred, rnh); + RADIX_NODE_HEAD_UNLOCK(rnh); + RADIX_NODE_HEAD_DESTROY(rnh); + rn_detachhead((void **)prnh); + prnh = NULL; +} + +/* + * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(struct netexport *nep) +{ + struct ucred *cred; + + if (nep->ne4 != NULL) + vfs_free_addrlist_af(&nep->ne4); + if (nep->ne6 != NULL) + vfs_free_addrlist_af(&nep->ne6); + + cred = nep->ne_defexported.netc_anon; + if (cred != NULL) + crfree(cred); + +} + +/* + * High level function to manipulate export options on a mount point + * and the passed in netexport. + * Struct export_args *argp is the variable used to twiddle options, + * the structure is described in sys/mount.h + */ +int +vfs_export(struct mount *mp, struct export_args *argp) +{ + struct netexport *nep; + int error; + + if (argp->ex_numsecflavors < 0 + || argp->ex_numsecflavors >= MAXSECFLAVORS) + return (EINVAL); + + error = 0; + lockmgr(&mp->mnt_explock, LK_EXCLUSIVE, NULL); + nep = mp->mnt_export; + if (argp->ex_flags & MNT_DELEXPORT) { + if (nep == NULL) { + error = ENOENT; + goto out; + } + if (mp->mnt_flag & MNT_EXPUBLIC) { + vfs_setpublicfs(NULL, NULL, NULL); + MNT_ILOCK(mp); + mp->mnt_flag &= ~MNT_EXPUBLIC; + MNT_IUNLOCK(mp); + } + vfs_free_addrlist(nep); + mp->mnt_export = NULL; + free(nep, M_MOUNT); + nep = NULL; + MNT_ILOCK(mp); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + MNT_IUNLOCK(mp); + } + if (argp->ex_flags & MNT_EXPORTED) { + if (nep == NULL) { + nep = malloc(sizeof(struct netexport), M_MOUNT, M_WAITOK | M_ZERO); + mp->mnt_export = nep; + } + if (argp->ex_flags & MNT_EXPUBLIC) { + if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) + goto out; + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_EXPUBLIC; + MNT_IUNLOCK(mp); + } + if ((error = vfs_hang_addrlist(mp, nep, argp))) + goto out; + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_EXPORTED; + MNT_IUNLOCK(mp); + } + +out: + lockmgr(&mp->mnt_explock, LK_RELEASE, NULL); + /* + * Once we have executed the vfs_export() command, we do + * not want to keep the "export" option around in the + * options list, since that will cause subsequent MNT_UPDATE + * calls to fail. The export information is saved in + * mp->mnt_export, so we can safely delete the "export" mount option + * here. + */ + vfs_deleteopt(mp->mnt_optnew, "export"); + vfs_deleteopt(mp->mnt_opt, "export"); + return (error); +} + +/* + * Set the publicly exported filesystem (WebNFS). 
Currently, only + * one public filesystem is possible in the spec (RFC 2054 and 2055) + */ +int +vfs_setpublicfs(struct mount *mp, struct netexport *nep, + struct export_args *argp) +{ + int error; + struct vnode *rvp; + char *cp; + + /* + * mp == NULL -> invalidate the current info, the FS is + * no longer exported. May be called from either vfs_export + * or unmount, so check if it hasn't already been done. + */ + if (mp == NULL) { + if (nfs_pub.np_valid) { + nfs_pub.np_valid = 0; + if (nfs_pub.np_index != NULL) { + free(nfs_pub.np_index, M_TEMP); + nfs_pub.np_index = NULL; + } + } + return (0); + } + + /* + * Only one allowed at a time. + */ + if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) + return (EBUSY); + + /* + * Get real filehandle for root of exported FS. + */ + bzero(&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); + nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; + + if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp))) + return (error); + + if ((error = VOP_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) + return (error); + + vput(rvp); + + /* + * If an indexfile was specified, pull it in. + */ + if (argp->ex_indexfile != NULL) { + if (nfs_pub.np_index == NULL) + nfs_pub.np_index = malloc(MAXNAMLEN + 1, M_TEMP, + M_WAITOK); + error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, + MAXNAMLEN, (size_t *)0); + if (!error) { + /* + * Check for illegal filenames. + */ + for (cp = nfs_pub.np_index; *cp; cp++) { + if (*cp == '/') { + error = EINVAL; + break; + } + } + } + if (error) { + free(nfs_pub.np_index, M_TEMP); + nfs_pub.np_index = NULL; + return (error); + } + } + + nfs_pub.np_mount = mp; + nfs_pub.np_valid = 1; + return (0); +} + +/* + * Used by the filesystems to determine if a given network address + * (passed in 'nam') is present in their exports list, returns a pointer + * to struct netcred so that the filesystem can examine it for + * access rights (read/write/etc). + */ +static struct netcred * +vfs_export_lookup(struct mount *mp, struct sockaddr *nam) +{ + RADIX_NODE_HEAD_RLOCK_TRACKER; + struct netexport *nep; + struct netcred *np = NULL; + struct radix_node_head *rnh; + struct sockaddr *saddr; + + nep = mp->mnt_export; + if (nep == NULL) + return (NULL); + if ((mp->mnt_flag & MNT_EXPORTED) == 0) + return (NULL); + + /* + * Lookup in the export list + */ + if (nam != NULL) { + saddr = nam; + rnh = NULL; + switch (saddr->sa_family) { + case AF_INET: + rnh = nep->ne4; + break; + case AF_INET6: + rnh = nep->ne6; + break; + } + if (rnh != NULL) { + RADIX_NODE_HEAD_RLOCK(rnh); + np = (struct netcred *) (*rnh->rnh_matchaddr)(saddr, &rnh->rh); + RADIX_NODE_HEAD_RUNLOCK(rnh); + if (np != NULL && (np->netc_rnodes->rn_flags & RNF_ROOT) != 0) + return (NULL); + } + } + + /* + * If no address match, use the default if it exists. + */ + if (np == NULL && (mp->mnt_flag & MNT_DEFEXPORTED) != 0) + return (&nep->ne_defexported); + + return (np); +} + +/* + * XXX: This comment comes from the deprecated ufs_check_export() + * XXX: and may not entirely apply, but lacking something better: + * This is the generic part of fhtovp called after the underlying + * filesystem has validated the file handle. + * + * Verify that a host should have access to a filesystem. 
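+ *
+ * Illustrative call shape, as a consumer such as an NFS server
+ * would issue it (the local variable names here are made up):
+ *
+ *	error = VFS_CHECKEXP(mp, (struct sockaddr *)&claddr, &exflags,
+ *	    &credanon, &numsec, &secflavors);
+ *
+ * EACCES means the address is not in the export list; on success
+ * the caller owns the reference on credanon taken below and must
+ * crfree() it when finished.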
+ */ + +int +vfs_stdcheckexp(struct mount *mp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors) +{ + struct netcred *np; + + lockmgr(&mp->mnt_explock, LK_SHARED, NULL); + np = vfs_export_lookup(mp, nam); + if (np == NULL) { + lockmgr(&mp->mnt_explock, LK_RELEASE, NULL); + *credanonp = NULL; + return (EACCES); + } + *extflagsp = np->netc_exflags; + if ((*credanonp = np->netc_anon) != NULL) + crhold(*credanonp); + if (numsecflavors) + *numsecflavors = np->netc_numsecflavors; + if (secflavors) + *secflavors = np->netc_secflavors; + lockmgr(&mp->mnt_explock, LK_RELEASE, NULL); + return (0); +} + diff --git a/freebsd/sys/kern/vfs_extattr.c b/freebsd/sys/kern/vfs_extattr.c new file mode 100644 index 00000000..2903fd37 --- /dev/null +++ b/freebsd/sys/kern/vfs_extattr.c @@ -0,0 +1,757 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 1999-2001 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static int kern_extattr_set_path(struct thread *td, const char *path, + int attrnamespace, const char *attrname, void *data, + size_t nbytes, int follow); +static int kern_extattr_get_path(struct thread *td, const char *path, + int attrnamespace, const char *attrname, void *data, + size_t nbytes, int follow); +static int kern_extattr_delete_path(struct thread *td, const char *path, + int attrnamespace, const char *attrname, int follow); +static int kern_extattr_list_path(struct thread *td, const char *path, + int attrnamespace, void *data, size_t nbytes, int follow); + +/* + * Syscall to push extended attribute configuration information into the VFS. + * Accepts a path, which it converts to a mountpoint, as well as a command + * (int cmd), and attribute name and misc data. + * + * Currently this is used only by UFS1 extended attributes. 
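+ *
+ * A sketch of the userland call sequence for a UFS1 volume; the
+ * command constants come from the UFS1 backend and the paths are
+ * made up for illustration:
+ *
+ *	extattrctl("/mnt", UFS_EXTATTR_CMD_START, NULL, 0, NULL);
+ *	extattrctl("/mnt", UFS_EXTATTR_CMD_ENABLE,
+ *	    "/mnt/.attribute/user/md5", EXTATTR_NAMESPACE_USER, "md5");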
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct extattrctl_args { + const char *path; + int cmd; + const char *filename; + int attrnamespace; + const char *attrname; +}; +#endif +int +sys_extattrctl(struct thread *td, struct extattrctl_args *uap) +{ + struct vnode *filename_vp; + struct nameidata nd; + struct mount *mp, *mp_writable; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + AUDIT_ARG_CMD(uap->cmd); + AUDIT_ARG_VALUE(uap->attrnamespace); + /* + * uap->attrname is not always defined. We check again later when we + * invoke the VFS call so as to pass in NULL there if needed. + */ + if (uap->attrname != NULL) { + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, + NULL); + if (error) + return (error); + } + AUDIT_ARG_TEXT(attrname); + + mp = NULL; + filename_vp = NULL; + if (uap->filename != NULL) { + NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE2, + UIO_USERSPACE, uap->filename, td); + error = namei(&nd); + if (error) + return (error); + filename_vp = nd.ni_vp; + NDFREE(&nd, NDF_NO_VP_RELE); + } + + /* uap->path is always defined. */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, + UIO_USERSPACE, uap->path, td); + error = namei(&nd); + if (error) + goto out; + mp = nd.ni_vp->v_mount; + error = vfs_busy(mp, 0); + if (error) { + NDFREE(&nd, 0); + mp = NULL; + goto out; + } + VOP_UNLOCK(nd.ni_vp, 0); + error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH); + NDFREE(&nd, NDF_NO_VP_UNLOCK); + if (error) + goto out; + if (filename_vp != NULL) { + /* + * uap->filename is not always defined. If it is, + * grab a vnode lock, which VFS_EXTATTRCTL() will + * later release. + */ + error = vn_lock(filename_vp, LK_EXCLUSIVE); + if (error) { + vn_finished_write(mp_writable); + goto out; + } + } + + error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace, + uap->attrname != NULL ? attrname : NULL); + + vn_finished_write(mp_writable); +out: + if (mp != NULL) + vfs_unbusy(mp); + + /* + * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp, + * so vrele it if it is defined. + */ + if (filename_vp != NULL) + vrele(filename_vp); + return (error); +} + +/*- + * Set a named extended attribute on a file or directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", userspace buffer + * pointer "data", buffer length "nbytes", thread "td". 
+ * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, + void *data, size_t nbytes, struct thread *td) +{ + struct mount *mp; + struct uio auio; + struct iovec aiov; + ssize_t cnt; + int error; + + if (nbytes > IOSIZE_MAX) + return (EINVAL); + + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_resid = nbytes; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + cnt = nbytes; + +#ifdef MAC + error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace, + attrname); + if (error) + goto done; +#endif + + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, + td->td_ucred, td); + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + +#ifdef MAC +done: +#endif + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_set_fd_args { + int fd; + int attrnamespace; + const char *attrname; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_set_fd(struct thread *td, struct extattr_set_fd_args *uap) +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_VALUE(uap->attrnamespace); + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + AUDIT_ARG_TEXT(attrname); + + error = getvnode(td, uap->fd, + cap_rights_init(&rights, CAP_EXTATTR_SET), &fp); + if (error) + return (error); + + error = extattr_set_vp(fp->f_vnode, uap->attrnamespace, + attrname, uap->data, uap->nbytes, td); + fdrop(fp, td); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_set_file_args { + const char *path; + int attrnamespace; + const char *attrname; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_set_file(struct thread *td, struct extattr_set_file_args *uap) +{ + + return (kern_extattr_set_path(td, uap->path, uap->attrnamespace, + uap->attrname, uap->data, uap->nbytes, FOLLOW)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_set_link_args { + const char *path; + int attrnamespace; + const char *attrname; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_set_link(struct thread *td, struct extattr_set_link_args *uap) +{ + + return (kern_extattr_set_path(td, uap->path, uap->attrnamespace, + uap->attrname, uap->data, uap->nbytes, NOFOLLOW)); +} + +static int +kern_extattr_set_path(struct thread *td, const char *path, int attrnamespace, + const char *uattrname, void *data, size_t nbytes, int follow) +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + AUDIT_ARG_VALUE(attrnamespace); + error = copyinstr(uattrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + AUDIT_ARG_TEXT(attrname); + + NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_set_vp(nd.ni_vp, attrnamespace, attrname, data, + nbytes, td); + + vrele(nd.ni_vp); + return (error); +} + +/*- + * Get a named extended attribute on a file or directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", 
userspace buffer + * pointer "data", buffer length "nbytes", thread "td". + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, + void *data, size_t nbytes, struct thread *td) +{ + struct uio auio, *auiop; + struct iovec aiov; + ssize_t cnt; + size_t size, *sizep; + int error; + + if (nbytes > IOSIZE_MAX) + return (EINVAL); + + vn_lock(vp, LK_SHARED | LK_RETRY); + + /* + * Slightly unusual semantics: if the user provides a NULL data + * pointer, they don't want to receive the data, just the maximum + * read length. + */ + auiop = NULL; + sizep = NULL; + cnt = 0; + if (data != NULL) { + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_resid = nbytes; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auiop = &auio; + cnt = nbytes; + } else + sizep = &size; + +#ifdef MAC + error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace, + attrname); + if (error) + goto done; +#endif + + error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, + td->td_ucred, td); + + if (auiop != NULL) { + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + } else + td->td_retval[0] = size; +#ifdef MAC +done: +#endif + VOP_UNLOCK(vp, 0); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_get_fd_args { + int fd; + int attrnamespace; + const char *attrname; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_get_fd(struct thread *td, struct extattr_get_fd_args *uap) +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_VALUE(uap->attrnamespace); + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + AUDIT_ARG_TEXT(attrname); + + error = getvnode(td, uap->fd, + cap_rights_init(&rights, CAP_EXTATTR_GET), &fp); + if (error) + return (error); + + error = extattr_get_vp(fp->f_vnode, uap->attrnamespace, + attrname, uap->data, uap->nbytes, td); + + fdrop(fp, td); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_get_file_args { + const char *path; + int attrnamespace; + const char *attrname; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_get_file(struct thread *td, struct extattr_get_file_args *uap) +{ + return (kern_extattr_get_path(td, uap->path, uap->attrnamespace, + uap->attrname, uap->data, uap->nbytes, FOLLOW)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_get_link_args { + const char *path; + int attrnamespace; + const char *attrname; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_get_link(struct thread *td, struct extattr_get_link_args *uap) +{ + return (kern_extattr_get_path(td, uap->path, uap->attrnamespace, + uap->attrname, uap->data, uap->nbytes, NOFOLLOW)); +} + +static int +kern_extattr_get_path(struct thread *td, const char *path, int attrnamespace, + const char *uattrname, void *data, size_t nbytes, int follow) +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + AUDIT_ARG_VALUE(attrnamespace); + error = copyinstr(uattrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + AUDIT_ARG_TEXT(attrname); + + NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = 
extattr_get_vp(nd.ni_vp, attrnamespace, attrname, data, + nbytes, td); + + vrele(nd.ni_vp); + return (error); +} + +/* + * extattr_delete_vp(): Delete a named extended attribute on a file or + * directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", proc "p" + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, + struct thread *td) +{ + struct mount *mp; + int error; + + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + +#ifdef MAC + error = mac_vnode_check_deleteextattr(td->td_ucred, vp, attrnamespace, + attrname); + if (error) + goto done; +#endif + + error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred, + td); + if (error == EOPNOTSUPP) + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, + td->td_ucred, td); +#ifdef MAC +done: +#endif + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_delete_fd_args { + int fd; + int attrnamespace; + const char *attrname; +}; +#endif +int +sys_extattr_delete_fd(struct thread *td, struct extattr_delete_fd_args *uap) +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_VALUE(uap->attrnamespace); + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + AUDIT_ARG_TEXT(attrname); + + error = getvnode(td, uap->fd, + cap_rights_init(&rights, CAP_EXTATTR_DELETE), &fp); + if (error) + return (error); + + error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace, + attrname, td); + fdrop(fp, td); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_delete_file_args { + const char *path; + int attrnamespace; + const char *attrname; +}; +#endif +int +sys_extattr_delete_file(struct thread *td, struct extattr_delete_file_args *uap) +{ + + return (kern_extattr_delete_path(td, uap->path, uap->attrnamespace, + uap->attrname, FOLLOW)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_delete_link_args { + const char *path; + int attrnamespace; + const char *attrname; +}; +#endif +int +sys_extattr_delete_link(struct thread *td, struct extattr_delete_link_args *uap) +{ + + return (kern_extattr_delete_path(td, uap->path, uap->attrnamespace, + uap->attrname, NOFOLLOW)); +} + +static int +kern_extattr_delete_path(struct thread *td, const char *path, int attrnamespace, + const char *uattrname, int follow) +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + AUDIT_ARG_VALUE(attrnamespace); + error = copyinstr(uattrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return(error); + AUDIT_ARG_TEXT(attrname); + + NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error) + return(error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_delete_vp(nd.ni_vp, attrnamespace, attrname, td); + vrele(nd.ni_vp); + return(error); +} + +/*- + * Retrieve a list of extended attributes on a file or directory. + * + * Arguments: unlocked vnode "vp", attribute namespace 'attrnamespace", + * userspace buffer pointer "data", buffer length "nbytes", + * thread "td". 
+ * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_list_vp(struct vnode *vp, int attrnamespace, void *data, + size_t nbytes, struct thread *td) +{ + struct uio auio, *auiop; + size_t size, *sizep; + struct iovec aiov; + ssize_t cnt; + int error; + + if (nbytes > IOSIZE_MAX) + return (EINVAL); + + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + + auiop = NULL; + sizep = NULL; + cnt = 0; + if (data != NULL) { + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_resid = nbytes; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auiop = &auio; + cnt = nbytes; + } else + sizep = &size; + +#ifdef MAC + error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace); + if (error) + goto done; +#endif + + error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep, + td->td_ucred, td); + + if (auiop != NULL) { + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + } else + td->td_retval[0] = size; +#ifdef MAC +done: +#endif + VOP_UNLOCK(vp, 0); + return (error); +} + + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_list_fd_args { + int fd; + int attrnamespace; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_list_fd(struct thread *td, struct extattr_list_fd_args *uap) +{ + struct file *fp; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_VALUE(uap->attrnamespace); + error = getvnode(td, uap->fd, + cap_rights_init(&rights, CAP_EXTATTR_LIST), &fp); + if (error) + return (error); + + error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data, + uap->nbytes, td); + + fdrop(fp, td); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_list_file_args { + const char *path; + int attrnamespace; + void *data; + size_t nbytes; +} +#endif +int +sys_extattr_list_file(struct thread *td, struct extattr_list_file_args *uap) +{ + + return (kern_extattr_list_path(td, uap->path, uap->attrnamespace, + uap->data, uap->nbytes, FOLLOW)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_list_link_args { + const char *path; + int attrnamespace; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_list_link(struct thread *td, struct extattr_list_link_args *uap) +{ + + return (kern_extattr_list_path(td, uap->path, uap->attrnamespace, + uap->data, uap->nbytes, NOFOLLOW)); +} + +static int +kern_extattr_list_path(struct thread *td, const char *path, int attrnamespace, + void *data, size_t nbytes, int follow) +{ + struct nameidata nd; + int error; + + AUDIT_ARG_VALUE(attrnamespace); + NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_list_vp(nd.ni_vp, attrnamespace, data, nbytes, td); + + vrele(nd.ni_vp); + return (error); +} diff --git a/freebsd/sys/kern/vfs_hash.c b/freebsd/sys/kern/vfs_hash.c new file mode 100644 index 00000000..b938f485 --- /dev/null +++ b/freebsd/sys/kern/vfs_hash.c @@ -0,0 +1,234 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2005 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +static MALLOC_DEFINE(M_VFS_HASH, "vfs_hash", "VFS hash table"); + +static LIST_HEAD(vfs_hash_head, vnode) *vfs_hash_tbl; +static LIST_HEAD(,vnode) vfs_hash_side; +static u_long vfs_hash_mask; +static struct rwlock vfs_hash_lock; + +static void +vfs_hashinit(void *dummy __unused) +{ + + vfs_hash_tbl = hashinit(desiredvnodes, M_VFS_HASH, &vfs_hash_mask); + rw_init(&vfs_hash_lock, "vfs hash"); + LIST_INIT(&vfs_hash_side); +} + +/* Must be SI_ORDER_SECOND so desiredvnodes is available */ +SYSINIT(vfs_hash, SI_SUB_VFS, SI_ORDER_SECOND, vfs_hashinit, NULL); + +u_int +vfs_hash_index(struct vnode *vp) +{ + + return (vp->v_hash + vp->v_mount->mnt_hashseed); +} + +static struct vfs_hash_head * +vfs_hash_bucket(const struct mount *mp, u_int hash) +{ + + return (&vfs_hash_tbl[(hash + mp->mnt_hashseed) & vfs_hash_mask]); +} + +int +vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, + struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg) +{ + struct vnode *vp; + int error; + + while (1) { + rw_rlock(&vfs_hash_lock); + LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) { + if (vp->v_hash != hash) + continue; + if (vp->v_mount != mp) + continue; + if (fn != NULL && fn(vp, arg)) + continue; + vhold(vp); + rw_runlock(&vfs_hash_lock); + error = vget(vp, flags | LK_VNHELD, td); + if (error == ENOENT && (flags & LK_NOWAIT) == 0) + break; + if (error) + return (error); + *vpp = vp; + return (0); + } + if (vp == NULL) { + rw_runlock(&vfs_hash_lock); + *vpp = NULL; + return (0); + } + } +} + +void +vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td, + struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg) +{ + struct vnode *vp; + + while (1) { + rw_rlock(&vfs_hash_lock); + LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) { + if (vp->v_hash != hash) + continue; + if (vp->v_mount != mp) + continue; + if (fn != NULL && fn(vp, arg)) + continue; + vhold(vp); + rw_runlock(&vfs_hash_lock); + vref(vp); + vdrop(vp); + *vpp = vp; + return; + } + if (vp == NULL) { + rw_runlock(&vfs_hash_lock); + *vpp = NULL; + return; + } + } +} + +void +vfs_hash_remove(struct vnode *vp) +{ + + rw_wlock(&vfs_hash_lock); + LIST_REMOVE(vp, v_hashlist); + rw_wunlock(&vfs_hash_lock); +} + +int +vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, + struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg) +{ + struct vnode *vp2; + int error; + + *vpp = NULL; + while (1) { + 
rw_wlock(&vfs_hash_lock); + LIST_FOREACH(vp2, + vfs_hash_bucket(vp->v_mount, hash), v_hashlist) { + if (vp2->v_hash != hash) + continue; + if (vp2->v_mount != vp->v_mount) + continue; + if (fn != NULL && fn(vp2, arg)) + continue; + vhold(vp2); + rw_wunlock(&vfs_hash_lock); + error = vget(vp2, flags | LK_VNHELD, td); + if (error == ENOENT && (flags & LK_NOWAIT) == 0) + break; + rw_wlock(&vfs_hash_lock); + LIST_INSERT_HEAD(&vfs_hash_side, vp, v_hashlist); + rw_wunlock(&vfs_hash_lock); + vput(vp); + if (!error) + *vpp = vp2; + return (error); + } + if (vp2 == NULL) + break; + + } + vp->v_hash = hash; + LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist); + rw_wunlock(&vfs_hash_lock); + return (0); +} + +void +vfs_hash_rehash(struct vnode *vp, u_int hash) +{ + + rw_wlock(&vfs_hash_lock); + LIST_REMOVE(vp, v_hashlist); + LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist); + vp->v_hash = hash; + rw_wunlock(&vfs_hash_lock); +} + +void +vfs_hash_changesize(int newmaxvnodes) +{ + struct vfs_hash_head *vfs_hash_newtbl, *vfs_hash_oldtbl; + u_long vfs_hash_newmask, vfs_hash_oldmask; + struct vnode *vp; + int i; + + vfs_hash_newtbl = hashinit(newmaxvnodes, M_VFS_HASH, + &vfs_hash_newmask); + /* If same hash table size, nothing to do */ + if (vfs_hash_mask == vfs_hash_newmask) { + free(vfs_hash_newtbl, M_VFS_HASH); + return; + } + /* + * Move everything from the old hash table to the new table. + * None of the vnodes in the table can be recycled because to + * do so, they have to be removed from the hash table. + */ + rw_wlock(&vfs_hash_lock); + vfs_hash_oldtbl = vfs_hash_tbl; + vfs_hash_oldmask = vfs_hash_mask; + vfs_hash_tbl = vfs_hash_newtbl; + vfs_hash_mask = vfs_hash_newmask; + for (i = 0; i <= vfs_hash_oldmask; i++) { + while ((vp = LIST_FIRST(&vfs_hash_oldtbl[i])) != NULL) { + LIST_REMOVE(vp, v_hashlist); + LIST_INSERT_HEAD( + vfs_hash_bucket(vp->v_mount, vp->v_hash), + vp, v_hashlist); + } + } + rw_wunlock(&vfs_hash_lock); + free(vfs_hash_oldtbl, M_VFS_HASH); +} diff --git a/freebsd/sys/kern/vfs_init.c b/freebsd/sys/kern/vfs_init.c new file mode 100644 index 00000000..5eb38e6d --- /dev/null +++ b/freebsd/sys/kern/vfs_init.c @@ -0,0 +1,376 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int vfs_register(struct vfsconf *); +static int vfs_unregister(struct vfsconf *); + +MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes"); + +/* + * The highest defined VFS number. + */ +int maxvfsconf = VFS_GENERIC + 1; + +/* + * Single-linked list of configured VFSes. + * New entries are added/deleted by vfs_register()/vfs_unregister() + */ +struct vfsconfhead vfsconf = TAILQ_HEAD_INITIALIZER(vfsconf); +struct sx vfsconf_sx; +SX_SYSINIT(vfsconf, &vfsconf_sx, "vfsconf"); + +/* + * Loader.conf variable vfs.typenumhash enables setting vfc_typenum using a hash + * calculation on vfc_name, so that it doesn't change when file systems are + * loaded in a different order. This will avoid the NFS server file handles from + * changing for file systems that use vfc_typenum in their fsid. + */ +static int vfs_typenumhash = 1; +SYSCTL_INT(_vfs, OID_AUTO, typenumhash, CTLFLAG_RDTUN, &vfs_typenumhash, 0, + "Set vfc_typenum using a hash calculation on vfc_name, so that it does not" + "change when file systems are loaded in a different order."); + +/* + * A Zen vnode attribute structure. + * + * Initialized when the first filesystem registers by vfs_register(). + */ +struct vattr va_null; + +/* + * vfs_init.c + * + * Allocate and fill in operations vectors. + * + * An undocumented feature of this approach to defining operations is that + * there can be multiple entries in vfs_opv_descs for the same operations + * vector. This allows third parties to extend the set of operations + * supported by another layer in a binary compatibile way. For example, + * assume that NFS needed to be modified to support Ficus. NFS has an entry + * (probably nfs_vnopdeop_decls) declaring all the operations NFS supports by + * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_entensions) + * listing those new operations Ficus adds to NFS, all without modifying the + * NFS code. (Of couse, the OTW NFS protocol still needs to be munged, but + * that is a(whole)nother story.) This is a feature. + */ + +/* + * Routines having to do with the management of the vnode table. 
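+ * (The functions that follow manage the table of registered
+ * filesystem types.) For example, a sketch of an on-demand type
+ * lookup, with a made-up filesystem name:
+ *
+ *	vfsp = vfs_byname_kld("examplefs", td, &error);
+ *	if (vfsp == NULL)
+ *		return (error);	/* unknown type or module load failed */
+ *
+ * Note that vfs_byname_locked() silently aliases "ffs" to "ufs".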
+ */ + +static struct vfsconf * +vfs_byname_locked(const char *name) +{ + struct vfsconf *vfsp; + + sx_assert(&vfsconf_sx, SA_LOCKED); + if (!strcmp(name, "ffs")) + name = "ufs"; + TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { + if (!strcmp(name, vfsp->vfc_name)) + return (vfsp); + } + return (NULL); +} + +struct vfsconf * +vfs_byname(const char *name) +{ + struct vfsconf *vfsp; + + vfsconf_slock(); + vfsp = vfs_byname_locked(name); + vfsconf_sunlock(); + return (vfsp); +} + +struct vfsconf * +vfs_byname_kld(const char *fstype, struct thread *td, int *error) +{ + struct vfsconf *vfsp; + int fileid, loaded; + + vfsp = vfs_byname(fstype); + if (vfsp != NULL) + return (vfsp); + + /* Try to load the respective module. */ + *error = kern_kldload(td, fstype, &fileid); + loaded = (*error == 0); + if (*error == EEXIST) + *error = 0; + if (*error) + return (NULL); + + /* Look up again to see if the VFS was loaded. */ + vfsp = vfs_byname(fstype); + if (vfsp == NULL) { + if (loaded) + (void)kern_kldunload(td, fileid, LINKER_UNLOAD_FORCE); + *error = ENODEV; + return (NULL); + } + return (vfsp); +} + + +/* Register a new filesystem type in the global table */ +static int +vfs_register(struct vfsconf *vfc) +{ + struct sysctl_oid *oidp; + struct vfsops *vfsops; + static int once; + struct vfsconf *tvfc; + uint32_t hashval; + int secondpass; + + if (!once) { + vattr_null(&va_null); + once = 1; + } + + if (vfc->vfc_version != VFS_VERSION) { + printf("ERROR: filesystem %s, unsupported ABI version %x\n", + vfc->vfc_name, vfc->vfc_version); + return (EINVAL); + } + vfsconf_lock(); + if (vfs_byname_locked(vfc->vfc_name) != NULL) { + vfsconf_unlock(); + return (EEXIST); + } + + if (vfs_typenumhash != 0) { + /* + * Calculate a hash on vfc_name to use for vfc_typenum. Unless + * all of 1<->255 are assigned, it is limited to 8bits since + * that is what ZFS uses from vfc_typenum and is also the + * preferred range for vfs_getnewfsid(). + */ + hashval = fnv_32_str(vfc->vfc_name, FNV1_32_INIT); + hashval &= 0xff; + secondpass = 0; + do { + /* Look for and fix any collision. */ + TAILQ_FOREACH(tvfc, &vfsconf, vfc_list) { + if (hashval == tvfc->vfc_typenum) { + if (hashval == 255 && secondpass == 0) { + hashval = 1; + secondpass = 1; + } else + hashval++; + break; + } + } + } while (tvfc != NULL); + vfc->vfc_typenum = hashval; + if (vfc->vfc_typenum >= maxvfsconf) + maxvfsconf = vfc->vfc_typenum + 1; + } else + vfc->vfc_typenum = maxvfsconf++; + TAILQ_INSERT_TAIL(&vfsconf, vfc, vfc_list); + + /* + * Initialise unused ``struct vfsops'' fields, to use + * the vfs_std*() functions. Note, we need the mount + * and unmount operations, at the least. The check + * for vfsops available is just a debugging aid. + */ + KASSERT(vfc->vfc_vfsops != NULL, + ("Filesystem %s has no vfsops", vfc->vfc_name)); + /* + * Check the mount and unmount operations. + */ + vfsops = vfc->vfc_vfsops; + KASSERT(vfsops->vfs_mount != NULL, + ("Filesystem %s has no mount op", vfc->vfc_name)); + KASSERT(vfsops->vfs_unmount != NULL, + ("Filesystem %s has no unmount op", vfc->vfc_name)); + + if (vfsops->vfs_root == NULL) + /* return file system's root vnode */ + vfsops->vfs_root = vfs_stdroot; + if (vfsops->vfs_quotactl == NULL) + /* quota control */ + vfsops->vfs_quotactl = vfs_stdquotactl; + if (vfsops->vfs_statfs == NULL) + /* return file system's status */ + vfsops->vfs_statfs = vfs_stdstatfs; + if (vfsops->vfs_sync == NULL) + /* + * flush unwritten data (nosync) + * file systems can use vfs_stdsync + * explicitly by setting it in the + * vfsop vector. 
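+ *
+ * A filesystem that does want the generic flush can install it
+ * in its own vfsops initializer, e.g. (sketch, placeholder names):
+ *
+ *	static struct vfsops examplefs_vfsops = {
+ *		.vfs_mount	= examplefs_mount,
+ *		.vfs_unmount	= examplefs_unmount,
+ *		.vfs_sync	= vfs_stdsync,
+ *	};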
+ */ + vfsops->vfs_sync = vfs_stdnosync; + if (vfsops->vfs_vget == NULL) + /* convert an inode number to a vnode */ + vfsops->vfs_vget = vfs_stdvget; + if (vfsops->vfs_fhtovp == NULL) + /* turn an NFS file handle into a vnode */ + vfsops->vfs_fhtovp = vfs_stdfhtovp; + if (vfsops->vfs_checkexp == NULL) + /* check if file system is exported */ + vfsops->vfs_checkexp = vfs_stdcheckexp; + if (vfsops->vfs_init == NULL) + /* file system specific initialisation */ + vfsops->vfs_init = vfs_stdinit; + if (vfsops->vfs_uninit == NULL) + /* file system specific uninitialisation */ + vfsops->vfs_uninit = vfs_stduninit; + if (vfsops->vfs_extattrctl == NULL) + /* extended attribute control */ + vfsops->vfs_extattrctl = vfs_stdextattrctl; + if (vfsops->vfs_sysctl == NULL) + vfsops->vfs_sysctl = vfs_stdsysctl; + + if (vfc->vfc_flags & VFCF_JAIL) + prison_add_vfs(vfc); + + /* + * Call init function for this VFS... + */ + (*(vfc->vfc_vfsops->vfs_init))(vfc); + vfsconf_unlock(); + + /* + * If this filesystem has a sysctl node under vfs + * (i.e. vfs.xxfs), then change the oid number of that node to + * match the filesystem's type number. This allows user code + * which uses the type number to read sysctl variables defined + * by the filesystem to continue working. Since the oids are + * in a sorted list, we need to make sure the order is + * preserved by re-registering the oid after modifying its + * number. + */ + sysctl_wlock(); + SLIST_FOREACH(oidp, SYSCTL_CHILDREN(&sysctl___vfs), oid_link) { + if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) { + sysctl_unregister_oid(oidp); + oidp->oid_number = vfc->vfc_typenum; + sysctl_register_oid(oidp); + break; + } + } + sysctl_wunlock(); + + return (0); +} + + +/* Remove registration of a filesystem type */ +static int +vfs_unregister(struct vfsconf *vfc) +{ + struct vfsconf *vfsp; + int error, maxtypenum; + + vfsconf_lock(); + vfsp = vfs_byname_locked(vfc->vfc_name); + if (vfsp == NULL) { + vfsconf_unlock(); + return (EINVAL); + } + if (vfsp->vfc_refcount != 0) { + vfsconf_unlock(); + return (EBUSY); + } + if (vfc->vfc_vfsops->vfs_uninit != NULL) { + error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp); + if (error != 0) { + vfsconf_unlock(); + return (error); + } + } + TAILQ_REMOVE(&vfsconf, vfsp, vfc_list); + maxtypenum = VFS_GENERIC; + TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) + if (maxtypenum < vfsp->vfc_typenum) + maxtypenum = vfsp->vfc_typenum; + maxvfsconf = maxtypenum + 1; + vfsconf_unlock(); + return (0); +} + +/* + * Standard kernel module handling code for filesystem modules. + * Referenced from VFS_SET(). + */ +int +vfs_modevent(module_t mod, int type, void *data) +{ + struct vfsconf *vfc; + int error = 0; + + vfc = (struct vfsconf *)data; + + switch (type) { + case MOD_LOAD: + if (vfc) + error = vfs_register(vfc); + break; + + case MOD_UNLOAD: + if (vfc) + error = vfs_unregister(vfc); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} diff --git a/freebsd/sys/kern/vfs_lookup.c b/freebsd/sys/kern/vfs_lookup.c new file mode 100644 index 00000000..5ee3f219 --- /dev/null +++ b/freebsd/sys/kern/vfs_lookup.c @@ -0,0 +1,1450 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. 
and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_capsicum.h" +#include "opt_ktrace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#endif + +#include +#include + +#include + +#define NAMEI_DIAGNOSTIC 1 +#undef NAMEI_DIAGNOSTIC + +SDT_PROVIDER_DECLARE(vfs); +SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *", + "unsigned long"); +SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *"); + +/* Allocation zone for namei. */ +uma_zone_t namei_zone; + +/* Placeholder vnode for mp traversal. */ +static struct vnode *vp_crossmp; + +static int +crossmp_vop_islocked(struct vop_islocked_args *ap) +{ + + return (LK_SHARED); +} + +static int +crossmp_vop_lock1(struct vop_lock1_args *ap) +{ + struct vnode *vp; + struct lock *lk __unused; + const char *file __unused; + int flags, line __unused; + + vp = ap->a_vp; + lk = vp->v_vnlock; + flags = ap->a_flags; + file = ap->a_file; + line = ap->a_line; + + if ((flags & LK_SHARED) == 0) + panic("invalid lock request for crossmp"); + + WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, + flags & LK_INTERLOCK ? 
&VI_MTX(vp)->lock_object : NULL); + WITNESS_LOCK(&lk->lock_object, 0, file, line); + if ((flags & LK_INTERLOCK) != 0) + VI_UNLOCK(vp); + LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, line); + return (0); +} + +static int +crossmp_vop_unlock(struct vop_unlock_args *ap) +{ + struct vnode *vp; + struct lock *lk __unused; + int flags; + + vp = ap->a_vp; + lk = vp->v_vnlock; + flags = ap->a_flags; + + if ((flags & LK_INTERLOCK) != 0) + VI_UNLOCK(vp); + WITNESS_UNLOCK(&lk->lock_object, 0, LOCK_FILE, LOCK_LINE); + LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE, + LOCK_LINE); + return (0); +} + +static struct vop_vector crossmp_vnodeops = { + .vop_default = &default_vnodeops, + .vop_islocked = crossmp_vop_islocked, + .vop_lock1 = crossmp_vop_lock1, + .vop_unlock = crossmp_vop_unlock, +}; + +struct nameicap_tracker { + struct vnode *dp; + TAILQ_ENTRY(nameicap_tracker) nm_link; +}; + +/* Zone for cap mode tracker elements used for dotdot capability checks. */ +static uma_zone_t nt_zone; + +static void +nameiinit(void *dummy __unused) +{ + + namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + nt_zone = uma_zcreate("rentr", sizeof(struct nameicap_tracker), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + getnewvnode("crossmp", NULL, &crossmp_vnodeops, &vp_crossmp); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL); + +static int lookup_cap_dotdot = 1; +SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN, + &lookup_cap_dotdot, 0, + "enables \"..\" components in path lookup in capability mode"); +static int lookup_cap_dotdot_nonlocal = 1; +SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN, + &lookup_cap_dotdot_nonlocal, 0, + "enables \"..\" components in path lookup in capability mode " + "on non-local mount"); + +static void +nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp) +{ + struct nameicap_tracker *nt; + + if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR) + return; + nt = uma_zalloc(nt_zone, M_WAITOK); + vhold(dp); + nt->dp = dp; + TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link); +} + +static void +nameicap_cleanup(struct nameidata *ndp) +{ + struct nameicap_tracker *nt, *nt1; + + KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) || + (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative")); + TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) { + TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link); + vdrop(nt->dp); + uma_zfree(nt_zone, nt); + } +} + +/* + * For dotdot lookups in capability mode, only allow the component + * lookup to succeed if the resulting directory was already traversed + * during the operation. Also fail dotdot lookups for non-local + * filesystems, where external agents might assist local lookups to + * escape the compartment. 
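+ *
+ * Illustrative sketch (userspace view, hypothetical names, not part
+ * of the imported sources), with vfs.lookup_cap_dotdot enabled:
+ *
+ *      cap_enter();
+ *      openat(dfd, "sub/../file", O_RDONLY);   (ok: ".." returns to a
+ *                                              directory walked above)
+ *      openat(dfd, "../escape", O_RDONLY);     (fails with ENOTCAPABLE)
+ *
+ * With the sysctl disabled, any ".." component in such a constrained
+ * lookup fails the check.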
+ */ +static int +nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp) +{ + struct nameicap_tracker *nt; + struct mount *mp; + + if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp == NULL || + dp->v_type != VDIR) + return (0); + mp = dp->v_mount; + if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL && + (mp->mnt_flag & MNT_LOCAL) == 0) + return (ENOTCAPABLE); + TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head, + nm_link) { + if (dp == nt->dp) + return (0); + } + return (ENOTCAPABLE); +} + +static void +namei_cleanup_cnp(struct componentname *cnp) +{ + + uma_zfree(namei_zone, cnp->cn_pnbuf); +#ifdef DIAGNOSTIC + cnp->cn_pnbuf = NULL; + cnp->cn_nameptr = NULL; +#endif +} + +static int +namei_handle_root(struct nameidata *ndp, struct vnode **dpp) +{ + struct componentname *cnp; + + cnp = &ndp->ni_cnd; + if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0) { +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_CAPFAIL)) + ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); +#endif + return (ENOTCAPABLE); + } + while (*(cnp->cn_nameptr) == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + *dpp = ndp->ni_rootdir; + vrefact(*dpp); + return (0); +} + +/* + * Convert a pathname into a pointer to a locked vnode. + * + * The FOLLOW flag is set when symbolic links are to be followed + * when they occur at the end of the name translation process. + * Symbolic links are always followed for all other pathname + * components other than the last. + * + * The segflg defines whether the name is to be copied from user + * space or kernel space. + * + * Overall outline of namei: + * + * copy in name + * get starting directory + * while (!done && !error) { + * call lookup to search path. + * if symbolic link, massage name in buffer and continue + * } + */ +int +namei(struct nameidata *ndp) +{ + struct filedesc *fdp; /* pointer to file descriptor state */ + char *cp; /* pointer into pathname argument */ + struct vnode *dp; /* the directory we are searching */ + struct iovec aiov; /* uio for reading symbolic links */ + struct componentname *cnp; + struct file *dfp; + struct thread *td; + struct proc *p; + cap_rights_t rights; + struct uio auio; + int error, linklen, startdir_used; + + cnp = &ndp->ni_cnd; + td = cnp->cn_thread; + p = td->td_proc; + ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred; + KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc")); + KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, + ("namei: nameiop contaminated with flags")); + KASSERT((cnp->cn_flags & OPMASK) == 0, + ("namei: flags contaminated with nameiops")); + MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR || + ndp->ni_startdir->v_type == VBAD); + fdp = p->p_fd; + TAILQ_INIT(&ndp->ni_cap_tracker); + ndp->ni_lcf = 0; + + /* We will set this ourselves if we need it. */ + cnp->cn_flags &= ~TRAILINGSLASH; + + /* + * Get a buffer for the name to be translated, and copy the + * name into the buffer. + */ + if ((cnp->cn_flags & HASBUF) == 0) + cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); + if (ndp->ni_segflg == UIO_SYSSPACE) + error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, + &ndp->ni_pathlen); + else + error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, + &ndp->ni_pathlen); + + /* + * Don't allow empty pathnames. 
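+ * POSIX requires this: for example, open("", O_RDONLY) must fail
+ * with ENOENT rather than resolving to the current directory.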
+ */ + if (error == 0 && *cnp->cn_pnbuf == '\0') + error = ENOENT; + +#ifdef CAPABILITY_MODE + /* + * In capability mode, lookups must be restricted to happen in + * the subtree with the root specified by the file descriptor: + * - The root must be real file descriptor, not the pseudo-descriptor + * AT_FDCWD. + * - The passed path must be relative and not absolute. + * - If lookup_cap_dotdot is disabled, path must not contain the + * '..' components. + * - If lookup_cap_dotdot is enabled, we verify that all '..' + * components lookups result in the directories which were + * previously walked by us, which prevents an escape from + * the relative root. + */ + if (error == 0 && IN_CAPABILITY_MODE(td) && + (cnp->cn_flags & NOCAPCHECK) == 0) { + ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; + if (ndp->ni_dirfd == AT_FDCWD) { +#ifdef KTRACE + if (KTRPOINT(td, KTR_CAPFAIL)) + ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); +#endif + error = ECAPMODE; + } + } +#endif + if (error != 0) { + namei_cleanup_cnp(cnp); + ndp->ni_vp = NULL; + return (error); + } + ndp->ni_loopcnt = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_NAMEI)) { + KASSERT(cnp->cn_thread == curthread, + ("namei not using curthread")); + ktrnamei(cnp->cn_pnbuf); + } +#endif + /* + * Get starting point for the translation. + */ + FILEDESC_SLOCK(fdp); + ndp->ni_rootdir = fdp->fd_rdir; + vrefact(ndp->ni_rootdir); + ndp->ni_topdir = fdp->fd_jdir; + + /* + * If we are auditing the kernel pathname, save the user pathname. + */ + if (cnp->cn_flags & AUDITVNODE1) + AUDIT_ARG_UPATH1(td, ndp->ni_dirfd, cnp->cn_pnbuf); + if (cnp->cn_flags & AUDITVNODE2) + AUDIT_ARG_UPATH2(td, ndp->ni_dirfd, cnp->cn_pnbuf); + + startdir_used = 0; + dp = NULL; + cnp->cn_nameptr = cnp->cn_pnbuf; + if (cnp->cn_pnbuf[0] == '/') { + ndp->ni_resflags |= NIRES_ABS; + error = namei_handle_root(ndp, &dp); + } else { + if (ndp->ni_startdir != NULL) { + dp = ndp->ni_startdir; + startdir_used = 1; + } else if (ndp->ni_dirfd == AT_FDCWD) { + dp = fdp->fd_cdir; + vrefact(dp); + } else { + rights = ndp->ni_rightsneeded; + cap_rights_set(&rights, CAP_LOOKUP); + + if (cnp->cn_flags & AUDITVNODE1) + AUDIT_ARG_ATFD1(ndp->ni_dirfd); + if (cnp->cn_flags & AUDITVNODE2) + AUDIT_ARG_ATFD2(ndp->ni_dirfd); + /* + * Effectively inlined fgetvp_rights, because we need to + * inspect the file as well as grabbing the vnode. + */ + error = fget_cap_locked(fdp, ndp->ni_dirfd, &rights, + &dfp, &ndp->ni_filecaps); + if (error != 0) { + /* + * Preserve the error; it should either be EBADF + * or capability-related, both of which can be + * safely returned to the caller. + */ + } else if (dfp->f_ops == &badfileops) { + error = EBADF; + } else if (dfp->f_vnode == NULL) { + error = ENOTDIR; + } else { + dp = dfp->f_vnode; + vrefact(dp); + + if ((dfp->f_flag & FSEARCH) != 0) + cnp->cn_flags |= NOEXECCHECK; + } +#ifdef CAPABILITIES + /* + * If file descriptor doesn't have all rights, + * all lookups relative to it must also be + * strictly relative. 
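+ *
+ * Illustrative sketch (hypothetical descriptor, not part of the
+ * imported sources): even outside capability mode, a relative lookup
+ * that starts from a rights-limited descriptor cannot leave the
+ * subtree below it:
+ *
+ *      cap_rights_init(&rights, CAP_LOOKUP, CAP_READ);
+ *      cap_rights_limit(dfd, &rights);
+ *      openat(dfd, "inside", O_RDONLY);        (permitted)
+ *      openat(dfd, "../outside", O_RDONLY);    (fails with ENOTCAPABLE)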
+ */ + CAP_ALL(&rights); + if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, + &rights) || + ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || + ndp->ni_filecaps.fc_nioctls != -1) { + ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; + } +#endif + } + if (error == 0 && dp->v_type != VDIR) + error = ENOTDIR; + } + FILEDESC_SUNLOCK(fdp); + if (ndp->ni_startdir != NULL && !startdir_used) + vrele(ndp->ni_startdir); + if (error != 0) { + if (dp != NULL) + vrele(dp); + goto out; + } + if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 && + lookup_cap_dotdot != 0) + ndp->ni_lcf |= NI_LCF_CAP_DOTDOT; + SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf, + cnp->cn_flags); + for (;;) { + ndp->ni_startdir = dp; + error = lookup(ndp); + if (error != 0) + goto out; + /* + * If not a symbolic link, we're done. + */ + if ((cnp->cn_flags & ISSYMLINK) == 0) { + vrele(ndp->ni_rootdir); + if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) { + namei_cleanup_cnp(cnp); + } else + cnp->cn_flags |= HASBUF; + nameicap_cleanup(ndp); + SDT_PROBE2(vfs, namei, lookup, return, 0, ndp->ni_vp); + return (0); + } + if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { + error = ELOOP; + break; + } +#ifdef MAC + if ((cnp->cn_flags & NOMACCHECK) == 0) { + error = mac_vnode_check_readlink(td->td_ucred, + ndp->ni_vp); + if (error != 0) + break; + } +#endif + if (ndp->ni_pathlen > 1) + cp = uma_zalloc(namei_zone, M_WAITOK); + else + cp = cnp->cn_pnbuf; + aiov.iov_base = cp; + aiov.iov_len = MAXPATHLEN; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_resid = MAXPATHLEN; + error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); + if (error != 0) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + break; + } + linklen = MAXPATHLEN - auio.uio_resid; + if (linklen == 0) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + error = ENOENT; + break; + } + if (linklen + ndp->ni_pathlen > MAXPATHLEN) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + error = ENAMETOOLONG; + break; + } + if (ndp->ni_pathlen > 1) { + bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); + uma_zfree(namei_zone, cnp->cn_pnbuf); + cnp->cn_pnbuf = cp; + } else + cnp->cn_pnbuf[linklen] = '\0'; + ndp->ni_pathlen += linklen; + vput(ndp->ni_vp); + dp = ndp->ni_dvp; + /* + * Check if root directory should replace current directory. + */ + cnp->cn_nameptr = cnp->cn_pnbuf; + if (*(cnp->cn_nameptr) == '/') { + vrele(dp); + error = namei_handle_root(ndp, &dp); + if (error != 0) + goto out; + } + } + vput(ndp->ni_vp); + ndp->ni_vp = NULL; + vrele(ndp->ni_dvp); +out: + vrele(ndp->ni_rootdir); + namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp); + SDT_PROBE2(vfs, namei, lookup, return, error, NULL); + return (error); +} + +static int +compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags) +{ + + if (mp == NULL || ((lkflags & LK_SHARED) && + (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) || + ((cnflags & ISDOTDOT) && + (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) { + lkflags &= ~LK_SHARED; + lkflags |= LK_EXCLUSIVE; + } + lkflags |= LK_NODDLKTREAT; + return (lkflags); +} + +static __inline int +needs_exclusive_leaf(struct mount *mp, int flags) +{ + + /* + * Intermediate nodes can use shared locks, we only need to + * force an exclusive lock for leaf nodes. + */ + if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF)) + return (0); + + /* Always use exclusive locks if LOCKSHARED isn't set. 
*/ + if (!(flags & LOCKSHARED)) + return (1); + + /* + * For lookups during open(), if the mount point supports + * extended shared operations, then use a shared lock for the + * leaf node, otherwise use an exclusive lock. + */ + if ((flags & ISOPEN) != 0) + return (!MNT_EXTENDED_SHARED(mp)); + + /* + * Lookup requests outside of open() that specify LOCKSHARED + * only need a shared lock on the leaf vnode. + */ + return (0); +} + +/* + * Search a pathname. + * This is a very central and rather complicated routine. + * + * The pathname is pointed to by ni_ptr and is of length ni_pathlen. + * The starting directory is taken from ni_startdir. The pathname is + * descended until done, or a symbolic link is encountered. The variable + * ni_more is clear if the path is completed; it is set to one if a + * symbolic link needing interpretation is encountered. + * + * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on + * whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it, the parent directory is returned + * locked. If flag has WANTPARENT or'ed into it, the parent directory is + * returned unlocked. Otherwise the parent directory is not returned. If + * the target of the pathname exists and LOCKLEAF is or'ed into the flag + * the target is returned locked, otherwise it is returned unlocked. + * When creating or renaming and LOCKPARENT is specified, the target may not + * be ".". When deleting and LOCKPARENT is specified, the target may be ".". + * + * Overall outline of lookup: + * + * dirloop: + * identify next component of name at ndp->ni_ptr + * handle degenerate case where name is null string + * if .. and crossing mount points and on mounted filesys, find parent + * call VOP_LOOKUP routine for next component name + * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set + * component vnode returned in ni_vp (if it exists), locked. + * if result vnode is mounted on and crossing mount points, + * find mounted on vnode + * if more components of name, do next level at dirloop + * return the answer in ni_vp, locked if LOCKLEAF set + * if LOCKPARENT set, return locked parent in ni_dvp + * if WANTPARENT set, return unlocked parent in ni_dvp + */ +int +lookup(struct nameidata *ndp) +{ + char *cp; /* pointer into pathname argument */ + char *prev_ni_next; /* saved ndp->ni_next */ + struct vnode *dp = NULL; /* the directory we are searching */ + struct vnode *tdp; /* saved dp */ + struct mount *mp; /* mount table entry */ + struct prison *pr; + size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int error = 0; + int dpunlocked = 0; /* dp has already been unlocked */ + int relookup = 0; /* do not consume the path component */ + struct componentname *cnp = &ndp->ni_cnd; + int lkflags_save; + int ni_dvp_unlocked; + + /* + * Setup: break out flag bits into variables. 
+ */ + ni_dvp_unlocked = 0; + wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); + KASSERT(cnp->cn_nameiop == LOOKUP || wantparent, + ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT.")); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE && + cnp->cn_nameiop != LOOKUP)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + cnp->cn_flags &= ~ISSYMLINK; + ndp->ni_dvp = NULL; + /* + * We use shared locks until we hit the parent of the last cn then + * we adjust based on the requesting flags. + */ + cnp->cn_lkflags = LK_SHARED; + dp = ndp->ni_startdir; + ndp->ni_startdir = NULLVP; + vn_lock(dp, + compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, + cnp->cn_flags)); + +dirloop: + /* + * Search a new directory. + * + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ + for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) + continue; + cnp->cn_namelen = cp - cnp->cn_nameptr; + if (cnp->cn_namelen > NAME_MAX) { + error = ENAMETOOLONG; + goto bad; + } +#ifdef NAMEI_DIAGNOSTIC + { char c = *cp; + *cp = '\0'; + printf("{%s}: ", cnp->cn_nameptr); + *cp = c; } +#endif + prev_ni_pathlen = ndp->ni_pathlen; + ndp->ni_pathlen -= cnp->cn_namelen; + KASSERT(ndp->ni_pathlen <= PATH_MAX, + ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); + prev_ni_next = ndp->ni_next; + ndp->ni_next = cp; + + /* + * Replace multiple slashes by a single slash and trailing slashes + * by a null. This must be done before VOP_LOOKUP() because some + * fs's don't know about trailing slashes. Remember if there were + * trailing slashes to handle symlinks, existing non-directories + * and non-existing files that won't be directories specially later. + */ + while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { + cp++; + ndp->ni_pathlen--; + if (*cp == '\0') { + *ndp->ni_next = '\0'; + cnp->cn_flags |= TRAILINGSLASH; + } + } + ndp->ni_next = cp; + + cnp->cn_flags |= MAKEENTRY; + if (*cp == '\0' && docache == 0) + cnp->cn_flags &= ~MAKEENTRY; + if (cnp->cn_namelen == 2 && + cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') + cnp->cn_flags |= ISDOTDOT; + else + cnp->cn_flags &= ~ISDOTDOT; + if (*ndp->ni_next == 0) + cnp->cn_flags |= ISLASTCN; + else + cnp->cn_flags &= ~ISLASTCN; + + if ((cnp->cn_flags & ISLASTCN) != 0 && + cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EINVAL; + goto bad; + } + + nameicap_tracker_add(ndp, dp); + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". + */ + if (cnp->cn_nameptr[0] == '\0') { + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (cnp->cn_nameiop != LOOKUP) { + error = EISDIR; + goto bad; + } + if (wantparent) { + ndp->ni_dvp = dp; + VREF(dp); + } + ndp->ni_vp = dp; + + if (cnp->cn_flags & AUDITVNODE1) + AUDIT_ARG_VNODE1(dp); + else if (cnp->cn_flags & AUDITVNODE2) + AUDIT_ARG_VNODE2(dp); + + if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) + VOP_UNLOCK(dp, 0); + /* XXX This should probably move to the top of function. */ + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + goto success; + } + + /* + * Handle "..": five special cases. + * 0. 
If doing a capability lookup and lookup_cap_dotdot is + * disabled, return ENOTCAPABLE. + * 1. Return an error if this is the last component of + * the name and the operation is DELETE or RENAME. + * 2. If at root directory (e.g. after chroot) + * or at absolute root directory + * then ignore it so can't get out. + * 3. If this vnode is the root of a mounted + * filesystem, then replace it with the + * vnode which was mounted on so we take the + * .. in the other filesystem. + * 4. If the vnode is the top directory of + * the jail or chroot, don't let them out. + * 5. If doing a capability lookup and lookup_cap_dotdot is + * enabled, return ENOTCAPABLE if the lookup would escape + * from the initial file descriptor directory. Checks are + * done by ensuring that namei() already traversed the + * result of dotdot lookup. + */ + if (cnp->cn_flags & ISDOTDOT) { + if ((ndp->ni_lcf & (NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT)) + == NI_LCF_STRICTRELATIVE) { +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_CAPFAIL)) + ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); +#endif + error = ENOTCAPABLE; + goto bad; + } + if ((cnp->cn_flags & ISLASTCN) != 0 && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EINVAL; + goto bad; + } + for (;;) { + for (pr = cnp->cn_cred->cr_prison; pr != NULL; + pr = pr->pr_parent) + if (dp == pr->pr_root) + break; + if (dp == ndp->ni_rootdir || + dp == ndp->ni_topdir || + dp == rootvnode || + pr != NULL || + ((dp->v_vflag & VV_ROOT) != 0 && + (cnp->cn_flags & NOCROSSMOUNT) != 0)) { + ndp->ni_dvp = dp; + ndp->ni_vp = dp; + VREF(dp); + goto nextname; + } + if ((dp->v_vflag & VV_ROOT) == 0) + break; + if (dp->v_iflag & VI_DOOMED) { /* forced unmount */ + error = ENOENT; + goto bad; + } + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + VREF(dp); + vput(tdp); + vn_lock(dp, + compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | + LK_RETRY, ISDOTDOT)); + error = nameicap_check_dotdot(ndp, dp); + if (error != 0) { +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_CAPFAIL)) + ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); +#endif + goto bad; + } + } + } + + /* + * We now have a segment name to search for, and a directory to search. + */ +unionlookup: +#ifdef MAC + if ((cnp->cn_flags & NOMACCHECK) == 0) { + error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp, + cnp); + if (error) + goto bad; + } +#endif + ndp->ni_dvp = dp; + ndp->ni_vp = NULL; + ASSERT_VOP_LOCKED(dp, "lookup"); + /* + * If we have a shared lock we may need to upgrade the lock for the + * last operation. + */ + if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) && + dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED) + vn_lock(dp, LK_UPGRADE|LK_RETRY); + if ((dp->v_iflag & VI_DOOMED) != 0) { + error = ENOENT; + goto bad; + } + /* + * If we're looking up the last component and we need an exclusive + * lock, adjust our lkflags. 
+ */ + if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags)) + cnp->cn_lkflags = LK_EXCLUSIVE; +#ifdef NAMEI_DIAGNOSTIC + vn_printf(dp, "lookup in "); +#endif + lkflags_save = cnp->cn_lkflags; + cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags, + cnp->cn_flags); + error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp); + cnp->cn_lkflags = lkflags_save; + if (error != 0) { + KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); +#ifdef NAMEI_DIAGNOSTIC + printf("not found\n"); +#endif + if ((error == ENOENT) && + (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) && + (dp->v_mount->mnt_flag & MNT_UNION)) { + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + VREF(dp); + vput(tdp); + vn_lock(dp, + compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | + LK_RETRY, cnp->cn_flags)); + nameicap_tracker_add(ndp, dp); + goto unionlookup; + } + + if (error == ERELOOKUP) { + vref(dp); + ndp->ni_vp = dp; + error = 0; + relookup = 1; + goto good; + } + + if (error != EJUSTRETURN) + goto bad; + /* + * At this point, we know we're at the end of the + * pathname. If creating / renaming, we can consider + * allowing the file or directory to be created / renamed, + * provided we're not on a read-only filesystem. + */ + if (rdonly) { + error = EROFS; + goto bad; + } + /* trailing slash only allowed for directories */ + if ((cnp->cn_flags & TRAILINGSLASH) && + !(cnp->cn_flags & WILLBEDIR)) { + error = ENOENT; + goto bad; + } + if ((cnp->cn_flags & LOCKPARENT) == 0) + VOP_UNLOCK(dp, 0); + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory vnode in ndp->ni_dvp. + */ + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + goto success; + } + +good: +#ifdef NAMEI_DIAGNOSTIC + printf("found\n"); +#endif + dp = ndp->ni_vp; + + /* + * Check to see if the vnode has been mounted on; + * if so find the root of the mounted filesystem. + */ + while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + if (vfs_busy(mp, 0)) + continue; + vput(dp); + if (dp != ndp->ni_dvp) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + vrefact(vp_crossmp); + ndp->ni_dvp = vp_crossmp; + error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags, + cnp->cn_flags), &tdp); + vfs_unbusy(mp); + if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT)) + panic("vp_crossmp exclusively locked or reclaimed"); + if (error) { + dpunlocked = 1; + goto bad2; + } + ndp->ni_vp = dp = tdp; + } + + /* + * Check for symbolic link + */ + if ((dp->v_type == VLNK) && + ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) || + *ndp->ni_next == '/')) { + cnp->cn_flags |= ISSYMLINK; + if (dp->v_iflag & VI_DOOMED) { + /* + * We can't know whether the directory was mounted with + * NOSYMFOLLOW, so we can't follow safely. + */ + error = ENOENT; + goto bad2; + } + if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { + error = EACCES; + goto bad2; + } + /* + * Symlink code always expects an unlocked dvp. + */ + if (ndp->ni_dvp != ndp->ni_vp) { + VOP_UNLOCK(ndp->ni_dvp, 0); + ni_dvp_unlocked = 1; + } + goto success; + } + +nextname: + /* + * Not a symbolic link that we will follow. Continue with the + * next component if there is any; otherwise, we're done. 
+ */ + KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/', + ("lookup: invalid path state.")); + if (relookup) { + relookup = 0; + ndp->ni_pathlen = prev_ni_pathlen; + ndp->ni_next = prev_ni_next; + if (ndp->ni_dvp != dp) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + goto dirloop; + } + if (cnp->cn_flags & ISDOTDOT) { + error = nameicap_check_dotdot(ndp, ndp->ni_vp); + if (error != 0) { +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_CAPFAIL)) + ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); +#endif + goto bad2; + } + } + if (*ndp->ni_next == '/') { + cnp->cn_nameptr = ndp->ni_next; + while (*cnp->cn_nameptr == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + if (ndp->ni_dvp != dp) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + goto dirloop; + } + /* + * If we're processing a path with a trailing slash, + * check that the end result is a directory. + */ + if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) { + error = ENOTDIR; + goto bad2; + } + /* + * Disallow directory write attempts on read-only filesystems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto bad2; + } + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + if (!wantparent) { + ni_dvp_unlocked = 2; + if (ndp->ni_dvp != dp) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + } else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) { + VOP_UNLOCK(ndp->ni_dvp, 0); + ni_dvp_unlocked = 1; + } + + if (cnp->cn_flags & AUDITVNODE1) + AUDIT_ARG_VNODE1(dp); + else if (cnp->cn_flags & AUDITVNODE2) + AUDIT_ARG_VNODE2(dp); + + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0); +success: + /* + * Because of shared lookup we may have the vnode shared locked, but + * the caller may want it to be exclusively locked. + */ + if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) && + VOP_ISLOCKED(dp) != LK_EXCLUSIVE) { + vn_lock(dp, LK_UPGRADE | LK_RETRY); + if (dp->v_iflag & VI_DOOMED) { + error = ENOENT; + goto bad2; + } + } + return (0); + +bad2: + if (ni_dvp_unlocked != 2) { + if (dp != ndp->ni_dvp && !ni_dvp_unlocked) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + } +bad: + if (!dpunlocked) + vput(dp); + ndp->ni_vp = NULL; + return (error); +} + +/* + * relookup - lookup a path name component + * Used by lookup to re-acquire things. + */ +int +relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) +{ + struct vnode *dp = NULL; /* the directory we are searching */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int error = 0; + + KASSERT(cnp->cn_flags & ISLASTCN, + ("relookup: Not given last component.")); + /* + * Setup: break out flag bits into variables. + */ + wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); + KASSERT(wantparent, ("relookup: parent not wanted.")); + rdonly = cnp->cn_flags & RDONLY; + cnp->cn_flags &= ~ISSYMLINK; + dp = dvp; + cnp->cn_lkflags = LK_EXCLUSIVE; + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY); + + /* + * Search a new directory. + * + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ +#ifdef NAMEI_DIAGNOSTIC + printf("{%s}: ", cnp->cn_nameptr); +#endif + + /* + * Check for "" which represents the root directory after slash + * removal. 
+ */ + if (cnp->cn_nameptr[0] == '\0') { + /* + * Support only LOOKUP for "/" because lookup() + * can't succeed for CREATE, DELETE and RENAME. + */ + KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP")); + KASSERT(dp->v_type == VDIR, ("dp is not a directory")); + + if (!(cnp->cn_flags & LOCKLEAF)) + VOP_UNLOCK(dp, 0); + *vpp = dp; + /* XXX This should probably move to the top of function. */ + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + if (cnp->cn_flags & ISDOTDOT) + panic ("relookup: lookup on dot-dot"); + + /* + * We now have a segment name to search for, and a directory to search. + */ +#ifdef NAMEI_DIAGNOSTIC + vn_printf(dp, "search in "); +#endif + if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) { + KASSERT(*vpp == NULL, ("leaf should be empty")); + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. + */ + if (rdonly) { + error = EROFS; + goto bad; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + if ((cnp->cn_flags & LOCKPARENT) == 0) + VOP_UNLOCK(dp, 0); + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory vnode in ndp->ni_dvp. + */ + return (0); + } + + dp = *vpp; + + /* + * Disallow directory write attempts on read-only filesystems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + if (dvp == dp) + vrele(dvp); + else + vput(dvp); + error = EROFS; + goto bad; + } + /* + * Set the parent lock/ref state to the requested state. + */ + if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) { + if (wantparent) + VOP_UNLOCK(dvp, 0); + else + vput(dvp); + } else if (!wantparent) + vrele(dvp); + /* + * Check for symbolic link + */ + KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), + ("relookup: symlink found.\n")); + + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0); + return (0); +bad: + vput(dp); + *vpp = NULL; + return (error); +} + +void +NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, enum uio_seg segflg, + const char *namep, int dirfd, struct vnode *startdir, cap_rights_t *rightsp, + struct thread *td) +{ + + ndp->ni_cnd.cn_nameiop = op; + ndp->ni_cnd.cn_flags = flags; + ndp->ni_segflg = segflg; + ndp->ni_dirp = namep; + ndp->ni_dirfd = dirfd; + ndp->ni_startdir = startdir; + ndp->ni_resflags = 0; + if (rightsp != NULL) + ndp->ni_rightsneeded = *rightsp; + else + cap_rights_init(&ndp->ni_rightsneeded); + filecaps_init(&ndp->ni_filecaps); + ndp->ni_cnd.cn_thread = td; +} + +/* + * Free data allocated by namei(); see namei(9) for details. 
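+ *
+ * A minimal caller sketch (hypothetical, not part of the imported
+ * sources) showing the usual pairing of namei() with NDFREE():
+ *
+ *      struct nameidata nd;
+ *
+ *      NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td);
+ *      error = namei(&nd);
+ *      if (error != 0)
+ *              return (error);
+ *      ... operate on nd.ni_vp, which is referenced and locked ...
+ *      NDFREE(&nd, NDF_ONLY_PNBUF);
+ *      vput(nd.ni_vp);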
+ */ +void +NDFREE(struct nameidata *ndp, const u_int flags) +{ + int unlock_dvp; + int unlock_vp; + + unlock_dvp = 0; + unlock_vp = 0; + + if (!(flags & NDF_NO_FREE_PNBUF) && + (ndp->ni_cnd.cn_flags & HASBUF)) { + uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); + ndp->ni_cnd.cn_flags &= ~HASBUF; + } + if (!(flags & NDF_NO_VP_UNLOCK) && + (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) + unlock_vp = 1; + if (!(flags & NDF_NO_DVP_UNLOCK) && + (ndp->ni_cnd.cn_flags & LOCKPARENT) && + ndp->ni_dvp != ndp->ni_vp) + unlock_dvp = 1; + if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) { + if (unlock_vp) { + vput(ndp->ni_vp); + unlock_vp = 0; + } else + vrele(ndp->ni_vp); + ndp->ni_vp = NULL; + } + if (unlock_vp) + VOP_UNLOCK(ndp->ni_vp, 0); + if (!(flags & NDF_NO_DVP_RELE) && + (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { + if (unlock_dvp) { + vput(ndp->ni_dvp); + unlock_dvp = 0; + } else + vrele(ndp->ni_dvp); + ndp->ni_dvp = NULL; + } + if (unlock_dvp) + VOP_UNLOCK(ndp->ni_dvp, 0); + if (!(flags & NDF_NO_STARTDIR_RELE) && + (ndp->ni_cnd.cn_flags & SAVESTART)) { + vrele(ndp->ni_startdir); + ndp->ni_startdir = NULL; + } +} + +/* + * Determine if there is a suitable alternate filename under the specified + * prefix for the specified path. If the create flag is set, then the + * alternate prefix will be used so long as the parent directory exists. + * This is used by the various compatibility ABIs so that Linux binaries prefer + * files under /compat/linux for example. The chosen path (whether under + * the prefix or under /) is returned in a kernel malloc'd buffer pointed + * to by pathbuf. The caller is responsible for free'ing the buffer from + * the M_TEMP bucket if one is returned. + */ +int +kern_alternate_path(struct thread *td, const char *prefix, const char *path, + enum uio_seg pathseg, char **pathbuf, int create, int dirfd) +{ + struct nameidata nd, ndroot; + char *ptr, *buf, *cp; + size_t len, sz; + int error; + + buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + *pathbuf = buf; + + /* Copy the prefix into the new pathname as a starting point. */ + len = strlcpy(buf, prefix, MAXPATHLEN); + if (len >= MAXPATHLEN) { + *pathbuf = NULL; + free(buf, M_TEMP); + return (EINVAL); + } + sz = MAXPATHLEN - len; + ptr = buf + len; + + /* Append the filename to the prefix. */ + if (pathseg == UIO_SYSSPACE) + error = copystr(path, ptr, sz, &len); + else + error = copyinstr(path, ptr, sz, &len); + + if (error) { + *pathbuf = NULL; + free(buf, M_TEMP); + return (error); + } + + /* Only use a prefix with absolute pathnames. */ + if (*ptr != '/') { + error = EINVAL; + goto keeporig; + } + + if (dirfd != AT_FDCWD) { + /* + * We want the original because the "prefix" is + * included in the already opened dirfd. + */ + bcopy(ptr, buf, len); + return (0); + } + + /* + * We know that there is a / somewhere in this pathname. + * Search backwards for it, to find the file's parent dir + * to see if it exists in the alternate tree. If it does, + * and we want to create a file (cflag is set). We don't + * need to worry about the root comparison in this case. + */ + + if (create) { + for (cp = &ptr[len] - 1; *cp != '/'; cp--); + *cp = '\0'; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td); + error = namei(&nd); + *cp = '/'; + if (error != 0) + goto keeporig; + } else { + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td); + + error = namei(&nd); + if (error != 0) + goto keeporig; + + /* + * We now compare the vnode of the prefix to the one + * vnode asked. 
If they resolve to be the same, then we + * ignore the match so that the real root gets used. + * This avoids the problem of traversing "../.." to find the + * root directory and never finding it, because "/" resolves + * to the emulation root directory. This is expensive :-( + */ + NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix, + td); + + /* We shouldn't ever get an error from this namei(). */ + error = namei(&ndroot); + if (error == 0) { + if (nd.ni_vp == ndroot.ni_vp) + error = ENOENT; + + NDFREE(&ndroot, NDF_ONLY_PNBUF); + vrele(ndroot.ni_vp); + } + } + + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + +keeporig: + /* If there was an error, use the original path name. */ + if (error) + bcopy(ptr, buf, len); + return (error); +} diff --git a/freebsd/sys/kern/vfs_mount.c b/freebsd/sys/kern/vfs_mount.c new file mode 100644 index 00000000..3610763f --- /dev/null +++ b/freebsd/sys/kern/vfs_mount.c @@ -0,0 +1,2052 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1999-2004 Poul-Henning Kamp + * Copyright (c) 1999 Michael Smith + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +#define VFS_MOUNTARG_SIZE_MAX (1024 * 64) + +static int vfs_domount(struct thread *td, const char *fstype, char *fspath, + uint64_t fsflags, struct vfsoptlist **optlist); +static void free_mntarg(struct mntarg *ma); + +static int usermount = 0; +SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, + "Unprivileged users may mount and unmount file systems"); + +static bool default_autoro = false; +SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0, + "Retry failed r/w mount as r/o if no explicit ro/rw option is specified"); + +MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); +MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure"); +static uma_zone_t mount_zone; + +/* List of mounted filesystems. */ +struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); + +/* For any iteration/modification of mountlist */ +struct mtx mountlist_mtx; +MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); + +EVENTHANDLER_LIST_DEFINE(vfs_mounted); +EVENTHANDLER_LIST_DEFINE(vfs_unmounted); + +/* + * Global opts, taken by all filesystems + */ +static const char *global_opts[] = { + "errmsg", + "fstype", + "fspath", + "ro", + "rw", + "nosuid", + "noexec", + NULL +}; + +static int +mount_init(void *mem, int size, int flags) +{ + struct mount *mp; + + mp = (struct mount *)mem; + mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); + mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF); + lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); + return (0); +} + +static void +mount_fini(void *mem, int size) +{ + struct mount *mp; + + mp = (struct mount *)mem; + lockdestroy(&mp->mnt_explock); + mtx_destroy(&mp->mnt_listmtx); + mtx_destroy(&mp->mnt_mtx); +} + +static void +vfs_mount_init(void *dummy __unused) +{ + + mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, + NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); +} +SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL); + +/* + * --------------------------------------------------------------------- + * Functions for building and sanitizing the mount options + */ + +/* Remove one mount option. */ +static void +vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) +{ + + TAILQ_REMOVE(opts, opt, link); + free(opt->name, M_MOUNT); + if (opt->value != NULL) + free(opt->value, M_MOUNT); + free(opt, M_MOUNT); +} + +/* Release all resources related to the mount options. 
*/ +void +vfs_freeopts(struct vfsoptlist *opts) +{ + struct vfsopt *opt; + + while (!TAILQ_EMPTY(opts)) { + opt = TAILQ_FIRST(opts); + vfs_freeopt(opts, opt); + } + free(opts, M_MOUNT); +} + +void +vfs_deleteopt(struct vfsoptlist *opts, const char *name) +{ + struct vfsopt *opt, *temp; + + if (opts == NULL) + return; + TAILQ_FOREACH_SAFE(opt, opts, link, temp) { + if (strcmp(opt->name, name) == 0) + vfs_freeopt(opts, opt); + } +} + +static int +vfs_isopt_ro(const char *opt) +{ + + if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 || + strcmp(opt, "norw") == 0) + return (1); + return (0); +} + +static int +vfs_isopt_rw(const char *opt) +{ + + if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0) + return (1); + return (0); +} + +/* + * Check if options are equal (with or without the "no" prefix). + */ +static int +vfs_equalopts(const char *opt1, const char *opt2) +{ + char *p; + + /* "opt" vs. "opt" or "noopt" vs. "noopt" */ + if (strcmp(opt1, opt2) == 0) + return (1); + /* "noopt" vs. "opt" */ + if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) + return (1); + /* "opt" vs. "noopt" */ + if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) + return (1); + while ((p = strchr(opt1, '.')) != NULL && + !strncmp(opt1, opt2, ++p - opt1)) { + opt2 += p - opt1; + opt1 = p; + /* "foo.noopt" vs. "foo.opt" */ + if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) + return (1); + /* "foo.opt" vs. "foo.noopt" */ + if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) + return (1); + } + /* "ro" / "rdonly" / "norw" / "rw" / "noro" */ + if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) && + (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2))) + return (1); + return (0); +} + +/* + * If a mount option is specified several times, + * (with or without the "no" prefix) only keep + * the last occurrence of it. + */ +static void +vfs_sanitizeopts(struct vfsoptlist *opts) +{ + struct vfsopt *opt, *opt2, *tmp; + + TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { + opt2 = TAILQ_PREV(opt, vfsoptlist, link); + while (opt2 != NULL) { + if (vfs_equalopts(opt->name, opt2->name)) { + tmp = TAILQ_PREV(opt2, vfsoptlist, link); + vfs_freeopt(opts, opt2); + opt2 = tmp; + } else { + opt2 = TAILQ_PREV(opt2, vfsoptlist, link); + } + } + } +} + +/* + * Build a linked list of mount options from a struct uio. + */ +int +vfs_buildopts(struct uio *auio, struct vfsoptlist **options) +{ + struct vfsoptlist *opts; + struct vfsopt *opt; + size_t memused, namelen, optlen; + unsigned int i, iovcnt; + int error; + + opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); + TAILQ_INIT(opts); + memused = 0; + iovcnt = auio->uio_iovcnt; + for (i = 0; i < iovcnt; i += 2) { + namelen = auio->uio_iov[i].iov_len; + optlen = auio->uio_iov[i + 1].iov_len; + memused += sizeof(struct vfsopt) + optlen + namelen; + /* + * Avoid consuming too much memory, and attempts to overflow + * memused. + */ + if (memused > VFS_MOUNTARG_SIZE_MAX || + optlen > VFS_MOUNTARG_SIZE_MAX || + namelen > VFS_MOUNTARG_SIZE_MAX) { + error = EINVAL; + goto bad; + } + + opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); + opt->name = malloc(namelen, M_MOUNT, M_WAITOK); + opt->value = NULL; + opt->len = 0; + opt->pos = i / 2; + opt->seen = 0; + + /* + * Do this early, so jumps to "bad" will free the current + * option. 
+ */ + TAILQ_INSERT_TAIL(opts, opt, link); + + if (auio->uio_segflg == UIO_SYSSPACE) { + bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); + } else { + error = copyin(auio->uio_iov[i].iov_base, opt->name, + namelen); + if (error) + goto bad; + } + /* Ensure names are null-terminated strings. */ + if (namelen == 0 || opt->name[namelen - 1] != '\0') { + error = EINVAL; + goto bad; + } + if (optlen != 0) { + opt->len = optlen; + opt->value = malloc(optlen, M_MOUNT, M_WAITOK); + if (auio->uio_segflg == UIO_SYSSPACE) { + bcopy(auio->uio_iov[i + 1].iov_base, opt->value, + optlen); + } else { + error = copyin(auio->uio_iov[i + 1].iov_base, + opt->value, optlen); + if (error) + goto bad; + } + } + } + vfs_sanitizeopts(opts); + *options = opts; + return (0); +bad: + vfs_freeopts(opts); + return (error); +} + +/* + * Merge the old mount options with the new ones passed + * in the MNT_UPDATE case. + * + * XXX: This function will keep a "nofoo" option in the new + * options. E.g, if the option's canonical name is "foo", + * "nofoo" ends up in the mount point's active options. + */ +static void +vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts) +{ + struct vfsopt *opt, *new; + + TAILQ_FOREACH(opt, oldopts, link) { + new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); + new->name = strdup(opt->name, M_MOUNT); + if (opt->len != 0) { + new->value = malloc(opt->len, M_MOUNT, M_WAITOK); + bcopy(opt->value, new->value, opt->len); + } else + new->value = NULL; + new->len = opt->len; + new->seen = opt->seen; + TAILQ_INSERT_HEAD(toopts, new, link); + } + vfs_sanitizeopts(toopts); +} + +/* + * Mount a filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct nmount_args { + struct iovec *iovp; + unsigned int iovcnt; + int flags; +}; +#endif +int +sys_nmount(struct thread *td, struct nmount_args *uap) +{ + struct uio *auio; + int error; + u_int iovcnt; + uint64_t flags; + + /* + * Mount flags are now 64-bits. On 32-bit archtectures only + * 32-bits are passed in, but from here on everything handles + * 64-bit flags correctly. + */ + flags = uap->flags; + + AUDIT_ARG_FFLAGS(flags); + CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__, + uap->iovp, uap->iovcnt, flags); + + /* + * Filter out MNT_ROOTFS. We do not want clients of nmount() in + * userspace to set this flag, but we must filter it out if we want + * MNT_UPDATE on the root file system to work. + * MNT_ROOTFS should only be set by the kernel when mounting its + * root file system. + */ + flags &= ~MNT_ROOTFS; + + iovcnt = uap->iovcnt; + /* + * Check that we have an even number of iovec's + * and that we have at least two options. + */ + if ((iovcnt & 1) || (iovcnt < 4)) { + CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__, + uap->iovcnt); + return (EINVAL); + } + + error = copyinuio(uap->iovp, iovcnt, &auio); + if (error) { + CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno", + __func__, error); + return (error); + } + error = vfs_donmount(td, flags, auio); + + free(auio, M_IOV); + return (error); +} + +/* + * --------------------------------------------------------------------- + * Various utility functions + */ + +void +vfs_ref(struct mount *mp) +{ + + CTR2(KTR_VFS, "%s: mp %p", __func__, mp); + MNT_ILOCK(mp); + MNT_REF(mp); + MNT_IUNLOCK(mp); +} + +void +vfs_rel(struct mount *mp) +{ + + CTR2(KTR_VFS, "%s: mp %p", __func__, mp); + MNT_ILOCK(mp); + MNT_REL(mp); + MNT_IUNLOCK(mp); +} + +/* + * Allocate and initialize the mount point struct. 
+ */ +struct mount * +vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, + struct ucred *cred) +{ + struct mount *mp; + + mp = uma_zalloc(mount_zone, M_WAITOK); + bzero(&mp->mnt_startzero, + __rangeof(struct mount, mnt_startzero, mnt_endzero)); + TAILQ_INIT(&mp->mnt_nvnodelist); + mp->mnt_nvnodelistsize = 0; + TAILQ_INIT(&mp->mnt_activevnodelist); + mp->mnt_activevnodelistsize = 0; + TAILQ_INIT(&mp->mnt_tmpfreevnodelist); + mp->mnt_tmpfreevnodelistsize = 0; + mp->mnt_ref = 0; + (void) vfs_busy(mp, MBF_NOWAIT); + atomic_add_acq_int(&vfsp->vfc_refcount, 1); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_gen++; + strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_cred = crdup(cred); + mp->mnt_stat.f_owner = cred->cr_uid; + strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); + mp->mnt_iosize_max = DFLTPHYS; +#ifdef MAC + mac_mount_init(mp); + mac_mount_create(cred, mp); +#endif + arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); + TAILQ_INIT(&mp->mnt_uppers); + return (mp); +} + +/* + * Destroy the mount struct previously allocated by vfs_mount_alloc(). + */ +void +vfs_mount_destroy(struct mount *mp) +{ + + MNT_ILOCK(mp); + mp->mnt_kern_flag |= MNTK_REFEXPIRE; + if (mp->mnt_kern_flag & MNTK_MWAIT) { + mp->mnt_kern_flag &= ~MNTK_MWAIT; + wakeup(mp); + } + while (mp->mnt_ref) + msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); + KASSERT(mp->mnt_ref == 0, + ("%s: invalid refcount in the drain path @ %s:%d", __func__, + __FILE__, __LINE__)); + if (mp->mnt_writeopcount != 0) + panic("vfs_mount_destroy: nonzero writeopcount"); + if (mp->mnt_secondary_writes != 0) + panic("vfs_mount_destroy: nonzero secondary_writes"); + atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); + if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { + struct vnode *vp; + + TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) + vn_printf(vp, "dangling vnode "); + panic("unmount: dangling vnode"); + } + KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); + if (mp->mnt_nvnodelistsize != 0) + panic("vfs_mount_destroy: nonzero nvnodelistsize"); + if (mp->mnt_activevnodelistsize != 0) + panic("vfs_mount_destroy: nonzero activevnodelistsize"); + if (mp->mnt_tmpfreevnodelistsize != 0) + panic("vfs_mount_destroy: nonzero tmpfreevnodelistsize"); + if (mp->mnt_lockref != 0) + panic("vfs_mount_destroy: nonzero lock refcount"); + MNT_IUNLOCK(mp); + if (mp->mnt_vnodecovered != NULL) + vrele(mp->mnt_vnodecovered); +#ifdef MAC + mac_mount_destroy(mp); +#endif + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + crfree(mp->mnt_cred); + uma_zfree(mount_zone, mp); +} + +static bool +vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error) +{ + /* This is an upgrade of an exisiting mount. */ + if ((fsflags & MNT_UPDATE) != 0) + return (false); + /* This is already an R/O mount. */ + if ((fsflags & MNT_RDONLY) != 0) + return (false); + + switch (error) { + case ENODEV: /* generic, geom, ... */ + case EACCES: /* cam/scsi, ... */ + case EROFS: /* md, mmcsd, ... */ + /* + * These errors can be returned by the storage layer to signal + * that the media is read-only. No harm in the R/O mount + * attempt if the error was returned for some other reason. 
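+ *
+ * Illustrative sketch (hypothetical device and options, not part of
+ * the imported sources): when the request carries the "autoro"
+ * option, or vfs.default_autoro is set, e.g.
+ *
+ *      mount -o autoro /dev/da0s1a /mnt
+ *
+ * a read/write attempt that fails with one of the errors above is
+ * retried once by vfs_donmount() with MNT_RDONLY added.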
+ */ + return (true); + default: + return (false); + } +} + +int +vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) +{ + struct vfsoptlist *optlist; + struct vfsopt *opt, *tmp_opt; + char *fstype, *fspath, *errmsg; + int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; + bool autoro; + + errmsg = fspath = NULL; + errmsg_len = fspathlen = 0; + errmsg_pos = -1; + autoro = default_autoro; + + error = vfs_buildopts(fsoptions, &optlist); + if (error) + return (error); + + if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) + errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); + + /* + * We need these two options before the others, + * and they are mandatory for any filesystem. + * Ensure they are NUL terminated as well. + */ + fstypelen = 0; + error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); + if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') { + error = EINVAL; + if (errmsg != NULL) + strncpy(errmsg, "Invalid fstype", errmsg_len); + goto bail; + } + fspathlen = 0; + error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); + if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') { + error = EINVAL; + if (errmsg != NULL) + strncpy(errmsg, "Invalid fspath", errmsg_len); + goto bail; + } + + /* + * We need to see if we have the "update" option + * before we call vfs_domount(), since vfs_domount() has special + * logic based on MNT_UPDATE. This is very important + * when we want to update the root filesystem. + */ + TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { + if (strcmp(opt->name, "update") == 0) { + fsflags |= MNT_UPDATE; + vfs_freeopt(optlist, opt); + } + else if (strcmp(opt->name, "async") == 0) + fsflags |= MNT_ASYNC; + else if (strcmp(opt->name, "force") == 0) { + fsflags |= MNT_FORCE; + vfs_freeopt(optlist, opt); + } + else if (strcmp(opt->name, "reload") == 0) { + fsflags |= MNT_RELOAD; + vfs_freeopt(optlist, opt); + } + else if (strcmp(opt->name, "multilabel") == 0) + fsflags |= MNT_MULTILABEL; + else if (strcmp(opt->name, "noasync") == 0) + fsflags &= ~MNT_ASYNC; + else if (strcmp(opt->name, "noatime") == 0) + fsflags |= MNT_NOATIME; + else if (strcmp(opt->name, "atime") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("nonoatime", M_MOUNT); + } + else if (strcmp(opt->name, "noclusterr") == 0) + fsflags |= MNT_NOCLUSTERR; + else if (strcmp(opt->name, "clusterr") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("nonoclusterr", M_MOUNT); + } + else if (strcmp(opt->name, "noclusterw") == 0) + fsflags |= MNT_NOCLUSTERW; + else if (strcmp(opt->name, "clusterw") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("nonoclusterw", M_MOUNT); + } + else if (strcmp(opt->name, "noexec") == 0) + fsflags |= MNT_NOEXEC; + else if (strcmp(opt->name, "exec") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("nonoexec", M_MOUNT); + } + else if (strcmp(opt->name, "nosuid") == 0) + fsflags |= MNT_NOSUID; + else if (strcmp(opt->name, "suid") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("nonosuid", M_MOUNT); + } + else if (strcmp(opt->name, "nosymfollow") == 0) + fsflags |= MNT_NOSYMFOLLOW; + else if (strcmp(opt->name, "symfollow") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("nonosymfollow", M_MOUNT); + } + else if (strcmp(opt->name, "noro") == 0) { + fsflags &= ~MNT_RDONLY; + autoro = false; + } + else if (strcmp(opt->name, "rw") == 0) { + fsflags &= ~MNT_RDONLY; + autoro = false; + } + else if (strcmp(opt->name, "ro") == 0) { + fsflags |= MNT_RDONLY; + 
autoro = false; + } + else if (strcmp(opt->name, "rdonly") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("ro", M_MOUNT); + fsflags |= MNT_RDONLY; + autoro = false; + } + else if (strcmp(opt->name, "autoro") == 0) { + vfs_freeopt(optlist, opt); + autoro = true; + } + else if (strcmp(opt->name, "suiddir") == 0) + fsflags |= MNT_SUIDDIR; + else if (strcmp(opt->name, "sync") == 0) + fsflags |= MNT_SYNCHRONOUS; + else if (strcmp(opt->name, "union") == 0) + fsflags |= MNT_UNION; + else if (strcmp(opt->name, "automounted") == 0) { + fsflags |= MNT_AUTOMOUNTED; + vfs_freeopt(optlist, opt); + } + } + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) { + error = ENAMETOOLONG; + goto bail; + } + + error = vfs_domount(td, fstype, fspath, fsflags, &optlist); + + /* + * See if we can mount in the read-only mode if the error code suggests + * that it could be possible and the mount options allow for that. + * Never try it if "[no]{ro|rw}" has been explicitly requested and not + * overridden by "autoro". + */ + if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) { + printf("%s: R/W mount failed, possibly R/O media," + " trying R/O mount\n", __func__); + fsflags |= MNT_RDONLY; + error = vfs_domount(td, fstype, fspath, fsflags, &optlist); + } +bail: + /* copyout the errmsg */ + if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) + && errmsg_len > 0 && errmsg != NULL) { + if (fsoptions->uio_segflg == UIO_SYSSPACE) { + bcopy(errmsg, + fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, + fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); + } else { + copyout(errmsg, + fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, + fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); + } + } + + if (optlist != NULL) + vfs_freeopts(optlist); + return (error); +} + +/* + * Old mount API. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +sys_mount(struct thread *td, struct mount_args *uap) +{ + char *fstype; + struct vfsconf *vfsp = NULL; + struct mntarg *ma = NULL; + uint64_t flags; + int error; + + /* + * Mount flags are now 64-bits. On 32-bit architectures only + * 32-bits are passed in, but from here on everything handles + * 64-bit flags correctly. + */ + flags = uap->flags; + + AUDIT_ARG_FFLAGS(flags); + + /* + * Filter out MNT_ROOTFS. We do not want clients of mount() in + * userspace to set this flag, but we must filter it out if we want + * MNT_UPDATE on the root file system to work. + * MNT_ROOTFS should only be set by the kernel when mounting its + * root file system. 
+ */ + flags &= ~MNT_ROOTFS; + + fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); + error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); + if (error) { + free(fstype, M_TEMP); + return (error); + } + + AUDIT_ARG_TEXT(fstype); + vfsp = vfs_byname_kld(fstype, td, &error); + free(fstype, M_TEMP); + if (vfsp == NULL) + return (ENOENT); + if (vfsp->vfc_vfsops->vfs_cmount == NULL) + return (EOPNOTSUPP); + + ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN); + ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); + ma = mount_argb(ma, flags & MNT_RDONLY, "noro"); + ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid"); + ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec"); + + error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags); + return (error); +} + +/* + * vfs_domount_first(): first file system mount (not update) + */ +static int +vfs_domount_first( + struct thread *td, /* Calling thread. */ + struct vfsconf *vfsp, /* File system type. */ + char *fspath, /* Mount path. */ + struct vnode *vp, /* Vnode to be covered. */ + uint64_t fsflags, /* Flags common to all filesystems. */ + struct vfsoptlist **optlist /* Options local to the filesystem. */ + ) +{ + struct vattr va; + struct mount *mp; + struct vnode *newdp; + int error, error1; + + ASSERT_VOP_ELOCKED(vp, __func__); + KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here")); + + /* + * If the jail of the calling thread lacks permission for this type of + * file system, deny immediately. + */ + if (jailed(td->td_ucred) && !prison_allow(td->td_ucred, + vfsp->vfc_prison_flag)) { + vput(vp); + return (EPERM); + } + + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. + */ + error = VOP_GETATTR(vp, &va, td->td_ucred); + if (error == 0 && va.va_uid != td->td_ucred->cr_uid) + error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0); + if (error == 0) + error = vinvalbuf(vp, V_SAVE, 0, 0); + if (error == 0 && vp->v_type != VDIR) + error = ENOTDIR; + if (error == 0) { + VI_LOCK(vp); + if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) + vp->v_iflag |= VI_MOUNT; + else + error = EBUSY; + VI_UNLOCK(vp); + } + if (error != 0) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0); + + /* Allocate and initialize the filesystem. */ + mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); + /* XXXMAC: pass to vfs_mount_alloc? */ + mp->mnt_optnew = *optlist; + /* Set the mount level flags. */ + mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY)); + + /* + * Mount the filesystem. + * XXX The final recipients of VFS_MOUNT just overwrite the ndp they + * get. No freeing of cn_pnbuf. + */ + error1 = 0; + if ((error = VFS_MOUNT(mp)) != 0 || + (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 || + (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) { + if (error1 != 0) { + error = error1; + if ((error1 = VFS_UNMOUNT(mp, 0)) != 0) + printf("VFS_UNMOUNT returned %d\n", error1); + } + vfs_unbusy(mp); + mp->mnt_vnodecovered = NULL; + vfs_mount_destroy(mp); + VI_LOCK(vp); + vp->v_iflag &= ~VI_MOUNT; + VI_UNLOCK(vp); + vrele(vp); + return (error); + } + VOP_UNLOCK(newdp, 0); + + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + mp->mnt_opt = mp->mnt_optnew; + *optlist = NULL; + + /* + * Prevent external consumers of mount options from reading mnt_optnew. 
+ */ + mp->mnt_optnew = NULL; + + MNT_ILOCK(mp); + if ((mp->mnt_flag & MNT_ASYNC) != 0 && + (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) + mp->mnt_kern_flag |= MNTK_ASYNC; + else + mp->mnt_kern_flag &= ~MNTK_ASYNC; + MNT_IUNLOCK(mp); + + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + cache_purge(vp); + VI_LOCK(vp); + vp->v_iflag &= ~VI_MOUNT; + VI_UNLOCK(vp); + vp->v_mountedhere = mp; + /* Place the new filesystem at the end of the mount list. */ + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + vfs_event_signal(NULL, VQ_MOUNT, 0); + vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY); + VOP_UNLOCK(vp, 0); + EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td); + VOP_UNLOCK(newdp, 0); + mountcheckdirs(vp, newdp); + vrele(newdp); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + vfs_allocate_syncvnode(mp); + vfs_unbusy(mp); + return (0); +} + +/* + * vfs_domount_update(): update of mounted file system + */ +static int +vfs_domount_update( + struct thread *td, /* Calling thread. */ + struct vnode *vp, /* Mount point vnode. */ + uint64_t fsflags, /* Flags common to all filesystems. */ + struct vfsoptlist **optlist /* Options local to the filesystem. */ + ) +{ + struct export_args export; + void *bufp; + struct mount *mp; + int error, export_error, len; + uint64_t flag; + + ASSERT_VOP_ELOCKED(vp, __func__); + KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here")); + mp = vp->v_mount; + + if ((vp->v_vflag & VV_ROOT) == 0) { + if (vfs_copyopt(*optlist, "export", &export, sizeof(export)) + == 0) + error = EXDEV; + else + error = EINVAL; + vput(vp); + return (error); + } + + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + flag = mp->mnt_flag; + if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + /* + * Only privileged root, or (if MNT_USER is set) the user that + * did the original mount is permitted to update it. + */ + error = vfs_suser(mp, td); + if (error != 0) { + vput(vp); + return (error); + } + if (vfs_busy(mp, MBF_NOWAIT)) { + vput(vp); + return (EBUSY); + } + VI_LOCK(vp); + if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { + VI_UNLOCK(vp); + vfs_unbusy(mp); + vput(vp); + return (EBUSY); + } + vp->v_iflag |= VI_MOUNT; + VI_UNLOCK(vp); + VOP_UNLOCK(vp, 0); + + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { + MNT_IUNLOCK(mp); + error = EBUSY; + goto end; + } + mp->mnt_flag &= ~MNT_UPDATEMASK; + mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | + MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY); + if ((mp->mnt_flag & MNT_ASYNC) == 0) + mp->mnt_kern_flag &= ~MNTK_ASYNC; + MNT_IUNLOCK(mp); + mp->mnt_optnew = *optlist; + vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); + + /* + * Mount the filesystem. + * XXX The final recipients of VFS_MOUNT just overwrite the ndp they + * get. No freeing of cn_pnbuf. + */ + error = VFS_MOUNT(mp); + + export_error = 0; + /* Process the export option. */ + if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp, + &len) == 0) { + /* Assume that there is only 1 ABI for each length. 
*/ + switch (len) { + case (sizeof(struct oexport_args)): + bzero(&export, sizeof(export)); + /* FALLTHROUGH */ + case (sizeof(export)): + bcopy(bufp, &export, len); + export_error = vfs_export(mp, &export); + break; + default: + export_error = EINVAL; + break; + } + } + + MNT_ILOCK(mp); + if (error == 0) { + mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | + MNT_SNAPSHOT); + } else { + /* + * If we fail, restore old mount flags. MNT_QUOTA is special, + * because it is not part of MNT_UPDATEMASK, but it could have + * changed in the meantime if quotactl(2) was called. + * All in all we want current value of MNT_QUOTA, not the old + * one. + */ + mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); + } + if ((mp->mnt_flag & MNT_ASYNC) != 0 && + (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) + mp->mnt_kern_flag |= MNTK_ASYNC; + else + mp->mnt_kern_flag &= ~MNTK_ASYNC; + MNT_IUNLOCK(mp); + + if (error != 0) + goto end; + + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + mp->mnt_opt = mp->mnt_optnew; + *optlist = NULL; + (void)VFS_STATFS(mp, &mp->mnt_stat); + /* + * Prevent external consumers of mount options from reading + * mnt_optnew. + */ + mp->mnt_optnew = NULL; + + if ((mp->mnt_flag & MNT_RDONLY) == 0) + vfs_allocate_syncvnode(mp); + else + vfs_deallocate_syncvnode(mp); +end: + vfs_unbusy(mp); + VI_LOCK(vp); + vp->v_iflag &= ~VI_MOUNT; + VI_UNLOCK(vp); + vrele(vp); + return (error != 0 ? error : export_error); +} + +/* + * vfs_domount(): actually attempt a filesystem mount. + */ +static int +vfs_domount( + struct thread *td, /* Calling thread. */ + const char *fstype, /* Filesystem type. */ + char *fspath, /* Mount path. */ + uint64_t fsflags, /* Flags common to all filesystems. */ + struct vfsoptlist **optlist /* Options local to the filesystem. */ + ) +{ + struct vfsconf *vfsp; + struct nameidata nd; + struct vnode *vp; + char *pathbuf; + int error; + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) + return (ENAMETOOLONG); + + if (jailed(td->td_ucred) || usermount == 0) { + if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) + return (error); + } + + /* + * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. + */ + if (fsflags & MNT_EXPORTED) { + error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); + if (error) + return (error); + } + if (fsflags & MNT_SUIDDIR) { + error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); + if (error) + return (error); + } + /* + * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. + */ + if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { + if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) + fsflags |= MNT_NOSUID | MNT_USER; + } + + /* Load KLDs before we lock the covered vnode to avoid reversals. */ + vfsp = NULL; + if ((fsflags & MNT_UPDATE) == 0) { + /* Don't try to load KLDs if we're mounting the root. */ + if (fsflags & MNT_ROOTFS) + vfsp = vfs_byname(fstype); + else + vfsp = vfs_byname_kld(fstype, td, &error); + if (vfsp == NULL) + return (ENODEV); + } + + /* + * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE. 
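The "export" option handling in vfs_domount_update() above dispatches on the option length so that one code path serves both the old oexport_args and the current export_args ABI. A stand-alone sketch of that dispatch-by-size idiom, using stand-in structures rather than the real layouts:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    struct old_args { int ex_flags; };                        /* stand-in for oexport_args */
    struct new_args { int ex_flags; int ex_numsecflavors; };  /* stand-in for export_args */

    static int
    copy_export(const void *buf, size_t len, struct new_args *out)
    {
            /* Zero first so the shorter ABI leaves the newer fields defaulted. */
            memset(out, 0, sizeof(*out));
            if (len != sizeof(struct old_args) && len != sizeof(struct new_args))
                    return (EINVAL);
            memcpy(out, buf, len);
            return (0);
    }

    int
    main(void)
    {
            struct old_args oa = { 1 };
            struct new_args na;

            if (copy_export(&oa, sizeof(oa), &na) == 0)
                    printf("flags %d, secflavors %d\n",
                        na.ex_flags, na.ex_numsecflavors);
            return (0);
    }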
+ */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, + UIO_SYSSPACE, fspath, td); + error = namei(&nd); + if (error != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if ((fsflags & MNT_UPDATE) == 0) { + pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); + strcpy(pathbuf, fspath); + error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN); + /* debug.disablefullpath == 1 results in ENODEV */ + if (error == 0 || error == ENODEV) { + error = vfs_domount_first(td, vfsp, pathbuf, vp, + fsflags, optlist); + } + free(pathbuf, M_TEMP); + } else + error = vfs_domount_update(td, vp, fsflags, optlist); + + return (error); +} + +/* + * Unmount a filesystem. + * + * Note: unmount takes a path to the vnode mounted on as argument, not + * special file (as before). + */ +#ifndef _SYS_SYSPROTO_H_ +struct unmount_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +sys_unmount(struct thread *td, struct unmount_args *uap) +{ + struct nameidata nd; + struct mount *mp; + char *pathbuf; + int error, id0, id1; + + AUDIT_ARG_VALUE(uap->flags); + if (jailed(td->td_ucred) || usermount == 0) { + error = priv_check(td, PRIV_VFS_UNMOUNT); + if (error) + return (error); + } + + pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); + error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL); + if (error) { + free(pathbuf, M_TEMP); + return (error); + } + if (uap->flags & MNT_BYFSID) { + AUDIT_ARG_TEXT(pathbuf); + /* Decode the filesystem ID. */ + if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { + free(pathbuf, M_TEMP); + return (EINVAL); + } + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { + if (mp->mnt_stat.f_fsid.val[0] == id0 && + mp->mnt_stat.f_fsid.val[1] == id1) { + vfs_ref(mp); + break; + } + } + mtx_unlock(&mountlist_mtx); + } else { + /* + * Try to find global path for path argument. + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, + UIO_SYSSPACE, pathbuf, td); + if (namei(&nd) == 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_path_to_global_path(td, nd.ni_vp, pathbuf, + MNAMELEN); + if (error == 0 || error == ENODEV) + vput(nd.ni_vp); + } + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { + if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) { + vfs_ref(mp); + break; + } + } + mtx_unlock(&mountlist_mtx); + } + free(pathbuf, M_TEMP); + if (mp == NULL) { + /* + * Previously we returned ENOENT for a nonexistent path and + * EINVAL for a non-mountpoint. We cannot tell these apart + * now, so in the !MNT_BYFSID case return the more likely + * EINVAL for compatibility. + */ + return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL); + } + + /* + * Don't allow unmounting the root filesystem. + */ + if (mp->mnt_flag & MNT_ROOTFS) { + vfs_rel(mp); + return (EINVAL); + } + error = dounmount(mp, uap->flags, td); + return (error); +} + +/* + * Return error if any of the vnodes, ignoring the root vnode + * and the syncer vnode, have non-zero usecount. + * + * This function is purely advisory - it can return false positives + * and negatives. 
+ */ +static int +vfs_check_usecounts(struct mount *mp) +{ + struct vnode *vp, *mvp; + + MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { + if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON && + vp->v_usecount != 0) { + VI_UNLOCK(vp); + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + return (EBUSY); + } + VI_UNLOCK(vp); + } + + return (0); +} + +static void +dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags) +{ + + mtx_assert(MNT_MTX(mp), MA_OWNED); + mp->mnt_kern_flag &= ~mntkflags; + if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) { + mp->mnt_kern_flag &= ~MNTK_MWAIT; + wakeup(mp); + } + MNT_IUNLOCK(mp); + if (coveredvp != NULL) { + VOP_UNLOCK(coveredvp, 0); + vdrop(coveredvp); + } + vn_finished_write(mp); +} + +/* + * Do the actual filesystem unmount. + */ +int +dounmount(struct mount *mp, int flags, struct thread *td) +{ + struct vnode *coveredvp; + int error; + uint64_t async_flag; + int mnt_gen_r; + + if ((coveredvp = mp->mnt_vnodecovered) != NULL) { + mnt_gen_r = mp->mnt_gen; + VI_LOCK(coveredvp); + vholdl(coveredvp); + vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); + /* + * Check for mp being unmounted while waiting for the + * covered vnode lock. + */ + if (coveredvp->v_mountedhere != mp || + coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { + VOP_UNLOCK(coveredvp, 0); + vdrop(coveredvp); + vfs_rel(mp); + return (EBUSY); + } + } + + /* + * Only privileged root, or (if MNT_USER is set) the user that did the + * original mount is permitted to unmount this filesystem. + */ + error = vfs_suser(mp, td); + if (error != 0) { + if (coveredvp != NULL) { + VOP_UNLOCK(coveredvp, 0); + vdrop(coveredvp); + } + vfs_rel(mp); + return (error); + } + + vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || + (mp->mnt_flag & MNT_UPDATE) != 0 || + !TAILQ_EMPTY(&mp->mnt_uppers)) { + dounmount_cleanup(mp, coveredvp, 0); + return (EBUSY); + } + mp->mnt_kern_flag |= MNTK_UNMOUNT; + if (flags & MNT_NONBUSY) { + MNT_IUNLOCK(mp); + error = vfs_check_usecounts(mp); + MNT_ILOCK(mp); + if (error != 0) { + dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT); + return (error); + } + } + /* Allow filesystems to detect that a forced unmount is in progress. */ + if (flags & MNT_FORCE) { + mp->mnt_kern_flag |= MNTK_UNMOUNTF; + MNT_IUNLOCK(mp); + /* + * Must be done after setting MNTK_UNMOUNTF and before + * waiting for mnt_lockref to become 0. + */ + VFS_PURGE(mp); + MNT_ILOCK(mp); + } + error = 0; + if (mp->mnt_lockref) { + mp->mnt_kern_flag |= MNTK_DRAINING; + error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, + "mount drain", 0); + } + MNT_IUNLOCK(mp); + KASSERT(mp->mnt_lockref == 0, + ("%s: invalid lock refcount in the drain path @ %s:%d", + __func__, __FILE__, __LINE__)); + KASSERT(error == 0, + ("%s: invalid return value for msleep in the drain path @ %s:%d", + __func__, __FILE__, __LINE__)); + + if (mp->mnt_flag & MNT_EXPUBLIC) + vfs_setpublicfs(NULL, NULL, NULL); + + /* + * From now, we can claim that the use reference on the + * coveredvp is ours, and the ref can be released only by + * successfull unmount by us, or left for later unmount + * attempt. The previously acquired hold reference is no + * longer needed to protect the vnode from reuse. 
+ */ + if (coveredvp != NULL) + vdrop(coveredvp); + + vfs_msync(mp, MNT_WAIT); + MNT_ILOCK(mp); + async_flag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + mp->mnt_kern_flag &= ~MNTK_ASYNC; + MNT_IUNLOCK(mp); + cache_purgevfs(mp, false); /* remove cache entries for this file sys */ + vfs_deallocate_syncvnode(mp); + error = VFS_UNMOUNT(mp, flags); + vn_finished_write(mp); + /* + * If we failed to flush the dirty blocks for this mount point, + * undo all the cdir/rdir and rootvnode changes we made above. + * Unless we failed to do so because the device is reporting that + * it doesn't exist anymore. + */ + if (error && error != ENXIO) { + MNT_ILOCK(mp); + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + MNT_IUNLOCK(mp); + vfs_allocate_syncvnode(mp); + MNT_ILOCK(mp); + } + mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); + mp->mnt_flag |= async_flag; + if ((mp->mnt_flag & MNT_ASYNC) != 0 && + (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) + mp->mnt_kern_flag |= MNTK_ASYNC; + if (mp->mnt_kern_flag & MNTK_MWAIT) { + mp->mnt_kern_flag &= ~MNTK_MWAIT; + wakeup(mp); + } + MNT_IUNLOCK(mp); + if (coveredvp) + VOP_UNLOCK(coveredvp, 0); + return (error); + } + mtx_lock(&mountlist_mtx); + TAILQ_REMOVE(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td); + if (coveredvp != NULL) { + coveredvp->v_mountedhere = NULL; + VOP_UNLOCK(coveredvp, 0); + } + vfs_event_signal(NULL, VQ_UNMOUNT, 0); + if (rootvnode != NULL && mp == rootvnode->v_mount) { + vrele(rootvnode); + rootvnode = NULL; + } + if (mp == rootdevmp) + rootdevmp = NULL; + vfs_mount_destroy(mp); + return (0); +} + +/* + * Report errors during filesystem mounting. + */ +void +vfs_mount_error(struct mount *mp, const char *fmt, ...) +{ + struct vfsoptlist *moptlist = mp->mnt_optnew; + va_list ap; + int error, len; + char *errmsg; + + error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); + if (error || errmsg == NULL || len <= 0) + return; + + va_start(ap, fmt); + vsnprintf(errmsg, (size_t)len, fmt, ap); + va_end(ap); +} + +void +vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...) +{ + va_list ap; + int error, len; + char *errmsg; + + error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len); + if (error || errmsg == NULL || len <= 0) + return; + + va_start(ap, fmt); + vsnprintf(errmsg, (size_t)len, fmt, ap); + va_end(ap); +} + +/* + * --------------------------------------------------------------------- + * Functions for querying mount options/arguments from filesystems. + */ + +/* + * Check that no unknown options are given + */ +int +vfs_filteropt(struct vfsoptlist *opts, const char **legal) +{ + struct vfsopt *opt; + char errmsg[255]; + const char **t, *p, *q; + int ret = 0; + + TAILQ_FOREACH(opt, opts, link) { + p = opt->name; + q = NULL; + if (p[0] == 'n' && p[1] == 'o') + q = p + 2; + for(t = global_opts; *t != NULL; t++) { + if (strcmp(*t, p) == 0) + break; + if (q != NULL) { + if (strcmp(*t, q) == 0) + break; + } + } + if (*t != NULL) + continue; + for(t = legal; *t != NULL; t++) { + if (strcmp(*t, p) == 0) + break; + if (q != NULL) { + if (strcmp(*t, q) == 0) + break; + } + } + if (*t != NULL) + continue; + snprintf(errmsg, sizeof(errmsg), + "mount option <%s> is unknown", p); + ret = EINVAL; + } + if (ret != 0) { + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(opt->name, "errmsg") == 0) { + strncpy((char *)opt->value, errmsg, opt->len); + break; + } + } + if (opt == NULL) + printf("%s\n", errmsg); + } + return (ret); +} + +/* + * Get a mount option by its name. 
+ * + * Return 0 if the option was found, ENOENT otherwise. + * If len is non-NULL it will be filled with the length + * of the option. If buf is non-NULL, it will be filled + * with the address of the option. + */ +int +vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len) +{ + struct vfsopt *opt; + + KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) == 0) { + opt->seen = 1; + if (len != NULL) + *len = opt->len; + if (buf != NULL) + *buf = opt->value; + return (0); + } + } + return (ENOENT); +} + +int +vfs_getopt_pos(struct vfsoptlist *opts, const char *name) +{ + struct vfsopt *opt; + + if (opts == NULL) + return (-1); + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) == 0) { + opt->seen = 1; + return (opt->pos); + } + } + return (-1); +} + +int +vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value) +{ + char *opt_value, *vtp; + quad_t iv; + int error, opt_len; + + error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len); + if (error != 0) + return (error); + if (opt_len == 0 || opt_value == NULL) + return (EINVAL); + if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0') + return (EINVAL); + iv = strtoq(opt_value, &vtp, 0); + if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0')) + return (EINVAL); + if (iv < 0) + return (EINVAL); + switch (vtp[0]) { + case 't': case 'T': + iv *= 1024; + /* FALLTHROUGH */ + case 'g': case 'G': + iv *= 1024; + /* FALLTHROUGH */ + case 'm': case 'M': + iv *= 1024; + /* FALLTHROUGH */ + case 'k': case 'K': + iv *= 1024; + case '\0': + break; + default: + return (EINVAL); + } + *value = iv; + + return (0); +} + +char * +vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) +{ + struct vfsopt *opt; + + *error = 0; + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) != 0) + continue; + opt->seen = 1; + if (opt->len == 0 || + ((char *)opt->value)[opt->len - 1] != '\0') { + *error = EINVAL; + return (NULL); + } + return (opt->value); + } + *error = ENOENT; + return (NULL); +} + +int +vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, + uint64_t val) +{ + struct vfsopt *opt; + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) == 0) { + opt->seen = 1; + if (w != NULL) + *w |= val; + return (1); + } + } + if (w != NULL) + *w &= ~val; + return (0); +} + +int +vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) 
+{ + va_list ap; + struct vfsopt *opt; + int ret; + + KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) != 0) + continue; + opt->seen = 1; + if (opt->len == 0 || opt->value == NULL) + return (0); + if (((char *)opt->value)[opt->len - 1] != '\0') + return (0); + va_start(ap, fmt); + ret = vsscanf(opt->value, fmt, ap); + va_end(ap); + return (ret); + } + return (0); +} + +int +vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len) +{ + struct vfsopt *opt; + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) != 0) + continue; + opt->seen = 1; + if (opt->value == NULL) + opt->len = len; + else { + if (opt->len != len) + return (EINVAL); + bcopy(value, opt->value, len); + } + return (0); + } + return (ENOENT); +} + +int +vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len) +{ + struct vfsopt *opt; + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) != 0) + continue; + opt->seen = 1; + if (opt->value == NULL) + opt->len = len; + else { + if (opt->len < len) + return (EINVAL); + opt->len = len; + bcopy(value, opt->value, len); + } + return (0); + } + return (ENOENT); +} + +int +vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value) +{ + struct vfsopt *opt; + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) != 0) + continue; + opt->seen = 1; + if (opt->value == NULL) + opt->len = strlen(value) + 1; + else if (strlcpy(opt->value, value, opt->len) >= opt->len) + return (EINVAL); + return (0); + } + return (ENOENT); +} + +/* + * Find and copy a mount option. + * + * The size of the buffer has to be specified + * in len, if it is not the same length as the + * mount option, EINVAL is returned. + * Returns ENOENT if the option is not found. + */ +int +vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len) +{ + struct vfsopt *opt; + + KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) == 0) { + opt->seen = 1; + if (len != opt->len) + return (EINVAL); + bcopy(opt->value, dest, opt->len); + return (0); + } + } + return (ENOENT); +} + +int +__vfs_statfs(struct mount *mp, struct statfs *sbp) +{ + int error; + + error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat); + if (sbp != &mp->mnt_stat) + *sbp = mp->mnt_stat; + return (error); +} + +void +vfs_mountedfrom(struct mount *mp, const char *from) +{ + + bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); + strlcpy(mp->mnt_stat.f_mntfromname, from, + sizeof mp->mnt_stat.f_mntfromname); +} + +/* + * --------------------------------------------------------------------- + * This is the api for building mount args and mounting filesystems from + * inside the kernel. + * + * The API works by accumulation of individual args. First error is + * latched. + * + * XXX: should be documented in new manpage kernel_mount(9) + */ + +/* A memory allocation which must be freed when we are done */ +struct mntaarg { + SLIST_ENTRY(mntaarg) next; +}; + +/* The header for the mount arguments */ +struct mntarg { + struct iovec *v; + int len; + int error; + SLIST_HEAD(, mntaarg) list; +}; + +/* + * Add a boolean argument. + * + * flag is the boolean value. + * name must start with "no". 
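Because the option name must start with "no", mount_argb() below only has to choose between name and name + 2: a set flag strips the prefix (so "noro" becomes "ro"), while a clear flag passes the name through unchanged. A tiny stand-alone sketch of that selection:

    #include <assert.h>
    #include <string.h>

    /* Mirrors the name + (flag ? 2 : 0) selection used by mount_argb(). */
    static const char *
    bool_opt_name(int flag, const char *name)
    {
            assert(name[0] == 'n' && name[1] == 'o');
            return (flag ? name + 2 : name);
    }

    int
    main(void)
    {
            assert(strcmp(bool_opt_name(1, "noro"), "ro") == 0);
            assert(strcmp(bool_opt_name(0, "nosuid"), "nosuid") == 0);
            return (0);
    }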
+ */ +struct mntarg * +mount_argb(struct mntarg *ma, int flag, const char *name) +{ + + KASSERT(name[0] == 'n' && name[1] == 'o', + ("mount_argb(...,%s): name must start with 'no'", name)); + + return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); +} + +/* + * Add an argument printf style + */ +struct mntarg * +mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) +{ + va_list ap; + struct mntaarg *maa; + struct sbuf *sb; + int len; + + if (ma == NULL) { + ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); + SLIST_INIT(&ma->list); + } + if (ma->error) + return (ma); + + ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), + M_MOUNT, M_WAITOK); + ma->v[ma->len].iov_base = (void *)(uintptr_t)name; + ma->v[ma->len].iov_len = strlen(name) + 1; + ma->len++; + + sb = sbuf_new_auto(); + va_start(ap, fmt); + sbuf_vprintf(sb, fmt, ap); + va_end(ap); + sbuf_finish(sb); + len = sbuf_len(sb) + 1; + maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); + SLIST_INSERT_HEAD(&ma->list, maa, next); + bcopy(sbuf_data(sb), maa + 1, len); + sbuf_delete(sb); + + ma->v[ma->len].iov_base = maa + 1; + ma->v[ma->len].iov_len = len; + ma->len++; + + return (ma); +} + +/* + * Add an argument which is a userland string. + */ +struct mntarg * +mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) +{ + struct mntaarg *maa; + char *tbuf; + + if (val == NULL) + return (ma); + if (ma == NULL) { + ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); + SLIST_INIT(&ma->list); + } + if (ma->error) + return (ma); + maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); + SLIST_INSERT_HEAD(&ma->list, maa, next); + tbuf = (void *)(maa + 1); + ma->error = copyinstr(val, tbuf, len, NULL); + return (mount_arg(ma, name, tbuf, -1)); +} + +/* + * Plain argument. + * + * If length is -1, treat value as a C string. + */ +struct mntarg * +mount_arg(struct mntarg *ma, const char *name, const void *val, int len) +{ + + if (ma == NULL) { + ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); + SLIST_INIT(&ma->list); + } + if (ma->error) + return (ma); + + ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), + M_MOUNT, M_WAITOK); + ma->v[ma->len].iov_base = (void *)(uintptr_t)name; + ma->v[ma->len].iov_len = strlen(name) + 1; + ma->len++; + + ma->v[ma->len].iov_base = (void *)(uintptr_t)val; + if (len < 0) + ma->v[ma->len].iov_len = strlen(val) + 1; + else + ma->v[ma->len].iov_len = len; + ma->len++; + return (ma); +} + +/* + * Free a mntarg structure + */ +static void +free_mntarg(struct mntarg *ma) +{ + struct mntaarg *maa; + + while (!SLIST_EMPTY(&ma->list)) { + maa = SLIST_FIRST(&ma->list); + SLIST_REMOVE_HEAD(&ma->list, next); + free(maa, M_MOUNT); + } + free(ma->v, M_MOUNT); + free(ma, M_MOUNT); +} + +/* + * Mount a filesystem + */ +int +kernel_mount(struct mntarg *ma, uint64_t flags) +{ + struct uio auio; + int error; + + KASSERT(ma != NULL, ("kernel_mount NULL ma")); + KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); + KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); + + auio.uio_iov = ma->v; + auio.uio_iovcnt = ma->len; + auio.uio_segflg = UIO_SYSSPACE; + + error = ma->error; + if (!error) + error = vfs_donmount(curthread, flags, &auio); + free_mntarg(ma); + return (error); +} + +/* + * A printflike function to mount a filesystem. + */ +int +kernel_vmount(int flags, ...) 
+{ + struct mntarg *ma = NULL; + va_list ap; + const char *cp; + const void *vp; + int error; + + va_start(ap, flags); + for (;;) { + cp = va_arg(ap, const char *); + if (cp == NULL) + break; + vp = va_arg(ap, const void *); + ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0)); + } + va_end(ap); + + error = kernel_mount(ma, flags); + return (error); +} + +void +vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp) +{ + + bcopy(oexp, exp, sizeof(*oexp)); + exp->ex_numsecflavors = 0; +} diff --git a/freebsd/sys/kern/vfs_subr.c b/freebsd/sys/kern/vfs_subr.c new file mode 100644 index 00000000..f84caac0 --- /dev/null +++ b/freebsd/sys/kern/vfs_subr.c @@ -0,0 +1,5719 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + */ + +/* + * External virtual filesystem routines + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_ddb.h" +#include "opt_watchdog.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +static void delmntque(struct vnode *vp); +static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, + int slpflag, int slptimeo); +static void syncer_shutdown(void *arg, int howto); +static int vtryrecycle(struct vnode *vp); +static void v_init_counters(struct vnode *); +static void v_incr_usecount(struct vnode *); +static void v_incr_usecount_locked(struct vnode *); +static void v_incr_devcount(struct vnode *); +static void v_decr_devcount(struct vnode *); +static void vgonel(struct vnode *); +static void vfs_knllock(void *arg); +static void vfs_knlunlock(void *arg); +static void vfs_knl_assert_locked(void *arg); +static void vfs_knl_assert_unlocked(void *arg); +static void vnlru_return_batches(struct vfsops *mnt_op); +static void destroy_vpollinfo(struct vpollinfo *vi); +static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, + daddr_t startlbn, daddr_t endlbn); + +/* + * These fences are intended for cases where some synchronization is + * needed between access of v_iflags and lockless vnode refcount (v_holdcnt + * and v_usecount) updates. Access to v_iflags is generally synchronized + * by the interlock, but we have some internal assertions that check vnode + * flags without acquiring the lock. Thus, these fences are INVARIANTS-only + * for now. + */ +#ifdef INVARIANTS +#define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq() +#define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel() +#else +#define VNODE_REFCOUNT_FENCE_ACQ() +#define VNODE_REFCOUNT_FENCE_REL() +#endif + +/* + * Number of vnodes in existence. Increased whenever getnewvnode() + * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode. + */ +static unsigned long numvnodes; + +SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, + "Number of vnodes in existence"); + +static counter_u64_t vnodes_created; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, + "Number of vnodes created by getnewvnode"); + +static u_long mnt_free_list_batch = 128; +SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW, + &mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list"); + +/* + * Conversion tables for conversion from vnode types to inode formats + * and back. + */ +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON +}; +int vttoif_tab[10] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT +}; + +/* + * List of vnodes that are ready for recycling. + */ +static TAILQ_HEAD(freelst, vnode) vnode_free_list; + +/* + * "Free" vnode target. Free vnodes are rarely completely free, but are + * just ones that are cheap to recycle. 
Usually they are for files which + * have been stat'd but not read; these usually have inode and namecache + * data attached to them. This target is the preferred minimum size of a + * sub-cache consisting mostly of such files. The system balances the size + * of this sub-cache with its complement to try to prevent either from + * thrashing while the other is relatively inactive. The targets express + * a preference for the best balance. + * + * "Above" this target there are 2 further targets (watermarks) related + * to recyling of free vnodes. In the best-operating case, the cache is + * exactly full, the free list has size between vlowat and vhiwat above the + * free target, and recycling from it and normal use maintains this state. + * Sometimes the free list is below vlowat or even empty, but this state + * is even better for immediate use provided the cache is not full. + * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free + * ones) to reach one of these states. The watermarks are currently hard- + * coded as 4% and 9% of the available space higher. These and the default + * of 25% for wantfreevnodes are too large if the memory size is large. + * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim + * whenever vnlru_proc() becomes active. + */ +static u_long wantfreevnodes; +SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, + &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes"); +static u_long freevnodes; +SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, + &freevnodes, 0, "Number of \"free\" vnodes"); + +static counter_u64_t recycles_count; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, + "Number of vnodes recycled to meet vnode cache targets"); + +/* + * Various variables used for debugging the new implementation of + * reassignbuf(). + * XXX these are probably of (very) limited utility now. + */ +static int reassignbufcalls; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, + "Number of calls to reassignbuf"); + +static counter_u64_t free_owe_inact; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, + "Number of times free vnodes kept on active list due to VFS " + "owing inactivation"); + +/* To keep more than one thread at a time from running vfs_getnewfsid */ +static struct mtx mntid_mtx; + +/* + * Lock for any access to the following: + * vnode_free_list + * numvnodes + * freevnodes + */ +static struct mtx vnode_free_list_mtx; + +/* Publicly exported FS */ +struct nfs_public nfs_pub; + +static uma_zone_t buf_trie_zone; + +/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ +static uma_zone_t vnode_zone; +static uma_zone_t vnodepoll_zone; + +/* + * The workitem queue. + * + * It is useful to delay writes of file data and filesystem metadata + * for tens of seconds so that quickly created and deleted files need + * not waste disk bandwidth being created and removed. To realize this, + * we append vnodes to a "workitem" queue. When running with a soft + * updates implementation, most pending metadata dependencies should + * not wait for more than a few seconds. Thus, mounted on block devices + * are delayed only about a half the time that file data is delayed. + * Similarly, directory updates are more critical, so are only delayed + * about a third the time that file data is delayed. 
Thus, there are + * SYNCER_MAXDELAY queues that are processed round-robin at a rate of + * one each second (driven off the filesystem syncer process). The + * syncer_delayno variable indicates the next queue that is to be processed. + * Items that need to be processed soon are placed in this queue: + * + * syncer_workitem_pending[syncer_delayno] + * + * A delay of fifteen seconds is done by placing the request fifteen + * entries later in the queue: + * + * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] + * + */ +static int syncer_delayno; +static long syncer_mask; +LIST_HEAD(synclist, bufobj); +static struct synclist *syncer_workitem_pending; +/* + * The sync_mtx protects: + * bo->bo_synclist + * sync_vnode_count + * syncer_delayno + * syncer_state + * syncer_workitem_pending + * syncer_worklist_len + * rushjob + */ +static struct mtx sync_mtx; +static struct cv sync_wakeup; + +#define SYNCER_MAXDELAY 32 +static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ +static int syncdelay = 30; /* max time to delay syncing data */ +static int filedelay = 30; /* time to delay syncing files */ +SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, + "Time to delay syncing files (in seconds)"); +static int dirdelay = 29; /* time to delay syncing directories */ +SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, + "Time to delay syncing directories (in seconds)"); +static int metadelay = 28; /* time to delay syncing metadata */ +SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, + "Time to delay syncing metadata (in seconds)"); +static int rushjob; /* number of slots to run ASAP */ +static int stat_rush_requests; /* number of times I/O speeded up */ +SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, + "Number of times I/O speeded up (rush requests)"); + +/* + * When shutting down the syncer, run it at four times normal speed. + */ +#define SYNCER_SHUTDOWN_SPEEDUP 4 +static int sync_vnode_count; +static int syncer_worklist_len; +static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } + syncer_state; + +/* Target for maximum number of vnodes. */ +int desiredvnodes; +static int gapvnodes; /* gap between wanted and desired */ +static int vhiwat; /* enough extras after expansion */ +static int vlowat; /* minimal extras before expansion */ +static int vstir; /* nonzero to stir non-free vnodes */ +static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ + +static int +sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) +{ + int error, old_desiredvnodes; + + old_desiredvnodes = desiredvnodes; + if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0) + return (error); + if (old_desiredvnodes != desiredvnodes) { + wantfreevnodes = desiredvnodes / 4; + /* XXX locking seems to be incomplete. 
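The workitem delay ring described above is plain power-of-two modular indexing: hashinit() returns the table together with a mask, and scheduling a bufobj N seconds out adds N to the current slot and masks the result. A small illustrative sketch, assuming the default 32-entry ring:

    #include <stdio.h>

    #define SYNCER_SLOTS    32                      /* assumed ring size */
    #define SYNCER_MASK     (SYNCER_SLOTS - 1)

    static int
    syncer_slot(int delayno, int delay)
    {
            return ((delayno + delay) & SYNCER_MASK);
    }

    int
    main(void)
    {
            /* Slot 30 plus a 15 second delay wraps around to slot 13. */
            printf("%d\n", syncer_slot(30, 15));
            return (0);
    }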
*/ + vfs_hash_changesize(desiredvnodes); + cache_changesize(desiredvnodes); + } + return (0); +} + +SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0, + sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes"); +SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, + &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); +static int vnlru_nowhere; +SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, + &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); + +static int +sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) +{ + struct vnode *vp; + struct nameidata nd; + char *buf; + unsigned long ndflags; + int error; + + if (req->newptr == NULL) + return (EINVAL); + if (req->newlen >= PATH_MAX) + return (E2BIG); + + buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); + error = SYSCTL_IN(req, buf, req->newlen); + if (error != 0) + goto out; + + buf[req->newlen] = '\0'; + + ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | NOCACHE | SAVENAME; + NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread); + if ((error = namei(&nd)) != 0) + goto out; + vp = nd.ni_vp; + + if ((vp->v_iflag & VI_DOOMED) != 0) { + /* + * This vnode is being recycled. Return != 0 to let the caller + * know that the sysctl had no effect. Return EAGAIN because a + * subsequent call will likely succeed (since namei will create + * a new vnode if necessary) + */ + error = EAGAIN; + goto putvnode; + } + + counter_u64_add(recycles_count, 1); + vgone(vp); +putvnode: + NDFREE(&nd, 0); +out: + free(buf, M_TEMP); + return (error); +} + +static int +sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) +{ + struct thread *td = curthread; + struct vnode *vp; + struct file *fp; + int error; + int fd; + + if (req->newptr == NULL) + return (EBADF); + + error = sysctl_handle_int(oidp, &fd, 0, req); + if (error != 0) + return (error); + error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); + if (error != 0) + return (error); + vp = fp->f_vnode; + + error = vn_lock(vp, LK_EXCLUSIVE); + if (error != 0) + goto drop; + + counter_u64_add(recycles_count, 1); + vgone(vp); + VOP_UNLOCK(vp, 0); +drop: + fdrop(fp, td); + return (error); +} + +SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, + CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, + sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); +SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, + sysctl_ftry_reclaim_vnode, "I", + "Try to reclaim a vnode by its file descriptor"); + +/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ +static int vnsz2log; + +/* + * Support for the bufobj clean & dirty pctrie. + */ +static void * +buf_trie_alloc(struct pctrie *ptree) +{ + + return uma_zalloc(buf_trie_zone, M_NOWAIT); +} + +static void +buf_trie_free(struct pctrie *ptree, void *node) +{ + + uma_zfree(buf_trie_zone, node); +} +PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free); + +/* + * Initialize the vnode management data structures. + * + * Reevaluate the following cap on the number of vnodes after the physical + * memory size exceeds 512GB. In the limit, as the physical memory size + * grows, the ratio of the memory size in KB to vnodes approaches 64:1. + */ +#ifndef MAXVNODES_MAX +#define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */ +#endif + +/* + * Initialize a vnode as it first enters the zone. 
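The debug.try_reclaim_vnode handler above takes the new sysctl value as a path (it NUL-terminates the buffer itself and rejects anything of PATH_MAX or longer), so it can be exercised from user space with sysctlbyname(3). A minimal sketch; the path is only an example and must name an existing vnode:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
            const char *path = "/tmp/some-file";    /* example path */

            if (sysctlbyname("debug.try_reclaim_vnode", NULL, NULL,
                path, strlen(path)) != 0) {
                    perror("sysctlbyname");
                    return (1);
            }
            return (0);
    }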
+ */ +static int +vnode_init(void *mem, int size, int flags) +{ + struct vnode *vp; + + vp = mem; + bzero(vp, size); + /* + * Setup locks. + */ + vp->v_vnlock = &vp->v_lock; + mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); + /* + * By default, don't allow shared locks unless filesystems opt-in. + */ + lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, + LK_NOSHARE | LK_IS_VNODE); + /* + * Initialize bufobj. + */ + bufobj_init(&vp->v_bufobj, vp); + /* + * Initialize namecache. + */ + LIST_INIT(&vp->v_cache_src); + TAILQ_INIT(&vp->v_cache_dst); + /* + * Initialize rangelocks. + */ + rangelock_init(&vp->v_rl); + return (0); +} + +/* + * Free a vnode when it is cleared from the zone. + */ +static void +vnode_fini(void *mem, int size) +{ + struct vnode *vp; + struct bufobj *bo; + + vp = mem; + rangelock_destroy(&vp->v_rl); + lockdestroy(vp->v_vnlock); + mtx_destroy(&vp->v_interlock); + bo = &vp->v_bufobj; + rw_destroy(BO_LOCKPTR(bo)); +} + +/* + * Provide the size of NFS nclnode and NFS fh for calculation of the + * vnode memory consumption. The size is specified directly to + * eliminate dependency on NFS-private header. + * + * Other filesystems may use bigger or smaller (like UFS and ZFS) + * private inode data, but the NFS-based estimation is ample enough. + * Still, we care about differences in the size between 64- and 32-bit + * platforms. + * + * Namecache structure size is heuristically + * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. + */ +#ifdef _LP64 +#define NFS_NCLNODE_SZ (528 + 64) +#define NC_SZ 148 +#else +#define NFS_NCLNODE_SZ (360 + 32) +#define NC_SZ 92 +#endif + +static void +vntblinit(void *dummy __unused) +{ + u_int i; + int physvnodes, virtvnodes; + + /* + * Desiredvnodes is a function of the physical memory size and the + * kernel's heap size. Generally speaking, it scales with the + * physical memory size. The ratio of desiredvnodes to the physical + * memory size is 1:16 until desiredvnodes exceeds 98,304. + * Thereafter, the + * marginal ratio of desiredvnodes to the physical memory size is + * 1:64. However, desiredvnodes is limited by the kernel's heap + * size. The memory required by desiredvnodes vnodes and vm objects + * must not exceed 1/10th of the kernel's heap size. + */ + physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; + virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); + desiredvnodes = min(physvnodes, virtvnodes); + if (desiredvnodes > MAXVNODES_MAX) { + if (bootverbose) + printf("Reducing kern.maxvnodes %d -> %d\n", + desiredvnodes, MAXVNODES_MAX); + desiredvnodes = MAXVNODES_MAX; + } + wantfreevnodes = desiredvnodes / 4; + mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); + TAILQ_INIT(&vnode_free_list); + mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); + vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, + vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); + vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + /* + * Preallocate enough nodes to support one-per buf so that + * we can not fail an insert. reassignbuf() callers can not + * tolerate the insertion failure. 
+ */ + buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), + NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, + UMA_ZONE_NOFREE | UMA_ZONE_VM); + uma_prealloc(buf_trie_zone, nbuf); + + vnodes_created = counter_u64_alloc(M_WAITOK); + recycles_count = counter_u64_alloc(M_WAITOK); + free_owe_inact = counter_u64_alloc(M_WAITOK); + + /* + * Initialize the filesystem syncer. + */ + syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, + &syncer_mask); + syncer_maxdelay = syncer_mask + 1; + mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); + cv_init(&sync_wakeup, "syncer"); + for (i = 1; i <= sizeof(struct vnode); i <<= 1) + vnsz2log++; + vnsz2log--; +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); + + +/* + * Mark a mount point as busy. Used to synchronize access and to delay + * unmounting. Eventually, mountlist_mtx is not released on failure. + * + * vfs_busy() is a custom lock, it can block the caller. + * vfs_busy() only sleeps if the unmount is active on the mount point. + * For a mountpoint mp, vfs_busy-enforced lock is before lock of any + * vnode belonging to mp. + * + * Lookup uses vfs_busy() to traverse mount points. + * root fs var fs + * / vnode lock A / vnode lock (/var) D + * /var vnode lock B /log vnode lock(/var/log) E + * vfs_busy lock C vfs_busy lock F + * + * Within each file system, the lock order is C->A->B and F->D->E. + * + * When traversing across mounts, the system follows that lock order: + * + * C->A->B + * | + * +->F->D->E + * + * The lookup() process for namei("/var") illustrates the process: + * VOP_LOOKUP() obtains B while A is held + * vfs_busy() obtains a shared lock on F while A and B are held + * vput() releases lock on B + * vput() releases lock on A + * VFS_ROOT() obtains lock on D while shared lock on F is held + * vfs_unbusy() releases shared lock on F + * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. + * Attempt to lock A (instead of vp_crossmp) while D is held would + * violate the global order, causing deadlocks. + * + * dounmount() locks B while F is drained. + */ +int +vfs_busy(struct mount *mp, int flags) +{ + + MPASS((flags & ~MBF_MASK) == 0); + CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); + + MNT_ILOCK(mp); + MNT_REF(mp); + /* + * If mount point is currently being unmounted, sleep until the + * mount point fate is decided. If thread doing the unmounting fails, + * it will clear MNTK_UNMOUNT flag before waking us up, indicating + * that this mount point has survived the unmount attempt and vfs_busy + * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE + * flag in addition to MNTK_UNMOUNT, indicating that mount point is + * about to be really destroyed. vfs_busy needs to release its + * reference on the mount point in this case and return with ENOENT, + * telling the caller that mount mount it tried to busy is no longer + * valid. + */ + while (mp->mnt_kern_flag & MNTK_UNMOUNT) { + if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { + MNT_REL(mp); + MNT_IUNLOCK(mp); + CTR1(KTR_VFS, "%s: failed busying before sleeping", + __func__); + return (ENOENT); + } + if (flags & MBF_MNTLSTLOCK) + mtx_unlock(&mountlist_mtx); + mp->mnt_kern_flag |= MNTK_MWAIT; + msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); + if (flags & MBF_MNTLSTLOCK) + mtx_lock(&mountlist_mtx); + MNT_ILOCK(mp); + } + if (flags & MBF_MNTLSTLOCK) + mtx_unlock(&mountlist_mtx); + mp->mnt_lockref++; + MNT_IUNLOCK(mp); + return (0); +} + +/* + * Free a busy filesystem. 
+ */ +void +vfs_unbusy(struct mount *mp) +{ + + CTR2(KTR_VFS, "%s: mp %p", __func__, mp); + MNT_ILOCK(mp); + MNT_REL(mp); + KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref")); + mp->mnt_lockref--; + if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { + MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); + CTR1(KTR_VFS, "%s: waking up waiters", __func__); + mp->mnt_kern_flag &= ~MNTK_DRAINING; + wakeup(&mp->mnt_lockref); + } + MNT_IUNLOCK(mp); +} + +/* + * Lookup a mount point by filesystem identifier. + */ +struct mount * +vfs_getvfs(fsid_t *fsid) +{ + struct mount *mp; + + CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { + vfs_ref(mp); + mtx_unlock(&mountlist_mtx); + return (mp); + } + } + mtx_unlock(&mountlist_mtx); + CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); + return ((struct mount *) 0); +} + +/* + * Lookup a mount point by filesystem identifier, busying it before + * returning. + * + * To avoid congestion on mountlist_mtx, implement simple direct-mapped + * cache for popular filesystem identifiers. The cache is lockess, using + * the fact that struct mount's are never freed. In worst case we may + * get pointer to unmounted or even different filesystem, so we have to + * check what we got, and go slow way if so. + */ +struct mount * +vfs_busyfs(fsid_t *fsid) +{ +#define FSID_CACHE_SIZE 256 + typedef struct mount * volatile vmp_t; + static vmp_t cache[FSID_CACHE_SIZE]; + struct mount *mp; + int error; + uint32_t hash; + + CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); + hash = fsid->val[0] ^ fsid->val[1]; + hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); + mp = cache[hash]; + if (mp == NULL || + mp->mnt_stat.f_fsid.val[0] != fsid->val[0] || + mp->mnt_stat.f_fsid.val[1] != fsid->val[1]) + goto slow; + if (vfs_busy(mp, 0) != 0) { + cache[hash] = NULL; + goto slow; + } + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) + return (mp); + else + vfs_unbusy(mp); + +slow: + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { + error = vfs_busy(mp, MBF_MNTLSTLOCK); + if (error) { + cache[hash] = NULL; + mtx_unlock(&mountlist_mtx); + return (NULL); + } + cache[hash] = mp; + return (mp); + } + } + CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); + mtx_unlock(&mountlist_mtx); + return ((struct mount *) 0); +} + +/* + * Check if a user can access privileged mount options. + */ +int +vfs_suser(struct mount *mp, struct thread *td) +{ + int error; + + if (jailed(td->td_ucred)) { + /* + * If the jail of the calling thread lacks permission for + * this type of file system, deny immediately. + */ + if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) + return (EPERM); + + /* + * If the file system was mounted outside the jail of the + * calling thread, deny immediately. + */ + if (prison_check(td->td_ucred, mp->mnt_cred) != 0) + return (EPERM); + } + + /* + * If file system supports delegated administration, we don't check + * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified + * by the file system itself. + * If this is not the user that did original mount, we check for + * the PRIV_VFS_MOUNT_OWNER privilege. 
+ */ + if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && + mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { + if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) + return (error); + } + return (0); +} + +/* + * Get a new unique fsid. Try to make its val[0] unique, since this value + * will be used to create fake device numbers for stat(). Also try (but + * not so hard) make its val[0] unique mod 2^16, since some emulators only + * support 16-bit device numbers. We end up with unique val[0]'s for the + * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. + * + * Keep in mind that several mounts may be running in parallel. Starting + * the search one past where the previous search terminated is both a + * micro-optimization and a defense against returning the same fsid to + * different mounts. + */ +void +vfs_getnewfsid(struct mount *mp) +{ + static uint16_t mntid_base; + struct mount *nmp; + fsid_t tfsid; + int mtype; + + CTR2(KTR_VFS, "%s: mp %p", __func__, mp); + mtx_lock(&mntid_mtx); + mtype = mp->mnt_vfc->vfc_typenum; + tfsid.val[1] = mtype; + mtype = (mtype & 0xFF) << 24; + for (;;) { + tfsid.val[0] = makedev(255, + mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); + mntid_base++; + if ((nmp = vfs_getvfs(&tfsid)) == NULL) + break; + vfs_rel(nmp); + } + mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; + mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; + mtx_unlock(&mntid_mtx); +} + +/* + * Knob to control the precision of file timestamps: + * + * 0 = seconds only; nanoseconds zeroed. + * 1 = seconds and nanoseconds, accurate within 1/HZ. + * 2 = seconds and nanoseconds, truncated to microseconds. + * >=3 = seconds and nanoseconds, maximum precision. + */ +enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; + +static int timestamp_precision = TSP_USEC; +SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, + ×tamp_precision, 0, "File timestamp precision (0: seconds, " + "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " + "3+: sec + ns (max. precision))"); + +/* + * Get a current timestamp. + */ +void +vfs_timestamp(struct timespec *tsp) +{ + struct timeval tv; + + switch (timestamp_precision) { + case TSP_SEC: + tsp->tv_sec = time_second; + tsp->tv_nsec = 0; + break; + case TSP_HZ: + getnanotime(tsp); + break; + case TSP_USEC: + microtime(&tv); + TIMEVAL_TO_TIMESPEC(&tv, tsp); + break; + case TSP_NSEC: + default: + nanotime(tsp); + break; + } +} + +/* + * Set vnode attributes to VNOVAL + */ +void +vattr_null(struct vattr *vap) +{ + + vap->va_type = VNON; + vap->va_size = VNOVAL; + vap->va_bytes = VNOVAL; + vap->va_mode = VNOVAL; + vap->va_nlink = VNOVAL; + vap->va_uid = VNOVAL; + vap->va_gid = VNOVAL; + vap->va_fsid = VNOVAL; + vap->va_fileid = VNOVAL; + vap->va_blocksize = VNOVAL; + vap->va_rdev = VNOVAL; + vap->va_atime.tv_sec = VNOVAL; + vap->va_atime.tv_nsec = VNOVAL; + vap->va_mtime.tv_sec = VNOVAL; + vap->va_mtime.tv_nsec = VNOVAL; + vap->va_ctime.tv_sec = VNOVAL; + vap->va_ctime.tv_nsec = VNOVAL; + vap->va_birthtime.tv_sec = VNOVAL; + vap->va_birthtime.tv_nsec = VNOVAL; + vap->va_flags = VNOVAL; + vap->va_gen = VNOVAL; + vap->va_vaflags = 0; +} + +/* + * This routine is called when we have too many vnodes. It attempts + * to free vnodes and will potentially free vnodes that still + * have VM backing store (VM backing store is typically the cause + * of a vnode blowout so we want to do this). Therefore, this operation + * is not considered cheap. + * + * A number of conditions may prevent a vnode from being reclaimed. 
+ * the buffer cache may have references on the vnode, a directory + * vnode may still have references due to the namei cache representing + * underlying files, or the vnode may be in active use. It is not + * desirable to reuse such vnodes. These conditions may cause the + * number of vnodes to reach some minimum value regardless of what + * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. + * + * @param mp Try to reclaim vnodes from this mountpoint + * @param reclaim_nc_src Only reclaim directories with outgoing namecache + * entries if this argument is strue + * @param trigger Only reclaim vnodes with fewer than this many resident + * pages. + * @return The number of vnodes that were reclaimed. + */ +static int +vlrureclaim(struct mount *mp, bool reclaim_nc_src, int trigger) +{ + struct vnode *vp; + int count, done, target; + + done = 0; + vn_start_write(NULL, &mp, V_WAIT); + MNT_ILOCK(mp); + count = mp->mnt_nvnodelistsize; + target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1); + target = target / 10 + 1; + while (count != 0 && done < target) { + vp = TAILQ_FIRST(&mp->mnt_nvnodelist); + while (vp != NULL && vp->v_type == VMARKER) + vp = TAILQ_NEXT(vp, v_nmntvnodes); + if (vp == NULL) + break; + /* + * XXX LRU is completely broken for non-free vnodes. First + * by calling here in mountpoint order, then by moving + * unselected vnodes to the end here, and most grossly by + * removing the vlruvp() function that was supposed to + * maintain the order. (This function was born broken + * since syncer problems prevented it doing anything.) The + * order is closer to LRC (C = Created). + * + * LRU reclaiming of vnodes seems to have last worked in + * FreeBSD-3 where LRU wasn't mentioned under any spelling. + * Then there was no hold count, and inactive vnodes were + * simply put on the free list in LRU order. The separate + * lists also break LRU. We prefer to reclaim from the + * free list for technical reasons. This tends to thrash + * the free list to keep very unrecently used held vnodes. + * The problem is mitigated by keeping the free list large. + */ + TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + --count; + if (!VI_TRYLOCK(vp)) + goto next_iter; + /* + * If it's been deconstructed already, it's still + * referenced, or it exceeds the trigger, skip it. + * Also skip free vnodes. We are trying to make space + * to expand the free list, not reduce it. + */ + if (vp->v_usecount || + (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || + ((vp->v_iflag & VI_FREE) != 0) || + (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && + vp->v_object->resident_page_count > trigger)) { + VI_UNLOCK(vp); + goto next_iter; + } + MNT_IUNLOCK(mp); + vholdl(vp); + if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { + vdrop(vp); + goto next_iter_mntunlocked; + } + VI_LOCK(vp); + /* + * v_usecount may have been bumped after VOP_LOCK() dropped + * the vnode interlock and before it was locked again. + * + * It is not necessary to recheck VI_DOOMED because it can + * only be set by another thread that holds both the vnode + * lock and vnode interlock. If another thread has the + * vnode lock before we get to VOP_LOCK() and obtains the + * vnode interlock after VOP_LOCK() drops the vnode + * interlock, the other thread will be unable to drop the + * vnode lock before our VOP_LOCK() call fails. 
+ */ + if (vp->v_usecount || + (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || + (vp->v_iflag & VI_FREE) != 0 || + (vp->v_object != NULL && + vp->v_object->resident_page_count > trigger)) { + VOP_UNLOCK(vp, LK_INTERLOCK); + vdrop(vp); + goto next_iter_mntunlocked; + } + KASSERT((vp->v_iflag & VI_DOOMED) == 0, + ("VI_DOOMED unexpectedly detected in vlrureclaim()")); + counter_u64_add(recycles_count, 1); + vgonel(vp); + VOP_UNLOCK(vp, 0); + vdropl(vp); + done++; +next_iter_mntunlocked: + if (!should_yield()) + goto relock_mnt; + goto yield; +next_iter: + if (!should_yield()) + continue; + MNT_IUNLOCK(mp); +yield: + kern_yield(PRI_USER); +relock_mnt: + MNT_ILOCK(mp); + } + MNT_IUNLOCK(mp); + vn_finished_write(mp); + return done; +} + +static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ +SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, + 0, + "limit on vnode free requests per call to the vnlru_free routine"); + +/* + * Attempt to reduce the free list by the requested amount. + */ +static void +vnlru_free_locked(int count, struct vfsops *mnt_op) +{ + struct vnode *vp; + struct mount *mp; + bool tried_batches; + + tried_batches = false; + mtx_assert(&vnode_free_list_mtx, MA_OWNED); + if (count > max_vnlru_free) + count = max_vnlru_free; + for (; count > 0; count--) { + vp = TAILQ_FIRST(&vnode_free_list); + /* + * The list can be modified while the free_list_mtx + * has been dropped and vp could be NULL here. + */ + if (vp == NULL) { + if (tried_batches) + break; + mtx_unlock(&vnode_free_list_mtx); + vnlru_return_batches(mnt_op); + tried_batches = true; + mtx_lock(&vnode_free_list_mtx); + continue; + } + + VNASSERT(vp->v_op != NULL, vp, + ("vnlru_free: vnode already reclaimed.")); + KASSERT((vp->v_iflag & VI_FREE) != 0, + ("Removing vnode not on freelist")); + KASSERT((vp->v_iflag & VI_ACTIVE) == 0, + ("Mangling active vnode")); + TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); + + /* + * Don't recycle if our vnode is from different type + * of mount point. Note that mp is type-safe, the + * check does not reach unmapped address even if + * vnode is reclaimed. + * Don't recycle if we can't get the interlock without + * blocking. + */ + if ((mnt_op != NULL && (mp = vp->v_mount) != NULL && + mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); + continue; + } + VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0, + vp, ("vp inconsistent on freelist")); + + /* + * The clear of VI_FREE prevents activation of the + * vnode. There is no sense in putting the vnode on + * the mount point active list, only to remove it + * later during recycling. Inline the relevant part + * of vholdl(), to avoid triggering assertions or + * activating. + */ + freevnodes--; + vp->v_iflag &= ~VI_FREE; + VNODE_REFCOUNT_FENCE_REL(); + refcount_acquire(&vp->v_holdcnt); + + mtx_unlock(&vnode_free_list_mtx); + VI_UNLOCK(vp); + vtryrecycle(vp); + /* + * If the recycled succeeded this vdrop will actually free + * the vnode. If not it will simply place it back on + * the free list. + */ + vdrop(vp); + mtx_lock(&vnode_free_list_mtx); + } +} + +void +vnlru_free(int count, struct vfsops *mnt_op) +{ + + mtx_lock(&vnode_free_list_mtx); + vnlru_free_locked(count, mnt_op); + mtx_unlock(&vnode_free_list_mtx); +} + + +/* XXX some names and initialization are bad for limits and watermarks. 
*/ +static int +vspace(void) +{ + int space; + + gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); + vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ + vlowat = vhiwat / 2; + if (numvnodes > desiredvnodes) + return (0); + space = desiredvnodes - numvnodes; + if (freevnodes > wantfreevnodes) + space += freevnodes - wantfreevnodes; + return (space); +} + +static void +vnlru_return_batch_locked(struct mount *mp) +{ + struct vnode *vp; + + mtx_assert(&mp->mnt_listmtx, MA_OWNED); + + if (mp->mnt_tmpfreevnodelistsize == 0) + return; + + TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) { + VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp, + ("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist")); + vp->v_mflag &= ~VMP_TMPMNTFREELIST; + } + mtx_lock(&vnode_free_list_mtx); + TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist); + freevnodes += mp->mnt_tmpfreevnodelistsize; + mtx_unlock(&vnode_free_list_mtx); + mp->mnt_tmpfreevnodelistsize = 0; +} + +static void +vnlru_return_batch(struct mount *mp) +{ + + mtx_lock(&mp->mnt_listmtx); + vnlru_return_batch_locked(mp); + mtx_unlock(&mp->mnt_listmtx); +} + +static void +vnlru_return_batches(struct vfsops *mnt_op) +{ + struct mount *mp, *nmp; + bool need_unbusy; + + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + need_unbusy = false; + if (mnt_op != NULL && mp->mnt_op != mnt_op) + goto next; + if (mp->mnt_tmpfreevnodelistsize == 0) + goto next; + if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) { + vnlru_return_batch(mp); + need_unbusy = true; + mtx_lock(&mountlist_mtx); + } +next: + nmp = TAILQ_NEXT(mp, mnt_list); + if (need_unbusy) + vfs_unbusy(mp); + } + mtx_unlock(&mountlist_mtx); +} + +/* + * Attempt to recycle vnodes in a context that is always safe to block. + * Calling vlrurecycle() from the bowels of filesystem code has some + * interesting deadlock problems. + */ +static struct proc *vnlruproc; +static int vnlruproc_sig; + +static void +vnlru_proc(void) +{ + struct mount *mp, *nmp; + unsigned long onumvnodes; + int done, force, trigger, usevnodes, vsp; + bool reclaim_nc_src; + + EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, + SHUTDOWN_PRI_FIRST); + + force = 0; + for (;;) { + kproc_suspend_check(vnlruproc); + mtx_lock(&vnode_free_list_mtx); + /* + * If numvnodes is too large (due to desiredvnodes being + * adjusted using its sysctl, or emergency growth), first + * try to reduce it by discarding from the free list. + */ + if (numvnodes > desiredvnodes) + vnlru_free_locked(numvnodes - desiredvnodes, NULL); + /* + * Sleep if the vnode cache is in a good state. This is + * when it is not over-full and has space for about a 4% + * or 9% expansion (by growing its size or inexcessively + * reducing its free list). Otherwise, try to reclaim + * space for a 10% expansion. + */ + if (vstir && force == 0) { + force = 1; + vstir = 0; + } + vsp = vspace(); + if (vsp >= vlowat && force == 0) { + vnlruproc_sig = 0; + wakeup(&vnlruproc_sig); + msleep(vnlruproc, &vnode_free_list_mtx, + PVFS|PDROP, "vlruwt", hz); + continue; + } + mtx_unlock(&vnode_free_list_mtx); + done = 0; + onumvnodes = numvnodes; + /* + * Calculate parameters for recycling. These are the same + * throughout the loop to give some semblance of fairness. + * The trigger point is to avoid recycling vnodes with lots + * of resident pages. We aren't trying to free memory; we + * are trying to recycle or at least free vnodes. 
+ */ + if (numvnodes <= desiredvnodes) + usevnodes = numvnodes - freevnodes; + else + usevnodes = numvnodes; + if (usevnodes <= 0) + usevnodes = 1; + /* + * The trigger value is is chosen to give a conservatively + * large value to ensure that it alone doesn't prevent + * making progress. The value can easily be so large that + * it is effectively infinite in some congested and + * misconfigured cases, and this is necessary. Normally + * it is about 8 to 100 (pages), which is quite large. + */ + trigger = vm_cnt.v_page_count * 2 / usevnodes; + if (force < 2) + trigger = vsmalltrigger; + reclaim_nc_src = force >= 3; + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + done += vlrureclaim(mp, reclaim_nc_src, trigger); + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp); + } + mtx_unlock(&mountlist_mtx); + if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) + uma_reclaim(); + if (done == 0) { + if (force == 0 || force == 1) { + force = 2; + continue; + } + if (force == 2) { + force = 3; + continue; + } + force = 0; + vnlru_nowhere++; + tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); + } else + kern_yield(PRI_USER); + /* + * After becoming active to expand above low water, keep + * active until above high water. + */ + vsp = vspace(); + force = vsp < vhiwat; + } +} + +static struct kproc_desc vnlru_kp = { + "vnlru", + vnlru_proc, + &vnlruproc +}; +SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, + &vnlru_kp); + +/* + * Routines having to do with the management of the vnode table. + */ + +/* + * Try to recycle a freed vnode. We abort if anyone picks up a reference + * before we actually vgone(). This function must be called with the vnode + * held to prevent the vnode from being returned to the free list midway + * through vgone(). + */ +static int +vtryrecycle(struct vnode *vp) +{ + struct mount *vnmp; + + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + VNASSERT(vp->v_holdcnt, vp, + ("vtryrecycle: Recycling vp %p without a reference.", vp)); + /* + * This vnode may found and locked via some other list, if so we + * can't recycle it yet. + */ + if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { + CTR2(KTR_VFS, + "%s: impossible to recycle, vp %p lock is already held", + __func__, vp); + return (EWOULDBLOCK); + } + /* + * Don't recycle if its filesystem is being suspended. + */ + if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { + VOP_UNLOCK(vp, 0); + CTR2(KTR_VFS, + "%s: impossible to recycle, cannot start the write for %p", + __func__, vp); + return (EBUSY); + } + /* + * If we got this far, we need to acquire the interlock and see if + * anyone picked up this vnode from another list. If not, we will + * mark it with DOOMED via vgonel() so that anyone who does find it + * will skip over it. + */ + VI_LOCK(vp); + if (vp->v_usecount) { + VOP_UNLOCK(vp, LK_INTERLOCK); + vn_finished_write(vnmp); + CTR2(KTR_VFS, + "%s: impossible to recycle, %p is already referenced", + __func__, vp); + return (EBUSY); + } + if ((vp->v_iflag & VI_DOOMED) == 0) { + counter_u64_add(recycles_count, 1); + vgonel(vp); + } + VOP_UNLOCK(vp, LK_INTERLOCK); + vn_finished_write(vnmp); + return (0); +} + +static void +vcheckspace(void) +{ + int vsp; + + vsp = vspace(); + if (vsp < vlowat && vnlruproc_sig == 0) { + vnlruproc_sig = 1; + wakeup(vnlruproc); + } +} + +/* + * Wait if necessary for space for a new vnode. 
+ */ +static int +getnewvnode_wait(int suspended) +{ + + mtx_assert(&vnode_free_list_mtx, MA_OWNED); + if (numvnodes >= desiredvnodes) { + if (suspended) { + /* + * The file system is being suspended. We cannot + * risk a deadlock here, so allow allocation of + * another vnode even if this would give too many. + */ + return (0); + } + if (vnlruproc_sig == 0) { + vnlruproc_sig = 1; /* avoid unnecessary wakeups */ + wakeup(vnlruproc); + } + msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, + "vlruwk", hz); + } + /* Post-adjust like the pre-adjust in getnewvnode(). */ + if (numvnodes + 1 > desiredvnodes && freevnodes > 1) + vnlru_free_locked(1, NULL); + return (numvnodes >= desiredvnodes ? ENFILE : 0); +} + +/* + * This hack is fragile, and probably not needed any more now that the + * watermark handling works. + */ +void +getnewvnode_reserve(u_int count) +{ + struct thread *td; + + /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */ + /* XXX no longer so quick, but this part is not racy. */ + mtx_lock(&vnode_free_list_mtx); + if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes) + vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes, + freevnodes - wantfreevnodes), NULL); + mtx_unlock(&vnode_free_list_mtx); + + td = curthread; + /* First try to be quick and racy. */ + if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { + td->td_vp_reserv += count; + vcheckspace(); /* XXX no longer so quick, but more racy */ + return; + } else + atomic_subtract_long(&numvnodes, count); + + mtx_lock(&vnode_free_list_mtx); + while (count > 0) { + if (getnewvnode_wait(0) == 0) { + count--; + td->td_vp_reserv++; + atomic_add_long(&numvnodes, 1); + } + } + vcheckspace(); + mtx_unlock(&vnode_free_list_mtx); +} + +/* + * This hack is fragile, especially if desiredvnodes or wantvnodes are + * misconfgured or changed significantly. Reducing desiredvnodes below + * the reserved amount should cause bizarre behaviour like reducing it + * below the number of active vnodes -- the system will try to reduce + * numvnodes to match, but should fail, so the subtraction below should + * not overflow. + */ +void +getnewvnode_drop_reserve(void) +{ + struct thread *td; + + td = curthread; + atomic_subtract_long(&numvnodes, td->td_vp_reserv); + td->td_vp_reserv = 0; +} + +/* + * Return the next vnode from the free list. + */ +int +getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, + struct vnode **vpp) +{ + struct vnode *vp; + struct thread *td; + struct lock_object *lo; + static int cyclecount; + int error __unused; + + CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); + vp = NULL; + td = curthread; + if (td->td_vp_reserv > 0) { + td->td_vp_reserv -= 1; + goto alloc; + } + mtx_lock(&vnode_free_list_mtx); + if (numvnodes < desiredvnodes) + cyclecount = 0; + else if (cyclecount++ >= freevnodes) { + cyclecount = 0; + vstir = 1; + } + /* + * Grow the vnode cache if it will not be above its target max + * after growing. Otherwise, if the free list is nonempty, try + * to reclaim 1 item from it before growing the cache (possibly + * above its target max if the reclamation failed or is delayed). + * Otherwise, wait for some space. In all cases, schedule + * vnlru_proc() if we are getting short of space. The watermarks + * should be chosen so that we never wait or even reclaim from + * the free list to below its target minimum. 
+ */ + if (numvnodes + 1 <= desiredvnodes) + ; + else if (freevnodes > 0) + vnlru_free_locked(1, NULL); + else { + error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & + MNTK_SUSPEND)); +#if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ + if (error != 0) { + mtx_unlock(&vnode_free_list_mtx); + return (error); + } +#endif + } + vcheckspace(); + atomic_add_long(&numvnodes, 1); + mtx_unlock(&vnode_free_list_mtx); +alloc: + counter_u64_add(vnodes_created, 1); + vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK); + /* + * Locks are given the generic name "vnode" when created. + * Follow the historic practice of using the filesystem + * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. + * + * Locks live in a witness group keyed on their name. Thus, + * when a lock is renamed, it must also move from the witness + * group of its old name to the witness group of its new name. + * + * The change only needs to be made when the vnode moves + * from one filesystem type to another. We ensure that each + * filesystem use a single static name pointer for its tag so + * that we can compare pointers rather than doing a strcmp(). + */ + lo = &vp->v_vnlock->lock_object; + if (lo->lo_name != tag) { + lo->lo_name = tag; + WITNESS_DESTROY(lo); + WITNESS_INIT(lo, tag); + } + /* + * By default, don't allow shared locks unless filesystems opt-in. + */ + vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; + /* + * Finalize various vnode identity bits. + */ + KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); + KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); + KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); + vp->v_type = VNON; + vp->v_tag = tag; + vp->v_op = vops; + v_init_counters(vp); + vp->v_bufobj.bo_ops = &buf_ops_bio; +#ifdef DIAGNOSTIC + if (mp == NULL && vops != &dead_vnodeops) + printf("NULL mp in getnewvnode(9), tag %s\n", tag); +#endif +#ifdef MAC + mac_vnode_init(vp); + if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) + mac_vnode_associate_singlelabel(mp, vp); +#endif + if (mp != NULL) { + vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; + if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) + vp->v_vflag |= VV_NOKNOTE; + } + + /* + * For the filesystems which do not use vfs_hash_insert(), + * still initialize v_hash to have vfs_hash_index() useful. + * E.g., nullfs uses vfs_hash_index() on the lower vnode for + * its own hashing. + */ + vp->v_hash = (uintptr_t)vp >> vnsz2log; + + *vpp = vp; + return (0); +} + +/* + * Delete from old mount point vnode list, if on one. 
+ */ +static void +delmntque(struct vnode *vp) +{ + struct mount *mp; + int active; + + mp = vp->v_mount; + if (mp == NULL) + return; + MNT_ILOCK(mp); + VI_LOCK(vp); + KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, + ("Active vnode list size %d > Vnode list size %d", + mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); + active = vp->v_iflag & VI_ACTIVE; + vp->v_iflag &= ~VI_ACTIVE; + if (active) { + mtx_lock(&mp->mnt_listmtx); + TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); + mp->mnt_activevnodelistsize--; + mtx_unlock(&mp->mnt_listmtx); + } + vp->v_mount = NULL; + VI_UNLOCK(vp); + VNASSERT(mp->mnt_nvnodelistsize > 0, vp, + ("bad mount point vnode list size")); + TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + mp->mnt_nvnodelistsize--; + MNT_REL(mp); + MNT_IUNLOCK(mp); +} + +static void +insmntque_stddtr(struct vnode *vp, void *dtr_arg) +{ + + vp->v_data = NULL; + vp->v_op = &dead_vnodeops; + vgone(vp); + vput(vp); +} + +/* + * Insert into list of vnodes for the new mount point, if available. + */ +int +insmntque1(struct vnode *vp, struct mount *mp, + void (*dtr)(struct vnode *, void *), void *dtr_arg) +{ + + KASSERT(vp->v_mount == NULL, + ("insmntque: vnode already on per mount vnode list")); + VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); + ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); + + /* + * We acquire the vnode interlock early to ensure that the + * vnode cannot be recycled by another process releasing a + * holdcnt on it before we get it on both the vnode list + * and the active vnode list. The mount mutex protects only + * manipulation of the vnode list and the vnode freelist + * mutex protects only manipulation of the active vnode list. + * Hence the need to hold the vnode interlock throughout. + */ + MNT_ILOCK(mp); + VI_LOCK(vp); + if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && + ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || + mp->mnt_nvnodelistsize == 0)) && + (vp->v_vflag & VV_FORCEINSMQ) == 0) { + VI_UNLOCK(vp); + MNT_IUNLOCK(mp); + if (dtr != NULL) + dtr(vp, dtr_arg); + return (EBUSY); + } + vp->v_mount = mp; + MNT_REF(mp); + TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, + ("neg mount point vnode list size")); + mp->mnt_nvnodelistsize++; + KASSERT((vp->v_iflag & VI_ACTIVE) == 0, + ("Activating already active vnode")); + vp->v_iflag |= VI_ACTIVE; + mtx_lock(&mp->mnt_listmtx); + TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); + mp->mnt_activevnodelistsize++; + mtx_unlock(&mp->mnt_listmtx); + VI_UNLOCK(vp); + MNT_IUNLOCK(mp); + return (0); +} + +int +insmntque(struct vnode *vp, struct mount *mp) +{ + + return (insmntque1(vp, mp, insmntque_stddtr, NULL)); +} + +/* + * Flush out and invalidate all buffers associated with a bufobj + * Called with the underlying object locked. + */ +int +bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) +{ + int error; + + BO_LOCK(bo); + if (flags & V_SAVE) { + error = bufobj_wwait(bo, slpflag, slptimeo); + if (error) { + BO_UNLOCK(bo); + return (error); + } + if (bo->bo_dirty.bv_cnt > 0) { + BO_UNLOCK(bo); + if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) + return (error); + /* + * XXX We could save a lock/unlock if this was only + * enabled under INVARIANTS + */ + BO_LOCK(bo); + if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) + panic("vinvalbuf: dirty bufs"); + } + } + /* + * If you alter this loop please notice that interlock is dropped and + * reacquired in flushbuflist. 
Special care is needed to ensure that + * no race conditions occur from this. + */ + do { + error = flushbuflist(&bo->bo_clean, + flags, bo, slpflag, slptimeo); + if (error == 0 && !(flags & V_CLEANONLY)) + error = flushbuflist(&bo->bo_dirty, + flags, bo, slpflag, slptimeo); + if (error != 0 && error != EAGAIN) { + BO_UNLOCK(bo); + return (error); + } + } while (error != 0); + + /* + * Wait for I/O to complete. XXX needs cleaning up. The vnode can + * have write I/O in-progress but if there is a VM object then the + * VM object can also have read-I/O in-progress. + */ + do { + bufobj_wwait(bo, 0, 0); + if ((flags & V_VMIO) == 0) { + BO_UNLOCK(bo); + if (bo->bo_object != NULL) { + VM_OBJECT_WLOCK(bo->bo_object); + vm_object_pip_wait(bo->bo_object, "bovlbx"); + VM_OBJECT_WUNLOCK(bo->bo_object); + } + BO_LOCK(bo); + } + } while (bo->bo_numoutput > 0); + BO_UNLOCK(bo); + + /* + * Destroy the copy in the VM cache, too. + */ + if (bo->bo_object != NULL && + (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { + VM_OBJECT_WLOCK(bo->bo_object); + vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? + OBJPR_CLEANONLY : 0); + VM_OBJECT_WUNLOCK(bo->bo_object); + } + +#ifdef INVARIANTS + BO_LOCK(bo); + if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | + V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || + bo->bo_clean.bv_cnt > 0)) + panic("vinvalbuf: flush failed"); + if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && + bo->bo_dirty.bv_cnt > 0) + panic("vinvalbuf: flush dirty failed"); + BO_UNLOCK(bo); +#endif + return (0); +} + +/* + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. + */ +int +vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) +{ + + CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); + ASSERT_VOP_LOCKED(vp, "vinvalbuf"); + if (vp->v_object != NULL && vp->v_object->handle != vp) + return (0); + return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); +} + +/* + * Flush out buffers on the specified list. + * + */ +static int +flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, + int slptimeo) +{ + struct buf *bp, *nbp; + int retval, error; + daddr_t lblkno; + b_xflags_t xflags; + + ASSERT_BO_WLOCKED(bo); + + retval = 0; + TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { + if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || + ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { + continue; + } + if (nbp != NULL) { + lblkno = nbp->b_lblkno; + xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); + } + retval = EAGAIN; + error = BUF_TIMELOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), + "flushbuf", slpflag, slptimeo); + if (error) { + BO_LOCK(bo); + return (error != ENOLCK ? error : EAGAIN); + } + KASSERT(bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", + bp, bp->b_bufobj, bo)); + /* + * XXX Since there are no node locks for NFS, I + * believe there is a slight chance that a delayed + * write will occur while sleeping just above, so + * check for it. + */ + if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && + (flags & V_SAVE)) { + bremfree(bp); + bp->b_flags |= B_ASYNC; + bwrite(bp); + BO_LOCK(bo); + return (EAGAIN); /* XXX: why not loop ? 
*/ + } + bremfree(bp); + bp->b_flags |= (B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + BO_LOCK(bo); + if (nbp == NULL) + break; + nbp = gbincore(bo, lblkno); + if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) + != xflags) + break; /* nbp invalid */ + } + return (retval); +} + +int +bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) +{ + struct buf *bp; + int error; + daddr_t lblkno; + + ASSERT_BO_LOCKED(bo); + + for (lblkno = startn;;) { +again: + bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); + if (bp == NULL || bp->b_lblkno >= endn || + bp->b_lblkno < startn) + break; + error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | + LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); + if (error != 0) { + BO_RLOCK(bo); + if (error == ENOLCK) + goto again; + return (error); + } + KASSERT(bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", + bp, bp->b_bufobj, bo)); + lblkno = bp->b_lblkno + 1; + if ((bp->b_flags & B_MANAGED) == 0) + bremfree(bp); + bp->b_flags |= B_RELBUF; + /* + * In the VMIO case, use the B_NOREUSE flag to hint that the + * pages backing each buffer in the range are unlikely to be + * reused. Dirty buffers will have the hint applied once + * they've been written. + */ + if ((bp->b_flags & B_VMIO) != 0) + bp->b_flags |= B_NOREUSE; + brelse(bp); + BO_RLOCK(bo); + } + return (0); +} + +/* + * Truncate a file's buffer and pages to a specified length. This + * is in lieu of the old vinvalbuf mechanism, which performed unneeded + * sync activity. + */ +int +vtruncbuf(struct vnode *vp, off_t length, int blksize) +{ + struct buf *bp, *nbp; + struct bufobj *bo; + daddr_t startlbn; + + CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, + vp, blksize, (uintmax_t)length); + + /* + * Round up to the *next* lbn. + */ + startlbn = howmany(length, blksize); + + ASSERT_VOP_LOCKED(vp, "vtruncbuf"); + + bo = &vp->v_bufobj; +restart_unlocked: + BO_LOCK(bo); + + while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) + ; + + if (length > 0) { +restartsync: + TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { + if (bp->b_lblkno > 0) + continue; + /* + * Since we hold the vnode lock this should only + * fail if we're racing with the buf daemon. + */ + if (BUF_LOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, + BO_LOCKPTR(bo)) == ENOLCK) + goto restart_unlocked; + + VNASSERT((bp->b_flags & B_DELWRI), vp, + ("buf(%p) on dirty queue without DELWRI", bp)); + + bremfree(bp); + bawrite(bp); + BO_LOCK(bo); + goto restartsync; + } + } + + bufobj_wwait(bo, 0, 0); + BO_UNLOCK(bo); + vnode_pager_setsize(vp, length); + + return (0); +} + +/* + * Invalidate the cached pages of a file's buffer within the range of block + * numbers [startlbn, endlbn). 
+ */ +void +v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, + int blksize) +{ + struct bufobj *bo; + off_t start, end; + + ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); + + start = blksize * startlbn; + end = blksize * endlbn; + + bo = &vp->v_bufobj; + BO_LOCK(bo); + MPASS(blksize == bo->bo_bsize); + + while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) + ; + + BO_UNLOCK(bo); + vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); +} + +static int +v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, + daddr_t startlbn, daddr_t endlbn) +{ + struct buf *bp, *nbp; + bool anyfreed; + + ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); + ASSERT_BO_LOCKED(bo); + + do { + anyfreed = false; + TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { + if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) + continue; + if (BUF_LOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, + BO_LOCKPTR(bo)) == ENOLCK) { + BO_LOCK(bo); + return (EAGAIN); + } + + bremfree(bp); + bp->b_flags |= B_INVAL | B_RELBUF; + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = true; + + BO_LOCK(bo); + if (nbp != NULL && + (((nbp->b_xflags & BX_VNCLEAN) == 0) || + nbp->b_vp != vp || + (nbp->b_flags & B_DELWRI) != 0)) + return (EAGAIN); + } + + TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { + if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) + continue; + if (BUF_LOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, + BO_LOCKPTR(bo)) == ENOLCK) { + BO_LOCK(bo); + return (EAGAIN); + } + bremfree(bp); + bp->b_flags |= B_INVAL | B_RELBUF; + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = true; + + BO_LOCK(bo); + if (nbp != NULL && + (((nbp->b_xflags & BX_VNDIRTY) == 0) || + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI) == 0)) + return (EAGAIN); + } + } while (anyfreed); + return (0); +} + +static void +buf_vlist_remove(struct buf *bp) +{ + struct bufv *bv; + + KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); + ASSERT_BO_WLOCKED(bp->b_bufobj); + KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != + (BX_VNDIRTY|BX_VNCLEAN), + ("buf_vlist_remove: Buf %p is on two lists", bp)); + if (bp->b_xflags & BX_VNDIRTY) + bv = &bp->b_bufobj->bo_dirty; + else + bv = &bp->b_bufobj->bo_clean; + BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); + TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); + bv->bv_cnt--; + bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); +} + +/* + * Add the buffer to the sorted clean or dirty block list. + * + * NOTE: xflags is passed as a constant, optimizing this inline function! + */ +static void +buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) +{ + struct bufv *bv; + struct buf *n; + int error; + + ASSERT_BO_WLOCKED(bo); + KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, + ("dead bo %p", bo)); + KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, + ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); + bp->b_xflags |= xflags; + if (xflags & BX_VNDIRTY) + bv = &bo->bo_dirty; + else + bv = &bo->bo_clean; + + /* + * Keep the list ordered. Optimize empty list insertion. Assume + * we tend to grow at the tail so lookup_le should usually be cheaper + * than _ge. 
+ */ + if (bv->bv_cnt == 0 || + bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) + TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); + else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) + TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); + else + TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); + error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); + if (error) + panic("buf_vlist_add: Preallocated nodes insufficient."); + bv->bv_cnt++; +} + +/* + * Look up a buffer using the buffer tries. + */ +struct buf * +gbincore(struct bufobj *bo, daddr_t lblkno) +{ + struct buf *bp; + + ASSERT_BO_LOCKED(bo); + bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); + if (bp != NULL) + return (bp); + return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); +} + +/* + * Associate a buffer with a vnode. + */ +void +bgetvp(struct vnode *vp, struct buf *bp) +{ + struct bufobj *bo; + + bo = &vp->v_bufobj; + ASSERT_BO_WLOCKED(bo); + VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); + + CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); + VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, + ("bgetvp: bp already attached! %p", bp)); + + vhold(vp); + bp->b_vp = vp; + bp->b_bufobj = bo; + /* + * Insert onto list for new vnode. + */ + buf_vlist_add(bp, bo, BX_VNCLEAN); +} + +/* + * Disassociate a buffer from a vnode. + */ +void +brelvp(struct buf *bp) +{ + struct bufobj *bo; + struct vnode *vp; + + CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); + + /* + * Delete from old vnode list, if on one. + */ + vp = bp->b_vp; /* XXX */ + bo = bp->b_bufobj; + BO_LOCK(bo); + if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) + buf_vlist_remove(bp); + else + panic("brelvp: Buffer %p not on queue.", bp); + if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { + bo->bo_flag &= ~BO_ONWORKLST; + mtx_lock(&sync_mtx); + LIST_REMOVE(bo, bo_synclist); + syncer_worklist_len--; + mtx_unlock(&sync_mtx); + } + bp->b_vp = NULL; + bp->b_bufobj = NULL; + BO_UNLOCK(bo); + vdrop(vp); +} + +/* + * Add an item to the syncer work queue. 
+ */ +static void +vn_syncer_add_to_worklist(struct bufobj *bo, int delay) +{ + int slot; + + ASSERT_BO_WLOCKED(bo); + + mtx_lock(&sync_mtx); + if (bo->bo_flag & BO_ONWORKLST) + LIST_REMOVE(bo, bo_synclist); + else { + bo->bo_flag |= BO_ONWORKLST; + syncer_worklist_len++; + } + + if (delay > syncer_maxdelay - 2) + delay = syncer_maxdelay - 2; + slot = (syncer_delayno + delay) & syncer_mask; + + LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); + mtx_unlock(&sync_mtx); +} + +static int +sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) +{ + int error, len; + + mtx_lock(&sync_mtx); + len = syncer_worklist_len - sync_vnode_count; + mtx_unlock(&sync_mtx); + error = SYSCTL_OUT(req, &len, sizeof(len)); + return (error); +} + +SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, + sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); + +static struct proc *updateproc; +static void sched_sync(void); +static struct kproc_desc up_kp = { + "syncer", + sched_sync, + &updateproc +}; +SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); + +static int +sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) +{ + struct vnode *vp; + struct mount *mp; + + *bo = LIST_FIRST(slp); + if (*bo == NULL) + return (0); + vp = bo2vnode(*bo); + if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) + return (1); + /* + * We use vhold in case the vnode does not + * successfully sync. vhold prevents the vnode from + * going away when we unlock the sync_mtx so that + * we can acquire the vnode interlock. + */ + vholdl(vp); + mtx_unlock(&sync_mtx); + VI_UNLOCK(vp); + if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { + vdrop(vp); + mtx_lock(&sync_mtx); + return (*bo == LIST_FIRST(slp)); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + (void) VOP_FSYNC(vp, MNT_LAZY, td); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + BO_LOCK(*bo); + if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { + /* + * Put us back on the worklist. The worklist + * routine will remove us from our current + * position and then add us back in at a later + * position. + */ + vn_syncer_add_to_worklist(*bo, syncdelay); + } + BO_UNLOCK(*bo); + vdrop(vp); + mtx_lock(&sync_mtx); + return (0); +} + +static int first_printf = 1; + +/* + * System filesystem synchronizer daemon. + */ +static void +sched_sync(void) +{ + struct synclist *next, *slp; + struct bufobj *bo; + long starttime; + struct thread *td = curthread; + int last_work_seen; + int net_worklist_len; + int syncer_final_iter; + int error; + + last_work_seen = 0; + syncer_final_iter = 0; + syncer_state = SYNCER_RUNNING; + starttime = time_uptime; + td->td_pflags |= TDP_NORUNNINGBUF; + + EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, + SHUTDOWN_PRI_LAST); + + mtx_lock(&sync_mtx); + for (;;) { + if (syncer_state == SYNCER_FINAL_DELAY && + syncer_final_iter == 0) { + mtx_unlock(&sync_mtx); + kproc_suspend_check(td->td_proc); + mtx_lock(&sync_mtx); + } + net_worklist_len = syncer_worklist_len - sync_vnode_count; + if (syncer_state != SYNCER_RUNNING && + starttime != time_uptime) { + if (first_printf) { + printf("\nSyncing disks, vnodes remaining... "); + first_printf = 0; + } + printf("%d ", net_worklist_len); + } + starttime = time_uptime; + + /* + * Push files whose dirty time has expired. Be careful + * of interrupt race on slp queue. + * + * Skip over empty worklist slots when shutting down. 
+ */ + do { + slp = &syncer_workitem_pending[syncer_delayno]; + syncer_delayno += 1; + if (syncer_delayno == syncer_maxdelay) + syncer_delayno = 0; + next = &syncer_workitem_pending[syncer_delayno]; + /* + * If the worklist has wrapped since the + * it was emptied of all but syncer vnodes, + * switch to the FINAL_DELAY state and run + * for one more second. + */ + if (syncer_state == SYNCER_SHUTTING_DOWN && + net_worklist_len == 0 && + last_work_seen == syncer_delayno) { + syncer_state = SYNCER_FINAL_DELAY; + syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; + } + } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && + syncer_worklist_len > 0); + + /* + * Keep track of the last time there was anything + * on the worklist other than syncer vnodes. + * Return to the SHUTTING_DOWN state if any + * new work appears. + */ + if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) + last_work_seen = syncer_delayno; + if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) + syncer_state = SYNCER_SHUTTING_DOWN; + while (!LIST_EMPTY(slp)) { + error = sync_vnode(slp, &bo, td); + if (error == 1) { + LIST_REMOVE(bo, bo_synclist); + LIST_INSERT_HEAD(next, bo, bo_synclist); + continue; + } + + if (first_printf == 0) { + /* + * Drop the sync mutex, because some watchdog + * drivers need to sleep while patting + */ + mtx_unlock(&sync_mtx); + wdog_kern_pat(WD_LASTVAL); + mtx_lock(&sync_mtx); + } + + } + if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) + syncer_final_iter--; + /* + * The variable rushjob allows the kernel to speed up the + * processing of the filesystem syncer process. A rushjob + * value of N tells the filesystem syncer to process the next + * N seconds worth of work on its queue ASAP. Currently rushjob + * is used by the soft update code to speed up the filesystem + * syncer process when the incore state is getting so far + * ahead of the disk that the kernel memory pool is being + * threatened with exhaustion. + */ + if (rushjob > 0) { + rushjob -= 1; + continue; + } + /* + * Just sleep for a short period of time between + * iterations when shutting down to allow some I/O + * to happen. + * + * If it has taken us less than a second to process the + * current work, then wait. Otherwise start right over + * again. We can still lose time if any single round + * takes more than two seconds, but it does not really + * matter as we are just trying to generally pace the + * filesystem activity. + */ + if (syncer_state != SYNCER_RUNNING || + time_uptime == starttime) { + thread_lock(td); + sched_prio(td, PPAUSE); + thread_unlock(td); + } + if (syncer_state != SYNCER_RUNNING) + cv_timedwait(&sync_wakeup, &sync_mtx, + hz / SYNCER_SHUTDOWN_SPEEDUP); + else if (time_uptime == starttime) + cv_timedwait(&sync_wakeup, &sync_mtx, hz); + } +} + +/* + * Request the syncer daemon to speed up its work. + * We never push it to speed up more than half of its + * normal turn time, otherwise it could take over the cpu. + */ +int +speedup_syncer(void) +{ + int ret = 0; + + mtx_lock(&sync_mtx); + if (rushjob < syncdelay / 2) { + rushjob += 1; + stat_rush_requests += 1; + ret = 1; + } + mtx_unlock(&sync_mtx); + cv_broadcast(&sync_wakeup); + return (ret); +} + +/* + * Tell the syncer to speed up its work and run though its work + * list several times, then tell it to shut down. 
+ */ +static void +syncer_shutdown(void *arg, int howto) +{ + + if (howto & RB_NOSYNC) + return; + mtx_lock(&sync_mtx); + syncer_state = SYNCER_SHUTTING_DOWN; + rushjob = 0; + mtx_unlock(&sync_mtx); + cv_broadcast(&sync_wakeup); + kproc_shutdown(arg, howto); +} + +void +syncer_suspend(void) +{ + + syncer_shutdown(updateproc, 0); +} + +void +syncer_resume(void) +{ + + mtx_lock(&sync_mtx); + first_printf = 1; + syncer_state = SYNCER_RUNNING; + mtx_unlock(&sync_mtx); + cv_broadcast(&sync_wakeup); + kproc_resume(updateproc); +} + +/* + * Reassign a buffer from one vnode to another. + * Used to assign file specific control information + * (indirect blocks) to the vnode to which they belong. + */ +void +reassignbuf(struct buf *bp) +{ + struct vnode *vp; + struct bufobj *bo; + int delay; +#ifdef INVARIANTS + struct bufv *bv; +#endif + + vp = bp->b_vp; + bo = bp->b_bufobj; + ++reassignbufcalls; + + CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", + bp, bp->b_vp, bp->b_flags); + /* + * B_PAGING flagged buffers cannot be reassigned because their vp + * is not fully linked in. + */ + if (bp->b_flags & B_PAGING) + panic("cannot reassign paging buffer"); + + /* + * Delete from old vnode list, if on one. + */ + BO_LOCK(bo); + if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) + buf_vlist_remove(bp); + else + panic("reassignbuf: Buffer %p not on queue.", bp); + /* + * If dirty, put on list of dirty buffers; otherwise insert onto list + * of clean buffers. + */ + if (bp->b_flags & B_DELWRI) { + if ((bo->bo_flag & BO_ONWORKLST) == 0) { + switch (vp->v_type) { + case VDIR: + delay = dirdelay; + break; + case VCHR: + delay = metadelay; + break; + default: + delay = filedelay; + } + vn_syncer_add_to_worklist(bo, delay); + } + buf_vlist_add(bp, bo, BX_VNDIRTY); + } else { + buf_vlist_add(bp, bo, BX_VNCLEAN); + + if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { + mtx_lock(&sync_mtx); + LIST_REMOVE(bo, bo_synclist); + syncer_worklist_len--; + mtx_unlock(&sync_mtx); + bo->bo_flag &= ~BO_ONWORKLST; + } + } +#ifdef INVARIANTS + bv = &bo->bo_clean; + bp = TAILQ_FIRST(&bv->bv_hd); + KASSERT(bp == NULL || bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); + bp = TAILQ_LAST(&bv->bv_hd, buflists); + KASSERT(bp == NULL || bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); + bv = &bo->bo_dirty; + bp = TAILQ_FIRST(&bv->bv_hd); + KASSERT(bp == NULL || bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); + bp = TAILQ_LAST(&bv->bv_hd, buflists); + KASSERT(bp == NULL || bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); +#endif + BO_UNLOCK(bo); +} + +static void +v_init_counters(struct vnode *vp) +{ + + VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, + vp, ("%s called for an initialized vnode", __FUNCTION__)); + ASSERT_VI_UNLOCKED(vp, __FUNCTION__); + + refcount_init(&vp->v_holdcnt, 1); + refcount_init(&vp->v_usecount, 1); +} + +static void +v_incr_usecount_locked(struct vnode *vp) +{ + + ASSERT_VI_LOCKED(vp, __func__); + if ((vp->v_iflag & VI_OWEINACT) != 0) { + VNASSERT(vp->v_usecount == 0, vp, + ("vnode with usecount and VI_OWEINACT set")); + vp->v_iflag &= ~VI_OWEINACT; + } + refcount_acquire(&vp->v_usecount); + v_incr_devcount(vp); +} + +/* + * Increment the use count on the vnode, taking care to reference + * the driver's usecount if this is a chardev. 
+ */ +static void +v_incr_usecount(struct vnode *vp) +{ + + ASSERT_VI_UNLOCKED(vp, __func__); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + + if (vp->v_type != VCHR && + refcount_acquire_if_not_zero(&vp->v_usecount)) { + VNODE_REFCOUNT_FENCE_ACQ(); + VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, + ("vnode with usecount and VI_OWEINACT set")); + } else { + VI_LOCK(vp); + v_incr_usecount_locked(vp); + VI_UNLOCK(vp); + } +} + +/* + * Increment si_usecount of the associated device, if any. + */ +static void +v_incr_devcount(struct vnode *vp) +{ + + ASSERT_VI_LOCKED(vp, __FUNCTION__); + if (vp->v_type == VCHR && vp->v_rdev != NULL) { + dev_lock(); + vp->v_rdev->si_usecount++; + dev_unlock(); + } +} + +/* + * Decrement si_usecount of the associated device, if any. + */ +static void +v_decr_devcount(struct vnode *vp) +{ + + ASSERT_VI_LOCKED(vp, __FUNCTION__); + if (vp->v_type == VCHR && vp->v_rdev != NULL) { + dev_lock(); + vp->v_rdev->si_usecount--; + dev_unlock(); + } +} + +/* + * Grab a particular vnode from the free list, increment its + * reference count and lock it. VI_DOOMED is set if the vnode + * is being destroyed. Only callers who specify LK_RETRY will + * see doomed vnodes. If inactive processing was delayed in + * vput try to do it here. + * + * Notes on lockless counter manipulation: + * _vhold, vputx and other routines make various decisions based + * on either holdcnt or usecount being 0. As long as either counter + * is not transitioning 0->1 nor 1->0, the manipulation can be done + * with atomic operations. Otherwise the interlock is taken covering + * both the atomic and additional actions. + */ +int +vget(struct vnode *vp, int flags, struct thread *td) +{ + int error, oweinact; + + VNASSERT((flags & LK_TYPE_MASK) != 0, vp, + ("vget: invalid lock operation")); + + if ((flags & LK_INTERLOCK) != 0) + ASSERT_VI_LOCKED(vp, __func__); + else + ASSERT_VI_UNLOCKED(vp, __func__); + if ((flags & LK_VNHELD) != 0) + VNASSERT((vp->v_holdcnt > 0), vp, + ("vget: LK_VNHELD passed but vnode not held")); + + CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); + + if ((flags & LK_VNHELD) == 0) + _vhold(vp, (flags & LK_INTERLOCK) != 0); + + if ((error = vn_lock(vp, flags)) != 0) { + vdrop(vp); + CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, + vp); + return (error); + } + if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) + panic("vget: vn_lock failed to return ENOENT\n"); + /* + * We don't guarantee that any particular close will + * trigger inactive processing so just make a best effort + * here at preventing a reference to a removed file. If + * we don't succeed no harm is done. + * + * Upgrade our holdcnt to a usecount. + */ + if (vp->v_type == VCHR || + !refcount_acquire_if_not_zero(&vp->v_usecount)) { + VI_LOCK(vp); + if ((vp->v_iflag & VI_OWEINACT) == 0) { + oweinact = 0; + } else { + oweinact = 1; + vp->v_iflag &= ~VI_OWEINACT; + VNODE_REFCOUNT_FENCE_REL(); + } + refcount_acquire(&vp->v_usecount); + v_incr_devcount(vp); + if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE && + (flags & LK_NOWAIT) == 0) + vinactive(vp, td); + VI_UNLOCK(vp); + } + return (0); +} + +/* + * Increase the reference (use) and hold count of a vnode. + * This will also remove the vnode from the free list if it is presently free. 
+ */ +void +vref(struct vnode *vp) +{ + + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + _vhold(vp, false); + v_incr_usecount(vp); +} + +void +vrefl(struct vnode *vp) +{ + + ASSERT_VI_LOCKED(vp, __func__); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + _vhold(vp, true); + v_incr_usecount_locked(vp); +} + +void +vrefact(struct vnode *vp) +{ + + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + if (__predict_false(vp->v_type == VCHR)) { + VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp, + ("%s: wrong ref counts", __func__)); + vref(vp); + return; + } +#ifdef INVARIANTS + int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); + VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__)); + old = atomic_fetchadd_int(&vp->v_usecount, 1); + VNASSERT(old > 0, vp, ("%s: wrong use count", __func__)); +#else + refcount_acquire(&vp->v_holdcnt); + refcount_acquire(&vp->v_usecount); +#endif +} + +/* + * Return reference count of a vnode. + * + * The results of this call are only guaranteed when some mechanism is used to + * stop other processes from gaining references to the vnode. This may be the + * case if the caller holds the only reference. This is also useful when stale + * data is acceptable as race conditions may be accounted for by some other + * means. + */ +int +vrefcnt(struct vnode *vp) +{ + + return (vp->v_usecount); +} + +#define VPUTX_VRELE 1 +#define VPUTX_VPUT 2 +#define VPUTX_VUNREF 3 + +/* + * Decrement the use and hold counts for a vnode. + * + * See an explanation near vget() as to why atomic operation is safe. + */ +static void +vputx(struct vnode *vp, int func) +{ + int error; + + KASSERT(vp != NULL, ("vputx: null vp")); + if (func == VPUTX_VUNREF) + ASSERT_VOP_LOCKED(vp, "vunref"); + else if (func == VPUTX_VPUT) + ASSERT_VOP_LOCKED(vp, "vput"); + else + KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); + ASSERT_VI_UNLOCKED(vp, __func__); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + + if (vp->v_type != VCHR && + refcount_release_if_not_last(&vp->v_usecount)) { + if (func == VPUTX_VPUT) + VOP_UNLOCK(vp, 0); + vdrop(vp); + return; + } + + VI_LOCK(vp); + + /* + * We want to hold the vnode until the inactive finishes to + * prevent vgone() races. We drop the use count here and the + * hold count below when we're done. + */ + if (!refcount_release(&vp->v_usecount) || + (vp->v_iflag & VI_DOINGINACT)) { + if (func == VPUTX_VPUT) + VOP_UNLOCK(vp, 0); + v_decr_devcount(vp); + vdropl(vp); + return; + } + + v_decr_devcount(vp); + + error = 0; + + if (vp->v_usecount != 0) { + vn_printf(vp, "vputx: usecount not zero for vnode "); + panic("vputx: usecount not zero"); + } + + CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); + + /* + * We must call VOP_INACTIVE with the node locked. Mark + * as VI_DOINGINACT to avoid recursion. + */ + vp->v_iflag |= VI_OWEINACT; + switch (func) { + case VPUTX_VRELE: + error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); + VI_LOCK(vp); + break; + case VPUTX_VPUT: + if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { + error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | + LK_NOWAIT); + VI_LOCK(vp); + } + break; + case VPUTX_VUNREF: + if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { + error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); + VI_LOCK(vp); + } + break; + } + VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp, + ("vnode with usecount and VI_OWEINACT set")); + if (error == 0) { + if (vp->v_iflag & VI_OWEINACT) + vinactive(vp, curthread); + if (func != VPUTX_VUNREF) + VOP_UNLOCK(vp, 0); + } + vdropl(vp); +} + +/* + * Vnode put/release. 
+ * If count drops to zero, call inactive routine and return to freelist. + */ +void +vrele(struct vnode *vp) +{ + + vputx(vp, VPUTX_VRELE); +} + +/* + * Release an already locked vnode. This give the same effects as + * unlock+vrele(), but takes less time and avoids releasing and + * re-aquiring the lock (as vrele() acquires the lock internally.) + */ +void +vput(struct vnode *vp) +{ + + vputx(vp, VPUTX_VPUT); +} + +/* + * Release an exclusively locked vnode. Do not unlock the vnode lock. + */ +void +vunref(struct vnode *vp) +{ + + vputx(vp, VPUTX_VUNREF); +} + +/* + * Increase the hold count and activate if this is the first reference. + */ +void +_vhold(struct vnode *vp, bool locked) +{ + struct mount *mp; + + if (locked) + ASSERT_VI_LOCKED(vp, __func__); + else + ASSERT_VI_UNLOCKED(vp, __func__); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + if (!locked) { + if (refcount_acquire_if_not_zero(&vp->v_holdcnt)) { + VNODE_REFCOUNT_FENCE_ACQ(); + VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, + ("_vhold: vnode with holdcnt is free")); + return; + } + VI_LOCK(vp); + } + if ((vp->v_iflag & VI_FREE) == 0) { + refcount_acquire(&vp->v_holdcnt); + if (!locked) + VI_UNLOCK(vp); + return; + } + VNASSERT(vp->v_holdcnt == 0, vp, + ("%s: wrong hold count", __func__)); + VNASSERT(vp->v_op != NULL, vp, + ("%s: vnode already reclaimed.", __func__)); + /* + * Remove a vnode from the free list, mark it as in use, + * and put it on the active list. + */ + VNASSERT(vp->v_mount != NULL, vp, + ("_vhold: vnode not on per mount vnode list")); + mp = vp->v_mount; + mtx_lock(&mp->mnt_listmtx); + if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) { + TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist); + mp->mnt_tmpfreevnodelistsize--; + vp->v_mflag &= ~VMP_TMPMNTFREELIST; + } else { + mtx_lock(&vnode_free_list_mtx); + TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); + freevnodes--; + mtx_unlock(&vnode_free_list_mtx); + } + KASSERT((vp->v_iflag & VI_ACTIVE) == 0, + ("Activating already active vnode")); + vp->v_iflag &= ~VI_FREE; + vp->v_iflag |= VI_ACTIVE; + TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); + mp->mnt_activevnodelistsize++; + mtx_unlock(&mp->mnt_listmtx); + refcount_acquire(&vp->v_holdcnt); + if (!locked) + VI_UNLOCK(vp); +} + +/* + * Drop the hold count of the vnode. If this is the last reference to + * the vnode we place it on the free list unless it has been vgone'd + * (marked VI_DOOMED) in which case we will free it. + * + * Because the vnode vm object keeps a hold reference on the vnode if + * there is at least one resident non-cached page, the vnode cannot + * leave the active list without the page cleanup done. + */ +void +_vdrop(struct vnode *vp, bool locked) +{ + struct bufobj *bo; + struct mount *mp; + int active; + + if (locked) + ASSERT_VI_LOCKED(vp, __func__); + else + ASSERT_VI_UNLOCKED(vp, __func__); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + if ((int)vp->v_holdcnt <= 0) + panic("vdrop: holdcnt %d", vp->v_holdcnt); + if (!locked) { + if (refcount_release_if_not_last(&vp->v_holdcnt)) + return; + VI_LOCK(vp); + } + if (refcount_release(&vp->v_holdcnt) == 0) { + VI_UNLOCK(vp); + return; + } + if ((vp->v_iflag & VI_DOOMED) == 0) { + /* + * Mark a vnode as free: remove it from its active list + * and put it up for recycling on the freelist. 
+ */ + VNASSERT(vp->v_op != NULL, vp, + ("vdropl: vnode already reclaimed.")); + VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, + ("vnode already free")); + VNASSERT(vp->v_holdcnt == 0, vp, + ("vdropl: freeing when we shouldn't")); + active = vp->v_iflag & VI_ACTIVE; + if ((vp->v_iflag & VI_OWEINACT) == 0) { + vp->v_iflag &= ~VI_ACTIVE; + mp = vp->v_mount; + if (mp != NULL) { + mtx_lock(&mp->mnt_listmtx); + if (active) { + TAILQ_REMOVE(&mp->mnt_activevnodelist, + vp, v_actfreelist); + mp->mnt_activevnodelistsize--; + } + TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, + vp, v_actfreelist); + mp->mnt_tmpfreevnodelistsize++; + vp->v_iflag |= VI_FREE; + vp->v_mflag |= VMP_TMPMNTFREELIST; + VI_UNLOCK(vp); + if (mp->mnt_tmpfreevnodelistsize >= + mnt_free_list_batch) + vnlru_return_batch_locked(mp); + mtx_unlock(&mp->mnt_listmtx); + } else { + VNASSERT(active == 0, vp, + ("vdropl: active vnode not on per mount " + "vnode list")); + mtx_lock(&vnode_free_list_mtx); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, + v_actfreelist); + freevnodes++; + vp->v_iflag |= VI_FREE; + VI_UNLOCK(vp); + mtx_unlock(&vnode_free_list_mtx); + } + } else { + VI_UNLOCK(vp); + counter_u64_add(free_owe_inact, 1); + } + return; + } + /* + * The vnode has been marked for destruction, so free it. + * + * The vnode will be returned to the zone where it will + * normally remain until it is needed for another vnode. We + * need to cleanup (or verify that the cleanup has already + * been done) any residual data left from its current use + * so as not to contaminate the freshly allocated vnode. + */ + CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); + atomic_subtract_long(&numvnodes, 1); + bo = &vp->v_bufobj; + VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, + ("cleaned vnode still on the free list.")); + VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); + VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); + VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); + VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); + VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); + VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); + VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, + ("clean blk trie not empty")); + VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); + VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, + ("dirty blk trie not empty")); + VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); + VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); + VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); + VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, + ("Dangling rangelock waiters")); + VI_UNLOCK(vp); +#ifdef MAC + mac_vnode_destroy(vp); +#endif + if (vp->v_pollinfo != NULL) { + destroy_vpollinfo(vp->v_pollinfo); + vp->v_pollinfo = NULL; + } +#ifdef INVARIANTS + /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ + vp->v_op = NULL; +#endif + vp->v_mountedhere = NULL; + vp->v_unpcb = NULL; + vp->v_rdev = NULL; + vp->v_fifoinfo = NULL; + vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + vp->v_iflag = 0; + vp->v_vflag = 0; + bo->bo_flag = 0; + uma_zfree(vnode_zone, vp); +} + +/* + * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT + * flags. DOINGINACT prevents us from recursing in calls to vinactive. + * OWEINACT tracks whether a vnode missed a call to inactive due to a + * failed lock upgrade. 
+ */ +void +vinactive(struct vnode *vp, struct thread *td) +{ + struct vm_object *obj; + + ASSERT_VOP_ELOCKED(vp, "vinactive"); + ASSERT_VI_LOCKED(vp, "vinactive"); + VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, + ("vinactive: recursed on VI_DOINGINACT")); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + vp->v_iflag |= VI_DOINGINACT; + vp->v_iflag &= ~VI_OWEINACT; + VI_UNLOCK(vp); + /* + * Before moving off the active list, we must be sure that any + * modified pages are converted into the vnode's dirty + * buffers, since these will no longer be checked once the + * vnode is on the inactive list. + * + * The write-out of the dirty pages is asynchronous. At the + * point that VOP_INACTIVE() is called, there could still be + * pending I/O and dirty pages in the object. + */ + if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && + (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { + VM_OBJECT_WLOCK(obj); + vm_object_page_clean(obj, 0, 0, 0); + VM_OBJECT_WUNLOCK(obj); + } + VOP_INACTIVE(vp, td); + VI_LOCK(vp); + VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, + ("vinactive: lost VI_DOINGINACT")); + vp->v_iflag &= ~VI_DOINGINACT; +} + +/* + * Remove any vnodes in the vnode table belonging to mount point mp. + * + * If FORCECLOSE is not specified, there should not be any active ones, + * return error if any are found (nb: this is a user error, not a + * system error). If FORCECLOSE is specified, detach any active vnodes + * that are found. + * + * If WRITECLOSE is set, only flush out regular file vnodes open for + * writing. + * + * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. + * + * `rootrefs' specifies the base reference count for the root vnode + * of this filesystem. The root vnode is considered busy if its + * v_usecount exceeds this value. On a successful return, vflush(, td) + * will call vrele() on the root vnode exactly rootrefs times. + * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must + * be zero. + */ +#ifdef DIAGNOSTIC +static int busyprt = 0; /* print out busy vnodes */ +SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); +#endif + +int +vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) +{ + struct vnode *vp, *mvp, *rootvp = NULL; + struct vattr vattr; + int busy = 0, error; + + CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, + rootrefs, flags); + if (rootrefs > 0) { + KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, + ("vflush: bad args")); + /* + * Get the filesystem root vnode. We can vput() it + * immediately, since with rootrefs > 0, it won't go away. + */ + if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { + CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", + __func__, error); + return (error); + } + vput(rootvp); + } +loop: + MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { + vholdl(vp); + error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); + if (error) { + vdrop(vp); + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + goto loop; + } + /* + * Skip over a vnodes marked VV_SYSTEM. + */ + if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { + VOP_UNLOCK(vp, 0); + vdrop(vp); + continue; + } + /* + * If WRITECLOSE is set, flush out unlinked but still open + * files (even if open only for reading) and regular file + * vnodes open for writing. 
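+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: the usual caller of vflush() is a filesystem's
+ * unmount path, along the lines of
+ *
+ *	flags = (mntflags & MNT_FORCE) != 0 ? FORCECLOSE : 0;
+ *	error = vflush(mp, 0, flags, td);
+ *	if (error != 0)
+ *		return (error);
+ *
+ * while a downgrade to read-only typically passes WRITECLOSE.)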
+ */ + if (flags & WRITECLOSE) { + if (vp->v_object != NULL) { + VM_OBJECT_WLOCK(vp->v_object); + vm_object_page_clean(vp->v_object, 0, 0, 0); + VM_OBJECT_WUNLOCK(vp->v_object); + } + error = VOP_FSYNC(vp, MNT_WAIT, td); + if (error != 0) { + VOP_UNLOCK(vp, 0); + vdrop(vp); + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + return (error); + } + error = VOP_GETATTR(vp, &vattr, td->td_ucred); + VI_LOCK(vp); + + if ((vp->v_type == VNON || + (error == 0 && vattr.va_nlink > 0)) && + (vp->v_writecount <= 0 || vp->v_type != VREG)) { + VOP_UNLOCK(vp, 0); + vdropl(vp); + continue; + } + } else + VI_LOCK(vp); + /* + * With v_usecount == 0, all we need to do is clear out the + * vnode data structures and we are done. + * + * If FORCECLOSE is set, forcibly close the vnode. + */ + if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { + vgonel(vp); + } else { + busy++; +#ifdef DIAGNOSTIC + if (busyprt) + vn_printf(vp, "vflush: busy vnode "); +#endif + } + VOP_UNLOCK(vp, 0); + vdropl(vp); + } + if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { + /* + * If just the root vnode is busy, and if its refcount + * is equal to `rootrefs', then go ahead and kill it. + */ + VI_LOCK(rootvp); + KASSERT(busy > 0, ("vflush: not busy")); + VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, + ("vflush: usecount %d < rootrefs %d", + rootvp->v_usecount, rootrefs)); + if (busy == 1 && rootvp->v_usecount == rootrefs) { + VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); + vgone(rootvp); + VOP_UNLOCK(rootvp, 0); + busy = 0; + } else + VI_UNLOCK(rootvp); + } + if (busy) { + CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, + busy); + return (EBUSY); + } + for (; rootrefs > 0; rootrefs--) + vrele(rootvp); + return (0); +} + +/* + * Recycle an unused vnode to the front of the free list. + */ +int +vrecycle(struct vnode *vp) +{ + int recycled; + + VI_LOCK(vp); + recycled = vrecyclel(vp); + VI_UNLOCK(vp); + return (recycled); +} + +/* + * vrecycle, with the vp interlock held. + */ +int +vrecyclel(struct vnode *vp) +{ + int recycled; + + ASSERT_VOP_ELOCKED(vp, __func__); + ASSERT_VI_LOCKED(vp, __func__); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + recycled = 0; + if (vp->v_usecount == 0) { + recycled = 1; + vgonel(vp); + } + return (recycled); +} + +/* + * Eliminate all activity associated with a vnode + * in preparation for reuse. + */ +void +vgone(struct vnode *vp) +{ + VI_LOCK(vp); + vgonel(vp); + VI_UNLOCK(vp); +} + +static void +notify_lowervp_vfs_dummy(struct mount *mp __unused, + struct vnode *lowervp __unused) +{ +} + +/* + * Notify upper mounts about reclaimed or unlinked vnode. 
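+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: a stacking filesystem receives these notifications
+ * by providing the corresponding vfsops hooks, for example
+ *
+ *	static struct vfsops foofs_vfsops = {
+ *		...
+ *		.vfs_reclaim_lowervp =	foofs_reclaim_lowervp,
+ *		.vfs_unlink_lowervp =	foofs_unlink_lowervp,
+ *	};
+ *
+ * where foofs is a hypothetical upper layer and both callbacks take
+ * a (struct mount *, struct vnode *) pair, as the dummy handler
+ * notify_lowervp_vfs_dummy() above illustrates.)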
+ */ +void +vfs_notify_upper(struct vnode *vp, int event) +{ + static struct vfsops vgonel_vfsops = { + .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, + .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, + }; + struct mount *mp, *ump, *mmp; + + mp = vp->v_mount; + if (mp == NULL) + return; + + MNT_ILOCK(mp); + if (TAILQ_EMPTY(&mp->mnt_uppers)) + goto unlock; + MNT_IUNLOCK(mp); + mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); + mmp->mnt_op = &vgonel_vfsops; + mmp->mnt_kern_flag |= MNTK_MARKER; + MNT_ILOCK(mp); + mp->mnt_kern_flag |= MNTK_VGONE_UPPER; + for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { + if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { + ump = TAILQ_NEXT(ump, mnt_upper_link); + continue; + } + TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); + MNT_IUNLOCK(mp); + switch (event) { + case VFS_NOTIFY_UPPER_RECLAIM: + VFS_RECLAIM_LOWERVP(ump, vp); + break; + case VFS_NOTIFY_UPPER_UNLINK: + VFS_UNLINK_LOWERVP(ump, vp); + break; + default: + KASSERT(0, ("invalid event %d", event)); + break; + } + MNT_ILOCK(mp); + ump = TAILQ_NEXT(mmp, mnt_upper_link); + TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); + } + free(mmp, M_TEMP); + mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; + if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { + mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; + wakeup(&mp->mnt_uppers); + } +unlock: + MNT_IUNLOCK(mp); +} + +/* + * vgone, with the vp interlock held. + */ +static void +vgonel(struct vnode *vp) +{ + struct thread *td; + int oweinact; + int active; + struct mount *mp; + + ASSERT_VOP_ELOCKED(vp, "vgonel"); + ASSERT_VI_LOCKED(vp, "vgonel"); + VNASSERT(vp->v_holdcnt, vp, + ("vgonel: vp %p has no reference.", vp)); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + td = curthread; + + /* + * Don't vgonel if we're already doomed. + */ + if (vp->v_iflag & VI_DOOMED) + return; + vp->v_iflag |= VI_DOOMED; + + /* + * Check to see if the vnode is in use. If so, we have to call + * VOP_CLOSE() and VOP_INACTIVE(). + */ + active = vp->v_usecount; + oweinact = (vp->v_iflag & VI_OWEINACT); + VI_UNLOCK(vp); + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); + + /* + * If purging an active vnode, it must be closed and + * deactivated before being reclaimed. + */ + if (active) + VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); + if (oweinact || active) { + VI_LOCK(vp); + if ((vp->v_iflag & VI_DOINGINACT) == 0) + vinactive(vp, td); + VI_UNLOCK(vp); + } + if (vp->v_type == VSOCK) + vfs_unp_reclaim(vp); + + /* + * Clean out any buffers associated with the vnode. + * If the flush fails, just toss the buffers. + */ + mp = NULL; + if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) + (void) vn_start_secondary_write(vp, &mp, V_WAIT); + if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { + while (vinvalbuf(vp, 0, 0, 0) != 0) + ; + } + + BO_LOCK(&vp->v_bufobj); + KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && + vp->v_bufobj.bo_dirty.bv_cnt == 0 && + TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && + vp->v_bufobj.bo_clean.bv_cnt == 0, + ("vp %p bufobj not invalidated", vp)); + + /* + * For VMIO bufobj, BO_DEAD is set in vm_object_terminate() + * after the object's page queue is flushed. + */ + if (vp->v_bufobj.bo_object == NULL) + vp->v_bufobj.bo_flag |= BO_DEAD; + BO_UNLOCK(&vp->v_bufobj); + + /* + * Reclaim the vnode. 
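+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: a typical VOP_RECLAIM implementation detaches the
+ * filesystem-private state so that the assertions in _vdrop() hold,
+ * roughly
+ *
+ *	static int
+ *	foofs_reclaim(struct vop_reclaim_args *ap)
+ *	{
+ *		struct vnode *vp = ap->a_vp;
+ *
+ *		vfs_hash_remove(vp);
+ *		free(vp->v_data, M_FOONODE);
+ *		vp->v_data = NULL;
+ *		return (0);
+ *	}
+ *
+ * with foofs and M_FOONODE as hypothetical names.)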
+ */ + if (VOP_RECLAIM(vp, td)) + panic("vgone: cannot reclaim"); + if (mp != NULL) + vn_finished_secondary_write(mp); + VNASSERT(vp->v_object == NULL, vp, + ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); + /* + * Clear the advisory locks and wake up waiting threads. + */ + (void)VOP_ADVLOCKPURGE(vp); + vp->v_lockf = NULL; + /* + * Delete from old mount point vnode list. + */ + delmntque(vp); + cache_purge(vp); + /* + * Done with purge, reset to the standard lock and invalidate + * the vnode. + */ + VI_LOCK(vp); + vp->v_vnlock = &vp->v_lock; + vp->v_op = &dead_vnodeops; + vp->v_tag = "none"; + vp->v_type = VBAD; +} + +/* + * Calculate the total number of references to a special device. + */ +int +vcount(struct vnode *vp) +{ + int count; + + dev_lock(); + count = vp->v_rdev->si_usecount; + dev_unlock(); + return (count); +} + +/* + * Same as above, but using the struct cdev *as argument + */ +int +count_dev(struct cdev *dev) +{ + int count; + + dev_lock(); + count = dev->si_usecount; + dev_unlock(); + return(count); +} + +/* + * Print out a description of a vnode. + */ +static char *typename[] = +{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", + "VMARKER"}; + +void +vn_printf(struct vnode *vp, const char *fmt, ...) +{ + va_list ap; + char buf[256], buf2[16]; + u_long flags; + + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); + printf("%p: ", (void *)vp); + printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); + printf(" usecount %d, writecount %d, refcount %d", + vp->v_usecount, vp->v_writecount, vp->v_holdcnt); + switch (vp->v_type) { + case VDIR: + printf(" mountedhere %p\n", vp->v_mountedhere); + break; + case VCHR: + printf(" rdev %p\n", vp->v_rdev); + break; + case VSOCK: + printf(" socket %p\n", vp->v_unpcb); + break; + case VFIFO: + printf(" fifoinfo %p\n", vp->v_fifoinfo); + break; + default: + printf("\n"); + break; + } + buf[0] = '\0'; + buf[1] = '\0'; + if (vp->v_vflag & VV_ROOT) + strlcat(buf, "|VV_ROOT", sizeof(buf)); + if (vp->v_vflag & VV_ISTTY) + strlcat(buf, "|VV_ISTTY", sizeof(buf)); + if (vp->v_vflag & VV_NOSYNC) + strlcat(buf, "|VV_NOSYNC", sizeof(buf)); + if (vp->v_vflag & VV_ETERNALDEV) + strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); + if (vp->v_vflag & VV_CACHEDLABEL) + strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); + if (vp->v_vflag & VV_COPYONWRITE) + strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); + if (vp->v_vflag & VV_SYSTEM) + strlcat(buf, "|VV_SYSTEM", sizeof(buf)); + if (vp->v_vflag & VV_PROCDEP) + strlcat(buf, "|VV_PROCDEP", sizeof(buf)); + if (vp->v_vflag & VV_NOKNOTE) + strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); + if (vp->v_vflag & VV_DELETED) + strlcat(buf, "|VV_DELETED", sizeof(buf)); + if (vp->v_vflag & VV_MD) + strlcat(buf, "|VV_MD", sizeof(buf)); + if (vp->v_vflag & VV_FORCEINSMQ) + strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); + flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | + VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | + VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); + if (flags != 0) { + snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); + strlcat(buf, buf2, sizeof(buf)); + } + if (vp->v_iflag & VI_MOUNT) + strlcat(buf, "|VI_MOUNT", sizeof(buf)); + if (vp->v_iflag & VI_DOOMED) + strlcat(buf, "|VI_DOOMED", sizeof(buf)); + if (vp->v_iflag & VI_FREE) + strlcat(buf, "|VI_FREE", sizeof(buf)); + if (vp->v_iflag & VI_ACTIVE) + strlcat(buf, "|VI_ACTIVE", sizeof(buf)); + if (vp->v_iflag & VI_DOINGINACT) + strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); + if (vp->v_iflag 
& VI_OWEINACT) + strlcat(buf, "|VI_OWEINACT", sizeof(buf)); + if (vp->v_iflag & VI_TEXT_REF) + strlcat(buf, "|VI_TEXT_REF", sizeof(buf)); + flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE | + VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT | VI_TEXT_REF); + if (flags != 0) { + snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); + strlcat(buf, buf2, sizeof(buf)); + } + printf(" flags (%s)\n", buf + 1); + if (mtx_owned(VI_MTX(vp))) + printf(" VI_LOCKed"); + if (vp->v_object != NULL) + printf(" v_object %p ref %d pages %d " + "cleanbuf %d dirtybuf %d\n", + vp->v_object, vp->v_object->ref_count, + vp->v_object->resident_page_count, + vp->v_bufobj.bo_clean.bv_cnt, + vp->v_bufobj.bo_dirty.bv_cnt); + printf(" "); + lockmgr_printinfo(vp->v_vnlock); + if (vp->v_data != NULL) + VOP_PRINT(vp); +} + +#ifdef DDB +/* + * List all of the locked vnodes in the system. + * Called when debugging the kernel. + */ +DB_SHOW_COMMAND(lockedvnods, lockedvnodes) +{ + struct mount *mp; + struct vnode *vp; + + /* + * Note: because this is DDB, we can't obey the locking semantics + * for these structures, which means we could catch an inconsistent + * state and dereference a nasty pointer. Not much to be done + * about that. + */ + db_printf("Locked vnodes\n"); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { + if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) + vn_printf(vp, "vnode "); + } + } +} + +/* + * Show details about the given vnode. + */ +DB_SHOW_COMMAND(vnode, db_show_vnode) +{ + struct vnode *vp; + + if (!have_addr) + return; + vp = (struct vnode *)addr; + vn_printf(vp, "vnode "); +} + +/* + * Show details about the given mount point. + */ +DB_SHOW_COMMAND(mount, db_show_mount) +{ + struct mount *mp; + struct vfsopt *opt; + struct statfs *sp; + struct vnode *vp; + char buf[512]; + uint64_t mflags; + u_int flags; + + if (!have_addr) { + /* No address given, print short info about all mount points. 
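+ *
+ * (Editor's note, illustrative and not part of the FreeBSD import:
+ * from the DDB prompt the two forms look like
+ *
+ *	db> show mount
+ *	db> show mount 0xfffff80004a31000
+ *
+ * where the address, a made-up value here, is taken from the short
+ * listing printed by the first form.)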
*/ + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + db_printf("%p %s on %s (%s)\n", mp, + mp->mnt_stat.f_mntfromname, + mp->mnt_stat.f_mntonname, + mp->mnt_stat.f_fstypename); + if (db_pager_quit) + break; + } + db_printf("\nMore info: show mount \n"); + return; + } + + mp = (struct mount *)addr; + db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, + mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); + + buf[0] = '\0'; + mflags = mp->mnt_flag; +#define MNT_FLAG(flag) do { \ + if (mflags & (flag)) { \ + if (buf[0] != '\0') \ + strlcat(buf, ", ", sizeof(buf)); \ + strlcat(buf, (#flag) + 4, sizeof(buf)); \ + mflags &= ~(flag); \ + } \ +} while (0) + MNT_FLAG(MNT_RDONLY); + MNT_FLAG(MNT_SYNCHRONOUS); + MNT_FLAG(MNT_NOEXEC); + MNT_FLAG(MNT_NOSUID); + MNT_FLAG(MNT_NFS4ACLS); + MNT_FLAG(MNT_UNION); + MNT_FLAG(MNT_ASYNC); + MNT_FLAG(MNT_SUIDDIR); + MNT_FLAG(MNT_SOFTDEP); + MNT_FLAG(MNT_NOSYMFOLLOW); + MNT_FLAG(MNT_GJOURNAL); + MNT_FLAG(MNT_MULTILABEL); + MNT_FLAG(MNT_ACLS); + MNT_FLAG(MNT_NOATIME); + MNT_FLAG(MNT_NOCLUSTERR); + MNT_FLAG(MNT_NOCLUSTERW); + MNT_FLAG(MNT_SUJ); + MNT_FLAG(MNT_EXRDONLY); + MNT_FLAG(MNT_EXPORTED); + MNT_FLAG(MNT_DEFEXPORTED); + MNT_FLAG(MNT_EXPORTANON); + MNT_FLAG(MNT_EXKERB); + MNT_FLAG(MNT_EXPUBLIC); + MNT_FLAG(MNT_LOCAL); + MNT_FLAG(MNT_QUOTA); + MNT_FLAG(MNT_ROOTFS); + MNT_FLAG(MNT_USER); + MNT_FLAG(MNT_IGNORE); + MNT_FLAG(MNT_UPDATE); + MNT_FLAG(MNT_DELEXPORT); + MNT_FLAG(MNT_RELOAD); + MNT_FLAG(MNT_FORCE); + MNT_FLAG(MNT_SNAPSHOT); + MNT_FLAG(MNT_BYFSID); +#undef MNT_FLAG + if (mflags != 0) { + if (buf[0] != '\0') + strlcat(buf, ", ", sizeof(buf)); + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), + "0x%016jx", mflags); + } + db_printf(" mnt_flag = %s\n", buf); + + buf[0] = '\0'; + flags = mp->mnt_kern_flag; +#define MNT_KERN_FLAG(flag) do { \ + if (flags & (flag)) { \ + if (buf[0] != '\0') \ + strlcat(buf, ", ", sizeof(buf)); \ + strlcat(buf, (#flag) + 5, sizeof(buf)); \ + flags &= ~(flag); \ + } \ +} while (0) + MNT_KERN_FLAG(MNTK_UNMOUNTF); + MNT_KERN_FLAG(MNTK_ASYNC); + MNT_KERN_FLAG(MNTK_SOFTDEP); + MNT_KERN_FLAG(MNTK_DRAINING); + MNT_KERN_FLAG(MNTK_REFEXPIRE); + MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); + MNT_KERN_FLAG(MNTK_SHARED_WRITES); + MNT_KERN_FLAG(MNTK_NO_IOPF); + MNT_KERN_FLAG(MNTK_VGONE_UPPER); + MNT_KERN_FLAG(MNTK_VGONE_WAITER); + MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); + MNT_KERN_FLAG(MNTK_MARKER); + MNT_KERN_FLAG(MNTK_USES_BCACHE); + MNT_KERN_FLAG(MNTK_NOASYNC); + MNT_KERN_FLAG(MNTK_UNMOUNT); + MNT_KERN_FLAG(MNTK_MWAIT); + MNT_KERN_FLAG(MNTK_SUSPEND); + MNT_KERN_FLAG(MNTK_SUSPEND2); + MNT_KERN_FLAG(MNTK_SUSPENDED); + MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); + MNT_KERN_FLAG(MNTK_NOKNOTE); +#undef MNT_KERN_FLAG + if (flags != 0) { + if (buf[0] != '\0') + strlcat(buf, ", ", sizeof(buf)); + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), + "0x%08x", flags); + } + db_printf(" mnt_kern_flag = %s\n", buf); + + db_printf(" mnt_opt = "); + opt = TAILQ_FIRST(mp->mnt_opt); + if (opt != NULL) { + db_printf("%s", opt->name); + opt = TAILQ_NEXT(opt, link); + while (opt != NULL) { + db_printf(", %s", opt->name); + opt = TAILQ_NEXT(opt, link); + } + } + db_printf("\n"); + + sp = &mp->mnt_stat; + db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " + "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " + "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " + "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", + (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, + (uintmax_t)sp->f_bsize, 
(uintmax_t)sp->f_iosize, + (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, + (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, + (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, + (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, + (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, + (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); + + db_printf(" mnt_cred = { uid=%u ruid=%u", + (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); + if (jailed(mp->mnt_cred)) + db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); + db_printf(" }\n"); + db_printf(" mnt_ref = %d\n", mp->mnt_ref); + db_printf(" mnt_gen = %d\n", mp->mnt_gen); + db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); + db_printf(" mnt_activevnodelistsize = %d\n", + mp->mnt_activevnodelistsize); + db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); + db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); + db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); + db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); + db_printf(" mnt_lockref = %d\n", mp->mnt_lockref); + db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); + db_printf(" mnt_secondary_accwrites = %d\n", + mp->mnt_secondary_accwrites); + db_printf(" mnt_gjprovider = %s\n", + mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); + + db_printf("\n\nList of active vnodes\n"); + TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { + if (vp->v_type != VMARKER) { + vn_printf(vp, "vnode "); + if (db_pager_quit) + break; + } + } + db_printf("\n\nList of inactive vnodes\n"); + TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { + if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { + vn_printf(vp, "vnode "); + if (db_pager_quit) + break; + } + } +} +#endif /* DDB */ + +/* + * Fill in a struct xvfsconf based on a struct vfsconf. + */ +static int +vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) +{ + struct xvfsconf xvfsp; + + bzero(&xvfsp, sizeof(xvfsp)); + strcpy(xvfsp.vfc_name, vfsp->vfc_name); + xvfsp.vfc_typenum = vfsp->vfc_typenum; + xvfsp.vfc_refcount = vfsp->vfc_refcount; + xvfsp.vfc_flags = vfsp->vfc_flags; + /* + * These are unused in userland, we keep them + * to not break binary compatibility. + */ + xvfsp.vfc_vfsops = NULL; + xvfsp.vfc_next = NULL; + return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); +} + +#ifdef COMPAT_FREEBSD32 +struct xvfsconf32 { + uint32_t vfc_vfsops; + char vfc_name[MFSNAMELEN]; + int32_t vfc_typenum; + int32_t vfc_refcount; + int32_t vfc_flags; + uint32_t vfc_next; +}; + +static int +vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) +{ + struct xvfsconf32 xvfsp; + + bzero(&xvfsp, sizeof(xvfsp)); + strcpy(xvfsp.vfc_name, vfsp->vfc_name); + xvfsp.vfc_typenum = vfsp->vfc_typenum; + xvfsp.vfc_refcount = vfsp->vfc_refcount; + xvfsp.vfc_flags = vfsp->vfc_flags; + return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); +} +#endif + +/* + * Top level filesystem related information gathering. 
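+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: userland consumes the vfs.conflist node exported
+ * below roughly as
+ *
+ *	size_t len;
+ *	struct xvfsconf *xvfsp;
+ *	unsigned int i;
+ *
+ *	sysctlbyname("vfs.conflist", NULL, &len, NULL, 0);
+ *	xvfsp = malloc(len);
+ *	sysctlbyname("vfs.conflist", xvfsp, &len, NULL, 0);
+ *	for (i = 0; i < len / sizeof(*xvfsp); i++)
+ *		printf("%s\n", xvfsp[i].vfc_name);
+ *
+ * which is essentially what getvfsbyname(3) does to map a filesystem
+ * name to its xvfsconf entry.)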
+ */ +static int +sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) +{ + struct vfsconf *vfsp; + int error; + + error = 0; + vfsconf_slock(); + TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { +#ifdef COMPAT_FREEBSD32 + if (req->flags & SCTL_MASK32) + error = vfsconf2x32(req, vfsp); + else +#endif + error = vfsconf2x(req, vfsp); + if (error) + break; + } + vfsconf_sunlock(); + return (error); +} + +SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | + CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, + "S,xvfsconf", "List of all configured filesystems"); + +#ifndef BURN_BRIDGES +static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); + +static int +vfs_sysctl(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *)arg1 - 1; /* XXX */ + u_int namelen = arg2 + 1; /* XXX */ + struct vfsconf *vfsp; + + log(LOG_WARNING, "userland calling deprecated sysctl, " + "please rebuild world\n"); + +#if 1 || defined(COMPAT_PRELITE2) + /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ + if (namelen == 1) + return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); +#endif + + switch (name[1]) { + case VFS_MAXTYPENUM: + if (namelen != 2) + return (ENOTDIR); + return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); + case VFS_CONF: + if (namelen != 3) + return (ENOTDIR); /* overloaded */ + vfsconf_slock(); + TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { + if (vfsp->vfc_typenum == name[2]) + break; + } + vfsconf_sunlock(); + if (vfsp == NULL) + return (EOPNOTSUPP); +#ifdef COMPAT_FREEBSD32 + if (req->flags & SCTL_MASK32) + return (vfsconf2x32(req, vfsp)); + else +#endif + return (vfsconf2x(req, vfsp)); + } + return (EOPNOTSUPP); +} + +static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | + CTLFLAG_MPSAFE, vfs_sysctl, + "Generic filesystem"); + +#if 1 || defined(COMPAT_PRELITE2) + +static int +sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) +{ + int error; + struct vfsconf *vfsp; + struct ovfsconf ovfs; + + vfsconf_slock(); + TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { + bzero(&ovfs, sizeof(ovfs)); + ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ + strcpy(ovfs.vfc_name, vfsp->vfc_name); + ovfs.vfc_index = vfsp->vfc_typenum; + ovfs.vfc_refcount = vfsp->vfc_refcount; + ovfs.vfc_flags = vfsp->vfc_flags; + error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); + if (error != 0) { + vfsconf_sunlock(); + return (error); + } + } + vfsconf_sunlock(); + return (0); +} + +#endif /* 1 || COMPAT_PRELITE2 */ +#endif /* !BURN_BRIDGES */ + +#define KINFO_VNODESLOP 10 +#ifdef notyet +/* + * Dump vnode list (via sysctl). + */ +/* ARGSUSED */ +static int +sysctl_vnode(SYSCTL_HANDLER_ARGS) +{ + struct xvnode *xvn; + struct mount *mp; + struct vnode *vp; + int error, len, n; + + /* + * Stale numvnodes access is not fatal here. 
+ */ + req->lock = 0; + len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; + if (!req->oldptr) + /* Make an estimate */ + return (SYSCTL_OUT(req, 0, len)); + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); + n = 0; + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) + continue; + MNT_ILOCK(mp); + TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { + if (n == len) + break; + vref(vp); + xvn[n].xv_size = sizeof *xvn; + xvn[n].xv_vnode = vp; + xvn[n].xv_id = 0; /* XXX compat */ +#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field + XV_COPY(usecount); + XV_COPY(writecount); + XV_COPY(holdcnt); + XV_COPY(mount); + XV_COPY(numoutput); + XV_COPY(type); +#undef XV_COPY + xvn[n].xv_flag = vp->v_vflag; + + switch (vp->v_type) { + case VREG: + case VDIR: + case VLNK: + break; + case VBLK: + case VCHR: + if (vp->v_rdev == NULL) { + vrele(vp); + continue; + } + xvn[n].xv_dev = dev2udev(vp->v_rdev); + break; + case VSOCK: + xvn[n].xv_socket = vp->v_socket; + break; + case VFIFO: + xvn[n].xv_fifo = vp->v_fifoinfo; + break; + case VNON: + case VBAD: + default: + /* shouldn't happen? */ + vrele(vp); + continue; + } + vrele(vp); + ++n; + } + MNT_IUNLOCK(mp); + mtx_lock(&mountlist_mtx); + vfs_unbusy(mp); + if (n == len) + break; + } + mtx_unlock(&mountlist_mtx); + + error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); + free(xvn, M_TEMP); + return (error); +} + +SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | + CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", + ""); +#endif + +static void +unmount_or_warn(struct mount *mp) +{ + int error; + + error = dounmount(mp, MNT_FORCE, curthread); + if (error != 0) { + printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); + if (error == EBUSY) + printf("BUSY)\n"); + else + printf("%d)\n", error); + } +} + +/* + * Unmount all filesystems. The list is traversed in reverse order + * of mounting to avoid dependencies. + */ +void +vfs_unmountall(void) +{ + struct mount *mp, *tmp; + + CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); + + /* + * Since this only runs when rebooting, it is not interlocked. + */ + TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { + vfs_ref(mp); + + /* + * Forcibly unmounting "/dev" before "/" would prevent clean + * unmount of the latter. + */ + if (mp == rootdevmp) + continue; + + unmount_or_warn(mp); + } + + if (rootdevmp != NULL) + unmount_or_warn(rootdevmp); +} + +/* + * perform msync on all vnodes under a mount point + * the mount point must be locked. + */ +void +vfs_msync(struct mount *mp, int flags) +{ + struct vnode *vp, *mvp; + struct vm_object *obj; + + CTR2(KTR_VFS, "%s: mp %p", __func__, mp); + + vnlru_return_batch(mp); + + MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { + obj = vp->v_object; + if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && + (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { + if (!vget(vp, + LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, + curthread)) { + if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ + vput(vp); + continue; + } + + obj = vp->v_object; + if (obj != NULL) { + VM_OBJECT_WLOCK(obj); + vm_object_page_clean(obj, 0, 0, + flags == MNT_WAIT ? 
+ OBJPC_SYNC : OBJPC_NOSYNC); + VM_OBJECT_WUNLOCK(obj); + } + vput(vp); + } + } else + VI_UNLOCK(vp); + } +} + +static void +destroy_vpollinfo_free(struct vpollinfo *vi) +{ + + knlist_destroy(&vi->vpi_selinfo.si_note); + mtx_destroy(&vi->vpi_lock); + uma_zfree(vnodepoll_zone, vi); +} + +static void +destroy_vpollinfo(struct vpollinfo *vi) +{ + + knlist_clear(&vi->vpi_selinfo.si_note, 1); + seldrain(&vi->vpi_selinfo); + destroy_vpollinfo_free(vi); +} + +/* + * Initialize per-vnode helper structure to hold poll-related state. + */ +void +v_addpollinfo(struct vnode *vp) +{ + struct vpollinfo *vi; + + if (vp->v_pollinfo != NULL) + return; + vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); + mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); + knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, + vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); + VI_LOCK(vp); + if (vp->v_pollinfo != NULL) { + VI_UNLOCK(vp); + destroy_vpollinfo_free(vi); + return; + } + vp->v_pollinfo = vi; + VI_UNLOCK(vp); +} + +/* + * Record a process's interest in events which might happen to + * a vnode. Because poll uses the historic select-style interface + * internally, this routine serves as both the ``check for any + * pending events'' and the ``record my interest in future events'' + * functions. (These are done together, while the lock is held, + * to avoid race conditions.) + */ +int +vn_pollrecord(struct vnode *vp, struct thread *td, int events) +{ + + v_addpollinfo(vp); + mtx_lock(&vp->v_pollinfo->vpi_lock); + if (vp->v_pollinfo->vpi_revents & events) { + /* + * This leaves events we are not interested + * in available for the other process which + * which presumably had requested them + * (otherwise they would never have been + * recorded). + */ + events &= vp->v_pollinfo->vpi_revents; + vp->v_pollinfo->vpi_revents &= ~events; + + mtx_unlock(&vp->v_pollinfo->vpi_lock); + return (events); + } + vp->v_pollinfo->vpi_events |= events; + selrecord(td, &vp->v_pollinfo->vpi_selinfo); + mtx_unlock(&vp->v_pollinfo->vpi_lock); + return (0); +} + +/* + * Routine to create and manage a filesystem syncer vnode. + */ +#define sync_close ((int (*)(struct vop_close_args *))nullop) +static int sync_fsync(struct vop_fsync_args *); +static int sync_inactive(struct vop_inactive_args *); +static int sync_reclaim(struct vop_reclaim_args *); + +static struct vop_vector sync_vnodeops = { + .vop_bypass = VOP_EOPNOTSUPP, + .vop_close = sync_close, /* close */ + .vop_fsync = sync_fsync, /* fsync */ + .vop_inactive = sync_inactive, /* inactive */ + .vop_reclaim = sync_reclaim, /* reclaim */ + .vop_lock1 = vop_stdlock, /* lock */ + .vop_unlock = vop_stdunlock, /* unlock */ + .vop_islocked = vop_stdislocked, /* islocked */ +}; + +/* + * Create a new filesystem syncer vnode for the specified mount point. + */ +void +vfs_allocate_syncvnode(struct mount *mp) +{ + struct vnode *vp; + struct bufobj *bo; + static long start, incr, next; + int error; + + /* Allocate a new vnode */ + error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); + if (error != 0) + panic("vfs_allocate_syncvnode: getnewvnode() failed"); + vp->v_type = VNON; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + vp->v_vflag |= VV_FORCEINSMQ; + error = insmntque(vp, mp); + if (error != 0) + panic("vfs_allocate_syncvnode: insmntque() failed"); + vp->v_vflag &= ~VV_FORCEINSMQ; + VOP_UNLOCK(vp, 0); + /* + * Place the vnode onto the syncer worklist. 
We attempt to + * scatter them about on the list so that they will go off + * at evenly distributed times even if all the filesystems + * are mounted at once. + */ + next += incr; + if (next == 0 || next > syncer_maxdelay) { + start /= 2; + incr /= 2; + if (start == 0) { + start = syncer_maxdelay / 2; + incr = syncer_maxdelay; + } + next = start; + } + bo = &vp->v_bufobj; + BO_LOCK(bo); + vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); + /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ + mtx_lock(&sync_mtx); + sync_vnode_count++; + if (mp->mnt_syncer == NULL) { + mp->mnt_syncer = vp; + vp = NULL; + } + mtx_unlock(&sync_mtx); + BO_UNLOCK(bo); + if (vp != NULL) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + vgone(vp); + vput(vp); + } +} + +void +vfs_deallocate_syncvnode(struct mount *mp) +{ + struct vnode *vp; + + mtx_lock(&sync_mtx); + vp = mp->mnt_syncer; + if (vp != NULL) + mp->mnt_syncer = NULL; + mtx_unlock(&sync_mtx); + if (vp != NULL) + vrele(vp); +} + +/* + * Do a lazy sync of the filesystem. + */ +static int +sync_fsync(struct vop_fsync_args *ap) +{ + struct vnode *syncvp = ap->a_vp; + struct mount *mp = syncvp->v_mount; + int error, save; + struct bufobj *bo; + + /* + * We only need to do something if this is a lazy evaluation. + */ + if (ap->a_waitfor != MNT_LAZY) + return (0); + + /* + * Move ourselves to the back of the sync list. + */ + bo = &syncvp->v_bufobj; + BO_LOCK(bo); + vn_syncer_add_to_worklist(bo, syncdelay); + BO_UNLOCK(bo); + + /* + * Walk the list of vnodes pushing all that are dirty and + * not already on the sync list. + */ + if (vfs_busy(mp, MBF_NOWAIT) != 0) + return (0); + if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { + vfs_unbusy(mp); + return (0); + } + save = curthread_pflags_set(TDP_SYNCIO); + vfs_msync(mp, MNT_NOWAIT); + error = VFS_SYNC(mp, MNT_LAZY); + curthread_pflags_restore(save); + vn_finished_write(mp); + vfs_unbusy(mp); + return (error); +} + +/* + * The syncer vnode is no referenced. + */ +static int +sync_inactive(struct vop_inactive_args *ap) +{ + + vgone(ap->a_vp); + return (0); +} + +/* + * The syncer vnode is no longer needed and is being decommissioned. + * + * Modifications to the worklist must be protected by sync_mtx. + */ +static int +sync_reclaim(struct vop_reclaim_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct bufobj *bo; + + bo = &vp->v_bufobj; + BO_LOCK(bo); + mtx_lock(&sync_mtx); + if (vp->v_mount->mnt_syncer == vp) + vp->v_mount->mnt_syncer = NULL; + if (bo->bo_flag & BO_ONWORKLST) { + LIST_REMOVE(bo, bo_synclist); + syncer_worklist_len--; + sync_vnode_count--; + bo->bo_flag &= ~BO_ONWORKLST; + } + mtx_unlock(&sync_mtx); + BO_UNLOCK(bo); + + return (0); +} + +/* + * Check if vnode represents a disk device + */ +int +vn_isdisk(struct vnode *vp, int *errp) +{ + int error; + + if (vp->v_type != VCHR) { + error = ENOTBLK; + goto out; + } + error = 0; + dev_lock(); + if (vp->v_rdev == NULL) + error = ENXIO; + else if (vp->v_rdev->si_devsw == NULL) + error = ENXIO; + else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) + error = ENOTBLK; + dev_unlock(); +out: + if (errp != NULL) + *errp = error; + return (error == 0); +} + +/* + * Common filesystem object access control check routine. Accepts a + * vnode's type, "mode", uid and gid, requested access mode, credentials, + * and optional call-by-reference privused argument allowing vaccess() + * to indicate to the caller whether privilege was used to satisfy the + * request (obsoleted). Returns 0 on success, or an errno on failure. 
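+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: a filesystem's access VOP typically gathers the
+ * ownership and mode from its private inode and delegates the
+ * decision here, along the lines of
+ *
+ *	static int
+ *	foofs_access(struct vop_access_args *ap)
+ *	{
+ *		struct foonode *ip = VTOFOO(ap->a_vp);
+ *
+ *		return (vaccess(ap->a_vp->v_type, ip->i_mode, ip->i_uid,
+ *		    ip->i_gid, ap->a_accmode, ap->a_cred, NULL));
+ *	}
+ *
+ * with foofs, foonode and VTOFOO as hypothetical names.)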
+ */ +int +vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, + accmode_t accmode, struct ucred *cred, int *privused) +{ + accmode_t dac_granted; + accmode_t priv_granted; + + KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, + ("invalid bit in accmode")); + KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), + ("VAPPEND without VWRITE")); + + /* + * Look for a normal, non-privileged way to access the file/directory + * as requested. If it exists, go with that. + */ + + if (privused != NULL) + *privused = 0; + + dac_granted = 0; + + /* Check the owner. */ + if (cred->cr_uid == file_uid) { + dac_granted |= VADMIN; + if (file_mode & S_IXUSR) + dac_granted |= VEXEC; + if (file_mode & S_IRUSR) + dac_granted |= VREAD; + if (file_mode & S_IWUSR) + dac_granted |= (VWRITE | VAPPEND); + + if ((accmode & dac_granted) == accmode) + return (0); + + goto privcheck; + } + + /* Otherwise, check the groups (first match) */ + if (groupmember(file_gid, cred)) { + if (file_mode & S_IXGRP) + dac_granted |= VEXEC; + if (file_mode & S_IRGRP) + dac_granted |= VREAD; + if (file_mode & S_IWGRP) + dac_granted |= (VWRITE | VAPPEND); + + if ((accmode & dac_granted) == accmode) + return (0); + + goto privcheck; + } + + /* Otherwise, check everyone else. */ + if (file_mode & S_IXOTH) + dac_granted |= VEXEC; + if (file_mode & S_IROTH) + dac_granted |= VREAD; + if (file_mode & S_IWOTH) + dac_granted |= (VWRITE | VAPPEND); + if ((accmode & dac_granted) == accmode) + return (0); + +privcheck: + /* + * Build a privilege mask to determine if the set of privileges + * satisfies the requirements when combined with the granted mask + * from above. For each privilege, if the privilege is required, + * bitwise or the request type onto the priv_granted mask. + */ + priv_granted = 0; + + if (type == VDIR) { + /* + * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC + * requests, instead of PRIV_VFS_EXEC. + */ + if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && + !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) + priv_granted |= VEXEC; + } else { + /* + * Ensure that at least one execute bit is on. Otherwise, + * a privileged user will always succeed, and we don't want + * this to happen unless the file really is executable. + */ + if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && + (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && + !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) + priv_granted |= VEXEC; + } + + if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && + !priv_check_cred(cred, PRIV_VFS_READ, 0)) + priv_granted |= VREAD; + + if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && + !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) + priv_granted |= (VWRITE | VAPPEND); + + if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && + !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) + priv_granted |= VADMIN; + + if ((accmode & (priv_granted | dac_granted)) == accmode) { + /* XXX audit: privilege used */ + if (privused != NULL) + *privused = 1; + return (0); + } + + return ((accmode & VADMIN) ? EPERM : EACCES); +} + +/* + * Credential check based on process requesting service, and per-attribute + * permissions. + */ +int +extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, + struct thread *td, accmode_t accmode) +{ + + /* + * Kernel-invoked always succeeds. + */ + if (cred == NOCRED) + return (0); + + /* + * Do not allow privileged processes in jail to directly manipulate + * system attributes. 
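+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: extended attribute VOPs normally call this helper
+ * before touching the attribute, e.g. a getextattr implementation
+ * starts with
+ *
+ *	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ *	    ap->a_cred, ap->a_td, VREAD);
+ *	if (error != 0)
+ *		return (error);
+ *
+ * and the setextattr/deleteextattr paths pass VWRITE instead.)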
+ */ + switch (attrnamespace) { + case EXTATTR_NAMESPACE_SYSTEM: + /* Potentially should be: return (EPERM); */ + return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); + case EXTATTR_NAMESPACE_USER: + return (VOP_ACCESS(vp, accmode, cred, td)); + default: + return (EPERM); + } +} + +#ifdef DEBUG_VFS_LOCKS +/* + * This only exists to suppress warnings from unlocked specfs accesses. It is + * no longer ok to have an unlocked VFS. + */ +#define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ + (vp)->v_type == VCHR || (vp)->v_type == VBAD) + +int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ +SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, + "Drop into debugger on lock violation"); + +int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ +SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, + 0, "Check for interlock across VOPs"); + +int vfs_badlock_print = 1; /* Print lock violations. */ +SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, + 0, "Print lock violations"); + +int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ +SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, + 0, "Print vnode details on lock violations"); + +#ifdef KDB +int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ +SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, + &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); +#endif + +static void +vfs_badlock(const char *msg, const char *str, struct vnode *vp) +{ + +#ifdef KDB + if (vfs_badlock_backtrace) + kdb_backtrace(); +#endif + if (vfs_badlock_vnode) + vn_printf(vp, "vnode "); + if (vfs_badlock_print) + printf("%s: %p %s\n", str, (void *)vp, msg); + if (vfs_badlock_ddb) + kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); +} + +void +assert_vi_locked(struct vnode *vp, const char *str) +{ + + if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) + vfs_badlock("interlock is not locked but should be", str, vp); +} + +void +assert_vi_unlocked(struct vnode *vp, const char *str) +{ + + if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) + vfs_badlock("interlock is locked but should not be", str, vp); +} + +void +assert_vop_locked(struct vnode *vp, const char *str) +{ + int locked; + + if (!IGNORE_LOCK(vp)) { + locked = VOP_ISLOCKED(vp); + if (locked == 0 || locked == LK_EXCLOTHER) + vfs_badlock("is not locked but should be", str, vp); + } +} + +void +assert_vop_unlocked(struct vnode *vp, const char *str) +{ + + if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) + vfs_badlock("is locked but should not be", str, vp); +} + +void +assert_vop_elocked(struct vnode *vp, const char *str) +{ + + if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) + vfs_badlock("is not exclusive locked but should be", str, vp); +} +#endif /* DEBUG_VFS_LOCKS */ + +void +vop_rename_fail(struct vop_rename_args *ap) +{ + + if (ap->a_tvp != NULL) + vput(ap->a_tvp); + if (ap->a_tdvp == ap->a_tvp) + vrele(ap->a_tdvp); + else + vput(ap->a_tdvp); + vrele(ap->a_fdvp); + vrele(ap->a_fvp); +} + +void +vop_rename_pre(void *ap) +{ + struct vop_rename_args *a = ap; + +#ifdef DEBUG_VFS_LOCKS + if (a->a_tvp) + ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); + ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); + ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); + ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); + + /* Check the source (from). 
*/ + if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && + (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) + ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); + if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) + ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); + + /* Check the target. */ + if (a->a_tvp) + ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); + ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); +#endif + if (a->a_tdvp != a->a_fdvp) + vhold(a->a_fdvp); + if (a->a_tvp != a->a_fvp) + vhold(a->a_fvp); + vhold(a->a_tdvp); + if (a->a_tvp) + vhold(a->a_tvp); +} + +#ifdef DEBUG_VFS_LOCKS +void +vop_strategy_pre(void *ap) +{ + struct vop_strategy_args *a; + struct buf *bp; + + a = ap; + bp = a->a_bp; + + /* + * Cluster ops lock their component buffers but not the IO container. + */ + if ((bp->b_flags & B_CLUSTER) != 0) + return; + + if (panicstr == NULL && !BUF_ISLOCKED(bp)) { + if (vfs_badlock_print) + printf( + "VOP_STRATEGY: bp is not locked but should be\n"); + if (vfs_badlock_ddb) + kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); + } +} + +void +vop_lock_pre(void *ap) +{ + struct vop_lock1_args *a = ap; + + if ((a->a_flags & LK_INTERLOCK) == 0) + ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); + else + ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); +} + +void +vop_lock_post(void *ap, int rc) +{ + struct vop_lock1_args *a = ap; + + ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); + if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) + ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); +} + +void +vop_unlock_pre(void *ap) +{ + struct vop_unlock_args *a = ap; + + if (a->a_flags & LK_INTERLOCK) + ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); + ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); +} + +void +vop_unlock_post(void *ap, int rc) +{ + struct vop_unlock_args *a = ap; + + if (a->a_flags & LK_INTERLOCK) + ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); +} +#endif + +void +vop_create_post(void *ap, int rc) +{ + struct vop_create_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); +} + +void +vop_deleteextattr_post(void *ap, int rc) +{ + struct vop_deleteextattr_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); +} + +void +vop_link_post(void *ap, int rc) +{ + struct vop_link_args *a = ap; + + if (!rc) { + VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); + VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); + } +} + +void +vop_mkdir_post(void *ap, int rc) +{ + struct vop_mkdir_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); +} + +void +vop_mknod_post(void *ap, int rc) +{ + struct vop_mknod_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); +} + +void +vop_reclaim_post(void *ap, int rc) +{ + struct vop_reclaim_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE); +} + +void +vop_remove_post(void *ap, int rc) +{ + struct vop_remove_args *a = ap; + + if (!rc) { + VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); + VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); + } +} + +void +vop_rename_post(void *ap, int rc) +{ + struct vop_rename_args *a = ap; + long hint; + + if (!rc) { + hint = NOTE_WRITE; + if (a->a_fdvp == a->a_tdvp) { + if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) + hint |= NOTE_LINK; + VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); + VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); + } else { + hint |= NOTE_EXTEND; + if (a->a_fvp->v_type == VDIR) + hint |= NOTE_LINK; + VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); + + if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && + a->a_tvp->v_type == VDIR) + hint &= ~NOTE_LINK; + 
VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); + } + + VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); + if (a->a_tvp) + VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); + } + if (a->a_tdvp != a->a_fdvp) + vdrop(a->a_fdvp); + if (a->a_tvp != a->a_fvp) + vdrop(a->a_fvp); + vdrop(a->a_tdvp); + if (a->a_tvp) + vdrop(a->a_tvp); +} + +void +vop_rmdir_post(void *ap, int rc) +{ + struct vop_rmdir_args *a = ap; + + if (!rc) { + VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); + VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); + } +} + +void +vop_setattr_post(void *ap, int rc) +{ + struct vop_setattr_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); +} + +void +vop_setextattr_post(void *ap, int rc) +{ + struct vop_setextattr_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); +} + +void +vop_symlink_post(void *ap, int rc) +{ + struct vop_symlink_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); +} + +void +vop_open_post(void *ap, int rc) +{ + struct vop_open_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); +} + +void +vop_close_post(void *ap, int rc) +{ + struct vop_close_args *a = ap; + + if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ + (a->a_vp->v_iflag & VI_DOOMED) == 0)) { + VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? + NOTE_CLOSE_WRITE : NOTE_CLOSE); + } +} + +void +vop_read_post(void *ap, int rc) +{ + struct vop_read_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); +} + +void +vop_readdir_post(void *ap, int rc) +{ + struct vop_readdir_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); +} + +static struct knlist fs_knlist; + +static void +vfs_event_init(void *arg) +{ + knlist_init_mtx(&fs_knlist, NULL); +} +/* XXX - correct order? */ +SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); + +void +vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) +{ + + KNOTE_UNLOCKED(&fs_knlist, event); +} + +static int filt_fsattach(struct knote *kn); +static void filt_fsdetach(struct knote *kn); +static int filt_fsevent(struct knote *kn, long hint); + +struct filterops fs_filtops = { + .f_isfd = 0, + .f_attach = filt_fsattach, + .f_detach = filt_fsdetach, + .f_event = filt_fsevent +}; + +static int +filt_fsattach(struct knote *kn) +{ + + kn->kn_flags |= EV_CLEAR; + knlist_add(&fs_knlist, kn, 0); + return (0); +} + +static void +filt_fsdetach(struct knote *kn) +{ + + knlist_remove(&fs_knlist, kn, 0); +} + +static int +filt_fsevent(struct knote *kn, long hint) +{ + + kn->kn_fflags |= hint; + return (kn->kn_fflags != 0); +} + +static int +sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) +{ + struct vfsidctl vc; + int error; + struct mount *mp; + + error = SYSCTL_IN(req, &vc, sizeof(vc)); + if (error) + return (error); + if (vc.vc_vers != VFS_CTL_VERS1) + return (EINVAL); + mp = vfs_getvfs(&vc.vc_fsid); + if (mp == NULL) + return (ENOENT); + /* ensure that a specific sysctl goes to the right filesystem. */ + if (strcmp(vc.vc_fstypename, "*") != 0 && + strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { + vfs_rel(mp); + return (EINVAL); + } + VCTLTOREQ(&vc, req); + error = VFS_SYSCTL(mp, vc.vc_op, req); + vfs_rel(mp); + return (error); +} + +SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, + NULL, 0, sysctl_vfs_ctl, "", + "Sysctl by fsid"); + +/* + * Function to initialize a va_filerev field sensibly. + * XXX: Wouldn't a random number make a lot more sense ?? 
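+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: a filesystem typically seeds its per-file revision
+ * with this helper when the in-memory inode is set up, e.g.
+ *
+ *	ip->i_modrev = init_va_filerev();
+ *
+ * bumps it on each modification, and reports it as va_filerev from
+ * VOP_GETATTR(); ip and i_modrev are hypothetical names here.)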
+ */ +u_quad_t +init_va_filerev(void) +{ + struct bintime bt; + + getbinuptime(&bt); + return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); +} + +static int filt_vfsread(struct knote *kn, long hint); +static int filt_vfswrite(struct knote *kn, long hint); +static int filt_vfsvnode(struct knote *kn, long hint); +static void filt_vfsdetach(struct knote *kn); +static struct filterops vfsread_filtops = { + .f_isfd = 1, + .f_detach = filt_vfsdetach, + .f_event = filt_vfsread +}; +static struct filterops vfswrite_filtops = { + .f_isfd = 1, + .f_detach = filt_vfsdetach, + .f_event = filt_vfswrite +}; +static struct filterops vfsvnode_filtops = { + .f_isfd = 1, + .f_detach = filt_vfsdetach, + .f_event = filt_vfsvnode +}; + +static void +vfs_knllock(void *arg) +{ + struct vnode *vp = arg; + + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); +} + +static void +vfs_knlunlock(void *arg) +{ + struct vnode *vp = arg; + + VOP_UNLOCK(vp, 0); +} + +static void +vfs_knl_assert_locked(void *arg) +{ +#ifdef DEBUG_VFS_LOCKS + struct vnode *vp = arg; + + ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); +#endif +} + +static void +vfs_knl_assert_unlocked(void *arg) +{ +#ifdef DEBUG_VFS_LOCKS + struct vnode *vp = arg; + + ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); +#endif +} + +int +vfs_kqfilter(struct vop_kqfilter_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct knote *kn = ap->a_kn; + struct knlist *knl; + + switch (kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &vfsread_filtops; + break; + case EVFILT_WRITE: + kn->kn_fop = &vfswrite_filtops; + break; + case EVFILT_VNODE: + kn->kn_fop = &vfsvnode_filtops; + break; + default: + return (EINVAL); + } + + kn->kn_hook = (caddr_t)vp; + + v_addpollinfo(vp); + if (vp->v_pollinfo == NULL) + return (ENOMEM); + knl = &vp->v_pollinfo->vpi_selinfo.si_note; + vhold(vp); + knlist_add(knl, kn, 0); + + return (0); +} + +/* + * Detach knote from vnode + */ +static void +filt_vfsdetach(struct knote *kn) +{ + struct vnode *vp = (struct vnode *)kn->kn_hook; + + KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); + knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); + vdrop(vp); +} + +/*ARGSUSED*/ +static int +filt_vfsread(struct knote *kn, long hint) +{ + struct vnode *vp = (struct vnode *)kn->kn_hook; + struct vattr va; + int res; + + /* + * filesystem is gone, so set the EOF flag and schedule + * the knote for deletion. + */ + if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { + VI_LOCK(vp); + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + VI_UNLOCK(vp); + return (1); + } + + if (VOP_GETATTR(vp, &va, curthread->td_ucred)) + return (0); + + VI_LOCK(vp); + kn->kn_data = va.va_size - kn->kn_fp->f_offset; + res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; + VI_UNLOCK(vp); + return (res); +} + +/*ARGSUSED*/ +static int +filt_vfswrite(struct knote *kn, long hint) +{ + struct vnode *vp = (struct vnode *)kn->kn_hook; + + VI_LOCK(vp); + + /* + * filesystem is gone, so set the EOF flag and schedule + * the knote for deletion. 
+ */ + if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + + kn->kn_data = 0; + VI_UNLOCK(vp); + return (1); +} + +static int +filt_vfsvnode(struct knote *kn, long hint) +{ + struct vnode *vp = (struct vnode *)kn->kn_hook; + int res; + + VI_LOCK(vp); + if (kn->kn_sfflags & hint) + kn->kn_fflags |= hint; + if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { + kn->kn_flags |= EV_EOF; + VI_UNLOCK(vp); + return (1); + } + res = (kn->kn_fflags != 0); + VI_UNLOCK(vp); + return (res); +} + +int +vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) +{ + int error; + + if (dp->d_reclen > ap->a_uio->uio_resid) + return (ENAMETOOLONG); + error = uiomove(dp, dp->d_reclen, ap->a_uio); + if (error) { + if (ap->a_ncookies != NULL) { + if (ap->a_cookies != NULL) + free(ap->a_cookies, M_TEMP); + ap->a_cookies = NULL; + *ap->a_ncookies = 0; + } + return (error); + } + if (ap->a_ncookies == NULL) + return (0); + + KASSERT(ap->a_cookies, + ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); + + *ap->a_cookies = realloc(*ap->a_cookies, + (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); + (*ap->a_cookies)[*ap->a_ncookies] = off; + *ap->a_ncookies += 1; + return (0); +} + +/* + * Mark for update the access time of the file if the filesystem + * supports VOP_MARKATIME. This functionality is used by execve and + * mmap, so we want to avoid the I/O implied by directly setting + * va_atime for the sake of efficiency. + */ +void +vfs_mark_atime(struct vnode *vp, struct ucred *cred) +{ + struct mount *mp; + + mp = vp->v_mount; + ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); + if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) + (void)VOP_MARKATIME(vp); +} + +/* + * The purpose of this routine is to remove granularity from accmode_t, + * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, + * VADMIN and VAPPEND. + * + * If it returns 0, the caller is supposed to continue with the usual + * access checks using 'accmode' as modified by this routine. If it + * returns nonzero value, the caller is supposed to return that value + * as errno. + * + * Note that after this routine runs, accmode may be zero. + */ +int +vfs_unixify_accmode(accmode_t *accmode) +{ + /* + * There is no way to specify explicit "deny" rule using + * file mode or POSIX.1e ACLs. + */ + if (*accmode & VEXPLICIT_DENY) { + *accmode = 0; + return (0); + } + + /* + * None of these can be translated into usual access bits. + * Also, the common case for NFSv4 ACLs is to not contain + * either of these bits. Caller should check for VWRITE + * on the containing directory instead. + */ + if (*accmode & (VDELETE_CHILD | VDELETE)) + return (EPERM); + + if (*accmode & VADMIN_PERMS) { + *accmode &= ~VADMIN_PERMS; + *accmode |= VADMIN; + } + + /* + * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL + * or VSYNCHRONIZE using file mode or POSIX.1e ACL. + */ + *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); + + return (0); +} + +/* + * These are helper functions for filesystems to traverse all + * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. + * + * This interface replaces MNT_VNODE_FOREACH. 
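+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: the iterator hands back each vnode with its
+ * interlock held, so a caller looks roughly like
+ *
+ *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
+ *		vholdl(vp);
+ *		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
+ *		if (error != 0) {
+ *			vdrop(vp);
+ *			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
+ *			break;
+ *		}
+ *		... per-vnode work ...
+ *		VOP_UNLOCK(vp, 0);
+ *		vdrop(vp);
+ *	}
+ *
+ * which matches how vflush() above walks a mount's vnode list.)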
+ */ + +MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); + +struct vnode * +__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) +{ + struct vnode *vp; + + if (should_yield()) + kern_yield(PRI_USER); + MNT_ILOCK(mp); + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); + for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; + vp = TAILQ_NEXT(vp, v_nmntvnodes)) { + /* Allow a racy peek at VI_DOOMED to save a lock acquisition. */ + if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0) + continue; + VI_LOCK(vp); + if ((vp->v_iflag & VI_DOOMED) != 0) { + VI_UNLOCK(vp); + continue; + } + break; + } + if (vp == NULL) { + __mnt_vnode_markerfree_all(mvp, mp); + /* MNT_IUNLOCK(mp); -- done in above function */ + mtx_assert(MNT_MTX(mp), MA_NOTOWNED); + return (NULL); + } + TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); + TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); + MNT_IUNLOCK(mp); + return (vp); +} + +struct vnode * +__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) +{ + struct vnode *vp; + + *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); + MNT_ILOCK(mp); + MNT_REF(mp); + (*mvp)->v_mount = mp; + (*mvp)->v_type = VMARKER; + + TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { + /* Allow a racy peek at VI_DOOMED to save a lock acquisition. */ + if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0) + continue; + VI_LOCK(vp); + if ((vp->v_iflag & VI_DOOMED) != 0) { + VI_UNLOCK(vp); + continue; + } + break; + } + if (vp == NULL) { + MNT_REL(mp); + MNT_IUNLOCK(mp); + free(*mvp, M_VNODE_MARKER); + *mvp = NULL; + return (NULL); + } + TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); + MNT_IUNLOCK(mp); + return (vp); +} + +void +__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) +{ + + if (*mvp == NULL) { + MNT_IUNLOCK(mp); + return; + } + + mtx_assert(MNT_MTX(mp), MA_OWNED); + + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); + TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); + MNT_REL(mp); + MNT_IUNLOCK(mp); + free(*mvp, M_VNODE_MARKER); + *mvp = NULL; +} + +/* + * These are helper functions for filesystems to traverse their + * active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h + */ +static void +mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) +{ + + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); + + MNT_ILOCK(mp); + MNT_REL(mp); + MNT_IUNLOCK(mp); + free(*mvp, M_VNODE_MARKER); + *mvp = NULL; +} + +/* + * Relock the mp mount vnode list lock with the vp vnode interlock in the + * conventional lock order during mnt_vnode_next_active iteration. + * + * On entry, the mount vnode list lock is held and the vnode interlock is not. + * The list lock is dropped and reacquired. On success, both locks are held. + * On failure, the mount vnode list lock is held but the vnode interlock is + * not, and the procedure may have yielded. 
+ */ +static bool +mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp, + struct vnode *vp) +{ + const struct vnode *tmp; + bool held, ret; + + VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && + TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp, + ("%s: bad marker", __func__)); + VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, + ("%s: inappropriate vnode", __func__)); + ASSERT_VI_UNLOCKED(vp, __func__); + mtx_assert(&mp->mnt_listmtx, MA_OWNED); + + ret = false; + + TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist); + TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist); + + /* + * Use a hold to prevent vp from disappearing while the mount vnode + * list lock is dropped and reacquired. Normally a hold would be + * acquired with vhold(), but that might try to acquire the vnode + * interlock, which would be a LOR with the mount vnode list lock. + */ + held = refcount_acquire_if_not_zero(&vp->v_holdcnt); + mtx_unlock(&mp->mnt_listmtx); + if (!held) + goto abort; + VI_LOCK(vp); + if (!refcount_release_if_not_last(&vp->v_holdcnt)) { + vdropl(vp); + goto abort; + } + mtx_lock(&mp->mnt_listmtx); + + /* + * Determine whether the vnode is still the next one after the marker, + * excepting any other markers. If the vnode has not been doomed by + * vgone() then the hold should have ensured that it remained on the + * active list. If it has been doomed but is still on the active list, + * don't abort, but rather skip over it (avoid spinning on doomed + * vnodes). + */ + tmp = mvp; + do { + tmp = TAILQ_NEXT(tmp, v_actfreelist); + } while (tmp != NULL && tmp->v_type == VMARKER); + if (tmp != vp) { + mtx_unlock(&mp->mnt_listmtx); + VI_UNLOCK(vp); + goto abort; + } + + ret = true; + goto out; +abort: + maybe_yield(); + mtx_lock(&mp->mnt_listmtx); +out: + if (ret) + ASSERT_VI_LOCKED(vp, __func__); + else + ASSERT_VI_UNLOCKED(vp, __func__); + mtx_assert(&mp->mnt_listmtx, MA_OWNED); + return (ret); +} + +static struct vnode * +mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) +{ + struct vnode *vp, *nvp; + + mtx_assert(&mp->mnt_listmtx, MA_OWNED); + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); +restart: + vp = TAILQ_NEXT(*mvp, v_actfreelist); + while (vp != NULL) { + if (vp->v_type == VMARKER) { + vp = TAILQ_NEXT(vp, v_actfreelist); + continue; + } + /* + * Try-lock because this is the wrong lock order. If that does + * not succeed, drop the mount vnode list lock and try to + * reacquire it and the vnode interlock in the right order. 
+ */ + if (!VI_TRYLOCK(vp) && + !mnt_vnode_next_active_relock(*mvp, mp, vp)) + goto restart; + KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); + KASSERT(vp->v_mount == mp || vp->v_mount == NULL, + ("alien vnode on the active list %p %p", vp, mp)); + if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) + break; + nvp = TAILQ_NEXT(vp, v_actfreelist); + VI_UNLOCK(vp); + vp = nvp; + } + TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); + + /* Check if we are done */ + if (vp == NULL) { + mtx_unlock(&mp->mnt_listmtx); + mnt_vnode_markerfree_active(mvp, mp); + return (NULL); + } + TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); + mtx_unlock(&mp->mnt_listmtx); + ASSERT_VI_LOCKED(vp, "active iter"); + KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); + return (vp); +} + +struct vnode * +__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) +{ + + if (should_yield()) + kern_yield(PRI_USER); + mtx_lock(&mp->mnt_listmtx); + return (mnt_vnode_next_active(mvp, mp)); +} + +struct vnode * +__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) +{ + struct vnode *vp; + + *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); + MNT_ILOCK(mp); + MNT_REF(mp); + MNT_IUNLOCK(mp); + (*mvp)->v_type = VMARKER; + (*mvp)->v_mount = mp; + + mtx_lock(&mp->mnt_listmtx); + vp = TAILQ_FIRST(&mp->mnt_activevnodelist); + if (vp == NULL) { + mtx_unlock(&mp->mnt_listmtx); + mnt_vnode_markerfree_active(mvp, mp); + return (NULL); + } + TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); + return (mnt_vnode_next_active(mvp, mp)); +} + +void +__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) +{ + + if (*mvp == NULL) + return; + + mtx_lock(&mp->mnt_listmtx); + TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); + mtx_unlock(&mp->mnt_listmtx); + mnt_vnode_markerfree_active(mvp, mp); +} + +int +vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) +{ + + if ((cnp->cn_flags & NOEXECCHECK) != 0) { + cnp->cn_flags &= ~NOEXECCHECK; + return (0); + } + + return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread)); +} diff --git a/freebsd/sys/kern/vfs_syscalls.c b/freebsd/sys/kern/vfs_syscalls.c new file mode 100644 index 00000000..06aaa935 --- /dev/null +++ b/freebsd/sys/kern/vfs_syscalls.c @@ -0,0 +1,4748 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_capsicum.h" +#include "opt_ktrace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#endif + +#include + +#include +#include + +#include +#include +#include +#include + +#include + +MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information"); + +SDT_PROVIDER_DEFINE(vfs); +SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int"); +SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int"); + +static int kern_chflagsat(struct thread *td, int fd, const char *path, + enum uio_seg pathseg, u_long flags, int atflag); +static int setfflags(struct thread *td, struct vnode *, u_long); +static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); +static int getutimens(const struct timespec *, enum uio_seg, + struct timespec *, int *); +static int setutimes(struct thread *td, struct vnode *, + const struct timespec *, int, int); +static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, + struct thread *td); +static int kern_fhlinkat(struct thread *td, int fd, const char *path, + enum uio_seg pathseg, fhandle_t *fhp); +static int kern_getfhat(struct thread *td, int flags, int fd, + const char *path, enum uio_seg pathseg, fhandle_t *fhp); +static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, + size_t count, struct thread *td); +static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, + const char *path, enum uio_seg segflag); + +/* + * Sync each mounted filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct sync_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +sys_sync(struct thread *td, struct sync_args *uap) +{ + struct mount *mp, *nmp; + int save; + + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0 && + vn_start_write(NULL, &mp, V_NOWAIT) == 0) { + save = curthread_pflags_set(TDP_SYNCIO); + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_NOWAIT); + curthread_pflags_restore(save); + vn_finished_write(mp); + } + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp); + } + mtx_unlock(&mountlist_mtx); + return (0); +} + +/* + * Change filesystem quotas. 
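The loop in sys_sync() above is the canonical way to walk the global mount list, so a distilled version may help when reading the getfsstat code further down (illustrative only; the helper name is invented). As the sys_sync() code shows, vfs_busy() called with MBF_NOWAIT | MBF_MNTLSTLOCK leaves mountlist_mtx held when it fails and drops it when it succeeds, so the next pointer is always read under the list lock:

static void
example_for_each_mount(void (*fn)(struct mount *))
{
        struct mount *mp, *nmp;

        mtx_lock(&mountlist_mtx);
        for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
                if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
                        /* Busy failed: the list lock is still held. */
                        nmp = TAILQ_NEXT(mp, mnt_list);
                        continue;
                }
                /* Busy succeeded: the list lock was dropped for us. */
                fn(mp);
                mtx_lock(&mountlist_mtx);
                nmp = TAILQ_NEXT(mp, mnt_list);
                vfs_unbusy(mp);
        }
        mtx_unlock(&mountlist_mtx);
}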
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +#endif +int +sys_quotactl(struct thread *td, struct quotactl_args *uap) +{ + struct mount *mp; + struct nameidata nd; + int error; + + AUDIT_ARG_CMD(uap->cmd); + AUDIT_ARG_UID(uap->uid); + if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS)) + return (EPERM); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, + uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + mp = nd.ni_vp->v_mount; + vfs_ref(mp); + vput(nd.ni_vp); + error = vfs_busy(mp, 0); + vfs_rel(mp); + if (error != 0) + return (error); + error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg); + + /* + * Since quota on operation typically needs to open quota + * file, the Q_QUOTAON handler needs to unbusy the mount point + * before calling into namei. Otherwise, unmount might be + * started between two vfs_busy() invocations (first is our, + * second is from mount point cross-walk code in lookup()), + * causing deadlock. + * + * Require that Q_QUOTAON handles the vfs_busy() reference on + * its own, always returning with ubusied mount point. + */ + if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON && + (uap->cmd >> SUBCMDSHIFT) != Q_QUOTAOFF) + vfs_unbusy(mp); + return (error); +} + +/* + * Used by statfs conversion routines to scale the block size up if + * necessary so that all of the block counts are <= 'max_size'. Note + * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero + * value of 'n'. + */ +void +statfs_scale_blocks(struct statfs *sf, long max_size) +{ + uint64_t count; + int shift; + + KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__)); + + /* + * Attempt to scale the block counts to give a more accurate + * overview to userland of the ratio of free space to used + * space. To do this, find the largest block count and compute + * a divisor that lets it fit into a signed integer <= max_size. + */ + if (sf->f_bavail < 0) + count = -sf->f_bavail; + else + count = sf->f_bavail; + count = MAX(sf->f_blocks, MAX(sf->f_bfree, count)); + if (count <= max_size) + return; + + count >>= flsl(max_size); + shift = 0; + while (count > 0) { + shift++; + count >>=1; + } + + sf->f_bsize <<= shift; + sf->f_blocks >>= shift; + sf->f_bfree >>= shift; + sf->f_bavail >>= shift; +} + +static int +kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf) +{ + struct statfs *sp; + int error; + + if (mp == NULL) + return (EBADF); + error = vfs_busy(mp, 0); + vfs_rel(mp); + if (error != 0) + return (error); +#ifdef MAC + error = mac_mount_check_stat(td->td_ucred, mp); + if (error != 0) + goto out; +#endif + /* + * Set these in case the underlying filesystem fails to do so. + */ + sp = &mp->mnt_stat; + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = VFS_STATFS(mp, sp); + if (error != 0) + goto out; + *buf = *sp; + if (priv_check(td, PRIV_VFS_GENERATION)) { + buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0; + prison_enforce_statfs(td->td_ucred, mp, buf); + } +out: + vfs_unbusy(mp); + return (error); +} + +/* + * Get filesystem statistics. 
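A worked example of statfs_scale_blocks(), assuming a 32-bit consumer such as the COMPAT_FREEBSD4 conversion further down (max_size = LONG_MAX = 0x7fffffff): a 16 TB volume with f_bsize = 4096 has f_blocks = 2^32, which does not fit. flsl(max_size) is 31, so the residual count is 2^32 >> 31 = 2 and the shift loop yields shift = 2; the volume is then reported with f_bsize = 16384 and f_blocks = 2^30, which fits while preserving the byte totals.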
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif +int +sys_statfs(struct thread *td, struct statfs_args *uap) +{ + struct statfs *sfp; + int error; + + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); + if (error == 0) + error = copyout(sfp, uap->buf, sizeof(struct statfs)); + free(sfp, M_STATFS); + return (error); +} + +int +kern_statfs(struct thread *td, char *path, enum uio_seg pathseg, + struct statfs *buf) +{ + struct mount *mp; + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, + pathseg, path, td); + error = namei(&nd); + if (error != 0) + return (error); + mp = nd.ni_vp->v_mount; + vfs_ref(mp); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_vp); + return (kern_do_statfs(td, mp, buf)); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif +int +sys_fstatfs(struct thread *td, struct fstatfs_args *uap) +{ + struct statfs *sfp; + int error; + + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_fstatfs(td, uap->fd, sfp); + if (error == 0) + error = copyout(sfp, uap->buf, sizeof(struct statfs)); + free(sfp, M_STATFS); + return (error); +} + +int +kern_fstatfs(struct thread *td, int fd, struct statfs *buf) +{ + struct file *fp; + struct mount *mp; + struct vnode *vp; + int error; + + AUDIT_ARG_FD(fd); + error = getvnode(td, fd, &cap_fstatfs_rights, &fp); + if (error != 0) + return (error); + vp = fp->f_vnode; + vn_lock(vp, LK_SHARED | LK_RETRY); +#ifdef AUDIT + AUDIT_ARG_VNODE1(vp); +#endif + mp = vp->v_mount; + if (mp != NULL) + vfs_ref(mp); + VOP_UNLOCK(vp, 0); + fdrop(fp, td); + return (kern_do_statfs(td, mp, buf)); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int mode; +}; +#endif +int +sys_getfsstat(struct thread *td, struct getfsstat_args *uap) +{ + size_t count; + int error; + + if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX) + return (EINVAL); + error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count, + UIO_USERSPACE, uap->mode); + if (error == 0) + td->td_retval[0] = count; + return (error); +} + +/* + * If (bufsize > 0 && bufseg == UIO_SYSSPACE) + * The caller is responsible for freeing memory which will be allocated + * in '*buf'. 
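A sketch of an in-kernel caller honouring that contract (the function name and the use of SIZE_MAX as an effectively unlimited size are assumptions of the sketch): with UIO_SYSSPACE the helper allocates the table itself and the caller must free it with the M_STATFS type.

static int
example_log_mounts(struct thread *td)
{
        struct statfs *buf, *sp;
        size_t count, i;
        int error;

        error = kern_getfsstat(td, &buf, SIZE_MAX, &count, UIO_SYSSPACE,
            MNT_NOWAIT);
        if (error != 0)
                return (error);
        for (i = 0, sp = buf; i < count; i++, sp++)
                printf("%s on %s (%s)\n", sp->f_mntfromname,
                    sp->f_mntonname, sp->f_fstypename);
        free(buf, M_STATFS);            /* caller owns the buffer */
        return (0);
}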
+ */ +int +kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, + size_t *countp, enum uio_seg bufseg, int mode) +{ + struct mount *mp, *nmp; + struct statfs *sfsp, *sp, *sptmp, *tofree; + size_t count, maxcount; + int error; + + switch (mode) { + case MNT_WAIT: + case MNT_NOWAIT: + break; + default: + if (bufseg == UIO_SYSSPACE) + *buf = NULL; + return (EINVAL); + } +restart: + maxcount = bufsize / sizeof(struct statfs); + if (bufsize == 0) { + sfsp = NULL; + tofree = NULL; + } else if (bufseg == UIO_USERSPACE) { + sfsp = *buf; + tofree = NULL; + } else /* if (bufseg == UIO_SYSSPACE) */ { + count = 0; + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + count++; + } + mtx_unlock(&mountlist_mtx); + if (maxcount > count) + maxcount = count; + tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs), + M_STATFS, M_WAITOK); + } + count = 0; + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (prison_canseemount(td->td_ucred, mp) != 0) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } +#ifdef MAC + if (mac_mount_check_stat(td->td_ucred, mp) != 0) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } +#endif + if (mode == MNT_WAIT) { + if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) { + /* + * If vfs_busy() failed, and MBF_NOWAIT + * wasn't passed, then the mp is gone. + * Furthermore, because of MBF_MNTLSTLOCK, + * the mountlist_mtx was dropped. We have + * no other choice than to start over. + */ + mtx_unlock(&mountlist_mtx); + free(tofree, M_STATFS); + goto restart; + } + } else { + if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + } + if (sfsp != NULL && count < maxcount) { + sp = &mp->mnt_stat; + /* + * Set these in case the underlying filesystem + * fails to do so. + */ + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + /* + * If MNT_NOWAIT is specified, do not refresh + * the fsstat cache. + */ + if (mode != MNT_NOWAIT) { + error = VFS_STATFS(mp, sp); + if (error != 0) { + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp); + continue; + } + } + if (priv_check(td, PRIV_VFS_GENERATION)) { + sptmp = malloc(sizeof(struct statfs), M_STATFS, + M_WAITOK); + *sptmp = *sp; + sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0; + prison_enforce_statfs(td->td_ucred, mp, sptmp); + sp = sptmp; + } else + sptmp = NULL; + if (bufseg == UIO_SYSSPACE) { + bcopy(sp, sfsp, sizeof(*sp)); + free(sptmp, M_STATFS); + } else /* if (bufseg == UIO_USERSPACE) */ { + error = copyout(sp, sfsp, sizeof(*sp)); + free(sptmp, M_STATFS); + if (error != 0) { + vfs_unbusy(mp); + return (error); + } + } + sfsp++; + } + count++; + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp); + } + mtx_unlock(&mountlist_mtx); + if (sfsp != NULL && count > maxcount) + *countp = maxcount; + else + *countp = count; + return (0); +} + +#ifdef COMPAT_FREEBSD4 +/* + * Get old format filesystem statistics. 
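For comparison, the usual userland pattern over the same syscall, a sizing call with a NULL buffer followed by the real call (minimal error handling; illustrative only). MNT_NOWAIT corresponds to the branch above that skips refreshing the per-mount statistics:

#include <sys/param.h>
#include <sys/mount.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        struct statfs *mntbuf;
        int i, n;

        n = getfsstat(NULL, 0, MNT_NOWAIT);     /* just count the mounts */
        if (n < 0)
                err(1, "getfsstat");
        mntbuf = calloc(n, sizeof(*mntbuf));
        if (mntbuf == NULL)
                err(1, "calloc");
        n = getfsstat(mntbuf, n * sizeof(*mntbuf), MNT_NOWAIT);
        for (i = 0; i < n; i++)
                printf("%s on %s (%s)\n", mntbuf[i].f_mntfromname,
                    mntbuf[i].f_mntonname, mntbuf[i].f_fstypename);
        free(mntbuf);
        return (0);
}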
+ */ +static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *); + +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_statfs_args { + char *path; + struct ostatfs *buf; +}; +#endif +int +freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap) +{ + struct ostatfs osb; + struct statfs *sfp; + int error; + + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); + if (error == 0) { + freebsd4_cvtstatfs(sfp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + } + free(sfp, M_STATFS); + return (error); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_fstatfs_args { + int fd; + struct ostatfs *buf; +}; +#endif +int +freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap) +{ + struct ostatfs osb; + struct statfs *sfp; + int error; + + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_fstatfs(td, uap->fd, sfp); + if (error == 0) { + freebsd4_cvtstatfs(sfp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + } + free(sfp, M_STATFS); + return (error); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_getfsstat_args { + struct ostatfs *buf; + long bufsize; + int mode; +}; +#endif +int +freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap) +{ + struct statfs *buf, *sp; + struct ostatfs osb; + size_t count, size; + int error; + + if (uap->bufsize < 0) + return (EINVAL); + count = uap->bufsize / sizeof(struct ostatfs); + if (count > SIZE_MAX / sizeof(struct statfs)) + return (EINVAL); + size = count * sizeof(struct statfs); + error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, + uap->mode); + if (error == 0) + td->td_retval[0] = count; + if (size != 0) { + sp = buf; + while (count != 0 && error == 0) { + freebsd4_cvtstatfs(sp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + sp++; + uap->buf++; + count--; + } + free(buf, M_STATFS); + } + return (error); +} + +/* + * Implement fstatfs() for (NFS) file handles. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_fhstatfs_args { + struct fhandle *u_fhp; + struct ostatfs *buf; +}; +#endif +int +freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap) +{ + struct ostatfs osb; + struct statfs *sfp; + fhandle_t fh; + int error; + + error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); + if (error != 0) + return (error); + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_fhstatfs(td, fh, sfp); + if (error == 0) { + freebsd4_cvtstatfs(sfp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + } + free(sfp, M_STATFS); + return (error); +} + +/* + * Convert a new format statfs structure to an old format statfs structure. 
+ */ +static void +freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp) +{ + + statfs_scale_blocks(nsp, LONG_MAX); + bzero(osp, sizeof(*osp)); + osp->f_bsize = nsp->f_bsize; + osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); + osp->f_blocks = nsp->f_blocks; + osp->f_bfree = nsp->f_bfree; + osp->f_bavail = nsp->f_bavail; + osp->f_files = MIN(nsp->f_files, LONG_MAX); + osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); + osp->f_owner = nsp->f_owner; + osp->f_type = nsp->f_type; + osp->f_flags = nsp->f_flags; + osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); + osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); + osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); + osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); + strlcpy(osp->f_fstypename, nsp->f_fstypename, + MIN(MFSNAMELEN, OMFSNAMELEN)); + strlcpy(osp->f_mntonname, nsp->f_mntonname, + MIN(MNAMELEN, OMNAMELEN)); + strlcpy(osp->f_mntfromname, nsp->f_mntfromname, + MIN(MNAMELEN, OMNAMELEN)); + osp->f_fsid = nsp->f_fsid; +} +#endif /* COMPAT_FREEBSD4 */ + +#if defined(COMPAT_FREEBSD11) +/* + * Get old format filesystem statistics. + */ +static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *); + +int +freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap) +{ + struct freebsd11_statfs osb; + struct statfs *sfp; + int error; + + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); + if (error == 0) { + freebsd11_cvtstatfs(sfp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + } + free(sfp, M_STATFS); + return (error); +} + +/* + * Get filesystem statistics. + */ +int +freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap) +{ + struct freebsd11_statfs osb; + struct statfs *sfp; + int error; + + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_fstatfs(td, uap->fd, sfp); + if (error == 0) { + freebsd11_cvtstatfs(sfp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + } + free(sfp, M_STATFS); + return (error); +} + +/* + * Get statistics on all filesystems. + */ +int +freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap) +{ + struct freebsd11_statfs osb; + struct statfs *buf, *sp; + size_t count, size; + int error; + + count = uap->bufsize / sizeof(struct ostatfs); + size = count * sizeof(struct statfs); + error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, + uap->mode); + if (error == 0) + td->td_retval[0] = count; + if (size > 0) { + sp = buf; + while (count > 0 && error == 0) { + freebsd11_cvtstatfs(sp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + sp++; + uap->buf++; + count--; + } + free(buf, M_STATFS); + } + return (error); +} + +/* + * Implement fstatfs() for (NFS) file handles. + */ +int +freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap) +{ + struct freebsd11_statfs osb; + struct statfs *sfp; + fhandle_t fh; + int error; + + error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); + if (error) + return (error); + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_fhstatfs(td, fh, sfp); + if (error == 0) { + freebsd11_cvtstatfs(sfp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + } + free(sfp, M_STATFS); + return (error); +} + +/* + * Convert a new format statfs structure to an old format statfs structure. 
+ */ +static void +freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp) +{ + + bzero(osp, sizeof(*osp)); + osp->f_version = FREEBSD11_STATFS_VERSION; + osp->f_type = nsp->f_type; + osp->f_flags = nsp->f_flags; + osp->f_bsize = nsp->f_bsize; + osp->f_iosize = nsp->f_iosize; + osp->f_blocks = nsp->f_blocks; + osp->f_bfree = nsp->f_bfree; + osp->f_bavail = nsp->f_bavail; + osp->f_files = nsp->f_files; + osp->f_ffree = nsp->f_ffree; + osp->f_syncwrites = nsp->f_syncwrites; + osp->f_asyncwrites = nsp->f_asyncwrites; + osp->f_syncreads = nsp->f_syncreads; + osp->f_asyncreads = nsp->f_asyncreads; + osp->f_namemax = nsp->f_namemax; + osp->f_owner = nsp->f_owner; + osp->f_fsid = nsp->f_fsid; + strlcpy(osp->f_fstypename, nsp->f_fstypename, + MIN(MFSNAMELEN, sizeof(osp->f_fstypename))); + strlcpy(osp->f_mntonname, nsp->f_mntonname, + MIN(MNAMELEN, sizeof(osp->f_mntonname))); + strlcpy(osp->f_mntfromname, nsp->f_mntfromname, + MIN(MNAMELEN, sizeof(osp->f_mntfromname))); +} +#endif /* COMPAT_FREEBSD11 */ + +/* + * Change current working directory to a given file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif +int +sys_fchdir(struct thread *td, struct fchdir_args *uap) +{ + struct vnode *vp, *tdp; + struct mount *mp; + struct file *fp; + int error; + + AUDIT_ARG_FD(uap->fd); + error = getvnode(td, uap->fd, &cap_fchdir_rights, + &fp); + if (error != 0) + return (error); + vp = fp->f_vnode; + vrefact(vp); + fdrop(fp, td); + vn_lock(vp, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + error = change_dir(vp, td); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0)) + continue; + error = VFS_ROOT(mp, LK_SHARED, &tdp); + vfs_unbusy(mp); + if (error != 0) + break; + vput(vp); + vp = tdp; + } + if (error != 0) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0); + pwd_chdir(td, vp); + return (0); +} + +/* + * Change current working directory (``.''). + */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif +int +sys_chdir(struct thread *td, struct chdir_args *uap) +{ + + return (kern_chdir(td, uap->path, UIO_USERSPACE)); +} + +int +kern_chdir(struct thread *td, char *path, enum uio_seg pathseg) +{ + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, + pathseg, path, td); + if ((error = namei(&nd)) != 0) + return (error); + if ((error = change_dir(nd.ni_vp, td)) != 0) { + vput(nd.ni_vp); + NDFREE(&nd, NDF_ONLY_PNBUF); + return (error); + } + VOP_UNLOCK(nd.ni_vp, 0); + NDFREE(&nd, NDF_ONLY_PNBUF); + pwd_chdir(td, nd.ni_vp); + return (0); +} + +/* + * Change notion of root (``/'') directory. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif +int +sys_chroot(struct thread *td, struct chroot_args *uap) +{ + struct nameidata nd; + int error; + + error = priv_check(td, PRIV_VFS_CHROOT); + if (error != 0) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, + UIO_USERSPACE, uap->path, td); + error = namei(&nd); + if (error != 0) + goto error; + error = change_dir(nd.ni_vp, td); + if (error != 0) + goto e_vunlock; +#ifdef MAC + error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp); + if (error != 0) + goto e_vunlock; +#endif + VOP_UNLOCK(nd.ni_vp, 0); + error = pwd_chroot(td, nd.ni_vp); + vrele(nd.ni_vp); + NDFREE(&nd, NDF_ONLY_PNBUF); + return (error); +e_vunlock: + vput(nd.ni_vp); +error: + NDFREE(&nd, NDF_ONLY_PNBUF); + return (error); +} + +/* + * Common routine for chroot and chdir. 
Callers must provide a locked vnode + * instance. + */ +int +change_dir(struct vnode *vp, struct thread *td) +{ +#ifdef MAC + int error; +#endif + + ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); + if (vp->v_type != VDIR) + return (ENOTDIR); +#ifdef MAC + error = mac_vnode_check_chdir(td->td_ucred, vp); + if (error != 0) + return (error); +#endif + return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td)); +} + +static __inline void +flags_to_rights(int flags, cap_rights_t *rightsp) +{ + + if (flags & O_EXEC) { + cap_rights_set(rightsp, CAP_FEXECVE); + } else { + switch ((flags & O_ACCMODE)) { + case O_RDONLY: + cap_rights_set(rightsp, CAP_READ); + break; + case O_RDWR: + cap_rights_set(rightsp, CAP_READ); + /* FALLTHROUGH */ + case O_WRONLY: + cap_rights_set(rightsp, CAP_WRITE); + if (!(flags & (O_APPEND | O_TRUNC))) + cap_rights_set(rightsp, CAP_SEEK); + break; + } + } + + if (flags & O_CREAT) + cap_rights_set(rightsp, CAP_CREATE); + + if (flags & O_TRUNC) + cap_rights_set(rightsp, CAP_FTRUNCATE); + + if (flags & (O_SYNC | O_FSYNC)) + cap_rights_set(rightsp, CAP_FSYNC); + + if (flags & (O_EXLOCK | O_SHLOCK)) + cap_rights_set(rightsp, CAP_FLOCK); +} + +/* + * Check permissions, allocate an open file structure, and call the device + * open routine if any. + */ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif +int +sys_open(struct thread *td, struct open_args *uap) +{ + + return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->flags, uap->mode)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct openat_args { + int fd; + char *path; + int flag; + int mode; +}; +#endif +int +sys_openat(struct thread *td, struct openat_args *uap) +{ + + AUDIT_ARG_FD(uap->fd); + return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, + uap->mode)); +} + +int +kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + int flags, int mode) +{ + struct proc *p = td->td_proc; + struct filedesc *fdp = p->p_fd; + struct file *fp; + struct vnode *vp; + struct nameidata nd; + cap_rights_t rights; + int cmode, error, indx; + + indx = -1; + + AUDIT_ARG_FFLAGS(flags); + AUDIT_ARG_MODE(mode); + cap_rights_init(&rights, CAP_LOOKUP); + flags_to_rights(flags, &rights); + /* + * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags + * may be specified. + */ + if (flags & O_EXEC) { + if (flags & O_ACCMODE) + return (EINVAL); + } else if ((flags & O_ACCMODE) == O_ACCMODE) { + return (EINVAL); + } else { + flags = FFLAGS(flags); + } + + /* + * Allocate a file structure. The descriptor to reference it + * is allocated and set by finstall() below. + */ + error = falloc_noinstall(td, &fp); + if (error != 0) + return (error); + /* + * An extra reference on `fp' has been held for us by + * falloc_noinstall(). + */ + /* Set the flags early so the finit in devfs can pick them up. */ + fp->f_flag = flags & FMASK; + cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT; + NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, + &rights, td); + td->td_dupfd = -1; /* XXX check for fdopen */ + error = vn_open(&nd, &flags, cmode, fp); + if (error != 0) { + /* + * If the vn_open replaced the method vector, something + * wonderous happened deep below and we just pass it up + * pretending we know what we do. + */ + if (error == ENXIO && fp->f_ops != &badfileops) + goto success; + + /* + * Handle special fdopen() case. bleh. 
+ * + * Don't do this for relative (capability) lookups; we don't + * understand exactly what would happen, and we don't think + * that it ever should. + */ + if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) == 0 && + (error == ENODEV || error == ENXIO) && + td->td_dupfd >= 0) { + error = dupfdopen(td, fdp, td->td_dupfd, flags, error, + &indx); + if (error == 0) + goto success; + } + + goto bad; + } + td->td_dupfd = 0; + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + + /* + * Store the vnode, for any f_type. Typically, the vnode use + * count is decremented by direct call to vn_closefile() for + * files that switched type in the cdevsw fdopen() method. + */ + fp->f_vnode = vp; + /* + * If the file wasn't claimed by devfs bind it to the normal + * vnode operations here. + */ + if (fp->f_ops == &badfileops) { + KASSERT(vp->v_type != VFIFO, ("Unexpected fifo.")); + fp->f_seqcount = 1; + finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK), + DTYPE_VNODE, vp, &vnops); + } + + VOP_UNLOCK(vp, 0); + if (flags & O_TRUNC) { + error = fo_truncate(fp, 0, td->td_ucred, td); + if (error != 0) + goto bad; + } +success: + /* + * If we haven't already installed the FD (for dupfdopen), do so now. + */ + if (indx == -1) { + struct filecaps *fcaps; + +#ifdef CAPABILITIES + if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) != 0) + fcaps = &nd.ni_filecaps; + else +#endif + fcaps = NULL; + error = finstall(td, fp, &indx, flags, fcaps); + /* On success finstall() consumes fcaps. */ + if (error != 0) { + filecaps_free(&nd.ni_filecaps); + goto bad; + } + } else { + filecaps_free(&nd.ni_filecaps); + } + + /* + * Release our private reference, leaving the one associated with + * the descriptor table intact. + */ + fdrop(fp, td); + td->td_retval[0] = indx; + return (0); +bad: + KASSERT(indx == -1, ("indx=%d, should be -1", indx)); + fdrop(fp, td); + return (error); +} + +#ifdef COMPAT_43 +/* + * Create a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif +int +ocreat(struct thread *td, struct ocreat_args *uap) +{ + + return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + O_WRONLY | O_CREAT | O_TRUNC, uap->mode)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. 
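A small userland illustration of how the rights computed by flags_to_rights() and the filecaps handling above interact with Capsicum (helper name invented; assumes a capability-enabled FreeBSD target). A descriptor limited to CAP_LOOKUP | CAP_READ still satisfies an O_RDONLY openat(), while an O_RDWR or O_CREAT open through it should fail with ENOTCAPABLE because CAP_WRITE and CAP_CREATE are then required:

#include <sys/capsicum.h>
#include <fcntl.h>

static int
open_ro_below(int dfd, const char *name)
{
        cap_rights_t rights;

        cap_rights_init(&rights, CAP_LOOKUP, CAP_READ);
        if (cap_rights_limit(dfd, &rights) < 0)
                return (-1);
        /* flags_to_rights(O_RDONLY) asks only for CAP_READ. */
        return (openat(dfd, name, O_RDONLY));
}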
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mknodat_args { + int fd; + char *path; + mode_t mode; + dev_t dev; +}; +#endif +int +sys_mknodat(struct thread *td, struct mknodat_args *uap) +{ + + return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, + uap->dev)); +} + +#if defined(COMPAT_FREEBSD11) +int +freebsd11_mknod(struct thread *td, + struct freebsd11_mknod_args *uap) +{ + + return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->mode, uap->dev)); +} + +int +freebsd11_mknodat(struct thread *td, + struct freebsd11_mknodat_args *uap) +{ + + return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, + uap->dev)); +} +#endif /* COMPAT_FREEBSD11 */ + +int +kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + int mode, dev_t dev) +{ + struct vnode *vp; + struct mount *mp; + struct vattr vattr; + struct nameidata nd; + int error, whiteout = 0; + + AUDIT_ARG_MODE(mode); + AUDIT_ARG_DEV(dev); + switch (mode & S_IFMT) { + case S_IFCHR: + case S_IFBLK: + error = priv_check(td, PRIV_VFS_MKNOD_DEV); + if (error == 0 && dev == VNOVAL) + error = EINVAL; + break; + case S_IFWHT: + error = priv_check(td, PRIV_VFS_MKNOD_WHT); + break; + case S_IFIFO: + if (dev == 0) + return (kern_mkfifoat(td, fd, path, pathseg, mode)); + /* FALLTHROUGH */ + default: + error = EINVAL; + break; + } + if (error != 0) + return (error); +restart: + bwillwrite(); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | + NOCACHE, pathseg, path, fd, &cap_mknodat_rights, + td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EEXIST); + } else { + VATTR_NULL(&vattr); + vattr.va_mode = (mode & ALLPERMS) & + ~td->td_proc->p_fd->fd_cmask; + vattr.va_rdev = dev; + whiteout = 0; + + switch (mode & S_IFMT) { + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + panic("kern_mknod: invalid mode"); + } + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } +#ifdef MAC + if (error == 0 && !whiteout) + error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, + &nd.ni_cnd, &vattr); +#endif + if (error == 0) { + if (whiteout) + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + &nd.ni_cnd, &vattr); + if (error == 0) + vput(nd.ni_vp); + } + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + return (error); +} + +/* + * Create a named pipe. 
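A usage note grounded in the S_IFIFO branch of kern_mknodat() above (paths invented): with dev == 0 the request is forwarded to kern_mkfifoat(), so the two calls below behave identically and neither needs the PRIV_VFS_MKNOD_DEV privilege required for character and block nodes.

#include <sys/stat.h>

static void
make_fifos(void)
{
        /* Both end up in the S_IFIFO path of kern_mknodat(). */
        (void)mknod("/tmp/fifo0", S_IFIFO | 0600, 0);
        (void)mkfifo("/tmp/fifo1", 0600);
}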
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif +int +sys_mkfifo(struct thread *td, struct mkfifo_args *uap) +{ + + return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->mode)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct mkfifoat_args { + int fd; + char *path; + mode_t mode; +}; +#endif +int +sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap) +{ + + return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE, + uap->mode)); +} + +int +kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + int mode) +{ + struct mount *mp; + struct vattr vattr; + struct nameidata nd; + int error; + + AUDIT_ARG_MODE(mode); +restart: + bwillwrite(); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | + NOCACHE, pathseg, path, fd, &cap_mkfifoat_rights, + td); + if ((error = namei(&nd)) != 0) + return (error); + if (nd.ni_vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; +#ifdef MAC + error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, + &vattr); + if (error != 0) + goto out; +#endif + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + if (error == 0) + vput(nd.ni_vp); +#ifdef MAC +out: +#endif + vput(nd.ni_dvp); + vn_finished_write(mp); + NDFREE(&nd, NDF_ONLY_PNBUF); + return (error); +} + +/* + * Make a hard file link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif +int +sys_link(struct thread *td, struct link_args *uap) +{ + + return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link, + UIO_USERSPACE, FOLLOW)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct linkat_args { + int fd1; + char *path1; + int fd2; + char *path2; + int flag; +}; +#endif +int +sys_linkat(struct thread *td, struct linkat_args *uap) +{ + int flag; + + flag = uap->flag; + if (flag & ~AT_SYMLINK_FOLLOW) + return (EINVAL); + + return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2, + UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? 
FOLLOW : NOFOLLOW)); +} + +int hardlink_check_uid = 0; +SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW, + &hardlink_check_uid, 0, + "Unprivileged processes cannot create hard links to files owned by other " + "users"); +static int hardlink_check_gid = 0; +SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW, + &hardlink_check_gid, 0, + "Unprivileged processes cannot create hard links to files owned by other " + "groups"); + +static int +can_hardlink(struct vnode *vp, struct ucred *cred) +{ + struct vattr va; + int error; + + if (!hardlink_check_uid && !hardlink_check_gid) + return (0); + + error = VOP_GETATTR(vp, &va, cred); + if (error != 0) + return (error); + + if (hardlink_check_uid && cred->cr_uid != va.va_uid) { + error = priv_check_cred(cred, PRIV_VFS_LINK, 0); + if (error != 0) + return (error); + } + + if (hardlink_check_gid && !groupmember(va.va_gid, cred)) { + error = priv_check_cred(cred, PRIV_VFS_LINK, 0); + if (error != 0) + return (error); + } + + return (0); +} + +int +kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2, + enum uio_seg segflag, int follow) +{ + struct nameidata nd; + int error; + + do { + bwillwrite(); + NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, segflag, + path1, fd1, &cap_linkat_source_rights, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag); + } while (error == EAGAIN); + return (error); +} + +static int +kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path, + enum uio_seg segflag) +{ + struct nameidata nd; + struct mount *mp; + int error; + + if (vp->v_type == VDIR) { + vrele(vp); + return (EPERM); /* POSIX */ + } + NDINIT_ATRIGHTS(&nd, CREATE, + LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflag, path, fd, + &cap_linkat_target_rights, td); + if ((error = namei(&nd)) == 0) { + if (nd.ni_vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + vrele(vp); + return (EEXIST); + } else if (nd.ni_dvp->v_mount != vp->v_mount) { + /* + * Cross-device link. No need to recheck + * vp->v_type, since it cannot change, except + * to VBAD. + */ + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vrele(vp); + return (EXDEV); + } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) { + error = can_hardlink(vp, td->td_ucred); +#ifdef MAC + if (error == 0) + error = mac_vnode_check_link(td->td_ucred, + nd.ni_dvp, vp, &nd.ni_cnd); +#endif + if (error != 0) { + vput(vp); + vput(nd.ni_dvp); + NDFREE(&nd, NDF_ONLY_PNBUF); + return (error); + } + error = vn_start_write(vp, &mp, V_NOWAIT); + if (error != 0) { + vput(vp); + vput(nd.ni_dvp); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_start_write(NULL, &mp, + V_XSLEEP | PCATCH); + if (error != 0) + return (error); + return (EAGAIN); + } + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + VOP_UNLOCK(vp, 0); + vput(nd.ni_dvp); + vn_finished_write(mp); + NDFREE(&nd, NDF_ONLY_PNBUF); + } else { + vput(nd.ni_dvp); + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(vp); + return (EAGAIN); + } + } + vrele(vp); + return (error); +} + +/* + * Make a symbolic link. 
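The security.bsd.hardlink_check_uid and hardlink_check_gid knobs above make can_hardlink() demand PRIV_VFS_LINK for cross-owner or cross-group links. At the system call boundary the only flag is AT_SYMLINK_FOLLOW, which selects the FOLLOW or NOFOLLOW lookup seen in sys_linkat(); a short userland sketch with invented file names:

#include <fcntl.h>
#include <unistd.h>

static void
link_both_ways(void)
{
        /* New name refers to the symlink itself (NOFOLLOW lookup). */
        (void)linkat(AT_FDCWD, "alias.lnk", AT_FDCWD, "to-symlink", 0);
        /* New name refers to whatever the symlink resolves to. */
        (void)linkat(AT_FDCWD, "alias.lnk", AT_FDCWD, "to-target",
            AT_SYMLINK_FOLLOW);
}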
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif +int +sys_symlink(struct thread *td, struct symlink_args *uap) +{ + + return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link, + UIO_USERSPACE)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct symlinkat_args { + char *path; + int fd; + char *path2; +}; +#endif +int +sys_symlinkat(struct thread *td, struct symlinkat_args *uap) +{ + + return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2, + UIO_USERSPACE)); +} + +int +kern_symlinkat(struct thread *td, char *path1, int fd, char *path2, + enum uio_seg segflg) +{ + struct mount *mp; + struct vattr vattr; + char *syspath; + struct nameidata nd; + int error; + + if (segflg == UIO_SYSSPACE) { + syspath = path1; + } else { + syspath = uma_zalloc(namei_zone, M_WAITOK); + if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0) + goto out; + } + AUDIT_ARG_TEXT(syspath); +restart: + bwillwrite(); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | + NOCACHE, segflg, path2, fd, &cap_symlinkat_rights, + td); + if ((error = namei(&nd)) != 0) + goto out; + if (nd.ni_vp) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + error = EEXIST; + goto out; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + goto out; + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; +#ifdef MAC + vattr.va_type = VLNK; + error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, + &vattr); + if (error != 0) + goto out2; +#endif + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); + if (error == 0) + vput(nd.ni_vp); +#ifdef MAC +out2: +#endif + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); +out: + if (segflg != UIO_SYSSPACE) + uma_zfree(namei_zone, syspath); + return (error); +} + +/* + * Delete a whiteout from the filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct undelete_args { + char *path; +}; +#endif +int +sys_undelete(struct thread *td, struct undelete_args *uap) +{ + struct mount *mp; + struct nameidata nd; + int error; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1, + UIO_USERSPACE, uap->path, td); + error = namei(&nd); + if (error != 0) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + return (error); +} + +/* + * Delete a name from the filesystem. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif +int +sys_unlink(struct thread *td, struct unlink_args *uap) +{ + + return (kern_unlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct unlinkat_args { + int fd; + char *path; + int flag; +}; +#endif +int +sys_unlinkat(struct thread *td, struct unlinkat_args *uap) +{ + int flag = uap->flag; + int fd = uap->fd; + char *path = uap->path; + + if (flag & ~AT_REMOVEDIR) + return (EINVAL); + + if (flag & AT_REMOVEDIR) + return (kern_rmdirat(td, fd, path, UIO_USERSPACE)); + else + return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0)); +} + +int +kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + ino_t oldinum) +{ + struct mount *mp; + struct vnode *vp; + struct nameidata nd; + struct stat sb; + int error; + +restart: + bwillwrite(); + NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1, + pathseg, path, fd, &cap_unlinkat_rights, td); + if ((error = namei(&nd)) != 0) + return (error == EINVAL ? EPERM : error); + vp = nd.ni_vp; + if (vp->v_type == VDIR && oldinum == 0) { + error = EPERM; /* POSIX */ + } else if (oldinum != 0 && + ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) && + sb.st_ino != oldinum) { + error = EIDRM; /* Identifier removed */ + } else { + /* + * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? + */ + if (vp->v_vflag & VV_ROOT) + error = EBUSY; + } + if (error == 0) { + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (vp == nd.ni_dvp) + vrele(vp); + else + vput(vp); + if ((error = vn_start_write(NULL, &mp, + V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } +#ifdef MAC + error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, + &nd.ni_cnd); + if (error != 0) + goto out; +#endif + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); +#ifdef MAC +out: +#endif + vn_finished_write(mp); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (vp == nd.ni_dvp) + vrele(vp); + else + vput(vp); + return (error); +} + +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif +int +sys_lseek(struct thread *td, struct lseek_args *uap) +{ + + return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); +} + +int +kern_lseek(struct thread *td, int fd, off_t offset, int whence) +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(fd); + error = fget(td, fd, &cap_seek_rights, &fp); + if (error != 0) + return (error); + error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ? + fo_seek(fp, offset, whence, td) : ESPIPE; + fdrop(fp, td); + return (error); +} + +#if defined(COMPAT_43) +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif +int +olseek(struct thread *td, struct olseek_args *uap) +{ + + return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_FREEBSD6) +/* Version with the 'pad' argument */ +int +freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap) +{ + + return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); +} +#endif + +/* + * Check access permissions using passed credentials. 
+ */ +static int +vn_access(struct vnode *vp, int user_flags, struct ucred *cred, + struct thread *td) +{ + accmode_t accmode; + int error; + + /* Flags == 0 means only check for existence. */ + if (user_flags == 0) + return (0); + + accmode = 0; + if (user_flags & R_OK) + accmode |= VREAD; + if (user_flags & W_OK) + accmode |= VWRITE; + if (user_flags & X_OK) + accmode |= VEXEC; +#ifdef MAC + error = mac_vnode_check_access(cred, vp, accmode); + if (error != 0) + return (error); +#endif + if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, accmode, cred, td); + return (error); +} + +/* + * Check access permissions using "real" credentials. + */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int amode; +}; +#endif +int +sys_access(struct thread *td, struct access_args *uap) +{ + + return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + 0, uap->amode)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct faccessat_args { + int dirfd; + char *path; + int amode; + int flag; +} +#endif +int +sys_faccessat(struct thread *td, struct faccessat_args *uap) +{ + + return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, + uap->amode)); +} + +int +kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + int flag, int amode) +{ + struct ucred *cred, *usecred; + struct vnode *vp; + struct nameidata nd; + int error; + + if (flag & ~AT_EACCESS) + return (EINVAL); + if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0) + return (EINVAL); + + /* + * Create and modify a temporary credential instead of one that + * is potentially shared (if we need one). + */ + cred = td->td_ucred; + if ((flag & AT_EACCESS) == 0 && + ((cred->cr_uid != cred->cr_ruid || + cred->cr_rgid != cred->cr_groups[0]))) { + usecred = crdup(cred); + usecred->cr_uid = cred->cr_ruid; + usecred->cr_groups[0] = cred->cr_rgid; + td->td_ucred = usecred; + } else + usecred = cred; + AUDIT_ARG_VALUE(amode); + NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | + AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights, + td); + if ((error = namei(&nd)) != 0) + goto out; + vp = nd.ni_vp; + + error = vn_access(vp, amode, usecred, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); +out: + if (usecred != cred) { + td->td_ucred = cred; + crfree(usecred); + } + return (error); +} + +/* + * Check access permissions using "effective" credentials. + */ +#ifndef _SYS_SYSPROTO_H_ +struct eaccess_args { + char *path; + int amode; +}; +#endif +int +sys_eaccess(struct thread *td, struct eaccess_args *uap) +{ + + return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + AT_EACCESS, uap->amode)); +} + +#if defined(COMPAT_43) +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif +int +ostat(struct thread *td, struct ostat_args *uap) +{ + struct stat sb; + struct ostat osb; + int error; + + error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, + &sb, NULL); + if (error != 0) + return (error); + cvtstat(&sb, &osb); + return (copyout(&osb, uap->ub, sizeof (osb))); +} + +/* + * Get file status; this version does not follow links. 
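The temporary credential built with crdup() in kern_accessat() is what gives these checks their real-uid semantics; AT_EACCESS skips it and tests the effective credentials instead, matching eaccess(2). A userland illustration (the path is invented, and the two calls only differ inside set-uid or set-gid programs):

#include <fcntl.h>
#include <unistd.h>

static void
check_both(void)
{
        /* Checked with a temporary real-uid/real-gid credential. */
        (void)faccessat(AT_FDCWD, "/var/db/example.db", R_OK, 0);
        /* Checked with the effective credentials. */
        (void)faccessat(AT_FDCWD, "/var/db/example.db", R_OK, AT_EACCESS);
}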
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif +int +olstat(struct thread *td, struct olstat_args *uap) +{ + struct stat sb; + struct ostat osb; + int error; + + error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, + UIO_USERSPACE, &sb, NULL); + if (error != 0) + return (error); + cvtstat(&sb, &osb); + return (copyout(&osb, uap->ub, sizeof (osb))); +} + +/* + * Convert from an old to a new stat structure. + * XXX: many values are blindly truncated. + */ +void +cvtstat(struct stat *st, struct ostat *ost) +{ + + bzero(ost, sizeof(*ost)); + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + ost->st_size = MIN(st->st_size, INT32_MAX); + ost->st_atim = st->st_atim; + ost->st_mtim = st->st_mtim; + ost->st_ctim = st->st_ctim; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) +int ino64_trunc_error; +SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW, + &ino64_trunc_error, 0, + "Error on truncation of device, file or inode number, or link count"); + +int +freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost) +{ + + ost->st_dev = st->st_dev; + if (ost->st_dev != st->st_dev) { + switch (ino64_trunc_error) { + default: + /* + * Since dev_t is almost raw, don't clamp to the + * maximum for case 2, but ignore the error. + */ + break; + case 1: + return (EOVERFLOW); + } + } + ost->st_ino = st->st_ino; + if (ost->st_ino != st->st_ino) { + switch (ino64_trunc_error) { + default: + case 0: + break; + case 1: + return (EOVERFLOW); + case 2: + ost->st_ino = UINT32_MAX; + break; + } + } + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + if (ost->st_nlink != st->st_nlink) { + switch (ino64_trunc_error) { + default: + case 0: + break; + case 1: + return (EOVERFLOW); + case 2: + ost->st_nlink = UINT16_MAX; + break; + } + } + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (ost->st_rdev != st->st_rdev) { + switch (ino64_trunc_error) { + default: + break; + case 1: + return (EOVERFLOW); + } + } + ost->st_atim = st->st_atim; + ost->st_mtim = st->st_mtim; + ost->st_ctim = st->st_ctim; + ost->st_size = st->st_size; + ost->st_blocks = st->st_blocks; + ost->st_blksize = st->st_blksize; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; + ost->st_lspare = 0; + ost->st_birthtim = st->st_birthtim; + bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim), + sizeof(*ost) - offsetof(struct freebsd11_stat, + st_birthtim) - sizeof(ost->st_birthtim)); + return (0); +} + +int +freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap) +{ + struct stat sb; + struct freebsd11_stat osb; + int error; + + error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, + &sb, NULL); + if (error != 0) + return (error); + error = freebsd11_cvtstat(&sb, &osb); + if (error == 0) + error = copyout(&osb, uap->ub, sizeof(osb)); + return (error); +} + +int +freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap) +{ + struct stat sb; + struct freebsd11_stat osb; + int error; + + error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, + UIO_USERSPACE, &sb, NULL); + if (error != 0) + return (error); + error = freebsd11_cvtstat(&sb, &osb); + if (error == 0) + 
error = copyout(&osb, uap->ub, sizeof(osb)); + return (error); +} + +int +freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap) +{ + struct fhandle fh; + struct stat sb; + struct freebsd11_stat osb; + int error; + + error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); + if (error != 0) + return (error); + error = kern_fhstat(td, fh, &sb); + if (error != 0) + return (error); + error = freebsd11_cvtstat(&sb, &osb); + if (error == 0) + error = copyout(&osb, uap->sb, sizeof(osb)); + return (error); +} + +int +freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap) +{ + struct stat sb; + struct freebsd11_stat osb; + int error; + + error = kern_statat(td, uap->flag, uap->fd, uap->path, + UIO_USERSPACE, &sb, NULL); + if (error != 0) + return (error); + error = freebsd11_cvtstat(&sb, &osb); + if (error == 0) + error = copyout(&osb, uap->buf, sizeof(osb)); + return (error); +} +#endif /* COMPAT_FREEBSD11 */ + +/* + * Get file status + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatat_args { + int fd; + char *path; + struct stat *buf; + int flag; +} +#endif +int +sys_fstatat(struct thread *td, struct fstatat_args *uap) +{ + struct stat sb; + int error; + + error = kern_statat(td, uap->flag, uap->fd, uap->path, + UIO_USERSPACE, &sb, NULL); + if (error == 0) + error = copyout(&sb, uap->buf, sizeof (sb)); + return (error); +} + +int +kern_statat(struct thread *td, int flag, int fd, char *path, + enum uio_seg pathseg, struct stat *sbp, + void (*hook)(struct vnode *vp, struct stat *sbp)) +{ + struct nameidata nd; + struct stat sb; + int error; + + if (flag & ~AT_SYMLINK_NOFOLLOW) + return (EINVAL); + + NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : + FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, + &cap_fstat_rights, td); + + if ((error = namei(&nd)) != 0) + return (error); + error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); + if (error == 0) { + SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode); + if (S_ISREG(sb.st_mode)) + SDT_PROBE2(vfs, , stat, reg, path, pathseg); + if (__predict_false(hook != NULL)) + hook(nd.ni_vp, &sb); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_vp); + if (error != 0) + return (error); +#ifdef __STAT_TIME_T_EXT + sb.st_atim_ext = 0; + sb.st_mtim_ext = 0; + sb.st_ctim_ext = 0; + sb.st_btim_ext = 0; +#endif + *sbp = sb; +#ifdef KTRACE + if (KTRPOINT(td, KTR_STRUCT)) + ktrstat(&sb); +#endif + return (0); +} + +#if defined(COMPAT_FREEBSD11) +/* + * Implementation of the NetBSD [l]stat() functions. 
+ */ +void +freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb) +{ + + bzero(nsb, sizeof(*nsb)); + nsb->st_dev = sb->st_dev; + nsb->st_ino = sb->st_ino; + nsb->st_mode = sb->st_mode; + nsb->st_nlink = sb->st_nlink; + nsb->st_uid = sb->st_uid; + nsb->st_gid = sb->st_gid; + nsb->st_rdev = sb->st_rdev; + nsb->st_atim = sb->st_atim; + nsb->st_mtim = sb->st_mtim; + nsb->st_ctim = sb->st_ctim; + nsb->st_size = sb->st_size; + nsb->st_blocks = sb->st_blocks; + nsb->st_blksize = sb->st_blksize; + nsb->st_flags = sb->st_flags; + nsb->st_gen = sb->st_gen; + nsb->st_birthtim = sb->st_birthtim; +} + +#ifndef _SYS_SYSPROTO_H_ +struct freebsd11_nstat_args { + char *path; + struct nstat *ub; +}; +#endif +int +freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap) +{ + struct stat sb; + struct nstat nsb; + int error; + + error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, + &sb, NULL); + if (error != 0) + return (error); + freebsd11_cvtnstat(&sb, &nsb); + return (copyout(&nsb, uap->ub, sizeof (nsb))); +} + +/* + * NetBSD lstat. Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd11_nlstat_args { + char *path; + struct nstat *ub; +}; +#endif +int +freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap) +{ + struct stat sb; + struct nstat nsb; + int error; + + error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, + UIO_USERSPACE, &sb, NULL); + if (error != 0) + return (error); + freebsd11_cvtnstat(&sb, &nsb); + return (copyout(&nsb, uap->ub, sizeof (nsb))); +} +#endif /* COMPAT_FREEBSD11 */ + +/* + * Get configurable pathname variables. + */ +#ifndef _SYS_SYSPROTO_H_ +struct pathconf_args { + char *path; + int name; +}; +#endif +int +sys_pathconf(struct thread *td, struct pathconf_args *uap) +{ + long value; + int error; + + error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW, + &value); + if (error == 0) + td->td_retval[0] = value; + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct lpathconf_args { + char *path; + int name; +}; +#endif +int +sys_lpathconf(struct thread *td, struct lpathconf_args *uap) +{ + long value; + int error; + + error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, + NOFOLLOW, &value); + if (error == 0) + td->td_retval[0] = value; + return (error); +} + +int +kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name, + u_long flags, long *valuep) +{ + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags, + pathseg, path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = VOP_PATHCONF(nd.ni_vp, name, valuep); + vput(nd.ni_vp); + return (error); +} + +/* + * Return target name of a symbolic link. 
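+ *
+ * The lookup is done with NOFOLLOW so the link itself is inspected.
+ * A request larger than IOSIZE_MAX is rejected with EINVAL, a vnode
+ * that is neither a symlink nor marked VV_READLINK also returns
+ * EINVAL, and the number of bytes placed in the buffer is reported
+ * through td_retval[0] as count minus the remaining uio_resid.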
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct readlink_args { + char *path; + char *buf; + size_t count; +}; +#endif +int +sys_readlink(struct thread *td, struct readlink_args *uap) +{ + + return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->buf, UIO_USERSPACE, uap->count)); +} +#ifndef _SYS_SYSPROTO_H_ +struct readlinkat_args { + int fd; + char *path; + char *buf; + size_t bufsize; +}; +#endif +int +sys_readlinkat(struct thread *td, struct readlinkat_args *uap) +{ + + return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE, + uap->buf, UIO_USERSPACE, uap->bufsize)); +} + +int +kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + char *buf, enum uio_seg bufseg, size_t count) +{ + struct vnode *vp; + struct nameidata nd; + int error; + + if (count > IOSIZE_MAX) + return (EINVAL); + + NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, + pathseg, path, fd, td); + + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + + error = kern_readlink_vp(vp, buf, bufseg, count, td); + vput(vp); + + return (error); +} + +/* + * Helper function to readlink from a vnode + */ +static int +kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count, + struct thread *td) +{ + struct iovec aiov; + struct uio auio; + int error; + + ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked"); +#ifdef MAC + error = mac_vnode_check_readlink(td->td_ucred, vp); + if (error != 0) + return (error); +#endif + if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0) + return (EINVAL); + + aiov.iov_base = buf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = bufseg; + auio.uio_td = td; + auio.uio_resid = count; + error = VOP_READLINK(vp, &auio, td->td_ucred); + td->td_retval[0] = count - auio.uio_resid; + return (error); +} + +/* + * Common implementation code for chflags() and fchflags(). + */ +static int +setfflags(struct thread *td, struct vnode *vp, u_long flags) +{ + struct mount *mp; + struct vattr vattr; + int error; + + /* We can't support the value matching VNOVAL. */ + if (flags == VNOVAL) + return (EOPNOTSUPP); + + /* + * Prevent non-root users from setting flags on devices. When + * a device is reused, users can retain ownership of the device + * if they are allowed to set flags and programs assume that + * chown can't fail when done as root. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) { + error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); + if (error != 0) + return (error); + } + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VATTR_NULL(&vattr); + vattr.va_flags = flags; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); +#ifdef MAC + error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags); + if (error == 0) +#endif + error = VOP_SETATTR(vp, &vattr, td->td_ucred); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +/* + * Change flags of a file given a path name. 
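+ *
+ * chflags(), chflagsat() and lchflags() are thin wrappers around
+ * kern_chflagsat(); they differ only in the directory descriptor
+ * (AT_FDCWD for the non-*at forms) and in whether AT_SYMLINK_NOFOLLOW
+ * is set, which selects NOFOLLOW instead of FOLLOW for the lookup.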
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + const char *path; + u_long flags; +}; +#endif +int +sys_chflags(struct thread *td, struct chflags_args *uap) +{ + + return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->flags, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct chflagsat_args { + int fd; + const char *path; + u_long flags; + int atflag; +} +#endif +int +sys_chflagsat(struct thread *td, struct chflagsat_args *uap) +{ + int fd = uap->fd; + const char *path = uap->path; + u_long flags = uap->flags; + int atflag = uap->atflag; + + if (atflag & ~AT_SYMLINK_NOFOLLOW) + return (EINVAL); + + return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag)); +} + +/* + * Same as chflags() but doesn't follow symlinks. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchflags_args { + const char *path; + u_long flags; +}; +#endif +int +sys_lchflags(struct thread *td, struct lchflags_args *uap) +{ + + return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->flags, AT_SYMLINK_NOFOLLOW)); +} + +static int +kern_chflagsat(struct thread *td, int fd, const char *path, + enum uio_seg pathseg, u_long flags, int atflag) +{ + struct nameidata nd; + int error, follow; + + AUDIT_ARG_FFLAGS(flags); + follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW; + NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd, + &cap_fchflags_rights, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfflags(td, nd.ni_vp, flags); + vrele(nd.ni_vp); + return (error); +} + +/* + * Change flags of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + u_long flags; +}; +#endif +int +sys_fchflags(struct thread *td, struct fchflags_args *uap) +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_FFLAGS(uap->flags); + error = getvnode(td, uap->fd, &cap_fchflags_rights, + &fp); + if (error != 0) + return (error); +#ifdef AUDIT + vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(fp->f_vnode); + VOP_UNLOCK(fp->f_vnode, 0); +#endif + error = setfflags(td, fp->f_vnode, uap->flags); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation code for chmod(), lchmod() and fchmod(). + */ +int +setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode) +{ + struct mount *mp; + struct vattr vattr; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + VATTR_NULL(&vattr); + vattr.va_mode = mode & ALLPERMS; +#ifdef MAC + error = mac_vnode_check_setmode(cred, vp, vattr.va_mode); + if (error == 0) +#endif + error = VOP_SETATTR(vp, &vattr, cred); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +/* + * Change mode of a file given path name. 
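+ *
+ * As with the chflags() family above, chmod(), fchmodat() and lchmod()
+ * all funnel into kern_fchmodat(); setfmode() masks the requested mode
+ * with ALLPERMS before handing it to VOP_SETATTR().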
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif +int +sys_chmod(struct thread *td, struct chmod_args *uap) +{ + + return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->mode, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct fchmodat_args { + int dirfd; + char *path; + mode_t mode; + int flag; +} +#endif +int +sys_fchmodat(struct thread *td, struct fchmodat_args *uap) +{ + int flag = uap->flag; + int fd = uap->fd; + char *path = uap->path; + mode_t mode = uap->mode; + + if (flag & ~AT_SYMLINK_NOFOLLOW) + return (EINVAL); + + return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag)); +} + +/* + * Change mode of a file given path name (don't follow links.) + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchmod_args { + char *path; + int mode; +}; +#endif +int +sys_lchmod(struct thread *td, struct lchmod_args *uap) +{ + + return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->mode, AT_SYMLINK_NOFOLLOW)); +} + +int +kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + mode_t mode, int flag) +{ + struct nameidata nd; + int error, follow; + + AUDIT_ARG_MODE(mode); + follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW; + NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd, + &cap_fchmod_rights, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfmode(td, td->td_ucred, nd.ni_vp, mode); + vrele(nd.ni_vp); + return (error); +} + +/* + * Change mode of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif +int +sys_fchmod(struct thread *td, struct fchmod_args *uap) +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_MODE(uap->mode); + + error = fget(td, uap->fd, &cap_fchmod_rights, &fp); + if (error != 0) + return (error); + error = fo_chmod(fp, uap->mode, td->td_ucred, td); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation for chown(), lchown(), and fchown() + */ +int +setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, + gid_t gid) +{ + struct mount *mp; + struct vattr vattr; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + VATTR_NULL(&vattr); + vattr.va_uid = uid; + vattr.va_gid = gid; +#ifdef MAC + error = mac_vnode_check_setowner(cred, vp, vattr.va_uid, + vattr.va_gid); + if (error == 0) +#endif + error = VOP_SETATTR(vp, &vattr, cred); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +/* + * Set ownership given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif +int +sys_chown(struct thread *td, struct chown_args *uap) +{ + + return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, + uap->gid, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct fchownat_args { + int fd; + const char * path; + uid_t uid; + gid_t gid; + int flag; +}; +#endif +int +sys_fchownat(struct thread *td, struct fchownat_args *uap) +{ + int flag; + + flag = uap->flag; + if (flag & ~AT_SYMLINK_NOFOLLOW) + return (EINVAL); + + return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid, + uap->gid, uap->flag)); +} + +int +kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + int uid, int gid, int flag) +{ + struct nameidata nd; + int error, follow; + + AUDIT_ARG_OWNER(uid, gid); + follow = (flag & AT_SYMLINK_NOFOLLOW) ? 
NOFOLLOW : FOLLOW; + NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd, + &cap_fchown_rights, td); + + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a path name, do not cross symlinks. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchown_args { + char *path; + int uid; + int gid; +}; +#endif +int +sys_lchown(struct thread *td, struct lchown_args *uap) +{ + + return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW)); +} + +/* + * Set ownership given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchown_args { + int fd; + int uid; + int gid; +}; +#endif +int +sys_fchown(struct thread *td, struct fchown_args *uap) +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_OWNER(uap->uid, uap->gid); + error = fget(td, uap->fd, &cap_fchown_rights, &fp); + if (error != 0) + return (error); + error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation code for utimes(), lutimes(), and futimes(). + */ +static int +getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg, + struct timespec *tsp) +{ + struct timeval tv[2]; + const struct timeval *tvp; + int error; + + if (usrtvp == NULL) { + vfs_timestamp(&tsp[0]); + tsp[1] = tsp[0]; + } else { + if (tvpseg == UIO_SYSSPACE) { + tvp = usrtvp; + } else { + if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) + return (error); + tvp = tv; + } + + if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 || + tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000) + return (EINVAL); + TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); + TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); + } + return (0); +} + +/* + * Common implementation code for futimens(), utimensat(). + */ +#define UTIMENS_NULL 0x1 +#define UTIMENS_EXIT 0x2 +static int +getutimens(const struct timespec *usrtsp, enum uio_seg tspseg, + struct timespec *tsp, int *retflags) +{ + struct timespec tsnow; + int error; + + vfs_timestamp(&tsnow); + *retflags = 0; + if (usrtsp == NULL) { + tsp[0] = tsnow; + tsp[1] = tsnow; + *retflags |= UTIMENS_NULL; + return (0); + } + if (tspseg == UIO_SYSSPACE) { + tsp[0] = usrtsp[0]; + tsp[1] = usrtsp[1]; + } else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0) + return (error); + if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT) + *retflags |= UTIMENS_EXIT; + if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW) + *retflags |= UTIMENS_NULL; + if (tsp[0].tv_nsec == UTIME_OMIT) + tsp[0].tv_sec = VNOVAL; + else if (tsp[0].tv_nsec == UTIME_NOW) + tsp[0] = tsnow; + else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L) + return (EINVAL); + if (tsp[1].tv_nsec == UTIME_OMIT) + tsp[1].tv_sec = VNOVAL; + else if (tsp[1].tv_nsec == UTIME_NOW) + tsp[1] = tsnow; + else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L) + return (EINVAL); + + return (0); +} + +/* + * Common implementation code for utimes(), lutimes(), futimes(), futimens(), + * and utimensat(). 
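+ *
+ * setutimes() takes ts[0] as the access time and ts[1] as the
+ * modification time.  A third timespec (numtimes > 2) sets the birth
+ * time explicitly; otherwise the birth time is pulled back whenever
+ * the new modification time predates it.  The nullflag argument
+ * records the "times == NULL" case by setting VA_UTIMES_NULL in the
+ * vattr passed to VOP_SETATTR().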
+ */ +static int +setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts, + int numtimes, int nullflag) +{ + struct mount *mp; + struct vattr vattr; + int error, setbirthtime; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + setbirthtime = 0; + if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) && + timespeccmp(&ts[1], &vattr.va_birthtime, < )) + setbirthtime = 1; + VATTR_NULL(&vattr); + vattr.va_atime = ts[0]; + vattr.va_mtime = ts[1]; + if (setbirthtime) + vattr.va_birthtime = ts[1]; + if (numtimes > 2) + vattr.va_birthtime = ts[2]; + if (nullflag) + vattr.va_vaflags |= VA_UTIMES_NULL; +#ifdef MAC + error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime, + vattr.va_mtime); +#endif + if (error == 0) + error = VOP_SETATTR(vp, &vattr, td->td_ucred); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +#endif +int +sys_utimes(struct thread *td, struct utimes_args *uap) +{ + + return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->tptr, UIO_USERSPACE)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct futimesat_args { + int fd; + const char * path; + const struct timeval * times; +}; +#endif +int +sys_futimesat(struct thread *td, struct futimesat_args *uap) +{ + + return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE, + uap->times, UIO_USERSPACE)); +} + +int +kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + struct timeval *tptr, enum uio_seg tptrseg) +{ + struct nameidata nd; + struct timespec ts[2]; + int error; + + if ((error = getutimes(tptr, tptrseg, ts)) != 0) + return (error); + NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, + &cap_futimes_rights, td); + + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lutimes_args { + char *path; + struct timeval *tptr; +}; +#endif +int +sys_lutimes(struct thread *td, struct lutimes_args *uap) +{ + + return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, + UIO_USERSPACE)); +} + +int +kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg, + struct timeval *tptr, enum uio_seg tptrseg) +{ + struct timespec ts[2]; + struct nameidata nd; + int error; + + if ((error = getutimes(tptr, tptrseg, ts)) != 0) + return (error); + NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. 
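+ *
+ * kern_futimes() below uses the older timeval interface via
+ * getutimes(), while kern_futimens() goes through getutimens(), which
+ * understands UTIME_NOW and UTIME_OMIT and turns the call into a no-op
+ * (UTIMENS_EXIT) when both tv_nsec fields are UTIME_OMIT.  For
+ * example, tv_nsec values of { UTIME_OMIT, UTIME_NOW } leave the
+ * access time alone and set the modification time to "now".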
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct futimes_args { + int fd; + struct timeval *tptr; +}; +#endif +int +sys_futimes(struct thread *td, struct futimes_args *uap) +{ + + return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); +} + +int +kern_futimes(struct thread *td, int fd, struct timeval *tptr, + enum uio_seg tptrseg) +{ + struct timespec ts[2]; + struct file *fp; + int error; + + AUDIT_ARG_FD(fd); + error = getutimes(tptr, tptrseg, ts); + if (error != 0) + return (error); + error = getvnode(td, fd, &cap_futimes_rights, &fp); + if (error != 0) + return (error); +#ifdef AUDIT + vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(fp->f_vnode); + VOP_UNLOCK(fp->f_vnode, 0); +#endif + error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); + fdrop(fp, td); + return (error); +} + +int +sys_futimens(struct thread *td, struct futimens_args *uap) +{ + + return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE)); +} + +int +kern_futimens(struct thread *td, int fd, struct timespec *tptr, + enum uio_seg tptrseg) +{ + struct timespec ts[2]; + struct file *fp; + int error, flags; + + AUDIT_ARG_FD(fd); + error = getutimens(tptr, tptrseg, ts, &flags); + if (error != 0) + return (error); + if (flags & UTIMENS_EXIT) + return (0); + error = getvnode(td, fd, &cap_futimes_rights, &fp); + if (error != 0) + return (error); +#ifdef AUDIT + vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(fp->f_vnode); + VOP_UNLOCK(fp->f_vnode, 0); +#endif + error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL); + fdrop(fp, td); + return (error); +} + +int +sys_utimensat(struct thread *td, struct utimensat_args *uap) +{ + + return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE, + uap->times, UIO_USERSPACE, uap->flag)); +} + +int +kern_utimensat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + struct timespec *tptr, enum uio_seg tptrseg, int flag) +{ + struct nameidata nd; + struct timespec ts[2]; + int error, flags; + + if (flag & ~AT_SYMLINK_NOFOLLOW) + return (EINVAL); + + if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0) + return (error); + NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : + FOLLOW) | AUDITVNODE1, pathseg, path, fd, + &cap_futimes_rights, td); + if ((error = namei(&nd)) != 0) + return (error); + /* + * We are allowed to call namei() regardless of 2xUTIME_OMIT. + * POSIX states: + * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected." + * "Search permission is denied by a component of the path prefix." + */ + NDFREE(&nd, NDF_ONLY_PNBUF); + if ((flags & UTIMENS_EXIT) == 0) + error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Truncate a file given its path name. 
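+ *
+ * kern_truncate() rejects a negative length with EINVAL, write-locks
+ * the whole byte range of the file before starting the write sequence,
+ * returns EISDIR for directories, and otherwise sets va_size through
+ * VOP_SETATTR() once vn_writechk() and a VWRITE access check succeed.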
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif +int +sys_truncate(struct thread *td, struct truncate_args *uap) +{ + + return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); +} + +int +kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length) +{ + struct mount *mp; + struct vnode *vp; + void *rl_cookie; + struct vattr vattr; + struct nameidata nd; + int error; + + if (length < 0) + return(EINVAL); + NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vn_rangelock_unlock(vp, rl_cookie); + vrele(vp); + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (vp->v_type == VDIR) + error = EISDIR; +#ifdef MAC + else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) { + } +#endif + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = length; + error = VOP_SETATTR(vp, &vattr, td->td_ucred); + } + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + vn_rangelock_unlock(vp, rl_cookie); + vrele(vp); + return (error); +} + +#if defined(COMPAT_43) +/* + * Truncate a file given its path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif +int +otruncate(struct thread *td, struct otruncate_args *uap) +{ + + return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_FREEBSD6) +/* Versions with the pad argument */ +int +freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap) +{ + + return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); +} + +int +freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap) +{ + + return (kern_ftruncate(td, uap->fd, uap->length)); +} +#endif + +int +kern_fsync(struct thread *td, int fd, bool fullsync) +{ + struct vnode *vp; + struct mount *mp; + struct file *fp; + int error, lock_flags; + + AUDIT_ARG_FD(fd); + error = getvnode(td, fd, &cap_fsync_rights, &fp); + if (error != 0) + return (error); + vp = fp->f_vnode; +#if 0 + if (!fullsync) + /* XXXKIB: compete outstanding aio writes */; +#endif + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + goto drop; + if (MNT_SHARED_WRITES(mp) || + ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) { + lock_flags = LK_SHARED; + } else { + lock_flags = LK_EXCLUSIVE; + } + vn_lock(vp, lock_flags | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + if (vp->v_object != NULL) { + VM_OBJECT_WLOCK(vp->v_object); + vm_object_page_clean(vp->v_object, 0, 0, 0); + VM_OBJECT_WUNLOCK(vp->v_object); + } + error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); +drop: + fdrop(fp, td); + return (error); +} + +/* + * Sync an open file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +int +sys_fsync(struct thread *td, struct fsync_args *uap) +{ + + return (kern_fsync(td, uap->fd, true)); +} + +int +sys_fdatasync(struct thread *td, struct fdatasync_args *uap) +{ + + return (kern_fsync(td, uap->fd, false)); +} + +/* + * Rename files. Source and destination must either both be directories, or + * both not be directories. If target is a directory, it must be empty. 
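+ *
+ * kern_renameat() looks up both names and retries from scratch when
+ * the filesystem cannot be started for writing without sleeping.  It
+ * enforces the directory/non-directory pairing above (ENOTDIR or
+ * EISDIR) and, when the target already exists and was resolved
+ * relative to newfd, additionally requires CAP_UNLINKAT on that
+ * descriptor.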
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif +int +sys_rename(struct thread *td, struct rename_args *uap) +{ + + return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD, + uap->to, UIO_USERSPACE)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct renameat_args { + int oldfd; + char *old; + int newfd; + char *new; +}; +#endif +int +sys_renameat(struct thread *td, struct renameat_args *uap) +{ + + return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new, + UIO_USERSPACE)); +} + +int +kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new, + enum uio_seg pathseg) +{ + struct mount *mp = NULL; + struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + +again: + bwillwrite(); +#ifdef MAC + NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | + AUDITVNODE1, pathseg, old, oldfd, + &cap_renameat_source_rights, td); +#else + NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1, + pathseg, old, oldfd, + &cap_renameat_source_rights, td); +#endif + + if ((error = namei(&fromnd)) != 0) + return (error); +#ifdef MAC + error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp, + fromnd.ni_vp, &fromnd.ni_cnd); + VOP_UNLOCK(fromnd.ni_dvp, 0); + if (fromnd.ni_dvp != fromnd.ni_vp) + VOP_UNLOCK(fromnd.ni_vp, 0); +#endif + fvp = fromnd.ni_vp; + NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | + SAVESTART | AUDITVNODE2, pathseg, new, newfd, + &cap_renameat_target_rights, td); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + if ((error = namei(&tond)) != 0) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + NDFREE(&fromnd, NDF_ONLY_PNBUF); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + error = vn_start_write(fvp, &mp, V_NOWAIT); + if (error != 0) { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + if (tvp != NULL) + vput(tvp); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + vrele(fromnd.ni_dvp); + vrele(fvp); + vrele(tond.ni_startdir); + if (fromnd.ni_startdir != NULL) + vrele(fromnd.ni_startdir); + error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); + if (error != 0) + return (error); + goto again; + } + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } +#ifdef CAPABILITIES + if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) { + /* + * If the target already exists we require CAP_UNLINKAT + * from 'newfd', when newfd was used for the lookup. + */ + error = cap_check(&tond.ni_filecaps.fc_rights, + &cap_unlinkat_rights); + if (error != 0) + goto out; + } +#endif + } + if (fvp == tdvp) { + error = EINVAL; + goto out; + } + /* + * If the source is the same as the destination (that is, if they + * are links to the same vnode), then there is nothing to do. 
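+ *
+ * The error value of -1 assigned below is a sentinel rather than an
+ * errno: it skips the VOP_RENAME() call and is translated back to 0
+ * after the out1 label.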
+ */ + if (fvp == tvp) + error = -1; +#ifdef MAC + else + error = mac_vnode_check_rename_to(td->td_ucred, tdvp, + tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); +#endif +out: + if (error == 0) { + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + } else { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + if (tvp != NULL) + vput(tvp); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + vn_finished_write(mp); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif +int +sys_mkdir(struct thread *td, struct mkdir_args *uap) +{ + + return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->mode)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct mkdirat_args { + int fd; + char *path; + mode_t mode; +}; +#endif +int +sys_mkdirat(struct thread *td, struct mkdirat_args *uap) +{ + + return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); +} + +int +kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg, + int mode) +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + struct nameidata nd; + int error; + + AUDIT_ARG_MODE(mode); +restart: + bwillwrite(); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | + NOCACHE, segflg, path, fd, &cap_mkdirat_rights, + td); + nd.ni_cnd.cn_flags |= WILLBEDIR; + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + /* + * XXX namei called with LOCKPARENT but not LOCKLEAF has + * the strange behaviour of leaving the vnode unlocked + * if the target is the same vnode as the parent. + */ + if (vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; +#ifdef MAC + error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, + &vattr); + if (error != 0) + goto out; +#endif + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); +#ifdef MAC +out: +#endif + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (error == 0) + vput(nd.ni_vp); + vn_finished_write(mp); + return (error); +} + +/* + * Remove a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif +int +sys_rmdir(struct thread *td, struct rmdir_args *uap) +{ + + return (kern_rmdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE)); +} + +int +kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg) +{ + struct mount *mp; + struct vnode *vp; + struct nameidata nd; + int error; + +restart: + bwillwrite(); + NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1, + pathseg, path, fd, &cap_unlinkat_rights, td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. 
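+ *
+ * A path whose final component is "." comes back from namei() with
+ * ni_dvp equal to ni_vp, which is what the check below catches.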
+ */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_vflag & VV_ROOT) { + error = EBUSY; + goto out; + } +#ifdef MAC + error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, + &nd.ni_cnd); + if (error != 0) + goto out; +#endif + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + vn_finished_write(mp); +out: + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) +int +freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count, + long *basep, void (*func)(struct freebsd11_dirent *)) +{ + struct freebsd11_dirent dstdp; + struct dirent *dp, *edp; + char *dirbuf; + off_t base; + ssize_t resid, ucount; + int error; + + /* XXX arbitrary sanity limit on `count'. */ + count = min(count, 64 * 1024); + + dirbuf = malloc(count, M_TEMP, M_WAITOK); + + error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid, + UIO_SYSSPACE); + if (error != 0) + goto done; + if (basep != NULL) + *basep = base; + + ucount = 0; + for (dp = (struct dirent *)dirbuf, + edp = (struct dirent *)&dirbuf[count - resid]; + ucount < count && dp < edp; ) { + if (dp->d_reclen == 0) + break; + MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0)); + if (dp->d_namlen >= sizeof(dstdp.d_name)) + continue; + dstdp.d_type = dp->d_type; + dstdp.d_namlen = dp->d_namlen; + dstdp.d_fileno = dp->d_fileno; /* truncate */ + if (dstdp.d_fileno != dp->d_fileno) { + switch (ino64_trunc_error) { + default: + case 0: + break; + case 1: + error = EOVERFLOW; + goto done; + case 2: + dstdp.d_fileno = UINT32_MAX; + break; + } + } + dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) + + ((dp->d_namlen + 1 + 3) &~ 3); + bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); + bzero(dstdp.d_name + dstdp.d_namlen, + dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) - + dstdp.d_namlen); + MPASS(dstdp.d_reclen <= dp->d_reclen); + MPASS(ucount + dstdp.d_reclen <= count); + if (func != NULL) + func(&dstdp); + error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen); + if (error != 0) + break; + dp = (struct dirent *)((char *)dp + dp->d_reclen); + ucount += dstdp.d_reclen; + } + +done: + free(dirbuf, M_TEMP); + if (error == 0) + td->td_retval[0] = ucount; + return (error); +} +#endif /* COMPAT */ + +#ifdef COMPAT_43 +static void +ogetdirentries_cvt(struct freebsd11_dirent *dp) +{ +#if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of dp->d_namlen is our dp->d_type. + * The high MBZ byte of dp->d_namlen is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +#else + /* + * The dp->d_type is the high byte of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +#endif +} + +/* + * Read a block of directory entries in a filesystem independent format. 
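+ *
+ * The historic ogetdirentries() is layered on the FreeBSD 11 compat
+ * path: freebsd11_kern_getdirentries() converts the 64-bit dirents
+ * into the old 32-bit layout, and ogetdirentries_cvt() then adjusts
+ * d_type/d_namlen as described above for old COMPAT_43 consumers.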
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +ogetdirentries(struct thread *td, struct ogetdirentries_args *uap) +{ + long loff; + int error; + + error = kern_ogetdirentries(td, uap, &loff); + if (error == 0) + error = copyout(&loff, uap->basep, sizeof(long)); + return (error); +} + +int +kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap, + long *ploff) +{ + long base; + int error; + + /* XXX arbitrary sanity limit on `count'. */ + if (uap->count > 64 * 1024) + return (EINVAL); + + error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, + &base, ogetdirentries_cvt); + + if (error == 0 && uap->basep != NULL) + error = copyout(&base, uap->basep, sizeof(long)); + + return (error); +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_FREEBSD11) +#ifndef _SYS_SYSPROTO_H_ +struct freebsd11_getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +freebsd11_getdirentries(struct thread *td, + struct freebsd11_getdirentries_args *uap) +{ + long base; + int error; + + error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, + &base, NULL); + + if (error == 0 && uap->basep != NULL) + error = copyout(&base, uap->basep, sizeof(long)); + return (error); +} + +int +freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap) +{ + struct freebsd11_getdirentries_args ap; + + ap.fd = uap->fd; + ap.buf = uap->buf; + ap.count = uap->count; + ap.basep = NULL; + return (freebsd11_getdirentries(td, &ap)); +} +#endif /* COMPAT_FREEBSD11 */ + +/* + * Read a block of directory entries in a filesystem independent format. + */ +int +sys_getdirentries(struct thread *td, struct getdirentries_args *uap) +{ + off_t base; + int error; + + error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, + NULL, UIO_USERSPACE); + if (error != 0) + return (error); + if (uap->basep != NULL) + error = copyout(&base, uap->basep, sizeof(off_t)); + return (error); +} + +int +kern_getdirentries(struct thread *td, int fd, char *buf, size_t count, + off_t *basep, ssize_t *residp, enum uio_seg bufseg) +{ + struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + off_t loff; + int error, eofflag; + off_t foffset; + + AUDIT_ARG_FD(fd); + if (count > IOSIZE_MAX) + return (EINVAL); + auio.uio_resid = count; + error = getvnode(td, fd, &cap_read_rights, &fp); + if (error != 0) + return (error); + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); + return (EBADF); + } + vp = fp->f_vnode; + foffset = foffset_lock(fp, 0); +unionread: + if (vp->v_type != VDIR) { + error = EINVAL; + goto fail; + } + aiov.iov_base = buf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = bufseg; + auio.uio_td = td; + vn_lock(vp, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + loff = auio.uio_offset = foffset; +#ifdef MAC + error = mac_vnode_check_readdir(td->td_ucred, vp); + if (error == 0) +#endif + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, + NULL); + foffset = auio.uio_offset; + if (error != 0) { + VOP_UNLOCK(vp, 0); + goto fail; + } + if (count == auio.uio_resid && + (vp->v_vflag & VV_ROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_vnode = vp; + fp->f_data = vp; + foffset = 0; + vput(tvp); + goto unionread; + } + VOP_UNLOCK(vp, 0); + *basep = loff; + if (residp != NULL) + *residp = 
auio.uio_resid; + td->td_retval[0] = count - auio.uio_resid; +fail: + foffset_unlock(fp, foffset, 0); + fdrop(fp, td); + return (error); +} + +/* + * Set the mode mask for creation of filesystem nodes. + */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif +int +sys_umask(struct thread *td, struct umask_args *uap) +{ + struct filedesc *fdp; + + fdp = td->td_proc->p_fd; + FILEDESC_XLOCK(fdp); + td->td_retval[0] = fdp->fd_cmask; + fdp->fd_cmask = uap->newmask & ALLPERMS; + FILEDESC_XUNLOCK(fdp); + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem away from + * vnode. + */ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif +int +sys_revoke(struct thread *td, struct revoke_args *uap) +{ + struct vnode *vp; + struct vattr vattr; + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, + uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (vp->v_type != VCHR || vp->v_rdev == NULL) { + error = EINVAL; + goto out; + } +#ifdef MAC + error = mac_vnode_check_revoke(td->td_ucred, vp); + if (error != 0) + goto out; +#endif + error = VOP_GETATTR(vp, &vattr, td->td_ucred); + if (error != 0) + goto out; + if (td->td_ucred->cr_uid != vattr.va_uid) { + error = priv_check(td, PRIV_VFS_ADMIN); + if (error != 0) + goto out; + } + if (vcount(vp) > 1) + VOP_REVOKE(vp, REVOKEALL); +out: + vput(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry and check that, if it + * is a capability, the correct rights are present. A reference on the file + * entry is held upon returning. + */ +int +getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) +{ + struct file *fp; + int error; + + error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL); + if (error != 0) + return (error); + + /* + * The file could be not of the vnode type, or it may be not + * yet fully initialized, in which case the f_vnode pointer + * may be set, but f_ops is still badfileops. E.g., + * devfs_open() transiently create such situation to + * facilitate csw d_fdopen(). + * + * Dupfdopen() handling in kern_openat() installs the + * half-baked file into the process descriptor table, allowing + * other thread to dereference it. Guard against the race by + * checking f_ops. + */ + if (fp->f_vnode == NULL || fp->f_ops == &badfileops) { + fdrop(fp, td); + return (EINVAL); + } + *fpp = fp; + return (0); +} + + +/* + * Get an (NFS) file handle. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lgetfh_args { + char *fname; + fhandle_t *fhp; +}; +#endif +int +sys_lgetfh(struct thread *td, struct lgetfh_args *uap) +{ + + return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname, + UIO_USERSPACE, uap->fhp)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getfh_args { + char *fname; + fhandle_t *fhp; +}; +#endif +int +sys_getfh(struct thread *td, struct getfh_args *uap) +{ + + return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE, + uap->fhp)); +} + +/* + * syscall for the rpc.lockd to use to translate an open descriptor into + * a NFS file handle. + * + * warning: do not remove the priv_check() call or this becomes one giant + * security hole. 
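+ *
+ * getfh(), lgetfh() and getfhat() all end up in kern_getfhat(), which
+ * insists on PRIV_VFS_GETFH and builds the handle from the mount's
+ * f_fsid plus the fid returned by VOP_VPTOFH().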
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getfhat_args { + int fd; + char *path; + fhandle_t *fhp; + int flags; +}; +#endif +int +sys_getfhat(struct thread *td, struct getfhat_args *uap) +{ + + if ((uap->flags & ~(AT_SYMLINK_NOFOLLOW)) != 0) + return (EINVAL); + return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE, + uap->fhp)); +} + +static int +kern_getfhat(struct thread *td, int flags, int fd, const char *path, + enum uio_seg pathseg, fhandle_t *fhp) +{ + struct nameidata nd; + fhandle_t fh; + struct vnode *vp; + int error; + + error = priv_check(td, PRIV_VFS_GETFH); + if (error != 0) + return (error); + NDINIT_AT(&nd, LOOKUP, ((flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW : + FOLLOW) | /*((flags & AT_BENEATH) != 0 ? BENEATH : 0) |*/ LOCKLEAF | + AUDITVNODE1, pathseg, path, fd, td); + error = namei(&nd); + if (error != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + bzero(&fh, sizeof(fh)); + fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; + error = VOP_VPTOFH(vp, &fh.fh_fid); + vput(vp); + if (error == 0) + error = copyout(&fh, fhp, sizeof (fh)); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct fhlink_args { + fhandle_t *fhp; + const char *to; +}; +#endif +int +sys_fhlink(struct thread *td, struct fhlink_args *uap) +{ + + return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct fhlinkat_args { + fhandle_t *fhp; + int tofd; + const char *to; +}; +#endif +int +sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap) +{ + + return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp)); +} + +static int +kern_fhlinkat(struct thread *td, int fd, const char *path, + enum uio_seg pathseg, fhandle_t *fhp) +{ + fhandle_t fh; + struct mount *mp; + struct vnode *vp; + int error; + + error = priv_check(td, PRIV_VFS_GETFH); + if (error != 0) + return (error); + error = copyin(fhp, &fh, sizeof(fh)); + if (error != 0) + return (error); + do { + bwillwrite(); + if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); + vfs_unbusy(mp); + if (error != 0) + return (error); + VOP_UNLOCK(vp, 0); + } while ((error = kern_linkat_vp(td, vp, fd, path, pathseg)) == EAGAIN); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct fhreadlink_args { + fhandle_t *fhp; + char *buf; + size_t bufsize; +}; +#endif +int +sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap) +{ + fhandle_t fh; + struct mount *mp; + struct vnode *vp; + int error; + + error = priv_check(td, PRIV_VFS_GETFH); + if (error != 0) + return (error); + if (uap->bufsize > IOSIZE_MAX) + return (EINVAL); + error = copyin(uap->fhp, &fh, sizeof(fh)); + if (error != 0) + return (error); + if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); + vfs_unbusy(mp); + if (error != 0) + return (error); + error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td); + vput(vp); + return (error); +} + +/* + * syscall for the rpc.lockd to use to translate a NFS file handle into an + * open descriptor. + * + * warning: do not remove the priv_check() call or this becomes one giant + * security hole. 
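+ *
+ * fhopen() requires PRIV_VFS_FHOPEN, refuses O_CREAT as well as opens
+ * that ask for neither FREAD nor FWRITE, and otherwise resolves the
+ * handle with VFS_FHTOVP() before going through the usual
+ * vn_open_vnode()/finstall() path.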
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fhopen_args { + const struct fhandle *u_fhp; + int flags; +}; +#endif +int +sys_fhopen(struct thread *td, struct fhopen_args *uap) +{ + struct mount *mp; + struct vnode *vp; + struct fhandle fhp; + struct file *fp; + int fmode, error; + int indx; + + error = priv_check(td, PRIV_VFS_FHOPEN); + if (error != 0) + return (error); + indx = -1; + fmode = FFLAGS(uap->flags); + /* why not allow a non-read/write open for our lockd? */ + if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) + return (EINVAL); + error = copyin(uap->u_fhp, &fhp, sizeof(fhp)); + if (error != 0) + return(error); + /* find the mount point */ + mp = vfs_busyfs(&fhp.fh_fsid); + if (mp == NULL) + return (ESTALE); + /* now give me my vnode, it gets returned to me locked */ + error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp); + vfs_unbusy(mp); + if (error != 0) + return (error); + + error = falloc_noinstall(td, &fp); + if (error != 0) { + vput(vp); + return (error); + } + /* + * An extra reference on `fp' has been held for us by + * falloc_noinstall(). + */ + +#ifdef INVARIANTS + td->td_dupfd = -1; +#endif + error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp); + if (error != 0) { + KASSERT(fp->f_ops == &badfileops, + ("VOP_OPEN in fhopen() set f_ops")); + KASSERT(td->td_dupfd < 0, + ("fhopen() encountered fdopen()")); + + vput(vp); + goto bad; + } +#ifdef INVARIANTS + td->td_dupfd = 0; +#endif + fp->f_vnode = vp; + fp->f_seqcount = 1; + finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp, + &vnops); + VOP_UNLOCK(vp, 0); + if ((fmode & O_TRUNC) != 0) { + error = fo_truncate(fp, 0, td->td_ucred, td); + if (error != 0) + goto bad; + } + + error = finstall(td, fp, &indx, fmode, NULL); +bad: + fdrop(fp, td); + td->td_retval[0] = indx; + return (error); +} + +/* + * Stat an (NFS) file handle. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fhstat_args { + struct fhandle *u_fhp; + struct stat *sb; +}; +#endif +int +sys_fhstat(struct thread *td, struct fhstat_args *uap) +{ + struct stat sb; + struct fhandle fh; + int error; + + error = copyin(uap->u_fhp, &fh, sizeof(fh)); + if (error != 0) + return (error); + error = kern_fhstat(td, fh, &sb); + if (error == 0) + error = copyout(&sb, uap->sb, sizeof(sb)); + return (error); +} + +int +kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb) +{ + struct mount *mp; + struct vnode *vp; + int error; + + error = priv_check(td, PRIV_VFS_FHSTAT); + if (error != 0) + return (error); + if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); + vfs_unbusy(mp); + if (error != 0) + return (error); + error = vn_stat(vp, sb, td->td_ucred, NOCRED, td); + vput(vp); + return (error); +} + +/* + * Implement fstatfs() for (NFS) file handles. 
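+ *
+ * kern_fhstatfs() busies the filesystem named by the handle, applies
+ * the prison and MAC visibility checks, pre-seeds f_version, f_namemax
+ * and f_flags, and then lets VFS_STATFS() fill in the rest.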
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fhstatfs_args { + struct fhandle *u_fhp; + struct statfs *buf; +}; +#endif +int +sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap) +{ + struct statfs *sfp; + fhandle_t fh; + int error; + + error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); + if (error != 0) + return (error); + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_fhstatfs(td, fh, sfp); + if (error == 0) + error = copyout(sfp, uap->buf, sizeof(*sfp)); + free(sfp, M_STATFS); + return (error); +} + +int +kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf) +{ + struct statfs *sp; + struct mount *mp; + struct vnode *vp; + int error; + + error = priv_check(td, PRIV_VFS_FHSTATFS); + if (error != 0) + return (error); + if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); + if (error != 0) { + vfs_unbusy(mp); + return (error); + } + vput(vp); + error = prison_canseemount(td->td_ucred, mp); + if (error != 0) + goto out; +#ifdef MAC + error = mac_mount_check_stat(td->td_ucred, mp); + if (error != 0) + goto out; +#endif + /* + * Set these in case the underlying filesystem fails to do so. + */ + sp = &mp->mnt_stat; + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = VFS_STATFS(mp, sp); + if (error == 0) + *buf = *sp; +out: + vfs_unbusy(mp); + return (error); +} + +int +kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len) +{ + struct file *fp; + struct mount *mp; + struct vnode *vp; + off_t olen, ooffset; + int error; +#ifdef AUDIT + int audited_vnode1 = 0; +#endif + + AUDIT_ARG_FD(fd); + if (offset < 0 || len <= 0) + return (EINVAL); + /* Check for wrap. */ + if (offset > OFF_MAX - len) + return (EFBIG); + AUDIT_ARG_FD(fd); + error = fget(td, fd, &cap_pwrite_rights, &fp); + if (error != 0) + return (error); + AUDIT_ARG_FILE(td->td_proc, fp); + if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { + error = ESPIPE; + goto out; + } + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + goto out; + } + if (fp->f_type != DTYPE_VNODE) { + error = ENODEV; + goto out; + } + vp = fp->f_vnode; + if (vp->v_type != VREG) { + error = ENODEV; + goto out; + } + + /* Allocating blocks may take a long time, so iterate. */ + for (;;) { + olen = len; + ooffset = offset; + + bwillwrite(); + mp = NULL; + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + break; + error = vn_lock(vp, LK_EXCLUSIVE); + if (error != 0) { + vn_finished_write(mp); + break; + } +#ifdef AUDIT + if (!audited_vnode1) { + AUDIT_ARG_VNODE1(vp); + audited_vnode1 = 1; + } +#endif +#ifdef MAC + error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); + if (error == 0) +#endif + error = VOP_ALLOCATE(vp, &offset, &len); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + + if (olen + ooffset != offset + len) { + panic("offset + len changed from %jx/%jx to %jx/%jx", + ooffset, olen, offset, len); + } + if (error != 0 || len == 0) + break; + KASSERT(olen > len, ("Iteration did not make progress?")); + maybe_yield(); + } + out: + fdrop(fp, td); + return (error); +} + +int +sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap) +{ + int error; + + error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len); + return (kern_posix_error(td, error)); +} + +/* + * Unlike madvise(2), we do not make a best effort to remember every + * possible caching hint. 
Instead, we remember the last setting with + * the exception that we will allow POSIX_FADV_NORMAL to adjust the + * region of any current setting. + */ +int +kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len, + int advice) +{ + struct fadvise_info *fa, *new; + struct file *fp; + struct vnode *vp; + off_t end; + int error; + + if (offset < 0 || len < 0 || offset > OFF_MAX - len) + return (EINVAL); + AUDIT_ARG_VALUE(advice); + switch (advice) { + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_NOREUSE: + new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK); + break; + case POSIX_FADV_NORMAL: + case POSIX_FADV_WILLNEED: + case POSIX_FADV_DONTNEED: + new = NULL; + break; + default: + return (EINVAL); + } + /* XXX: CAP_POSIX_FADVISE? */ + AUDIT_ARG_FD(fd); + error = fget(td, fd, &cap_no_rights, &fp); + if (error != 0) + goto out; + AUDIT_ARG_FILE(td->td_proc, fp); + if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { + error = ESPIPE; + goto out; + } + if (fp->f_type != DTYPE_VNODE) { + error = ENODEV; + goto out; + } + vp = fp->f_vnode; + if (vp->v_type != VREG) { + error = ENODEV; + goto out; + } + if (len == 0) + end = OFF_MAX; + else + end = offset + len - 1; + switch (advice) { + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_NOREUSE: + /* + * Try to merge any existing non-standard region with + * this new region if possible, otherwise create a new + * non-standard region for this request. + */ + mtx_pool_lock(mtxpool_sleep, fp); + fa = fp->f_advice; + if (fa != NULL && fa->fa_advice == advice && + ((fa->fa_start <= end && fa->fa_end >= offset) || + (end != OFF_MAX && fa->fa_start == end + 1) || + (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) { + if (offset < fa->fa_start) + fa->fa_start = offset; + if (end > fa->fa_end) + fa->fa_end = end; + } else { + new->fa_advice = advice; + new->fa_start = offset; + new->fa_end = end; + fp->f_advice = new; + new = fa; + } + mtx_pool_unlock(mtxpool_sleep, fp); + break; + case POSIX_FADV_NORMAL: + /* + * If a the "normal" region overlaps with an existing + * non-standard region, trim or remove the + * non-standard region. + */ + mtx_pool_lock(mtxpool_sleep, fp); + fa = fp->f_advice; + if (fa != NULL) { + if (offset <= fa->fa_start && end >= fa->fa_end) { + new = fa; + fp->f_advice = NULL; + } else if (offset <= fa->fa_start && + end >= fa->fa_start) + fa->fa_start = end + 1; + else if (offset <= fa->fa_end && end >= fa->fa_end) + fa->fa_end = offset - 1; + else if (offset >= fa->fa_start && end <= fa->fa_end) { + /* + * If the "normal" region is a middle + * portion of the existing + * non-standard region, just remove + * the whole thing rather than picking + * one side or the other to + * preserve. 
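+ *
+ * Reassigning "new" hands the removed fadvise_info to the
+ * free(new, M_FADVISE) at the end of the function (no region
+ * was allocated for POSIX_FADV_NORMAL, so nothing is lost).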
+ */ + new = fa; + fp->f_advice = NULL; + } + } + mtx_pool_unlock(mtxpool_sleep, fp); + break; + case POSIX_FADV_WILLNEED: + case POSIX_FADV_DONTNEED: + error = VOP_ADVISE(vp, offset, end, advice); + break; + } +out: + if (fp != NULL) + fdrop(fp, td); + free(new, M_FADVISE); + return (error); +} + +int +sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) +{ + int error; + + error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, + uap->advice); + return (kern_posix_error(td, error)); +} diff --git a/freebsd/sys/kern/vfs_vnops.c b/freebsd/sys/kern/vfs_vnops.c new file mode 100644 index 00000000..bdd6692d --- /dev/null +++ b/freebsd/sys/kern/vfs_vnops.c @@ -0,0 +1,2607 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Copyright (c) 2012 Konstantin Belousov + * Copyright (c) 2013, 2014 The FreeBSD Foundation + * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_hwpmc_hooks.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef HWPMC_HOOKS +#include +#endif + +static fo_rdwr_t vn_read; +static fo_rdwr_t vn_write; +static fo_rdwr_t vn_io_fault; +static fo_truncate_t vn_truncate; +static fo_ioctl_t vn_ioctl; +static fo_poll_t vn_poll; +static fo_kqfilter_t vn_kqfilter; +static fo_stat_t vn_statfile; +static fo_close_t vn_closefile; +static fo_mmap_t vn_mmap; + +struct fileops vnops = { + .fo_read = vn_io_fault, + .fo_write = vn_io_fault, + .fo_truncate = vn_truncate, + .fo_ioctl = vn_ioctl, + .fo_poll = vn_poll, + .fo_kqfilter = vn_kqfilter, + .fo_stat = vn_statfile, + .fo_close = vn_closefile, + .fo_chmod = vn_chmod, + .fo_chown = vn_chown, + .fo_sendfile = vn_sendfile, + .fo_seek = vn_seek, + .fo_fill_kinfo = vn_fill_kinfo, + .fo_mmap = vn_mmap, + .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE +}; + +static const int io_hold_cnt = 16; +static int vn_io_fault_enable = 1; +SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW, + &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance"); +static int vn_io_fault_prefault = 0; +SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW, + &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting"); +static u_long vn_io_faults_cnt; +SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD, + &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers"); + +/* + * Returns true if vn_io_fault mode of handling the i/o request should + * be used. + */ +static bool +do_vn_io_fault(struct vnode *vp, struct uio *uio) +{ + struct mount *mp; + + return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG && + (mp = vp->v_mount) != NULL && + (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable); +} + +/* + * Structure used to pass arguments to vn_io_fault1(), to do either + * file- or vnode-based I/O calls. + */ +struct vn_io_fault_args { + enum { + VN_IO_FAULT_FOP, + VN_IO_FAULT_VOP + } kind; + struct ucred *cred; + int flags; + union { + struct fop_args_tag { + struct file *fp; + fo_rdwr_t *doio; + } fop_args; + struct vop_args_tag { + struct vnode *vp; + } vop_args; + } args; +}; + +static int vn_io_fault1(struct vnode *vp, struct uio *uio, + struct vn_io_fault_args *args, struct thread *td); + +int +vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp) +{ + struct thread *td = ndp->ni_cnd.cn_thread; + + return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp)); +} + +/* + * Common code for vnode open operations via a name lookup. + * Lookup the vnode and invoke VOP_CREATE if needed. + * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. + * + * Note that this does NOT free nameidata for the successful case, + * due to the NDINIT being done elsewhere. 
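+ *
+ * On success the (possibly modified) open mode is passed back through
+ * *flagp, for instance with O_TRUNC cleared after a fresh create or
+ * O_CREAT cleared when the file already existed, and the locked vnode
+ * is left in ndp->ni_vp for the caller.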
+ */ +int +vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, + struct ucred *cred, struct file *fp) +{ + struct vnode *vp; + struct mount *mp; + struct thread *td = ndp->ni_cnd.cn_thread; + struct vattr vat; + struct vattr *vap = &vat; + int fmode, error; + +restart: + fmode = *flagp; + if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT | + O_EXCL | O_DIRECTORY)) + return (EINVAL); + else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) { + ndp->ni_cnd.cn_nameiop = CREATE; + /* + * Set NOCACHE to avoid flushing the cache when + * rolling in many files at once. + */ + ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE; + if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) + ndp->ni_cnd.cn_flags |= FOLLOW; + if (!(vn_open_flags & VN_OPEN_NOAUDIT)) + ndp->ni_cnd.cn_flags |= AUDITVNODE1; + if (vn_open_flags & VN_OPEN_NOCAPCHECK) + ndp->ni_cnd.cn_flags |= NOCAPCHECK; + if ((vn_open_flags & VN_OPEN_INVFS) == 0) + bwillwrite(); + if ((error = namei(ndp)) != 0) + return (error); + if (ndp->ni_vp == NULL) { + VATTR_NULL(vap); + vap->va_type = VREG; + vap->va_mode = cmode; + if (fmode & O_EXCL) + vap->va_vaflags |= VA_EXCLUSIVE; + if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(ndp, NDF_ONLY_PNBUF); + vput(ndp->ni_dvp); + if ((error = vn_start_write(NULL, &mp, + V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) + ndp->ni_cnd.cn_flags |= MAKEENTRY; +#ifdef MAC + error = mac_vnode_check_create(cred, ndp->ni_dvp, + &ndp->ni_cnd, vap); + if (error == 0) +#endif + error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, + &ndp->ni_cnd, vap); + vput(ndp->ni_dvp); + vn_finished_write(mp); + if (error) { + NDFREE(ndp, NDF_ONLY_PNBUF); + return (error); + } + fmode &= ~O_TRUNC; + vp = ndp->ni_vp; + } else { + if (ndp->ni_dvp == ndp->ni_vp) + vrele(ndp->ni_dvp); + else + vput(ndp->ni_dvp); + ndp->ni_dvp = NULL; + vp = ndp->ni_vp; + if (fmode & O_EXCL) { + error = EEXIST; + goto bad; + } + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } + fmode &= ~O_CREAT; + } + } else { + ndp->ni_cnd.cn_nameiop = LOOKUP; + ndp->ni_cnd.cn_flags = ISOPEN | + ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF; + if (!(fmode & FWRITE)) + ndp->ni_cnd.cn_flags |= LOCKSHARED; + if (!(vn_open_flags & VN_OPEN_NOAUDIT)) + ndp->ni_cnd.cn_flags |= AUDITVNODE1; + if (vn_open_flags & VN_OPEN_NOCAPCHECK) + ndp->ni_cnd.cn_flags |= NOCAPCHECK; + if ((error = namei(ndp)) != 0) + return (error); + vp = ndp->ni_vp; + } + error = vn_open_vnode(vp, fmode, cred, td, fp); + if (error) + goto bad; + *flagp = fmode; + return (0); +bad: + NDFREE(ndp, NDF_ONLY_PNBUF); + vput(vp); + *flagp = fmode; + ndp->ni_vp = NULL; + return (error); +} + +static int +vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp) +{ + struct flock lf; + int error, lock_flags, type; + + ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock"); + if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0) + return (0); + KASSERT(fp != NULL, ("open with flock requires fp")); + if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) + return (EOPNOTSUPP); + + lock_flags = VOP_ISLOCKED(vp); + VOP_UNLOCK(vp, 0); + + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = (fmode & O_EXLOCK) != 0 ? 
F_WRLCK : F_RDLCK; + type = F_FLOCK; + if ((fmode & FNONBLOCK) == 0) + type |= F_WAIT; + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type); + if (error == 0) + fp->f_flag |= FHASLOCK; + + vn_lock(vp, lock_flags | LK_RETRY); + if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) + error = ENOENT; + return (error); +} + +/* + * Common code for vnode open operations once a vnode is located. + * Check permissions, and call the VOP_OPEN routine. + */ +int +vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, + struct thread *td, struct file *fp) +{ + accmode_t accmode; + int error; + + if (vp->v_type == VLNK) + return (EMLINK); + if (vp->v_type == VSOCK) + return (EOPNOTSUPP); + if (vp->v_type != VDIR && fmode & O_DIRECTORY) + return (ENOTDIR); + accmode = 0; + if (fmode & (FWRITE | O_TRUNC)) { + if (vp->v_type == VDIR) + return (EISDIR); + accmode |= VWRITE; + } + if (fmode & FREAD) + accmode |= VREAD; + if (fmode & FEXEC) + accmode |= VEXEC; + if ((fmode & O_APPEND) && (fmode & FWRITE)) + accmode |= VAPPEND; +#ifdef MAC + if (fmode & O_CREAT) + accmode |= VCREAT; + if (fmode & O_VERIFY) + accmode |= VVERIFY; + error = mac_vnode_check_open(cred, vp, accmode); + if (error) + return (error); + + accmode &= ~(VCREAT | VVERIFY); +#endif + if ((fmode & O_CREAT) == 0 && accmode != 0) { + error = VOP_ACCESS(vp, accmode, cred, td); + if (error != 0) + return (error); + } + if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) + vn_lock(vp, LK_UPGRADE | LK_RETRY); + error = VOP_OPEN(vp, fmode, cred, td, fp); + if (error != 0) + return (error); + + error = vn_open_vnode_advlock(vp, fmode, fp); + if (error == 0 && (fmode & FWRITE) != 0) { + error = VOP_ADD_WRITECOUNT(vp, 1); + if (error == 0) { + CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", + __func__, vp, vp->v_writecount); + } + } + + /* + * Error from advlock or VOP_ADD_WRITECOUNT() still requires + * calling VOP_CLOSE() to pair with earlier VOP_OPEN(). + * Arrange for that by having fdrop() to use vn_closefile(). + */ + if (error != 0) { + fp->f_flag |= FOPENFAILED; + fp->f_vnode = vp; + if (fp->f_ops == &badfileops) { + fp->f_type = DTYPE_VNODE; + fp->f_ops = &vnops; + } + vref(vp); + } + + ASSERT_VOP_LOCKED(vp, "vn_open_vnode"); + return (error); + +} + +/* + * Check for write permissions on the specified vnode. + * Prototype text segments cannot be written. + * It is racy. + */ +int +vn_writechk(struct vnode *vp) +{ + + ASSERT_VOP_LOCKED(vp, "vn_writechk"); + /* + * If there's shared text associated with + * the vnode, try to free it up once. If + * we fail, we can't allow writing. 
+ */ + if (VOP_IS_TEXT(vp)) + return (ETXTBSY); + + return (0); +} + +/* + * Vnode close call + */ +static int +vn_close1(struct vnode *vp, int flags, struct ucred *file_cred, + struct thread *td, bool keep_ref) +{ + struct mount *mp; + int error, lock_flags; + + if (vp->v_type != VFIFO && (flags & FWRITE) == 0 && + MNT_EXTENDED_SHARED(vp->v_mount)) + lock_flags = LK_SHARED; + else + lock_flags = LK_EXCLUSIVE; + + vn_start_write(vp, &mp, V_WAIT); + vn_lock(vp, lock_flags | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) { + VOP_ADD_WRITECOUNT_CHECKED(vp, -1); + CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", + __func__, vp, vp->v_writecount); + } + error = VOP_CLOSE(vp, flags, file_cred, td); + if (keep_ref) + VOP_UNLOCK(vp, 0); + else + vput(vp); + vn_finished_write(mp); + return (error); +} + +int +vn_close(struct vnode *vp, int flags, struct ucred *file_cred, + struct thread *td) +{ + + return (vn_close1(vp, flags, file_cred, td, false)); +} + +/* + * Heuristic to detect sequential operation. + */ +static int +sequential_heuristic(struct uio *uio, struct file *fp) +{ + + ASSERT_VOP_LOCKED(fp->f_vnode, __func__); + if (fp->f_flag & FRDAHEAD) + return (fp->f_seqcount << IO_SEQSHIFT); + + /* + * Offset 0 is handled specially. open() sets f_seqcount to 1 so + * that the first I/O is normally considered to be slightly + * sequential. Seeking to offset 0 doesn't change sequentiality + * unless previous seeks have reduced f_seqcount to 0, in which + * case offset 0 is not special. + */ + if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || + uio->uio_offset == fp->f_nextoff) { + /* + * f_seqcount is in units of fixed-size blocks so that it + * depends mainly on the amount of sequential I/O and not + * much on the number of sequential I/O's. The fixed size + * of 16384 is hard-coded here since it is (not quite) just + * a magic size that works well here. This size is more + * closely related to the best I/O size for real disks than + * to any block size used by software. + */ + if (uio->uio_resid >= IO_SEQMAX * 16384) + fp->f_seqcount = IO_SEQMAX; + else { + fp->f_seqcount += howmany(uio->uio_resid, 16384); + if (fp->f_seqcount > IO_SEQMAX) + fp->f_seqcount = IO_SEQMAX; + } + return (fp->f_seqcount << IO_SEQSHIFT); + } + + /* Not sequential. Quickly draw-down sequentiality. */ + if (fp->f_seqcount > 1) + fp->f_seqcount = 1; + else + fp->f_seqcount = 0; + return (0); +} + +/* + * Package up an I/O request on a vnode into a uio and do it. 
+ */ +int +vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, + enum uio_seg segflg, int ioflg, struct ucred *active_cred, + struct ucred *file_cred, ssize_t *aresid, struct thread *td) +{ + struct uio auio; + struct iovec aiov; + struct mount *mp; + struct ucred *cred; + void *rl_cookie; + struct vn_io_fault_args args; + int error, lock_flags; + + if (offset < 0 && vp->v_type != VCHR) + return (EINVAL); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_td = td; + error = 0; + + if ((ioflg & IO_NODELOCKED) == 0) { + if ((ioflg & IO_RANGELOCKED) == 0) { + if (rw == UIO_READ) { + rl_cookie = vn_rangelock_rlock(vp, offset, + offset + len); + } else { + rl_cookie = vn_rangelock_wlock(vp, offset, + offset + len); + } + } else + rl_cookie = NULL; + mp = NULL; + if (rw == UIO_WRITE) { + if (vp->v_type != VCHR && + (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) + != 0) + goto out; + if (MNT_SHARED_WRITES(mp) || + ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) + lock_flags = LK_SHARED; + else + lock_flags = LK_EXCLUSIVE; + } else + lock_flags = LK_SHARED; + vn_lock(vp, lock_flags | LK_RETRY); + } else + rl_cookie = NULL; + + ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); +#ifdef MAC + if ((ioflg & IO_NOMACCHECK) == 0) { + if (rw == UIO_READ) + error = mac_vnode_check_read(active_cred, file_cred, + vp); + else + error = mac_vnode_check_write(active_cred, file_cred, + vp); + } +#endif + if (error == 0) { + if (file_cred != NULL) + cred = file_cred; + else + cred = active_cred; + if (do_vn_io_fault(vp, &auio)) { + args.kind = VN_IO_FAULT_VOP; + args.cred = cred; + args.flags = ioflg; + args.args.vop_args.vp = vp; + error = vn_io_fault1(vp, &auio, &args, td); + } else if (rw == UIO_READ) { + error = VOP_READ(vp, &auio, ioflg, cred); + } else /* if (rw == UIO_WRITE) */ { + error = VOP_WRITE(vp, &auio, ioflg, cred); + } + } + if (aresid) + *aresid = auio.uio_resid; + else + if (auio.uio_resid && error == 0) + error = EIO; + if ((ioflg & IO_NODELOCKED) == 0) { + VOP_UNLOCK(vp, 0); + if (mp != NULL) + vn_finished_write(mp); + } + out: + if (rl_cookie != NULL) + vn_rangelock_unlock(vp, rl_cookie); + return (error); +} + +/* + * Package up an I/O request on a vnode into a uio and do it. The I/O + * request is split up into smaller chunks and we try to avoid saturating + * the buffer cache while potentially holding a vnode locked, so we + * check bwillwrite() before calling vn_rdwr(). We also call kern_yield() + * to give other processes a chance to lock the vnode (either other processes + * core'ing the same binary, or unrelated processes scanning the directory). + */ +int +vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, + off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, + struct ucred *file_cred, size_t *aresid, struct thread *td) +{ + int error = 0; + ssize_t iaresid; + + do { + int chunk; + + /* + * Force `offset' to a multiple of MAXBSIZE except possibly + * for the first chunk, so that filesystems only need to + * write full blocks except possibly for the first and last + * chunks. 
+ */ + chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; + + if (chunk > len) + chunk = len; + if (rw != UIO_READ && vp->v_type == VREG) + bwillwrite(); + iaresid = 0; + error = vn_rdwr(rw, vp, base, chunk, offset, segflg, + ioflg, active_cred, file_cred, &iaresid, td); + len -= chunk; /* aresid calc already includes length */ + if (error) + break; + offset += chunk; + base = (char *)base + chunk; + kern_yield(PRI_USER); + } while (len); + if (aresid) + *aresid = len + iaresid; + return (error); +} + +off_t +foffset_lock(struct file *fp, int flags) +{ + struct mtx *mtxp; + off_t res; + + KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); + +#if OFF_MAX <= LONG_MAX + /* + * Caller only wants the current f_offset value. Assume that + * the long and shorter integer types reads are atomic. + */ + if ((flags & FOF_NOLOCK) != 0) + return (fp->f_offset); +#endif + + /* + * According to McKusick the vn lock was protecting f_offset here. + * It is now protected by the FOFFSET_LOCKED flag. + */ + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + if ((flags & FOF_NOLOCK) == 0) { + while (fp->f_vnread_flags & FOFFSET_LOCKED) { + fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; + msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + "vofflock", 0); + } + fp->f_vnread_flags |= FOFFSET_LOCKED; + } + res = fp->f_offset; + mtx_unlock(mtxp); + return (res); +} + +void +foffset_unlock(struct file *fp, off_t val, int flags) +{ + struct mtx *mtxp; + + KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); + +#if OFF_MAX <= LONG_MAX + if ((flags & FOF_NOLOCK) != 0) { + if ((flags & FOF_NOUPDATE) == 0) + fp->f_offset = val; + if ((flags & FOF_NEXTOFF) != 0) + fp->f_nextoff = val; + return; + } +#endif + + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + if ((flags & FOF_NOUPDATE) == 0) + fp->f_offset = val; + if ((flags & FOF_NEXTOFF) != 0) + fp->f_nextoff = val; + if ((flags & FOF_NOLOCK) == 0) { + KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0, + ("Lost FOFFSET_LOCKED")); + if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) + wakeup(&fp->f_vnread_flags); + fp->f_vnread_flags = 0; + } + mtx_unlock(mtxp); +} + +void +foffset_lock_uio(struct file *fp, struct uio *uio, int flags) +{ + + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = foffset_lock(fp, flags); +} + +void +foffset_unlock_uio(struct file *fp, struct uio *uio, int flags) +{ + + if ((flags & FOF_OFFSET) == 0) + foffset_unlock(fp, uio->uio_offset, flags); +} + +static int +get_advice(struct file *fp, struct uio *uio) +{ + struct mtx *mtxp; + int ret; + + ret = POSIX_FADV_NORMAL; + if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG) + return (ret); + + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + if (fp->f_advice != NULL && + uio->uio_offset >= fp->f_advice->fa_start && + uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) + ret = fp->f_advice->fa_advice; + mtx_unlock(mtxp); + return (ret); +} + +/* + * File table vnode read routine. 
+ */ +static int +vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) +{ + struct vnode *vp; + off_t orig_offset; + int error, ioflag; + int advice; + + KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", + uio->uio_td, td)); + KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); + vp = fp->f_vnode; + ioflag = 0; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; + advice = get_advice(fp, uio); + vn_lock(vp, LK_SHARED | LK_RETRY); + + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_NOREUSE: + ioflag |= sequential_heuristic(uio, fp); + break; + case POSIX_FADV_RANDOM: + /* Disable read-ahead for random I/O. */ + break; + } + orig_offset = uio->uio_offset; + +#ifdef MAC + error = mac_vnode_check_read(active_cred, fp->f_cred, vp); + if (error == 0) +#endif + error = VOP_READ(vp, uio, ioflag, fp->f_cred); + fp->f_nextoff = uio->uio_offset; + VOP_UNLOCK(vp, 0); + if (error == 0 && advice == POSIX_FADV_NOREUSE && + orig_offset != uio->uio_offset) + /* + * Use POSIX_FADV_DONTNEED to flush pages and buffers + * for the backing file after a POSIX_FADV_NOREUSE + * read(2). + */ + error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, + POSIX_FADV_DONTNEED); + return (error); +} + +/* + * File table vnode write routine. + */ +static int +vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) +{ + struct vnode *vp; + struct mount *mp; + off_t orig_offset; + int error, ioflag, lock_flags; + int advice; + + KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", + uio->uio_td, td)); + KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); + vp = fp->f_vnode; + if (vp->v_type == VREG) + bwillwrite(); + ioflag = IO_UNIT; + if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) + ioflag |= IO_APPEND; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; + if ((fp->f_flag & O_FSYNC) || + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) + ioflag |= IO_SYNC; + mp = NULL; + if (vp->v_type != VCHR && + (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto unlock; + + advice = get_advice(fp, uio); + + if (MNT_SHARED_WRITES(mp) || + (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) { + lock_flags = LK_SHARED; + } else { + lock_flags = LK_EXCLUSIVE; + } + + vn_lock(vp, lock_flags | LK_RETRY); + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_NOREUSE: + ioflag |= sequential_heuristic(uio, fp); + break; + case POSIX_FADV_RANDOM: + /* XXX: Is this correct? */ + break; + } + orig_offset = uio->uio_offset; + +#ifdef MAC + error = mac_vnode_check_write(active_cred, fp->f_cred, vp); + if (error == 0) +#endif + error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); + fp->f_nextoff = uio->uio_offset; + VOP_UNLOCK(vp, 0); + if (vp->v_type != VCHR) + vn_finished_write(mp); + if (error == 0 && advice == POSIX_FADV_NOREUSE && + orig_offset != uio->uio_offset) + /* + * Use POSIX_FADV_DONTNEED to flush pages and buffers + * for the backing file after a POSIX_FADV_NOREUSE + * write(2). + */ + error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, + POSIX_FADV_DONTNEED); +unlock: + return (error); +} + +/* + * The vn_io_fault() is a wrapper around vn_read() and vn_write() to + * prevent the following deadlock: + * + * Assume that the thread A reads from the vnode vp1 into userspace + * buffer buf1 backed by the pages of vnode vp2. 
If a page in buf1 is + * currently not resident, then system ends up with the call chain + * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] -> + * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2) + * which establishes lock order vp1->vn_lock, then vp2->vn_lock. + * If, at the same time, thread B reads from vnode vp2 into buffer buf2 + * backed by the pages of vnode vp1, and some page in buf2 is not + * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock. + * + * To prevent the lock order reversal and deadlock, vn_io_fault() does + * not allow page faults to happen during VOP_READ() or VOP_WRITE(). + * Instead, it first tries to do the whole range i/o with pagefaults + * disabled. If all pages in the i/o buffer are resident and mapped, + * VOP will succeed (ignoring the genuine filesystem errors). + * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do + * i/o in chunks, with all pages in the chunk prefaulted and held + * using vm_fault_quick_hold_pages(). + * + * Filesystems using this deadlock avoidance scheme should use the + * array of the held pages from uio, saved in the curthread->td_ma, + * instead of doing uiomove(). A helper function + * vn_io_fault_uiomove() converts uiomove request into + * uiomove_fromphys() over td_ma array. + * + * Since vnode locks do not cover the whole i/o anymore, rangelocks + * make the current i/o request atomic with respect to other i/os and + * truncations. + */ + +/* + * Decode vn_io_fault_args and perform the corresponding i/o. + */ +static int +vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio, + struct thread *td) +{ + int error, save; + + error = 0; + save = vm_fault_disable_pagefaults(); + switch (args->kind) { + case VN_IO_FAULT_FOP: + error = (args->args.fop_args.doio)(args->args.fop_args.fp, + uio, args->cred, args->flags, td); + break; + case VN_IO_FAULT_VOP: + if (uio->uio_rw == UIO_READ) { + error = VOP_READ(args->args.vop_args.vp, uio, + args->flags, args->cred); + } else if (uio->uio_rw == UIO_WRITE) { + error = VOP_WRITE(args->args.vop_args.vp, uio, + args->flags, args->cred); + } + break; + default: + panic("vn_io_fault_doio: unknown kind of io %d %d", + args->kind, uio->uio_rw); + } + vm_fault_enable_pagefaults(save); + return (error); +} + +static int +vn_io_fault_touch(char *base, const struct uio *uio) +{ + int r; + + r = fubyte(base); + if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1)) + return (EFAULT); + return (0); +} + +static int +vn_io_fault_prefault_user(const struct uio *uio) +{ + char *base; + const struct iovec *iov; + size_t len; + ssize_t resid; + int error, i; + + KASSERT(uio->uio_segflg == UIO_USERSPACE, + ("vn_io_fault_prefault userspace")); + + error = i = 0; + iov = uio->uio_iov; + resid = uio->uio_resid; + base = iov->iov_base; + len = iov->iov_len; + while (resid > 0) { + error = vn_io_fault_touch(base, uio); + if (error != 0) + break; + if (len < PAGE_SIZE) { + if (len != 0) { + error = vn_io_fault_touch(base + len - 1, uio); + if (error != 0) + break; + resid -= len; + } + if (++i >= uio->uio_iovcnt) + break; + iov = uio->uio_iov + i; + base = iov->iov_base; + len = iov->iov_len; + } else { + len -= PAGE_SIZE; + base += PAGE_SIZE; + resid -= PAGE_SIZE; + } + } + return (error); +} + +/* + * Common code for vn_io_fault(), agnostic to the kind of i/o request. + * Uses vn_io_fault_doio() to make the call to an actual i/o function. 
+ * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request + * into args and call vn_io_fault1() to handle faults during the user + * mode buffer accesses. + */ +static int +vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, + struct thread *td) +{ + vm_page_t ma[io_hold_cnt + 2]; + struct uio *uio_clone, short_uio; + struct iovec short_iovec[1]; + vm_page_t *prev_td_ma; + vm_prot_t prot; + vm_offset_t addr, end; + size_t len, resid; + ssize_t adv; + int error, cnt, saveheld, prev_td_ma_cnt; + + if (vn_io_fault_prefault) { + error = vn_io_fault_prefault_user(uio); + if (error != 0) + return (error); /* Or ignore ? */ + } + + prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; + + /* + * The UFS follows IO_UNIT directive and replays back both + * uio_offset and uio_resid if an error is encountered during the + * operation. But, since the iovec may be already advanced, + * uio is still in an inconsistent state. + * + * Cache a copy of the original uio, which is advanced to the redo + * point using UIO_NOCOPY below. + */ + uio_clone = cloneuio(uio); + resid = uio->uio_resid; + + short_uio.uio_segflg = UIO_USERSPACE; + short_uio.uio_rw = uio->uio_rw; + short_uio.uio_td = uio->uio_td; + + error = vn_io_fault_doio(args, uio, td); + if (error != EFAULT) + goto out; + + atomic_add_long(&vn_io_faults_cnt, 1); + uio_clone->uio_segflg = UIO_NOCOPY; + uiomove(NULL, resid - uio->uio_resid, uio_clone); + uio_clone->uio_segflg = uio->uio_segflg; + + saveheld = curthread_pflags_set(TDP_UIOHELD); + prev_td_ma = td->td_ma; + prev_td_ma_cnt = td->td_ma_cnt; + + while (uio_clone->uio_resid != 0) { + len = uio_clone->uio_iov->iov_len; + if (len == 0) { + KASSERT(uio_clone->uio_iovcnt >= 1, + ("iovcnt underflow")); + uio_clone->uio_iov++; + uio_clone->uio_iovcnt--; + continue; + } + if (len > io_hold_cnt * PAGE_SIZE) + len = io_hold_cnt * PAGE_SIZE; + addr = (uintptr_t)uio_clone->uio_iov->iov_base; + end = round_page(addr + len); + if (end < addr) { + error = EFAULT; + break; + } + cnt = atop(end - trunc_page(addr)); + /* + * A perfectly misaligned address and length could cause + * both the start and the end of the chunk to use partial + * page. +2 accounts for such a situation. + */ + cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map, + addr, len, prot, ma, io_hold_cnt + 2); + if (cnt == -1) { + error = EFAULT; + break; + } + short_uio.uio_iov = &short_iovec[0]; + short_iovec[0].iov_base = (void *)addr; + short_uio.uio_iovcnt = 1; + short_uio.uio_resid = short_iovec[0].iov_len = len; + short_uio.uio_offset = uio_clone->uio_offset; + td->td_ma = ma; + td->td_ma_cnt = cnt; + + error = vn_io_fault_doio(args, &short_uio, td); + vm_page_unhold_pages(ma, cnt); + adv = len - short_uio.uio_resid; + + uio_clone->uio_iov->iov_base = + (char *)uio_clone->uio_iov->iov_base + adv; + uio_clone->uio_iov->iov_len -= adv; + uio_clone->uio_resid -= adv; + uio_clone->uio_offset += adv; + + uio->uio_resid -= adv; + uio->uio_offset += adv; + + if (error != 0 || adv == 0) + break; + } + td->td_ma = prev_td_ma; + td->td_ma_cnt = prev_td_ma_cnt; + curthread_pflags_restore(saveheld); +out: + free(uio_clone, M_IOV); + return (error); +} + +static int +vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + fo_rdwr_t *doio; + struct vnode *vp; + void *rl_cookie; + struct vn_io_fault_args args; + int error; + + doio = uio->uio_rw == UIO_READ ? 
vn_read : vn_write; + vp = fp->f_vnode; + foffset_lock_uio(fp, uio, flags); + if (do_vn_io_fault(vp, uio)) { + args.kind = VN_IO_FAULT_FOP; + args.args.fop_args.fp = fp; + args.args.fop_args.doio = doio; + args.cred = active_cred; + args.flags = flags | FOF_OFFSET; + if (uio->uio_rw == UIO_READ) { + rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, + uio->uio_offset + uio->uio_resid); + } else if ((fp->f_flag & O_APPEND) != 0 || + (flags & FOF_OFFSET) == 0) { + /* For appenders, punt and lock the whole range. */ + rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); + } else { + rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, + uio->uio_offset + uio->uio_resid); + } + error = vn_io_fault1(vp, uio, &args, td); + vn_rangelock_unlock(vp, rl_cookie); + } else { + error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td); + } + foffset_unlock_uio(fp, uio, flags); + return (error); +} + +/* + * Helper function to perform the requested uiomove operation using + * the held pages for io->uio_iov[0].iov_base buffer instead of + * copyin/copyout. Access to the pages with uiomove_fromphys() + * instead of iov_base prevents page faults that could occur due to + * pmap_collect() invalidating the mapping created by + * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or + * object cleanup revoking the write access from page mappings. + * + * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove() + * instead of plain uiomove(). + */ +int +vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) +{ + struct uio transp_uio; + struct iovec transp_iov[1]; + struct thread *td; + size_t adv; + int error, pgadv; + + td = curthread; + if ((td->td_pflags & TDP_UIOHELD) == 0 || + uio->uio_segflg != UIO_USERSPACE) + return (uiomove(data, xfersize, uio)); + + KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); + transp_iov[0].iov_base = data; + transp_uio.uio_iov = &transp_iov[0]; + transp_uio.uio_iovcnt = 1; + if (xfersize > uio->uio_resid) + xfersize = uio->uio_resid; + transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; + transp_uio.uio_offset = 0; + transp_uio.uio_segflg = UIO_SYSSPACE; + /* + * Since transp_iov points to data, and td_ma page array + * corresponds to original uio->uio_iov, we need to invert the + * direction of the i/o operation as passed to + * uiomove_fromphys(). + */ + switch (uio->uio_rw) { + case UIO_WRITE: + transp_uio.uio_rw = UIO_READ; + break; + case UIO_READ: + transp_uio.uio_rw = UIO_WRITE; + break; + } + transp_uio.uio_td = uio->uio_td; + error = uiomove_fromphys(td->td_ma, + ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, + xfersize, &transp_uio); + adv = xfersize - transp_uio.uio_resid; + pgadv = + (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - + (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); + td->td_ma += pgadv; + KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, + pgadv)); + td->td_ma_cnt -= pgadv; + uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; + uio->uio_iov->iov_len -= adv; + uio->uio_resid -= adv; + uio->uio_offset += adv; + return (error); +} + +int +vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, + struct uio *uio) +{ + struct thread *td; + vm_offset_t iov_base; + int cnt, pgadv; + + td = curthread; + if ((td->td_pflags & TDP_UIOHELD) == 0 || + uio->uio_segflg != UIO_USERSPACE) + return (uiomove_fromphys(ma, offset, xfersize, uio)); + + KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); + cnt = xfersize > uio->uio_resid ? 
uio->uio_resid : xfersize; + iov_base = (vm_offset_t)uio->uio_iov->iov_base; + switch (uio->uio_rw) { + case UIO_WRITE: + pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, + offset, cnt); + break; + case UIO_READ: + pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, + cnt); + break; + } + pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); + td->td_ma += pgadv; + KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, + pgadv)); + td->td_ma_cnt -= pgadv; + uio->uio_iov->iov_base = (char *)(iov_base + cnt); + uio->uio_iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + return (0); +} + + +/* + * File table truncate routine. + */ +static int +vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) +{ + struct vattr vattr; + struct mount *mp; + struct vnode *vp; + void *rl_cookie; + int error; + + vp = fp->f_vnode; + + /* + * Lock the whole range for truncation. Otherwise split i/o + * might happen partly before and partly after the truncation. + */ + rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) + goto out1; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + if (vp->v_type == VDIR) { + error = EISDIR; + goto out; + } +#ifdef MAC + error = mac_vnode_check_write(active_cred, fp->f_cred, vp); + if (error) + goto out; +#endif + error = VOP_ADD_WRITECOUNT(vp, 1); + if (error == 0) { + VATTR_NULL(&vattr); + vattr.va_size = length; + if ((fp->f_flag & O_FSYNC) != 0) + vattr.va_vaflags |= VA_SYNC; + error = VOP_SETATTR(vp, &vattr, fp->f_cred); + VOP_ADD_WRITECOUNT_CHECKED(vp, -1); + } +out: + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); +out1: + vn_rangelock_unlock(vp, rl_cookie); + return (error); +} + +/* + * File table vnode stat routine. + */ +static int +vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred, + struct thread *td) +{ + struct vnode *vp = fp->f_vnode; + int error; + + vn_lock(vp, LK_SHARED | LK_RETRY); + error = vn_stat(vp, sb, active_cred, fp->f_cred, td); + VOP_UNLOCK(vp, 0); + + return (error); +} + +/* + * Stat a vnode; implementation for the stat syscall + */ +int +vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, + struct ucred *file_cred, struct thread *td) +{ + struct vattr vattr; + struct vattr *vap; + int error; + u_short mode; + + AUDIT_ARG_VNODE1(vp); +#ifdef MAC + error = mac_vnode_check_stat(active_cred, file_cred, vp); + if (error) + return (error); +#endif + + vap = &vattr; + + /* + * Initialize defaults for new and unusual fields, so that file + * systems which don't support these fields don't need to know + * about them. 
+ */ + vap->va_birthtime.tv_sec = -1; + vap->va_birthtime.tv_nsec = 0; + vap->va_fsid = VNOVAL; + vap->va_rdev = NODEV; + + error = VOP_GETATTR(vp, vap, active_cred); + if (error) + return (error); + + /* + * Zero the spare stat fields + */ + bzero(sb, sizeof *sb); + + /* + * Copy from vattr table + */ + if (vap->va_fsid != VNOVAL) + sb->st_dev = vap->va_fsid; + else + sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; + sb->st_ino = vap->va_fileid; + mode = vap->va_mode; + switch (vap->va_type) { + case VREG: + mode |= S_IFREG; + break; + case VDIR: + mode |= S_IFDIR; + break; + case VBLK: + mode |= S_IFBLK; + break; + case VCHR: + mode |= S_IFCHR; + break; + case VLNK: + mode |= S_IFLNK; + break; + case VSOCK: + mode |= S_IFSOCK; + break; + case VFIFO: + mode |= S_IFIFO; + break; + default: + return (EBADF); + } + sb->st_mode = mode; + sb->st_nlink = vap->va_nlink; + sb->st_uid = vap->va_uid; + sb->st_gid = vap->va_gid; + sb->st_rdev = vap->va_rdev; + if (vap->va_size > OFF_MAX) + return (EOVERFLOW); + sb->st_size = vap->va_size; + sb->st_atim = vap->va_atime; + sb->st_mtim = vap->va_mtime; + sb->st_ctim = vap->va_ctime; + sb->st_birthtim = vap->va_birthtime; + + /* + * According to www.opengroup.org, the meaning of st_blksize is + * "a filesystem-specific preferred I/O block size for this + * object. In some filesystem types, this may vary from file + * to file" + * Use miminum/default of PAGE_SIZE (e.g. for VCHR). + */ + + sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize); + + sb->st_flags = vap->va_flags; + if (priv_check(td, PRIV_VFS_GENERATION)) + sb->st_gen = 0; + else + sb->st_gen = vap->va_gen; + + sb->st_blocks = vap->va_bytes / S_BLKSIZE; + return (0); +} + +/* + * File table vnode ioctl routine. + */ +static int +vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, + struct thread *td) +{ + struct vattr vattr; + struct vnode *vp; + struct fiobmap2_arg *bmarg; + int error; + + vp = fp->f_vnode; + switch (vp->v_type) { + case VDIR: + case VREG: + switch (com) { + case FIONREAD: + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &vattr, active_cred); + VOP_UNLOCK(vp, 0); + if (error == 0) + *(int *)data = vattr.va_size - fp->f_offset; + return (error); + case FIOBMAP2: + bmarg = (struct fiobmap2_arg *)data; + vn_lock(vp, LK_SHARED | LK_RETRY); +#ifdef MAC + error = mac_vnode_check_read(active_cred, fp->f_cred, + vp); + if (error == 0) +#endif + error = VOP_BMAP(vp, bmarg->bn, NULL, + &bmarg->bn, &bmarg->runp, &bmarg->runb); + VOP_UNLOCK(vp, 0); + return (error); + case FIONBIO: + case FIOASYNC: + return (0); + default: + return (VOP_IOCTL(vp, com, data, fp->f_flag, + active_cred, td)); + } + break; + case VCHR: + return (VOP_IOCTL(vp, com, data, fp->f_flag, + active_cred, td)); + default: + return (ENOTTY); + } +} + +/* + * File table vnode poll routine. + */ +static int +vn_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + struct vnode *vp; + int error; + + vp = fp->f_vnode; +#ifdef MAC + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); + VOP_UNLOCK(vp, 0); + if (!error) +#endif + + error = VOP_POLL(vp, events, fp->f_cred, td); + return (error); +} + +/* + * Acquire the requested lock and then check for validity. LK_RETRY + * permits vn_lock to return doomed vnodes. 
+ */ +int +_vn_lock(struct vnode *vp, int flags, char *file, int line) +{ + int error; + + VNASSERT((flags & LK_TYPE_MASK) != 0, vp, + ("vn_lock: no locktype")); + VNASSERT(vp->v_holdcnt != 0, vp, ("vn_lock: zero hold count")); +retry: + error = VOP_LOCK1(vp, flags, file, line); + flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */ + KASSERT((flags & LK_RETRY) == 0 || error == 0, + ("vn_lock: error %d incompatible with flags %#x", error, flags)); + + if ((flags & LK_RETRY) == 0) { + if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) { + VOP_UNLOCK(vp, 0); + error = ENOENT; + } + } else if (error != 0) + goto retry; + return (error); +} + +/* + * File table vnode close routine. + */ +static int +vn_closefile(struct file *fp, struct thread *td) +{ + struct vnode *vp; + struct flock lf; + int error; + bool ref; + + vp = fp->f_vnode; + fp->f_ops = &badfileops; + ref= (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE; + + error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref); + + if (__predict_false(ref)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); + vrele(vp); + } + return (error); +} + +static bool +vn_suspendable(struct mount *mp) +{ + + return (mp->mnt_op->vfs_susp_clean != NULL); +} + +/* + * Preparing to start a filesystem write operation. If the operation is + * permitted, then we bump the count of operations in progress and + * proceed. If a suspend request is in progress, we wait until the + * suspension is over, and then proceed. + */ +static int +vn_start_write_locked(struct mount *mp, int flags) +{ + int error, mflags; + + mtx_assert(MNT_MTX(mp), MA_OWNED); + error = 0; + + /* + * Check on status of suspension. + */ + if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || + mp->mnt_susp_owner != curthread) { + mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? + (flags & PCATCH) : 0) | (PUSER - 1); + while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { + if (flags & V_NOWAIT) { + error = EWOULDBLOCK; + goto unlock; + } + error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, + "suspfs", 0); + if (error) + goto unlock; + } + } + if (flags & V_XSLEEP) + goto unlock; + mp->mnt_writeopcount++; +unlock: + if (error != 0 || (flags & V_XSLEEP) != 0) + MNT_REL(mp); + MNT_IUNLOCK(mp); + return (error); +} + +int +vn_start_write(struct vnode *vp, struct mount **mpp, int flags) +{ + struct mount *mp; + int error; + + KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), + ("V_MNTREF requires mp")); + + error = 0; + /* + * If a vnode is provided, get and return the mount point that + * to which it will write. + */ + if (vp != NULL) { + if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { + *mpp = NULL; + if (error != EOPNOTSUPP) + return (error); + return (0); + } + } + if ((mp = *mpp) == NULL) + return (0); + + if (!vn_suspendable(mp)) { + if (vp != NULL || (flags & V_MNTREF) != 0) + vfs_rel(mp); + return (0); + } + + /* + * VOP_GETWRITEMOUNT() returns with the mp refcount held through + * a vfs_ref(). + * As long as a vnode is not provided we need to acquire a + * refcount for the provided mountpoint too, in order to + * emulate a vfs_ref(). + */ + MNT_ILOCK(mp); + if (vp == NULL && (flags & V_MNTREF) == 0) + MNT_REF(mp); + + return (vn_start_write_locked(mp, flags)); +} + +/* + * Secondary suspension. Used by operations such as vop_inactive + * routines that are needed by the higher level functions. 
These + * are allowed to proceed until all the higher level functions have + * completed (indicated by mnt_writeopcount dropping to zero). At that + * time, these operations are halted until the suspension is over. + */ +int +vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags) +{ + struct mount *mp; + int error; + + KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), + ("V_MNTREF requires mp")); + + retry: + if (vp != NULL) { + if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { + *mpp = NULL; + if (error != EOPNOTSUPP) + return (error); + return (0); + } + } + /* + * If we are not suspended or have not yet reached suspended + * mode, then let the operation proceed. + */ + if ((mp = *mpp) == NULL) + return (0); + + if (!vn_suspendable(mp)) { + if (vp != NULL || (flags & V_MNTREF) != 0) + vfs_rel(mp); + return (0); + } + + /* + * VOP_GETWRITEMOUNT() returns with the mp refcount held through + * a vfs_ref(). + * As long as a vnode is not provided we need to acquire a + * refcount for the provided mountpoint too, in order to + * emulate a vfs_ref(). + */ + MNT_ILOCK(mp); + if (vp == NULL && (flags & V_MNTREF) == 0) + MNT_REF(mp); + if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { + mp->mnt_secondary_writes++; + mp->mnt_secondary_accwrites++; + MNT_IUNLOCK(mp); + return (0); + } + if (flags & V_NOWAIT) { + MNT_REL(mp); + MNT_IUNLOCK(mp); + return (EWOULDBLOCK); + } + /* + * Wait for the suspension to finish. + */ + error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP | + ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0), + "suspfs", 0); + vfs_rel(mp); + if (error == 0) + goto retry; + return (error); +} + +/* + * Filesystem write operation has completed. If we are suspending and this + * operation is the last one, notify the suspender that the suspension is + * now in effect. + */ +void +vn_finished_write(struct mount *mp) +{ + if (mp == NULL || !vn_suspendable(mp)) + return; + MNT_ILOCK(mp); + MNT_REL(mp); + mp->mnt_writeopcount--; + if (mp->mnt_writeopcount < 0) + panic("vn_finished_write: neg cnt"); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && + mp->mnt_writeopcount <= 0) + wakeup(&mp->mnt_writeopcount); + MNT_IUNLOCK(mp); +} + + +/* + * Filesystem secondary write operation has completed. If we are + * suspending and this operation is the last one, notify the suspender + * that the suspension is now in effect. + */ +void +vn_finished_secondary_write(struct mount *mp) +{ + if (mp == NULL || !vn_suspendable(mp)) + return; + MNT_ILOCK(mp); + MNT_REL(mp); + mp->mnt_secondary_writes--; + if (mp->mnt_secondary_writes < 0) + panic("vn_finished_secondary_write: neg cnt"); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && + mp->mnt_secondary_writes <= 0) + wakeup(&mp->mnt_secondary_writes); + MNT_IUNLOCK(mp); +} + + + +/* + * Request a filesystem to suspend write operations. + */ +int +vfs_write_suspend(struct mount *mp, int flags) +{ + int error; + + MPASS(vn_suspendable(mp)); + + MNT_ILOCK(mp); + if (mp->mnt_susp_owner == curthread) { + MNT_IUNLOCK(mp); + return (EALREADY); + } + while (mp->mnt_kern_flag & MNTK_SUSPEND) + msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); + + /* + * Unmount holds a write reference on the mount point. If we + * own busy reference and drain for writers, we deadlock with + * the reference draining in the unmount path. Callers of + * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if + * vfs_busy() reference is owned and caller is not in the + * unmount context. 
+ */ + if ((flags & VS_SKIP_UNMOUNT) != 0 && + (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { + MNT_IUNLOCK(mp); + return (EBUSY); + } + + mp->mnt_kern_flag |= MNTK_SUSPEND; + mp->mnt_susp_owner = curthread; + if (mp->mnt_writeopcount > 0) + (void) msleep(&mp->mnt_writeopcount, + MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); + else + MNT_IUNLOCK(mp); + if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) + vfs_write_resume(mp, 0); + return (error); +} + +/* + * Request a filesystem to resume write operations. + */ +void +vfs_write_resume(struct mount *mp, int flags) +{ + + MPASS(vn_suspendable(mp)); + + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { + KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner")); + mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | + MNTK_SUSPENDED); + mp->mnt_susp_owner = NULL; + wakeup(&mp->mnt_writeopcount); + wakeup(&mp->mnt_flag); + curthread->td_pflags &= ~TDP_IGNSUSP; + if ((flags & VR_START_WRITE) != 0) { + MNT_REF(mp); + mp->mnt_writeopcount++; + } + MNT_IUNLOCK(mp); + if ((flags & VR_NO_SUSPCLR) == 0) + VFS_SUSP_CLEAN(mp); + } else if ((flags & VR_START_WRITE) != 0) { + MNT_REF(mp); + vn_start_write_locked(mp, 0); + } else { + MNT_IUNLOCK(mp); + } +} + +/* + * Helper loop around vfs_write_suspend() for filesystem unmount VFS + * methods. + */ +int +vfs_write_suspend_umnt(struct mount *mp) +{ + int error; + + MPASS(vn_suspendable(mp)); + KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0, + ("vfs_write_suspend_umnt: recursed")); + + /* dounmount() already called vn_start_write(). */ + for (;;) { + vn_finished_write(mp); + error = vfs_write_suspend(mp, 0); + if (error != 0) { + vn_start_write(NULL, &mp, V_WAIT); + return (error); + } + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0) + break; + MNT_IUNLOCK(mp); + vn_start_write(NULL, &mp, V_WAIT); + } + mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); + wakeup(&mp->mnt_flag); + MNT_IUNLOCK(mp); + curthread->td_pflags |= TDP_IGNSUSP; + return (0); +} + +/* + * Implement kqueues for files by translating it to vnode operation. + */ +static int +vn_kqfilter(struct file *fp, struct knote *kn) +{ + + return (VOP_KQFILTER(fp->f_vnode, kn)); +} + +/* + * Simplified in-kernel wrapper calls for extended attribute access. + * Both calls pass in a NULL credential, authorizing as "kernel" access. + * Set IO_NODELOCKED in ioflg if the vnode is already locked. + */ +int +vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, + const char *attrname, int *buflen, char *buf, struct thread *td) +{ + struct uio auio; + struct iovec iov; + int error; + + iov.iov_len = *buflen; + iov.iov_base = buf; + + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_offset = 0; + auio.uio_resid = *buflen; + + if ((ioflg & IO_NODELOCKED) == 0) + vn_lock(vp, LK_SHARED | LK_RETRY); + + ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); + + /* authorize attribute retrieval as kernel */ + error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, + td); + + if ((ioflg & IO_NODELOCKED) == 0) + VOP_UNLOCK(vp, 0); + + if (error == 0) { + *buflen = *buflen - auio.uio_resid; + } + + return (error); +} + +/* + * XXX failure mode if partially written? 
+ */ +int +vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, + const char *attrname, int buflen, char *buf, struct thread *td) +{ + struct uio auio; + struct iovec iov; + struct mount *mp; + int error; + + iov.iov_len = buflen; + iov.iov_base = buf; + + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_offset = 0; + auio.uio_resid = buflen; + + if ((ioflg & IO_NODELOCKED) == 0) { + if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + } + + ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); + + /* authorize attribute setting as kernel */ + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); + + if ((ioflg & IO_NODELOCKED) == 0) { + vn_finished_write(mp); + VOP_UNLOCK(vp, 0); + } + + return (error); +} + +int +vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, + const char *attrname, struct thread *td) +{ + struct mount *mp; + int error; + + if ((ioflg & IO_NODELOCKED) == 0) { + if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + } + + ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); + + /* authorize attribute removal as kernel */ + error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); + if (error == EOPNOTSUPP) + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, + NULL, td); + + if ((ioflg & IO_NODELOCKED) == 0) { + vn_finished_write(mp); + VOP_UNLOCK(vp, 0); + } + + return (error); +} + +static int +vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags, + struct vnode **rvp) +{ + + return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp)); +} + +int +vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp) +{ + + return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, + lkflags, rvp)); +} + +int +vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, + int lkflags, struct vnode **rvp) +{ + struct mount *mp; + int ltype, error; + + ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get"); + mp = vp->v_mount; + ltype = VOP_ISLOCKED(vp); + KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED, + ("vn_vget_ino: vp not locked")); + error = vfs_busy(mp, MBF_NOWAIT); + if (error != 0) { + vfs_ref(mp); + VOP_UNLOCK(vp, 0); + error = vfs_busy(mp, 0); + vn_lock(vp, ltype | LK_RETRY); + vfs_rel(mp); + if (error != 0) + return (ENOENT); + if (vp->v_iflag & VI_DOOMED) { + vfs_unbusy(mp); + return (ENOENT); + } + } + VOP_UNLOCK(vp, 0); + error = alloc(mp, alloc_arg, lkflags, rvp); + vfs_unbusy(mp); + if (error != 0 || *rvp != vp) + vn_lock(vp, ltype | LK_RETRY); + if (vp->v_iflag & VI_DOOMED) { + if (error == 0) { + if (*rvp == vp) + vunref(vp); + else + vput(*rvp); + } + error = ENOENT; + } + return (error); +} + +int +vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, + struct thread *td) +{ + + if (vp->v_type != VREG || td == NULL) + return (0); + if ((uoff_t)uio->uio_offset + uio->uio_resid > + lim_cur(td, RLIMIT_FSIZE)) { + PROC_LOCK(td->td_proc); + kern_psignal(td->td_proc, SIGXFSZ); + PROC_UNLOCK(td->td_proc); + return (EFBIG); + } + return (0); +} + +int +vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, + struct thread *td) +{ + struct vnode *vp; + + vp = fp->f_vnode; +#ifdef AUDIT + vn_lock(vp, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + VOP_UNLOCK(vp, 0); +#endif + return (setfmode(td, active_cred, vp, mode)); +} + +int +vn_chown(struct file *fp, uid_t uid, 
gid_t gid, struct ucred *active_cred, + struct thread *td) +{ + struct vnode *vp; + + vp = fp->f_vnode; +#ifdef AUDIT + vn_lock(vp, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + VOP_UNLOCK(vp, 0); +#endif + return (setfown(td, active_cred, vp, uid, gid)); +} + +void +vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) +{ + vm_object_t object; + + if ((object = vp->v_object) == NULL) + return; + VM_OBJECT_WLOCK(object); + vm_object_page_remove(object, start, end, 0); + VM_OBJECT_WUNLOCK(object); +} + +int +vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) +{ + struct vattr va; + daddr_t bn, bnp; + uint64_t bsize; + off_t noff; + int error; + + KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, + ("Wrong command %lu", cmd)); + + if (vn_lock(vp, LK_SHARED) != 0) + return (EBADF); + if (vp->v_type != VREG) { + error = ENOTTY; + goto unlock; + } + error = VOP_GETATTR(vp, &va, cred); + if (error != 0) + goto unlock; + noff = *off; + if (noff >= va.va_size) { + error = ENXIO; + goto unlock; + } + bsize = vp->v_mount->mnt_stat.f_iosize; + for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize - + noff % bsize) { + error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); + if (error == EOPNOTSUPP) { + error = ENOTTY; + goto unlock; + } + if ((bnp == -1 && cmd == FIOSEEKHOLE) || + (bnp != -1 && cmd == FIOSEEKDATA)) { + noff = bn * bsize; + if (noff < *off) + noff = *off; + goto unlock; + } + } + if (noff > va.va_size) + noff = va.va_size; + /* noff == va.va_size. There is an implicit hole at the end of file. */ + if (cmd == FIOSEEKDATA) + error = ENXIO; +unlock: + VOP_UNLOCK(vp, 0); + if (error == 0) + *off = noff; + return (error); +} + +int +vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) +{ + struct ucred *cred; + struct vnode *vp; + struct vattr vattr; + off_t foffset, size; + int error, noneg; + + cred = td->td_ucred; + vp = fp->f_vnode; + foffset = foffset_lock(fp, 0); + noneg = (vp->v_type != VCHR); + error = 0; + switch (whence) { + case L_INCR: + if (noneg && + (foffset < 0 || + (offset > 0 && foffset > OFF_MAX - offset))) { + error = EOVERFLOW; + break; + } + offset += foffset; + break; + case L_XTND: + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &vattr, cred); + VOP_UNLOCK(vp, 0); + if (error) + break; + + /* + * If the file references a disk device, then fetch + * the media size and use that to determine the ending + * offset. + */ + if (vattr.va_size == 0 && vp->v_type == VCHR && + fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) + vattr.va_size = size; + if (noneg && + (vattr.va_size > OFF_MAX || + (offset > 0 && vattr.va_size > OFF_MAX - offset))) { + error = EOVERFLOW; + break; + } + offset += vattr.va_size; + break; + case L_SET: + break; + case SEEK_DATA: + error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); + break; + case SEEK_HOLE: + error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); + break; + default: + error = EINVAL; + } + if (error == 0 && noneg && offset < 0) + error = EINVAL; + if (error != 0) + goto drop; + VFS_KNOTE_UNLOCKED(vp, 0); + td->td_uretoff.tdu_off = offset; +drop: + foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); + return (error); +} + +int +vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, + struct thread *td) +{ + int error; + + /* + * Grant permission if the caller is the owner of the file, or + * the super-user, or has ACL_WRITE_ATTRIBUTES permission on + * on the file. 
If the time pointer is null, then write + * permission on the file is also sufficient. + * + * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: + * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES + * will be allowed to set the times [..] to the current + * server time. + */ + error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); + if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) + error = VOP_ACCESS(vp, VWRITE, cred, td); + return (error); +} + +int +vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) +{ + struct vnode *vp; + int error; + + if (fp->f_type == DTYPE_FIFO) + kif->kf_type = KF_TYPE_FIFO; + else + kif->kf_type = KF_TYPE_VNODE; + vp = fp->f_vnode; + vref(vp); + FILEDESC_SUNLOCK(fdp); + error = vn_fill_kinfo_vnode(vp, kif); + vrele(vp); + FILEDESC_SLOCK(fdp); + return (error); +} + +static inline void +vn_fill_junk(struct kinfo_file *kif) +{ + size_t len, olen; + + /* + * Simulate vn_fullpath returning changing values for a given + * vp during e.g. coredump. + */ + len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1; + olen = strlen(kif->kf_path); + if (len < olen) + strcpy(&kif->kf_path[len - 1], "$"); + else + for (; olen < len; olen++) + strcpy(&kif->kf_path[olen], "A"); +} + +int +vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif) +{ + struct vattr va; + char *fullpath, *freepath; + int error; + + kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type); + freepath = NULL; + fullpath = "-"; + error = vn_fullpath(curthread, vp, &fullpath, &freepath); + if (error == 0) { + strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); + } + if (freepath != NULL) + free(freepath, M_TEMP); + + KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path, + vn_fill_junk(kif); + ); + + /* + * Retrieve vnode attributes. + */ + va.va_fsid = VNOVAL; + va.va_rdev = NODEV; + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &va, curthread->td_ucred); + VOP_UNLOCK(vp, 0); + if (error != 0) + return (error); + if (va.va_fsid != VNOVAL) + kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; + else + kif->kf_un.kf_file.kf_file_fsid = + vp->v_mount->mnt_stat.f_fsid.val[0]; + kif->kf_un.kf_file.kf_file_fsid_freebsd11 = + kif->kf_un.kf_file.kf_file_fsid; /* truncate */ + kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; + kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); + kif->kf_un.kf_file.kf_file_size = va.va_size; + kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; + kif->kf_un.kf_file.kf_file_rdev_freebsd11 = + kif->kf_un.kf_file.kf_file_rdev; /* truncate */ + return (0); +} + +int +vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, + vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, + struct thread *td) +{ +#ifdef HWPMC_HOOKS + struct pmckern_map_in pkm; +#endif + struct mount *mp; + struct vnode *vp; + vm_object_t object; + vm_prot_t maxprot; + boolean_t writecounted; + int error; + +#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ + defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) + /* + * POSIX shared-memory objects are defined to have + * kernel persistence, and are not defined to support + * read(2)/write(2) -- or even open(2). Thus, we can + * use MAP_ASYNC to trade on-disk coherence for speed. + * The shm_open(3) library routine turns on the FPOSIXSHM + * flag to request this behavior. 
+ */ + if ((fp->f_flag & FPOSIXSHM) != 0) + flags |= MAP_NOSYNC; +#endif + vp = fp->f_vnode; + + /* + * Ensure that file and memory protections are + * compatible. Note that we only worry about + * writability if mapping is shared; in this case, + * current and max prot are dictated by the open file. + * XXX use the vnode instead? Problem is: what + * credentials do we use for determination? What if + * proc does a setuid? + */ + mp = vp->v_mount; + if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { + maxprot = VM_PROT_NONE; + if ((prot & VM_PROT_EXECUTE) != 0) + return (EACCES); + } else + maxprot = VM_PROT_EXECUTE; + if ((fp->f_flag & FREAD) != 0) + maxprot |= VM_PROT_READ; + else if ((prot & VM_PROT_READ) != 0) + return (EACCES); + + /* + * If we are sharing potential changes via MAP_SHARED and we + * are trying to get write permission although we opened it + * without asking for it, bail out. + */ + if ((flags & MAP_SHARED) != 0) { + if ((fp->f_flag & FWRITE) != 0) + maxprot |= VM_PROT_WRITE; + else if ((prot & VM_PROT_WRITE) != 0) + return (EACCES); + } else { + maxprot |= VM_PROT_WRITE; + cap_maxprot |= VM_PROT_WRITE; + } + maxprot &= cap_maxprot; + + /* + * For regular files and shared memory, POSIX requires that + * the value of foff be a legitimate offset within the data + * object. In particular, negative offsets are invalid. + * Blocking negative offsets and overflows here avoids + * possible wraparound or user-level access into reserved + * ranges of the data object later. In contrast, POSIX does + * not dictate how offsets are used by device drivers, so in + * the case of a device mapping a negative offset is passed + * on. + */ + if ( +#ifdef _LP64 + size > OFF_MAX || +#endif + foff < 0 || foff > OFF_MAX - size) + return (EINVAL); + + writecounted = FALSE; + error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, + &foff, &object, &writecounted); + if (error != 0) + return (error); + error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, + foff, writecounted, td); + if (error != 0) { + /* + * If this mapping was accounted for in the vnode's + * writecount, then undo that now. + */ + if (writecounted) + vm_pager_release_writecount(object, 0, size); + vm_object_deallocate(object); + } +#ifdef HWPMC_HOOKS + /* Inform hwpmc(4) if an executable is being mapped. */ + if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) { + if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) { + pkm.pm_file = vp; + pkm.pm_address = (uintptr_t) *addr; + PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm); + } + } +#endif + return (error); +} + +void +vn_fsid(struct vnode *vp, struct vattr *va) +{ + fsid_t *f; + + f = &vp->v_mount->mnt_stat.f_fsid; + va->va_fsid = (uint32_t)f->val[1]; + va->va_fsid <<= sizeof(f->val[1]) * NBBY; + va->va_fsid += (uint32_t)f->val[0]; +} + +int +vn_fsync_buf(struct vnode *vp, int waitfor) +{ + struct buf *bp, *nbp; + struct bufobj *bo; + struct mount *mp; + int error, maxretry; + + error = 0; + maxretry = 10000; /* large, arbitrarily chosen */ + mp = NULL; + if (vp->v_type == VCHR) { + VI_LOCK(vp); + mp = vp->v_rdev->si_mountpt; + VI_UNLOCK(vp); + } + bo = &vp->v_bufobj; + BO_LOCK(bo); +loop1: + /* + * MARK/SCAN initialization to avoid infinite loops. + */ + TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { + bp->b_vflags &= ~BV_SCANNED; + bp->b_error = 0; + } + + /* + * Flush all dirty buffers associated with a vnode. 
+ */ +loop2: + TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { + if ((bp->b_vflags & BV_SCANNED) != 0) + continue; + bp->b_vflags |= BV_SCANNED; + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { + if (waitfor != MNT_WAIT) + continue; + if (BUF_LOCK(bp, + LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL, + BO_LOCKPTR(bo)) != 0) { + BO_LOCK(bo); + goto loop1; + } + BO_LOCK(bo); + } + BO_UNLOCK(bo); + KASSERT(bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", + bp, bp->b_bufobj, bo)); + if ((bp->b_flags & B_DELWRI) == 0) + panic("fsync: not dirty"); + if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { + vfs_bio_awrite(bp); + } else { + bremfree(bp); + bawrite(bp); + } + if (maxretry < 1000) + pause("dirty", hz < 1000 ? 1 : hz / 1000); + BO_LOCK(bo); + goto loop2; + } + + /* + * If synchronous the caller expects us to completely resolve all + * dirty buffers in the system. Wait for in-progress I/O to + * complete (which could include background bitmap writes), then + * retry if dirty blocks still exist. + */ + if (waitfor == MNT_WAIT) { + bufobj_wwait(bo, 0, 0); + if (bo->bo_dirty.bv_cnt > 0) { + /* + * If we are unable to write any of these buffers + * then we fail now rather than trying endlessly + * to write them out. + */ + TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) + if ((error = bp->b_error) != 0) + break; + if ((mp != NULL && mp->mnt_secondary_writes > 0) || + (error == 0 && --maxretry >= 0)) + goto loop1; + if (error == 0) + error = EAGAIN; + } + } + BO_UNLOCK(bo); + if (error != 0) + vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error); + + return (error); +} diff --git a/freebsd/sys/sys/bio.h b/freebsd/sys/sys/bio.h new file mode 100644 index 00000000..1dab6155 --- /dev/null +++ b/freebsd/sys/sys/bio.h @@ -0,0 +1,184 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)buf.h 8.9 (Berkeley) 3/30/95 + * $FreeBSD$ + */ + +#ifndef _SYS_BIO_H_ +#define _SYS_BIO_H_ + +#include +#include + +/* bio_cmd */ +#define BIO_READ 0x01 /* Read I/O data */ +#define BIO_WRITE 0x02 /* Write I/O data */ +#define BIO_DELETE 0x03 /* TRIM or free blocks, i.e. mark as unused */ +#define BIO_GETATTR 0x04 /* Get GEOM attributes of object */ +#define BIO_FLUSH 0x05 /* Commit outstanding I/O now */ +#define BIO_CMD0 0x06 /* Available for local hacks */ +#define BIO_CMD1 0x07 /* Available for local hacks */ +#define BIO_CMD2 0x08 /* Available for local hacks */ +#define BIO_ZONE 0x09 /* Zone command */ + +/* bio_flags */ +#define BIO_ERROR 0x01 /* An error occurred processing this bio. */ +#define BIO_DONE 0x02 /* This bio is finished. */ +#define BIO_ONQUEUE 0x04 /* This bio is in a queue & not yet taken. */ +/* + * This bio must be executed after all previous bios in the queue have been + * executed, and before any successive bios can be executed. + */ +#define BIO_ORDERED 0x08 +#define BIO_UNMAPPED 0x10 +#define BIO_TRANSIENT_MAPPING 0x20 +#define BIO_VLIST 0x40 + +#ifdef _KERNEL +struct disk; +struct bio; +struct vm_map; + +/* Empty classifier tag, to prevent further classification. */ +#define BIO_NOTCLASSIFIED (void *)(~0UL) + +typedef void bio_task_t(void *); + +/* + * The bio structure describes an I/O operation in the kernel. + */ +struct bio { + uint16_t bio_cmd; /* I/O operation. */ + uint16_t bio_flags; /* General flags. */ + uint16_t bio_cflags; /* Private use by the consumer. */ + uint16_t bio_pflags; /* Private use by the provider. */ + struct cdev *bio_dev; /* Device to do I/O on. */ + struct disk *bio_disk; /* Valid below geom_disk.c only */ + off_t bio_offset; /* Offset into file. */ + long bio_bcount; /* Valid bytes in buffer. */ + caddr_t bio_data; /* Memory, superblocks, indirect etc. */ + struct vm_page **bio_ma; /* Or unmapped. */ + int bio_ma_offset; /* Offset in the first page of bio_ma. */ + int bio_ma_n; /* Number of pages in bio_ma. */ + int bio_error; /* Errno for BIO_ERROR. */ + long bio_resid; /* Remaining I/O in bytes. */ + void (*bio_done)(struct bio *); + void *bio_driver1; /* Private use by the provider. */ + void *bio_driver2; /* Private use by the provider. */ + void *bio_caller1; /* Private use by the consumer. */ + void *bio_caller2; /* Private use by the consumer. */ + TAILQ_ENTRY(bio) bio_queue; /* Disksort queue. 
*/ + const char *bio_attribute; /* Attribute for BIO_[GS]ETATTR */ + struct disk_zone_args bio_zone;/* Used for BIO_ZONE */ + struct g_consumer *bio_from; /* GEOM linkage */ + struct g_provider *bio_to; /* GEOM linkage */ + off_t bio_length; /* Like bio_bcount */ + off_t bio_completed; /* Inverse of bio_resid */ + u_int bio_children; /* Number of spawned bios */ + u_int bio_inbed; /* Children safely home by now */ + struct bio *bio_parent; /* Pointer to parent */ + struct bintime bio_t0; /* Time request started */ + + bio_task_t *bio_task; /* Task_queue handler */ + void *bio_task_arg; /* Argument to above */ + + void *bio_classifier1; /* Classifier tag. */ + void *bio_classifier2; /* Classifier tag. */ + +#ifdef DIAGNOSTIC + void *_bio_caller1; + void *_bio_caller2; + uint8_t _bio_cflags; +#endif +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + struct buf *bio_track_bp; /* Parent buf for tracking */ +#endif + + /* XXX: these go away when bio chaining is introduced */ + daddr_t bio_pblkno; /* physical block number */ +}; + +struct uio; +struct devstat; + +struct bio_queue_head { + TAILQ_HEAD(bio_queue, bio) queue; + off_t last_offset; + struct bio *insert_point; + int total; + int batched; +}; + +extern struct vm_map *bio_transient_map; +extern int bio_transient_maxcnt; + +void biodone(struct bio *bp); +void biofinish(struct bio *bp, struct devstat *stat, int error); +int biowait(struct bio *bp, const char *wchan); + +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) +void biotrack_buf(struct bio *bp, const char *location); + +static __inline void +biotrack(struct bio *bp, const char *location) +{ + + if (bp->bio_track_bp != NULL) + biotrack_buf(bp, location); +} +#else +static __inline void +biotrack(struct bio *bp __unused, const char *location __unused) +{ +} +#endif + +void bioq_disksort(struct bio_queue_head *ap, struct bio *bp); +struct bio *bioq_first(struct bio_queue_head *head); +struct bio *bioq_takefirst(struct bio_queue_head *head); +void bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error); +void bioq_init(struct bio_queue_head *head); +void bioq_insert_head(struct bio_queue_head *head, struct bio *bp); +void bioq_insert_tail(struct bio_queue_head *head, struct bio *bp); +void bioq_remove(struct bio_queue_head *head, struct bio *bp); + +int physio(struct cdev *dev, struct uio *uio, int ioflag); +#define physread physio +#define physwrite physio + +#endif /* _KERNEL */ + +#endif /* !_SYS_BIO_H_ */ diff --git a/freebsd/sys/sys/namei.h b/freebsd/sys/sys/namei.h new file mode 100644 index 00000000..53814117 --- /dev/null +++ b/freebsd/sys/sys/namei.h @@ -0,0 +1,226 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1985, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)namei.h 8.5 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#ifndef _SYS_NAMEI_H_ +#define _SYS_NAMEI_H_ + +#include +#include +#include +#include + +struct componentname { + /* + * Arguments to lookup. + */ + u_long cn_nameiop; /* namei operation */ + u_int64_t cn_flags; /* flags to namei */ + struct thread *cn_thread;/* thread requesting lookup */ + struct ucred *cn_cred; /* credentials */ + int cn_lkflags; /* Lock flags LK_EXCLUSIVE or LK_SHARED */ + /* + * Shared between lookup and commit routines. + */ + char *cn_pnbuf; /* pathname buffer */ + char *cn_nameptr; /* pointer to looked up name */ + long cn_namelen; /* length of looked up component */ +}; + +struct nameicap_tracker; +TAILQ_HEAD(nameicap_tracker_head, nameicap_tracker); + +/* + * Encapsulation of namei parameters. + */ +struct nameidata { + /* + * Arguments to namei/lookup. + */ + const char *ni_dirp; /* pathname pointer */ + enum uio_seg ni_segflg; /* location of pathname */ + cap_rights_t ni_rightsneeded; /* rights required to look up vnode */ + /* + * Arguments to lookup. + */ + struct vnode *ni_startdir; /* starting directory */ + struct vnode *ni_rootdir; /* logical root directory */ + struct vnode *ni_topdir; /* logical top directory */ + int ni_dirfd; /* starting directory for *at functions */ + int ni_lcf; /* local call flags */ + /* + * Results: returned from namei + */ + struct filecaps ni_filecaps; /* rights the *at base has */ + /* + * Results: returned from/manipulated by lookup + */ + struct vnode *ni_vp; /* vnode of result */ + struct vnode *ni_dvp; /* vnode of intermediate directory */ + /* + * Results: flags returned from namei + */ + u_int ni_resflags; + /* + * Shared between namei and lookup/commit routines. + */ + size_t ni_pathlen; /* remaining chars in path */ + char *ni_next; /* next location in pathname */ + u_int ni_loopcnt; /* count of symlinks encountered */ + /* + * Lookup parameters: this structure describes the subset of + * information from the nameidata structure that is passed + * through the VOP interface. 
+ */ + struct componentname ni_cnd; + struct nameicap_tracker_head ni_cap_tracker; +}; + +#ifdef _KERNEL +/* + * namei operations + */ +#define LOOKUP 0 /* perform name lookup only */ +#define CREATE 1 /* setup for file creation */ +#define DELETE 2 /* setup for file deletion */ +#define RENAME 3 /* setup for file renaming */ +#define OPMASK 3 /* mask for operation */ +/* + * namei operational modifier flags, stored in ni_cnd.flags + */ +#define LOCKLEAF 0x0004 /* lock vnode on return */ +#define LOCKPARENT 0x0008 /* want parent vnode returned locked */ +#define WANTPARENT 0x0010 /* want parent vnode returned unlocked */ +#define NOCACHE 0x0020 /* name must not be left in cache */ +#define FOLLOW 0x0040 /* follow symbolic links */ +#define LOCKSHARED 0x0100 /* Shared lock leaf */ +#define NOFOLLOW 0x0000 /* do not follow symbolic links (pseudo) */ +#define MODMASK 0x01fc /* mask of operational modifiers */ +/* + * Namei parameter descriptors. + * + * SAVENAME may be set by either the callers of namei or by VOP_LOOKUP. + * If the caller of namei sets the flag (for example execve wants to + * know the name of the program that is being executed), then it must + * free the buffer. If VOP_LOOKUP sets the flag, then the buffer must + * be freed by either the commit routine or the VOP_ABORT routine. + * SAVESTART is set only by the callers of namei. It implies SAVENAME + * plus the addition of saving the parent directory that contains the + * name in ni_startdir. It allows repeated calls to lookup for the + * name being sought. The caller is responsible for releasing the + * buffer and for vrele'ing ni_startdir. + */ +#define RDONLY 0x00000200 /* lookup with read-only semantics */ +#define HASBUF 0x00000400 /* has allocated pathname buffer */ +#define SAVENAME 0x00000800 /* save pathname buffer */ +#define SAVESTART 0x00001000 /* save starting directory */ +#define ISDOTDOT 0x00002000 /* current component name is .. */ +#define MAKEENTRY 0x00004000 /* entry is to be added to name cache */ +#define ISLASTCN 0x00008000 /* this is last component of pathname */ +#define ISSYMLINK 0x00010000 /* symlink needs interpretation */ +#define ISWHITEOUT 0x00020000 /* found whiteout */ +#define DOWHITEOUT 0x00040000 /* do whiteouts */ +#define WILLBEDIR 0x00080000 /* new files will be dirs; allow trailing / */ +#define ISUNICODE 0x00100000 /* current component name is unicode*/ +#define ISOPEN 0x00200000 /* caller is opening; return a real vnode. */ +#define NOCROSSMOUNT 0x00400000 /* do not cross mount points */ +#define NOMACCHECK 0x00800000 /* do not perform MAC checks */ +#define AUDITVNODE1 0x04000000 /* audit the looked up vnode information */ +#define AUDITVNODE2 0x08000000 /* audit the looked up vnode information */ +#define TRAILINGSLASH 0x10000000 /* path ended in a slash */ +#define NOCAPCHECK 0x20000000 /* do not perform capability checks */ +#define NOEXECCHECK 0x40000000 /* do not perform exec check on dir */ +#define PARAMASK 0x7ffffe00 /* mask of parameter descriptors */ + +/* + * Namei results flags + */ +#define NIRES_ABS 0x00000001 /* Path was absolute */ + +/* + * Flags in ni_lcf, valid for the duration of the namei call. + */ +#define NI_LCF_STRICTRELATIVE 0x0001 /* relative lookup only */ +#define NI_LCF_CAP_DOTDOT 0x0002 /* ".." in strictrelative case */ + +/* + * Initialization of a nameidata structure. 
+ */ +#define NDINIT(ndp, op, flags, segflg, namep, td) \ + NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, NULL, 0, td) +#define NDINIT_AT(ndp, op, flags, segflg, namep, dirfd, td) \ + NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, 0, td) +#define NDINIT_ATRIGHTS(ndp, op, flags, segflg, namep, dirfd, rightsp, td) \ + NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, rightsp, td) +#define NDINIT_ATVP(ndp, op, flags, segflg, namep, vp, td) \ + NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, vp, 0, td) + +void NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, + enum uio_seg segflg, const char *namep, int dirfd, struct vnode *startdir, + cap_rights_t *rightsp, struct thread *td); + +#define NDF_NO_DVP_RELE 0x00000001 +#define NDF_NO_DVP_UNLOCK 0x00000002 +#define NDF_NO_DVP_PUT 0x00000003 +#define NDF_NO_VP_RELE 0x00000004 +#define NDF_NO_VP_UNLOCK 0x00000008 +#define NDF_NO_VP_PUT 0x0000000c +#define NDF_NO_STARTDIR_RELE 0x00000010 +#define NDF_NO_FREE_PNBUF 0x00000020 +#define NDF_ONLY_PNBUF (~NDF_NO_FREE_PNBUF) + +void NDFREE(struct nameidata *, const u_int); + +int namei(struct nameidata *ndp); +int lookup(struct nameidata *ndp); +int relookup(struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp); +#endif + +/* + * Stats on usefulness of namei caches. + */ +struct nchstats { + long ncs_goodhits; /* hits that we can really use */ + long ncs_neghits; /* negative hits that we can use */ + long ncs_badhits; /* hits we must drop */ + long ncs_falsehits; /* hits with id mismatch */ + long ncs_miss; /* misses */ + long ncs_long; /* long names that ignore cache */ + long ncs_pass2; /* names found with passes == 2 */ + long ncs_2passes; /* number of times we attempt it */ +}; + +extern struct nchstats nchstats; + +#endif /* !_SYS_NAMEI_H_ */ diff --git a/freebsd/sys/sys/pctrie.h b/freebsd/sys/sys/pctrie.h new file mode 100644 index 00000000..88d5d258 --- /dev/null +++ b/freebsd/sys/sys/pctrie.h @@ -0,0 +1,152 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 EMC Corp. + * Copyright (c) 2011 Jeffrey Roberson + * Copyright (c) 2008 Mayur Shardul + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SYS_PCTRIE_H_ +#define _SYS_PCTRIE_H_ + +#include + +#ifdef _KERNEL + +#define PCTRIE_DEFINE(name, type, field, allocfn, freefn) \ + \ +CTASSERT(sizeof(((struct type *)0)->field) == sizeof(uint64_t)); \ +/* \ + * XXX This assert protects flag bits, it does not enforce natural \ + * alignment. 32bit architectures do not naturally align 64bit fields. \ + */ \ +CTASSERT((__offsetof(struct type, field) & (sizeof(uint32_t) - 1)) == 0); \ + \ +static __inline struct type * \ +name##_PCTRIE_VAL2PTR(uint64_t *val) \ +{ \ + \ + if (val == NULL) \ + return (NULL); \ + return (struct type *) \ + ((uintptr_t)val - __offsetof(struct type, field)); \ +} \ + \ +static __inline uint64_t * \ +name##_PCTRIE_PTR2VAL(struct type *ptr) \ +{ \ + \ + return &ptr->field; \ +} \ + \ +static __inline int \ +name##_PCTRIE_INSERT(struct pctrie *ptree, struct type *ptr) \ +{ \ + \ + return pctrie_insert(ptree, name##_PCTRIE_PTR2VAL(ptr), \ + allocfn); \ +} \ + \ +static __inline struct type * \ +name##_PCTRIE_LOOKUP(struct pctrie *ptree, uint64_t key) \ +{ \ + \ + return name##_PCTRIE_VAL2PTR(pctrie_lookup(ptree, key)); \ +} \ + \ +static __inline __unused struct type * \ +name##_PCTRIE_LOOKUP_LE(struct pctrie *ptree, uint64_t key) \ +{ \ + \ + return name##_PCTRIE_VAL2PTR(pctrie_lookup_le(ptree, key)); \ +} \ + \ +static __inline __unused struct type * \ +name##_PCTRIE_LOOKUP_GE(struct pctrie *ptree, uint64_t key) \ +{ \ + \ + return name##_PCTRIE_VAL2PTR(pctrie_lookup_ge(ptree, key)); \ +} \ + \ +static __inline __unused void \ +name##_PCTRIE_RECLAIM(struct pctrie *ptree) \ +{ \ + \ + pctrie_reclaim_allnodes(ptree, freefn); \ +} \ + \ +static __inline void \ +name##_PCTRIE_REMOVE(struct pctrie *ptree, uint64_t key) \ +{ \ + \ + pctrie_remove(ptree, key, freefn); \ +} + +typedef void *(*pctrie_alloc_t)(struct pctrie *ptree); +typedef void (*pctrie_free_t)(struct pctrie *ptree, void *node); + +int pctrie_insert(struct pctrie *ptree, uint64_t *val, + pctrie_alloc_t allocfn); +uint64_t *pctrie_lookup(struct pctrie *ptree, uint64_t key); +uint64_t *pctrie_lookup_ge(struct pctrie *ptree, uint64_t key); +uint64_t *pctrie_lookup_le(struct pctrie *ptree, uint64_t key); +void pctrie_reclaim_allnodes(struct pctrie *ptree, + pctrie_free_t freefn); +void pctrie_remove(struct pctrie *ptree, uint64_t key, + pctrie_free_t freefn); +size_t pctrie_node_size(void); +int pctrie_zone_init(void *mem, int size, int flags); + +static __inline void +pctrie_init(struct pctrie *ptree) +{ + + ptree->pt_root = 0; +} + +static __inline boolean_t +pctrie_is_empty(struct pctrie *ptree) +{ + + return (ptree->pt_root == 0); +} + +/* + * These widths should allow the pointers to a node's children to fit within + * a single cache line. The extra levels from a narrow width should not be + * a problem thanks to path compression. + */ +#ifdef __LP64__ +#define PCTRIE_WIDTH 4 +#else +#define PCTRIE_WIDTH 3 +#endif + +#define PCTRIE_COUNT (1 << PCTRIE_WIDTH) + +#endif /* _KERNEL */ +#endif /* !_SYS_PCTRIE_H_ */ diff --git a/freebsd/sys/sys/syscallsubr.h b/freebsd/sys/sys/syscallsubr.h new file mode 100644 index 00000000..677afdd6 --- /dev/null +++ b/freebsd/sys/sys/syscallsubr.h @@ -0,0 +1,317 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2002 Ian Dowse. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_SYSCALLSUBR_H_ +#define _SYS_SYSCALLSUBR_H_ + +#include +#include +#include +#include +#include +#include +#include + +struct __wrusage; +struct file; +struct filecaps; +enum idtype; +struct itimerval; +struct image_args; +struct jail; +struct kevent; +struct kevent_copyops; +struct kld_file_stat; +struct ksiginfo; +struct mbuf; +struct msghdr; +struct msqid_ds; +struct pollfd; +struct ogetdirentries_args; +struct rlimit; +struct rusage; +struct sched_param; +union semun; +struct sockaddr; +struct stat; +struct thr_param; +struct uio; + +typedef int (*mmap_check_fp_fn)(struct file *, int, int, int); + +int kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, + size_t buflen, size_t path_max); +int kern_accept(struct thread *td, int s, struct sockaddr **name, + socklen_t *namelen, struct file **fp); +int kern_accept4(struct thread *td, int s, struct sockaddr **name, + socklen_t *namelen, int flags, struct file **fp); +int kern_accessat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, int flags, int mode); +int kern_adjtime(struct thread *td, struct timeval *delta, + struct timeval *olddelta); +int kern_alternate_path(struct thread *td, const char *prefix, const char *path, + enum uio_seg pathseg, char **pathbuf, int create, int dirfd); +int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa); +int kern_break(struct thread *td, uintptr_t *addr); +int kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds, + size_t ncmds); +int kern_cap_rights_limit(struct thread *td, int fd, cap_rights_t *rights); +int kern_chdir(struct thread *td, char *path, enum uio_seg pathseg); +int kern_clock_getcpuclockid2(struct thread *td, id_t id, int which, + clockid_t *clk_id); +int kern_clock_getres(struct thread *td, clockid_t clock_id, + struct timespec *ts); +int kern_clock_gettime(struct thread *td, clockid_t clock_id, + struct timespec *ats); +int kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, + const struct timespec *rqtp, struct timespec *rmtp); +int kern_clock_settime(struct thread *td, clockid_t clock_id, + struct timespec *ats); +int kern_close(struct thread *td, int fd); +int kern_connectat(struct thread *td, int dirfd, int fd, + struct sockaddr *sa); +int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t 
*maskp); +int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t cpusetsize, + const cpuset_t *maskp); +int kern_cpuset_getdomain(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t domainsetsize, + domainset_t *maskp, int *policyp); +int kern_cpuset_setdomain(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t domainsetsize, + const domainset_t *maskp, int policy); +int kern_cpuset_getid(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, cpusetid_t *setid); +int kern_cpuset_setid(struct thread *td, cpuwhich_t which, + id_t id, cpusetid_t setid); +int kern_dup(struct thread *td, u_int mode, int flags, int old, int new); +int kern_execve(struct thread *td, struct image_args *args, + struct mac *mac_p); +int kern_fchmodat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, mode_t mode, int flag); +int kern_fchownat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, int uid, int gid, int flag); +int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg); +int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg); +int kern_fhstat(struct thread *td, fhandle_t fh, struct stat *buf); +int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf); +int kern_fpathconf(struct thread *td, int fd, int name, long *valuep); +int kern_fstat(struct thread *td, int fd, struct stat *sbp); +int kern_fstatfs(struct thread *td, int fd, struct statfs *buf); +int kern_fsync(struct thread *td, int fd, bool fullsync); +int kern_ftruncate(struct thread *td, int fd, off_t length); +int kern_futimes(struct thread *td, int fd, struct timeval *tptr, + enum uio_seg tptrseg); +int kern_futimens(struct thread *td, int fd, struct timespec *tptr, + enum uio_seg tptrseg); +int kern_getdirentries(struct thread *td, int fd, char *buf, size_t count, + off_t *basep, ssize_t *residp, enum uio_seg bufseg); +int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, + size_t *countp, enum uio_seg bufseg, int mode); +int kern_getitimer(struct thread *, u_int, struct itimerval *); +int kern_getppid(struct thread *); +int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, + socklen_t *alen); +int kern_getrusage(struct thread *td, int who, struct rusage *rup); +int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, + socklen_t *alen); +int kern_getsockopt(struct thread *td, int s, int level, int name, + void *optval, enum uio_seg valseg, socklen_t *valsize); +int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data); +int kern_jail(struct thread *td, struct jail *j); +int kern_jail_get(struct thread *td, struct uio *options, int flags); +int kern_jail_set(struct thread *td, struct uio *options, int flags); +int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, + struct kevent_copyops *k_ops, const struct timespec *timeout); +int kern_kevent_anonymous(struct thread *td, int nevents, + struct kevent_copyops *k_ops); +int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, + int nevents, struct kevent_copyops *k_ops, + const struct timespec *timeout); +int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps); +int kern_kldload(struct thread *td, const char *file, int *fileid); +int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat); +int kern_kldunload(struct thread *td, int fileid, int flags); +int kern_linkat(struct thread *td, int fd1, int fd2, char *path1, + char 
*path2, enum uio_seg segflg, int follow); +int kern_listen(struct thread *td, int s, int backlog); +int kern_lseek(struct thread *td, int fd, off_t offset, int whence); +int kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg, + struct timeval *tptr, enum uio_seg tptrseg); +int kern_madvise(struct thread *td, uintptr_t addr, size_t len, int behav); +int kern_mincore(struct thread *td, uintptr_t addr, size_t len, char *vec); +int kern_mkdirat(struct thread *td, int fd, char *path, + enum uio_seg segflg, int mode); +int kern_mkfifoat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, int mode); +int kern_mknodat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, int mode, dev_t dev); +int kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr, + size_t len); +int kern_mmap(struct thread *td, uintptr_t addr, size_t size, int prot, + int flags, int fd, off_t pos); +int kern_mmap_fpcheck(struct thread *td, uintptr_t addr, size_t len, + int prot, int flags, int fd, off_t pos, + mmap_check_fp_fn check_fp_fn); +int kern_mprotect(struct thread *td, uintptr_t addr, size_t size, int prot); +int kern_msgctl(struct thread *, int, int, struct msqid_ds *); +int kern_msgrcv(struct thread *, int, void *, size_t, long, int, long *); +int kern_msgsnd(struct thread *, int, const void *, size_t, int, long); +int kern_msync(struct thread *td, uintptr_t addr, size_t size, int flags); +int kern_munlock(struct thread *td, uintptr_t addr, size_t size); +int kern_munmap(struct thread *td, uintptr_t addr, size_t size); +int kern_nanosleep(struct thread *td, struct timespec *rqt, + struct timespec *rmt); +int kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap, + long *ploff); +int kern_openat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, int flags, int mode); +int kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, + int name, u_long flags, long *valuep); +int kern_pipe(struct thread *td, int fildes[2], int flags, + struct filecaps *fcaps1, struct filecaps *fcaps2); +int kern_poll(struct thread *td, struct pollfd *fds, u_int nfds, + struct timespec *tsp, sigset_t *uset); +int kern_posix_error(struct thread *td, int error); +int kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len, + int advice); +int kern_posix_fallocate(struct thread *td, int fd, off_t offset, + off_t len); +int kern_procctl(struct thread *td, enum idtype idtype, id_t id, int com, + void *data); +int kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, + off_t offset); +int kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset); +int kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, + fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits); +int kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, + int data); +int kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte, + off_t offset); +int kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset); +int kern_readlinkat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count); +int kern_readv(struct thread *td, int fd, struct uio *auio); +int kern_recvit(struct thread *td, int s, struct msghdr *mp, + enum uio_seg fromseg, struct mbuf **controlp); +int kern_renameat(struct thread *td, int oldfd, char *old, int newfd, + char *new, enum uio_seg pathseg); +int kern_rmdirat(struct thread *td, int fd, char *path, + enum uio_seg pathseg); +int 
kern_sched_getparam(struct thread *td, struct thread *targettd, + struct sched_param *param); +int kern_sched_getscheduler(struct thread *td, struct thread *targettd, + int *policy); +int kern_sched_setparam(struct thread *td, struct thread *targettd, + struct sched_param *param); +int kern_sched_setscheduler(struct thread *td, struct thread *targettd, + int policy, struct sched_param *param); +int kern_sched_rr_get_interval(struct thread *td, pid_t pid, + struct timespec *ts); +int kern_sched_rr_get_interval_td(struct thread *td, struct thread *targettd, + struct timespec *ts); +int kern_semctl(struct thread *td, int semid, int semnum, int cmd, + union semun *arg, register_t *rval); +int kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, + fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits); +int kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags, + struct mbuf *control, enum uio_seg segflg); +int kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups); +int kern_setitimer(struct thread *, u_int, struct itimerval *, + struct itimerval *); +int kern_setrlimit(struct thread *, u_int, struct rlimit *); +int kern_setsockopt(struct thread *td, int s, int level, int name, + void *optval, enum uio_seg valseg, socklen_t valsize); +int kern_settimeofday(struct thread *td, struct timeval *tv, + struct timezone *tzp); +int kern_shm_open(struct thread *td, const char *userpath, int flags, + mode_t mode, struct filecaps *fcaps); +int kern_shmat(struct thread *td, int shmid, const void *shmaddr, + int shmflg); +int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, + size_t *bufsz); +int kern_shutdown(struct thread *td, int s, int how); +int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, + struct sigaction *oact, int flags); +int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss); +int kern_sigprocmask(struct thread *td, int how, + sigset_t *set, sigset_t *oset, int flags); +int kern_sigsuspend(struct thread *td, sigset_t mask); +int kern_sigtimedwait(struct thread *td, sigset_t waitset, + struct ksiginfo *ksi, struct timespec *timeout); +int kern_sigqueue(struct thread *td, pid_t pid, int signum, + union sigval *value); +int kern_socket(struct thread *td, int domain, int type, int protocol); +int kern_statat(struct thread *td, int flag, int fd, char *path, + enum uio_seg pathseg, struct stat *sbp, + void (*hook)(struct vnode *vp, struct stat *sbp)); +int kern_statfs(struct thread *td, char *path, enum uio_seg pathseg, + struct statfs *buf); +int kern_symlinkat(struct thread *td, char *path1, int fd, char *path2, + enum uio_seg segflg); +int kern_ktimer_create(struct thread *td, clockid_t clock_id, + struct sigevent *evp, int *timerid, int preset_id); +int kern_ktimer_delete(struct thread *, int); +int kern_ktimer_settime(struct thread *td, int timer_id, int flags, + struct itimerspec *val, struct itimerspec *oval); +int kern_ktimer_gettime(struct thread *td, int timer_id, + struct itimerspec *val); +int kern_ktimer_getoverrun(struct thread *td, int timer_id); +int kern_thr_alloc(struct proc *, int pages, struct thread **); +int kern_thr_exit(struct thread *td); +int kern_thr_new(struct thread *td, struct thr_param *param); +int kern_thr_suspend(struct thread *td, struct timespec *tsp); +int kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, + off_t length); +int kern_unlinkat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, ino_t oldinum); +int kern_utimesat(struct thread *td, int fd, 
char *path, + enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg); +int kern_utimensat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg, + int follow); +int kern_wait(struct thread *td, pid_t pid, int *status, int options, + struct rusage *rup); +int kern_wait6(struct thread *td, enum idtype idtype, id_t id, int *status, + int options, struct __wrusage *wrup, siginfo_t *sip); +int kern_writev(struct thread *td, int fd, struct uio *auio); +int kern_socketpair(struct thread *td, int domain, int type, int protocol, + int *rsv); + +/* flags for kern_sigaction */ +#define KSA_OSIGSET 0x0001 /* uses osigact_t */ +#define KSA_FREEBSD4 0x0002 /* uses ucontext4 */ + +struct freebsd11_dirent; + +int freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int + count, long *basep, void (*func)(struct freebsd11_dirent *)); + +#endif /* !_SYS_SYSCALLSUBR_H_ */ diff --git a/freebsd/sys/sys/sysent.h b/freebsd/sys/sys/sysent.h new file mode 100644 index 00000000..d1d9e99b --- /dev/null +++ b/freebsd/sys/sys/sysent.h @@ -0,0 +1,327 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1988, 1991 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SYS_SYSENT_H_ +#define _SYS_SYSENT_H_ + +#include + +struct rlimit; +struct sysent; +struct thread; +struct ksiginfo; +struct syscall_args; + +enum systrace_probe_t { + SYSTRACE_ENTRY, + SYSTRACE_RETURN, +}; + +typedef int sy_call_t(struct thread *, void *); + +typedef void (*systrace_probe_func_t)(struct syscall_args *, + enum systrace_probe_t, int); +typedef void (*systrace_args_func_t)(int, void *, uint64_t *, int *); + +#ifdef _KERNEL +extern bool systrace_enabled; +#endif +extern systrace_probe_func_t systrace_probe_func; + +struct sysent { /* system call table */ + int sy_narg; /* number of arguments */ + sy_call_t *sy_call; /* implementing function */ + au_event_t sy_auevent; /* audit event associated with syscall */ + systrace_args_func_t sy_systrace_args_func; + /* optional argument conversion function. */ + u_int32_t sy_entry; /* DTrace entry ID for systrace. */ + u_int32_t sy_return; /* DTrace return ID for systrace. */ + u_int32_t sy_flags; /* General flags for system calls. */ + u_int32_t sy_thrcnt; +}; + +/* + * A system call is permitted in capability mode. + */ +#define SYF_CAPENABLED 0x00000001 + +#define SY_THR_FLAGMASK 0x7 +#define SY_THR_STATIC 0x1 +#define SY_THR_DRAINING 0x2 +#define SY_THR_ABSENT 0x4 +#define SY_THR_INCR 0x8 + +#ifdef KLD_MODULE +#define SY_THR_STATIC_KLD 0 +#else +#define SY_THR_STATIC_KLD SY_THR_STATIC +#endif + +struct image_params; +struct __sigset; +struct trapframe; +struct vnode; + +struct sysentvec { + int sv_size; /* number of entries */ + struct sysent *sv_table; /* pointer to sysent */ + u_int sv_mask; /* optional mask to index */ + int sv_errsize; /* size of errno translation table */ + const int *sv_errtbl; /* errno translation table */ + int (*sv_transtrap)(int, int); + /* translate trap-to-signal mapping */ + int (*sv_fixup)(register_t **, struct image_params *); + /* stack fixup function */ + void (*sv_sendsig)(void (*)(int), struct ksiginfo *, struct __sigset *); + /* send signal */ + char *sv_sigcode; /* start of sigtramp code */ + int *sv_szsigcode; /* size of sigtramp code */ + char *sv_name; /* name of binary type */ + int (*sv_coredump)(struct thread *, struct vnode *, off_t, int); + /* function to dump core, or NULL */ + int (*sv_imgact_try)(struct image_params *); + void (*sv_stackgap)(struct image_params *, u_long *); + int sv_minsigstksz; /* minimum signal stack size */ + int sv_pagesize; /* spare / no longer used */ + vm_offset_t sv_minuser; /* VM_MIN_ADDRESS */ + vm_offset_t sv_maxuser; /* VM_MAXUSER_ADDRESS */ + vm_offset_t sv_usrstack; /* USRSTACK */ + vm_offset_t sv_psstrings; /* PS_STRINGS */ + int sv_stackprot; /* vm protection for stack */ + register_t *(*sv_copyout_strings)(struct image_params *); + void (*sv_setregs)(struct thread *, struct image_params *, + u_long); + void (*sv_fixlimit)(struct rlimit *, int); + u_long *sv_maxssiz; + u_int sv_flags; + void (*sv_set_syscall_retval)(struct thread *, int); + int (*sv_fetch_syscall_args)(struct thread *); + const char **sv_syscallnames; + vm_offset_t sv_timekeep_base; + vm_offset_t sv_shared_page_base; + vm_offset_t sv_shared_page_len; + vm_offset_t sv_sigcode_base; + void *sv_shared_page_obj; + void (*sv_schedtail)(struct thread *); + void (*sv_thread_detach)(struct thread *); + int (*sv_trap)(struct thread *); + u_long *sv_hwcap; /* Value passed in AT_HWCAP. */ + u_long *sv_hwcap2; /* Value passed in AT_HWCAP2. */ +}; + +#define SV_ILP32 0x000100 /* 32-bit executable. */ +#define SV_LP64 0x000200 /* 64-bit executable. 
*/ +#define SV_IA32 0x004000 /* Intel 32-bit executable. */ +#define SV_AOUT 0x008000 /* a.out executable. */ +#define SV_SHP 0x010000 /* Shared page. */ +#define SV_CAPSICUM 0x020000 /* Force cap_enter() on startup. */ +#define SV_TIMEKEEP 0x040000 /* Shared page timehands. */ +#define SV_ASLR 0x080000 /* ASLR allowed. */ + +#define SV_ABI_MASK 0xff +#define SV_ABI_ERRNO(p, e) ((p)->p_sysent->sv_errsize <= 0 ? e : \ + ((e) >= (p)->p_sysent->sv_errsize ? -1 : (p)->p_sysent->sv_errtbl[e])) +#define SV_PROC_FLAG(p, x) ((p)->p_sysent->sv_flags & (x)) +#define SV_PROC_ABI(p) ((p)->p_sysent->sv_flags & SV_ABI_MASK) +#define SV_CURPROC_FLAG(x) SV_PROC_FLAG(curproc, x) +#define SV_CURPROC_ABI() SV_PROC_ABI(curproc) +/* same as ELFOSABI_XXX, to prevent header pollution */ +#define SV_ABI_LINUX 3 +#define SV_ABI_FREEBSD 9 +#define SV_ABI_CLOUDABI 17 +#define SV_ABI_UNDEF 255 + +#ifdef _KERNEL +extern struct sysentvec aout_sysvec; +extern struct sysent sysent[]; +extern const char *syscallnames[]; + +#if defined(__amd64__) +extern int i386_read_exec; +#endif + +#define NO_SYSCALL (-1) + +struct module; + +struct syscall_module_data { + int (*chainevh)(struct module *, int, void *); /* next handler */ + void *chainarg; /* arg for next event handler */ + int *offset; /* offset into sysent */ + struct sysent *new_sysent; /* new sysent */ + struct sysent old_sysent; /* old sysent */ + int flags; /* flags for syscall_register */ +}; + +/* separate initialization vector so it can be used in a substructure */ +#define SYSENT_INIT_VALS(_syscallname) { \ + .sy_narg = (sizeof(struct _syscallname ## _args ) \ + / sizeof(register_t)), \ + .sy_call = (sy_call_t *)&sys_##_syscallname, \ + .sy_auevent = SYS_AUE_##_syscallname, \ + .sy_systrace_args_func = NULL, \ + .sy_entry = 0, \ + .sy_return = 0, \ + .sy_flags = 0, \ + .sy_thrcnt = 0 \ +} + +#define MAKE_SYSENT(syscallname) \ +static struct sysent syscallname##_sysent = SYSENT_INIT_VALS(syscallname); + +#define MAKE_SYSENT_COMPAT(syscallname) \ +static struct sysent syscallname##_sysent = { \ + (sizeof(struct syscallname ## _args ) \ + / sizeof(register_t)), \ + (sy_call_t *)& syscallname, \ + SYS_AUE_##syscallname \ +} + +#define SYSCALL_MODULE(name, offset, new_sysent, evh, arg) \ +static struct syscall_module_data name##_syscall_mod = { \ + evh, arg, offset, new_sysent, { 0, NULL, AUE_NULL } \ +}; \ + \ +static moduledata_t name##_mod = { \ + "sys/" #name, \ + syscall_module_handler, \ + &name##_syscall_mod \ +}; \ +DECLARE_MODULE(name, name##_mod, SI_SUB_SYSCALLS, SI_ORDER_MIDDLE) + +#define SYSCALL_MODULE_HELPER(syscallname) \ +static int syscallname##_syscall = SYS_##syscallname; \ +MAKE_SYSENT(syscallname); \ +SYSCALL_MODULE(syscallname, \ + & syscallname##_syscall, & syscallname##_sysent, \ + NULL, NULL) + +#define SYSCALL_MODULE_PRESENT(syscallname) \ + (sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmnosys && \ + sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmressys) + +/* + * Syscall registration helpers with resource allocation handling. 
+ */ +struct syscall_helper_data { + struct sysent new_sysent; + struct sysent old_sysent; + int syscall_no; + int registered; +}; +#define SYSCALL_INIT_HELPER_F(syscallname, flags) { \ + .new_sysent = { \ + .sy_narg = (sizeof(struct syscallname ## _args ) \ + / sizeof(register_t)), \ + .sy_call = (sy_call_t *)& sys_ ## syscallname, \ + .sy_auevent = SYS_AUE_##syscallname, \ + .sy_flags = (flags) \ + }, \ + .syscall_no = SYS_##syscallname \ +} +#define SYSCALL_INIT_HELPER_COMPAT_F(syscallname, flags) { \ + .new_sysent = { \ + .sy_narg = (sizeof(struct syscallname ## _args ) \ + / sizeof(register_t)), \ + .sy_call = (sy_call_t *)& syscallname, \ + .sy_auevent = SYS_AUE_##syscallname, \ + .sy_flags = (flags) \ + }, \ + .syscall_no = SYS_##syscallname \ +} +#define SYSCALL_INIT_HELPER(syscallname) \ + SYSCALL_INIT_HELPER_F(syscallname, 0) +#define SYSCALL_INIT_HELPER_COMPAT(syscallname) \ + SYSCALL_INIT_HELPER_COMPAT_F(syscallname, 0) +#define SYSCALL_INIT_LAST { \ + .syscall_no = NO_SYSCALL \ +} + +int syscall_module_handler(struct module *mod, int what, void *arg); +int syscall_helper_register(struct syscall_helper_data *sd, int flags); +int syscall_helper_unregister(struct syscall_helper_data *sd); +/* Implementation, exposed for COMPAT code */ +int kern_syscall_register(struct sysent *sysents, int *offset, + struct sysent *new_sysent, struct sysent *old_sysent, int flags); +int kern_syscall_deregister(struct sysent *sysents, int offset, + const struct sysent *old_sysent); +int kern_syscall_module_handler(struct sysent *sysents, + struct module *mod, int what, void *arg); +int kern_syscall_helper_register(struct sysent *sysents, + struct syscall_helper_data *sd, int flags); +int kern_syscall_helper_unregister(struct sysent *sysents, + struct syscall_helper_data *sd); + +struct proc; +const char *syscallname(struct proc *p, u_int code); + +/* Special purpose system call functions. */ +struct nosys_args; + +int lkmnosys(struct thread *, struct nosys_args *); +int lkmressys(struct thread *, struct nosys_args *); + +int _syscall_thread_enter(struct thread *td, struct sysent *se); +void _syscall_thread_exit(struct thread *td, struct sysent *se); + +static inline int +syscall_thread_enter(struct thread *td, struct sysent *se) +{ + + if (__predict_true((se->sy_thrcnt & SY_THR_STATIC) != 0)) + return (0); + return (_syscall_thread_enter(td, se)); +} + +static inline void +syscall_thread_exit(struct thread *td, struct sysent *se) +{ + + if (__predict_true((se->sy_thrcnt & SY_THR_STATIC) != 0)) + return; + _syscall_thread_exit(td, se); +} + +int shared_page_alloc(int size, int align); +int shared_page_fill(int size, int align, const void *data); +void shared_page_write(int base, int size, const void *data); +void exec_sysvec_init(void *param); +void exec_inittk(void); + +#define INIT_SYSENTVEC(name, sv) \ + SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, \ + (sysinit_cfunc_t)exec_sysvec_init, sv); + +#endif /* _KERNEL */ + +#endif /* !_SYS_SYSENT_H_ */ diff --git a/freebsd/sys/sys/vmem.h b/freebsd/sys/sys/vmem.h new file mode 100644 index 00000000..e74d1e3f --- /dev/null +++ b/freebsd/sys/sys/vmem.h @@ -0,0 +1,145 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c)2006 YAMAMOTO Takashi, + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* From $NetBSD: vmem.h,v 1.20 2013/01/29 21:26:24 para Exp $ */ + +/* $FreeBSD$ */ + +#ifndef _SYS_VMEM_H_ +#define _SYS_VMEM_H_ + +#include + +#ifdef _KERNEL + +typedef struct vmem vmem_t; + +typedef uintptr_t vmem_addr_t; +typedef size_t vmem_size_t; + +#define VMEM_ADDR_MIN 0 +#define VMEM_ADDR_QCACHE_MIN 1 +#define VMEM_ADDR_MAX (~(vmem_addr_t)0) + +typedef int (vmem_import_t)(void *, vmem_size_t, int, vmem_addr_t *); +typedef void (vmem_release_t)(void *, vmem_addr_t, vmem_size_t); +typedef void (vmem_reclaim_t)(vmem_t *, int); + +/* + * Create a vmem: + * name - Name of the region + * base - Initial span start (optional) + * size - Initial span size + * quantum - Natural unit of allocation (ie PAGE_SIZE, 1, etc) + * qcache_max - Maximum size to quantum cache. This creates a UMA + * cache for each multiple of quantum up to qcache_max. + * flags - M_* flags + */ +vmem_t *vmem_create(const char *name, vmem_addr_t base, + vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags); +vmem_t *vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, + vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags); +void vmem_destroy(vmem_t *); + +/* + * Set callbacks for bringing in dynamic regions: + * importfn - Backing store import routine. + * releasefn - Backing store release routine. + * arg - Backing store argument + * import_quantum - Size to import from backing store + */ + +void vmem_set_import(vmem_t *vm, vmem_import_t *importfn, + vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum); + +/* + * Set a limit on the total size of a vmem. + */ + +void vmem_set_limit(vmem_t *vm, vmem_size_t limit); + +/* + * Set a callback for reclaiming memory when space is exhausted: + */ +void vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn); + +/* + * Allocate and free linear regions from a vmem. Must specify + * BESTFIT or FIRSTFIT. Free is non-blocking. These routines + * respect the quantum caches. + */ +int vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp); +void vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size); + +/* + * Constrained allocate and free routines. These bypass the quantum cache. + * size - Size in units of 1, not quantum. 
+ * align - Required alignment of the start of region + * phase - Offset from alignment + * nocross - Illegal boundary + * minaddr - Minimum allowed address for last byte + * maxaddr - Maximum allowed address for first byte + * flags - M_* flags + * addrp - result + */ +int vmem_xalloc(vmem_t *vm, vmem_size_t size, vmem_size_t align, + vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr, + vmem_addr_t maxaddr, int flags, vmem_addr_t *addrp); +void vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size); + +/* + * Add a static region to a vmem after create. This won't be freed + * until the vmem is destroyed. + */ +int vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags); + +/* + * Given roundup size to the vmem's native quantum size. + */ +vmem_size_t vmem_roundup_size(vmem_t *vm, vmem_size_t size); + +/* + * Report vmem utilization according to the requested type. + */ +vmem_size_t vmem_size(vmem_t *vm, int typemask); + +void vmem_whatis(vmem_addr_t addr, int (*fn)(const char *, ...) + __printflike(1, 2)); +void vmem_print(vmem_addr_t addr, const char *, int (*fn)(const char *, ...) + __printflike(1, 2)); +void vmem_printall(const char *, int (*fn)(const char *, ...) + __printflike(1, 2)); +void vmem_startup(void); + +/* vmem_size typemask */ +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 +#define VMEM_MAXFREE 0x10 + +#endif /* _KERNEL */ + +#endif /* !_SYS_VMEM_H_ */ diff --git a/freebsd/sys/vm/vm_meter.c b/freebsd/sys/vm/vm_meter.c new file mode 100644 index 00000000..dfd50081 --- /dev/null +++ b/freebsd/sys/vm/vm_meter.c @@ -0,0 +1,561 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct vmmeter __read_mostly vm_cnt = { + .v_swtch = EARLY_COUNTER, + .v_trap = EARLY_COUNTER, + .v_syscall = EARLY_COUNTER, + .v_intr = EARLY_COUNTER, + .v_soft = EARLY_COUNTER, + .v_vm_faults = EARLY_COUNTER, + .v_io_faults = EARLY_COUNTER, + .v_cow_faults = EARLY_COUNTER, + .v_cow_optim = EARLY_COUNTER, + .v_zfod = EARLY_COUNTER, + .v_ozfod = EARLY_COUNTER, + .v_swapin = EARLY_COUNTER, + .v_swapout = EARLY_COUNTER, + .v_swappgsin = EARLY_COUNTER, + .v_swappgsout = EARLY_COUNTER, + .v_vnodein = EARLY_COUNTER, + .v_vnodeout = EARLY_COUNTER, + .v_vnodepgsin = EARLY_COUNTER, + .v_vnodepgsout = EARLY_COUNTER, + .v_intrans = EARLY_COUNTER, + .v_reactivated = EARLY_COUNTER, + .v_pdwakeups = EARLY_COUNTER, + .v_pdpages = EARLY_COUNTER, + .v_pdshortfalls = EARLY_COUNTER, + .v_dfree = EARLY_COUNTER, + .v_pfree = EARLY_COUNTER, + .v_tfree = EARLY_COUNTER, + .v_forks = EARLY_COUNTER, + .v_vforks = EARLY_COUNTER, + .v_rforks = EARLY_COUNTER, + .v_kthreads = EARLY_COUNTER, + .v_forkpages = EARLY_COUNTER, + .v_vforkpages = EARLY_COUNTER, + .v_rforkpages = EARLY_COUNTER, + .v_kthreadpages = EARLY_COUNTER, + .v_wire_count = EARLY_COUNTER, +}; + +static void +vmcounter_startup(void) +{ + counter_u64_t *cnt = (counter_u64_t *)&vm_cnt; + + COUNTER_ARRAY_ALLOC(cnt, VM_METER_NCOUNTERS, M_WAITOK); +} +SYSINIT(counter, SI_SUB_KMEM, SI_ORDER_FIRST, vmcounter_startup, NULL); + +SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min, + CTLFLAG_RW, &vm_cnt.v_free_min, 0, "Minimum low-free-pages threshold"); +SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target, + CTLFLAG_RW, &vm_cnt.v_free_target, 0, "Desired free pages"); +SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved, + CTLFLAG_RW, &vm_cnt.v_free_reserved, 0, "Pages reserved for deadlock"); +SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target, + CTLFLAG_RW, &vm_cnt.v_inactive_target, 0, "Pages desired inactive"); +SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min, + CTLFLAG_RW, &vm_cnt.v_pageout_free_min, 0, "Min pages reserved for kernel"); +SYSCTL_UINT(_vm, OID_AUTO, v_free_severe, + CTLFLAG_RW, &vm_cnt.v_free_severe, 0, "Severe page depletion point"); + +static int +sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS) +{ + +#ifdef SCTL_MASK32 + u_int32_t la[4]; + + if (req->flags & SCTL_MASK32) { + la[0] = averunnable.ldavg[0]; + la[1] = averunnable.ldavg[1]; + la[2] = averunnable.ldavg[2]; + la[3] = averunnable.fscale; + return SYSCTL_OUT(req, la, sizeof(la)); + } else +#endif + return SYSCTL_OUT(req, &averunnable, sizeof(averunnable)); +} +SYSCTL_PROC(_vm, VM_LOADAVG, loadavg, CTLTYPE_STRUCT | CTLFLAG_RD | + CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_loadavg, "S,loadavg", + "Machine loadaverage history"); + +/* + * This function aims to determine if the object is mapped, + * specifically, if it is referenced by a vm_map_entry. Because + * objects occasionally acquire transient references that do not + * represent a mapping, the method used here is inexact. However, it + * has very low overhead and is good enough for the advisory + * vm.vmtotal sysctl. 
+ */ +static bool +is_object_active(vm_object_t obj) +{ + + return (obj->ref_count > obj->shadow_count); +} + +#if defined(COMPAT_FREEBSD11) +struct vmtotal11 { + int16_t t_rq; + int16_t t_dw; + int16_t t_pw; + int16_t t_sl; + int16_t t_sw; + int32_t t_vm; + int32_t t_avm; + int32_t t_rm; + int32_t t_arm; + int32_t t_vmshr; + int32_t t_avmshr; + int32_t t_rmshr; + int32_t t_armshr; + int32_t t_free; +}; +#endif + +static int +vmtotal(SYSCTL_HANDLER_ARGS) +{ + struct vmtotal total; +#if defined(COMPAT_FREEBSD11) + struct vmtotal11 total11; +#endif + vm_object_t object; + struct proc *p; + struct thread *td; + + if (req->oldptr == NULL) { +#if defined(COMPAT_FREEBSD11) + if (curproc->p_osrel < P_OSREL_VMTOTAL64) + return (SYSCTL_OUT(req, NULL, sizeof(total11))); +#endif + return (SYSCTL_OUT(req, NULL, sizeof(total))); + } + bzero(&total, sizeof(total)); + + /* + * Calculate process statistics. + */ + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if ((p->p_flag & P_SYSTEM) != 0) + continue; + PROC_LOCK(p); + if (p->p_state != PRS_NEW) { + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + switch (td->td_state) { + case TDS_INHIBITED: + if (TD_IS_SWAPPED(td)) + total.t_sw++; + else if (TD_IS_SLEEPING(td)) { + if (td->td_priority <= PZERO) + total.t_dw++; + else + total.t_sl++; + } + break; + case TDS_CAN_RUN: + total.t_sw++; + break; + case TDS_RUNQ: + case TDS_RUNNING: + total.t_rq++; + break; + default: + break; + } + thread_unlock(td); + } + } + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + /* + * Calculate object memory usage statistics. + */ + mtx_lock(&vm_object_list_mtx); + TAILQ_FOREACH(object, &vm_object_list, object_list) { + /* + * Perform unsynchronized reads on the object. In + * this case, the lack of synchronization should not + * impair the accuracy of the reported statistics. + */ + if ((object->flags & OBJ_FICTITIOUS) != 0) { + /* + * Devices, like /dev/mem, will badly skew our totals. + */ + continue; + } + if (object->ref_count == 0) { + /* + * Also skip unreferenced objects, including + * vnodes representing mounted file systems. + */ + continue; + } + if (object->ref_count == 1 && + (object->flags & OBJ_NOSPLIT) != 0) { + /* + * Also skip otherwise unreferenced swap + * objects backing tmpfs vnodes, and POSIX or + * SysV shared memory. 
+ */ + continue; + } + total.t_vm += object->size; + total.t_rm += object->resident_page_count; + if (is_object_active(object)) { + total.t_avm += object->size; + total.t_arm += object->resident_page_count; + } + if (object->shadow_count > 1) { + /* shared object */ + total.t_vmshr += object->size; + total.t_rmshr += object->resident_page_count; + if (is_object_active(object)) { + total.t_avmshr += object->size; + total.t_armshr += object->resident_page_count; + } + } + } + mtx_unlock(&vm_object_list_mtx); + total.t_pw = vm_wait_count(); + total.t_free = vm_free_count(); +#if defined(COMPAT_FREEBSD11) + /* sysctl(8) allocates twice as much memory as reported by sysctl(3) */ + if (curproc->p_osrel < P_OSREL_VMTOTAL64 && (req->oldlen == + sizeof(total11) || req->oldlen == 2 * sizeof(total11))) { + bzero(&total11, sizeof(total11)); + total11.t_rq = total.t_rq; + total11.t_dw = total.t_dw; + total11.t_pw = total.t_pw; + total11.t_sl = total.t_sl; + total11.t_sw = total.t_sw; + total11.t_vm = total.t_vm; /* truncate */ + total11.t_avm = total.t_avm; /* truncate */ + total11.t_rm = total.t_rm; /* truncate */ + total11.t_arm = total.t_arm; /* truncate */ + total11.t_vmshr = total.t_vmshr; /* truncate */ + total11.t_avmshr = total.t_avmshr; /* truncate */ + total11.t_rmshr = total.t_rmshr; /* truncate */ + total11.t_armshr = total.t_armshr; /* truncate */ + total11.t_free = total.t_free; /* truncate */ + return (SYSCTL_OUT(req, &total11, sizeof(total11))); + } +#endif + return (SYSCTL_OUT(req, &total, sizeof(total))); +} + +SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE | CTLFLAG_RD | + CTLFLAG_MPSAFE, NULL, 0, vmtotal, "S,vmtotal", + "System virtual memory statistics"); +SYSCTL_NODE(_vm, OID_AUTO, stats, CTLFLAG_RW, 0, "VM meter stats"); +static SYSCTL_NODE(_vm_stats, OID_AUTO, sys, CTLFLAG_RW, 0, + "VM meter sys stats"); +static SYSCTL_NODE(_vm_stats, OID_AUTO, vm, CTLFLAG_RW, 0, + "VM meter vm stats"); +SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats"); + +static int +sysctl_handle_vmstat(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; +#ifdef COMPAT_FREEBSD11 + uint32_t val32; +#endif + + val = counter_u64_fetch(*(counter_u64_t *)arg1); +#ifdef COMPAT_FREEBSD11 + if (req->oldlen == sizeof(val32)) { + val32 = val; /* truncate */ + return (SYSCTL_OUT(req, &val32, sizeof(val32))); + } +#endif + return (SYSCTL_OUT(req, &val, sizeof(val))); +} + +#define VM_STATS(parent, var, descr) \ + SYSCTL_OID(parent, OID_AUTO, var, CTLTYPE_U64 | CTLFLAG_MPSAFE | \ + CTLFLAG_RD, &vm_cnt.var, 0, sysctl_handle_vmstat, "QU", descr) +#define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr) +#define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr) + +VM_STATS_SYS(v_swtch, "Context switches"); +VM_STATS_SYS(v_trap, "Traps"); +VM_STATS_SYS(v_syscall, "System calls"); +VM_STATS_SYS(v_intr, "Device interrupts"); +VM_STATS_SYS(v_soft, "Software interrupts"); +VM_STATS_VM(v_vm_faults, "Address memory faults"); +VM_STATS_VM(v_io_faults, "Page faults requiring I/O"); +VM_STATS_VM(v_cow_faults, "Copy-on-write faults"); +VM_STATS_VM(v_cow_optim, "Optimized COW faults"); +VM_STATS_VM(v_zfod, "Pages zero-filled on demand"); +VM_STATS_VM(v_ozfod, "Optimized zero fill pages"); +VM_STATS_VM(v_swapin, "Swap pager pageins"); +VM_STATS_VM(v_swapout, "Swap pager pageouts"); +VM_STATS_VM(v_swappgsin, "Swap pages swapped in"); +VM_STATS_VM(v_swappgsout, "Swap pages swapped out"); +VM_STATS_VM(v_vnodein, "Vnode pager pageins"); +VM_STATS_VM(v_vnodeout, "Vnode pager pageouts"); 
+VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in"); +VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out"); +VM_STATS_VM(v_intrans, "In transit page faults"); +VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon"); +VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups"); +VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls"); +VM_STATS_VM(v_dfree, "Pages freed by pagedaemon"); +VM_STATS_VM(v_pfree, "Pages freed by exiting processes"); +VM_STATS_VM(v_tfree, "Total pages freed"); +VM_STATS_VM(v_forks, "Number of fork() calls"); +VM_STATS_VM(v_vforks, "Number of vfork() calls"); +VM_STATS_VM(v_rforks, "Number of rfork() calls"); +VM_STATS_VM(v_kthreads, "Number of fork() calls by kernel"); +VM_STATS_VM(v_forkpages, "VM pages affected by fork()"); +VM_STATS_VM(v_vforkpages, "VM pages affected by vfork()"); +VM_STATS_VM(v_rforkpages, "VM pages affected by rfork()"); +VM_STATS_VM(v_kthreadpages, "VM pages affected by fork() by kernel"); + +static int +sysctl_handle_vmstat_proc(SYSCTL_HANDLER_ARGS) +{ + u_int (*fn)(void); + uint32_t val; + + fn = arg1; + val = fn(); + return (SYSCTL_OUT(req, &val, sizeof(val))); +} + +#define VM_STATS_PROC(var, descr, fn) \ + SYSCTL_OID(_vm_stats_vm, OID_AUTO, var, CTLTYPE_U32 | CTLFLAG_MPSAFE | \ + CTLFLAG_RD, fn, 0, sysctl_handle_vmstat_proc, "IU", descr) + +#define VM_STATS_UINT(var, descr) \ + SYSCTL_UINT(_vm_stats_vm, OID_AUTO, var, CTLFLAG_RD, &vm_cnt.var, 0, descr) + +VM_STATS_UINT(v_page_size, "Page size in bytes"); +VM_STATS_UINT(v_page_count, "Total number of pages in system"); +VM_STATS_UINT(v_free_reserved, "Pages reserved for deadlock"); +VM_STATS_UINT(v_free_target, "Pages desired free"); +VM_STATS_UINT(v_free_min, "Minimum low-free-pages threshold"); +VM_STATS_PROC(v_free_count, "Free pages", vm_free_count); +VM_STATS_PROC(v_wire_count, "Wired pages", vm_wire_count); +VM_STATS_PROC(v_active_count, "Active pages", vm_active_count); +VM_STATS_UINT(v_inactive_target, "Desired inactive pages"); +VM_STATS_PROC(v_inactive_count, "Inactive pages", vm_inactive_count); +VM_STATS_PROC(v_laundry_count, "Pages eligible for laundering", + vm_laundry_count); +VM_STATS_UINT(v_pageout_free_min, "Min pages reserved for kernel"); +VM_STATS_UINT(v_interrupt_free_min, "Reserved pages for interrupt code"); +VM_STATS_UINT(v_free_severe, "Severe page depletion point"); + +#ifdef COMPAT_FREEBSD11 +/* + * Provide compatibility sysctls for the benefit of old utilities which exit + * with an error if they cannot be found. 
+ */ +SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cache_count, CTLFLAG_RD, + SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility"); +SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_tcached, CTLFLAG_RD, + SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility"); +#endif + +u_int +vm_free_count(void) +{ + u_int v; + int i; + + v = 0; + for (i = 0; i < vm_ndomains; i++) + v += vm_dom[i].vmd_free_count; + + return (v); +} + +static u_int +vm_pagequeue_count(int pq) +{ + u_int v; + int i; + + v = 0; + for (i = 0; i < vm_ndomains; i++) + v += vm_dom[i].vmd_pagequeues[pq].pq_cnt; + + return (v); +} + +u_int +vm_active_count(void) +{ + + return (vm_pagequeue_count(PQ_ACTIVE)); +} + +u_int +vm_inactive_count(void) +{ + + return (vm_pagequeue_count(PQ_INACTIVE)); +} + +u_int +vm_laundry_count(void) +{ + + return (vm_pagequeue_count(PQ_LAUNDRY)); +} + +static int +sysctl_vm_pdpages(SYSCTL_HANDLER_ARGS) +{ + struct vm_pagequeue *pq; + uint64_t ret; + int dom, i; + + ret = counter_u64_fetch(vm_cnt.v_pdpages); + for (dom = 0; dom < vm_ndomains; dom++) + for (i = 0; i < PQ_COUNT; i++) { + pq = &VM_DOMAIN(dom)->vmd_pagequeues[i]; + ret += pq->pq_pdpages; + } + return (SYSCTL_OUT(req, &ret, sizeof(ret))); +} +SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdpages, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_vm_pdpages, "QU", + "Pages analyzed by pagedaemon"); + +static void +vm_domain_stats_init(struct vm_domain *vmd, struct sysctl_oid *parent) +{ + struct sysctl_oid *oid; + + vmd->vmd_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(parent), OID_AUTO, + vmd->vmd_name, CTLFLAG_RD, NULL, ""); + oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO, + "stats", CTLFLAG_RD, NULL, ""); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "free_count", CTLFLAG_RD, &vmd->vmd_free_count, 0, + "Free pages"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "active", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_cnt, 0, + "Active pages"); + SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "actpdpgs", CTLFLAG_RD, + &vmd->vmd_pagequeues[PQ_ACTIVE].pq_pdpages, 0, + "Active pages scanned by the page daemon"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "inactive", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt, 0, + "Inactive pages"); + SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "inactpdpgs", CTLFLAG_RD, + &vmd->vmd_pagequeues[PQ_INACTIVE].pq_pdpages, 0, + "Inactive pages scanned by the page daemon"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "laundry", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt, 0, + "laundry pages"); + SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "laundpdpgs", CTLFLAG_RD, + &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_pdpages, 0, + "Laundry pages scanned by the page daemon"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "unswappable", + CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt, 0, + "Unswappable pages"); + SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "unswppdpgs", CTLFLAG_RD, + &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_pdpages, 0, + "Unswappable pages scanned by the page daemon"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "inactive_target", CTLFLAG_RD, &vmd->vmd_inactive_target, 0, + "Target inactive pages"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "free_target", CTLFLAG_RD, &vmd->vmd_free_target, 0, + "Target free pages"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "free_reserved", CTLFLAG_RD, &vmd->vmd_free_reserved, 0, + "Reserved free pages"); 
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "free_min", CTLFLAG_RD, &vmd->vmd_free_min, 0, + "Minimum free pages"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "free_severe", CTLFLAG_RD, &vmd->vmd_free_severe, 0, + "Severe free pages"); + +} + +static void +vm_stats_init(void *arg __unused) +{ + struct sysctl_oid *oid; + int i; + + oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm), OID_AUTO, + "domain", CTLFLAG_RD, NULL, ""); + for (i = 0; i < vm_ndomains; i++) + vm_domain_stats_init(VM_DOMAIN(i), oid); +} + +SYSINIT(vmstats_init, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_stats_init, NULL); -- cgit v1.2.3
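
For readers unfamiliar with the vmem(9) interface imported in the vmem.h hunk
above, the sketch below shows one plausible kernel-side use of vmem_xalloc()
and vmem_xfree() as declared there. It is illustrative only: the arena
pointer, the 1 MiB size, the 4 KiB alignment and the M_NOWAIT | M_BESTFIT
strategy are assumptions made for this example, not taken from the import;
VMEM_ADDR_MIN, VMEM_ADDR_MAX and M_BESTFIT are the symbols FreeBSD's
sys/vmem.h provides for an unconstrained address window and the fit policy.

/*
 * Illustrative sketch: allocate a 1 MiB, 4 KiB-aligned region from an
 * existing vmem arena and release it again.  Error handling is minimal.
 */
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/vmem.h>

static int
example_vmem_xalloc(vmem_t *arena)
{
	vmem_addr_t addr;
	int error;

	/* No phase offset, no nocross boundary, no address window limits. */
	error = vmem_xalloc(arena, 1024 * 1024, 4096, 0, 0,
	    VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, &addr);
	if (error != 0)
		return (error);

	/* ... use [addr, addr + 1 MiB) ... */

	/* Constrained allocations are returned with vmem_xfree(). */
	vmem_xfree(arena, addr, 1024 * 1024);
	return (0);
}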
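
The statistics machinery imported in vm_meter.c can be exercised from user
space through sysctl(3). The short program below is a sketch, assuming a
FreeBSD (or compatible) environment in which the vm.vmtotal and
vm.stats.vm.v_vm_faults nodes created in this file are present and
struct vmtotal comes from sys/vmmeter.h.

/*
 * Read the vm.vmtotal snapshot produced by the vmtotal() handler above and
 * one of the counter-backed vm.stats.vm statistics.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	struct vmtotal t;
	uint64_t faults;
	size_t len;

	len = sizeof(t);
	if (sysctlbyname("vm.vmtotal", &t, &len, NULL, 0) != 0)
		return (1);
	printf("runnable threads: %jd, free pages: %ju\n",
	    (intmax_t)t.t_rq, (uintmax_t)t.t_free);

	len = sizeof(faults);
	if (sysctlbyname("vm.stats.vm.v_vm_faults", &faults, &len,
	    NULL, 0) != 0)
		return (1);
	printf("vm faults: %ju\n", (uintmax_t)faults);
	return (0);
}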