author      Chris Johns <chrisj@rtems.org>  2021-07-22 11:50:13 +1000
committer   Chris Johns <chrisj@rtems.org>  2021-08-28 10:24:38 +1000
commit      1739d74f7dc53232fe20ed3ea9d8b4b0730b4025 (patch)
tree        52f0a4a2a32aae249489540f1217b373db493f7b
parent      kern: Add kernel trace support (KTR) (diff)
download    rtems-libbsd-1739d74f7dc53232fe20ed3ea9d8b4b0730b4025.tar.bz2
freebsd/sys: Import VFS support
Update #4475
-rw-r--r--  freebsd/sys/fs/deadfs/dead_vnops.c  159
-rw-r--r--  freebsd/sys/fs/pseudofs/pseudofs.c  491
-rw-r--r--  freebsd/sys/fs/pseudofs/pseudofs.h  312
-rw-r--r--  freebsd/sys/fs/pseudofs/pseudofs_fileno.c  159
-rw-r--r--  freebsd/sys/fs/pseudofs/pseudofs_internal.h  213
-rw-r--r--  freebsd/sys/fs/pseudofs/pseudofs_vncache.c  333
-rw-r--r--  freebsd/sys/fs/pseudofs/pseudofs_vnops.c  1060
-rw-r--r--  freebsd/sys/kern/kern_descrip.c  4283
-rw-r--r--  freebsd/sys/kern/kern_lock.c  1719
-rw-r--r--  freebsd/sys/kern/subr_pctrie.c  695
-rw-r--r--  freebsd/sys/kern/vfs_acl.c  600
-rw-r--r--  freebsd/sys/kern/vfs_aio.c  2987
-rw-r--r--  freebsd/sys/kern/vfs_bio.c  5474
-rw-r--r--  freebsd/sys/kern/vfs_cache.c  2604
-rw-r--r--  freebsd/sys/kern/vfs_cluster.c  1086
-rw-r--r--  freebsd/sys/kern/vfs_default.c  1286
-rw-r--r--  freebsd/sys/kern/vfs_export.c  528
-rw-r--r--  freebsd/sys/kern/vfs_extattr.c  757
-rw-r--r--  freebsd/sys/kern/vfs_hash.c  234
-rw-r--r--  freebsd/sys/kern/vfs_init.c  376
-rw-r--r--  freebsd/sys/kern/vfs_lookup.c  1450
-rw-r--r--  freebsd/sys/kern/vfs_mount.c  2052
-rw-r--r--  freebsd/sys/kern/vfs_subr.c  5719
-rw-r--r--  freebsd/sys/kern/vfs_syscalls.c  4748
-rw-r--r--  freebsd/sys/kern/vfs_vnops.c  2607
-rw-r--r--  freebsd/sys/sys/bio.h  184
-rw-r--r--  freebsd/sys/sys/namei.h  226
-rw-r--r--  freebsd/sys/sys/pctrie.h  152
-rw-r--r--  freebsd/sys/sys/syscallsubr.h  317
-rw-r--r--  freebsd/sys/sys/sysent.h  327
-rw-r--r--  freebsd/sys/sys/vmem.h  145
-rw-r--r--  freebsd/sys/vm/vm_meter.c  561
32 files changed, 43844 insertions, 0 deletions
diff --git a/freebsd/sys/fs/deadfs/dead_vnops.c b/freebsd/sys/fs/deadfs/dead_vnops.c
new file mode 100644
index 00000000..a3153aed
--- /dev/null
+++ b/freebsd/sys/fs/deadfs/dead_vnops.c
@@ -0,0 +1,159 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)dead_vnops.c 8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/poll.h>
+#include <sys/vnode.h>
+
+/*
+ * Prototypes for dead operations on vnodes.
+ */
+static vop_lookup_t dead_lookup;
+static vop_open_t dead_open;
+static vop_getwritemount_t dead_getwritemount;
+static vop_rename_t dead_rename;
+static vop_unset_text_t dead_unset_text;
+
+struct vop_vector dead_vnodeops = {
+ .vop_default = &default_vnodeops,
+
+ .vop_access = VOP_EBADF,
+ .vop_advlock = VOP_EBADF,
+ .vop_bmap = VOP_EBADF,
+ .vop_create = VOP_PANIC,
+ .vop_getattr = VOP_EBADF,
+ .vop_getwritemount = dead_getwritemount,
+ .vop_inactive = VOP_NULL,
+ .vop_ioctl = VOP_EBADF,
+ .vop_link = VOP_PANIC,
+ .vop_lookup = dead_lookup,
+ .vop_mkdir = VOP_PANIC,
+ .vop_mknod = VOP_PANIC,
+ .vop_open = dead_open,
+ .vop_pathconf = VOP_EBADF, /* per pathconf(2) */
+ .vop_poll = dead_poll,
+ .vop_read = dead_read,
+ .vop_readdir = VOP_EBADF,
+ .vop_readlink = VOP_EBADF,
+ .vop_reclaim = VOP_NULL,
+ .vop_remove = VOP_PANIC,
+ .vop_rename = dead_rename,
+ .vop_rmdir = VOP_PANIC,
+ .vop_setattr = VOP_EBADF,
+ .vop_symlink = VOP_PANIC,
+ .vop_vptocnp = VOP_EBADF,
+ .vop_unset_text = dead_unset_text,
+ .vop_write = dead_write,
+};
+
+static int
+dead_getwritemount(struct vop_getwritemount_args *ap)
+{
+
+ *(ap->a_mpp) = NULL;
+ return (0);
+}
+
+/*
+ * Trivial lookup routine that always fails.
+ */
+static int
+dead_lookup(struct vop_lookup_args *ap)
+{
+
+ *ap->a_vpp = NULL;
+ return (ENOTDIR);
+}
+
+/*
+ * Open always fails as if device did not exist.
+ */
+static int
+dead_open(struct vop_open_args *ap)
+{
+
+ return (ENXIO);
+}
+
+int
+dead_read(struct vop_read_args *ap)
+{
+
+ /*
+ * Return EOF for tty devices, EIO for others
+ */
+ if ((ap->a_vp->v_vflag & VV_ISTTY) == 0)
+ return (EIO);
+ return (0);
+}
+
+int
+dead_write(struct vop_write_args *ap)
+{
+
+ return (EIO);
+}
+
+int
+dead_poll(struct vop_poll_args *ap)
+{
+
+ if (ap->a_events & ~POLLSTANDARD)
+ return (POLLNVAL);
+
+ /*
+ * Let the user find out that the descriptor is gone.
+ */
+ return (POLLHUP | ((POLLIN | POLLRDNORM) & ap->a_events));
+
+}
+
+static int
+dead_rename(struct vop_rename_args *ap)
+{
+
+ vop_rename_fail(ap);
+ return (EXDEV);
+}
+
+static int
+dead_unset_text(struct vop_unset_text_args *ap)
+{
+
+ return (0);
+}
diff --git a/freebsd/sys/fs/pseudofs/pseudofs.c b/freebsd/sys/fs/pseudofs/pseudofs.c
new file mode 100644
index 00000000..73d3c7cb
--- /dev/null
+++ b/freebsd/sys/fs/pseudofs/pseudofs.c
@@ -0,0 +1,491 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_pseudofs.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <fs/pseudofs/pseudofs.h>
+#include <fs/pseudofs/pseudofs_internal.h>
+
+static MALLOC_DEFINE(M_PFSNODES, "pfs_nodes", "pseudofs nodes");
+
+SYSCTL_NODE(_vfs, OID_AUTO, pfs, CTLFLAG_RW, 0,
+ "pseudofs");
+
+#ifdef PSEUDOFS_TRACE
+int pfs_trace;
+SYSCTL_INT(_vfs_pfs, OID_AUTO, trace, CTLFLAG_RW, &pfs_trace, 0,
+ "enable tracing of pseudofs vnode operations");
+#endif
+
+#if PFS_FSNAMELEN != MFSNAMELEN
+#error "PFS_FSNAMELEN is not equal to MFSNAMELEN"
+#endif
+
+/*
+ * Allocate and initialize a node
+ */
+static struct pfs_node *
+pfs_alloc_node_flags(struct pfs_info *pi, const char *name, pfs_type_t type, int flags)
+{
+ struct pfs_node *pn;
+ int malloc_flags;
+
+ KASSERT(strlen(name) < PFS_NAMELEN,
+ ("%s(): node name is too long", __func__));
+ if (flags & PFS_NOWAIT)
+ malloc_flags = M_NOWAIT | M_ZERO;
+ else
+ malloc_flags = M_WAITOK | M_ZERO;
+ pn = malloc(sizeof *pn, M_PFSNODES, malloc_flags);
+ if (pn == NULL)
+ return (NULL);
+ mtx_init(&pn->pn_mutex, "pfs_node", NULL, MTX_DEF | MTX_DUPOK);
+ strlcpy(pn->pn_name, name, sizeof pn->pn_name);
+ pn->pn_type = type;
+ pn->pn_info = pi;
+ return (pn);
+}
+
+static struct pfs_node *
+pfs_alloc_node(struct pfs_info *pi, const char *name, pfs_type_t type)
+{
+ return (pfs_alloc_node_flags(pi, name, type, 0));
+}
+
+/*
+ * Add a node to a directory
+ */
+static void
+pfs_add_node(struct pfs_node *parent, struct pfs_node *pn)
+{
+#ifdef INVARIANTS
+ struct pfs_node *iter;
+#endif
+
+ KASSERT(parent != NULL,
+ ("%s(): parent is NULL", __func__));
+ KASSERT(pn->pn_parent == NULL,
+ ("%s(): node already has a parent", __func__));
+ KASSERT(parent->pn_info != NULL,
+ ("%s(): parent has no pn_info", __func__));
+ KASSERT(parent->pn_type == pfstype_dir ||
+ parent->pn_type == pfstype_procdir ||
+ parent->pn_type == pfstype_root,
+ ("%s(): parent is not a directory", __func__));
+
+#ifdef INVARIANTS
+ /* XXX no locking! */
+ if (pn->pn_type == pfstype_procdir)
+ for (iter = parent; iter != NULL; iter = iter->pn_parent)
+ KASSERT(iter->pn_type != pfstype_procdir,
+ ("%s(): nested process directories", __func__));
+ for (iter = parent->pn_nodes; iter != NULL; iter = iter->pn_next) {
+ KASSERT(strcmp(pn->pn_name, iter->pn_name) != 0,
+ ("%s(): homonymous siblings", __func__));
+ if (pn->pn_type == pfstype_procdir)
+ KASSERT(iter->pn_type != pfstype_procdir,
+ ("%s(): sibling process directories", __func__));
+ }
+#endif
+
+ pn->pn_parent = parent;
+ pfs_fileno_alloc(pn);
+
+ pfs_lock(parent);
+ pn->pn_next = parent->pn_nodes;
+ if ((parent->pn_flags & PFS_PROCDEP) != 0)
+ pn->pn_flags |= PFS_PROCDEP;
+ parent->pn_nodes = pn;
+ pfs_unlock(parent);
+}
+
+/*
+ * Detach a node from its parent
+ */
+static void
+pfs_detach_node(struct pfs_node *pn)
+{
+ struct pfs_node *parent = pn->pn_parent;
+ struct pfs_node **iter;
+
+ KASSERT(parent != NULL, ("%s(): node has no parent", __func__));
+ KASSERT(parent->pn_info == pn->pn_info,
+ ("%s(): parent has different pn_info", __func__));
+
+ pfs_lock(parent);
+ iter = &parent->pn_nodes;
+ while (*iter != NULL) {
+ if (*iter == pn) {
+ *iter = pn->pn_next;
+ break;
+ }
+ iter = &(*iter)->pn_next;
+ }
+ pn->pn_parent = NULL;
+ pfs_unlock(parent);
+}
+
+/*
+ * Add . and .. to a directory
+ */
+static int
+pfs_fixup_dir_flags(struct pfs_node *parent, int flags)
+{
+ struct pfs_node *dot, *dotdot;
+
+ dot = pfs_alloc_node_flags(parent->pn_info, ".", pfstype_this, flags);
+ if (dot == NULL)
+ return (ENOMEM);
+ dotdot = pfs_alloc_node_flags(parent->pn_info, "..", pfstype_parent, flags);
+ if (dotdot == NULL) {
+ pfs_destroy(dot);
+ return (ENOMEM);
+ }
+ pfs_add_node(parent, dot);
+ pfs_add_node(parent, dotdot);
+ return (0);
+}
+
+static void
+pfs_fixup_dir(struct pfs_node *parent)
+{
+
+ pfs_fixup_dir_flags(parent, 0);
+}
+
+/*
+ * Create a directory
+ */
+struct pfs_node *
+pfs_create_dir(struct pfs_node *parent, const char *name,
+ pfs_attr_t attr, pfs_vis_t vis, pfs_destroy_t destroy,
+ int flags)
+{
+ struct pfs_node *pn;
+ int rc;
+
+ pn = pfs_alloc_node_flags(parent->pn_info, name,
+ (flags & PFS_PROCDEP) ? pfstype_procdir : pfstype_dir, flags);
+ if (pn == NULL)
+ return (NULL);
+ pn->pn_attr = attr;
+ pn->pn_vis = vis;
+ pn->pn_destroy = destroy;
+ pn->pn_flags = flags;
+ pfs_add_node(parent, pn);
+ rc = pfs_fixup_dir_flags(pn, flags);
+ if (rc) {
+ pfs_destroy(pn);
+ return (NULL);
+ }
+ return (pn);
+}
+
+/*
+ * Create a file
+ */
+struct pfs_node *
+pfs_create_file(struct pfs_node *parent, const char *name, pfs_fill_t fill,
+ pfs_attr_t attr, pfs_vis_t vis, pfs_destroy_t destroy,
+ int flags)
+{
+ struct pfs_node *pn;
+
+ pn = pfs_alloc_node_flags(parent->pn_info, name, pfstype_file, flags);
+ if (pn == NULL)
+ return (NULL);
+ pn->pn_fill = fill;
+ pn->pn_attr = attr;
+ pn->pn_vis = vis;
+ pn->pn_destroy = destroy;
+ pn->pn_flags = flags;
+ pfs_add_node(parent, pn);
+
+ return (pn);
+}
+
+/*
+ * Create a symlink
+ */
+struct pfs_node *
+pfs_create_link(struct pfs_node *parent, const char *name, pfs_fill_t fill,
+ pfs_attr_t attr, pfs_vis_t vis, pfs_destroy_t destroy,
+ int flags)
+{
+ struct pfs_node *pn;
+
+ pn = pfs_alloc_node_flags(parent->pn_info, name, pfstype_symlink, flags);
+ if (pn == NULL)
+ return (NULL);
+ pn->pn_fill = fill;
+ pn->pn_attr = attr;
+ pn->pn_vis = vis;
+ pn->pn_destroy = destroy;
+ pn->pn_flags = flags;
+ pfs_add_node(parent, pn);
+
+ return (pn);
+}
+
+/*
+ * Locate a node by name
+ */
+struct pfs_node *
+pfs_find_node(struct pfs_node *parent, const char *name)
+{
+ struct pfs_node *pn;
+
+ pfs_lock(parent);
+ for (pn = parent->pn_nodes; pn != NULL; pn = pn->pn_next)
+ if (strcmp(pn->pn_name, name) == 0)
+ break;
+ pfs_unlock(parent);
+ return (pn);
+}
+
+/*
+ * Destroy a node and all its descendants. If the node to be destroyed
+ * has a parent, the parent's mutex must be held.
+ */
+int
+pfs_destroy(struct pfs_node *pn)
+{
+ struct pfs_node *iter;
+
+ KASSERT(pn != NULL,
+ ("%s(): node is NULL", __func__));
+ KASSERT(pn->pn_info != NULL,
+ ("%s(): node has no pn_info", __func__));
+
+ if (pn->pn_parent)
+ pfs_detach_node(pn);
+
+ /* destroy children */
+ if (pn->pn_type == pfstype_dir ||
+ pn->pn_type == pfstype_procdir ||
+ pn->pn_type == pfstype_root) {
+ pfs_lock(pn);
+ while (pn->pn_nodes != NULL) {
+ iter = pn->pn_nodes;
+ pn->pn_nodes = iter->pn_next;
+ iter->pn_parent = NULL;
+ pfs_unlock(pn);
+ pfs_destroy(iter);
+ pfs_lock(pn);
+ }
+ pfs_unlock(pn);
+ }
+
+ /* revoke vnodes and fileno */
+ pfs_purge(pn);
+
+ /* callback to free any private resources */
+ if (pn->pn_destroy != NULL)
+ pn_destroy(pn);
+
+ /* destroy the node */
+ pfs_fileno_free(pn);
+ mtx_destroy(&pn->pn_mutex);
+ free(pn, M_PFSNODES);
+
+ return (0);
+}
+
+/*
+ * Mount a pseudofs instance
+ */
+int
+pfs_mount(struct pfs_info *pi, struct mount *mp)
+{
+ struct statfs *sbp;
+
+ if (mp->mnt_flag & MNT_UPDATE)
+ return (EOPNOTSUPP);
+
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_LOCAL;
+ MNT_IUNLOCK(mp);
+ mp->mnt_data = pi;
+ vfs_getnewfsid(mp);
+
+ sbp = &mp->mnt_stat;
+ vfs_mountedfrom(mp, pi->pi_name);
+ sbp->f_bsize = PAGE_SIZE;
+ sbp->f_iosize = PAGE_SIZE;
+ sbp->f_blocks = 1;
+ sbp->f_bfree = 0;
+ sbp->f_bavail = 0;
+ sbp->f_files = 1;
+ sbp->f_ffree = 0;
+
+ return (0);
+}
+
+/*
+ * Compatibility shim for old mount(2) system call
+ */
+int
+pfs_cmount(struct mntarg *ma, void *data, uint64_t flags)
+{
+ int error;
+
+ error = kernel_mount(ma, flags);
+ return (error);
+}
+
+/*
+ * Unmount a pseudofs instance
+ */
+int
+pfs_unmount(struct mount *mp, int mntflags)
+{
+ int error;
+
+ error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0,
+ curthread);
+ return (error);
+}
+
+/*
+ * Return a root vnode
+ */
+int
+pfs_root(struct mount *mp, int flags, struct vnode **vpp)
+{
+ struct pfs_info *pi;
+
+ pi = (struct pfs_info *)mp->mnt_data;
+ return (pfs_vncache_alloc(mp, vpp, pi->pi_root, NO_PID));
+}
+
+/*
+ * Return filesystem stats
+ */
+int
+pfs_statfs(struct mount *mp, struct statfs *sbp)
+{
+ /* no-op: always called with mp->mnt_stat */
+ return (0);
+}
+
+/*
+ * Initialize a pseudofs instance
+ */
+int
+pfs_init(struct pfs_info *pi, struct vfsconf *vfc)
+{
+ struct pfs_node *root;
+ int error;
+
+ pfs_fileno_init(pi);
+
+ /* set up the root directory */
+ root = pfs_alloc_node(pi, "/", pfstype_root);
+ pi->pi_root = root;
+ pfs_fileno_alloc(root);
+ pfs_fixup_dir(root);
+
+ /* construct file hierarchy */
+ error = (pi->pi_init)(pi, vfc);
+ if (error) {
+ pfs_destroy(root);
+ pi->pi_root = NULL;
+ return (error);
+ }
+
+ if (bootverbose)
+ printf("%s registered\n", pi->pi_name);
+ return (0);
+}
+
+/*
+ * Destroy a pseudofs instance
+ */
+int
+pfs_uninit(struct pfs_info *pi, struct vfsconf *vfc)
+{
+ int error;
+
+ pfs_destroy(pi->pi_root);
+ pi->pi_root = NULL;
+ pfs_fileno_uninit(pi);
+ if (bootverbose)
+ printf("%s unregistered\n", pi->pi_name);
+ error = (pi->pi_uninit)(pi, vfc);
+ return (error);
+}
+
+/*
+ * Handle load / unload events
+ */
+static int
+pfs_modevent(module_t mod, int evt, void *arg)
+{
+ switch (evt) {
+ case MOD_LOAD:
+ pfs_vncache_load();
+ break;
+ case MOD_UNLOAD:
+ case MOD_SHUTDOWN:
+ pfs_vncache_unload();
+ break;
+ default:
+ return EOPNOTSUPP;
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Module declaration
+ */
+static moduledata_t pseudofs_data = {
+ "pseudofs",
+ pfs_modevent,
+ NULL
+};
+DECLARE_MODULE(pseudofs, pseudofs_data, SI_SUB_EXEC, SI_ORDER_FIRST);
+MODULE_VERSION(pseudofs, 1);
diff --git a/freebsd/sys/fs/pseudofs/pseudofs.h b/freebsd/sys/fs/pseudofs/pseudofs.h
new file mode 100644
index 00000000..602e1fbf
--- /dev/null
+++ b/freebsd/sys/fs/pseudofs/pseudofs.h
@@ -0,0 +1,312 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PSEUDOFS_H_INCLUDED
+#define _PSEUDOFS_H_INCLUDED
+
+#include <sys/jail.h>
+
+/*
+ * Opaque structures
+ */
+struct mntarg;
+struct mount;
+struct nameidata;
+struct proc;
+struct sbuf;
+struct statfs;
+struct thread;
+struct uio;
+struct vfsconf;
+struct vnode;
+
+/*
+ * Limits and constants
+ */
+#define PFS_NAMELEN 128
+#define PFS_FSNAMELEN 16 /* equal to MFSNAMELEN */
+#define PFS_DELEN (offsetof(struct dirent, d_name) + PFS_NAMELEN)
+
+typedef enum {
+ pfstype_none = 0,
+ pfstype_root,
+ pfstype_dir,
+ pfstype_this,
+ pfstype_parent,
+ pfstype_file,
+ pfstype_symlink,
+ pfstype_procdir
+} pfs_type_t;
+
+/*
+ * Flags
+ */
+#define PFS_RD 0x0001 /* readable */
+#define PFS_WR 0x0002 /* writeable */
+#define PFS_RDWR (PFS_RD|PFS_WR)
+#define PFS_RAWRD 0x0004 /* raw reader */
+#define PFS_RAWWR 0x0008 /* raw writer */
+#define PFS_RAW (PFS_RAWRD|PFS_RAWWR)
+#define PFS_PROCDEP 0x0010 /* process-dependent */
+#define PFS_NOWAIT 0x0020 /* allow malloc to fail */
+
+/*
+ * Data structures
+ */
+struct pfs_info;
+struct pfs_node;
+
+/*
+ * Init / uninit callback
+ */
+#define PFS_INIT_ARGS \
+ struct pfs_info *pi, struct vfsconf *vfc
+#define PFS_INIT_ARGNAMES \
+ pi, vfc
+#define PFS_INIT_PROTO(name) \
+ int name(PFS_INIT_ARGS);
+typedef int (*pfs_init_t)(PFS_INIT_ARGS);
+
+/*
+ * Filler callback
+ * Called with proc held but unlocked
+ */
+#define PFS_FILL_ARGS \
+ struct thread *td, struct proc *p, struct pfs_node *pn, \
+ struct sbuf *sb, struct uio *uio
+#define PFS_FILL_ARGNAMES \
+ td, p, pn, sb, uio
+#define PFS_FILL_PROTO(name) \
+ int name(PFS_FILL_ARGS);
+typedef int (*pfs_fill_t)(PFS_FILL_ARGS);
+
+/*
+ * Attribute callback
+ * Called with proc locked
+ */
+struct vattr;
+#define PFS_ATTR_ARGS \
+ struct thread *td, struct proc *p, struct pfs_node *pn, \
+ struct vattr *vap
+#define PFS_ATTR_ARGNAMES \
+ td, p, pn, vap
+#define PFS_ATTR_PROTO(name) \
+ int name(PFS_ATTR_ARGS);
+typedef int (*pfs_attr_t)(PFS_ATTR_ARGS);
+
+/*
+ * Visibility callback
+ * Called with proc locked
+ */
+#define PFS_VIS_ARGS \
+ struct thread *td, struct proc *p, struct pfs_node *pn
+#define PFS_VIS_ARGNAMES \
+ td, p, pn
+#define PFS_VIS_PROTO(name) \
+ int name(PFS_VIS_ARGS);
+typedef int (*pfs_vis_t)(PFS_VIS_ARGS);
+
+/*
+ * Ioctl callback
+ * Called with proc locked
+ */
+#define PFS_IOCTL_ARGS \
+ struct thread *td, struct proc *p, struct pfs_node *pn, \
+ unsigned long cmd, void *data
+#define PFS_IOCTL_ARGNAMES \
+ td, p, pn, cmd, data
+#define PFS_IOCTL_PROTO(name) \
+ int name(PFS_IOCTL_ARGS);
+typedef int (*pfs_ioctl_t)(PFS_IOCTL_ARGS);
+
+/*
+ * Getextattr callback
+ * Called with proc locked
+ */
+#define PFS_GETEXTATTR_ARGS \
+ struct thread *td, struct proc *p, struct pfs_node *pn, \
+ int attrnamespace, const char *name, struct uio *uio, \
+ size_t *size, struct ucred *cred
+#define PFS_GETEXTATTR_ARGNAMES \
+ td, p, pn, attrnamespace, name, uio, size, cred
+#define PFS_GETEXTATTR_PROTO(name) \
+ int name(PFS_GETEXTATTR_ARGS);
+struct ucred;
+typedef int (*pfs_getextattr_t)(PFS_GETEXTATTR_ARGS);
+
+/*
+ * Last-close callback
+ * Called with proc locked
+ */
+#define PFS_CLOSE_ARGS \
+ struct thread *td, struct proc *p, struct pfs_node *pn
+#define PFS_CLOSE_ARGNAMES \
+ td, p, pn
+#define PFS_CLOSE_PROTO(name) \
+ int name(PFS_CLOSE_ARGS);
+typedef int (*pfs_close_t)(PFS_CLOSE_ARGS);
+
+/*
+ * Destroy callback
+ */
+#define PFS_DESTROY_ARGS \
+ struct pfs_node *pn
+#define PFS_DESTROY_ARGNAMES \
+ pn
+#define PFS_DESTROY_PROTO(name) \
+ int name(PFS_DESTROY_ARGS);
+typedef int (*pfs_destroy_t)(PFS_DESTROY_ARGS);
+
+/*
+ * pfs_info: describes a pseudofs instance
+ *
+ * The pi_mutex is only used to avoid using the global subr_unit lock
+ * for unrhdr. The rest of struct pfs_info is only modified during
+ * vfs_init() and vfs_uninit() of the consumer filesystem.
+ */
+struct pfs_info {
+ char pi_name[PFS_FSNAMELEN];
+ pfs_init_t pi_init;
+ pfs_init_t pi_uninit;
+
+ /* members below this line are initialized at run time */
+ struct pfs_node *pi_root;
+ struct mtx pi_mutex;
+ struct unrhdr *pi_unrhdr;
+};
+
+/*
+ * pfs_node: describes a node (file or directory) within a pseudofs
+ *
+ * - Fields marked (o) are protected by the node's own mutex.
+ * - Fields marked (p) are protected by the node's parent's mutex.
+ * - Remaining fields are not protected by any lock and are assumed to be
+ * immutable once the node has been created.
+ *
+ * To prevent deadlocks, if a node's mutex is to be held at the same time
+ * as its parent's (e.g. when adding or removing nodes to a directory),
+ * the parent's mutex must always be acquired first. Unfortunately, this
+ * is not enforceable by WITNESS.
+ */
+struct pfs_node {
+ char pn_name[PFS_NAMELEN];
+ pfs_type_t pn_type;
+ int pn_flags;
+ struct mtx pn_mutex;
+ void *pn_data; /* (o) */
+
+ pfs_fill_t pn_fill;
+ pfs_ioctl_t pn_ioctl;
+ pfs_close_t pn_close;
+ pfs_attr_t pn_attr;
+ pfs_vis_t pn_vis;
+ pfs_getextattr_t pn_getextattr;
+ pfs_destroy_t pn_destroy;
+
+ struct pfs_info *pn_info;
+ u_int32_t pn_fileno; /* (o) */
+
+ struct pfs_node *pn_parent; /* (o) */
+ struct pfs_node *pn_nodes; /* (o) */
+ struct pfs_node *pn_next; /* (p) */
+};
+
+/*
+ * VFS interface
+ */
+int pfs_mount (struct pfs_info *pi, struct mount *mp);
+int pfs_cmount (struct mntarg *ma, void *data, uint64_t flags);
+int pfs_unmount (struct mount *mp, int mntflags);
+int pfs_root (struct mount *mp, int flags,
+ struct vnode **vpp);
+int pfs_statfs (struct mount *mp, struct statfs *sbp);
+int pfs_init (struct pfs_info *pi, struct vfsconf *vfc);
+int pfs_uninit (struct pfs_info *pi, struct vfsconf *vfc);
+
+/*
+ * Directory structure construction and manipulation
+ */
+struct pfs_node *pfs_create_dir (struct pfs_node *parent, const char *name,
+ pfs_attr_t attr, pfs_vis_t vis,
+ pfs_destroy_t destroy, int flags);
+struct pfs_node *pfs_create_file(struct pfs_node *parent, const char *name,
+ pfs_fill_t fill, pfs_attr_t attr,
+ pfs_vis_t vis, pfs_destroy_t destroy,
+ int flags);
+struct pfs_node *pfs_create_link(struct pfs_node *parent, const char *name,
+ pfs_fill_t fill, pfs_attr_t attr,
+ pfs_vis_t vis, pfs_destroy_t destroy,
+ int flags);
+struct pfs_node *pfs_find_node (struct pfs_node *parent, const char *name);
+void pfs_purge (struct pfs_node *pn);
+int pfs_destroy (struct pfs_node *pn);
+
+/*
+ * Now for some initialization magic...
+ */
+#define PSEUDOFS(name, version, flags) \
+ \
+static struct pfs_info name##_info = { \
+ #name, \
+ name##_init, \
+ name##_uninit, \
+}; \
+ \
+static int \
+_##name##_mount(struct mount *mp) { \
+ return (pfs_mount(&name##_info, mp)); \
+} \
+ \
+static int \
+_##name##_init(struct vfsconf *vfc) { \
+ return (pfs_init(&name##_info, vfc)); \
+} \
+ \
+static int \
+_##name##_uninit(struct vfsconf *vfc) { \
+ return (pfs_uninit(&name##_info, vfc)); \
+} \
+ \
+static struct vfsops name##_vfsops = { \
+ .vfs_cmount = pfs_cmount, \
+ .vfs_init = _##name##_init, \
+ .vfs_mount = _##name##_mount, \
+ .vfs_root = pfs_root, \
+ .vfs_statfs = pfs_statfs, \
+ .vfs_uninit = _##name##_uninit, \
+ .vfs_unmount = pfs_unmount, \
+}; \
+VFS_SET(name##_vfsops, name, VFCF_SYNTHETIC | flags); \
+MODULE_VERSION(name, version); \
+MODULE_DEPEND(name, pseudofs, 1, 1, 1);
+
+#endif
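
For context on the PSEUDOFS() macro above: a consumer filesystem (FreeBSD's procfs is one in-tree user) only has to define the <name>_init and <name>_uninit callbacks the macro references, build its node tree under pi->pi_root, and invoke the macro once. A minimal sketch using a hypothetical "examplefs" (illustrative only, not part of this import):

static int
examplefs_fill(PFS_FILL_ARGS)
{
	/* write the file contents into the caller-provided sbuf */
	sbuf_printf(sb, "hello from examplefs\n");
	return (0);
}

static int
examplefs_init(PFS_INIT_ARGS)
{
	/* create a single read-only file directly under the root */
	pfs_create_file(pi->pi_root, "hello", examplefs_fill,
	    NULL, NULL, NULL, PFS_RD);
	return (0);
}

static int
examplefs_uninit(PFS_INIT_ARGS)
{
	/* pfs_uninit() tears down the node tree; nothing else to free */
	return (0);
}

PSEUDOFS(examplefs, 1, 0);
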
diff --git a/freebsd/sys/fs/pseudofs/pseudofs_fileno.c b/freebsd/sys/fs/pseudofs/pseudofs_fileno.c
new file mode 100644
index 00000000..2c6b2d1f
--- /dev/null
+++ b/freebsd/sys/fs/pseudofs/pseudofs_fileno.c
@@ -0,0 +1,159 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_pseudofs.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <fs/pseudofs/pseudofs.h>
+#include <fs/pseudofs/pseudofs_internal.h>
+
+/*
+ * Initialize fileno bitmap
+ */
+void
+pfs_fileno_init(struct pfs_info *pi)
+{
+
+ mtx_init(&pi->pi_mutex, "pfs_fileno", NULL, MTX_DEF);
+ pi->pi_unrhdr = new_unrhdr(3, INT_MAX / NO_PID, &pi->pi_mutex);
+}
+
+/*
+ * Tear down fileno bitmap
+ */
+void
+pfs_fileno_uninit(struct pfs_info *pi)
+{
+
+ delete_unrhdr(pi->pi_unrhdr);
+ pi->pi_unrhdr = NULL;
+ mtx_destroy(&pi->pi_mutex);
+}
+
+/*
+ * Allocate a file number
+ */
+void
+pfs_fileno_alloc(struct pfs_node *pn)
+{
+
+ if (pn->pn_parent)
+ PFS_TRACE(("%s/%s", pn->pn_parent->pn_name, pn->pn_name));
+ else
+ PFS_TRACE(("%s", pn->pn_name));
+ pfs_assert_not_owned(pn);
+
+ switch (pn->pn_type) {
+ case pfstype_root:
+ /* root must always be 2 */
+ pn->pn_fileno = 2;
+ break;
+ case pfstype_dir:
+ case pfstype_file:
+ case pfstype_symlink:
+ case pfstype_procdir:
+ pn->pn_fileno = alloc_unr(pn->pn_info->pi_unrhdr);
+ break;
+ case pfstype_this:
+ KASSERT(pn->pn_parent != NULL,
+ ("%s(): pfstype_this node has no parent", __func__));
+ pn->pn_fileno = pn->pn_parent->pn_fileno;
+ break;
+ case pfstype_parent:
+ KASSERT(pn->pn_parent != NULL,
+ ("%s(): pfstype_parent node has no parent", __func__));
+ if (pn->pn_parent->pn_type == pfstype_root) {
+ pn->pn_fileno = pn->pn_parent->pn_fileno;
+ break;
+ }
+ KASSERT(pn->pn_parent->pn_parent != NULL,
+ ("%s(): pfstype_parent node has no grandparent", __func__));
+ pn->pn_fileno = pn->pn_parent->pn_parent->pn_fileno;
+ break;
+ case pfstype_none:
+ KASSERT(0,
+ ("%s(): pfstype_none node", __func__));
+ break;
+ }
+
+#if 0
+ printf("%s(): %s: ", __func__, pn->pn_info->pi_name);
+ if (pn->pn_parent) {
+ if (pn->pn_parent->pn_parent) {
+ printf("%s/", pn->pn_parent->pn_parent->pn_name);
+ }
+ printf("%s/", pn->pn_parent->pn_name);
+ }
+ printf("%s -> %d\n", pn->pn_name, pn->pn_fileno);
+#endif
+}
+
+/*
+ * Release a file number
+ */
+void
+pfs_fileno_free(struct pfs_node *pn)
+{
+
+ pfs_assert_not_owned(pn);
+
+ switch (pn->pn_type) {
+ case pfstype_root:
+ /* not allocated from unrhdr */
+ return;
+ case pfstype_dir:
+ case pfstype_file:
+ case pfstype_symlink:
+ case pfstype_procdir:
+ free_unr(pn->pn_info->pi_unrhdr, pn->pn_fileno);
+ break;
+ case pfstype_this:
+ case pfstype_parent:
+ /* ignore these, as they don't "own" their file number */
+ break;
+ case pfstype_none:
+ KASSERT(0,
+ ("pfs_fileno_free() called for pfstype_none node"));
+ break;
+ }
+}
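
For context on the fileno range used above: new_unrhdr(3, INT_MAX / NO_PID, ...) caps directly allocated filenos because pseudofs_vnops.c later derives per-process inode numbers as pn_fileno * NO_PID + pid, and the cap keeps that product in range. Assuming the stock FreeBSD value NO_PID = 100000 (one more than PID_MAX, from sys/proc.h), a node with pn_fileno 7 viewed through the process directory of pid 1234 reports inode number 7 * 100000 + 1234 = 701234, while nodes not tied to a process keep their raw fileno.
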
diff --git a/freebsd/sys/fs/pseudofs/pseudofs_internal.h b/freebsd/sys/fs/pseudofs/pseudofs_internal.h
new file mode 100644
index 00000000..3ec49e71
--- /dev/null
+++ b/freebsd/sys/fs/pseudofs/pseudofs_internal.h
@@ -0,0 +1,213 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PSEUDOFS_INTERNAL_H_INCLUDED
+#define _PSEUDOFS_INTERNAL_H_INCLUDED
+
+/*
+ * Sysctl subtree
+ */
+SYSCTL_DECL(_vfs_pfs);
+
+/*
+ * Vnode data
+ */
+struct pfs_vdata {
+ struct pfs_node *pvd_pn;
+ pid_t pvd_pid;
+ struct vnode *pvd_vnode;
+ struct pfs_vdata*pvd_prev, *pvd_next;
+ int pvd_dead:1;
+};
+
+/*
+ * Vnode cache
+ */
+void pfs_vncache_load (void);
+void pfs_vncache_unload (void);
+int pfs_vncache_alloc (struct mount *, struct vnode **,
+ struct pfs_node *, pid_t pid);
+int pfs_vncache_free (struct vnode *);
+
+/*
+ * File number bitmap
+ */
+void pfs_fileno_init (struct pfs_info *);
+void pfs_fileno_uninit (struct pfs_info *);
+void pfs_fileno_alloc (struct pfs_node *);
+void pfs_fileno_free (struct pfs_node *);
+
+/*
+ * Debugging
+ */
+#ifdef PSEUDOFS_TRACE
+extern int pfs_trace;
+
+#define PFS_TRACE(foo) \
+ do { \
+ if (pfs_trace) { \
+ printf("%s(): line %d: ", __func__, __LINE__); \
+ printf foo ; \
+ printf("\n"); \
+ } \
+ } while (0)
+#define PFS_RETURN(err) \
+ do { \
+ if (pfs_trace) { \
+ printf("%s(): line %d: returning %d\n", \
+ __func__, __LINE__, err); \
+ } \
+ return (err); \
+ } while (0)
+#else
+#define PFS_TRACE(foo) \
+ do { /* nothing */ } while (0)
+#define PFS_RETURN(err) \
+ return (err)
+#endif
+
+/*
+ * Inline helpers for locking
+ */
+static inline void
+pfs_lock(struct pfs_node *pn)
+{
+
+ mtx_lock(&pn->pn_mutex);
+}
+
+static inline void
+pfs_unlock(struct pfs_node *pn)
+{
+
+ mtx_unlock(&pn->pn_mutex);
+}
+
+static inline void
+pfs_assert_owned(struct pfs_node *pn)
+{
+
+ mtx_assert(&pn->pn_mutex, MA_OWNED);
+}
+
+static inline void
+pfs_assert_not_owned(struct pfs_node *pn)
+{
+
+ mtx_assert(&pn->pn_mutex, MA_NOTOWNED);
+}
+
+static inline int
+pn_fill(PFS_FILL_ARGS)
+{
+
+ PFS_TRACE(("%s", pn->pn_name));
+ KASSERT(pn->pn_fill != NULL, ("%s(): no callback", __func__));
+ if (p != NULL) {
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ PROC_ASSERT_HELD(p);
+ }
+ pfs_assert_not_owned(pn);
+ return ((pn->pn_fill)(PFS_FILL_ARGNAMES));
+}
+
+static inline int
+pn_attr(PFS_ATTR_ARGS)
+{
+
+ PFS_TRACE(("%s", pn->pn_name));
+ KASSERT(pn->pn_attr != NULL, ("%s(): no callback", __func__));
+ if (p != NULL)
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ pfs_assert_not_owned(pn);
+ return ((pn->pn_attr)(PFS_ATTR_ARGNAMES));
+}
+
+static inline int
+pn_vis(PFS_VIS_ARGS)
+{
+
+ PFS_TRACE(("%s", pn->pn_name));
+ KASSERT(pn->pn_vis != NULL, ("%s(): no callback", __func__));
+ KASSERT(p != NULL, ("%s(): no process", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ pfs_assert_not_owned(pn);
+ return ((pn->pn_vis)(PFS_VIS_ARGNAMES));
+}
+
+static inline int
+pn_ioctl(PFS_IOCTL_ARGS)
+{
+
+ PFS_TRACE(("%s", pn->pn_name));
+ KASSERT(pn->pn_ioctl != NULL, ("%s(): no callback", __func__));
+ if (p != NULL)
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ pfs_assert_not_owned(pn);
+ return ((pn->pn_ioctl)(PFS_IOCTL_ARGNAMES));
+}
+
+static inline int
+pn_getextattr(PFS_GETEXTATTR_ARGS)
+{
+
+ PFS_TRACE(("%s", pn->pn_name));
+ KASSERT(pn->pn_getextattr != NULL, ("%s(): no callback", __func__));
+ if (p != NULL)
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ pfs_assert_not_owned(pn);
+ return ((pn->pn_getextattr)(PFS_GETEXTATTR_ARGNAMES));
+}
+
+static inline int
+pn_close(PFS_CLOSE_ARGS)
+{
+
+ PFS_TRACE(("%s", pn->pn_name));
+ KASSERT(pn->pn_close != NULL, ("%s(): no callback", __func__));
+ if (p != NULL)
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ pfs_assert_not_owned(pn);
+ return ((pn->pn_close)(PFS_CLOSE_ARGNAMES));
+}
+
+static inline int
+pn_destroy(PFS_DESTROY_ARGS)
+{
+
+ PFS_TRACE(("%s", pn->pn_name));
+ KASSERT(pn->pn_destroy != NULL, ("%s(): no callback", __func__));
+ pfs_assert_not_owned(pn);
+ return ((pn->pn_destroy)(PFS_DESTROY_ARGNAMES));
+}
+
+#endif
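
A usage note on the tracing macros above: PFS_TRACE() and PFS_RETURN() only log when the kernel is built with the PSEUDOFS_TRACE option (which also compiles in the pfs_trace variable from pseudofs.c) and the run-time knob is enabled, e.g. sysctl vfs.pfs.trace=1; otherwise PFS_TRACE() expands to nothing and PFS_RETURN() is a plain return.
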
diff --git a/freebsd/sys/fs/pseudofs/pseudofs_vncache.c b/freebsd/sys/fs/pseudofs/pseudofs_vncache.c
new file mode 100644
index 00000000..05dd6569
--- /dev/null
+++ b/freebsd/sys/fs/pseudofs/pseudofs_vncache.c
@@ -0,0 +1,333 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_pseudofs.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <fs/pseudofs/pseudofs.h>
+#include <fs/pseudofs/pseudofs_internal.h>
+
+static MALLOC_DEFINE(M_PFSVNCACHE, "pfs_vncache", "pseudofs vnode cache");
+
+static struct mtx pfs_vncache_mutex;
+static struct pfs_vdata *pfs_vncache;
+static eventhandler_tag pfs_exit_tag;
+static void pfs_exit(void *arg, struct proc *p);
+static void pfs_purge_locked(struct pfs_node *pn, bool force);
+
+static SYSCTL_NODE(_vfs_pfs, OID_AUTO, vncache, CTLFLAG_RW, 0,
+ "pseudofs vnode cache");
+
+static int pfs_vncache_entries;
+SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, entries, CTLFLAG_RD,
+ &pfs_vncache_entries, 0,
+ "number of entries in the vnode cache");
+
+static int pfs_vncache_maxentries;
+SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, maxentries, CTLFLAG_RD,
+ &pfs_vncache_maxentries, 0,
+ "highest number of entries in the vnode cache");
+
+static int pfs_vncache_hits;
+SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, hits, CTLFLAG_RD,
+ &pfs_vncache_hits, 0,
+ "number of cache hits since initialization");
+
+static int pfs_vncache_misses;
+SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, misses, CTLFLAG_RD,
+ &pfs_vncache_misses, 0,
+ "number of cache misses since initialization");
+
+extern struct vop_vector pfs_vnodeops; /* XXX -> .h file */
+
+/*
+ * Initialize vnode cache
+ */
+void
+pfs_vncache_load(void)
+{
+
+ mtx_init(&pfs_vncache_mutex, "pfs_vncache", NULL, MTX_DEF);
+ pfs_exit_tag = EVENTHANDLER_REGISTER(process_exit, pfs_exit, NULL,
+ EVENTHANDLER_PRI_ANY);
+}
+
+/*
+ * Tear down vnode cache
+ */
+void
+pfs_vncache_unload(void)
+{
+
+ EVENTHANDLER_DEREGISTER(process_exit, pfs_exit_tag);
+ mtx_lock(&pfs_vncache_mutex);
+ pfs_purge_locked(NULL, true);
+ mtx_unlock(&pfs_vncache_mutex);
+ KASSERT(pfs_vncache_entries == 0,
+ ("%d vncache entries remaining", pfs_vncache_entries));
+ mtx_destroy(&pfs_vncache_mutex);
+}
+
+/*
+ * Allocate a vnode
+ */
+int
+pfs_vncache_alloc(struct mount *mp, struct vnode **vpp,
+ struct pfs_node *pn, pid_t pid)
+{
+ struct pfs_vdata *pvd, *pvd2;
+ struct vnode *vp;
+ int error;
+
+ /*
+ * See if the vnode is in the cache.
+ * XXX linear search is not very efficient.
+ */
+retry:
+ mtx_lock(&pfs_vncache_mutex);
+ for (pvd = pfs_vncache; pvd; pvd = pvd->pvd_next) {
+ if (pvd->pvd_pn == pn && pvd->pvd_pid == pid &&
+ pvd->pvd_vnode->v_mount == mp) {
+ vp = pvd->pvd_vnode;
+ VI_LOCK(vp);
+ mtx_unlock(&pfs_vncache_mutex);
+ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) {
+ ++pfs_vncache_hits;
+ *vpp = vp;
+ /*
+ * Some callers cache_enter(vp) later, so
+ * we have to make sure it's not in the
+ * VFS cache so it doesn't get entered
+ * twice. A better solution would be to
+ * make pfs_vncache_alloc() responsible
+ * for entering the vnode in the VFS
+ * cache.
+ */
+ cache_purge(vp);
+ return (0);
+ }
+ goto retry;
+ }
+ }
+ mtx_unlock(&pfs_vncache_mutex);
+
+ /* nope, get a new one */
+ pvd = malloc(sizeof *pvd, M_PFSVNCACHE, M_WAITOK);
+ pvd->pvd_next = pvd->pvd_prev = NULL;
+ error = getnewvnode("pseudofs", mp, &pfs_vnodeops, vpp);
+ if (error) {
+ free(pvd, M_PFSVNCACHE);
+ return (error);
+ }
+ pvd->pvd_pn = pn;
+ pvd->pvd_pid = pid;
+ (*vpp)->v_data = pvd;
+ switch (pn->pn_type) {
+ case pfstype_root:
+ (*vpp)->v_vflag = VV_ROOT;
+#if 0
+ printf("root vnode allocated\n");
+#endif
+ /* fall through */
+ case pfstype_dir:
+ case pfstype_this:
+ case pfstype_parent:
+ case pfstype_procdir:
+ (*vpp)->v_type = VDIR;
+ break;
+ case pfstype_file:
+ (*vpp)->v_type = VREG;
+ break;
+ case pfstype_symlink:
+ (*vpp)->v_type = VLNK;
+ break;
+ case pfstype_none:
+ KASSERT(0, ("pfs_vncache_alloc called for null node\n"));
+ default:
+ panic("%s has unexpected type: %d", pn->pn_name, pn->pn_type);
+ }
+ /*
+ * Propagate flag through to vnode so users know it can change
+ * if the process changes (i.e. execve)
+ */
+ if ((pn->pn_flags & PFS_PROCDEP) != 0)
+ (*vpp)->v_vflag |= VV_PROCDEP;
+ pvd->pvd_vnode = *vpp;
+ vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+ VN_LOCK_AREC(*vpp);
+ error = insmntque(*vpp, mp);
+ if (error != 0) {
+ free(pvd, M_PFSVNCACHE);
+ *vpp = NULLVP;
+ return (error);
+ }
+retry2:
+ mtx_lock(&pfs_vncache_mutex);
+ /*
+ * Other thread may race with us, creating the entry we are
+ * going to insert into the cache. Recheck after
+ * pfs_vncache_mutex is reacquired.
+ */
+ for (pvd2 = pfs_vncache; pvd2; pvd2 = pvd2->pvd_next) {
+ if (pvd2->pvd_pn == pn && pvd2->pvd_pid == pid &&
+ pvd2->pvd_vnode->v_mount == mp) {
+ vp = pvd2->pvd_vnode;
+ VI_LOCK(vp);
+ mtx_unlock(&pfs_vncache_mutex);
+ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) {
+ ++pfs_vncache_hits;
+ vgone(*vpp);
+ vput(*vpp);
+ *vpp = vp;
+ cache_purge(vp);
+ return (0);
+ }
+ goto retry2;
+ }
+ }
+ ++pfs_vncache_misses;
+ if (++pfs_vncache_entries > pfs_vncache_maxentries)
+ pfs_vncache_maxentries = pfs_vncache_entries;
+ pvd->pvd_prev = NULL;
+ pvd->pvd_next = pfs_vncache;
+ if (pvd->pvd_next)
+ pvd->pvd_next->pvd_prev = pvd;
+ pfs_vncache = pvd;
+ mtx_unlock(&pfs_vncache_mutex);
+ return (0);
+}
+
+/*
+ * Free a vnode
+ */
+int
+pfs_vncache_free(struct vnode *vp)
+{
+ struct pfs_vdata *pvd;
+
+ mtx_lock(&pfs_vncache_mutex);
+ pvd = (struct pfs_vdata *)vp->v_data;
+ KASSERT(pvd != NULL, ("pfs_vncache_free(): no vnode data\n"));
+ if (pvd->pvd_next)
+ pvd->pvd_next->pvd_prev = pvd->pvd_prev;
+ if (pvd->pvd_prev) {
+ pvd->pvd_prev->pvd_next = pvd->pvd_next;
+ --pfs_vncache_entries;
+ } else if (pfs_vncache == pvd) {
+ pfs_vncache = pvd->pvd_next;
+ --pfs_vncache_entries;
+ }
+ mtx_unlock(&pfs_vncache_mutex);
+
+ free(pvd, M_PFSVNCACHE);
+ vp->v_data = NULL;
+ return (0);
+}
+
+/*
+ * Purge the cache of dead entries
+ *
+ * This is extremely inefficient due to the fact that vgone() not only
+ * indirectly modifies the vnode cache, but may also sleep. We can
+ * neither hold pfs_vncache_mutex across a vgone() call, nor make any
+ * assumptions about the state of the cache after vgone() returns. In
+ * consequence, we must start over after every vgone() call, and keep
+ * trying until we manage to traverse the entire cache.
+ *
+ * The only way to improve this situation is to change the data structure
+ * used to implement the cache.
+ */
+static void
+pfs_purge_locked(struct pfs_node *pn, bool force)
+{
+ struct pfs_vdata *pvd;
+ struct vnode *vnp;
+
+ mtx_assert(&pfs_vncache_mutex, MA_OWNED);
+ pvd = pfs_vncache;
+ while (pvd != NULL) {
+ if (force || pvd->pvd_dead ||
+ (pn != NULL && pvd->pvd_pn == pn)) {
+ vnp = pvd->pvd_vnode;
+ vhold(vnp);
+ mtx_unlock(&pfs_vncache_mutex);
+ VOP_LOCK(vnp, LK_EXCLUSIVE);
+ vgone(vnp);
+ VOP_UNLOCK(vnp, 0);
+ mtx_lock(&pfs_vncache_mutex);
+ vdrop(vnp);
+ pvd = pfs_vncache;
+ } else {
+ pvd = pvd->pvd_next;
+ }
+ }
+}
+
+void
+pfs_purge(struct pfs_node *pn)
+{
+
+ mtx_lock(&pfs_vncache_mutex);
+ pfs_purge_locked(pn, false);
+ mtx_unlock(&pfs_vncache_mutex);
+}
+
+/*
+ * Free all vnodes associated with a defunct process
+ */
+static void
+pfs_exit(void *arg, struct proc *p)
+{
+ struct pfs_vdata *pvd;
+ int dead;
+
+ if (pfs_vncache == NULL)
+ return;
+ mtx_lock(&pfs_vncache_mutex);
+ for (pvd = pfs_vncache, dead = 0; pvd != NULL; pvd = pvd->pvd_next)
+ if (pvd->pvd_pid == p->p_pid)
+ dead = pvd->pvd_dead = 1;
+ if (dead)
+ pfs_purge_locked(NULL, false);
+ mtx_unlock(&pfs_vncache_mutex);
+}
diff --git a/freebsd/sys/fs/pseudofs/pseudofs_vnops.c b/freebsd/sys/fs/pseudofs/pseudofs_vnops.c
new file mode 100644
index 00000000..da35f062
--- /dev/null
+++ b/freebsd/sys/fs/pseudofs/pseudofs_vnops.c
@@ -0,0 +1,1060 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_pseudofs.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/ctype.h>
+#include <sys/dirent.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <fs/pseudofs/pseudofs.h>
+#include <fs/pseudofs/pseudofs_internal.h>
+
+#define KASSERT_PN_IS_DIR(pn) \
+ KASSERT((pn)->pn_type == pfstype_root || \
+ (pn)->pn_type == pfstype_dir || \
+ (pn)->pn_type == pfstype_procdir, \
+ ("%s(): VDIR vnode refers to non-directory pfs_node", __func__))
+
+#define KASSERT_PN_IS_FILE(pn) \
+ KASSERT((pn)->pn_type == pfstype_file, \
+ ("%s(): VREG vnode refers to non-file pfs_node", __func__))
+
+#define KASSERT_PN_IS_LINK(pn) \
+ KASSERT((pn)->pn_type == pfstype_symlink, \
+ ("%s(): VLNK vnode refers to non-link pfs_node", __func__))
+
+/*
+ * Returns the fileno, adjusted for target pid
+ */
+static uint32_t
+pn_fileno(struct pfs_node *pn, pid_t pid)
+{
+
+ KASSERT(pn->pn_fileno > 0,
+ ("%s(): no fileno allocated", __func__));
+ if (pid != NO_PID)
+ return (pn->pn_fileno * NO_PID + pid);
+ return (pn->pn_fileno);
+}
+
+/*
+ * Returns non-zero if given file is visible to given thread.
+ */
+static int
+pfs_visible_proc(struct thread *td, struct pfs_node *pn, struct proc *proc)
+{
+ int visible;
+
+ if (proc == NULL)
+ return (0);
+
+ PROC_LOCK_ASSERT(proc, MA_OWNED);
+
+ visible = ((proc->p_flag & P_WEXIT) == 0);
+ if (visible)
+ visible = (p_cansee(td, proc) == 0);
+ if (visible && pn->pn_vis != NULL)
+ visible = pn_vis(td, proc, pn);
+ if (!visible)
+ return (0);
+ return (1);
+}
+
+static int
+pfs_visible(struct thread *td, struct pfs_node *pn, pid_t pid,
+ bool allproc_locked, struct proc **p)
+{
+ struct proc *proc;
+
+ PFS_TRACE(("%s (pid: %d, req: %d)",
+ pn->pn_name, pid, td->td_proc->p_pid));
+
+ if (p)
+ *p = NULL;
+ if (pid == NO_PID)
+ PFS_RETURN (1);
+ proc = allproc_locked ? pfind_locked(pid) : pfind(pid);
+ if (proc == NULL)
+ PFS_RETURN (0);
+ if (pfs_visible_proc(td, pn, proc)) {
+ if (p)
+ *p = proc;
+ else
+ PROC_UNLOCK(proc);
+ PFS_RETURN (1);
+ }
+ PROC_UNLOCK(proc);
+ PFS_RETURN (0);
+}
+
+/*
+ * Verify permissions
+ */
+static int
+pfs_access(struct vop_access_args *va)
+{
+ struct vnode *vn = va->a_vp;
+ struct pfs_vdata *pvd = vn->v_data;
+ struct vattr vattr;
+ int error;
+
+ PFS_TRACE(("%s", pvd->pvd_pn->pn_name));
+ (void)pvd;
+
+ error = VOP_GETATTR(vn, &vattr, va->a_cred);
+ if (error)
+ PFS_RETURN (error);
+ error = vaccess(vn->v_type, vattr.va_mode, vattr.va_uid,
+ vattr.va_gid, va->a_accmode, va->a_cred, NULL);
+ PFS_RETURN (error);
+}
+
+/*
+ * Close a file or directory
+ */
+static int
+pfs_close(struct vop_close_args *va)
+{
+ struct vnode *vn = va->a_vp;
+ struct pfs_vdata *pvd = vn->v_data;
+ struct pfs_node *pn = pvd->pvd_pn;
+ struct proc *proc;
+ int error;
+
+ PFS_TRACE(("%s", pn->pn_name));
+ pfs_assert_not_owned(pn);
+
+ /*
+ * Do nothing unless this is the last close and the node has a
+ * last-close handler.
+ */
+ if (vrefcnt(vn) > 1 || pn->pn_close == NULL)
+ PFS_RETURN (0);
+
+ if (pvd->pvd_pid != NO_PID) {
+ proc = pfind(pvd->pvd_pid);
+ } else {
+ proc = NULL;
+ }
+
+ error = pn_close(va->a_td, proc, pn);
+
+ if (proc != NULL)
+ PROC_UNLOCK(proc);
+
+ PFS_RETURN (error);
+}
+
+/*
+ * Get file attributes
+ */
+static int
+pfs_getattr(struct vop_getattr_args *va)
+{
+ struct vnode *vn = va->a_vp;
+ struct pfs_vdata *pvd = vn->v_data;
+ struct pfs_node *pn = pvd->pvd_pn;
+ struct vattr *vap = va->a_vap;
+ struct proc *proc;
+ int error = 0;
+
+ PFS_TRACE(("%s", pn->pn_name));
+ pfs_assert_not_owned(pn);
+
+ if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc))
+ PFS_RETURN (ENOENT);
+
+ vap->va_type = vn->v_type;
+ vap->va_fileid = pn_fileno(pn, pvd->pvd_pid);
+ vap->va_flags = 0;
+ vap->va_blocksize = PAGE_SIZE;
+ vap->va_bytes = vap->va_size = 0;
+ vap->va_filerev = 0;
+ vap->va_fsid = vn->v_mount->mnt_stat.f_fsid.val[0];
+ vap->va_nlink = 1;
+ nanotime(&vap->va_ctime);
+ vap->va_atime = vap->va_mtime = vap->va_ctime;
+
+ switch (pn->pn_type) {
+ case pfstype_procdir:
+ case pfstype_root:
+ case pfstype_dir:
+#if 0
+ pfs_lock(pn);
+ /* compute link count */
+ pfs_unlock(pn);
+#endif
+ vap->va_mode = 0555;
+ break;
+ case pfstype_file:
+ case pfstype_symlink:
+ vap->va_mode = 0444;
+ break;
+ default:
+ printf("shouldn't be here!\n");
+ vap->va_mode = 0;
+ break;
+ }
+
+ if (proc != NULL) {
+ vap->va_uid = proc->p_ucred->cr_ruid;
+ vap->va_gid = proc->p_ucred->cr_rgid;
+ } else {
+ vap->va_uid = 0;
+ vap->va_gid = 0;
+ }
+
+ if (pn->pn_attr != NULL)
+ error = pn_attr(curthread, proc, pn, vap);
+
+ if(proc != NULL)
+ PROC_UNLOCK(proc);
+
+ PFS_RETURN (error);
+}
+
+/*
+ * Perform an ioctl
+ */
+static int
+pfs_ioctl(struct vop_ioctl_args *va)
+{
+ struct vnode *vn;
+ struct pfs_vdata *pvd;
+ struct pfs_node *pn;
+ struct proc *proc;
+ int error;
+
+ vn = va->a_vp;
+ vn_lock(vn, LK_SHARED | LK_RETRY);
+ if (vn->v_iflag & VI_DOOMED) {
+ VOP_UNLOCK(vn, 0);
+ return (EBADF);
+ }
+ pvd = vn->v_data;
+ pn = pvd->pvd_pn;
+
+ PFS_TRACE(("%s: %lx", pn->pn_name, va->a_command));
+ pfs_assert_not_owned(pn);
+
+ if (vn->v_type != VREG) {
+ VOP_UNLOCK(vn, 0);
+ PFS_RETURN (EINVAL);
+ }
+ KASSERT_PN_IS_FILE(pn);
+
+ if (pn->pn_ioctl == NULL) {
+ VOP_UNLOCK(vn, 0);
+ PFS_RETURN (ENOTTY);
+ }
+
+ /*
+ * This is necessary because process' privileges may
+ * have changed since the open() call.
+ */
+ if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc)) {
+ VOP_UNLOCK(vn, 0);
+ PFS_RETURN (EIO);
+ }
+
+ error = pn_ioctl(curthread, proc, pn, va->a_command, va->a_data);
+
+ if (proc != NULL)
+ PROC_UNLOCK(proc);
+
+ VOP_UNLOCK(vn, 0);
+ PFS_RETURN (error);
+}
+
+/*
+ * Perform getextattr
+ */
+static int
+pfs_getextattr(struct vop_getextattr_args *va)
+{
+ struct vnode *vn = va->a_vp;
+ struct pfs_vdata *pvd = vn->v_data;
+ struct pfs_node *pn = pvd->pvd_pn;
+ struct proc *proc;
+ int error;
+
+ PFS_TRACE(("%s", pn->pn_name));
+ pfs_assert_not_owned(pn);
+
+ /*
+ * This is necessary because either process' privileges may
+ * have changed since the open() call.
+ */
+ if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc))
+ PFS_RETURN (EIO);
+
+ if (pn->pn_getextattr == NULL)
+ error = EOPNOTSUPP;
+ else
+ error = pn_getextattr(curthread, proc, pn,
+ va->a_attrnamespace, va->a_name, va->a_uio,
+ va->a_size, va->a_cred);
+
+ if (proc != NULL)
+ PROC_UNLOCK(proc);
+
+ PFS_RETURN (error);
+}
+
+/*
+ * Convert a vnode to its component name
+ */
+static int
+pfs_vptocnp(struct vop_vptocnp_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct vnode **dvp = ap->a_vpp;
+ struct pfs_vdata *pvd = vp->v_data;
+ struct pfs_node *pd = pvd->pvd_pn;
+ struct pfs_node *pn;
+ struct mount *mp;
+ char *buf = ap->a_buf;
+ int *buflen = ap->a_buflen;
+ char pidbuf[PFS_NAMELEN];
+ pid_t pid = pvd->pvd_pid;
+ int len, i, error, locked;
+
+ i = *buflen;
+ error = 0;
+
+ pfs_lock(pd);
+
+ if (vp->v_type == VDIR && pd->pn_type == pfstype_root) {
+ *dvp = vp;
+ vhold(*dvp);
+ pfs_unlock(pd);
+ PFS_RETURN (0);
+ } else if (vp->v_type == VDIR && pd->pn_type == pfstype_procdir) {
+ len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid);
+ i -= len;
+ if (i < 0) {
+ error = ENOMEM;
+ goto failed;
+ }
+ bcopy(pidbuf, buf + i, len);
+ } else {
+ len = strlen(pd->pn_name);
+ i -= len;
+ if (i < 0) {
+ error = ENOMEM;
+ goto failed;
+ }
+ bcopy(pd->pn_name, buf + i, len);
+ }
+
+ pn = pd->pn_parent;
+ pfs_unlock(pd);
+
+ mp = vp->v_mount;
+ error = vfs_busy(mp, 0);
+ if (error)
+ return (error);
+
+ /*
+ * vp is held by caller.
+ */
+ locked = VOP_ISLOCKED(vp);
+ VOP_UNLOCK(vp, 0);
+
+ error = pfs_vncache_alloc(mp, dvp, pn, pid);
+ if (error) {
+ vn_lock(vp, locked | LK_RETRY);
+ vfs_unbusy(mp);
+ PFS_RETURN(error);
+ }
+
+ *buflen = i;
+ VOP_UNLOCK(*dvp, 0);
+ vn_lock(vp, locked | LK_RETRY);
+ vfs_unbusy(mp);
+
+ PFS_RETURN (0);
+failed:
+ pfs_unlock(pd);
+ PFS_RETURN(error);
+}
+
+/*
+ * Look up a file or directory
+ */
+static int
+pfs_lookup(struct vop_cachedlookup_args *va)
+{
+ struct vnode *vn = va->a_dvp;
+ struct vnode **vpp = va->a_vpp;
+ struct componentname *cnp = va->a_cnp;
+ struct pfs_vdata *pvd = vn->v_data;
+ struct pfs_node *pd = pvd->pvd_pn;
+ struct pfs_node *pn, *pdn = NULL;
+ struct mount *mp;
+ pid_t pid = pvd->pvd_pid;
+ char *pname;
+ int error, i, namelen, visible;
+
+ PFS_TRACE(("%.*s", (int)cnp->cn_namelen, cnp->cn_nameptr));
+ pfs_assert_not_owned(pd);
+
+ if (vn->v_type != VDIR)
+ PFS_RETURN (ENOTDIR);
+ KASSERT_PN_IS_DIR(pd);
+
+ /*
+ * Don't support DELETE or RENAME. CREATE is supported so
+ * that O_CREAT will work, but the lookup will still fail if
+ * the file does not exist.
+ */
+ if ((cnp->cn_flags & ISLASTCN) &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
+ PFS_RETURN (EOPNOTSUPP);
+
+ /* shortcut: check if the name is too long */
+ if (cnp->cn_namelen >= PFS_NAMELEN)
+ PFS_RETURN (ENOENT);
+
+ /* check that parent directory is visible... */
+ if (!pfs_visible(curthread, pd, pvd->pvd_pid, false, NULL))
+ PFS_RETURN (ENOENT);
+
+ /* self */
+ namelen = cnp->cn_namelen;
+ pname = cnp->cn_nameptr;
+ if (namelen == 1 && pname[0] == '.') {
+ pn = pd;
+ *vpp = vn;
+ VREF(vn);
+ PFS_RETURN (0);
+ }
+
+ mp = vn->v_mount;
+
+ /* parent */
+ if (cnp->cn_flags & ISDOTDOT) {
+ if (pd->pn_type == pfstype_root)
+ PFS_RETURN (EIO);
+ error = vfs_busy(mp, MBF_NOWAIT);
+ if (error != 0) {
+ vfs_ref(mp);
+ VOP_UNLOCK(vn, 0);
+ error = vfs_busy(mp, 0);
+ vn_lock(vn, LK_EXCLUSIVE | LK_RETRY);
+ vfs_rel(mp);
+ if (error != 0)
+ PFS_RETURN(ENOENT);
+ if (vn->v_iflag & VI_DOOMED) {
+ vfs_unbusy(mp);
+ PFS_RETURN(ENOENT);
+ }
+ }
+ VOP_UNLOCK(vn, 0);
+ KASSERT(pd->pn_parent != NULL,
+ ("%s(): non-root directory has no parent", __func__));
+ /*
+ * This one is tricky. Descendents of procdir nodes
+ * inherit their parent's process affinity, but
+ * there's no easy reverse mapping. For simplicity,
+ * we assume that if this node is a procdir, its
+ * parent isn't (which is correct as long as
+ * descendents of procdir nodes are never procdir
+ * nodes themselves)
+ */
+ if (pd->pn_type == pfstype_procdir)
+ pid = NO_PID;
+ pfs_lock(pd);
+ pn = pd->pn_parent;
+ pfs_unlock(pd);
+ goto got_pnode;
+ }
+
+ pfs_lock(pd);
+
+ /* named node */
+ for (pn = pd->pn_nodes; pn != NULL; pn = pn->pn_next)
+ if (pn->pn_type == pfstype_procdir)
+ pdn = pn;
+ else if (pn->pn_name[namelen] == '\0' &&
+ bcmp(pname, pn->pn_name, namelen) == 0) {
+ pfs_unlock(pd);
+ goto got_pnode;
+ }
+
+ /* process dependent node */
+ if ((pn = pdn) != NULL) {
+ pid = 0;
+ for (pid = 0, i = 0; i < namelen && isdigit(pname[i]); ++i)
+ if ((pid = pid * 10 + pname[i] - '0') > PID_MAX)
+ break;
+ if (i == cnp->cn_namelen) {
+ pfs_unlock(pd);
+ goto got_pnode;
+ }
+ }
+
+ pfs_unlock(pd);
+
+ PFS_RETURN (ENOENT);
+
+ got_pnode:
+ pfs_assert_not_owned(pd);
+ pfs_assert_not_owned(pn);
+ visible = pfs_visible(curthread, pn, pid, false, NULL);
+ if (!visible) {
+ error = ENOENT;
+ goto failed;
+ }
+
+ error = pfs_vncache_alloc(mp, vpp, pn, pid);
+ if (error)
+ goto failed;
+
+ if (cnp->cn_flags & ISDOTDOT) {
+ vfs_unbusy(mp);
+ vn_lock(vn, LK_EXCLUSIVE | LK_RETRY);
+ if (vn->v_iflag & VI_DOOMED) {
+ vput(*vpp);
+ *vpp = NULL;
+ PFS_RETURN(ENOENT);
+ }
+ }
+ if (cnp->cn_flags & MAKEENTRY && !(vn->v_iflag & VI_DOOMED))
+ cache_enter(vn, *vpp, cnp);
+ PFS_RETURN (0);
+ failed:
+ if (cnp->cn_flags & ISDOTDOT) {
+ vfs_unbusy(mp);
+ vn_lock(vn, LK_EXCLUSIVE | LK_RETRY);
+ *vpp = NULL;
+ }
+ PFS_RETURN(error);
+}
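+
+/*
+ * Illustrative sketch, not part of the FreeBSD sources: the process-
+ * dependent branch of pfs_lookup() above accepts a component name only
+ * if it is all digits and the accumulated value never exceeds PID_MAX.
+ * The stand-alone fragment below models just that check; DEMO_PID_MAX
+ * is a hypothetical stand-in for the kernel's PID_MAX.
+ */
+#if 0
+#include <ctype.h>
+#include <stdbool.h>
+
+#define	DEMO_PID_MAX	99999
+
+static bool
+demo_parse_procdir_name(const char *name, int namelen, long *pidp)
+{
+	long pid = 0;
+	int i;
+
+	if (namelen <= 0)
+		return (false);
+	for (i = 0; i < namelen && isdigit((unsigned char)name[i]); ++i)
+		if ((pid = pid * 10 + name[i] - '0') > DEMO_PID_MAX)
+			break;
+	if (i != namelen)	/* hit a non-digit or overflowed PID_MAX */
+		return (false);
+	*pidp = pid;
+	return (true);
+}
+#endif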
+
+/*
+ * Open a file or directory.
+ */
+static int
+pfs_open(struct vop_open_args *va)
+{
+ struct vnode *vn = va->a_vp;
+ struct pfs_vdata *pvd = vn->v_data;
+ struct pfs_node *pn = pvd->pvd_pn;
+ int mode = va->a_mode;
+
+ PFS_TRACE(("%s (mode 0x%x)", pn->pn_name, mode));
+ pfs_assert_not_owned(pn);
+
+ /* check if the requested mode is permitted */
+	if (((mode & FREAD) && !(pn->pn_flags & PFS_RD)) ||
+	    ((mode & FWRITE) && !(pn->pn_flags & PFS_WR)))
+ PFS_RETURN (EPERM);
+
+ /* we don't support locking */
+ if ((mode & O_SHLOCK) || (mode & O_EXLOCK))
+ PFS_RETURN (EOPNOTSUPP);
+
+ PFS_RETURN (0);
+}
+
+/*
+ * Read from a file
+ */
+static int
+pfs_read(struct vop_read_args *va)
+{
+ struct vnode *vn = va->a_vp;
+ struct pfs_vdata *pvd = vn->v_data;
+ struct pfs_node *pn = pvd->pvd_pn;
+ struct uio *uio = va->a_uio;
+ struct proc *proc;
+ struct sbuf *sb = NULL;
+ int error, locked;
+ off_t buflen;
+
+ PFS_TRACE(("%s", pn->pn_name));
+ pfs_assert_not_owned(pn);
+
+ if (vn->v_type != VREG)
+ PFS_RETURN (EINVAL);
+ KASSERT_PN_IS_FILE(pn);
+
+ if (!(pn->pn_flags & PFS_RD))
+ PFS_RETURN (EBADF);
+
+ if (pn->pn_fill == NULL)
+ PFS_RETURN (EIO);
+
+ /*
+ * This is necessary because either process' privileges may
+ * have changed since the open() call.
+ */
+ if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc))
+ PFS_RETURN (EIO);
+ if (proc != NULL) {
+ _PHOLD(proc);
+ PROC_UNLOCK(proc);
+ }
+
+ vhold(vn);
+ locked = VOP_ISLOCKED(vn);
+ VOP_UNLOCK(vn, 0);
+
+ if (pn->pn_flags & PFS_RAWRD) {
+ PFS_TRACE(("%zd resid", uio->uio_resid));
+ error = pn_fill(curthread, proc, pn, NULL, uio);
+ PFS_TRACE(("%zd resid", uio->uio_resid));
+ goto ret;
+ }
+
+ if (uio->uio_resid < 0 || uio->uio_offset < 0 ||
+ uio->uio_resid > OFF_MAX - uio->uio_offset) {
+ error = EINVAL;
+ goto ret;
+ }
+ buflen = uio->uio_offset + uio->uio_resid;
+ if (buflen > MAXPHYS)
+ buflen = MAXPHYS;
+
+ sb = sbuf_new(sb, NULL, buflen + 1, 0);
+ if (sb == NULL) {
+ error = EIO;
+ goto ret;
+ }
+
+ error = pn_fill(curthread, proc, pn, sb, uio);
+
+ if (error) {
+ sbuf_delete(sb);
+ goto ret;
+ }
+
+ /*
+ * XXX: If the buffer overflowed, sbuf_len() will not return
+ * the data length. Then just use the full length because an
+ * overflowed sbuf must be full.
+ */
+ if (sbuf_finish(sb) == 0)
+ buflen = sbuf_len(sb);
+ error = uiomove_frombuf(sbuf_data(sb), buflen, uio);
+ sbuf_delete(sb);
+ret:
+ vn_lock(vn, locked | LK_RETRY);
+ vdrop(vn);
+ if (proc != NULL)
+ PRELE(proc);
+ PFS_RETURN (error);
+}
+
+/*
+ * Iterate through directory entries
+ */
+static int
+pfs_iterate(struct thread *td, struct proc *proc, struct pfs_node *pd,
+ struct pfs_node **pn, struct proc **p)
+{
+ int visible;
+
+ sx_assert(&allproc_lock, SX_SLOCKED);
+ pfs_assert_owned(pd);
+ again:
+ if (*pn == NULL) {
+ /* first node */
+ *pn = pd->pn_nodes;
+ } else if ((*pn)->pn_type != pfstype_procdir) {
+ /* next node */
+ *pn = (*pn)->pn_next;
+ }
+ if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) {
+ /* next process */
+ if (*p == NULL)
+ *p = LIST_FIRST(&allproc);
+ else
+ *p = LIST_NEXT(*p, p_list);
+ /* out of processes: next node */
+ if (*p == NULL)
+ *pn = (*pn)->pn_next;
+ else
+ PROC_LOCK(*p);
+ }
+
+ if ((*pn) == NULL)
+ return (-1);
+
+ if (*p != NULL) {
+ visible = pfs_visible_proc(td, *pn, *p);
+ PROC_UNLOCK(*p);
+ } else if (proc != NULL) {
+ visible = pfs_visible_proc(td, *pn, proc);
+ } else {
+ visible = 1;
+ }
+ if (!visible)
+ goto again;
+
+ return (0);
+}
+
+/* Directory entry list */
+struct pfsentry {
+ STAILQ_ENTRY(pfsentry) link;
+ struct dirent entry;
+};
+STAILQ_HEAD(pfsdirentlist, pfsentry);
+
+/*
+ * Return directory entries.
+ */
+static int
+pfs_readdir(struct vop_readdir_args *va)
+{
+ struct vnode *vn = va->a_vp;
+ struct pfs_vdata *pvd = vn->v_data;
+ struct pfs_node *pd = pvd->pvd_pn;
+ pid_t pid = pvd->pvd_pid;
+ struct proc *p, *proc;
+ struct pfs_node *pn;
+ struct uio *uio;
+ struct pfsentry *pfsent, *pfsent2;
+ struct pfsdirentlist lst;
+ off_t offset;
+ int error, i, resid;
+
+ STAILQ_INIT(&lst);
+ error = 0;
+ KASSERT(pd->pn_info == vn->v_mount->mnt_data,
+ ("%s(): pn_info does not match mountpoint", __func__));
+ PFS_TRACE(("%s pid %lu", pd->pn_name, (unsigned long)pid));
+ pfs_assert_not_owned(pd);
+
+ if (vn->v_type != VDIR)
+ PFS_RETURN (ENOTDIR);
+ KASSERT_PN_IS_DIR(pd);
+ uio = va->a_uio;
+
+ /* only allow reading entire entries */
+ offset = uio->uio_offset;
+ resid = uio->uio_resid;
+ if (offset < 0 || offset % PFS_DELEN != 0 ||
+ (resid && resid < PFS_DELEN))
+ PFS_RETURN (EINVAL);
+ if (resid == 0)
+ PFS_RETURN (0);
+
+ sx_slock(&allproc_lock);
+ pfs_lock(pd);
+
+ /* check if the directory is visible to the caller */
+ if (!pfs_visible(curthread, pd, pid, true, &proc)) {
+ sx_sunlock(&allproc_lock);
+ pfs_unlock(pd);
+ PFS_RETURN (ENOENT);
+ }
+ KASSERT(pid == NO_PID || proc != NULL,
+ ("%s(): no process for pid %lu", __func__, (unsigned long)pid));
+
+ /* skip unwanted entries */
+ for (pn = NULL, p = NULL; offset > 0; offset -= PFS_DELEN) {
+ if (pfs_iterate(curthread, proc, pd, &pn, &p) == -1) {
+ /* nothing left... */
+ if (proc != NULL)
+ PROC_UNLOCK(proc);
+ pfs_unlock(pd);
+ sx_sunlock(&allproc_lock);
+ PFS_RETURN (0);
+ }
+ }
+
+ /* fill in entries */
+ while (pfs_iterate(curthread, proc, pd, &pn, &p) != -1 &&
+ resid >= PFS_DELEN) {
+ if ((pfsent = malloc(sizeof(struct pfsentry), M_IOV,
+ M_NOWAIT | M_ZERO)) == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ pfsent->entry.d_reclen = PFS_DELEN;
+ pfsent->entry.d_fileno = pn_fileno(pn, pid);
+		/* PFS_DELEN was picked to fit PFS_NAMELEN */
+ for (i = 0; i < PFS_NAMELEN - 1 && pn->pn_name[i] != '\0'; ++i)
+ pfsent->entry.d_name[i] = pn->pn_name[i];
+ pfsent->entry.d_namlen = i;
+ /* NOTE: d_off is the offset of the *next* entry. */
+ pfsent->entry.d_off = offset + PFS_DELEN;
+ switch (pn->pn_type) {
+ case pfstype_procdir:
+ KASSERT(p != NULL,
+ ("reached procdir node with p == NULL"));
+ pfsent->entry.d_namlen = snprintf(pfsent->entry.d_name,
+ PFS_NAMELEN, "%d", p->p_pid);
+ /* fall through */
+ case pfstype_root:
+ case pfstype_dir:
+ case pfstype_this:
+ case pfstype_parent:
+ pfsent->entry.d_type = DT_DIR;
+ break;
+ case pfstype_file:
+ pfsent->entry.d_type = DT_REG;
+ break;
+ case pfstype_symlink:
+ pfsent->entry.d_type = DT_LNK;
+ break;
+ default:
+			panic("%s has unexpected node type: %d", pn->pn_name,
+			    pn->pn_type);
+ }
+ PFS_TRACE(("%s", pfsent->entry.d_name));
+ dirent_terminate(&pfsent->entry);
+ STAILQ_INSERT_TAIL(&lst, pfsent, link);
+ offset += PFS_DELEN;
+ resid -= PFS_DELEN;
+ }
+ if (proc != NULL)
+ PROC_UNLOCK(proc);
+ pfs_unlock(pd);
+ sx_sunlock(&allproc_lock);
+ i = 0;
+ STAILQ_FOREACH_SAFE(pfsent, &lst, link, pfsent2) {
+ if (error == 0)
+ error = uiomove(&pfsent->entry, PFS_DELEN, uio);
+ free(pfsent, M_IOV);
+ i++;
+ }
+ PFS_TRACE(("%ju bytes", (uintmax_t)(i * PFS_DELEN)));
+ PFS_RETURN (error);
+}
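+
+/*
+ * Illustrative sketch, not from the FreeBSD sources: pfs_readdir()
+ * above gathers entries into an STAILQ while the directory and allproc
+ * locks are held and only calls uiomove() after both are dropped, so
+ * the copyout can fault without any locks held.  The fragment below
+ * shows the same gather-then-emit shape with a pthread mutex standing
+ * in for the kernel locks (error handling trimmed).
+ */
+#if 0
+#include <sys/queue.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+struct demo_entry {
+	STAILQ_ENTRY(demo_entry) link;
+	int value;
+};
+STAILQ_HEAD(demo_list, demo_entry);
+
+static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static void
+demo_emit(void)
+{
+	struct demo_list lst = STAILQ_HEAD_INITIALIZER(lst);
+	struct demo_entry *e, *tmp;
+	int i;
+
+	/* Phase 1: build the list while the lock is held. */
+	pthread_mutex_lock(&demo_lock);
+	for (i = 0; i < 3; i++) {
+		if ((e = malloc(sizeof(*e))) == NULL)
+			break;
+		e->value = i;
+		STAILQ_INSERT_TAIL(&lst, e, link);
+	}
+	pthread_mutex_unlock(&demo_lock);
+
+	/* Phase 2: emit and free after the lock is dropped. */
+	STAILQ_FOREACH_SAFE(e, &lst, link, tmp) {
+		printf("%d\n", e->value);	/* uiomove() in the kernel */
+		free(e);
+	}
+}
+#endif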
+
+/*
+ * Read a symbolic link
+ */
+static int
+pfs_readlink(struct vop_readlink_args *va)
+{
+ struct vnode *vn = va->a_vp;
+ struct pfs_vdata *pvd = vn->v_data;
+ struct pfs_node *pn = pvd->pvd_pn;
+ struct uio *uio = va->a_uio;
+ struct proc *proc = NULL;
+ char buf[PATH_MAX];
+ struct sbuf sb;
+ int error, locked;
+
+ PFS_TRACE(("%s", pn->pn_name));
+ pfs_assert_not_owned(pn);
+
+ if (vn->v_type != VLNK)
+ PFS_RETURN (EINVAL);
+ KASSERT_PN_IS_LINK(pn);
+
+ if (pn->pn_fill == NULL)
+ PFS_RETURN (EIO);
+
+ if (pvd->pvd_pid != NO_PID) {
+ if ((proc = pfind(pvd->pvd_pid)) == NULL)
+ PFS_RETURN (EIO);
+ if (proc->p_flag & P_WEXIT) {
+ PROC_UNLOCK(proc);
+ PFS_RETURN (EIO);
+ }
+ _PHOLD(proc);
+ PROC_UNLOCK(proc);
+ }
+ vhold(vn);
+ locked = VOP_ISLOCKED(vn);
+ VOP_UNLOCK(vn, 0);
+
+ /* sbuf_new() can't fail with a static buffer */
+ sbuf_new(&sb, buf, sizeof buf, 0);
+
+ error = pn_fill(curthread, proc, pn, &sb, NULL);
+
+ if (proc != NULL)
+ PRELE(proc);
+ vn_lock(vn, locked | LK_RETRY);
+ vdrop(vn);
+
+ if (error) {
+ sbuf_delete(&sb);
+ PFS_RETURN (error);
+ }
+
+ if (sbuf_finish(&sb) != 0) {
+ sbuf_delete(&sb);
+ PFS_RETURN (ENAMETOOLONG);
+ }
+
+ error = uiomove_frombuf(sbuf_data(&sb), sbuf_len(&sb), uio);
+ sbuf_delete(&sb);
+ PFS_RETURN (error);
+}
+
+/*
+ * Reclaim a vnode
+ */
+static int
+pfs_reclaim(struct vop_reclaim_args *va)
+{
+ struct vnode *vn = va->a_vp;
+ struct pfs_vdata *pvd = vn->v_data;
+ struct pfs_node *pn = pvd->pvd_pn;
+
+ PFS_TRACE(("%s", pn->pn_name));
+ pfs_assert_not_owned(pn);
+
+ return (pfs_vncache_free(va->a_vp));
+}
+
+/*
+ * Set attributes
+ */
+static int
+pfs_setattr(struct vop_setattr_args *va)
+{
+ struct vnode *vn = va->a_vp;
+ struct pfs_vdata *pvd = vn->v_data;
+ struct pfs_node *pn = pvd->pvd_pn;
+
+ PFS_TRACE(("%s", pn->pn_name));
+ pfs_assert_not_owned(pn);
+
+ /* Silently ignore unchangeable attributes. */
+ PFS_RETURN (0);
+}
+
+/*
+ * Write to a file
+ */
+static int
+pfs_write(struct vop_write_args *va)
+{
+ struct vnode *vn = va->a_vp;
+ struct pfs_vdata *pvd = vn->v_data;
+ struct pfs_node *pn = pvd->pvd_pn;
+ struct uio *uio = va->a_uio;
+ struct proc *proc;
+ struct sbuf sb;
+ int error;
+
+ PFS_TRACE(("%s", pn->pn_name));
+ pfs_assert_not_owned(pn);
+
+ if (vn->v_type != VREG)
+ PFS_RETURN (EINVAL);
+ KASSERT_PN_IS_FILE(pn);
+
+ if (!(pn->pn_flags & PFS_WR))
+ PFS_RETURN (EBADF);
+
+ if (pn->pn_fill == NULL)
+ PFS_RETURN (EIO);
+
+ /*
+ * This is necessary because either process' privileges may
+ * have changed since the open() call.
+ */
+ if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc))
+ PFS_RETURN (EIO);
+ if (proc != NULL) {
+ _PHOLD(proc);
+ PROC_UNLOCK(proc);
+ }
+
+ if (pn->pn_flags & PFS_RAWWR) {
+ error = pn_fill(curthread, proc, pn, NULL, uio);
+ if (proc != NULL)
+ PRELE(proc);
+ PFS_RETURN (error);
+ }
+
+ sbuf_uionew(&sb, uio, &error);
+ if (error) {
+ if (proc != NULL)
+ PRELE(proc);
+ PFS_RETURN (error);
+ }
+
+ error = pn_fill(curthread, proc, pn, &sb, uio);
+
+ sbuf_delete(&sb);
+ if (proc != NULL)
+ PRELE(proc);
+ PFS_RETURN (error);
+}
+
+/*
+ * Vnode operations
+ */
+struct vop_vector pfs_vnodeops = {
+ .vop_default = &default_vnodeops,
+
+ .vop_access = pfs_access,
+ .vop_cachedlookup = pfs_lookup,
+ .vop_close = pfs_close,
+ .vop_create = VOP_EOPNOTSUPP,
+ .vop_getattr = pfs_getattr,
+ .vop_getextattr = pfs_getextattr,
+ .vop_ioctl = pfs_ioctl,
+ .vop_link = VOP_EOPNOTSUPP,
+ .vop_lookup = vfs_cache_lookup,
+ .vop_mkdir = VOP_EOPNOTSUPP,
+ .vop_mknod = VOP_EOPNOTSUPP,
+ .vop_open = pfs_open,
+ .vop_read = pfs_read,
+ .vop_readdir = pfs_readdir,
+ .vop_readlink = pfs_readlink,
+ .vop_reclaim = pfs_reclaim,
+ .vop_remove = VOP_EOPNOTSUPP,
+ .vop_rename = VOP_EOPNOTSUPP,
+ .vop_rmdir = VOP_EOPNOTSUPP,
+ .vop_setattr = pfs_setattr,
+ .vop_symlink = VOP_EOPNOTSUPP,
+ .vop_vptocnp = pfs_vptocnp,
+ .vop_write = pfs_write,
+ /* XXX I've probably forgotten a few that need VOP_EOPNOTSUPP */
+};
diff --git a/freebsd/sys/kern/kern_descrip.c b/freebsd/sys/kern/kern_descrip.c
new file mode 100644
index 00000000..423968b2
--- /dev/null
+++ b/freebsd/sys/kern/kern_descrip.c
@@ -0,0 +1,4283 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_ddb.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <sys/capsicum.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/selinfo.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sbuf.h>
+#include <sys/signalvar.h>
+#include <sys/kdb.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/unistd.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <net/vnet.h>
+
+#include <security/audit/audit.h>
+
+#include <vm/uma.h>
+#include <vm/vm.h>
+
+#include <ddb/ddb.h>
+
+static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
+static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
+ "file desc to leader structures");
+static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
+MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
+
+MALLOC_DECLARE(M_FADVISE);
+
+static __read_mostly uma_zone_t file_zone;
+static __read_mostly uma_zone_t filedesc0_zone;
+
+static int closefp(struct filedesc *fdp, int fd, struct file *fp,
+ struct thread *td, int holdleaders);
+static int fd_first_free(struct filedesc *fdp, int low, int size);
+static int fd_last_used(struct filedesc *fdp, int size);
+static void fdgrowtable(struct filedesc *fdp, int nfd);
+static void fdgrowtable_exp(struct filedesc *fdp, int nfd);
+static void fdunused(struct filedesc *fdp, int fd);
+static void fdused(struct filedesc *fdp, int fd);
+static int getmaxfd(struct thread *td);
+static u_long *filecaps_copy_prep(const struct filecaps *src);
+static void filecaps_copy_finish(const struct filecaps *src,
+ struct filecaps *dst, u_long *ioctls);
+static u_long *filecaps_free_prep(struct filecaps *fcaps);
+static void filecaps_free_finish(u_long *ioctls);
+
+/*
+ * Each process has:
+ *
+ * - An array of open file descriptors (fd_ofiles)
+ * - An array of file flags (fd_ofileflags)
+ * - A bitmap recording which descriptors are in use (fd_map)
+ *
+ * A process starts out with NDFILE descriptors. The value of NDFILE has
+ * been selected based on the historical limit of 20 open files, and an
+ * assumption that the majority of processes, especially short-lived
+ * processes like shells, will never need more.
+ *
+ * If this initial allocation is exhausted, a larger descriptor table and
+ * map are allocated dynamically, and the pointers in the process's struct
+ * filedesc are updated to point to those. This is repeated every time
+ * the process runs out of file descriptors (provided it hasn't hit its
+ * resource limit).
+ *
+ * Since threads may hold references to individual descriptor table
+ * entries, the tables are never freed. Instead, they are placed on a
+ * linked list and freed only when the struct filedesc is released.
+ */
+#define NDFILE 20
+#define NDSLOTSIZE sizeof(NDSLOTTYPE)
+#define NDENTRIES (NDSLOTSIZE * __CHAR_BIT)
+#define NDSLOT(x) ((x) / NDENTRIES)
+#define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES))
+#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES)
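+
+/*
+ * Illustrative check, not part of the FreeBSD sources: with a 64-bit
+ * NDSLOTTYPE, NDENTRIES is 64, so descriptor 70 lands in bitmap word 1
+ * at bit 6, and 65 descriptors need two words.  The stand-alone program
+ * below redoes that arithmetic with hypothetical DEMO_* copies of the
+ * macros so it can be compiled and run outside the kernel.
+ */
+#if 0
+#include <assert.h>
+#include <stdint.h>
+
+typedef uint64_t demo_slot_t;			/* stand-in for NDSLOTTYPE */
+#define	DEMO_ENTRIES	(sizeof(demo_slot_t) * 8)	/* == 64 */
+#define	DEMO_SLOT(x)	((x) / DEMO_ENTRIES)
+#define	DEMO_BIT(x)	((demo_slot_t)1 << ((x) % DEMO_ENTRIES))
+#define	DEMO_SLOTS(x)	(((x) + DEMO_ENTRIES - 1) / DEMO_ENTRIES)
+
+int
+main(void)
+{
+	assert(DEMO_SLOT(70) == 1);			/* second word */
+	assert(DEMO_BIT(70) == (demo_slot_t)1 << 6);	/* bit 70 % 64 */
+	assert(DEMO_SLOTS(64) == 1);			/* exactly one word */
+	assert(DEMO_SLOTS(65) == 2);			/* round up to two */
+	return (0);
+}
+#endif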
+
+/*
+ * SLIST entry used to keep track of ofiles which must be reclaimed when
+ * the process exits.
+ */
+struct freetable {
+ struct fdescenttbl *ft_table;
+ SLIST_ENTRY(freetable) ft_next;
+};
+
+/*
+ * Initial allocation: a filedesc structure + the head of SLIST used to
+ * keep track of old ofiles + enough space for NDFILE descriptors.
+ */
+
+struct fdescenttbl0 {
+ int fdt_nfiles;
+ struct filedescent fdt_ofiles[NDFILE];
+};
+
+struct filedesc0 {
+ struct filedesc fd_fd;
+ SLIST_HEAD(, freetable) fd_free;
+ struct fdescenttbl0 fd_dfiles;
+ NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
+};
+
+/*
+ * Descriptor management.
+ */
+volatile int __exclusive_cache_line openfiles; /* actual number of open files */
+struct mtx sigio_lock; /* mtx to protect pointers to sigio */
+void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
+
+/*
+ * If low >= size, just return low. Otherwise find the first zero bit in the
+ * given bitmap, starting at low and not exceeding size - 1. Return size if
+ * not found.
+ */
+static int
+fd_first_free(struct filedesc *fdp, int low, int size)
+{
+ NDSLOTTYPE *map = fdp->fd_map;
+ NDSLOTTYPE mask;
+ int off, maxoff;
+
+ if (low >= size)
+ return (low);
+
+ off = NDSLOT(low);
+ if (low % NDENTRIES) {
+ mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
+ if ((mask &= ~map[off]) != 0UL)
+ return (off * NDENTRIES + ffsl(mask) - 1);
+ ++off;
+ }
+ for (maxoff = NDSLOTS(size); off < maxoff; ++off)
+ if (map[off] != ~0UL)
+ return (off * NDENTRIES + ffsl(~map[off]) - 1);
+ return (size);
+}
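+
+/*
+ * Illustrative sketch, not from the FreeBSD sources: the word-at-a-time
+ * search above is equivalent to the simpler (but slower) bit-at-a-time
+ * reference below, written for a bitmap of 64-bit words.
+ */
+#if 0
+#include <stdint.h>
+
+/* First clear bit at index >= low; returns size if there is none. */
+static int
+demo_first_free(const uint64_t *map, int low, int size)
+{
+	int fd;
+
+	if (low >= size)
+		return (low);
+	for (fd = low; fd < size; fd++)
+		if ((map[fd / 64] & ((uint64_t)1 << (fd % 64))) == 0)
+			return (fd);
+	return (size);
+}
+#endif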
+
+/*
+ * Find the highest non-zero bit in the given bitmap, starting at 0 and
+ * not exceeding size - 1. Return -1 if not found.
+ */
+static int
+fd_last_used(struct filedesc *fdp, int size)
+{
+ NDSLOTTYPE *map = fdp->fd_map;
+ NDSLOTTYPE mask;
+ int off, minoff;
+
+ off = NDSLOT(size);
+ if (size % NDENTRIES) {
+ mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
+ if ((mask &= map[off]) != 0)
+ return (off * NDENTRIES + flsl(mask) - 1);
+ --off;
+ }
+ for (minoff = NDSLOT(0); off >= minoff; --off)
+ if (map[off] != 0)
+ return (off * NDENTRIES + flsl(map[off]) - 1);
+ return (-1);
+}
+
+static int
+fdisused(struct filedesc *fdp, int fd)
+{
+
+ KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+ ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
+
+ return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
+}
+
+/*
+ * Mark a file descriptor as used.
+ */
+static void
+fdused_init(struct filedesc *fdp, int fd)
+{
+
+ KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
+
+ fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
+}
+
+static void
+fdused(struct filedesc *fdp, int fd)
+{
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ fdused_init(fdp, fd);
+ if (fd > fdp->fd_lastfile)
+ fdp->fd_lastfile = fd;
+ if (fd == fdp->fd_freefile)
+ fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
+}
+
+/*
+ * Mark a file descriptor as unused.
+ */
+static void
+fdunused(struct filedesc *fdp, int fd)
+{
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
+ KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
+ ("fd=%d is still in use", fd));
+
+ fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
+ if (fd < fdp->fd_freefile)
+ fdp->fd_freefile = fd;
+ if (fd == fdp->fd_lastfile)
+ fdp->fd_lastfile = fd_last_used(fdp, fd);
+}
+
+/*
+ * Free a file descriptor.
+ *
+ * Avoid some work if fdp is about to be destroyed.
+ */
+static inline void
+fdefree_last(struct filedescent *fde)
+{
+
+ filecaps_free(&fde->fde_caps);
+}
+
+static inline void
+fdfree(struct filedesc *fdp, int fd)
+{
+ struct filedescent *fde;
+
+ fde = &fdp->fd_ofiles[fd];
+#ifdef CAPABILITIES
+ seq_write_begin(&fde->fde_seq);
+#endif
+ fde->fde_file = NULL;
+#ifdef CAPABILITIES
+ seq_write_end(&fde->fde_seq);
+#endif
+ fdefree_last(fde);
+ fdunused(fdp, fd);
+}
+
+void
+pwd_ensure_dirs(void)
+{
+ struct filedesc *fdp;
+
+ fdp = curproc->p_fd;
+ FILEDESC_XLOCK(fdp);
+ if (fdp->fd_cdir == NULL) {
+ fdp->fd_cdir = rootvnode;
+ vrefact(rootvnode);
+ }
+ if (fdp->fd_rdir == NULL) {
+ fdp->fd_rdir = rootvnode;
+ vrefact(rootvnode);
+ }
+ FILEDESC_XUNLOCK(fdp);
+}
+
+/*
+ * System calls on descriptors.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdtablesize_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
+{
+#ifdef RACCT
+ uint64_t lim;
+#endif
+
+ td->td_retval[0] =
+ min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc);
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
+ PROC_UNLOCK(td->td_proc);
+ if (lim < td->td_retval[0])
+ td->td_retval[0] = lim;
+#endif
+ return (0);
+}
+
+/*
+ * Duplicate a file descriptor to a particular value.
+ *
+ * Note: keep in mind that a potential race condition exists when closing
+ * descriptors from a shared descriptor table (via rfork).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct dup2_args {
+ u_int from;
+ u_int to;
+};
+#endif
+/* ARGSUSED */
+int
+sys_dup2(struct thread *td, struct dup2_args *uap)
+{
+
+ return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
+}
+
+/*
+ * Duplicate a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct dup_args {
+ u_int fd;
+};
+#endif
+/* ARGSUSED */
+int
+sys_dup(struct thread *td, struct dup_args *uap)
+{
+
+ return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
+}
+
+/*
+ * The file control system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fcntl_args {
+ int fd;
+ int cmd;
+ long arg;
+};
+#endif
+/* ARGSUSED */
+int
+sys_fcntl(struct thread *td, struct fcntl_args *uap)
+{
+
+ return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
+}
+
+int
+kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
+{
+ struct flock fl;
+ struct __oflock ofl;
+ intptr_t arg1;
+ int error, newcmd;
+
+ error = 0;
+ newcmd = cmd;
+ switch (cmd) {
+ case F_OGETLK:
+ case F_OSETLK:
+ case F_OSETLKW:
+ /*
+ * Convert old flock structure to new.
+ */
+ error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
+ fl.l_start = ofl.l_start;
+ fl.l_len = ofl.l_len;
+ fl.l_pid = ofl.l_pid;
+ fl.l_type = ofl.l_type;
+ fl.l_whence = ofl.l_whence;
+ fl.l_sysid = 0;
+
+ switch (cmd) {
+ case F_OGETLK:
+ newcmd = F_GETLK;
+ break;
+ case F_OSETLK:
+ newcmd = F_SETLK;
+ break;
+ case F_OSETLKW:
+ newcmd = F_SETLKW;
+ break;
+ }
+ arg1 = (intptr_t)&fl;
+ break;
+ case F_GETLK:
+ case F_SETLK:
+ case F_SETLKW:
+ case F_SETLK_REMOTE:
+ error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
+ arg1 = (intptr_t)&fl;
+ break;
+ default:
+ arg1 = arg;
+ break;
+ }
+ if (error)
+ return (error);
+ error = kern_fcntl(td, fd, newcmd, arg1);
+ if (error)
+ return (error);
+ if (cmd == F_OGETLK) {
+ ofl.l_start = fl.l_start;
+ ofl.l_len = fl.l_len;
+ ofl.l_pid = fl.l_pid;
+ ofl.l_type = fl.l_type;
+ ofl.l_whence = fl.l_whence;
+ error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
+ } else if (cmd == F_GETLK) {
+ error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
+ }
+ return (error);
+}
+
+int
+kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
+{
+ struct filedesc *fdp;
+ struct flock *flp;
+ struct file *fp, *fp2;
+ struct filedescent *fde;
+ struct proc *p;
+ struct vnode *vp;
+ int error, flg, tmp;
+ uint64_t bsize;
+ off_t foffset;
+
+ error = 0;
+ flg = F_POSIX;
+ p = td->td_proc;
+ fdp = p->p_fd;
+
+ AUDIT_ARG_FD(cmd);
+ AUDIT_ARG_CMD(cmd);
+ switch (cmd) {
+ case F_DUPFD:
+ tmp = arg;
+ error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
+ break;
+
+ case F_DUPFD_CLOEXEC:
+ tmp = arg;
+ error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
+ break;
+
+ case F_DUP2FD:
+ tmp = arg;
+ error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
+ break;
+
+ case F_DUP2FD_CLOEXEC:
+ tmp = arg;
+ error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
+ break;
+
+ case F_GETFD:
+ error = EBADF;
+ FILEDESC_SLOCK(fdp);
+ fde = fdeget_locked(fdp, fd);
+ if (fde != NULL) {
+ td->td_retval[0] =
+ (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
+ error = 0;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ break;
+
+ case F_SETFD:
+ error = EBADF;
+ FILEDESC_XLOCK(fdp);
+ fde = fdeget_locked(fdp, fd);
+ if (fde != NULL) {
+ fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
+ (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
+ error = 0;
+ }
+ FILEDESC_XUNLOCK(fdp);
+ break;
+
+ case F_GETFL:
+ error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp);
+ if (error != 0)
+ break;
+ td->td_retval[0] = OFLAGS(fp->f_flag);
+ fdrop(fp, td);
+ break;
+
+ case F_SETFL:
+ error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
+ if (error != 0)
+ break;
+ do {
+ tmp = flg = fp->f_flag;
+ tmp &= ~FCNTLFLAGS;
+ tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
+ } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
+ tmp = fp->f_flag & FNONBLOCK;
+ error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
+ if (error != 0) {
+ fdrop(fp, td);
+ break;
+ }
+ tmp = fp->f_flag & FASYNC;
+ error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
+ if (error == 0) {
+ fdrop(fp, td);
+ break;
+ }
+ atomic_clear_int(&fp->f_flag, FNONBLOCK);
+ tmp = 0;
+ (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
+ fdrop(fp, td);
+ break;
+
+ case F_GETOWN:
+ error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp);
+ if (error != 0)
+ break;
+ error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
+ if (error == 0)
+ td->td_retval[0] = tmp;
+ fdrop(fp, td);
+ break;
+
+ case F_SETOWN:
+ error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp);
+ if (error != 0)
+ break;
+ tmp = arg;
+ error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
+ fdrop(fp, td);
+ break;
+
+ case F_SETLK_REMOTE:
+ error = priv_check(td, PRIV_NFS_LOCKD);
+ if (error != 0)
+ return (error);
+ flg = F_REMOTE;
+ goto do_setlk;
+
+ case F_SETLKW:
+ flg |= F_WAIT;
+ /* FALLTHROUGH F_SETLK */
+
+ case F_SETLK:
+ do_setlk:
+ flp = (struct flock *)arg;
+ if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) {
+ error = EINVAL;
+ break;
+ }
+
+ error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp, NULL);
+ if (error != 0)
+ break;
+ if (fp->f_type != DTYPE_VNODE) {
+ error = EBADF;
+ fdrop(fp, td);
+ break;
+ }
+
+ if (flp->l_whence == SEEK_CUR) {
+ foffset = foffset_get(fp);
+ if (foffset < 0 ||
+ (flp->l_start > 0 &&
+ foffset > OFF_MAX - flp->l_start)) {
+ error = EOVERFLOW;
+ fdrop(fp, td);
+ break;
+ }
+ flp->l_start += foffset;
+ }
+
+ vp = fp->f_vnode;
+ switch (flp->l_type) {
+ case F_RDLCK:
+ if ((fp->f_flag & FREAD) == 0) {
+ error = EBADF;
+ break;
+ }
+ if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
+ PROC_LOCK(p->p_leader);
+ p->p_leader->p_flag |= P_ADVLOCK;
+ PROC_UNLOCK(p->p_leader);
+ }
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
+ flp, flg);
+ break;
+ case F_WRLCK:
+ if ((fp->f_flag & FWRITE) == 0) {
+ error = EBADF;
+ break;
+ }
+ if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
+ PROC_LOCK(p->p_leader);
+ p->p_leader->p_flag |= P_ADVLOCK;
+ PROC_UNLOCK(p->p_leader);
+ }
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
+ flp, flg);
+ break;
+ case F_UNLCK:
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
+ flp, flg);
+ break;
+ case F_UNLCKSYS:
+ if (flg != F_REMOTE) {
+ error = EINVAL;
+ break;
+ }
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
+ F_UNLCKSYS, flp, flg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (error != 0 || flp->l_type == F_UNLCK ||
+ flp->l_type == F_UNLCKSYS) {
+ fdrop(fp, td);
+ break;
+ }
+
+ /*
+ * Check for a race with close.
+ *
+ * The vnode is now advisory locked (or unlocked, but this case
+ * is not really important) as the caller requested.
+ * We had to drop the filedesc lock, so we need to recheck if
+ * the descriptor is still valid, because if it was closed
+ * in the meantime we need to remove advisory lock from the
+ * vnode - close on any descriptor leading to an advisory
+ * locked vnode, removes that lock.
+ * We will return 0 on purpose in that case, as the result of
+ * successful advisory lock might have been externally visible
+ * already. This is fine - effectively we pretend to the caller
+ * that the closing thread was a bit slower and that the
+ * advisory lock succeeded before the close.
+ */
+ error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2, NULL);
+ if (error != 0) {
+ fdrop(fp, td);
+ break;
+ }
+ if (fp != fp2) {
+ flp->l_whence = SEEK_SET;
+ flp->l_start = 0;
+ flp->l_len = 0;
+ flp->l_type = F_UNLCK;
+ (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
+ F_UNLCK, flp, F_POSIX);
+ }
+ fdrop(fp, td);
+ fdrop(fp2, td);
+ break;
+
+ case F_GETLK:
+ error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp, NULL);
+ if (error != 0)
+ break;
+ if (fp->f_type != DTYPE_VNODE) {
+ error = EBADF;
+ fdrop(fp, td);
+ break;
+ }
+ flp = (struct flock *)arg;
+ if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
+ flp->l_type != F_UNLCK) {
+ error = EINVAL;
+ fdrop(fp, td);
+ break;
+ }
+ if (flp->l_whence == SEEK_CUR) {
+ foffset = foffset_get(fp);
+ if ((flp->l_start > 0 &&
+ foffset > OFF_MAX - flp->l_start) ||
+ (flp->l_start < 0 &&
+ foffset < OFF_MIN - flp->l_start)) {
+ error = EOVERFLOW;
+ fdrop(fp, td);
+ break;
+ }
+ flp->l_start += foffset;
+ }
+ vp = fp->f_vnode;
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
+ F_POSIX);
+ fdrop(fp, td);
+ break;
+
+ case F_RDAHEAD:
+ arg = arg ? 128 * 1024: 0;
+ /* FALLTHROUGH */
+ case F_READAHEAD:
+ error = fget_unlocked(fdp, fd, &cap_no_rights, &fp, NULL);
+ if (error != 0)
+ break;
+ if (fp->f_type != DTYPE_VNODE) {
+ fdrop(fp, td);
+ error = EBADF;
+ break;
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type != VREG) {
+ fdrop(fp, td);
+ error = ENOTTY;
+ break;
+ }
+
+ /*
+ * Exclusive lock synchronizes against f_seqcount reads and
+ * writes in sequential_heuristic().
+ */
+ error = vn_lock(vp, LK_EXCLUSIVE);
+ if (error != 0) {
+ fdrop(fp, td);
+ break;
+ }
+ if (arg >= 0) {
+ bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
+ arg = MIN(arg, INT_MAX - bsize + 1);
+ fp->f_seqcount = MIN(IO_SEQMAX,
+ (arg + bsize - 1) / bsize);
+ atomic_set_int(&fp->f_flag, FRDAHEAD);
+ } else {
+ atomic_clear_int(&fp->f_flag, FRDAHEAD);
+ }
+ VOP_UNLOCK(vp, 0);
+ fdrop(fp, td);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static int
+getmaxfd(struct thread *td)
+{
+
+ return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
+}
+
+/*
+ * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
+ */
+int
+kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
+{
+ struct filedesc *fdp;
+ struct filedescent *oldfde, *newfde;
+ struct proc *p;
+ struct file *delfp;
+ u_long *oioctls, *nioctls;
+ int error, maxfd;
+
+ p = td->td_proc;
+ fdp = p->p_fd;
+
+ MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
+ MPASS(mode < FDDUP_LASTMODE);
+
+ AUDIT_ARG_FD(old);
+ /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */
+
+ /*
+ * Verify we have a valid descriptor to dup from and possibly to
+ * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
+ * return EINVAL when the new descriptor is out of bounds.
+ */
+ if (old < 0)
+ return (EBADF);
+ if (new < 0)
+ return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
+ maxfd = getmaxfd(td);
+ if (new >= maxfd)
+ return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
+
+ error = EBADF;
+ FILEDESC_XLOCK(fdp);
+ if (fget_locked(fdp, old) == NULL)
+ goto unlock;
+ if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) {
+ td->td_retval[0] = new;
+ if (flags & FDDUP_FLAG_CLOEXEC)
+ fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
+ error = 0;
+ goto unlock;
+ }
+
+ oldfde = &fdp->fd_ofiles[old];
+ if (!fhold(oldfde->fde_file))
+ goto unlock;
+
+ /*
+ * If the caller specified a file descriptor, make sure the file
+ * table is large enough to hold it, and grab it. Otherwise, just
+ * allocate a new descriptor the usual way.
+ */
+ switch (mode) {
+ case FDDUP_NORMAL:
+ case FDDUP_FCNTL:
+ if ((error = fdalloc(td, new, &new)) != 0) {
+ fdrop(oldfde->fde_file, td);
+ goto unlock;
+ }
+ break;
+ case FDDUP_MUSTREPLACE:
+ /* Target file descriptor must exist. */
+ if (fget_locked(fdp, new) == NULL) {
+ fdrop(oldfde->fde_file, td);
+ goto unlock;
+ }
+ break;
+ case FDDUP_FIXED:
+ if (new >= fdp->fd_nfiles) {
+ /*
+ * The resource limits are here instead of e.g.
+ * fdalloc(), because the file descriptor table may be
+ * shared between processes, so we can't really use
+ * racct_add()/racct_sub(). Instead of counting the
+ * number of actually allocated descriptors, just put
+ * the limit on the size of the file descriptor table.
+ */
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(p);
+ error = racct_set(p, RACCT_NOFILE, new + 1);
+ PROC_UNLOCK(p);
+ if (error != 0) {
+ error = EMFILE;
+ fdrop(oldfde->fde_file, td);
+ goto unlock;
+ }
+ }
+#endif
+ fdgrowtable_exp(fdp, new + 1);
+ }
+ if (!fdisused(fdp, new))
+ fdused(fdp, new);
+ break;
+ default:
+ KASSERT(0, ("%s unsupported mode %d", __func__, mode));
+ }
+
+ KASSERT(old != new, ("new fd is same as old"));
+
+ newfde = &fdp->fd_ofiles[new];
+ delfp = newfde->fde_file;
+
+ oioctls = filecaps_free_prep(&newfde->fde_caps);
+ nioctls = filecaps_copy_prep(&oldfde->fde_caps);
+
+ /*
+ * Duplicate the source descriptor.
+ */
+#ifdef CAPABILITIES
+ seq_write_begin(&newfde->fde_seq);
+#endif
+ memcpy(newfde, oldfde, fde_change_size);
+ filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
+ nioctls);
+ if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
+ newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
+ else
+ newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
+#ifdef CAPABILITIES
+ seq_write_end(&newfde->fde_seq);
+#endif
+ filecaps_free_finish(oioctls);
+ td->td_retval[0] = new;
+
+ error = 0;
+
+ if (delfp != NULL) {
+ (void) closefp(fdp, new, delfp, td, 1);
+ FILEDESC_UNLOCK_ASSERT(fdp);
+ } else {
+unlock:
+ FILEDESC_XUNLOCK(fdp);
+ }
+
+ return (error);
+}
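+
+/*
+ * Illustrative userland sketch, not part of the FreeBSD sources: the
+ * EINVAL-versus-EBADF distinction handled above is visible through the
+ * standard dup2()/fcntl() interfaces.
+ */
+#if 0
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int fd;
+
+	/* dup2() with an out-of-range target descriptor fails with EBADF. */
+	if (dup2(STDOUT_FILENO, -1) == -1 && errno == EBADF)
+		printf("dup2(-1): EBADF\n");
+
+	/* fcntl(F_DUPFD) with an out-of-range hint fails with EINVAL. */
+	if (fcntl(STDOUT_FILENO, F_DUPFD, -1) == -1 && errno == EINVAL)
+		printf("F_DUPFD(-1): EINVAL\n");
+
+	/* F_DUPFD_CLOEXEC duplicates and sets close-on-exec in one step. */
+	fd = fcntl(STDOUT_FILENO, F_DUPFD_CLOEXEC, 10);
+	if (fd != -1)
+		close(fd);
+	return (0);
+}
+#endif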
+
+/*
+ * If sigio is on the list associated with a process or process group,
+ * disable signalling from the device, remove sigio from the list and
+ * free sigio.
+ */
+void
+funsetown(struct sigio **sigiop)
+{
+ struct sigio *sigio;
+
+ if (*sigiop == NULL)
+ return;
+ SIGIO_LOCK();
+ sigio = *sigiop;
+ if (sigio == NULL) {
+ SIGIO_UNLOCK();
+ return;
+ }
+ *(sigio->sio_myref) = NULL;
+ if ((sigio)->sio_pgid < 0) {
+ struct pgrp *pg = (sigio)->sio_pgrp;
+ PGRP_LOCK(pg);
+ SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
+ sigio, sio_pgsigio);
+ PGRP_UNLOCK(pg);
+ } else {
+ struct proc *p = (sigio)->sio_proc;
+ PROC_LOCK(p);
+ SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
+ sigio, sio_pgsigio);
+ PROC_UNLOCK(p);
+ }
+ SIGIO_UNLOCK();
+ crfree(sigio->sio_ucred);
+ free(sigio, M_SIGIO);
+}
+
+/*
+ * Free a list of sigio structures.
+ * We only need to lock the SIGIO_LOCK because we have made ourselves
+ * inaccessible to callers of fsetown and therefore do not need to lock
+ * the proc or pgrp struct for the list manipulation.
+ */
+void
+funsetownlst(struct sigiolst *sigiolst)
+{
+ struct proc *p;
+ struct pgrp *pg;
+ struct sigio *sigio;
+
+ sigio = SLIST_FIRST(sigiolst);
+ if (sigio == NULL)
+ return;
+ p = NULL;
+ pg = NULL;
+
+ /*
+ * Every entry of the list should belong
+ * to a single proc or pgrp.
+ */
+ if (sigio->sio_pgid < 0) {
+ pg = sigio->sio_pgrp;
+ PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
+ } else /* if (sigio->sio_pgid > 0) */ {
+ p = sigio->sio_proc;
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ }
+
+ SIGIO_LOCK();
+ while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
+ *(sigio->sio_myref) = NULL;
+ if (pg != NULL) {
+ KASSERT(sigio->sio_pgid < 0,
+ ("Proc sigio in pgrp sigio list"));
+ KASSERT(sigio->sio_pgrp == pg,
+ ("Bogus pgrp in sigio list"));
+ PGRP_LOCK(pg);
+ SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
+ sio_pgsigio);
+ PGRP_UNLOCK(pg);
+ } else /* if (p != NULL) */ {
+ KASSERT(sigio->sio_pgid > 0,
+ ("Pgrp sigio in proc sigio list"));
+ KASSERT(sigio->sio_proc == p,
+ ("Bogus proc in sigio list"));
+ PROC_LOCK(p);
+ SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
+ sio_pgsigio);
+ PROC_UNLOCK(p);
+ }
+ SIGIO_UNLOCK();
+ crfree(sigio->sio_ucred);
+ free(sigio, M_SIGIO);
+ SIGIO_LOCK();
+ }
+ SIGIO_UNLOCK();
+}
+
+/*
+ * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
+ *
+ * After permission checking, add a sigio structure to the sigio list for
+ * the process or process group.
+ */
+int
+fsetown(pid_t pgid, struct sigio **sigiop)
+{
+ struct proc *proc;
+ struct pgrp *pgrp;
+ struct sigio *sigio;
+ int ret;
+
+ if (pgid == 0) {
+ funsetown(sigiop);
+ return (0);
+ }
+
+ ret = 0;
+
+ /* Allocate and fill in the new sigio out of locks. */
+ sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
+ sigio->sio_pgid = pgid;
+ sigio->sio_ucred = crhold(curthread->td_ucred);
+ sigio->sio_myref = sigiop;
+
+ sx_slock(&proctree_lock);
+ if (pgid > 0) {
+ proc = pfind(pgid);
+ if (proc == NULL) {
+ ret = ESRCH;
+ goto fail;
+ }
+
+ /*
+ * Policy - Don't allow a process to FSETOWN a process
+ * in another session.
+ *
+ * Remove this test to allow maximum flexibility or
+ * restrict FSETOWN to the current process or process
+ * group for maximum safety.
+ */
+ PROC_UNLOCK(proc);
+ if (proc->p_session != curthread->td_proc->p_session) {
+ ret = EPERM;
+ goto fail;
+ }
+
+ pgrp = NULL;
+ } else /* if (pgid < 0) */ {
+ pgrp = pgfind(-pgid);
+ if (pgrp == NULL) {
+ ret = ESRCH;
+ goto fail;
+ }
+ PGRP_UNLOCK(pgrp);
+
+ /*
+ * Policy - Don't allow a process to FSETOWN a process
+ * in another session.
+ *
+ * Remove this test to allow maximum flexibility or
+ * restrict FSETOWN to the current process or process
+ * group for maximum safety.
+ */
+ if (pgrp->pg_session != curthread->td_proc->p_session) {
+ ret = EPERM;
+ goto fail;
+ }
+
+ proc = NULL;
+ }
+ funsetown(sigiop);
+ if (pgid > 0) {
+ PROC_LOCK(proc);
+ /*
+ * Since funsetownlst() is called without the proctree
+ * locked, we need to check for P_WEXIT.
+ * XXX: is ESRCH correct?
+ */
+ if ((proc->p_flag & P_WEXIT) != 0) {
+ PROC_UNLOCK(proc);
+ ret = ESRCH;
+ goto fail;
+ }
+ SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
+ sigio->sio_proc = proc;
+ PROC_UNLOCK(proc);
+ } else {
+ PGRP_LOCK(pgrp);
+ SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
+ sigio->sio_pgrp = pgrp;
+ PGRP_UNLOCK(pgrp);
+ }
+ sx_sunlock(&proctree_lock);
+ SIGIO_LOCK();
+ *sigiop = sigio;
+ SIGIO_UNLOCK();
+ return (0);
+
+fail:
+ sx_sunlock(&proctree_lock);
+ crfree(sigio->sio_ucred);
+ free(sigio, M_SIGIO);
+ return (ret);
+}
+
+/*
+ * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
+ */
+pid_t
+fgetown(struct sigio **sigiop)
+{
+ pid_t pgid;
+
+ SIGIO_LOCK();
+ pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
+ SIGIO_UNLOCK();
+ return (pgid);
+}
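+
+/*
+ * Illustrative userland sketch, not part of the FreeBSD sources:
+ * fsetown()/fgetown() back fcntl(F_SETOWN)/fcntl(F_GETOWN) and the
+ * FIOSETOWN/FIOGETOWN ioctls.  A typical use is to request SIGIO
+ * delivery for a descriptor:
+ */
+#if 0
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+static void
+request_sigio(int fd)
+{
+	int flags;
+
+	/* Deliver SIGIO for this descriptor to the calling process. */
+	if (fcntl(fd, F_SETOWN, getpid()) == -1)
+		perror("F_SETOWN");
+
+	/* Turn on asynchronous notification (O_ASYNC, a.k.a. FASYNC). */
+	if ((flags = fcntl(fd, F_GETFL)) != -1)
+		(void)fcntl(fd, F_SETFL, flags | O_ASYNC);
+
+	/* F_GETOWN reports the pid (or negated pgid) set above. */
+	printf("owner: %d\n", fcntl(fd, F_GETOWN));
+}
+#endif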
+
+/*
+ * Function drops the filedesc lock on return.
+ */
+static int
+closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
+ int holdleaders)
+{
+ int error;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ if (holdleaders) {
+ if (td->td_proc->p_fdtol != NULL) {
+ /*
+ * Ask fdfree() to sleep to ensure that all relevant
+ * process leaders can be traversed in closef().
+ */
+ fdp->fd_holdleaderscount++;
+ } else {
+ holdleaders = 0;
+ }
+ }
+
+ /*
+ * We now hold the fp reference that used to be owned by the
+ * descriptor array. We have to unlock the FILEDESC *AFTER*
+ * knote_fdclose to prevent a race of the fd getting opened, a knote
+	 * added, and deleting a knote for the new fd.
+ */
+ knote_fdclose(td, fd);
+
+ /*
+ * We need to notify mqueue if the object is of type mqueue.
+ */
+ if (fp->f_type == DTYPE_MQUEUE)
+ mq_fdclose(td, fd, fp);
+ FILEDESC_XUNLOCK(fdp);
+
+ error = closef(fp, td);
+ if (holdleaders) {
+ FILEDESC_XLOCK(fdp);
+ fdp->fd_holdleaderscount--;
+ if (fdp->fd_holdleaderscount == 0 &&
+ fdp->fd_holdleaderswakeup != 0) {
+ fdp->fd_holdleaderswakeup = 0;
+ wakeup(&fdp->fd_holdleaderscount);
+ }
+ FILEDESC_XUNLOCK(fdp);
+ }
+ return (error);
+}
+
+/*
+ * Close a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct close_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+sys_close(struct thread *td, struct close_args *uap)
+{
+
+ return (kern_close(td, uap->fd));
+}
+
+int
+kern_close(struct thread *td, int fd)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+
+ fdp = td->td_proc->p_fd;
+
+ AUDIT_SYSCLOSE(td, fd);
+
+ FILEDESC_XLOCK(fdp);
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+ fdfree(fdp, fd);
+
+ /* closefp() drops the FILEDESC lock for us. */
+ return (closefp(fdp, fd, fp, td, 1));
+}
+
+/*
+ * Close open file descriptors.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct closefrom_args {
+ int lowfd;
+};
+#endif
+/* ARGSUSED */
+int
+sys_closefrom(struct thread *td, struct closefrom_args *uap)
+{
+ struct filedesc *fdp;
+ int fd;
+
+ fdp = td->td_proc->p_fd;
+ AUDIT_ARG_FD(uap->lowfd);
+
+ /*
+	 * Treat negative starting file descriptor values identically to
+	 * closefrom(0), which closes all files.
+ */
+ if (uap->lowfd < 0)
+ uap->lowfd = 0;
+ FILEDESC_SLOCK(fdp);
+ for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
+ if (fdp->fd_ofiles[fd].fde_file != NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ (void)kern_close(td, fd);
+ FILEDESC_SLOCK(fdp);
+ }
+ }
+ FILEDESC_SUNLOCK(fdp);
+ return (0);
+}
+
+#if defined(COMPAT_43)
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ofstat_args {
+ int fd;
+ struct ostat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+ofstat(struct thread *td, struct ofstat_args *uap)
+{
+ struct ostat oub;
+ struct stat ub;
+ int error;
+
+ error = kern_fstat(td, uap->fd, &ub);
+ if (error == 0) {
+ cvtstat(&ub, &oub);
+ error = copyout(&oub, uap->sb, sizeof(oub));
+ }
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+#if defined(COMPAT_FREEBSD11)
+int
+freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap)
+{
+ struct stat sb;
+ struct freebsd11_stat osb;
+ int error;
+
+ error = kern_fstat(td, uap->fd, &sb);
+ if (error != 0)
+ return (error);
+ error = freebsd11_cvtstat(&sb, &osb);
+ if (error == 0)
+ error = copyout(&osb, uap->sb, sizeof(osb));
+ return (error);
+}
+#endif /* COMPAT_FREEBSD11 */
+
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstat_args {
+ int fd;
+ struct stat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+sys_fstat(struct thread *td, struct fstat_args *uap)
+{
+ struct stat ub;
+ int error;
+
+ error = kern_fstat(td, uap->fd, &ub);
+ if (error == 0)
+ error = copyout(&ub, uap->sb, sizeof(ub));
+ return (error);
+}
+
+int
+kern_fstat(struct thread *td, int fd, struct stat *sbp)
+{
+ struct file *fp;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+
+ error = fget(td, fd, &cap_fstat_rights, &fp);
+ if (error != 0)
+ return (error);
+
+ AUDIT_ARG_FILE(td->td_proc, fp);
+
+ error = fo_stat(fp, sbp, td->td_ucred, td);
+ fdrop(fp, td);
+#ifdef __STAT_TIME_T_EXT
+ if (error == 0) {
+ sbp->st_atim_ext = 0;
+ sbp->st_mtim_ext = 0;
+ sbp->st_ctim_ext = 0;
+ sbp->st_btim_ext = 0;
+ }
+#endif
+#ifdef KTRACE
+ if (error == 0 && KTRPOINT(td, KTR_STRUCT))
+ ktrstat(sbp);
+#endif
+ return (error);
+}
+
+#if defined(COMPAT_FREEBSD11)
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd11_nfstat_args {
+ int fd;
+ struct nstat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap)
+{
+ struct nstat nub;
+ struct stat ub;
+ int error;
+
+ error = kern_fstat(td, uap->fd, &ub);
+ if (error == 0) {
+ freebsd11_cvtnstat(&ub, &nub);
+ error = copyout(&nub, uap->sb, sizeof(nub));
+ }
+ return (error);
+}
+#endif /* COMPAT_FREEBSD11 */
+
+/*
+ * Return pathconf information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fpathconf_args {
+ int fd;
+ int name;
+};
+#endif
+/* ARGSUSED */
+int
+sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
+{
+ long value;
+ int error;
+
+ error = kern_fpathconf(td, uap->fd, uap->name, &value);
+ if (error == 0)
+ td->td_retval[0] = value;
+ return (error);
+}
+
+int
+kern_fpathconf(struct thread *td, int fd, int name, long *valuep)
+{
+ struct file *fp;
+ struct vnode *vp;
+ int error;
+
+ error = fget(td, fd, &cap_fpathconf_rights, &fp);
+ if (error != 0)
+ return (error);
+
+ if (name == _PC_ASYNC_IO) {
+ *valuep = _POSIX_ASYNCHRONOUS_IO;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp != NULL) {
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_PATHCONF(vp, name, valuep);
+ VOP_UNLOCK(vp, 0);
+ } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
+ if (name != _PC_PIPE_BUF) {
+ error = EINVAL;
+ } else {
+ *valuep = PIPE_BUF;
+ error = 0;
+ }
+ } else {
+ error = EOPNOTSUPP;
+ }
+out:
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Initialize filecaps structure.
+ */
+void
+filecaps_init(struct filecaps *fcaps)
+{
+
+ bzero(fcaps, sizeof(*fcaps));
+ fcaps->fc_nioctls = -1;
+}
+
+/*
+ * Copy filecaps structure allocating memory for ioctls array if needed.
+ *
+ * The last parameter indicates whether the fdtable is locked. If it is not and
+ * ioctls are encountered, copying fails and the caller must lock the table.
+ *
+ * Note that if the table was not locked, the caller has to check the relevant
+ * sequence counter to determine whether the operation was successful.
+ */
+bool
+filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
+{
+ size_t size;
+
+ if (src->fc_ioctls != NULL && !locked)
+ return (false);
+ memcpy(dst, src, sizeof(*src));
+ if (src->fc_ioctls == NULL)
+ return (true);
+
+ KASSERT(src->fc_nioctls > 0,
+ ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
+
+ size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
+ dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
+ memcpy(dst->fc_ioctls, src->fc_ioctls, size);
+ return (true);
+}
+
+static u_long *
+filecaps_copy_prep(const struct filecaps *src)
+{
+ u_long *ioctls;
+ size_t size;
+
+ if (src->fc_ioctls == NULL)
+ return (NULL);
+
+ KASSERT(src->fc_nioctls > 0,
+ ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
+
+ size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
+ ioctls = malloc(size, M_FILECAPS, M_WAITOK);
+ return (ioctls);
+}
+
+static void
+filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst,
+ u_long *ioctls)
+{
+ size_t size;
+
+ *dst = *src;
+ if (src->fc_ioctls == NULL) {
+ MPASS(ioctls == NULL);
+ return;
+ }
+
+ size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
+ dst->fc_ioctls = ioctls;
+ bcopy(src->fc_ioctls, dst->fc_ioctls, size);
+}
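+
+/*
+ * Illustrative sketch, not from the FreeBSD sources: the prep/finish
+ * split above lets the blocking M_WAITOK allocation happen before the
+ * filedesc lock is taken, leaving only cheap copies for the locked
+ * section (see its use in kern_dup()).  The fragment below shows the
+ * same shape with a pthread mutex standing in for the fd table lock;
+ * an allocation failure simply drops the ioctl list in this sketch.
+ */
+#if 0
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct demo_caps {
+	unsigned long *ioctls;
+	int nioctls;
+};
+
+static pthread_mutex_t demo_table_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static void
+demo_caps_copy(const struct demo_caps *src, struct demo_caps *dst)
+{
+	unsigned long *ioctls;
+
+	/* "prep": allocate before the lock is taken (may block). */
+	ioctls = NULL;
+	if (src->nioctls > 0)
+		ioctls = malloc(src->nioctls * sizeof(*ioctls));
+
+	/* "finish": only cheap, non-blocking work under the lock. */
+	pthread_mutex_lock(&demo_table_lock);
+	dst->ioctls = NULL;
+	dst->nioctls = 0;
+	if (ioctls != NULL) {
+		memcpy(ioctls, src->ioctls, src->nioctls * sizeof(*ioctls));
+		dst->ioctls = ioctls;
+		dst->nioctls = src->nioctls;
+	}
+	pthread_mutex_unlock(&demo_table_lock);
+}
+#endif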
+
+/*
+ * Move filecaps structure to the new place and clear the old place.
+ */
+void
+filecaps_move(struct filecaps *src, struct filecaps *dst)
+{
+
+ *dst = *src;
+ bzero(src, sizeof(*src));
+}
+
+/*
+ * Fill the given filecaps structure with full rights.
+ */
+static void
+filecaps_fill(struct filecaps *fcaps)
+{
+
+ CAP_ALL(&fcaps->fc_rights);
+ fcaps->fc_ioctls = NULL;
+ fcaps->fc_nioctls = -1;
+ fcaps->fc_fcntls = CAP_FCNTL_ALL;
+}
+
+/*
+ * Free memory allocated within filecaps structure.
+ */
+void
+filecaps_free(struct filecaps *fcaps)
+{
+
+ free(fcaps->fc_ioctls, M_FILECAPS);
+ bzero(fcaps, sizeof(*fcaps));
+}
+
+static u_long *
+filecaps_free_prep(struct filecaps *fcaps)
+{
+ u_long *ioctls;
+
+ ioctls = fcaps->fc_ioctls;
+ bzero(fcaps, sizeof(*fcaps));
+ return (ioctls);
+}
+
+static void
+filecaps_free_finish(u_long *ioctls)
+{
+
+ free(ioctls, M_FILECAPS);
+}
+
+/*
+ * Validate the given filecaps structure.
+ */
+static void
+filecaps_validate(const struct filecaps *fcaps, const char *func)
+{
+
+ KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
+ ("%s: invalid rights", func));
+ KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
+ ("%s: invalid fcntls", func));
+ KASSERT(fcaps->fc_fcntls == 0 ||
+ cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
+ ("%s: fcntls without CAP_FCNTL", func));
+ KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
+ (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
+ ("%s: invalid ioctls", func));
+ KASSERT(fcaps->fc_nioctls == 0 ||
+ cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
+ ("%s: ioctls without CAP_IOCTL", func));
+}
+
+static void
+fdgrowtable_exp(struct filedesc *fdp, int nfd)
+{
+ int nfd1;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ nfd1 = fdp->fd_nfiles * 2;
+ if (nfd1 < nfd)
+ nfd1 = nfd;
+ fdgrowtable(fdp, nfd1);
+}
+
+/*
+ * Grow the file table to accommodate (at least) nfd descriptors.
+ */
+static void
+fdgrowtable(struct filedesc *fdp, int nfd)
+{
+ struct filedesc0 *fdp0;
+ struct freetable *ft;
+ struct fdescenttbl *ntable;
+ struct fdescenttbl *otable;
+ int nnfiles, onfiles;
+ NDSLOTTYPE *nmap, *omap;
+
+ /*
+ * If lastfile is -1 this struct filedesc was just allocated and we are
+	 * growing it to accommodate the one we are going to copy from. There
+ * is no need to have a lock on this one as it's not visible to anyone.
+ */
+ if (fdp->fd_lastfile != -1)
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
+
+ /* save old values */
+ onfiles = fdp->fd_nfiles;
+ otable = fdp->fd_files;
+ omap = fdp->fd_map;
+
+ /* compute the size of the new table */
+ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
+ if (nnfiles <= onfiles)
+ /* the table is already large enough */
+ return;
+
+ /*
+ * Allocate a new table. We need enough space for the number of
+ * entries, file entries themselves and the struct freetable we will use
+ * when we decommission the table and place it on the freelist.
+ * We place the struct freetable in the middle so we don't have
+ * to worry about padding.
+ */
+ ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
+ nnfiles * sizeof(ntable->fdt_ofiles[0]) +
+ sizeof(struct freetable),
+ M_FILEDESC, M_ZERO | M_WAITOK);
+ /* copy the old data */
+ ntable->fdt_nfiles = nnfiles;
+ memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
+ onfiles * sizeof(ntable->fdt_ofiles[0]));
+
+ /*
+ * Allocate a new map only if the old is not large enough. It will
+ * grow at a slower rate than the table as it can map more
+ * entries than the table can hold.
+ */
+ if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
+ nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
+ M_ZERO | M_WAITOK);
+ /* copy over the old data and update the pointer */
+ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
+ fdp->fd_map = nmap;
+ }
+
+ /*
+ * Make sure that ntable is correctly initialized before we replace
+	 * fd_files pointer. Otherwise fget_unlocked() may see inconsistent
+ * data.
+ */
+ atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);
+
+ /*
+ * Do not free the old file table, as some threads may still
+ * reference entries within it. Instead, place it on a freelist
+ * which will be processed when the struct filedesc is released.
+ *
+ * Note that if onfiles == NDFILE, we're dealing with the original
+ * static allocation contained within (struct filedesc0 *)fdp,
+ * which must not be freed.
+ */
+ if (onfiles > NDFILE) {
+ ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
+ fdp0 = (struct filedesc0 *)fdp;
+ ft->ft_table = otable;
+ SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
+ }
+ /*
+	 * Unlike the table, the map cannot have lingering references from
+	 * other threads, so always free it as long as it does not reference
+	 * the original static allocation.
+ */
+ if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
+ free(omap, M_FILEDESC);
+}
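+
+/*
+ * Illustrative sketch, not from the FreeBSD sources: the two key ideas
+ * in fdgrowtable() are (1) publish the fully initialized table with a
+ * release store so lockless readers never observe a half-built one,
+ * and (2) never free the old table while readers may still hold a
+ * pointer to it; the old table is parked for deferred cleanup instead.
+ * A minimal C11 model of that pattern (writer lock assumed held):
+ */
+#if 0
+#include <stdatomic.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct demo_table {
+	int nfiles;
+	void *files[];			/* flexible array of entries */
+};
+
+static _Atomic(struct demo_table *) demo_current;
+
+static void
+demo_grow(struct demo_table **retired, int nfd)
+{
+	struct demo_table *ot, *nt;
+
+	*retired = NULL;
+	ot = atomic_load_explicit(&demo_current, memory_order_relaxed);
+	if (ot != NULL && ot->nfiles >= nfd)
+		return;
+	nt = calloc(1, sizeof(*nt) + nfd * sizeof(nt->files[0]));
+	if (nt == NULL)
+		return;
+	nt->nfiles = nfd;
+	if (ot != NULL)
+		memcpy(nt->files, ot->files, ot->nfiles * sizeof(nt->files[0]));
+	/* Release store: readers see a fully initialized table. */
+	atomic_store_explicit(&demo_current, nt, memory_order_release);
+	/* Hand the old table back for deferred freeing, never free here. */
+	*retired = ot;
+}
+#endif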
+
+/*
+ * Allocate a file descriptor for the process.
+ */
+int
+fdalloc(struct thread *td, int minfd, int *result)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = p->p_fd;
+ int fd, maxfd, allocfd;
+#ifdef RACCT
+ int error;
+#endif
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ if (fdp->fd_freefile > minfd)
+ minfd = fdp->fd_freefile;
+
+ maxfd = getmaxfd(td);
+
+ /*
+ * Search the bitmap for a free descriptor starting at minfd.
+ * If none is found, grow the file table.
+ */
+ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
+ if (fd >= maxfd)
+ return (EMFILE);
+ if (fd >= fdp->fd_nfiles) {
+ allocfd = min(fd * 2, maxfd);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(p);
+ error = racct_set(p, RACCT_NOFILE, allocfd);
+ PROC_UNLOCK(p);
+ if (error != 0)
+ return (EMFILE);
+ }
+#endif
+ /*
+ * fd is already equal to first free descriptor >= minfd, so
+ * we only need to grow the table and we are done.
+ */
+ fdgrowtable_exp(fdp, allocfd);
+ }
+
+ /*
+ * Perform some sanity checks, then mark the file descriptor as
+ * used and return it to the caller.
+ */
+ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
+ ("invalid descriptor %d", fd));
+ KASSERT(!fdisused(fdp, fd),
+ ("fd_first_free() returned non-free descriptor"));
+ KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
+ ("file descriptor isn't free"));
+ fdused(fdp, fd);
+ *result = fd;
+ return (0);
+}
+
+/*
+ * Allocate n file descriptors for the process.
+ */
+int
+fdallocn(struct thread *td, int minfd, int *fds, int n)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = p->p_fd;
+ int i;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ for (i = 0; i < n; i++)
+ if (fdalloc(td, 0, &fds[i]) != 0)
+ break;
+
+ if (i < n) {
+ for (i--; i >= 0; i--)
+ fdunused(fdp, fds[i]);
+ return (EMFILE);
+ }
+
+ return (0);
+}
+
+/*
+ * Create a new open file structure and allocate a file descriptor for the
+ * process that refers to it. We add one reference to the file for the
+ * descriptor table and one reference for resultfp. This is to prevent us
+ * from being preempted and having the entry in the descriptor table closed
+ * after we release the FILEDESC lock.
+ */
+int
+falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
+ struct filecaps *fcaps)
+{
+ struct file *fp;
+ int error, fd;
+
+ error = falloc_noinstall(td, &fp);
+ if (error)
+ return (error); /* no reference held on error */
+
+ error = finstall(td, fp, &fd, flags, fcaps);
+ if (error) {
+ fdrop(fp, td); /* one reference (fp only) */
+ return (error);
+ }
+
+ if (resultfp != NULL)
+ *resultfp = fp; /* copy out result */
+ else
+ fdrop(fp, td); /* release local reference */
+
+ if (resultfd != NULL)
+ *resultfd = fd;
+
+ return (0);
+}
+
+/*
+ * Create a new open file structure without allocating a file descriptor.
+ */
+int
+falloc_noinstall(struct thread *td, struct file **resultfp)
+{
+ struct file *fp;
+ int maxuserfiles = maxfiles - (maxfiles / 20);
+ int openfiles_new;
+ static struct timeval lastfail;
+ static int curfail;
+
+ KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
+
+ openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
+ if ((openfiles_new >= maxuserfiles &&
+ priv_check(td, PRIV_MAXFILES) != 0) ||
+ openfiles_new >= maxfiles) {
+ atomic_subtract_int(&openfiles, 1);
+ if (ppsratecheck(&lastfail, &curfail, 1)) {
+ printf("kern.maxfiles limit exceeded by uid %i, (%s) "
+ "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm);
+ }
+ return (ENFILE);
+ }
+ fp = uma_zalloc(file_zone, M_WAITOK);
+ bzero(fp, sizeof(*fp));
+ refcount_init(&fp->f_count, 1);
+ fp->f_cred = crhold(td->td_ucred);
+ fp->f_ops = &badfileops;
+ *resultfp = fp;
+ return (0);
+}
+
+/*
+ * Install a file in a file descriptor table.
+ */
+void
+_finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
+ struct filecaps *fcaps)
+{
+ struct filedescent *fde;
+
+ MPASS(fp != NULL);
+ if (fcaps != NULL)
+ filecaps_validate(fcaps, __func__);
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ fde = &fdp->fd_ofiles[fd];
+#ifdef CAPABILITIES
+ seq_write_begin(&fde->fde_seq);
+#endif
+ fde->fde_file = fp;
+ fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0;
+ if (fcaps != NULL)
+ filecaps_move(fcaps, &fde->fde_caps);
+ else
+ filecaps_fill(&fde->fde_caps);
+#ifdef CAPABILITIES
+ seq_write_end(&fde->fde_seq);
+#endif
+}
+
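+/*
+ * Allocate a descriptor and install fp in it, taking an extra reference on
+ * the file and holding the exclusive filedesc lock for the duration.  On
+ * failure no new reference is left behind.
+ */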
+int
+finstall(struct thread *td, struct file *fp, int *fd, int flags,
+ struct filecaps *fcaps)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ int error;
+
+ MPASS(fd != NULL);
+
+ if (!fhold(fp))
+ return (EBADF);
+ FILEDESC_XLOCK(fdp);
+ if ((error = fdalloc(td, 0, fd))) {
+ FILEDESC_XUNLOCK(fdp);
+ fdrop(fp, td);
+ return (error);
+ }
+ _finstall(fdp, fp, *fd, flags, fcaps);
+ FILEDESC_XUNLOCK(fdp);
+ return (0);
+}
+
+/*
+ * Build a new filedesc structure from another.
+ * Copy the current, root, and jail root vnode references.
+ *
+ * If fdp is not NULL and prepfiles is true, return with fdp shared locked.
+ */
+struct filedesc *
+fdinit(struct filedesc *fdp, bool prepfiles)
+{
+ struct filedesc0 *newfdp0;
+ struct filedesc *newfdp;
+
+ newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
+ newfdp = &newfdp0->fd_fd;
+
+ /* Create the file descriptor table. */
+ FILEDESC_LOCK_INIT(newfdp);
+ refcount_init(&newfdp->fd_refcnt, 1);
+ refcount_init(&newfdp->fd_holdcnt, 1);
+ newfdp->fd_cmask = CMASK;
+ newfdp->fd_map = newfdp0->fd_dmap;
+ newfdp->fd_lastfile = -1;
+ newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
+ newfdp->fd_files->fdt_nfiles = NDFILE;
+
+ if (fdp == NULL)
+ return (newfdp);
+
+ if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles)
+ fdgrowtable(newfdp, fdp->fd_lastfile + 1);
+
+ FILEDESC_SLOCK(fdp);
+ newfdp->fd_cdir = fdp->fd_cdir;
+ if (newfdp->fd_cdir)
+ vrefact(newfdp->fd_cdir);
+ newfdp->fd_rdir = fdp->fd_rdir;
+ if (newfdp->fd_rdir)
+ vrefact(newfdp->fd_rdir);
+ newfdp->fd_jdir = fdp->fd_jdir;
+ if (newfdp->fd_jdir)
+ vrefact(newfdp->fd_jdir);
+
+ if (!prepfiles) {
+ FILEDESC_SUNLOCK(fdp);
+ } else {
+ while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
+ FILEDESC_SUNLOCK(fdp);
+ fdgrowtable(newfdp, fdp->fd_lastfile + 1);
+ FILEDESC_SLOCK(fdp);
+ }
+ }
+
+ return (newfdp);
+}
+
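+/*
+ * Obtain a hold on a process's filedesc structure so that it cannot be
+ * freed while it is being examined.  The process must be locked; NULL is
+ * returned if the process has no descriptor table.
+ */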
+static struct filedesc *
+fdhold(struct proc *p)
+{
+ struct filedesc *fdp;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ fdp = p->p_fd;
+ if (fdp != NULL)
+ refcount_acquire(&fdp->fd_holdcnt);
+ return (fdp);
+}
+
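+/*
+ * Release a hold obtained with fdhold(); the structure is destroyed once
+ * the last hold is dropped.
+ */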
+static void
+fddrop(struct filedesc *fdp)
+{
+
+ if (fdp->fd_holdcnt > 1) {
+ if (refcount_release(&fdp->fd_holdcnt) == 0)
+ return;
+ }
+
+ FILEDESC_LOCK_DESTROY(fdp);
+ uma_zfree(filedesc0_zone, fdp);
+}
+
+/*
+ * Share a filedesc structure.
+ */
+struct filedesc *
+fdshare(struct filedesc *fdp)
+{
+
+ refcount_acquire(&fdp->fd_refcnt);
+ return (fdp);
+}
+
+/*
+ * Unshare a filedesc structure, if necessary by making a copy
+ */
+void
+fdunshare(struct thread *td)
+{
+ struct filedesc *tmp;
+ struct proc *p = td->td_proc;
+
+ if (p->p_fd->fd_refcnt == 1)
+ return;
+
+ tmp = fdcopy(p->p_fd);
+ fdescfree(td);
+ p->p_fd = tmp;
+}
+
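+/*
+ * Install a pre-built file descriptor table for the calling thread's
+ * process, releasing the previous one.
+ */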
+void
+fdinstall_remapped(struct thread *td, struct filedesc *fdp)
+{
+
+ fdescfree(td);
+ td->td_proc->p_fd = fdp;
+}
+
+/*
+ * Copy a filedesc structure, duplicating only the passable descriptors.
+ * The source table must not be NULL.
+ */
+struct filedesc *
+fdcopy(struct filedesc *fdp)
+{
+ struct filedesc *newfdp;
+ struct filedescent *nfde, *ofde;
+ int i;
+
+ MPASS(fdp != NULL);
+
+ newfdp = fdinit(fdp, true);
+ /* copy all passable descriptors (i.e. not kqueue) */
+ newfdp->fd_freefile = -1;
+ for (i = 0; i <= fdp->fd_lastfile; ++i) {
+ ofde = &fdp->fd_ofiles[i];
+ if (ofde->fde_file == NULL ||
+ (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 ||
+ !fhold(ofde->fde_file)) {
+ if (newfdp->fd_freefile == -1)
+ newfdp->fd_freefile = i;
+ continue;
+ }
+ nfde = &newfdp->fd_ofiles[i];
+ *nfde = *ofde;
+ filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
+ fdused_init(newfdp, i);
+ newfdp->fd_lastfile = i;
+ }
+ if (newfdp->fd_freefile == -1)
+ newfdp->fd_freefile = i;
+ newfdp->fd_cmask = fdp->fd_cmask;
+ FILEDESC_SUNLOCK(fdp);
+ return (newfdp);
+}
+
+/*
+ * Copies a filedesc structure, while remapping all file descriptors
+ * stored inside using a translation table.
+ *
+ * File descriptors are copied over to the new file descriptor table,
+ * regardless of whether the close-on-exec flag is set.
+ */
+int
+fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds,
+ struct filedesc **ret)
+{
+ struct filedesc *newfdp;
+ struct filedescent *nfde, *ofde;
+ int error, i;
+
+ MPASS(fdp != NULL);
+
+ newfdp = fdinit(fdp, true);
+ if (nfds > fdp->fd_lastfile + 1) {
+ /* New table cannot be larger than the old one. */
+ error = E2BIG;
+ goto bad;
+ }
+ /* Copy all passable descriptors (i.e. not kqueue). */
+ newfdp->fd_freefile = nfds;
+ for (i = 0; i < nfds; ++i) {
+ if (fds[i] < 0 || fds[i] > fdp->fd_lastfile) {
+ /* File descriptor out of bounds. */
+ error = EBADF;
+ goto bad;
+ }
+ ofde = &fdp->fd_ofiles[fds[i]];
+ if (ofde->fde_file == NULL) {
+ /* Unused file descriptor. */
+ error = EBADF;
+ goto bad;
+ }
+ if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
+ /* File descriptor cannot be passed. */
+ error = EINVAL;
+ goto bad;
+ }
+ if (!fhold(ofde->fde_file)) {
+ error = EBADF;
+ goto bad;
+ }
+ nfde = &newfdp->fd_ofiles[i];
+ *nfde = *ofde;
+ filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
+ fdused_init(newfdp, i);
+ newfdp->fd_lastfile = i;
+ }
+ newfdp->fd_cmask = fdp->fd_cmask;
+ FILEDESC_SUNLOCK(fdp);
+ *ret = newfdp;
+ return (0);
+bad:
+ FILEDESC_SUNLOCK(fdp);
+ fdescfree_remapped(newfdp);
+ return (error);
+}
+
+/*
+ * Clear POSIX style locks. This is only used when fdp loses a reference (i.e.
+ * one of the processes using it exits) and the table used to be shared.
+ */
+static void
+fdclearlocks(struct thread *td)
+{
+ struct filedesc *fdp;
+ struct filedesc_to_leader *fdtol;
+ struct flock lf;
+ struct file *fp;
+ struct proc *p;
+ struct vnode *vp;
+ int i;
+
+ p = td->td_proc;
+ fdp = p->p_fd;
+ fdtol = p->p_fdtol;
+ MPASS(fdtol != NULL);
+
+ FILEDESC_XLOCK(fdp);
+ KASSERT(fdtol->fdl_refcount > 0,
+ ("filedesc_to_refcount botch: fdl_refcount=%d",
+ fdtol->fdl_refcount));
+ if (fdtol->fdl_refcount == 1 &&
+ (p->p_leader->p_flag & P_ADVLOCK) != 0) {
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ fp = fdp->fd_ofiles[i].fde_file;
+ if (fp == NULL || fp->f_type != DTYPE_VNODE ||
+ !fhold(fp))
+ continue;
+ FILEDESC_XUNLOCK(fdp);
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ vp = fp->f_vnode;
+ (void) VOP_ADVLOCK(vp,
+ (caddr_t)p->p_leader, F_UNLCK,
+ &lf, F_POSIX);
+ FILEDESC_XLOCK(fdp);
+ fdrop(fp, td);
+ }
+ }
+retry:
+ if (fdtol->fdl_refcount == 1) {
+ if (fdp->fd_holdleaderscount > 0 &&
+ (p->p_leader->p_flag & P_ADVLOCK) != 0) {
+ /*
+ * close() or kern_dup() has cleared a reference
+ * in a shared file descriptor table.
+ */
+ fdp->fd_holdleaderswakeup = 1;
+ sx_sleep(&fdp->fd_holdleaderscount,
+ FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
+ goto retry;
+ }
+ if (fdtol->fdl_holdcount > 0) {
+ /*
+ * Ensure that fdtol->fdl_leader remains
+ * valid in closef().
+ */
+ fdtol->fdl_wakeup = 1;
+ sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
+ "fdlhold", 0);
+ goto retry;
+ }
+ }
+ fdtol->fdl_refcount--;
+ if (fdtol->fdl_refcount == 0 &&
+ fdtol->fdl_holdcount == 0) {
+ fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
+ fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
+ } else
+ fdtol = NULL;
+ p->p_fdtol = NULL;
+ FILEDESC_XUNLOCK(fdp);
+ if (fdtol != NULL)
+ free(fdtol, M_FILEDESC_TO_LEADER);
+}
+
+/*
+ * Release a filedesc structure.
+ */
+static void
+fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose)
+{
+ struct filedesc0 *fdp0;
+ struct freetable *ft, *tft;
+ struct filedescent *fde;
+ struct file *fp;
+ int i;
+
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ fde = &fdp->fd_ofiles[i];
+ fp = fde->fde_file;
+ if (fp != NULL) {
+ fdefree_last(fde);
+ if (needclose)
+ (void) closef(fp, td);
+ else
+ fdrop(fp, td);
+ }
+ }
+
+ if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
+ free(fdp->fd_map, M_FILEDESC);
+ if (fdp->fd_nfiles > NDFILE)
+ free(fdp->fd_files, M_FILEDESC);
+
+ fdp0 = (struct filedesc0 *)fdp;
+ SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
+ free(ft->ft_table, M_FILEDESC);
+
+ fddrop(fdp);
+}
+
+void
+fdescfree(struct thread *td)
+{
+ struct proc *p;
+ struct filedesc *fdp;
+ struct vnode *cdir, *jdir, *rdir;
+
+ p = td->td_proc;
+ fdp = p->p_fd;
+ MPASS(fdp != NULL);
+
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(p);
+ racct_set(p, RACCT_NOFILE, 0);
+ PROC_UNLOCK(p);
+ }
+#endif
+
+ if (p->p_fdtol != NULL)
+ fdclearlocks(td);
+
+ PROC_LOCK(p);
+ p->p_fd = NULL;
+ PROC_UNLOCK(p);
+
+ if (refcount_release(&fdp->fd_refcnt) == 0)
+ return;
+
+ FILEDESC_XLOCK(fdp);
+ cdir = fdp->fd_cdir;
+ fdp->fd_cdir = NULL;
+ rdir = fdp->fd_rdir;
+ fdp->fd_rdir = NULL;
+ jdir = fdp->fd_jdir;
+ fdp->fd_jdir = NULL;
+ FILEDESC_XUNLOCK(fdp);
+
+ if (cdir != NULL)
+ vrele(cdir);
+ if (rdir != NULL)
+ vrele(rdir);
+ if (jdir != NULL)
+ vrele(jdir);
+
+ fdescfree_fds(td, fdp, 1);
+}
+
+void
+fdescfree_remapped(struct filedesc *fdp)
+{
+
+ if (fdp->fd_cdir != NULL)
+ vrele(fdp->fd_cdir);
+ if (fdp->fd_rdir != NULL)
+ vrele(fdp->fd_rdir);
+ if (fdp->fd_jdir != NULL)
+ vrele(fdp->fd_jdir);
+
+ fdescfree_fds(curthread, fdp, 0);
+}
+
+/*
+ * For setugid programs, we don't want people to use that setugidness
+ * to generate error messages which write to a file which would
+ * otherwise be off-limits to the process. We check for filesystems where
+ * the vnode can change out from under us after execve (like [lin]procfs).
+ *
+ * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
+ * sufficient. We also don't check for setugidness since we know we are.
+ */
+static bool
+is_unsafe(struct file *fp)
+{
+ struct vnode *vp;
+
+ if (fp->f_type != DTYPE_VNODE)
+ return (false);
+
+ vp = fp->f_vnode;
+ return ((vp->v_vflag & VV_PROCDEP) != 0);
+}
+
+/*
+ * Make this setugid thing safe, if at all possible.
+ */
+void
+fdsetugidsafety(struct thread *td)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ int i;
+
+ fdp = td->td_proc->p_fd;
+ KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
+ MPASS(fdp->fd_nfiles >= 3);
+ for (i = 0; i <= 2; i++) {
+ fp = fdp->fd_ofiles[i].fde_file;
+ if (fp != NULL && is_unsafe(fp)) {
+ FILEDESC_XLOCK(fdp);
+ knote_fdclose(td, i);
+ /*
+ * NULL-out descriptor prior to close to avoid
+ * a race while close blocks.
+ */
+ fdfree(fdp, i);
+ FILEDESC_XUNLOCK(fdp);
+ (void) closef(fp, td);
+ }
+ }
+}
+
+/*
+ * If a specific file object occupies a specific file descriptor, close the
+ * file descriptor entry and drop a reference on the file object. This is a
+ * convenience function for handling a subsequent error in a function that
+ * calls falloc(); it copes with the race in which another thread may have
+ * closed the file descriptor out from under the thread creating the file
+ * object.
+ */
+void
+fdclose(struct thread *td, struct file *fp, int idx)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+
+ FILEDESC_XLOCK(fdp);
+ if (fdp->fd_ofiles[idx].fde_file == fp) {
+ fdfree(fdp, idx);
+ FILEDESC_XUNLOCK(fdp);
+ fdrop(fp, td);
+ } else
+ FILEDESC_XUNLOCK(fdp);
+}
+
+/*
+ * Close any files on exec?
+ */
+void
+fdcloseexec(struct thread *td)
+{
+ struct filedesc *fdp;
+ struct filedescent *fde;
+ struct file *fp;
+ int i;
+
+ fdp = td->td_proc->p_fd;
+ KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ fde = &fdp->fd_ofiles[i];
+ fp = fde->fde_file;
+ if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
+ (fde->fde_flags & UF_EXCLOSE))) {
+ FILEDESC_XLOCK(fdp);
+ fdfree(fdp, i);
+ (void) closefp(fdp, i, fp, td, 0);
+ FILEDESC_UNLOCK_ASSERT(fdp);
+ }
+ }
+}
+
+/*
+ * It is unsafe for set[ug]id processes to be started with file
+ * descriptors 0..2 closed, as these descriptors are given implicit
+ * significance in the Standard C library. fdcheckstd() will create a
+ * descriptor referencing /dev/null for each of stdin, stdout, and
+ * stderr that is not already open.
+ */
+int
+fdcheckstd(struct thread *td)
+{
+ struct filedesc *fdp;
+ register_t save;
+ int i, error, devnull;
+
+ fdp = td->td_proc->p_fd;
+ KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
+ MPASS(fdp->fd_nfiles >= 3);
+ devnull = -1;
+ for (i = 0; i <= 2; i++) {
+ if (fdp->fd_ofiles[i].fde_file != NULL)
+ continue;
+
+ save = td->td_retval[0];
+ if (devnull != -1) {
+ error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
+ } else {
+ error = kern_openat(td, AT_FDCWD, "/dev/null",
+ UIO_SYSSPACE, O_RDWR, 0);
+ if (error == 0) {
+ devnull = td->td_retval[0];
+ KASSERT(devnull == i, ("we didn't get our fd"));
+ }
+ }
+ td->td_retval[0] = save;
+ if (error != 0)
+ return (error);
+ }
+ return (0);
+}
+
+/*
+ * Internal form of close. Decrement reference count on file structure.
+ * Note: td may be NULL when closing a file that was being passed in a
+ * message.
+ */
+int
+closef(struct file *fp, struct thread *td)
+{
+ struct vnode *vp;
+ struct flock lf;
+ struct filedesc_to_leader *fdtol;
+ struct filedesc *fdp;
+
+ /*
+ * POSIX record locking dictates that any close releases ALL
+ * locks owned by this process. This is handled by setting
+ * a flag in the unlock to free ONLY locks obeying POSIX
+ * semantics, and not to free BSD-style file locks.
+ * If the descriptor was in a message, POSIX-style locks
+ * aren't passed with the descriptor, and the thread pointer
+ * will be NULL. Callers should be careful only to pass a
+ * NULL thread pointer when there really is no owning
+ * context that might have locks, or the locks will be
+ * leaked.
+ */
+ if (fp->f_type == DTYPE_VNODE && td != NULL) {
+ vp = fp->f_vnode;
+ if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
+ F_UNLCK, &lf, F_POSIX);
+ }
+ fdtol = td->td_proc->p_fdtol;
+ if (fdtol != NULL) {
+ /*
+ * Handle special case where file descriptor table is
+ * shared between multiple process leaders.
+ */
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+ for (fdtol = fdtol->fdl_next;
+ fdtol != td->td_proc->p_fdtol;
+ fdtol = fdtol->fdl_next) {
+ if ((fdtol->fdl_leader->p_flag &
+ P_ADVLOCK) == 0)
+ continue;
+ fdtol->fdl_holdcount++;
+ FILEDESC_XUNLOCK(fdp);
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ vp = fp->f_vnode;
+ (void) VOP_ADVLOCK(vp,
+ (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
+ F_POSIX);
+ FILEDESC_XLOCK(fdp);
+ fdtol->fdl_holdcount--;
+ if (fdtol->fdl_holdcount == 0 &&
+ fdtol->fdl_wakeup != 0) {
+ fdtol->fdl_wakeup = 0;
+ wakeup(fdtol);
+ }
+ }
+ FILEDESC_XUNLOCK(fdp);
+ }
+ }
+ return (fdrop(fp, td));
+}
+
+/*
+ * Initialize the file pointer with the specified properties.
+ *
+ * The ops are set with release semantics to be certain that the flags, type,
+ * and data are visible when ops is. This is to prevent ops methods from being
+ * called with bad data.
+ */
+void
+finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
+{
+ fp->f_data = data;
+ fp->f_flag = flag;
+ fp->f_type = type;
+ atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
+}
+
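+/*
+ * Look up a descriptor with the filedesc lock held.  The requested rights
+ * are checked against the descriptor's capabilities (when CAPABILITIES is
+ * configured) and, if havecapsp is not NULL, a copy of the capabilities is
+ * returned.  No reference is acquired on the file.
+ */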
+int
+fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
+ struct file **fpp, struct filecaps *havecapsp)
+{
+ struct filedescent *fde;
+ int error;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ fde = fdeget_locked(fdp, fd);
+ if (fde == NULL) {
+ error = EBADF;
+ goto out;
+ }
+
+#ifdef CAPABILITIES
+ error = cap_check(cap_rights_fde_inline(fde), needrightsp);
+ if (error != 0)
+ goto out;
+#endif
+
+ if (havecapsp != NULL)
+ filecaps_copy(&fde->fde_caps, havecapsp, true);
+
+ *fpp = fde->fde_file;
+
+ error = 0;
+out:
+ return (error);
+}
+
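+/*
+ * Lockless counterpart of fget_cap_locked(): returns a held reference on
+ * the file together with, if requested, a consistent snapshot of its
+ * capabilities, retrying or falling back to the locked path as needed.
+ */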
+int
+fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
+ struct file **fpp, struct filecaps *havecapsp)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ int error;
+#ifndef CAPABILITIES
+ error = fget_unlocked(fdp, fd, needrightsp, fpp, NULL);
+ if (error == 0 && havecapsp != NULL)
+ filecaps_fill(havecapsp);
+#else
+ struct file *fp;
+ seq_t seq;
+
+ for (;;) {
+ error = fget_unlocked(fdp, fd, needrightsp, &fp, &seq);
+ if (error != 0)
+ return (error);
+
+ if (havecapsp != NULL) {
+ if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps,
+ havecapsp, false)) {
+ fdrop(fp, td);
+ goto get_locked;
+ }
+ }
+
+ if (!fd_modified(fdp, fd, seq))
+ break;
+ fdrop(fp, td);
+ }
+
+ *fpp = fp;
+ return (0);
+
+get_locked:
+ FILEDESC_SLOCK(fdp);
+ error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp);
+ if (error == 0 && !fhold(*fpp))
+ error = EBADF;
+ FILEDESC_SUNLOCK(fdp);
+#endif
+ return (error);
+}
+
+int
+fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
+ struct file **fpp, seq_t *seqp)
+{
+#ifdef CAPABILITIES
+ const struct filedescent *fde;
+#endif
+ const struct fdescenttbl *fdt;
+ struct file *fp;
+ u_int count;
+#ifdef CAPABILITIES
+ seq_t seq;
+ cap_rights_t haverights;
+ int error;
+#endif
+
+ fdt = fdp->fd_files;
+ if ((u_int)fd >= fdt->fdt_nfiles)
+ return (EBADF);
+ /*
+ * Fetch the descriptor locklessly. We avoid fdrop() races by
+ * never bumping a refcount that has already dropped to 0. To
+ * accomplish this we have to use a cmpset loop rather than an
+ * atomic_add. The descriptor
+ * must be re-verified once we acquire a reference to be certain
+ * that the identity is still correct and we did not lose a race
+ * due to preemption.
+ */
+ for (;;) {
+#ifdef CAPABILITIES
+ seq = seq_load(fd_seq(fdt, fd));
+ fde = &fdt->fdt_ofiles[fd];
+ haverights = *cap_rights_fde_inline(fde);
+ fp = fde->fde_file;
+ if (!seq_consistent(fd_seq(fdt, fd), seq))
+ continue;
+#else
+ fp = fdt->fdt_ofiles[fd].fde_file;
+#endif
+ if (fp == NULL)
+ return (EBADF);
+#ifdef CAPABILITIES
+ error = cap_check(&haverights, needrightsp);
+ if (error != 0)
+ return (error);
+#endif
+ count = fp->f_count;
+ retry:
+ if (count == 0) {
+ /*
+ * Force a reload. Another thread could reallocate the
+ * table before this fd was closed, so it is possible that
+ * there is a stale fp pointer in the cached version.
+ */
+ fdt = *(const struct fdescenttbl * const volatile *)
+ &(fdp->fd_files);
+ continue;
+ }
+ if (__predict_false(count + 1 < count))
+ return (EBADF);
+
+ /*
+ * Use an acquire barrier to force re-reading of fdt so it is
+ * refreshed for verification.
+ */
+ if (__predict_false(atomic_fcmpset_acq_int(&fp->f_count,
+ &count, count + 1) == 0))
+ goto retry;
+ fdt = fdp->fd_files;
+#ifdef CAPABILITIES
+ if (seq_consistent_nomb(fd_seq(fdt, fd), seq))
+#else
+ if (fp == fdt->fdt_ofiles[fd].fde_file)
+#endif
+ break;
+ fdrop(fp, curthread);
+ }
+ *fpp = fp;
+ if (seqp != NULL) {
+#ifdef CAPABILITIES
+ *seqp = seq;
+#endif
+ }
+ return (0);
+}
+
+/*
+ * Extract the file pointer associated with the specified descriptor for the
+ * current user process.
+ *
+ * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
+ * returned.
+ *
+ * File's rights will be checked against the capability rights mask.
+ *
+ * If an error occurred the non-zero error is returned and *fpp is set to
+ * NULL. Otherwise *fpp is held and set and zero is returned. Caller is
+ * responsible for fdrop().
+ */
+static __inline int
+_fget(struct thread *td, int fd, struct file **fpp, int flags,
+ cap_rights_t *needrightsp, seq_t *seqp)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ int error;
+
+ *fpp = NULL;
+ fdp = td->td_proc->p_fd;
+ error = fget_unlocked(fdp, fd, needrightsp, &fp, seqp);
+ if (error != 0)
+ return (error);
+ if (fp->f_ops == &badfileops) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+
+ /*
+ * FREAD and FWRITE failure return EBADF as per POSIX.
+ */
+ error = 0;
+ switch (flags) {
+ case FREAD:
+ case FWRITE:
+ if ((fp->f_flag & flags) == 0)
+ error = EBADF;
+ break;
+ case FEXEC:
+ if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
+ ((fp->f_flag & FWRITE) != 0))
+ error = EBADF;
+ break;
+ case 0:
+ break;
+ default:
+ KASSERT(0, ("wrong flags"));
+ }
+
+ if (error != 0) {
+ fdrop(fp, td);
+ return (error);
+ }
+
+ *fpp = fp;
+ return (0);
+}
+
+int
+fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+
+ return (_fget(td, fd, fpp, 0, rightsp, NULL));
+}
+
+int
+fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
+ struct file **fpp)
+{
+ int error;
+#ifndef CAPABILITIES
+ error = _fget(td, fd, fpp, 0, rightsp, NULL);
+ if (maxprotp != NULL)
+ *maxprotp = VM_PROT_ALL;
+#else
+ cap_rights_t fdrights;
+ struct filedesc *fdp = td->td_proc->p_fd;
+ seq_t seq;
+
+ MPASS(cap_rights_is_set(rightsp, CAP_MMAP));
+ for (;;) {
+ error = _fget(td, fd, fpp, 0, rightsp, &seq);
+ if (error != 0)
+ return (error);
+ if (maxprotp != NULL)
+ fdrights = *cap_rights(fdp, fd);
+ if (!fd_modified(fdp, fd, seq))
+ break;
+ fdrop(*fpp, td);
+ }
+
+ /*
+ * If requested, convert capability rights to access flags.
+ */
+ if (maxprotp != NULL)
+ *maxprotp = cap_rights_to_vmprot(&fdrights);
+#endif
+ return (error);
+}
+
+int
+fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+
+ return (_fget(td, fd, fpp, FREAD, rightsp, NULL));
+}
+
+int
+fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+
+ return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
+}
+
+int
+fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl,
+ struct file **fpp)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+#ifndef CAPABILITIES
+ return (fget_unlocked(fdp, fd, rightsp, fpp, NULL));
+#else
+ int error;
+ seq_t seq;
+
+ MPASS(cap_rights_is_set(rightsp, CAP_FCNTL));
+ for (;;) {
+ error = fget_unlocked(fdp, fd, rightsp, fpp, &seq);
+ if (error != 0)
+ return (error);
+ error = cap_fcntl_check(fdp, fd, needfcntl);
+ if (!fd_modified(fdp, fd, seq))
+ break;
+ fdrop(*fpp, td);
+ }
+ if (error != 0) {
+ fdrop(*fpp, td);
+ *fpp = NULL;
+ }
+ return (error);
+#endif
+}
+
+/*
+ * Like fget() but loads the underlying vnode, or returns an error if the
+ * descriptor does not represent a vnode. Note that pipes use vnodes but
+ * never have VM objects. The returned vnode will be vref()'d.
+ *
+ * XXX: what about the unused flags?
+ */
+static __inline int
+_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
+ struct vnode **vpp)
+{
+ struct file *fp;
+ int error;
+
+ *vpp = NULL;
+ error = _fget(td, fd, &fp, flags, needrightsp, NULL);
+ if (error != 0)
+ return (error);
+ if (fp->f_vnode == NULL) {
+ error = EINVAL;
+ } else {
+ *vpp = fp->f_vnode;
+ vrefact(*vpp);
+ }
+ fdrop(fp, td);
+
+ return (error);
+}
+
+int
+fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
+{
+
+ return (_fgetvp(td, fd, 0, rightsp, vpp));
+}
+
+int
+fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
+ struct filecaps *havecaps, struct vnode **vpp)
+{
+ struct filedesc *fdp;
+ struct filecaps caps;
+ struct file *fp;
+ int error;
+
+ fdp = td->td_proc->p_fd;
+ error = fget_cap_locked(fdp, fd, needrightsp, &fp, &caps);
+ if (error != 0)
+ return (error);
+ if (fp->f_ops == &badfileops) {
+ error = EBADF;
+ goto out;
+ }
+ if (fp->f_vnode == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ *havecaps = caps;
+ *vpp = fp->f_vnode;
+ vrefact(*vpp);
+
+ return (0);
+out:
+ filecaps_free(&caps);
+ return (error);
+}
+
+int
+fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
+{
+
+ return (_fgetvp(td, fd, FREAD, rightsp, vpp));
+}
+
+int
+fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
+{
+
+ return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
+}
+
+#ifdef notyet
+int
+fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
+ struct vnode **vpp)
+{
+
+ return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
+}
+#endif
+
+/*
+ * Handle the last reference to a file being closed.
+ *
+ * Without the noinline attribute clang keeps inlining the function
+ * throughout this file when fdrop is used.
+ */
+int __noinline
+_fdrop(struct file *fp, struct thread *td)
+{
+ int error;
+
+ if (fp->f_count != 0)
+ panic("fdrop: count %d", fp->f_count);
+ error = fo_close(fp, td);
+ atomic_subtract_int(&openfiles, 1);
+ crfree(fp->f_cred);
+ free(fp->f_advice, M_FADVISE);
+ uma_zfree(file_zone, fp);
+
+ return (error);
+}
+
+/*
+ * Apply an advisory lock on a file descriptor.
+ *
+ * Just attempt to get a record lock of the requested type on the entire file
+ * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct flock_args {
+ int fd;
+ int how;
+};
+#endif
+/* ARGSUSED */
+int
+sys_flock(struct thread *td, struct flock_args *uap)
+{
+ struct file *fp;
+ struct vnode *vp;
+ struct flock lf;
+ int error;
+
+ error = fget(td, uap->fd, &cap_flock_rights, &fp);
+ if (error != 0)
+ return (error);
+ if (fp->f_type != DTYPE_VNODE) {
+ fdrop(fp, td);
+ return (EOPNOTSUPP);
+ }
+
+ vp = fp->f_vnode;
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (uap->how & LOCK_UN) {
+ lf.l_type = F_UNLCK;
+ atomic_clear_int(&fp->f_flag, FHASLOCK);
+ error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
+ goto done2;
+ }
+ if (uap->how & LOCK_EX)
+ lf.l_type = F_WRLCK;
+ else if (uap->how & LOCK_SH)
+ lf.l_type = F_RDLCK;
+ else {
+ error = EBADF;
+ goto done2;
+ }
+ atomic_set_int(&fp->f_flag, FHASLOCK);
+ error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
+ (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
+done2:
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Duplicate the specified descriptor to a free descriptor.
+ */
+int
+dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
+ int openerror, int *indxp)
+{
+ struct filedescent *newfde, *oldfde;
+ struct file *fp;
+ u_long *ioctls;
+ int error, indx;
+
+ KASSERT(openerror == ENODEV || openerror == ENXIO,
+ ("unexpected error %d in %s", openerror, __func__));
+
+ /*
+ * If the to-be-dup'd fd number is greater than the allowed number
+ * of file descriptors, or the fd to be dup'd has already been
+ * closed, then reject.
+ */
+ FILEDESC_XLOCK(fdp);
+ if ((fp = fget_locked(fdp, dfd)) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+
+ error = fdalloc(td, 0, &indx);
+ if (error != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ return (error);
+ }
+
+ /*
+ * There are two cases of interest here.
+ *
+ * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
+ *
+ * For ENXIO steal away the file structure from (dfd) and store it in
+ * (indx). (dfd) is effectively closed by this operation.
+ */
+ switch (openerror) {
+ case ENODEV:
+ /*
+ * Check that the mode the file is being opened for is a
+ * subset of the mode of the existing descriptor.
+ */
+ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
+ fdunused(fdp, indx);
+ FILEDESC_XUNLOCK(fdp);
+ return (EACCES);
+ }
+ if (!fhold(fp)) {
+ fdunused(fdp, indx);
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+ newfde = &fdp->fd_ofiles[indx];
+ oldfde = &fdp->fd_ofiles[dfd];
+ ioctls = filecaps_copy_prep(&oldfde->fde_caps);
+#ifdef CAPABILITIES
+ seq_write_begin(&newfde->fde_seq);
+#endif
+ memcpy(newfde, oldfde, fde_change_size);
+ filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
+ ioctls);
+#ifdef CAPABILITIES
+ seq_write_end(&newfde->fde_seq);
+#endif
+ break;
+ case ENXIO:
+ /*
+ * Steal away the file pointer from dfd and stuff it into indx.
+ */
+ newfde = &fdp->fd_ofiles[indx];
+ oldfde = &fdp->fd_ofiles[dfd];
+#ifdef CAPABILITIES
+ seq_write_begin(&newfde->fde_seq);
+#endif
+ memcpy(newfde, oldfde, fde_change_size);
+ oldfde->fde_file = NULL;
+ fdunused(fdp, dfd);
+#ifdef CAPABILITIES
+ seq_write_end(&newfde->fde_seq);
+#endif
+ break;
+ }
+ FILEDESC_XUNLOCK(fdp);
+ *indxp = indx;
+ return (0);
+}
+
+/*
+ * This sysctl determines if we will allow a process to chroot(2) if it
+ * has a directory open:
+ * 0: disallowed for all processes.
+ * 1: allowed for processes that were not already chroot(2)'ed.
+ * 2: allowed for all processes.
+ */
+
+static int chroot_allow_open_directories = 1;
+
+SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
+ &chroot_allow_open_directories, 0,
+ "Allow a process to chroot(2) if it has a directory open");
+
+/*
+ * Helper function for raised chroot(2) security function: Refuse if
+ * any filedescriptors are open directories.
+ */
+static int
+chroot_refuse_vdir_fds(struct filedesc *fdp)
+{
+ struct vnode *vp;
+ struct file *fp;
+ int fd;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
+ fp = fget_locked(fdp, fd);
+ if (fp == NULL)
+ continue;
+ if (fp->f_type == DTYPE_VNODE) {
+ vp = fp->f_vnode;
+ if (vp->v_type == VDIR)
+ return (EPERM);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Common routine for kern_chroot() and jail_attach(). The caller is
+ * responsible for invoking priv_check() and mac_vnode_check_chroot() to
+ * authorize this operation.
+ */
+int
+pwd_chroot(struct thread *td, struct vnode *vp)
+{
+ struct filedesc *fdp;
+ struct vnode *oldvp;
+ int error;
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+ if (chroot_allow_open_directories == 0 ||
+ (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
+ error = chroot_refuse_vdir_fds(fdp);
+ if (error != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ return (error);
+ }
+ }
+ oldvp = fdp->fd_rdir;
+ vrefact(vp);
+ fdp->fd_rdir = vp;
+ if (fdp->fd_jdir == NULL) {
+ vrefact(vp);
+ fdp->fd_jdir = vp;
+ }
+ FILEDESC_XUNLOCK(fdp);
+ vrele(oldvp);
+ return (0);
+}
+
+void
+pwd_chdir(struct thread *td, struct vnode *vp)
+{
+ struct filedesc *fdp;
+ struct vnode *oldvp;
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+ VNASSERT(vp->v_usecount > 0, vp,
+ ("chdir to a vnode with zero usecount"));
+ oldvp = fdp->fd_cdir;
+ fdp->fd_cdir = vp;
+ FILEDESC_XUNLOCK(fdp);
+ vrele(oldvp);
+}
+
+/*
+ * Scan all active processes and prisons to see if any of them have a current
+ * or root directory of `olddp'. If so, replace them with the new mount point.
+ */
+void
+mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
+{
+ struct filedesc *fdp;
+ struct prison *pr;
+ struct proc *p;
+ int nrele;
+
+ if (vrefcnt(olddp) == 1)
+ return;
+ nrele = 0;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ if (fdp == NULL)
+ continue;
+ FILEDESC_XLOCK(fdp);
+ if (fdp->fd_cdir == olddp) {
+ vrefact(newdp);
+ fdp->fd_cdir = newdp;
+ nrele++;
+ }
+ if (fdp->fd_rdir == olddp) {
+ vrefact(newdp);
+ fdp->fd_rdir = newdp;
+ nrele++;
+ }
+ if (fdp->fd_jdir == olddp) {
+ vrefact(newdp);
+ fdp->fd_jdir = newdp;
+ nrele++;
+ }
+ FILEDESC_XUNLOCK(fdp);
+ fddrop(fdp);
+ }
+ sx_sunlock(&allproc_lock);
+ if (rootvnode == olddp) {
+ vrefact(newdp);
+ rootvnode = newdp;
+ nrele++;
+ }
+ mtx_lock(&prison0.pr_mtx);
+ if (prison0.pr_root == olddp) {
+ vrefact(newdp);
+ prison0.pr_root = newdp;
+ nrele++;
+ }
+ mtx_unlock(&prison0.pr_mtx);
+ sx_slock(&allprison_lock);
+ TAILQ_FOREACH(pr, &allprison, pr_list) {
+ mtx_lock(&pr->pr_mtx);
+ if (pr->pr_root == olddp) {
+ vrefact(newdp);
+ pr->pr_root = newdp;
+ nrele++;
+ }
+ mtx_unlock(&pr->pr_mtx);
+ }
+ sx_sunlock(&allprison_lock);
+ while (nrele--)
+ vrele(olddp);
+}
+
+struct filedesc_to_leader *
+filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
+{
+ struct filedesc_to_leader *fdtol;
+
+ fdtol = malloc(sizeof(struct filedesc_to_leader),
+ M_FILEDESC_TO_LEADER, M_WAITOK);
+ fdtol->fdl_refcount = 1;
+ fdtol->fdl_holdcount = 0;
+ fdtol->fdl_wakeup = 0;
+ fdtol->fdl_leader = leader;
+ if (old != NULL) {
+ FILEDESC_XLOCK(fdp);
+ fdtol->fdl_next = old->fdl_next;
+ fdtol->fdl_prev = old;
+ old->fdl_next = fdtol;
+ fdtol->fdl_next->fdl_prev = fdtol;
+ FILEDESC_XUNLOCK(fdp);
+ } else {
+ fdtol->fdl_next = fdtol;
+ fdtol->fdl_prev = fdtol;
+ }
+ return (fdtol);
+}
+
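+/*
+ * Report the number of open file descriptors in the calling process by
+ * counting the bits set in the in-use bitmap.
+ */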
+static int
+sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS)
+{
+ struct filedesc *fdp;
+ int i, count, slots;
+
+ if (*(int *)arg1 != 0)
+ return (EINVAL);
+
+ fdp = curproc->p_fd;
+ count = 0;
+ FILEDESC_SLOCK(fdp);
+ slots = NDSLOTS(fdp->fd_lastfile + 1);
+ for (i = 0; i < slots; i++)
+ count += bitcountl(fdp->fd_map[i]);
+ FILEDESC_SUNLOCK(fdp);
+
+ return (SYSCTL_OUT(req, &count, sizeof(count)));
+}
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
+ CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
+ "Number of open file descriptors");
+
+/*
+ * Get file structures globally.
+ */
+static int
+sysctl_kern_file(SYSCTL_HANDLER_ARGS)
+{
+ struct xfile xf;
+ struct filedesc *fdp;
+ struct file *fp;
+ struct proc *p;
+ int error, n;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ if (req->oldptr == NULL) {
+ n = 0;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ if (fdp == NULL)
+ continue;
+ /* overestimates sparse tables. */
+ if (fdp->fd_lastfile > 0)
+ n += fdp->fd_lastfile;
+ fddrop(fdp);
+ }
+ sx_sunlock(&allproc_lock);
+ return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
+ }
+ error = 0;
+ bzero(&xf, sizeof(xf));
+ xf.xf_size = sizeof(xf);
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ if (p_cansee(req->td, p) != 0) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ xf.xf_pid = p->p_pid;
+ xf.xf_uid = p->p_ucred->cr_uid;
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ if (fdp == NULL)
+ continue;
+ FILEDESC_SLOCK(fdp);
+ for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
+ if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
+ continue;
+ xf.xf_fd = n;
+ xf.xf_file = (uintptr_t)fp;
+ xf.xf_data = (uintptr_t)fp->f_data;
+ xf.xf_vnode = (uintptr_t)fp->f_vnode;
+ xf.xf_type = (uintptr_t)fp->f_type;
+ xf.xf_count = fp->f_count;
+ xf.xf_msgcount = 0;
+ xf.xf_offset = foffset_get(fp);
+ xf.xf_flag = fp->f_flag;
+ error = SYSCTL_OUT(req, &xf, sizeof(xf));
+ if (error)
+ break;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ fddrop(fdp);
+ if (error)
+ break;
+ }
+ sx_sunlock(&allproc_lock);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
+
+#ifdef KINFO_FILE_SIZE
+CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
+#endif
+
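+/*
+ * Translate struct file f_flag bits into the KF_FLAG_* values exported
+ * through kinfo_file.
+ */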
+static int
+xlate_fflags(int fflags)
+{
+ static const struct {
+ int fflag;
+ int kf_fflag;
+ } fflags_table[] = {
+ { FAPPEND, KF_FLAG_APPEND },
+ { FASYNC, KF_FLAG_ASYNC },
+ { FFSYNC, KF_FLAG_FSYNC },
+ { FHASLOCK, KF_FLAG_HASLOCK },
+ { FNONBLOCK, KF_FLAG_NONBLOCK },
+ { FREAD, KF_FLAG_READ },
+ { FWRITE, KF_FLAG_WRITE },
+ { O_CREAT, KF_FLAG_CREAT },
+ { O_DIRECT, KF_FLAG_DIRECT },
+ { O_EXCL, KF_FLAG_EXCL },
+ { O_EXEC, KF_FLAG_EXEC },
+ { O_EXLOCK, KF_FLAG_EXLOCK },
+ { O_NOFOLLOW, KF_FLAG_NOFOLLOW },
+ { O_SHLOCK, KF_FLAG_SHLOCK },
+ { O_TRUNC, KF_FLAG_TRUNC }
+ };
+ unsigned int i;
+ int kflags;
+
+ kflags = 0;
+ for (i = 0; i < nitems(fflags_table); i++)
+ if (fflags & fflags_table[i].fflag)
+ kflags |= fflags_table[i].kf_fflag;
+ return (kflags);
+}
+
+/* Trim unused data from kf_path by truncating the structure size. */
+void
+pack_kinfo(struct kinfo_file *kif)
+{
+
+ kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
+ strlen(kif->kf_path) + 1;
+ kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
+}
+
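+/*
+ * Fill a kinfo_file record for an open file, including any per-type
+ * details provided by the file's fo_fill_kinfo method.
+ */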
+static void
+export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
+ struct kinfo_file *kif, struct filedesc *fdp, int flags)
+{
+ int error;
+
+ bzero(kif, sizeof(*kif));
+
+ /* Set a default type to allow for empty fill_kinfo() methods. */
+ kif->kf_type = KF_TYPE_UNKNOWN;
+ kif->kf_flags = xlate_fflags(fp->f_flag);
+ if (rightsp != NULL)
+ kif->kf_cap_rights = *rightsp;
+ else
+ cap_rights_init(&kif->kf_cap_rights);
+ kif->kf_fd = fd;
+ kif->kf_ref_count = fp->f_count;
+ kif->kf_offset = foffset_get(fp);
+
+ /*
+ * This may drop the filedesc lock, so the 'fp' cannot be
+ * accessed after this call.
+ */
+ error = fo_fill_kinfo(fp, kif, fdp);
+ if (error == 0)
+ kif->kf_status |= KF_ATTR_VALID;
+ if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
+ pack_kinfo(kif);
+ else
+ kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
+}
+
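+/*
+ * Fill a kinfo_file record for a bare vnode (cwd, root, jail, text, tty or
+ * trace references).  The vnode reference passed in is consumed.
+ */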
+static void
+export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
+ struct kinfo_file *kif, int flags)
+{
+ int error;
+
+ bzero(kif, sizeof(*kif));
+
+ kif->kf_type = KF_TYPE_VNODE;
+ error = vn_fill_kinfo_vnode(vp, kif);
+ if (error == 0)
+ kif->kf_status |= KF_ATTR_VALID;
+ kif->kf_flags = xlate_fflags(fflags);
+ cap_rights_init(&kif->kf_cap_rights);
+ kif->kf_fd = fd;
+ kif->kf_ref_count = -1;
+ kif->kf_offset = -1;
+ if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
+ pack_kinfo(kif);
+ else
+ kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
+ vrele(vp);
+}
+
+struct export_fd_buf {
+ struct filedesc *fdp;
+ struct sbuf *sb;
+ ssize_t remainder;
+ struct kinfo_file kif;
+ int flags;
+};
+
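+/*
+ * Append the staged kinfo_file record to the sbuf, respecting the
+ * remaining length limit when one was supplied.
+ */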
+static int
+export_kinfo_to_sb(struct export_fd_buf *efbuf)
+{
+ struct kinfo_file *kif;
+
+ kif = &efbuf->kif;
+ if (efbuf->remainder != -1) {
+ if (efbuf->remainder < kif->kf_structsize) {
+ /* Terminate export. */
+ efbuf->remainder = 0;
+ return (0);
+ }
+ efbuf->remainder -= kif->kf_structsize;
+ }
+ return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM);
+}
+
+static int
+export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
+ struct export_fd_buf *efbuf)
+{
+ int error;
+
+ if (efbuf->remainder == 0)
+ return (0);
+ export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp,
+ efbuf->flags);
+ FILEDESC_SUNLOCK(efbuf->fdp);
+ error = export_kinfo_to_sb(efbuf);
+ FILEDESC_SLOCK(efbuf->fdp);
+ return (error);
+}
+
+static int
+export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
+ struct export_fd_buf *efbuf)
+{
+ int error;
+
+ if (efbuf->remainder == 0)
+ return (0);
+ if (efbuf->fdp != NULL)
+ FILEDESC_SUNLOCK(efbuf->fdp);
+ export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags);
+ error = export_kinfo_to_sb(efbuf);
+ if (efbuf->fdp != NULL)
+ FILEDESC_SLOCK(efbuf->fdp);
+ return (error);
+}
+
+/*
+ * Store a process's file descriptor information in an sbuf.
+ *
+ * Takes a locked proc as argument, and returns with the proc unlocked.
+ */
+int
+kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen,
+ int flags)
+{
+ struct file *fp;
+ struct filedesc *fdp;
+ struct export_fd_buf *efbuf;
+ struct vnode *cttyvp, *textvp, *tracevp;
+ int error, i;
+ cap_rights_t rights;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ /* ktrace vnode */
+ tracevp = p->p_tracevp;
+ if (tracevp != NULL)
+ vrefact(tracevp);
+ /* text vnode */
+ textvp = p->p_textvp;
+ if (textvp != NULL)
+ vrefact(textvp);
+ /* Controlling tty. */
+ cttyvp = NULL;
+ if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
+ cttyvp = p->p_pgrp->pg_session->s_ttyvp;
+ if (cttyvp != NULL)
+ vrefact(cttyvp);
+ }
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
+ efbuf->fdp = NULL;
+ efbuf->sb = sb;
+ efbuf->remainder = maxlen;
+ efbuf->flags = flags;
+ if (tracevp != NULL)
+ export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE,
+ efbuf);
+ if (textvp != NULL)
+ export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf);
+ if (cttyvp != NULL)
+ export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE,
+ efbuf);
+ error = 0;
+ if (fdp == NULL)
+ goto fail;
+ efbuf->fdp = fdp;
+ FILEDESC_SLOCK(fdp);
+ /* working directory */
+ if (fdp->fd_cdir != NULL) {
+ vrefact(fdp->fd_cdir);
+ export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
+ }
+ /* root directory */
+ if (fdp->fd_rdir != NULL) {
+ vrefact(fdp->fd_rdir);
+ export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf);
+ }
+ /* jail directory */
+ if (fdp->fd_jdir != NULL) {
+ vrefact(fdp->fd_jdir);
+ export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf);
+ }
+ for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
+ if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
+ continue;
+#ifdef CAPABILITIES
+ rights = *cap_rights(fdp, i);
+#else /* !CAPABILITIES */
+ rights = cap_no_rights;
+#endif
+ /*
+ * Create sysctl entry. It is OK to drop the filedesc
+ * lock inside of export_file_to_sb() as we will
+ * re-validate and re-evaluate its properties when the
+ * loop continues.
+ */
+ error = export_file_to_sb(fp, i, &rights, efbuf);
+ if (error != 0 || efbuf->remainder == 0)
+ break;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ fddrop(fdp);
+fail:
+ free(efbuf, M_TEMP);
+ return (error);
+}
+
+#define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5)
+
+/*
+ * Get per-process file descriptors for use by procstat(1), et al.
+ */
+static int
+sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sb;
+ struct proc *p;
+ ssize_t maxlen;
+ int error, error2, *name;
+
+ name = (int *)arg1;
+
+ sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
+ sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
+ error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
+ if (error != 0) {
+ sbuf_delete(&sb);
+ return (error);
+ }
+ maxlen = req->oldptr != NULL ? req->oldlen : -1;
+ error = kern_proc_filedesc_out(p, &sb, maxlen,
+ KERN_FILEDESC_PACK_KINFO);
+ error2 = sbuf_finish(&sb);
+ sbuf_delete(&sb);
+ return (error != 0 ? error : error2);
+}
+
+#ifdef COMPAT_FREEBSD7
+#ifdef KINFO_OFILE_SIZE
+CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
+#endif
+
+static void
+kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
+{
+
+ okif->kf_structsize = sizeof(*okif);
+ okif->kf_type = kif->kf_type;
+ okif->kf_fd = kif->kf_fd;
+ okif->kf_ref_count = kif->kf_ref_count;
+ okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
+ KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
+ KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
+ okif->kf_offset = kif->kf_offset;
+ if (kif->kf_type == KF_TYPE_VNODE)
+ okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type;
+ else
+ okif->kf_vnode_type = KF_VTYPE_VNON;
+ strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
+ if (kif->kf_type == KF_TYPE_SOCKET) {
+ okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0;
+ okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0;
+ okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0;
+ okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local;
+ okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer;
+ } else {
+ okif->kf_sa_local.ss_family = AF_UNSPEC;
+ okif->kf_sa_peer.ss_family = AF_UNSPEC;
+ }
+}
+
+static int
+export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
+ struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req)
+{
+ int error;
+
+ vrefact(vp);
+ FILEDESC_SUNLOCK(fdp);
+ export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
+ kinfo_to_okinfo(kif, okif);
+ error = SYSCTL_OUT(req, okif, sizeof(*okif));
+ FILEDESC_SLOCK(fdp);
+ return (error);
+}
+
+/*
+ * Get per-process file descriptors for use by procstat(1), et al.
+ */
+static int
+sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
+{
+ struct kinfo_ofile *okif;
+ struct kinfo_file *kif;
+ struct filedesc *fdp;
+ int error, i, *name;
+ struct file *fp;
+ struct proc *p;
+
+ name = (int *)arg1;
+ error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
+ if (error != 0)
+ return (error);
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ if (fdp == NULL)
+ return (ENOENT);
+ kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
+ okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
+ FILEDESC_SLOCK(fdp);
+ if (fdp->fd_cdir != NULL)
+ export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
+ okif, fdp, req);
+ if (fdp->fd_rdir != NULL)
+ export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
+ okif, fdp, req);
+ if (fdp->fd_jdir != NULL)
+ export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
+ okif, fdp, req);
+ for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
+ if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
+ continue;
+ export_file_to_kinfo(fp, i, NULL, kif, fdp,
+ KERN_FILEDESC_PACK_KINFO);
+ FILEDESC_SUNLOCK(fdp);
+ kinfo_to_okinfo(kif, okif);
+ error = SYSCTL_OUT(req, okif, sizeof(*okif));
+ FILEDESC_SLOCK(fdp);
+ if (error)
+ break;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ fddrop(fdp);
+ free(kif, M_TEMP);
+ free(okif, M_TEMP);
+ return (0);
+}
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
+ CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
+ "Process ofiledesc entries");
+#endif /* COMPAT_FREEBSD7 */
+
+int
+vntype_to_kinfo(int vtype)
+{
+ struct {
+ int vtype;
+ int kf_vtype;
+ } vtypes_table[] = {
+ { VBAD, KF_VTYPE_VBAD },
+ { VBLK, KF_VTYPE_VBLK },
+ { VCHR, KF_VTYPE_VCHR },
+ { VDIR, KF_VTYPE_VDIR },
+ { VFIFO, KF_VTYPE_VFIFO },
+ { VLNK, KF_VTYPE_VLNK },
+ { VNON, KF_VTYPE_VNON },
+ { VREG, KF_VTYPE_VREG },
+ { VSOCK, KF_VTYPE_VSOCK }
+ };
+ unsigned int i;
+
+ /*
+ * Perform vtype translation.
+ */
+ for (i = 0; i < nitems(vtypes_table); i++)
+ if (vtypes_table[i].vtype == vtype)
+ return (vtypes_table[i].kf_vtype);
+
+ return (KF_VTYPE_UNKNOWN);
+}
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
+ CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
+ "Process filedesc entries");
+
+/*
+ * Store a process's current working directory information in an sbuf.
+ *
+ * Takes a locked proc as argument, and returns with the proc unlocked.
+ */
+int
+kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen)
+{
+ struct filedesc *fdp;
+ struct export_fd_buf *efbuf;
+ int error;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ if (fdp == NULL)
+ return (EINVAL);
+
+ efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
+ efbuf->fdp = fdp;
+ efbuf->sb = sb;
+ efbuf->remainder = maxlen;
+
+ FILEDESC_SLOCK(fdp);
+ if (fdp->fd_cdir == NULL)
+ error = EINVAL;
+ else {
+ vrefact(fdp->fd_cdir);
+ error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD,
+ FREAD, efbuf);
+ }
+ FILEDESC_SUNLOCK(fdp);
+ fddrop(fdp);
+ free(efbuf, M_TEMP);
+ return (error);
+}
+
+/*
+ * Get per-process current working directory.
+ */
+static int
+sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sb;
+ struct proc *p;
+ ssize_t maxlen;
+ int error, error2, *name;
+
+ name = (int *)arg1;
+
+ sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
+ sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
+ error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
+ if (error != 0) {
+ sbuf_delete(&sb);
+ return (error);
+ }
+ maxlen = req->oldptr != NULL ? req->oldlen : -1;
+ error = kern_proc_cwd_out(p, &sb, maxlen);
+ error2 = sbuf_finish(&sb);
+ sbuf_delete(&sb);
+ return (error != 0 ? error : error2);
+}
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
+ sysctl_kern_proc_cwd, "Process current working directory");
+
+#ifdef DDB
+/*
+ * For the purposes of debugging, generate a human-readable string for the
+ * file type.
+ */
+static const char *
+file_type_to_name(short type)
+{
+
+ switch (type) {
+ case 0:
+ return ("zero");
+ case DTYPE_VNODE:
+ return ("vnode");
+ case DTYPE_SOCKET:
+ return ("socket");
+ case DTYPE_PIPE:
+ return ("pipe");
+ case DTYPE_FIFO:
+ return ("fifo");
+ case DTYPE_KQUEUE:
+ return ("kqueue");
+ case DTYPE_CRYPTO:
+ return ("crypto");
+ case DTYPE_MQUEUE:
+ return ("mqueue");
+ case DTYPE_SHM:
+ return ("shm");
+ case DTYPE_SEM:
+ return ("ksem");
+ case DTYPE_PTS:
+ return ("pts");
+ case DTYPE_DEV:
+ return ("dev");
+ case DTYPE_PROCDESC:
+ return ("proc");
+ case DTYPE_LINUXEFD:
+ return ("levent");
+ case DTYPE_LINUXTFD:
+ return ("ltimer");
+ default:
+ return ("unkn");
+ }
+}
+
+/*
+ * For the purposes of debugging, identify a process (if any, perhaps one of
+ * many) that references the passed file in its file descriptor array. Return
+ * NULL if none.
+ */
+static struct proc *
+file_to_first_proc(struct file *fp)
+{
+ struct filedesc *fdp;
+ struct proc *p;
+ int n;
+
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_state == PRS_NEW)
+ continue;
+ fdp = p->p_fd;
+ if (fdp == NULL)
+ continue;
+ for (n = 0; n <= fdp->fd_lastfile; n++) {
+ if (fp == fdp->fd_ofiles[n].fde_file)
+ return (p);
+ }
+ }
+ return (NULL);
+}
+
+static void
+db_print_file(struct file *fp, int header)
+{
+#define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4))
+ struct proc *p;
+
+ if (header)
+ db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n",
+ XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag",
+ "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID",
+ "FCmd");
+ p = file_to_first_proc(fp);
+ db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH,
+ fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data,
+ fp->f_flag, 0, fp->f_count, 0, XPTRWIDTH, fp->f_vnode,
+ p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
+
+#undef XPTRWIDTH
+}
+
+DB_SHOW_COMMAND(file, db_show_file)
+{
+ struct file *fp;
+
+ if (!have_addr) {
+ db_printf("usage: show file <addr>\n");
+ return;
+ }
+ fp = (struct file *)addr;
+ db_print_file(fp, 1);
+}
+
+DB_SHOW_COMMAND(files, db_show_files)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ struct proc *p;
+ int header;
+ int n;
+
+ header = 1;
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_state == PRS_NEW)
+ continue;
+ if ((fdp = p->p_fd) == NULL)
+ continue;
+ for (n = 0; n <= fdp->fd_lastfile; ++n) {
+ if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
+ continue;
+ db_print_file(fp, header);
+ header = 0;
+ }
+ }
+}
+#endif
+
+SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
+ &maxfilesperproc, 0, "Maximum files allowed open per process");
+
+SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
+ &maxfiles, 0, "Maximum number of files");
+
+SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
+ __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
+
+/* ARGSUSED*/
+static void
+filelistinit(void *dummy)
+{
+
+ file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
+}
+SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
+
+/*-------------------------------------------------------------------*/
+
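+/*
+ * Fileops for a struct file that has not yet been initialized with
+ * finit(): most operations simply fail.
+ */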
+static int
+badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+static int
+badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (0);
+}
+
+static int
+badfo_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_close(struct file *fp, struct thread *td)
+{
+
+ return (0);
+}
+
+static int
+badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+ struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
+{
+
+ return (0);
+}
+
+struct fileops badfileops = {
+ .fo_read = badfo_readwrite,
+ .fo_write = badfo_readwrite,
+ .fo_truncate = badfo_truncate,
+ .fo_ioctl = badfo_ioctl,
+ .fo_poll = badfo_poll,
+ .fo_kqfilter = badfo_kqfilter,
+ .fo_stat = badfo_stat,
+ .fo_close = badfo_close,
+ .fo_chmod = badfo_chmod,
+ .fo_chown = badfo_chown,
+ .fo_sendfile = badfo_sendfile,
+ .fo_fill_kinfo = badfo_fill_kinfo,
+};
+
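+/*
+ * Generic "not supported" implementations that file types may use for
+ * fileops methods they do not provide.
+ */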
+int
+invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+int
+invfo_ioctl(struct file *fp, u_long com, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+
+ return (ENOTTY);
+}
+
+int
+invfo_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (poll_no_poll(events));
+}
+
+int
+invfo_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (EINVAL);
+}
+
+int
+invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+int
+invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+int
+invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+ struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+/*-------------------------------------------------------------------*/
+
+/*
+ * File Descriptor pseudo-device driver (/dev/fd/).
+ *
+ * Opening minor device N dup()s the file (if any) connected to file
+ * descriptor N belonging to the calling process. Note that this driver
+ * consists of only the ``open()'' routine, because all subsequent
+ * references to this file will be direct to the other driver.
+ *
+ * XXX: we could give this one a cloning event handler if necessary.
+ */
+
+/* ARGSUSED */
+static int
+fdopen(struct cdev *dev, int mode, int type, struct thread *td)
+{
+
+ /*
+ * XXX Kludge: set curthread->td_dupfd to contain the value of the
+ * file descriptor being sought for duplication. The error
+ * return ensures that the vnode for this device will be released
+ * by vn_open. Open will detect this special error and take the
+ * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
+ * will simply report the error.
+ */
+ td->td_dupfd = dev2unit(dev);
+ return (ENODEV);
+}
+
+static struct cdevsw fildesc_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = fdopen,
+ .d_name = "FD",
+};
+
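+/*
+ * Create /dev/fd/0, /dev/fd/1 and /dev/fd/2 along with the traditional
+ * stdin, stdout and stderr aliases.
+ */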
+static void
+fildesc_drvinit(void *unused)
+{
+ struct cdev *dev;
+
+ dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
+ UID_ROOT, GID_WHEEL, 0666, "fd/0");
+ make_dev_alias(dev, "stdin");
+ dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
+ UID_ROOT, GID_WHEEL, 0666, "fd/1");
+ make_dev_alias(dev, "stdout");
+ dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
+ UID_ROOT, GID_WHEEL, 0666, "fd/2");
+ make_dev_alias(dev, "stderr");
+}
+
+SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
diff --git a/freebsd/sys/kern/kern_lock.c b/freebsd/sys/kern/kern_lock.c
new file mode 100644
index 00000000..d769a185
--- /dev/null
+++ b/freebsd/sys/kern/kern_lock.c
@@ -0,0 +1,1719 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2008 Attilio Rao <attilio@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/lock_profile.h>
+#include <sys/lockmgr.h>
+#include <sys/lockstat.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sleepqueue.h>
+#ifdef DEBUG_LOCKS
+#include <sys/stack.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+PMC_SOFT_DECLARE( , , lock, failed);
+#endif
+
+CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) ==
+ (LK_ADAPTIVE | LK_NOSHARE));
+CTASSERT(LK_UNLOCKED == (LK_UNLOCKED &
+ ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS)));
+
+#define SQ_EXCLUSIVE_QUEUE 0
+#define SQ_SHARED_QUEUE 1
+
+#ifndef INVARIANTS
+#define _lockmgr_assert(lk, what, file, line)
+#endif
+
+#define TD_SLOCKS_INC(td) ((td)->td_lk_slocks++)
+#define TD_SLOCKS_DEC(td) ((td)->td_lk_slocks--)
+
+#ifndef DEBUG_LOCKS
+#define STACK_PRINT(lk)
+#define STACK_SAVE(lk)
+#define STACK_ZERO(lk)
+#else
+#define STACK_PRINT(lk) stack_print_ddb(&(lk)->lk_stack)
+#define STACK_SAVE(lk) stack_save(&(lk)->lk_stack)
+#define STACK_ZERO(lk) stack_zero(&(lk)->lk_stack)
+#endif
+
+#define LOCK_LOG2(lk, string, arg1, arg2) \
+ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \
+ CTR2(KTR_LOCK, (string), (arg1), (arg2))
+#define LOCK_LOG3(lk, string, arg1, arg2, arg3) \
+ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \
+ CTR3(KTR_LOCK, (string), (arg1), (arg2), (arg3))
+
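+/*
+ * Helpers to drop a (possibly recursed) Giant around sleeps: GIANT_SAVE()
+ * records the recursion depth in _i and fully releases Giant, while
+ * GIANT_RESTORE() re-acquires it the same number of times afterwards.
+ */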
+#define GIANT_DECLARE \
+ int _i = 0; \
+ WITNESS_SAVE_DECL(Giant)
+#define GIANT_RESTORE() do { \
+ if (_i > 0) { \
+ while (_i--) \
+ mtx_lock(&Giant); \
+ WITNESS_RESTORE(&Giant.lock_object, Giant); \
+ } \
+} while (0)
+#define GIANT_SAVE() do { \
+ if (mtx_owned(&Giant)) { \
+ WITNESS_SAVE(&Giant.lock_object, Giant); \
+ while (mtx_owned(&Giant)) { \
+ _i++; \
+ mtx_unlock(&Giant); \
+ } \
+ } \
+} while (0)
+
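+/*
+ * Decide whether a new shared request may be granted for lock word 'x':
+ * grant it if the lock is unlocked or held shared with no exclusive
+ * waiters or spinners; otherwise (and only outside the fast path) grant
+ * it only when the calling thread may jump ahead of exclusive waiters,
+ * to avoid deadlock on nested shared acquisitions.
+ */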
+static bool __always_inline
+LK_CAN_SHARE(uintptr_t x, int flags, bool fp)
+{
+
+ if ((x & (LK_SHARE | LK_EXCLUSIVE_WAITERS | LK_EXCLUSIVE_SPINNERS)) ==
+ LK_SHARE)
+ return (true);
+ if (fp || (!(x & LK_SHARE)))
+ return (false);
+ if ((curthread->td_lk_slocks != 0 && !(flags & LK_NODDLKTREAT)) ||
+ (curthread->td_pflags & TDP_DEADLKTREAT))
+ return (true);
+ return (false);
+}
+
+#define LK_TRYOP(x) \
+ ((x) & LK_NOWAIT)
+
+#define LK_CAN_WITNESS(x) \
+ (((x) & LK_NOWITNESS) == 0 && !LK_TRYOP(x))
+#define LK_TRYWIT(x) \
+ (LK_TRYOP(x) ? LOP_TRYLOCK : 0)
+
+#define LK_CAN_ADAPT(lk, f) \
+ (((lk)->lock_object.lo_flags & LK_ADAPTIVE) != 0 && \
+ ((f) & LK_SLEEPFAIL) == 0)
+
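+/*
+ * lk_lock holds either the owning thread pointer (exclusive mode) or
+ * LK_SHARE plus a sharer count, with the low bits reserved for waiter and
+ * spinner flags; the macros below test ownership against that encoding.
+ */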
+#define lockmgr_disowned(lk) \
+ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == LK_KERNPROC)
+
+#define lockmgr_xlocked_v(v) \
+ (((v) & ~(LK_FLAGMASK & ~LK_SHARE)) == (uintptr_t)curthread)
+
+#define lockmgr_xlocked(lk) lockmgr_xlocked_v((lk)->lk_lock)
+
+static void assert_lockmgr(const struct lock_object *lock, int how);
+#ifdef DDB
+static void db_show_lockmgr(const struct lock_object *lock);
+#endif
+static void lock_lockmgr(struct lock_object *lock, uintptr_t how);
+#ifdef KDTRACE_HOOKS
+static int owner_lockmgr(const struct lock_object *lock,
+ struct thread **owner);
+#endif
+static uintptr_t unlock_lockmgr(struct lock_object *lock);
+
+struct lock_class lock_class_lockmgr = {
+ .lc_name = "lockmgr",
+ .lc_flags = LC_RECURSABLE | LC_SLEEPABLE | LC_SLEEPLOCK | LC_UPGRADABLE,
+ .lc_assert = assert_lockmgr,
+#ifdef DDB
+ .lc_ddb_show = db_show_lockmgr,
+#endif
+ .lc_lock = lock_lockmgr,
+ .lc_unlock = unlock_lockmgr,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_lockmgr,
+#endif
+};
+
+struct lockmgr_wait {
+ const char *iwmesg;
+ int ipri;
+ int itimo;
+};
+
+static bool __always_inline lockmgr_slock_try(struct lock *lk, uintptr_t *xp,
+ int flags, bool fp);
+static bool __always_inline lockmgr_sunlock_try(struct lock *lk, uintptr_t *xp);
+
+static void
+lockmgr_exit(u_int flags, struct lock_object *ilk, int wakeup_swapper)
+{
+ struct lock_class *class;
+
+ if (flags & LK_INTERLOCK) {
+ class = LOCK_CLASS(ilk);
+ class->lc_unlock(ilk);
+ }
+
+ if (__predict_false(wakeup_swapper))
+ kick_proc0();
+}
+
+static void
+lockmgr_note_shared_acquire(struct lock *lk, int contested,
+ uint64_t waittime, const char *file, int line, int flags)
+{
+
+ LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(lockmgr__acquire, lk, contested,
+ waittime, file, line, LOCKSTAT_READER);
+ LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, file, line);
+ WITNESS_LOCK(&lk->lock_object, LK_TRYWIT(flags), file, line);
+ TD_LOCKS_INC(curthread);
+ TD_SLOCKS_INC(curthread);
+ STACK_SAVE(lk);
+}
+
+static void
+lockmgr_note_shared_release(struct lock *lk, const char *file, int line)
+{
+
+ LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk, LOCKSTAT_READER);
+ WITNESS_UNLOCK(&lk->lock_object, 0, file, line);
+ LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, file, line);
+ TD_LOCKS_DEC(curthread);
+ TD_SLOCKS_DEC(curthread);
+}
+
+static void
+lockmgr_note_exclusive_acquire(struct lock *lk, int contested,
+ uint64_t waittime, const char *file, int line, int flags)
+{
+
+ LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(lockmgr__acquire, lk, contested,
+ waittime, file, line, LOCKSTAT_WRITER);
+ LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line);
+ WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file,
+ line);
+ TD_LOCKS_INC(curthread);
+ STACK_SAVE(lk);
+}
+
+static void
+lockmgr_note_exclusive_release(struct lock *lk, const char *file, int line)
+{
+
+ LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk, LOCKSTAT_WRITER);
+ LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file,
+ line);
+ WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line);
+ TD_LOCKS_DEC(curthread);
+}
+
+static __inline struct thread *
+lockmgr_xholder(const struct lock *lk)
+{
+ uintptr_t x;
+
+ x = lk->lk_lock;
+ return ((x & LK_SHARE) ? NULL : (struct thread *)LK_HOLDER(x));
+}
+
+/*
+ * Assumes the sleepqueue chain lock is held and returns with it released.
+ * Also assumes the generic interlock is sane and has been checked by the caller.
+ * If LK_INTERLOCK is specified, the interlock is not reacquired after the
+ * sleep.
+ */
+static __inline int
+sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk,
+ const char *wmesg, int pri, int timo, int queue)
+{
+ GIANT_DECLARE;
+ struct lock_class *class;
+ int catch, error;
+
+ class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL;
+ catch = pri & PCATCH;
+ pri &= PRIMASK;
+ error = 0;
+
+ LOCK_LOG3(lk, "%s: %p blocking on the %s sleepqueue", __func__, lk,
+ (queue == SQ_EXCLUSIVE_QUEUE) ? "exclusive" : "shared");
+
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0)
+ lk->lk_exslpfail++;
+ GIANT_SAVE();
+ sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ?
+ SLEEPQ_INTERRUPTIBLE : 0), queue);
+ if ((flags & LK_TIMELOCK) && timo)
+ sleepq_set_timeout(&lk->lock_object, timo);
+
+ /*
+	 * Choose the sleep primitive according to the timeout and
+	 * signal-catching requirements.
+ */
+ if ((flags & LK_TIMELOCK) && timo && catch)
+ error = sleepq_timedwait_sig(&lk->lock_object, pri);
+ else if ((flags & LK_TIMELOCK) && timo)
+ error = sleepq_timedwait(&lk->lock_object, pri);
+ else if (catch)
+ error = sleepq_wait_sig(&lk->lock_object, pri);
+ else
+ sleepq_wait(&lk->lock_object, pri);
+ GIANT_RESTORE();
+ if ((flags & LK_SLEEPFAIL) && error == 0)
+ error = ENOLCK;
+
+ return (error);
+}
+
+static __inline int
+wakeupshlk(struct lock *lk, const char *file, int line)
+{
+ uintptr_t v, x, orig_x;
+ u_int realexslp;
+ int queue, wakeup_swapper;
+
+ wakeup_swapper = 0;
+ for (;;) {
+ x = lk->lk_lock;
+ if (lockmgr_sunlock_try(lk, &x))
+ break;
+
+ /*
+ * We should have a sharer with waiters, so enter the hard
+ * path in order to handle wakeups correctly.
+ */
+ sleepq_lock(&lk->lock_object);
+ orig_x = lk->lk_lock;
+retry_sleepq:
+ x = orig_x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
+ v = LK_UNLOCKED;
+
+ /*
+		 * If the lock has exclusive waiters, give them preference in
+		 * order to avoid deadlock with shared runners-up.
+		 * If interruptible sleeps left the exclusive queue empty,
+		 * avoid starvation of the threads sleeping on the shared
+		 * queue by giving them precedence and clearing the
+		 * exclusive waiters bit anyway.
+		 * Note that the lk_exslpfail count may overstate the real
+		 * number of waiters with the LK_SLEEPFAIL flag set, because
+		 * such waiters may also be using interruptible sleeps, so
+		 * lk_exslpfail is only an upper bound, including the edge
+		 * cases.
+ */
+ realexslp = sleepq_sleepcnt(&lk->lock_object,
+ SQ_EXCLUSIVE_QUEUE);
+ if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) {
+ if (lk->lk_exslpfail < realexslp) {
+ lk->lk_exslpfail = 0;
+ queue = SQ_EXCLUSIVE_QUEUE;
+ v |= (x & LK_SHARED_WAITERS);
+ } else {
+ lk->lk_exslpfail = 0;
+ LOCK_LOG2(lk,
+ "%s: %p has only LK_SLEEPFAIL sleepers",
+ __func__, lk);
+ LOCK_LOG2(lk,
+ "%s: %p waking up threads on the exclusive queue",
+ __func__, lk);
+ wakeup_swapper =
+ sleepq_broadcast(&lk->lock_object,
+ SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE);
+ queue = SQ_SHARED_QUEUE;
+ }
+
+ } else {
+
+ /*
+ * Exclusive waiters sleeping with LK_SLEEPFAIL on
+ * and using interruptible sleeps/timeout may have
+			 * left spurious lk_exslpfail counts on, so clean
+ * it up anyway.
+ */
+ lk->lk_exslpfail = 0;
+ queue = SQ_SHARED_QUEUE;
+ }
+
+ if (lockmgr_sunlock_try(lk, &orig_x)) {
+ sleepq_release(&lk->lock_object);
+ break;
+ }
+
+ x |= LK_SHARERS_LOCK(1);
+ if (!atomic_fcmpset_rel_ptr(&lk->lk_lock, &x, v)) {
+ orig_x = x;
+ goto retry_sleepq;
+ }
+ LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue",
+ __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" :
+ "exclusive");
+ wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK,
+ 0, queue);
+ sleepq_release(&lk->lock_object);
+ break;
+ }
+
+ lockmgr_note_shared_release(lk, file, line);
+ return (wakeup_swapper);
+}
+
+static void
+assert_lockmgr(const struct lock_object *lock, int what)
+{
+
+ panic("lockmgr locks do not support assertions");
+}
+
+static void
+lock_lockmgr(struct lock_object *lock, uintptr_t how)
+{
+
+ panic("lockmgr locks do not support sleep interlocking");
+}
+
+static uintptr_t
+unlock_lockmgr(struct lock_object *lock)
+{
+
+ panic("lockmgr locks do not support sleep interlocking");
+}
+
+#ifdef KDTRACE_HOOKS
+static int
+owner_lockmgr(const struct lock_object *lock, struct thread **owner)
+{
+
+ panic("lockmgr locks do not support owner inquiring");
+}
+#endif
+
+void
+lockinit(struct lock *lk, int pri, const char *wmesg, int timo, int flags)
+{
+ int iflags;
+
+ MPASS((flags & ~LK_INIT_MASK) == 0);
+ ASSERT_ATOMIC_LOAD_PTR(lk->lk_lock,
+ ("%s: lockmgr not aligned for %s: %p", __func__, wmesg,
+ &lk->lk_lock));
+
+ iflags = LO_SLEEPABLE | LO_UPGRADABLE;
+ if (flags & LK_CANRECURSE)
+ iflags |= LO_RECURSABLE;
+ if ((flags & LK_NODUP) == 0)
+ iflags |= LO_DUPOK;
+ if (flags & LK_NOPROFILE)
+ iflags |= LO_NOPROFILE;
+ if ((flags & LK_NOWITNESS) == 0)
+ iflags |= LO_WITNESS;
+ if (flags & LK_QUIET)
+ iflags |= LO_QUIET;
+ if (flags & LK_IS_VNODE)
+ iflags |= LO_IS_VNODE;
+ iflags |= flags & (LK_ADAPTIVE | LK_NOSHARE);
+
+ lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL, iflags);
+ lk->lk_lock = LK_UNLOCKED;
+ lk->lk_recurse = 0;
+ lk->lk_exslpfail = 0;
+ lk->lk_timo = timo;
+ lk->lk_pri = pri;
+ STACK_ZERO(lk);
+}
+
+/*
+ * XXX: Gross hacks to manipulate external lock flags after
+ * initialization. Used for certain vnode and buf locks.
+ */
+void
+lockallowshare(struct lock *lk)
+{
+
+ lockmgr_assert(lk, KA_XLOCKED);
+ lk->lock_object.lo_flags &= ~LK_NOSHARE;
+}
+
+void
+lockdisableshare(struct lock *lk)
+{
+
+ lockmgr_assert(lk, KA_XLOCKED);
+ lk->lock_object.lo_flags |= LK_NOSHARE;
+}
+
+void
+lockallowrecurse(struct lock *lk)
+{
+
+ lockmgr_assert(lk, KA_XLOCKED);
+ lk->lock_object.lo_flags |= LO_RECURSABLE;
+}
+
+void
+lockdisablerecurse(struct lock *lk)
+{
+
+ lockmgr_assert(lk, KA_XLOCKED);
+ lk->lock_object.lo_flags &= ~LO_RECURSABLE;
+}
+
+void
+lockdestroy(struct lock *lk)
+{
+
+ KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held"));
+ KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed"));
+ KASSERT(lk->lk_exslpfail == 0, ("lockmgr still exclusive waiters"));
+ lock_destroy(&lk->lock_object);
+}
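+
+/*
+ * Illustrative usage sketch, not taken from the upstream file (the wmesg,
+ * priority and flags below are arbitrary examples): a typical consumer
+ * initializes the lock once and brackets its critical sections with
+ * lockmgr(9) requests, e.g.
+ *
+ *	struct lock lk;
+ *
+ *	lockinit(&lk, PVFS, "examplelk", 0, LK_CANRECURSE);
+ *	lockmgr(&lk, LK_EXCLUSIVE, NULL);
+ *	...critical section...
+ *	lockmgr(&lk, LK_RELEASE, NULL);
+ *	lockdestroy(&lk);
+ */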
+
+static bool __always_inline
+lockmgr_slock_try(struct lock *lk, uintptr_t *xp, int flags, bool fp)
+{
+
+ /*
+	 * If the lock can be granted in shared mode (see LK_CAN_SHARE()),
+	 * bump the count of sharers.  Since we have to preserve the state
+	 * of waiters, if we fail the atomic update loop back and retry.
+ */
+ *xp = lk->lk_lock;
+ while (LK_CAN_SHARE(*xp, flags, fp)) {
+ if (atomic_fcmpset_acq_ptr(&lk->lk_lock, xp,
+ *xp + LK_ONE_SHARER)) {
+ return (true);
+ }
+ }
+ return (false);
+}
+
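+/*
+ * Fast path for dropping a shared hold: as long as more than one sharer
+ * remains or there are no waiters to wake up, just decrement the sharer
+ * count atomically; otherwise report failure so the caller takes the
+ * hard path.
+ */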
+static bool __always_inline
+lockmgr_sunlock_try(struct lock *lk, uintptr_t *xp)
+{
+
+ for (;;) {
+ if (LK_SHARERS(*xp) > 1 || !(*xp & LK_ALL_WAITERS)) {
+ if (atomic_fcmpset_rel_ptr(&lk->lk_lock, xp,
+ *xp - LK_ONE_SHARER))
+ return (true);
+ continue;
+ }
+ break;
+ }
+ return (false);
+}
+
+static __noinline int
+lockmgr_slock_hard(struct lock *lk, u_int flags, struct lock_object *ilk,
+ const char *file, int line, struct lockmgr_wait *lwa)
+{
+ uintptr_t tid, x;
+ int error = 0;
+ const char *iwmesg;
+ int ipri, itimo;
+
+#ifdef KDTRACE_HOOKS
+ uint64_t sleep_time = 0;
+#endif
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+
+ if (__predict_false(panicstr != NULL))
+ goto out;
+
+ tid = (uintptr_t)curthread;
+
+ if (LK_CAN_WITNESS(flags))
+ WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER,
+ file, line, flags & LK_INTERLOCK ? ilk : NULL);
+ for (;;) {
+ if (lockmgr_slock_try(lk, &x, flags, false))
+ break;
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&lk->lock_object,
+ &contested, &waittime);
+
+ /*
+ * If the lock is already held by curthread in
+ * exclusive way avoid a deadlock.
+ */
+ if (LK_HOLDER(x) == tid) {
+ LOCK_LOG2(lk,
+ "%s: %p already held in exclusive mode",
+ __func__, lk);
+ error = EDEADLK;
+ break;
+ }
+
+ /*
+ * If the lock is expected to not sleep just give up
+ * and return.
+ */
+ if (LK_TRYOP(flags)) {
+ LOCK_LOG2(lk, "%s: %p fails the try operation",
+ __func__, lk);
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * Acquire the sleepqueue chain lock because we
+		 * probably will need to manipulate waiters flags.
+ */
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock;
+retry_sleepq:
+
+ /*
+		 * If the lock can be acquired in shared mode, try
+ * again.
+ */
+ if (LK_CAN_SHARE(x, flags, false)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+
+ /*
+ * Try to set the LK_SHARED_WAITERS flag. If we fail,
+ * loop back and retry.
+ */
+ if ((x & LK_SHARED_WAITERS) == 0) {
+ if (!atomic_fcmpset_acq_ptr(&lk->lk_lock, &x,
+ x | LK_SHARED_WAITERS)) {
+ goto retry_sleepq;
+ }
+ LOCK_LOG2(lk, "%s: %p set shared waiters flag",
+ __func__, lk);
+ }
+
+ if (lwa == NULL) {
+ iwmesg = lk->lock_object.lo_name;
+ ipri = lk->lk_pri;
+ itimo = lk->lk_timo;
+ } else {
+ iwmesg = lwa->iwmesg;
+ ipri = lwa->ipri;
+ itimo = lwa->itimo;
+ }
+
+ /*
+		 * Since we have been unable to acquire the
+ * shared lock and the shared waiters flag is set,
+ * we will sleep.
+ */
+#ifdef KDTRACE_HOOKS
+ sleep_time -= lockstat_nsecs(&lk->lock_object);
+#endif
+ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo,
+ SQ_SHARED_QUEUE);
+#ifdef KDTRACE_HOOKS
+ sleep_time += lockstat_nsecs(&lk->lock_object);
+#endif
+ flags &= ~LK_INTERLOCK;
+ if (error) {
+ LOCK_LOG3(lk,
+ "%s: interrupted sleep for %p with %d",
+ __func__, lk, error);
+ break;
+ }
+ LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
+ __func__, lk);
+ }
+ if (error == 0) {
+#ifdef KDTRACE_HOOKS
+ if (sleep_time != 0)
+ LOCKSTAT_RECORD4(lockmgr__block, lk, sleep_time,
+ LOCKSTAT_READER, (x & LK_SHARE) == 0,
+ (x & LK_SHARE) == 0 ? 0 : LK_SHARERS(x));
+#endif
+#ifdef LOCK_PROFILING
+ lockmgr_note_shared_acquire(lk, contested, waittime,
+ file, line, flags);
+#else
+ lockmgr_note_shared_acquire(lk, 0, 0, file, line,
+ flags);
+#endif
+ }
+
+out:
+ lockmgr_exit(flags, ilk, 0);
+ return (error);
+}
+
+static __noinline int
+lockmgr_xlock_hard(struct lock *lk, u_int flags, struct lock_object *ilk,
+ const char *file, int line, struct lockmgr_wait *lwa)
+{
+ struct lock_class *class;
+ uintptr_t tid, x, v;
+ int error = 0;
+ const char *iwmesg;
+ int ipri, itimo;
+
+#ifdef KDTRACE_HOOKS
+ uint64_t sleep_time = 0;
+#endif
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+
+ if (__predict_false(panicstr != NULL))
+ goto out;
+
+ tid = (uintptr_t)curthread;
+
+ if (LK_CAN_WITNESS(flags))
+ WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER |
+ LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ?
+ ilk : NULL);
+
+ /*
+ * If curthread already holds the lock and this one is
+ * allowed to recurse, simply recurse on it.
+ */
+ if (lockmgr_xlocked(lk)) {
+ if ((flags & LK_CANRECURSE) == 0 &&
+ (lk->lock_object.lo_flags & LO_RECURSABLE) == 0) {
+ /*
+			 * If this is a try operation, just give up
+			 * and return instead of panicking.
+ */
+ if (LK_TRYOP(flags)) {
+ LOCK_LOG2(lk,
+ "%s: %p fails the try operation",
+ __func__, lk);
+ error = EBUSY;
+ goto out;
+ }
+ if (flags & LK_INTERLOCK) {
+ class = LOCK_CLASS(ilk);
+ class->lc_unlock(ilk);
+ }
+ panic("%s: recursing on non recursive lockmgr %p "
+ "@ %s:%d\n", __func__, lk, file, line);
+ }
+ lk->lk_recurse++;
+ LOCK_LOG2(lk, "%s: %p recursing", __func__, lk);
+ LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0,
+ lk->lk_recurse, file, line);
+ WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
+ LK_TRYWIT(flags), file, line);
+ TD_LOCKS_INC(curthread);
+ goto out;
+ }
+
+ for (;;) {
+ if (lk->lk_lock == LK_UNLOCKED &&
+ atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid))
+ break;
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&lk->lock_object,
+ &contested, &waittime);
+
+ /*
+ * If the lock is expected to not sleep just give up
+ * and return.
+ */
+ if (LK_TRYOP(flags)) {
+ LOCK_LOG2(lk, "%s: %p fails the try operation",
+ __func__, lk);
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * Acquire the sleepqueue chain lock because we
+		 * probably will need to manipulate waiters flags.
+ */
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock;
+retry_sleepq:
+
+ /*
+		 * If the lock has been released while we spun on
+		 * the sleepqueue chain lock, just try again.
+ */
+ if (x == LK_UNLOCKED) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+
+ /*
+		 * The lock can be in a state where there is a
+		 * pending queue of waiters but still no owner.
+		 * This happens when the lock is contested and an
+		 * owner is about to claim it.
+		 * If curthread is the one that successfully acquires
+		 * it, claim lock ownership and return, preserving the
+		 * waiters flags.
+ */
+ v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
+ if ((x & ~v) == LK_UNLOCKED) {
+ v &= ~LK_EXCLUSIVE_SPINNERS;
+ if (atomic_fcmpset_acq_ptr(&lk->lk_lock, &x,
+ tid | v)) {
+ sleepq_release(&lk->lock_object);
+ LOCK_LOG2(lk,
+ "%s: %p claimed by a new writer",
+ __func__, lk);
+ break;
+ }
+ goto retry_sleepq;
+ }
+
+ /*
+ * Try to set the LK_EXCLUSIVE_WAITERS flag. If we
+ * fail, loop back and retry.
+ */
+ if ((x & LK_EXCLUSIVE_WAITERS) == 0) {
+ if (!atomic_fcmpset_ptr(&lk->lk_lock, &x,
+ x | LK_EXCLUSIVE_WAITERS)) {
+ goto retry_sleepq;
+ }
+ LOCK_LOG2(lk, "%s: %p set excl waiters flag",
+ __func__, lk);
+ }
+
+ if (lwa == NULL) {
+ iwmesg = lk->lock_object.lo_name;
+ ipri = lk->lk_pri;
+ itimo = lk->lk_timo;
+ } else {
+ iwmesg = lwa->iwmesg;
+ ipri = lwa->ipri;
+ itimo = lwa->itimo;
+ }
+
+ /*
+		 * Since we have been unable to acquire the
+ * exclusive lock and the exclusive waiters flag
+ * is set, we will sleep.
+ */
+#ifdef KDTRACE_HOOKS
+ sleep_time -= lockstat_nsecs(&lk->lock_object);
+#endif
+ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo,
+ SQ_EXCLUSIVE_QUEUE);
+#ifdef KDTRACE_HOOKS
+ sleep_time += lockstat_nsecs(&lk->lock_object);
+#endif
+ flags &= ~LK_INTERLOCK;
+ if (error) {
+ LOCK_LOG3(lk,
+ "%s: interrupted sleep for %p with %d",
+ __func__, lk, error);
+ break;
+ }
+ LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
+ __func__, lk);
+ }
+ if (error == 0) {
+#ifdef KDTRACE_HOOKS
+ if (sleep_time != 0)
+ LOCKSTAT_RECORD4(lockmgr__block, lk, sleep_time,
+ LOCKSTAT_WRITER, (x & LK_SHARE) == 0,
+ (x & LK_SHARE) == 0 ? 0 : LK_SHARERS(x));
+#endif
+#ifdef LOCK_PROFILING
+ lockmgr_note_exclusive_acquire(lk, contested, waittime,
+ file, line, flags);
+#else
+ lockmgr_note_exclusive_acquire(lk, 0, 0, file, line,
+ flags);
+#endif
+ }
+
+out:
+ lockmgr_exit(flags, ilk, 0);
+ return (error);
+}
+
+static __noinline int
+lockmgr_upgrade(struct lock *lk, u_int flags, struct lock_object *ilk,
+ const char *file, int line, struct lockmgr_wait *lwa)
+{
+ uintptr_t tid, x, v;
+ int error = 0;
+ int wakeup_swapper = 0;
+ int op;
+
+ if (__predict_false(panicstr != NULL))
+ goto out;
+
+ tid = (uintptr_t)curthread;
+
+ _lockmgr_assert(lk, KA_SLOCKED, file, line);
+ v = lk->lk_lock;
+ x = v & LK_ALL_WAITERS;
+ v &= LK_EXCLUSIVE_SPINNERS;
+
+ /*
+ * Try to switch from one shared lock to an exclusive one.
+ * We need to preserve waiters flags during the operation.
+ */
+ if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v,
+ tid | x)) {
+ LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file,
+ line);
+ WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE |
+ LK_TRYWIT(flags), file, line);
+ LOCKSTAT_RECORD0(lockmgr__upgrade, lk);
+ TD_SLOCKS_DEC(curthread);
+ goto out;
+ }
+
+ op = flags & LK_TYPE_MASK;
+
+ /*
+ * In LK_TRYUPGRADE mode, do not drop the lock,
+ * returning EBUSY instead.
+ */
+ if (op == LK_TRYUPGRADE) {
+ LOCK_LOG2(lk, "%s: %p failed the nowait upgrade",
+ __func__, lk);
+ error = EBUSY;
+ goto out;
+ }
+
+ /*
+ * We have been unable to succeed in upgrading, so just
+ * give up the shared lock.
+ */
+ wakeup_swapper |= wakeupshlk(lk, file, line);
+ error = lockmgr_xlock_hard(lk, flags, ilk, file, line, lwa);
+ flags &= ~LK_INTERLOCK;
+out:
+ lockmgr_exit(flags, ilk, wakeup_swapper);
+ return (error);
+}
+
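+/*
+ * Fast-path lock entry point: try an uncontended shared or exclusive
+ * acquisition with a single atomic operation and fall back to the
+ * *_hard() helpers (or __lockmgr_args() for other request types) when
+ * that is not possible.
+ */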
+int
+lockmgr_lock_fast_path(struct lock *lk, u_int flags, struct lock_object *ilk,
+ const char *file, int line)
+{
+ struct lock_class *class;
+ uintptr_t x, tid;
+ u_int op;
+ bool locked;
+
+ if (__predict_false(panicstr != NULL))
+ return (0);
+
+ op = flags & LK_TYPE_MASK;
+ locked = false;
+ switch (op) {
+ case LK_SHARED:
+ if (LK_CAN_WITNESS(flags))
+ WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER,
+ file, line, flags & LK_INTERLOCK ? ilk : NULL);
+ if (__predict_false(lk->lock_object.lo_flags & LK_NOSHARE))
+ break;
+ if (lockmgr_slock_try(lk, &x, flags, true)) {
+ lockmgr_note_shared_acquire(lk, 0, 0,
+ file, line, flags);
+ locked = true;
+ } else {
+ return (lockmgr_slock_hard(lk, flags, ilk, file, line,
+ NULL));
+ }
+ break;
+ case LK_EXCLUSIVE:
+ if (LK_CAN_WITNESS(flags))
+ WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER |
+ LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ?
+ ilk : NULL);
+ tid = (uintptr_t)curthread;
+ if (lk->lk_lock == LK_UNLOCKED &&
+ atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) {
+ lockmgr_note_exclusive_acquire(lk, 0, 0, file, line,
+ flags);
+ locked = true;
+ } else {
+ return (lockmgr_xlock_hard(lk, flags, ilk, file, line,
+ NULL));
+ }
+ break;
+ case LK_UPGRADE:
+ case LK_TRYUPGRADE:
+ return (lockmgr_upgrade(lk, flags, ilk, file, line, NULL));
+ default:
+ break;
+ }
+ if (__predict_true(locked)) {
+ if (__predict_false(flags & LK_INTERLOCK)) {
+ class = LOCK_CLASS(ilk);
+ class->lc_unlock(ilk);
+ }
+ return (0);
+ } else {
+ return (__lockmgr_args(lk, flags, ilk, LK_WMESG_DEFAULT,
+ LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, file, line));
+ }
+}
+
+static __noinline int
+lockmgr_sunlock_hard(struct lock *lk, uintptr_t x, u_int flags, struct lock_object *ilk,
+ const char *file, int line)
+
+{
+ int wakeup_swapper = 0;
+
+ if (__predict_false(panicstr != NULL))
+ goto out;
+
+ wakeup_swapper = wakeupshlk(lk, file, line);
+
+out:
+ lockmgr_exit(flags, ilk, wakeup_swapper);
+ return (0);
+}
+
+static __noinline int
+lockmgr_xunlock_hard(struct lock *lk, uintptr_t x, u_int flags, struct lock_object *ilk,
+ const char *file, int line)
+{
+ uintptr_t tid, v;
+ int wakeup_swapper = 0;
+ u_int realexslp;
+ int queue;
+
+ if (__predict_false(panicstr != NULL))
+ goto out;
+
+ tid = (uintptr_t)curthread;
+
+ /*
+	 * As a first option, treat the lock as if it has no
+	 * waiters.
+	 * Fix up the tid variable if the lock has been disowned.
+ */
+ if (LK_HOLDER(x) == LK_KERNPROC)
+ tid = LK_KERNPROC;
+ else {
+ WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line);
+ TD_LOCKS_DEC(curthread);
+ }
+ LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line);
+
+ /*
+ * The lock is held in exclusive mode.
+ * If the lock is recursed also, then unrecurse it.
+ */
+ if (lockmgr_xlocked_v(x) && lockmgr_recursed(lk)) {
+ LOCK_LOG2(lk, "%s: %p unrecursing", __func__, lk);
+ lk->lk_recurse--;
+ goto out;
+ }
+ if (tid != LK_KERNPROC)
+ LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk,
+ LOCKSTAT_WRITER);
+
+ if (x == tid && atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED))
+ goto out;
+
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock;
+ v = LK_UNLOCKED;
+
+ /*
+	 * If the lock has exclusive waiters, give them
+	 * preference in order to avoid deadlock with
+	 * shared runners-up.
+	 * If interruptible sleeps left the exclusive queue
+	 * empty, avoid starvation of the threads sleeping
+	 * on the shared queue by giving them precedence
+	 * and clearing the exclusive waiters bit anyway.
+	 * Note that the lk_exslpfail count may overstate
+	 * the real number of waiters with the LK_SLEEPFAIL
+	 * flag set, because such waiters may also be using
+	 * interruptible sleeps, so lk_exslpfail is only an
+	 * upper bound, including the edge cases.
+ */
+ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
+ realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE);
+ if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) {
+ if (lk->lk_exslpfail < realexslp) {
+ lk->lk_exslpfail = 0;
+ queue = SQ_EXCLUSIVE_QUEUE;
+ v |= (x & LK_SHARED_WAITERS);
+ } else {
+ lk->lk_exslpfail = 0;
+ LOCK_LOG2(lk,
+ "%s: %p has only LK_SLEEPFAIL sleepers",
+ __func__, lk);
+ LOCK_LOG2(lk,
+ "%s: %p waking up threads on the exclusive queue",
+ __func__, lk);
+ wakeup_swapper = sleepq_broadcast(&lk->lock_object,
+ SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE);
+ queue = SQ_SHARED_QUEUE;
+ }
+ } else {
+
+ /*
+ * Exclusive waiters sleeping with LK_SLEEPFAIL
+ * on and using interruptible sleeps/timeout
+		 * may have left spurious lk_exslpfail counts
+ * on, so clean it up anyway.
+ */
+ lk->lk_exslpfail = 0;
+ queue = SQ_SHARED_QUEUE;
+ }
+
+ LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue",
+ __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" :
+ "exclusive");
+ atomic_store_rel_ptr(&lk->lk_lock, v);
+ wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue);
+ sleepq_release(&lk->lock_object);
+
+out:
+ lockmgr_exit(flags, ilk, wakeup_swapper);
+ return (0);
+}
+
+int
+lockmgr_unlock_fast_path(struct lock *lk, u_int flags, struct lock_object *ilk)
+{
+ struct lock_class *class;
+ uintptr_t x, tid;
+ const char *file;
+ int line;
+
+ if (__predict_false(panicstr != NULL))
+ return (0);
+
+ file = __FILE__;
+ line = __LINE__;
+
+ _lockmgr_assert(lk, KA_LOCKED, file, line);
+ x = lk->lk_lock;
+ if (__predict_true(x & LK_SHARE) != 0) {
+ if (lockmgr_sunlock_try(lk, &x)) {
+ lockmgr_note_shared_release(lk, file, line);
+ } else {
+ return (lockmgr_sunlock_hard(lk, x, flags, ilk, file, line));
+ }
+ } else {
+ tid = (uintptr_t)curthread;
+ if (!lockmgr_recursed(lk) &&
+ atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) {
+ lockmgr_note_exclusive_release(lk, file, line);
+ } else {
+ return (lockmgr_xunlock_hard(lk, x, flags, ilk, file, line));
+ }
+ }
+ if (__predict_false(flags & LK_INTERLOCK)) {
+ class = LOCK_CLASS(ilk);
+ class->lc_unlock(ilk);
+ }
+ return (0);
+}
+
+int
+__lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk,
+ const char *wmesg, int pri, int timo, const char *file, int line)
+{
+ GIANT_DECLARE;
+ struct lockmgr_wait lwa;
+ struct lock_class *class;
+ const char *iwmesg;
+ uintptr_t tid, v, x;
+ u_int op, realexslp;
+ int error, ipri, itimo, queue, wakeup_swapper;
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+
+ if (panicstr != NULL)
+ return (0);
+
+ error = 0;
+ tid = (uintptr_t)curthread;
+ op = (flags & LK_TYPE_MASK);
+ iwmesg = (wmesg == LK_WMESG_DEFAULT) ? lk->lock_object.lo_name : wmesg;
+ ipri = (pri == LK_PRIO_DEFAULT) ? lk->lk_pri : pri;
+ itimo = (timo == LK_TIMO_DEFAULT) ? lk->lk_timo : timo;
+
+ lwa.iwmesg = iwmesg;
+ lwa.ipri = ipri;
+ lwa.itimo = itimo;
+
+ MPASS((flags & ~LK_TOTAL_MASK) == 0);
+ KASSERT((op & (op - 1)) == 0,
+ ("%s: Invalid requested operation @ %s:%d", __func__, file, line));
+ KASSERT((flags & (LK_NOWAIT | LK_SLEEPFAIL)) == 0 ||
+ (op != LK_DOWNGRADE && op != LK_RELEASE),
+ ("%s: Invalid flags in regard of the operation desired @ %s:%d",
+ __func__, file, line));
+ KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL,
+ ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d",
+ __func__, file, line));
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread,
+ lk->lock_object.lo_name, file, line));
+
+ class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL;
+
+ if (lk->lock_object.lo_flags & LK_NOSHARE) {
+ switch (op) {
+ case LK_SHARED:
+ op = LK_EXCLUSIVE;
+ break;
+ case LK_UPGRADE:
+ case LK_TRYUPGRADE:
+ case LK_DOWNGRADE:
+ _lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED,
+ file, line);
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ return (0);
+ }
+ }
+
+ wakeup_swapper = 0;
+ switch (op) {
+ case LK_SHARED:
+ return (lockmgr_slock_hard(lk, flags, ilk, file, line, &lwa));
+ break;
+ case LK_UPGRADE:
+ case LK_TRYUPGRADE:
+ return (lockmgr_upgrade(lk, flags, ilk, file, line, &lwa));
+ break;
+ case LK_EXCLUSIVE:
+ return (lockmgr_xlock_hard(lk, flags, ilk, file, line, &lwa));
+ break;
+ case LK_DOWNGRADE:
+ _lockmgr_assert(lk, KA_XLOCKED, file, line);
+ WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line);
+
+ /*
+ * Panic if the lock is recursed.
+ */
+ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) {
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n",
+ __func__, iwmesg, file, line);
+ }
+ TD_SLOCKS_INC(curthread);
+
+ /*
+ * In order to preserve waiters flags, just spin.
+ */
+ for (;;) {
+ x = lk->lk_lock;
+ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
+ x &= LK_ALL_WAITERS;
+ if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x,
+ LK_SHARERS_LOCK(1) | x))
+ break;
+ cpu_spinwait();
+ }
+ LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line);
+ LOCKSTAT_RECORD0(lockmgr__downgrade, lk);
+ break;
+ case LK_RELEASE:
+ _lockmgr_assert(lk, KA_LOCKED, file, line);
+ x = lk->lk_lock;
+
+ if (__predict_true(x & LK_SHARE) != 0) {
+ return (lockmgr_sunlock_hard(lk, x, flags, ilk, file, line));
+ } else {
+ return (lockmgr_xunlock_hard(lk, x, flags, ilk, file, line));
+ }
+ break;
+ case LK_DRAIN:
+ if (LK_CAN_WITNESS(flags))
+ WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER |
+ LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ?
+ ilk : NULL);
+
+ /*
+ * Trying to drain a lock we already own will result in a
+ * deadlock.
+ */
+ if (lockmgr_xlocked(lk)) {
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ panic("%s: draining %s with the lock held @ %s:%d\n",
+ __func__, iwmesg, file, line);
+ }
+
+ for (;;) {
+ if (lk->lk_lock == LK_UNLOCKED &&
+ atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid))
+ break;
+
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&lk->lock_object,
+ &contested, &waittime);
+
+ /*
+ * If the lock is expected to not sleep just give up
+ * and return.
+ */
+ if (LK_TRYOP(flags)) {
+ LOCK_LOG2(lk, "%s: %p fails the try operation",
+ __func__, lk);
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * Acquire the sleepqueue chain lock because we
+			 * probably will need to manipulate waiters flags.
+ */
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock;
+
+ /*
+			 * If the lock has been released while we spun on
+			 * the sleepqueue chain lock, just try again.
+ */
+ if (x == LK_UNLOCKED) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+
+ v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
+ if ((x & ~v) == LK_UNLOCKED) {
+ v = (x & ~LK_EXCLUSIVE_SPINNERS);
+
+ /*
+				 * If interruptible sleeps left the exclusive
+				 * queue empty, avoid starvation of the
+				 * threads sleeping on the shared queue by
+				 * giving them precedence and clearing the
+				 * exclusive waiters bit anyway.
+				 * Note that the lk_exslpfail count may
+				 * overstate the real number of waiters with
+				 * the LK_SLEEPFAIL flag set, because such
+				 * waiters may also be using interruptible
+				 * sleeps, so lk_exslpfail is only an upper
+				 * bound, including the edge cases.
+ */
+ if (v & LK_EXCLUSIVE_WAITERS) {
+ queue = SQ_EXCLUSIVE_QUEUE;
+ v &= ~LK_EXCLUSIVE_WAITERS;
+ } else {
+
+ /*
+ * Exclusive waiters sleeping with
+ * LK_SLEEPFAIL on and using
+ * interruptible sleeps/timeout may
+					 * have left spurious lk_exslpfail
+ * counts on, so clean it up anyway.
+ */
+ MPASS(v & LK_SHARED_WAITERS);
+ lk->lk_exslpfail = 0;
+ queue = SQ_SHARED_QUEUE;
+ v &= ~LK_SHARED_WAITERS;
+ }
+ if (queue == SQ_EXCLUSIVE_QUEUE) {
+ realexslp =
+ sleepq_sleepcnt(&lk->lock_object,
+ SQ_EXCLUSIVE_QUEUE);
+ if (lk->lk_exslpfail >= realexslp) {
+ lk->lk_exslpfail = 0;
+ queue = SQ_SHARED_QUEUE;
+ v &= ~LK_SHARED_WAITERS;
+ if (realexslp != 0) {
+ LOCK_LOG2(lk,
+ "%s: %p has only LK_SLEEPFAIL sleepers",
+ __func__, lk);
+ LOCK_LOG2(lk,
+ "%s: %p waking up threads on the exclusive queue",
+ __func__, lk);
+ wakeup_swapper =
+ sleepq_broadcast(
+ &lk->lock_object,
+ SLEEPQ_LK, 0,
+ SQ_EXCLUSIVE_QUEUE);
+ }
+ } else
+ lk->lk_exslpfail = 0;
+ }
+ if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ LOCK_LOG3(lk,
+ "%s: %p waking up all threads on the %s queue",
+ __func__, lk, queue == SQ_SHARED_QUEUE ?
+ "shared" : "exclusive");
+ wakeup_swapper |= sleepq_broadcast(
+ &lk->lock_object, SLEEPQ_LK, 0, queue);
+
+ /*
+				 * If shared waiters have been woken up, we need
+				 * to wait for one of them to acquire the lock
+				 * before setting the exclusive waiters bit, in
+				 * order to avoid a deadlock.
+ */
+ if (queue == SQ_SHARED_QUEUE) {
+ for (v = lk->lk_lock;
+ (v & LK_SHARE) && !LK_SHARERS(v);
+ v = lk->lk_lock)
+ cpu_spinwait();
+ }
+ }
+
+ /*
+ * Try to set the LK_EXCLUSIVE_WAITERS flag. If we
+ * fail, loop back and retry.
+ */
+ if ((x & LK_EXCLUSIVE_WAITERS) == 0) {
+ if (!atomic_cmpset_ptr(&lk->lk_lock, x,
+ x | LK_EXCLUSIVE_WAITERS)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ LOCK_LOG2(lk, "%s: %p set drain waiters flag",
+ __func__, lk);
+ }
+
+ /*
+			 * Since we have been unable to acquire the
+ * exclusive lock and the exclusive waiters flag
+ * is set, we will sleep.
+ */
+ if (flags & LK_INTERLOCK) {
+ class->lc_unlock(ilk);
+ flags &= ~LK_INTERLOCK;
+ }
+ GIANT_SAVE();
+ sleepq_add(&lk->lock_object, NULL, iwmesg, SLEEPQ_LK,
+ SQ_EXCLUSIVE_QUEUE);
+ sleepq_wait(&lk->lock_object, ipri & PRIMASK);
+ GIANT_RESTORE();
+ LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
+ __func__, lk);
+ }
+
+ if (error == 0) {
+ lock_profile_obtain_lock_success(&lk->lock_object,
+ contested, waittime, file, line);
+ LOCK_LOG_LOCK("DRAIN", &lk->lock_object, 0,
+ lk->lk_recurse, file, line);
+ WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
+ LK_TRYWIT(flags), file, line);
+ TD_LOCKS_INC(curthread);
+ STACK_SAVE(lk);
+ }
+ break;
+ default:
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ panic("%s: unknown lockmgr request 0x%x\n", __func__, op);
+ }
+
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ if (wakeup_swapper)
+ kick_proc0();
+
+ return (error);
+}
+
+void
+_lockmgr_disown(struct lock *lk, const char *file, int line)
+{
+ uintptr_t tid, x;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ tid = (uintptr_t)curthread;
+ _lockmgr_assert(lk, KA_XLOCKED, file, line);
+
+ /*
+ * Panic if the lock is recursed.
+ */
+ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk))
+ panic("%s: disown a recursed lockmgr @ %s:%d\n",
+ __func__, file, line);
+
+ /*
+ * If the owner is already LK_KERNPROC just skip the whole operation.
+ */
+ if (LK_HOLDER(lk->lk_lock) != tid)
+ return;
+ lock_profile_release_lock(&lk->lock_object);
+ LOCKSTAT_RECORD1(lockmgr__disown, lk, LOCKSTAT_WRITER);
+ LOCK_LOG_LOCK("XDISOWN", &lk->lock_object, 0, 0, file, line);
+ WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line);
+ TD_LOCKS_DEC(curthread);
+ STACK_SAVE(lk);
+
+ /*
+ * In order to preserve waiters flags, just spin.
+ */
+ for (;;) {
+ x = lk->lk_lock;
+ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
+ x &= LK_ALL_WAITERS;
+ if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x,
+ LK_KERNPROC | x))
+ return;
+ cpu_spinwait();
+ }
+}
+
+void
+lockmgr_printinfo(const struct lock *lk)
+{
+ struct thread *td;
+ uintptr_t x;
+
+ if (lk->lk_lock == LK_UNLOCKED)
+ printf("lock type %s: UNLOCKED\n", lk->lock_object.lo_name);
+ else if (lk->lk_lock & LK_SHARE)
+ printf("lock type %s: SHARED (count %ju)\n",
+ lk->lock_object.lo_name,
+ (uintmax_t)LK_SHARERS(lk->lk_lock));
+ else {
+ td = lockmgr_xholder(lk);
+ if (td == (struct thread *)LK_KERNPROC)
+ printf("lock type %s: EXCL by KERNPROC\n",
+ lk->lock_object.lo_name);
+ else
+ printf("lock type %s: EXCL by thread %p "
+ "(pid %d, %s, tid %d)\n", lk->lock_object.lo_name,
+ td, td->td_proc->p_pid, td->td_proc->p_comm,
+ td->td_tid);
+ }
+
+ x = lk->lk_lock;
+ if (x & LK_EXCLUSIVE_WAITERS)
+ printf(" with exclusive waiters pending\n");
+ if (x & LK_SHARED_WAITERS)
+ printf(" with shared waiters pending\n");
+ if (x & LK_EXCLUSIVE_SPINNERS)
+ printf(" with exclusive spinners pending\n");
+
+ STACK_PRINT(lk);
+}
+
+int
+lockstatus(const struct lock *lk)
+{
+ uintptr_t v, x;
+ int ret;
+
+ ret = LK_SHARED;
+ x = lk->lk_lock;
+ v = LK_HOLDER(x);
+
+ if ((x & LK_SHARE) == 0) {
+ if (v == (uintptr_t)curthread || v == LK_KERNPROC)
+ ret = LK_EXCLUSIVE;
+ else
+ ret = LK_EXCLOTHER;
+ } else if (x == LK_UNLOCKED)
+ ret = 0;
+
+ return (ret);
+}
+
+#ifdef INVARIANT_SUPPORT
+
+FEATURE(invariant_support,
+ "Support for modules compiled with INVARIANTS option");
+
+#ifndef INVARIANTS
+#undef _lockmgr_assert
+#endif
+
+void
+_lockmgr_assert(const struct lock *lk, int what, const char *file, int line)
+{
+ int slocked = 0;
+
+ if (panicstr != NULL)
+ return;
+ switch (what) {
+ case KA_SLOCKED:
+ case KA_SLOCKED | KA_NOTRECURSED:
+ case KA_SLOCKED | KA_RECURSED:
+ slocked = 1;
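+		/* FALLTHROUGH */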
+ case KA_LOCKED:
+ case KA_LOCKED | KA_NOTRECURSED:
+ case KA_LOCKED | KA_RECURSED:
+#ifdef WITNESS
+
+ /*
+ * We cannot trust WITNESS if the lock is held in exclusive
+ * mode and a call to lockmgr_disown() happened.
+		 * Work around this by skipping the check if the lock is held
+		 * in exclusive mode, even for the KA_LOCKED case.
+ */
+ if (slocked || (lk->lk_lock & LK_SHARE)) {
+ witness_assert(&lk->lock_object, what, file, line);
+ break;
+ }
+#endif
+ if (lk->lk_lock == LK_UNLOCKED ||
+ ((lk->lk_lock & LK_SHARE) == 0 && (slocked ||
+ (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk)))))
+ panic("Lock %s not %slocked @ %s:%d\n",
+ lk->lock_object.lo_name, slocked ? "share" : "",
+ file, line);
+
+ if ((lk->lk_lock & LK_SHARE) == 0) {
+ if (lockmgr_recursed(lk)) {
+ if (what & KA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ lk->lock_object.lo_name, file,
+ line);
+ } else if (what & KA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ }
+ break;
+ case KA_XLOCKED:
+ case KA_XLOCKED | KA_NOTRECURSED:
+ case KA_XLOCKED | KA_RECURSED:
+ if (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk))
+ panic("Lock %s not exclusively locked @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ if (lockmgr_recursed(lk)) {
+ if (what & KA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ } else if (what & KA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ break;
+ case KA_UNLOCKED:
+ if (lockmgr_xlocked(lk) || lockmgr_disowned(lk))
+ panic("Lock %s exclusively locked @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ break;
+ default:
+ panic("Unknown lockmgr assertion: %d @ %s:%d\n", what, file,
+ line);
+ }
+}
+#endif
+
+#ifdef DDB
+int
+lockmgr_chain(struct thread *td, struct thread **ownerp)
+{
+ struct lock *lk;
+
+ lk = td->td_wchan;
+
+ if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr)
+ return (0);
+ db_printf("blocked on lockmgr %s", lk->lock_object.lo_name);
+ if (lk->lk_lock & LK_SHARE)
+ db_printf("SHARED (count %ju)\n",
+ (uintmax_t)LK_SHARERS(lk->lk_lock));
+ else
+ db_printf("EXCL\n");
+ *ownerp = lockmgr_xholder(lk);
+
+ return (1);
+}
+
+static void
+db_show_lockmgr(const struct lock_object *lock)
+{
+ struct thread *td;
+ const struct lock *lk;
+
+ lk = (const struct lock *)lock;
+
+ db_printf(" state: ");
+ if (lk->lk_lock == LK_UNLOCKED)
+ db_printf("UNLOCKED\n");
+ else if (lk->lk_lock & LK_SHARE)
+ db_printf("SLOCK: %ju\n", (uintmax_t)LK_SHARERS(lk->lk_lock));
+ else {
+ td = lockmgr_xholder(lk);
+ if (td == (struct thread *)LK_KERNPROC)
+ db_printf("XLOCK: LK_KERNPROC\n");
+ else
+ db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid,
+ td->td_proc->p_comm);
+ if (lockmgr_recursed(lk))
+ db_printf(" recursed: %d\n", lk->lk_recurse);
+ }
+ db_printf(" waiters: ");
+ switch (lk->lk_lock & LK_ALL_WAITERS) {
+ case LK_SHARED_WAITERS:
+ db_printf("shared\n");
+ break;
+ case LK_EXCLUSIVE_WAITERS:
+ db_printf("exclusive\n");
+ break;
+ case LK_ALL_WAITERS:
+ db_printf("shared and exclusive\n");
+ break;
+ default:
+ db_printf("none\n");
+ }
+ db_printf(" spinners: ");
+ if (lk->lk_lock & LK_EXCLUSIVE_SPINNERS)
+ db_printf("exclusive\n");
+ else
+ db_printf("none\n");
+}
+#endif
diff --git a/freebsd/sys/kern/subr_pctrie.c b/freebsd/sys/kern/subr_pctrie.c
new file mode 100644
index 00000000..c5f2c06e
--- /dev/null
+++ b/freebsd/sys/kern/subr_pctrie.c
@@ -0,0 +1,695 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 EMC Corp.
+ * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Path-compressed radix trie implementation.
+ *
+ * The implementation takes into account the following rationale:
+ * - Size of the nodes should be as small as possible but still big enough
+ * to avoid a large maximum depth for the trie. This is a balance
+ * between the necessity to not wire too much physical memory for the nodes
+ * and the necessity to avoid too much cache pollution during the trie
+ * operations.
+ * - There is not a huge bias toward the number of lookup operations over
+ * the number of insert and remove operations. This basically implies
+ * that optimizations supposedly helping one operation but hurting the
+ *   other must be carefully evaluated.
+ * - On average not many nodes are expected to be fully populated, hence
+ * level compression may just complicate things.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/pctrie.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#define PCTRIE_MASK (PCTRIE_COUNT - 1)
+#define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)
+
+/* Flag bits stored in node pointers. */
+#define PCTRIE_ISLEAF 0x1
+#define PCTRIE_FLAGS 0x1
+#define PCTRIE_PAD PCTRIE_FLAGS
+
+/* Returns one unit associated with specified level. */
+#define PCTRIE_UNITLEVEL(lev) \
+ ((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
+
+struct pctrie_node {
+ uint64_t pn_owner; /* Owner of record. */
+ uint16_t pn_count; /* Valid children. */
+ uint16_t pn_clev; /* Current level. */
+ void *pn_child[PCTRIE_COUNT]; /* Child nodes. */
+};
+
+/*
+ * Allocate a node. Pre-allocation should ensure that the request
+ * will always be satisfied.
+ */
+static __inline struct pctrie_node *
+pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
+ uint16_t count, uint16_t clevel)
+{
+ struct pctrie_node *node;
+
+ node = allocfn(ptree);
+ if (node == NULL)
+ return (NULL);
+ node->pn_owner = owner;
+ node->pn_count = count;
+ node->pn_clev = clevel;
+
+ return (node);
+}
+
+/*
+ * Free radix node.
+ */
+static __inline void
+pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
+ pctrie_free_t freefn)
+{
+#ifdef INVARIANTS
+ int slot;
+
+ KASSERT(node->pn_count == 0,
+ ("pctrie_node_put: node %p has %d children", node,
+ node->pn_count));
+ for (slot = 0; slot < PCTRIE_COUNT; slot++)
+ KASSERT(node->pn_child[slot] == NULL,
+ ("pctrie_node_put: node %p has a child", node));
+#endif
+ freefn(ptree, node);
+}
+
+/*
+ * Return the position in the array for a given level.
+ */
+static __inline int
+pctrie_slot(uint64_t index, uint16_t level)
+{
+
+ return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
+}
+
+/* Trims the key after the specified level. */
+static __inline uint64_t
+pctrie_trimkey(uint64_t index, uint16_t level)
+{
+ uint64_t ret;
+
+ ret = index;
+ if (level > 0) {
+ ret >>= level * PCTRIE_WIDTH;
+ ret <<= level * PCTRIE_WIDTH;
+ }
+ return (ret);
+}
+
+/*
+ * Get the root node for a tree.
+ */
+static __inline struct pctrie_node *
+pctrie_getroot(struct pctrie *ptree)
+{
+
+ return ((struct pctrie_node *)ptree->pt_root);
+}
+
+/*
+ * Set the root node for a tree.
+ */
+static __inline void
+pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node)
+{
+
+ ptree->pt_root = (uintptr_t)node;
+}
+
+/*
+ * Returns TRUE if the specified node is a leaf and FALSE otherwise.
+ */
+static __inline boolean_t
+pctrie_isleaf(struct pctrie_node *node)
+{
+
+ return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
+}
+
+/*
+ * Returns the associated val extracted from node.
+ */
+static __inline uint64_t *
+pctrie_toval(struct pctrie_node *node)
+{
+
+ return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
+}
+
+/*
+ * Adds the val as a child of the provided node.
+ */
+static __inline void
+pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
+ uint64_t *val)
+{
+ int slot;
+
+ slot = pctrie_slot(index, clev);
+ node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF);
+}
+
+/*
+ * Returns the slot where two keys differ.
+ * It cannot accept 2 equal keys.
+ */
+static __inline uint16_t
+pctrie_keydiff(uint64_t index1, uint64_t index2)
+{
+ uint16_t clev;
+
+ KASSERT(index1 != index2, ("%s: passing the same key value %jx",
+ __func__, (uintmax_t)index1));
+
+ index1 ^= index2;
+ for (clev = PCTRIE_LIMIT;; clev--)
+ if (pctrie_slot(index1, clev) != 0)
+ return (clev);
+}
+
+/*
+ * Returns TRUE if it can be determined that key does not belong to the
+ * specified node. Otherwise, returns FALSE.
+ */
+static __inline boolean_t
+pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
+{
+
+ if (node->pn_clev < PCTRIE_LIMIT) {
+ idx = pctrie_trimkey(idx, node->pn_clev + 1);
+ return (idx != node->pn_owner);
+ }
+ return (FALSE);
+}
+
+/*
+ * Internal helper for pctrie_reclaim_allnodes().
+ * This function is recursive.
+ */
+static void
+pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
+ pctrie_free_t freefn)
+{
+ int slot;
+
+ KASSERT(node->pn_count <= PCTRIE_COUNT,
+ ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
+ for (slot = 0; node->pn_count != 0; slot++) {
+ if (node->pn_child[slot] == NULL)
+ continue;
+ if (!pctrie_isleaf(node->pn_child[slot]))
+ pctrie_reclaim_allnodes_int(ptree,
+ node->pn_child[slot], freefn);
+ node->pn_child[slot] = NULL;
+ node->pn_count--;
+ }
+ pctrie_node_put(ptree, node, freefn);
+}
+
+/*
+ * pctrie node zone initializer.
+ */
+int
+pctrie_zone_init(void *mem, int size __unused, int flags __unused)
+{
+ struct pctrie_node *node;
+
+ node = mem;
+ memset(node->pn_child, 0, sizeof(node->pn_child));
+ return (0);
+}
+
+size_t
+pctrie_node_size(void)
+{
+
+ return (sizeof(struct pctrie_node));
+}
+
+/*
+ * Inserts the key-value pair into the trie.
+ * Panics if the key already exists.
+ */
+int
+pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
+{
+ uint64_t index, newind;
+ void **parentp;
+ struct pctrie_node *node, *tmp;
+ uint64_t *m;
+ int slot;
+ uint16_t clev;
+
+ index = *val;
+
+ /*
+ * The owner of record for root is not really important because it
+ * will never be used.
+ */
+ node = pctrie_getroot(ptree);
+ if (node == NULL) {
+ ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
+ return (0);
+ }
+ parentp = (void **)&ptree->pt_root;
+ for (;;) {
+ if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m == index)
+ panic("%s: key %jx is already present",
+ __func__, (uintmax_t)index);
+ clev = pctrie_keydiff(*m, index);
+ tmp = pctrie_node_get(ptree, allocfn,
+ pctrie_trimkey(index, clev + 1), 2, clev);
+ if (tmp == NULL)
+ return (ENOMEM);
+ *parentp = tmp;
+ pctrie_addval(tmp, index, clev, val);
+ pctrie_addval(tmp, *m, clev, m);
+ return (0);
+ } else if (pctrie_keybarr(node, index))
+ break;
+ slot = pctrie_slot(index, node->pn_clev);
+ if (node->pn_child[slot] == NULL) {
+ node->pn_count++;
+ pctrie_addval(node, index, node->pn_clev, val);
+ return (0);
+ }
+ parentp = &node->pn_child[slot];
+ node = node->pn_child[slot];
+ }
+
+ /*
+	 * A new node is needed because the correct insertion level has been
+	 * reached.  Set up the new intermediate node and add the two
+	 * children: the new value and the older edge.
+ */
+ newind = node->pn_owner;
+ clev = pctrie_keydiff(newind, index);
+ tmp = pctrie_node_get(ptree, allocfn,
+ pctrie_trimkey(index, clev + 1), 2, clev);
+ if (tmp == NULL)
+ return (ENOMEM);
+ *parentp = tmp;
+ pctrie_addval(tmp, index, clev, val);
+ slot = pctrie_slot(newind, clev);
+ tmp->pn_child[slot] = node;
+
+ return (0);
+}
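+
+/*
+ * Illustrative usage sketch, not taken from the upstream file ('struct foo'
+ * and 'foo_alloc_node', a consumer-supplied pctrie_alloc_t callback, are
+ * arbitrary names): records kept in a pctrie embed their uint64_t key, and
+ * callers pass a pointer to that key field, e.g.
+ *
+ *	struct foo {
+ *		uint64_t	foo_key;	-- used as the index
+ *		...
+ *	};
+ *
+ *	error = pctrie_insert(&tree, &foo->foo_key, foo_alloc_node);
+ *	keyp = pctrie_lookup(&tree, key);	-- &foo->foo_key or NULL
+ */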
+
+/*
+ * Returns the value stored at the index. If the index is not present,
+ * NULL is returned.
+ */
+uint64_t *
+pctrie_lookup(struct pctrie *ptree, uint64_t index)
+{
+ struct pctrie_node *node;
+ uint64_t *m;
+ int slot;
+
+ node = pctrie_getroot(ptree);
+ while (node != NULL) {
+ if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m == index)
+ return (m);
+ else
+ break;
+ } else if (pctrie_keybarr(node, index))
+ break;
+ slot = pctrie_slot(index, node->pn_clev);
+ node = node->pn_child[slot];
+ }
+ return (NULL);
+}
+
+/*
+ * Look up the nearest entry at a position bigger than or equal to index.
+ */
+uint64_t *
+pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
+{
+ struct pctrie_node *stack[PCTRIE_LIMIT];
+ uint64_t inc;
+ uint64_t *m;
+ struct pctrie_node *child, *node;
+#ifdef INVARIANTS
+ int loops = 0;
+#endif
+ int slot, tos;
+
+ node = pctrie_getroot(ptree);
+ if (node == NULL)
+ return (NULL);
+ else if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m >= index)
+ return (m);
+ else
+ return (NULL);
+ }
+ tos = 0;
+ for (;;) {
+ /*
+ * If the keys differ before the current bisection node,
+		 * then the search key might roll back to the earliest
+ * available bisection node or to the smallest key
+ * in the current node (if the owner is bigger than the
+ * search key).
+ */
+ if (pctrie_keybarr(node, index)) {
+ if (index > node->pn_owner) {
+ascend:
+ KASSERT(++loops < 1000,
+ ("pctrie_lookup_ge: too many loops"));
+
+ /*
+ * Pop nodes from the stack until either the
+ * stack is empty or a node that could have a
+ * matching descendant is found.
+ */
+ do {
+ if (tos == 0)
+ return (NULL);
+ node = stack[--tos];
+ } while (pctrie_slot(index,
+ node->pn_clev) == (PCTRIE_COUNT - 1));
+
+ /*
+ * The following computation cannot overflow
+ * because index's slot at the current level
+ * is less than PCTRIE_COUNT - 1.
+ */
+ index = pctrie_trimkey(index,
+ node->pn_clev);
+ index += PCTRIE_UNITLEVEL(node->pn_clev);
+ } else
+ index = node->pn_owner;
+ KASSERT(!pctrie_keybarr(node, index),
+ ("pctrie_lookup_ge: keybarr failed"));
+ }
+ slot = pctrie_slot(index, node->pn_clev);
+ child = node->pn_child[slot];
+ if (pctrie_isleaf(child)) {
+ m = pctrie_toval(child);
+ if (*m >= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+
+ /*
+ * Look for an available edge or val within the current
+ * bisection node.
+ */
+ if (slot < (PCTRIE_COUNT - 1)) {
+ inc = PCTRIE_UNITLEVEL(node->pn_clev);
+ index = pctrie_trimkey(index, node->pn_clev);
+ do {
+ index += inc;
+ slot++;
+ child = node->pn_child[slot];
+ if (pctrie_isleaf(child)) {
+ m = pctrie_toval(child);
+ if (*m >= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+ } while (slot < (PCTRIE_COUNT - 1));
+ }
+ KASSERT(child == NULL || pctrie_isleaf(child),
+ ("pctrie_lookup_ge: child is radix node"));
+
+ /*
+ * If a value or edge bigger than the search slot is not found
+ * in the current node, ascend to the next higher-level node.
+ */
+ goto ascend;
+descend:
+ KASSERT(node->pn_clev > 0,
+ ("pctrie_lookup_ge: pushing leaf's parent"));
+ KASSERT(tos < PCTRIE_LIMIT,
+ ("pctrie_lookup_ge: stack overflow"));
+ stack[tos++] = node;
+ node = child;
+ }
+}
+
+/*
+ * Look up the nearest entry at a position less than or equal to index.
+ */
+uint64_t *
+pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
+{
+ struct pctrie_node *stack[PCTRIE_LIMIT];
+ uint64_t inc;
+ uint64_t *m;
+ struct pctrie_node *child, *node;
+#ifdef INVARIANTS
+ int loops = 0;
+#endif
+ int slot, tos;
+
+ node = pctrie_getroot(ptree);
+ if (node == NULL)
+ return (NULL);
+ else if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m <= index)
+ return (m);
+ else
+ return (NULL);
+ }
+ tos = 0;
+ for (;;) {
+ /*
+ * If the keys differ before the current bisection node,
+ * then the search key might roll back to the earliest
+ * available bisection node or to the largest key
+ * in the current node (if the owner is smaller than the
+ * search key).
+ */
+ if (pctrie_keybarr(node, index)) {
+ if (index > node->pn_owner) {
+ index = node->pn_owner + PCTRIE_COUNT *
+ PCTRIE_UNITLEVEL(node->pn_clev);
+ } else {
+ascend:
+ KASSERT(++loops < 1000,
+ ("pctrie_lookup_le: too many loops"));
+
+ /*
+ * Pop nodes from the stack until either the
+ * stack is empty or a node that could have a
+ * matching descendant is found.
+ */
+ do {
+ if (tos == 0)
+ return (NULL);
+ node = stack[--tos];
+ } while (pctrie_slot(index,
+ node->pn_clev) == 0);
+
+ /*
+ * The following computation cannot overflow
+ * because index's slot at the current level
+ * is greater than 0.
+ */
+ index = pctrie_trimkey(index,
+ node->pn_clev);
+ }
+ index--;
+ KASSERT(!pctrie_keybarr(node, index),
+ ("pctrie_lookup_le: keybarr failed"));
+ }
+ slot = pctrie_slot(index, node->pn_clev);
+ child = node->pn_child[slot];
+ if (pctrie_isleaf(child)) {
+ m = pctrie_toval(child);
+ if (*m <= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+
+ /*
+ * Look for an available edge or value within the current
+ * bisection node.
+ */
+ if (slot > 0) {
+ inc = PCTRIE_UNITLEVEL(node->pn_clev);
+ index |= inc - 1;
+ do {
+ index -= inc;
+ slot--;
+ child = node->pn_child[slot];
+ if (pctrie_isleaf(child)) {
+ m = pctrie_toval(child);
+ if (*m <= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+ } while (slot > 0);
+ }
+ KASSERT(child == NULL || pctrie_isleaf(child),
+ ("pctrie_lookup_le: child is radix node"));
+
+ /*
+ * If a value or edge smaller than the search slot is not found
+ * in the current node, ascend to the next higher-level node.
+ */
+ goto ascend;
+descend:
+ KASSERT(node->pn_clev > 0,
+ ("pctrie_lookup_le: pushing leaf's parent"));
+ KASSERT(tos < PCTRIE_LIMIT,
+ ("pctrie_lookup_le: stack overflow"));
+ stack[tos++] = node;
+ node = child;
+ }
+}
+
+/*
+ * Remove the specified index from the tree.
+ * Panics if the key is not present.
+ */
+void
+pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
+{
+ struct pctrie_node *node, *parent;
+ uint64_t *m;
+ int i, slot;
+
+ node = pctrie_getroot(ptree);
+ if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m != index)
+ panic("%s: invalid key found", __func__);
+ pctrie_setroot(ptree, NULL);
+ return;
+ }
+ parent = NULL;
+ for (;;) {
+ if (node == NULL)
+ panic("pctrie_remove: impossible to locate the key");
+ slot = pctrie_slot(index, node->pn_clev);
+ if (pctrie_isleaf(node->pn_child[slot])) {
+ m = pctrie_toval(node->pn_child[slot]);
+ if (*m != index)
+ panic("%s: invalid key found", __func__);
+ node->pn_child[slot] = NULL;
+ node->pn_count--;
+ if (node->pn_count > 1)
+ break;
+ for (i = 0; i < PCTRIE_COUNT; i++)
+ if (node->pn_child[i] != NULL)
+ break;
+ KASSERT(i != PCTRIE_COUNT,
+ ("%s: invalid node configuration", __func__));
+ if (parent == NULL)
+ pctrie_setroot(ptree, node->pn_child[i]);
+ else {
+ slot = pctrie_slot(index, parent->pn_clev);
+ KASSERT(parent->pn_child[slot] == node,
+ ("%s: invalid child value", __func__));
+ parent->pn_child[slot] = node->pn_child[i];
+ }
+ node->pn_count--;
+ node->pn_child[i] = NULL;
+ pctrie_node_put(ptree, node, freefn);
+ break;
+ }
+ parent = node;
+ node = node->pn_child[slot];
+ }
+}
+
+/*
+ * Remove and free all the nodes from the tree.
+ * This function is recursive, but recursion is tightly bounded because the
+ * maximum depth of the tree is fixed.
+ */
+void
+pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
+{
+ struct pctrie_node *root;
+
+ root = pctrie_getroot(ptree);
+ if (root == NULL)
+ return;
+ pctrie_setroot(ptree, NULL);
+ if (!pctrie_isleaf(root))
+ pctrie_reclaim_allnodes_int(ptree, root, freefn);
+}
+
+#ifdef DDB
+/*
+ * Show details about the given node.
+ */
+DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
+{
+ struct pctrie_node *node;
+ int i;
+
+ if (!have_addr)
+ return;
+ node = (struct pctrie_node *)addr;
+ db_printf("node %p, owner %jx, children count %u, level %u:\n",
+ (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
+ node->pn_clev);
+ for (i = 0; i < PCTRIE_COUNT; i++)
+ if (node->pn_child[i] != NULL)
+ db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
+ i, (void *)node->pn_child[i],
+ pctrie_isleaf(node->pn_child[i]) ?
+ pctrie_toval(node->pn_child[i]) : NULL,
+ node->pn_clev);
+}
+#endif /* DDB */
diff --git a/freebsd/sys/kern/vfs_acl.c b/freebsd/sys/kern/vfs_acl.c
new file mode 100644
index 00000000..56192cfb
--- /dev/null
+++ b/freebsd/sys/kern/vfs_acl.c
@@ -0,0 +1,600 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 1999-2006, 2016-2017 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Portions of this software were developed by BAE Systems, the University of
+ * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
+ * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
+ * Computing (TC) research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ *
+ * ACL system calls and other functions common across different ACL types.
+ * Type-specific routines go into subr_acl_<type>.c.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/capsicum.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#include <sys/sysent.h>
+#include <sys/acl.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+CTASSERT(ACL_MAX_ENTRIES >= OLDACL_MAX_ENTRIES);
+
+MALLOC_DEFINE(M_ACL, "acl", "Access Control Lists");
+
+
+static int kern___acl_aclcheck_path(struct thread *td, const char *path,
+ acl_type_t type, struct acl *aclp, int follow);
+static int kern___acl_delete_path(struct thread *td, const char *path,
+ acl_type_t type, int follow);
+static int kern___acl_get_path(struct thread *td, const char *path,
+ acl_type_t type, struct acl *aclp, int follow);
+static int kern___acl_set_path(struct thread *td, const char *path,
+ acl_type_t type, const struct acl *aclp, int follow);
+static int vacl_set_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, const struct acl *aclp);
+static int vacl_get_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_aclcheck(struct thread *td, struct vnode *vp,
+ acl_type_t type, const struct acl *aclp);
+
+int
+acl_copy_oldacl_into_acl(const struct oldacl *source, struct acl *dest)
+{
+ int i;
+
+ if (source->acl_cnt < 0 || source->acl_cnt > OLDACL_MAX_ENTRIES)
+ return (EINVAL);
+
+ bzero(dest, sizeof(*dest));
+
+ dest->acl_cnt = source->acl_cnt;
+ dest->acl_maxcnt = ACL_MAX_ENTRIES;
+
+ for (i = 0; i < dest->acl_cnt; i++) {
+ dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag;
+ dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id;
+ dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm;
+ }
+
+ return (0);
+}
+
+int
+acl_copy_acl_into_oldacl(const struct acl *source, struct oldacl *dest)
+{
+ int i;
+
+ if (source->acl_cnt > OLDACL_MAX_ENTRIES)
+ return (EINVAL);
+
+ bzero(dest, sizeof(*dest));
+
+ dest->acl_cnt = source->acl_cnt;
+
+ for (i = 0; i < dest->acl_cnt; i++) {
+ dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag;
+ dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id;
+ dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm;
+ }
+
+ return (0);
+}
+
+/*
+ * At one time, "struct acl" was extended in order to add support for NFSv4
+ * ACLs. Instead of creating compatibility versions of all the ACL-related
+ * syscalls, they were left intact. It's possible to find out what the code
+ * calling these syscalls (libc) expects based on the "type" argument - if it's
+ * either ACL_TYPE_ACCESS_OLD or ACL_TYPE_DEFAULT_OLD (which previously were
+ * known as ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT), then it's the "struct
+ * oldacl". If it's something else, then it's the new "struct acl". In the
+ * latter case, the routines below just copyin/copyout the contents. In the
+ * former case, they copyin the "struct oldacl" and convert it to the new
+ * format.
+ */
+static int
+acl_copyin(const void *user_acl, struct acl *kernel_acl, acl_type_t type)
+{
+ int error;
+ struct oldacl old;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS_OLD:
+ case ACL_TYPE_DEFAULT_OLD:
+ error = copyin(user_acl, &old, sizeof(old));
+ if (error != 0)
+ break;
+ acl_copy_oldacl_into_acl(&old, kernel_acl);
+ break;
+
+ default:
+ error = copyin(user_acl, kernel_acl, sizeof(*kernel_acl));
+ if (kernel_acl->acl_maxcnt != ACL_MAX_ENTRIES)
+ return (EINVAL);
+ }
+
+ return (error);
+}
+
+static int
+acl_copyout(const struct acl *kernel_acl, void *user_acl, acl_type_t type)
+{
+ uint32_t am;
+ int error;
+ struct oldacl old;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS_OLD:
+ case ACL_TYPE_DEFAULT_OLD:
+ error = acl_copy_acl_into_oldacl(kernel_acl, &old);
+ if (error != 0)
+ break;
+
+ error = copyout(&old, user_acl, sizeof(old));
+ break;
+
+ default:
+ error = fueword32((char *)user_acl +
+ offsetof(struct acl, acl_maxcnt), &am);
+ if (error == -1)
+ return (EFAULT);
+ if (am != ACL_MAX_ENTRIES)
+ return (EINVAL);
+
+ error = copyout(kernel_acl, user_acl, sizeof(*kernel_acl));
+ }
+
+ return (error);
+}
+
+/*
+ * Convert "old" type - ACL_TYPE_{ACCESS,DEFAULT}_OLD - into its "new"
+ * counterpart. It's required for old (pre-NFSv4 ACLs) libc to work
+ * with new kernel. Fixing 'type' for old binaries with new libc
+ * is being done in lib/libc/posix1e/acl_support.c:_acl_type_unold().
+ */
+static int
+acl_type_unold(int type)
+{
+ switch (type) {
+ case ACL_TYPE_ACCESS_OLD:
+ return (ACL_TYPE_ACCESS);
+
+ case ACL_TYPE_DEFAULT_OLD:
+ return (ACL_TYPE_DEFAULT);
+
+ default:
+ return (type);
+ }
+}
+
+/*
+ * These calls wrap the real vnode operations, and are called by the syscall
+ * code once the syscall has converted the path or file descriptor to a vnode
+ * (unlocked). The aclp pointer is assumed still to point to userland, so
+ * this should not be consumed within the kernel except by syscall code.
+ * Other code should directly invoke VOP_{SET,GET}ACL.
+ */
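+
+/*
+ * A minimal sketch of the direct in-kernel path mentioned above, assuming the
+ * caller already holds a suitably locked vnode (vp) and a valid thread (td)
+ * with its credential; the syscall wrappers below differ only in the userland
+ * copyin/copyout step:
+ *
+ *	struct acl *kacl = acl_alloc(M_WAITOK | M_ZERO);
+ *	int error = VOP_GETACL(vp, ACL_TYPE_ACCESS, kacl, td->td_ucred, td);
+ *	...
+ *	acl_free(kacl);
+ */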
+
+/*
+ * Given a vnode, set its ACL.
+ */
+static int
+vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ const struct acl *aclp)
+{
+ struct acl *inkernelacl;
+ struct mount *mp;
+ int error;
+
+ AUDIT_ARG_VALUE(type);
+ inkernelacl = acl_alloc(M_WAITOK);
+ error = acl_copyin(aclp, inkernelacl, type);
+ if (error != 0)
+ goto out;
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ goto out;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+#ifdef MAC
+ error = mac_vnode_check_setacl(td->td_ucred, vp, type, inkernelacl);
+ if (error != 0)
+ goto out_unlock;
+#endif
+ error = VOP_SETACL(vp, acl_type_unold(type), inkernelacl,
+ td->td_ucred, td);
+#ifdef MAC
+out_unlock:
+#endif
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+out:
+ acl_free(inkernelacl);
+ return (error);
+}
+
+/*
+ * Given a vnode, get its ACL.
+ */
+static int
+vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl *inkernelacl;
+ int error;
+
+ AUDIT_ARG_VALUE(type);
+ inkernelacl = acl_alloc(M_WAITOK | M_ZERO);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+#ifdef MAC
+ error = mac_vnode_check_getacl(td->td_ucred, vp, type);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_GETACL(vp, acl_type_unold(type), inkernelacl,
+ td->td_ucred, td);
+
+#ifdef MAC
+out:
+#endif
+ VOP_UNLOCK(vp, 0);
+ if (error == 0)
+ error = acl_copyout(inkernelacl, aclp, type);
+ acl_free(inkernelacl);
+ return (error);
+}
+
+/*
+ * Given a vnode, delete its ACL.
+ */
+static int
+vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
+{
+ struct mount *mp;
+ int error;
+
+ AUDIT_ARG_VALUE(type);
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+#ifdef MAC
+ error = mac_vnode_check_deleteacl(td->td_ucred, vp, type);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_SETACL(vp, acl_type_unold(type), 0, td->td_ucred, td);
+#ifdef MAC
+out:
+#endif
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Given a vnode, check whether an ACL is appropriate for it.
+ *
+ * XXXRW: No vnode lock held so can't audit vnode state...?
+ */
+static int
+vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
+ const struct acl *aclp)
+{
+ struct acl *inkernelacl;
+ int error;
+
+ inkernelacl = acl_alloc(M_WAITOK);
+ error = acl_copyin(aclp, inkernelacl, type);
+ if (error != 0)
+ goto out;
+ error = VOP_ACLCHECK(vp, acl_type_unold(type), inkernelacl,
+ td->td_ucred, td);
+out:
+ acl_free(inkernelacl);
+ return (error);
+}
+
+/*
+ * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't
+ * need to lock, as the vacl_ code will get/release any locks required.
+ */
+
+/*
+ * Given a file path, get an ACL for it
+ */
+int
+sys___acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
+{
+
+ return (kern___acl_get_path(td, uap->path, uap->type, uap->aclp,
+ FOLLOW));
+}
+
+/*
+ * Given a file path, get an ACL for it; don't follow links.
+ */
+int
+sys___acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
+{
+
+ return (kern___acl_get_path(td, uap->path, uap->type, uap->aclp,
+ NOFOLLOW));
+}
+
+static int
+kern___acl_get_path(struct thread *td, const char *path, acl_type_t type,
+ struct acl *aclp, int follow)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_get_acl(td, nd.ni_vp, type, aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, set an ACL for it.
+ */
+int
+sys___acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
+{
+
+ return (kern___acl_set_path(td, uap->path, uap->type, uap->aclp,
+ FOLLOW));
+}
+
+/*
+ * Given a file path, set an ACL for it; don't follow links.
+ */
+int
+sys___acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
+{
+
+ return (kern___acl_set_path(td, uap->path, uap->type, uap->aclp,
+ NOFOLLOW));
+}
+
+static int
+kern___acl_set_path(struct thread *td, const char *path,
+ acl_type_t type, const struct acl *aclp, int follow)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_set_acl(td, nd.ni_vp, type, aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, get an ACL for it.
+ */
+int
+sys___acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->filedes);
+ error = getvnode(td, uap->filedes,
+ cap_rights_init(&rights, CAP_ACL_GET), &fp);
+ if (error == 0) {
+ error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, set an ACL for it.
+ */
+int
+sys___acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->filedes);
+ error = getvnode(td, uap->filedes,
+ cap_rights_init(&rights, CAP_ACL_SET), &fp);
+ if (error == 0) {
+ error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it.
+ */
+int
+sys___acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
+{
+
+ return (kern___acl_delete_path(td, uap->path, uap->type, FOLLOW));
+}
+
+/*
+ * Given a file path, delete an ACL from it; don't follow links.
+ */
+int
+sys___acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
+{
+
+ return (kern___acl_delete_path(td, uap->path, uap->type, NOFOLLOW));
+}
+
+static int
+kern___acl_delete_path(struct thread *td, const char *path,
+ acl_type_t type, int follow)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, follow, UIO_USERSPACE, path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_delete(td, nd.ni_vp, type);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, delete an ACL from it.
+ */
+int
+sys___acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->filedes);
+ error = getvnode(td, uap->filedes,
+ cap_rights_init(&rights, CAP_ACL_DELETE), &fp);
+ if (error == 0) {
+ error = vacl_delete(td, fp->f_vnode, uap->type);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, check an ACL for it.
+ */
+int
+sys___acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
+{
+
+ return (kern___acl_aclcheck_path(td, uap->path, uap->type, uap->aclp,
+ FOLLOW));
+}
+
+/*
+ * Given a file path, check an ACL for it; don't follow links.
+ */
+int
+sys___acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
+{
+ return (kern___acl_aclcheck_path(td, uap->path, uap->type, uap->aclp,
+ NOFOLLOW));
+}
+
+static int
+kern___acl_aclcheck_path(struct thread *td, const char *path, acl_type_t type,
+ struct acl *aclp, int follow)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, follow, UIO_USERSPACE, path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_aclcheck(td, nd.ni_vp, type, aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, check an ACL for it.
+ */
+int
+sys___acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->filedes);
+ error = getvnode(td, uap->filedes,
+ cap_rights_init(&rights, CAP_ACL_CHECK), &fp);
+ if (error == 0) {
+ error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+struct acl *
+acl_alloc(int flags)
+{
+ struct acl *aclp;
+
+ aclp = malloc(sizeof(*aclp), M_ACL, flags);
+ if (aclp == NULL)
+ return (NULL);
+
+ aclp->acl_maxcnt = ACL_MAX_ENTRIES;
+
+ return (aclp);
+}
+
+void
+acl_free(struct acl *aclp)
+{
+
+ free(aclp, M_ACL);
+}
diff --git a/freebsd/sys/kern/vfs_aio.c b/freebsd/sys/kern/vfs_aio.c
new file mode 100644
index 00000000..350c51a0
--- /dev/null
+++ b/freebsd/sys/kern/vfs_aio.c
@@ -0,0 +1,2987 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 1997 John S. Dyson. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. John S. Dyson's name may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * DISCLAIMER: This code isn't warranted to do anything useful. Anything
+ * bad that happens because of using this software isn't the responsibility
+ * of the author. This software is distributed AS-IS.
+ */
+
+/*
+ * This file contains support for the POSIX 1003.1B AIO/LIO facility.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/capsicum.h>
+#include <sys/eventhandler.h>
+#include <sys/sysproto.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/kthread.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/unistd.h>
+#include <sys/posix4.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/syscallsubr.h>
+#include <sys/protosw.h>
+#include <sys/rwlock.h>
+#include <sys/sema.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+#include <sys/vnode.h>
+#include <sys/conf.h>
+#include <sys/event.h>
+#include <sys/mount.h>
+#include <geom/geom.h>
+
+#include <machine/atomic.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/uma.h>
+#include <sys/aio.h>
+
+/*
+ * Counter for allocating reference ids to new jobs. Wrapped to 1 on
+ * overflow. (XXX will be removed soon.)
+ */
+static u_long jobrefid;
+
+/*
+ * Counter for aio_fsync.
+ */
+static uint64_t jobseqno;
+
+#ifndef MAX_AIO_PER_PROC
+#define MAX_AIO_PER_PROC 32
+#endif
+
+#ifndef MAX_AIO_QUEUE_PER_PROC
+#define MAX_AIO_QUEUE_PER_PROC 256
+#endif
+
+#ifndef MAX_AIO_QUEUE
+#define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */
+#endif
+
+#ifndef MAX_BUF_AIO
+#define MAX_BUF_AIO 16
+#endif
+
+FEATURE(aio, "Asynchronous I/O");
+SYSCTL_DECL(_p1003_1b);
+
+static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
+static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list");
+
+static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0,
+ "Async IO management");
+
+static int enable_aio_unsafe = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0,
+ "Permit asynchronous IO on all file types, not just known-safe types");
+
+static unsigned int unsafe_warningcnt = 1;
+SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW,
+ &unsafe_warningcnt, 0,
+ "Warnings that will be triggered upon failed IO requests on unsafe files");
+
+static int max_aio_procs = MAX_AIO_PROCS;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0,
+ "Maximum number of kernel processes to use for handling async IO ");
+
+static int num_aio_procs = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0,
+ "Number of presently active kernel processes for async IO");
+
+/*
+ * The code will adjust the actual number of AIO processes towards this
+ * number when it gets a chance.
+ */
+static int target_aio_procs = TARGET_AIO_PROCS;
+SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
+ 0,
+ "Preferred number of ready kernel processes for async IO");
+
+static int max_queue_count = MAX_AIO_QUEUE;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
+ "Maximum number of aio requests to queue, globally");
+
+static int num_queue_count = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
+ "Number of queued aio requests");
+
+static int num_buf_aio = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
+ "Number of aio requests presently handled by the buf subsystem");
+
+static int num_unmapped_aio = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio,
+ 0,
+ "Number of aio requests presently handled by unmapped I/O buffers");
+
+/* Number of async I/O processes in the process of being started */
+/* XXX This should be local to aio_aqueue() */
+static int num_aio_resv_start = 0;
+
+static int aiod_lifetime;
+SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
+ "Maximum lifetime for idle aiod");
+
+static int max_aio_per_proc = MAX_AIO_PER_PROC;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
+ 0,
+ "Maximum active aio requests per process");
+
+static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
+ &max_aio_queue_per_proc, 0,
+ "Maximum queued aio requests per process");
+
+static int max_buf_aio = MAX_BUF_AIO;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
+ "Maximum buf aio requests per process");
+
+/*
+ * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires
+ * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with
+ * p1003_1b.aio_listio_max.
+ */
+SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max,
+ CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc,
+ 0, "Maximum aio requests for a single lio_listio call");
+
+#ifdef COMPAT_FREEBSD6
+typedef struct oaiocb {
+ int aio_fildes; /* File descriptor */
+ off_t aio_offset; /* File offset for I/O */
+ volatile void *aio_buf; /* I/O buffer in process space */
+ size_t aio_nbytes; /* Number of bytes for I/O */
+ struct osigevent aio_sigevent; /* Signal to deliver */
+ int aio_lio_opcode; /* LIO opcode */
+ int aio_reqprio; /* Request priority -- ignored */
+ struct __aiocb_private _aiocb_private;
+} oaiocb_t;
+#endif
+
+/*
+ * Below is a key of locks used to protect each member of struct kaiocb
+ * aioliojob and kaioinfo and any backends.
+ *
+ * * - need not be protected
+ * a - locked by kaioinfo lock
+ * b - locked by backend lock, the backend lock can be null in some cases,
+ * for example, BIO belongs to this type, in this case, proc lock is
+ * reused.
+ * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
+ */
+
+/*
+ * If the routine that services an AIO request blocks while running in an
+ * AIO kernel process it can starve other I/O requests. BIO requests
+ * queued via aio_qbio() complete asynchronously and do not use AIO kernel
+ * processes at all. Socket I/O requests use a separate pool of
+ * kprocs and also force non-blocking I/O. Other file I/O requests
+ * use the generic fo_read/fo_write operations which can block. The
+ * fsync and mlock operations can also block while executing. Ideally
+ * none of these requests would block while executing.
+ *
+ * Note that the service routines cannot toggle O_NONBLOCK in the file
+ * structure directly while handling a request due to races with
+ * userland threads.
+ */
+
+/* jobflags */
+#define KAIOCB_QUEUEING 0x01
+#define KAIOCB_CANCELLED 0x02
+#define KAIOCB_CANCELLING 0x04
+#define KAIOCB_CHECKSYNC 0x08
+#define KAIOCB_CLEARED 0x10
+#define KAIOCB_FINISHED 0x20
+
+/*
+ * AIO process info
+ */
+#define AIOP_FREE 0x1 /* proc on free queue */
+
+struct aioproc {
+ int aioprocflags; /* (c) AIO proc flags */
+ TAILQ_ENTRY(aioproc) list; /* (c) list of processes */
+ struct proc *aioproc; /* (*) the AIO proc */
+};
+
+/*
+ * data-structure for lio signal management
+ */
+struct aioliojob {
+ int lioj_flags; /* (a) listio flags */
+ int lioj_count; /* (a) count of listio jobs */
+ int lioj_finished_count; /* (a) count of finished listio jobs */
+ struct sigevent lioj_signal; /* (a) signal on all I/O done */
+ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */
+ struct knlist klist; /* (a) list of knotes */
+ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */
+};
+
+#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
+#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
+#define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */
+
+/*
+ * per process aio data structure
+ */
+struct kaioinfo {
+ struct mtx kaio_mtx; /* the lock to protect this struct */
+ int kaio_flags; /* (a) per process kaio flags */
+ int kaio_active_count; /* (c) number of currently used AIOs */
+ int kaio_count; /* (a) size of AIO queue */
+ int kaio_buffer_count; /* (a) number of bio buffers */
+ TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */
+ TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */
+ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
+ TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */
+ TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */
+ TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */
+ struct task kaio_task; /* (*) task to kick aio processes */
+ struct task kaio_sync_task; /* (*) task to schedule fsync jobs */
+};
+
+#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx)
+#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx)
+#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f))
+#define AIO_MTX(ki) (&(ki)->kaio_mtx)
+
+#define KAIO_RUNDOWN 0x1 /* process is being run down */
+#define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */
+
+/*
+ * Operations used to interact with userland aio control blocks.
+ * Different ABIs provide their own operations.
+ */
+struct aiocb_ops {
+ int (*copyin)(struct aiocb *ujob, struct aiocb *kjob);
+ long (*fetch_status)(struct aiocb *ujob);
+ long (*fetch_error)(struct aiocb *ujob);
+ int (*store_status)(struct aiocb *ujob, long status);
+ int (*store_error)(struct aiocb *ujob, long error);
+ int (*store_kernelinfo)(struct aiocb *ujob, long jobref);
+ int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
+};
+
+static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */
+static struct sema aio_newproc_sem;
+static struct mtx aio_job_mtx;
+static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */
+static struct unrhdr *aiod_unr;
+
+void aio_init_aioinfo(struct proc *p);
+static int aio_onceonly(void);
+static int aio_free_entry(struct kaiocb *job);
+static void aio_process_rw(struct kaiocb *job);
+static void aio_process_sync(struct kaiocb *job);
+static void aio_process_mlock(struct kaiocb *job);
+static void aio_schedule_fsync(void *context, int pending);
+static int aio_newproc(int *);
+int aio_aqueue(struct thread *td, struct aiocb *ujob,
+ struct aioliojob *lio, int type, struct aiocb_ops *ops);
+static int aio_queue_file(struct file *fp, struct kaiocb *job);
+static void aio_biowakeup(struct bio *bp);
+static void aio_proc_rundown(void *arg, struct proc *p);
+static void aio_proc_rundown_exec(void *arg, struct proc *p,
+ struct image_params *imgp);
+static int aio_qbio(struct proc *p, struct kaiocb *job);
+static void aio_daemon(void *param);
+static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job);
+static bool aio_clear_cancel_function_locked(struct kaiocb *job);
+static int aio_kick(struct proc *userp);
+static void aio_kick_nowait(struct proc *userp);
+static void aio_kick_helper(void *context, int pending);
+static int filt_aioattach(struct knote *kn);
+static void filt_aiodetach(struct knote *kn);
+static int filt_aio(struct knote *kn, long hint);
+static int filt_lioattach(struct knote *kn);
+static void filt_liodetach(struct knote *kn);
+static int filt_lio(struct knote *kn, long hint);
+
+/*
+ * Zones for:
+ * kaio Per process async io info
+ * aiop async io process data
+ * aiocb async io jobs
+ * aiolio list io jobs
+ */
+static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone;
+
+/* kqueue filters for aio */
+static struct filterops aio_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_aioattach,
+ .f_detach = filt_aiodetach,
+ .f_event = filt_aio,
+};
+static struct filterops lio_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_lioattach,
+ .f_detach = filt_liodetach,
+ .f_event = filt_lio
+};
+
+static eventhandler_tag exit_tag, exec_tag;
+
+TASKQUEUE_DEFINE_THREAD(aiod_kick);
+
+/*
+ * Main operations function for use as a kernel module.
+ */
+static int
+aio_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ aio_onceonly();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t aio_mod = {
+ "aio",
+ &aio_modload,
+ NULL
+};
+
+DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY);
+MODULE_VERSION(aio, 1);
+
+/*
+ * Startup initialization
+ */
+static int
+aio_onceonly(void)
+{
+
+ exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
+ EVENTHANDLER_PRI_ANY);
+ exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec,
+ NULL, EVENTHANDLER_PRI_ANY);
+ kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
+ kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
+ TAILQ_INIT(&aio_freeproc);
+ sema_init(&aio_newproc_sem, 0, "aio_new_proc");
+ mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
+ TAILQ_INIT(&aio_jobs);
+ aiod_unr = new_unrhdr(1, INT_MAX, NULL);
+ kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiod_lifetime = AIOD_LIFETIME_DEFAULT;
+ jobrefid = 1;
+ p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO);
+ p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
+ p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
+
+ return (0);
+}
+
+/*
+ * Init the per-process aioinfo structure. The aioinfo limits are set
+ * per-process for user limit (resource) management.
+ */
+void
+aio_init_aioinfo(struct proc *p)
+{
+ struct kaioinfo *ki;
+
+ ki = uma_zalloc(kaio_zone, M_WAITOK);
+ mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW);
+ ki->kaio_flags = 0;
+ ki->kaio_active_count = 0;
+ ki->kaio_count = 0;
+ ki->kaio_buffer_count = 0;
+ TAILQ_INIT(&ki->kaio_all);
+ TAILQ_INIT(&ki->kaio_done);
+ TAILQ_INIT(&ki->kaio_jobqueue);
+ TAILQ_INIT(&ki->kaio_liojoblist);
+ TAILQ_INIT(&ki->kaio_syncqueue);
+ TAILQ_INIT(&ki->kaio_syncready);
+ TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
+ TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki);
+ PROC_LOCK(p);
+ if (p->p_aioinfo == NULL) {
+ p->p_aioinfo = ki;
+ PROC_UNLOCK(p);
+ } else {
+ PROC_UNLOCK(p);
+ mtx_destroy(&ki->kaio_mtx);
+ uma_zfree(kaio_zone, ki);
+ }
+
+ while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
+ aio_newproc(NULL);
+}
+
+static int
+aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
+{
+ struct thread *td;
+ int error;
+
+ error = sigev_findtd(p, sigev, &td);
+ if (error)
+ return (error);
+ if (!KSI_ONQ(ksi)) {
+ ksiginfo_set_sigev(ksi, sigev);
+ ksi->ksi_code = SI_ASYNCIO;
+ ksi->ksi_flags |= KSI_EXT | KSI_INS;
+ tdsendsignal(p, td, ksi->ksi_signo, ksi);
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+/*
+ * Free a job entry. Wait for completion if it is currently active, but don't
+ * delay forever. If we delay, we return a flag that says that we have to
+ * restart the queue scan.
+ */
+static int
+aio_free_entry(struct kaiocb *job)
+{
+ struct kaioinfo *ki;
+ struct aioliojob *lj;
+ struct proc *p;
+
+ p = job->userproc;
+ MPASS(curproc == p);
+ ki = p->p_aioinfo;
+ MPASS(ki != NULL);
+
+ AIO_LOCK_ASSERT(ki, MA_OWNED);
+ MPASS(job->jobflags & KAIOCB_FINISHED);
+
+ atomic_subtract_int(&num_queue_count, 1);
+
+ ki->kaio_count--;
+ MPASS(ki->kaio_count >= 0);
+
+ TAILQ_REMOVE(&ki->kaio_done, job, plist);
+ TAILQ_REMOVE(&ki->kaio_all, job, allist);
+
+ lj = job->lio;
+ if (lj) {
+ lj->lioj_count--;
+ lj->lioj_finished_count--;
+
+ if (lj->lioj_count == 0) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ /* lio is going away, we need to destroy any knotes */
+ knlist_delete(&lj->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&lj->lioj_ksi);
+ PROC_UNLOCK(p);
+ uma_zfree(aiolio_zone, lj);
+ }
+ }
+
+ /* job is going away, we need to destroy any knotes */
+ knlist_delete(&job->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&job->ksi);
+ PROC_UNLOCK(p);
+
+ AIO_UNLOCK(ki);
+
+ /*
+ * The thread argument here is used to find the owning process
+ * and is also passed to fo_close() which may pass it to various
+ * places such as devsw close() routines. Because of that, we
+ * need a thread pointer from the process owning the job that is
+ * persistent and won't disappear out from under us or move to
+ * another process.
+ *
+ * Currently, all the callers of this function call it to remove
+ * a kaiocb from the current process' job list either via a
+ * syscall or due to the current process calling exit() or
+ * execve(). Thus, we know that p == curproc. We also know that
+ * curthread can't exit since we are curthread.
+ *
+ * Therefore, we use curthread as the thread to pass to
+ * knlist_delete(). This does mean that it is possible for the
+ * thread pointer at close time to differ from the thread pointer
+ * at open time, but this is already true of file descriptors in
+ * a multithreaded process.
+ */
+ if (job->fd_file)
+ fdrop(job->fd_file, curthread);
+ crfree(job->cred);
+ uma_zfree(aiocb_zone, job);
+ AIO_LOCK(ki);
+
+ return (0);
+}
+
+static void
+aio_proc_rundown_exec(void *arg, struct proc *p,
+ struct image_params *imgp __unused)
+{
+ aio_proc_rundown(arg, p);
+}
+
+static int
+aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job)
+{
+ aio_cancel_fn_t *func;
+ int cancelled;
+
+ AIO_LOCK_ASSERT(ki, MA_OWNED);
+ if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED))
+ return (0);
+ MPASS((job->jobflags & KAIOCB_CANCELLING) == 0);
+ job->jobflags |= KAIOCB_CANCELLED;
+
+ func = job->cancel_fn;
+
+ /*
+ * If there is no cancel routine, just leave the job marked as
+ * cancelled. The job should be in active use by a caller who
+ * should complete it normally or when it fails to install a
+ * cancel routine.
+ */
+ if (func == NULL)
+ return (0);
+
+ /*
+ * Set the CANCELLING flag so that aio_complete() will defer
+ * completions of this job. This prevents the job from being
+ * freed out from under the cancel callback. After the
+ * callback any deferred completion (whether from the callback
+ * or any other source) will be completed.
+ */
+ job->jobflags |= KAIOCB_CANCELLING;
+ AIO_UNLOCK(ki);
+ func(job);
+ AIO_LOCK(ki);
+ job->jobflags &= ~KAIOCB_CANCELLING;
+ if (job->jobflags & KAIOCB_FINISHED) {
+ cancelled = job->uaiocb._aiocb_private.error == ECANCELED;
+ TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
+ aio_bio_done_notify(p, job);
+ } else {
+ /*
+ * The cancel callback might have scheduled an
+ * operation to cancel this request, but it is
+ * only counted as cancelled if the request is
+ * cancelled when the callback returns.
+ */
+ cancelled = 0;
+ }
+ return (cancelled);
+}
+
+/*
+ * Rundown the jobs for a given process.
+ */
+static void
+aio_proc_rundown(void *arg, struct proc *p)
+{
+ struct kaioinfo *ki;
+ struct aioliojob *lj;
+ struct kaiocb *job, *jobn;
+
+ KASSERT(curthread->td_proc == p,
+ ("%s: called on non-curproc", __func__));
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return;
+
+ AIO_LOCK(ki);
+ ki->kaio_flags |= KAIO_RUNDOWN;
+
+restart:
+
+ /*
+ * Try to cancel all pending requests. This code simulates
+ * aio_cancel on all pending I/O requests.
+ */
+ TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
+ aio_cancel_job(p, ki, job);
+ }
+
+ /* Wait for all running I/O to be finished */
+ if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) {
+ ki->kaio_flags |= KAIO_WAKEUP;
+ msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
+ goto restart;
+ }
+
+ /* Free all completed I/O requests. */
+ while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL)
+ aio_free_entry(job);
+
+ while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
+ if (lj->lioj_count == 0) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ knlist_delete(&lj->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&lj->lioj_ksi);
+ PROC_UNLOCK(p);
+ uma_zfree(aiolio_zone, lj);
+ } else {
+ panic("LIO job not cleaned up: C:%d, FC:%d\n",
+ lj->lioj_count, lj->lioj_finished_count);
+ }
+ }
+ AIO_UNLOCK(ki);
+ taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task);
+ taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task);
+ mtx_destroy(&ki->kaio_mtx);
+ uma_zfree(kaio_zone, ki);
+ p->p_aioinfo = NULL;
+}
+
+/*
+ * Select a job to run (called by an AIO daemon).
+ */
+static struct kaiocb *
+aio_selectjob(struct aioproc *aiop)
+{
+ struct kaiocb *job;
+ struct kaioinfo *ki;
+ struct proc *userp;
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+restart:
+ TAILQ_FOREACH(job, &aio_jobs, list) {
+ userp = job->userproc;
+ ki = userp->p_aioinfo;
+
+ if (ki->kaio_active_count < max_aio_per_proc) {
+ TAILQ_REMOVE(&aio_jobs, job, list);
+ if (!aio_clear_cancel_function(job))
+ goto restart;
+
+ /* Account for currently active jobs. */
+ ki->kaio_active_count++;
+ break;
+ }
+ }
+ return (job);
+}
+
+/*
+ * Move all data to a permanent storage device. This code
+ * simulates the fsync syscall.
+ */
+static int
+aio_fsync_vnode(struct thread *td, struct vnode *vp)
+{
+ struct mount *mp;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto drop;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (vp->v_object != NULL) {
+ VM_OBJECT_WLOCK(vp->v_object);
+ vm_object_page_clean(vp->v_object, 0, 0, 0);
+ VM_OBJECT_WUNLOCK(vp->v_object);
+ }
+ error = VOP_FSYNC(vp, MNT_WAIT, td);
+
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+drop:
+ return (error);
+}
+
+/*
+ * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that
+ * does the I/O request for the non-bio version of the operations. The normal
+ * vn operations are used, and this code should work in all instances for every
+ * type of file, including pipes, sockets, fifos, and regular files.
+ *
+ * XXX I don't think it works well for sockets, pipes, and fifos.
+ */
+static void
+aio_process_rw(struct kaiocb *job)
+{
+ struct ucred *td_savedcred;
+ struct thread *td;
+ struct aiocb *cb;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ ssize_t cnt;
+ long msgsnd_st, msgsnd_end;
+ long msgrcv_st, msgrcv_end;
+ long oublock_st, oublock_end;
+ long inblock_st, inblock_end;
+ int error;
+
+ KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
+ job->uaiocb.aio_lio_opcode == LIO_WRITE,
+ ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
+
+ aio_switch_vmspace(job);
+ td = curthread;
+ td_savedcred = td->td_ucred;
+ td->td_ucred = job->cred;
+ cb = &job->uaiocb;
+ fp = job->fd_file;
+
+ aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
+ aiov.iov_len = cb->aio_nbytes;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = cb->aio_offset;
+ auio.uio_resid = cb->aio_nbytes;
+ cnt = cb->aio_nbytes;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+
+ msgrcv_st = td->td_ru.ru_msgrcv;
+ msgsnd_st = td->td_ru.ru_msgsnd;
+ inblock_st = td->td_ru.ru_inblock;
+ oublock_st = td->td_ru.ru_oublock;
+
+ /*
+ * aio_aqueue() acquires a reference to the file that is
+ * released in aio_free_entry().
+ */
+ if (cb->aio_lio_opcode == LIO_READ) {
+ auio.uio_rw = UIO_READ;
+ if (auio.uio_resid == 0)
+ error = 0;
+ else
+ error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
+ } else {
+ if (fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ auio.uio_rw = UIO_WRITE;
+ error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
+ }
+ msgrcv_end = td->td_ru.ru_msgrcv;
+ msgsnd_end = td->td_ru.ru_msgsnd;
+ inblock_end = td->td_ru.ru_inblock;
+ oublock_end = td->td_ru.ru_oublock;
+
+ job->msgrcv = msgrcv_end - msgrcv_st;
+ job->msgsnd = msgsnd_end - msgsnd_st;
+ job->inblock = inblock_end - inblock_st;
+ job->outblock = oublock_end - oublock_st;
+
+ if ((error) && (auio.uio_resid != cnt)) {
+ if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
+ error = 0;
+ if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
+ PROC_LOCK(job->userproc);
+ kern_psignal(job->userproc, SIGPIPE);
+ PROC_UNLOCK(job->userproc);
+ }
+ }
+
+ cnt -= auio.uio_resid;
+ td->td_ucred = td_savedcred;
+ if (error)
+ aio_complete(job, -1, error);
+ else
+ aio_complete(job, cnt, 0);
+}
+
+static void
+aio_process_sync(struct kaiocb *job)
+{
+ struct thread *td = curthread;
+ struct ucred *td_savedcred = td->td_ucred;
+ struct file *fp = job->fd_file;
+ int error = 0;
+
+ KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC,
+ ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
+
+ td->td_ucred = job->cred;
+ if (fp->f_vnode != NULL)
+ error = aio_fsync_vnode(td, fp->f_vnode);
+ td->td_ucred = td_savedcred;
+ if (error)
+ aio_complete(job, -1, error);
+ else
+ aio_complete(job, 0, 0);
+}
+
+static void
+aio_process_mlock(struct kaiocb *job)
+{
+ struct aiocb *cb = &job->uaiocb;
+ int error;
+
+ KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK,
+ ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
+
+ aio_switch_vmspace(job);
+ error = kern_mlock(job->userproc, job->cred,
+ __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes);
+ aio_complete(job, error != 0 ? -1 : 0, error);
+}
+
+static void
+aio_bio_done_notify(struct proc *userp, struct kaiocb *job)
+{
+ struct aioliojob *lj;
+ struct kaioinfo *ki;
+ struct kaiocb *sjob, *sjobn;
+ int lj_done;
+ bool schedule_fsync;
+
+ ki = userp->p_aioinfo;
+ AIO_LOCK_ASSERT(ki, MA_OWNED);
+ lj = job->lio;
+ lj_done = 0;
+ if (lj) {
+ lj->lioj_finished_count++;
+ if (lj->lioj_count == lj->lioj_finished_count)
+ lj_done = 1;
+ }
+ TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist);
+ MPASS(job->jobflags & KAIOCB_FINISHED);
+
+ if (ki->kaio_flags & KAIO_RUNDOWN)
+ goto notification_done;
+
+ if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
+ job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
+ aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi);
+
+ KNOTE_LOCKED(&job->klist, 1);
+
+ if (lj_done) {
+ if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+ lj->lioj_flags |= LIOJ_KEVENT_POSTED;
+ KNOTE_LOCKED(&lj->klist, 1);
+ }
+ if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
+ == LIOJ_SIGNAL
+ && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+ lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
+ aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ }
+ }
+
+notification_done:
+ if (job->jobflags & KAIOCB_CHECKSYNC) {
+ schedule_fsync = false;
+ TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) {
+ if (job->fd_file != sjob->fd_file ||
+ job->seqno >= sjob->seqno)
+ continue;
+ if (--sjob->pending > 0)
+ continue;
+ TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list);
+ if (!aio_clear_cancel_function_locked(sjob))
+ continue;
+ TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list);
+ schedule_fsync = true;
+ }
+ if (schedule_fsync)
+ taskqueue_enqueue(taskqueue_aiod_kick,
+ &ki->kaio_sync_task);
+ }
+ if (ki->kaio_flags & KAIO_WAKEUP) {
+ ki->kaio_flags &= ~KAIO_WAKEUP;
+ wakeup(&userp->p_aioinfo);
+ }
+}
+
+static void
+aio_schedule_fsync(void *context, int pending)
+{
+ struct kaioinfo *ki;
+ struct kaiocb *job;
+
+ ki = context;
+ AIO_LOCK(ki);
+ while (!TAILQ_EMPTY(&ki->kaio_syncready)) {
+ job = TAILQ_FIRST(&ki->kaio_syncready);
+ TAILQ_REMOVE(&ki->kaio_syncready, job, list);
+ AIO_UNLOCK(ki);
+ aio_schedule(job, aio_process_sync);
+ AIO_LOCK(ki);
+ }
+ AIO_UNLOCK(ki);
+}
+
+bool
+aio_cancel_cleared(struct kaiocb *job)
+{
+
+ /*
+ * The caller should hold the same queue lock held when
+ * aio_clear_cancel_function() was called and set this flag
+ * ensuring this check sees an up-to-date value. However,
+ * there is no way to assert that.
+ */
+ return ((job->jobflags & KAIOCB_CLEARED) != 0);
+}
+
+static bool
+aio_clear_cancel_function_locked(struct kaiocb *job)
+{
+
+ AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
+ MPASS(job->cancel_fn != NULL);
+ if (job->jobflags & KAIOCB_CANCELLING) {
+ job->jobflags |= KAIOCB_CLEARED;
+ return (false);
+ }
+ job->cancel_fn = NULL;
+ return (true);
+}
+
+bool
+aio_clear_cancel_function(struct kaiocb *job)
+{
+ struct kaioinfo *ki;
+ bool ret;
+
+ ki = job->userproc->p_aioinfo;
+ AIO_LOCK(ki);
+ ret = aio_clear_cancel_function_locked(job);
+ AIO_UNLOCK(ki);
+ return (ret);
+}
+
+static bool
+aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func)
+{
+
+ AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED);
+ if (job->jobflags & KAIOCB_CANCELLED)
+ return (false);
+ job->cancel_fn = func;
+ return (true);
+}
+
+bool
+aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func)
+{
+ struct kaioinfo *ki;
+ bool ret;
+
+ ki = job->userproc->p_aioinfo;
+ AIO_LOCK(ki);
+ ret = aio_set_cancel_function_locked(job, func);
+ AIO_UNLOCK(ki);
+ return (ret);
+}
+
+void
+aio_complete(struct kaiocb *job, long status, int error)
+{
+ struct kaioinfo *ki;
+ struct proc *userp;
+
+ job->uaiocb._aiocb_private.error = error;
+ job->uaiocb._aiocb_private.status = status;
+
+ userp = job->userproc;
+ ki = userp->p_aioinfo;
+
+ AIO_LOCK(ki);
+ KASSERT(!(job->jobflags & KAIOCB_FINISHED),
+ ("duplicate aio_complete"));
+ job->jobflags |= KAIOCB_FINISHED;
+ if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) {
+ TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist);
+ aio_bio_done_notify(userp, job);
+ }
+ AIO_UNLOCK(ki);
+}
+
+void
+aio_cancel(struct kaiocb *job)
+{
+
+ aio_complete(job, -1, ECANCELED);
+}
+
+void
+aio_switch_vmspace(struct kaiocb *job)
+{
+
+ vmspace_switch_aio(job->userproc->p_vmspace);
+}
+
+/*
+ * The AIO daemon: most of the actual work is done in aio_process_*,
+ * but the setup (and address space management) is done in this routine.
+ */
+static void
+aio_daemon(void *_id)
+{
+ struct kaiocb *job;
+ struct aioproc *aiop;
+ struct kaioinfo *ki;
+ struct proc *p;
+ struct vmspace *myvm;
+ struct thread *td = curthread;
+ int id = (intptr_t)_id;
+
+ /*
+ * Grab an extra reference on the daemon's vmspace so that it
+ * doesn't get freed by jobs that switch to a different
+ * vmspace.
+ */
+ p = td->td_proc;
+ myvm = vmspace_acquire_ref(p);
+
+ KASSERT(p->p_textvp == NULL, ("kthread has a textvp"));
+
+ /*
+ * Allocate and ready the aio control info. There is one aiop structure
+ * per daemon.
+ */
+ aiop = uma_zalloc(aiop_zone, M_WAITOK);
+ aiop->aioproc = p;
+ aiop->aioprocflags = 0;
+
+ /*
+ * Wakeup parent process. (Parent sleeps to keep from blasting away
+ * and creating too many daemons.)
+ */
+ sema_post(&aio_newproc_sem);
+
+ mtx_lock(&aio_job_mtx);
+ for (;;) {
+ /*
+ * Take daemon off of free queue
+ */
+ if (aiop->aioprocflags & AIOP_FREE) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ aiop->aioprocflags &= ~AIOP_FREE;
+ }
+
+ /*
+ * Check for jobs.
+ */
+ while ((job = aio_selectjob(aiop)) != NULL) {
+ mtx_unlock(&aio_job_mtx);
+
+ ki = job->userproc->p_aioinfo;
+ job->handle_fn(job);
+
+ mtx_lock(&aio_job_mtx);
+ /* Decrement the active job count. */
+ ki->kaio_active_count--;
+ }
+
+ /*
+ * Disconnect from user address space.
+ */
+ if (p->p_vmspace != myvm) {
+ mtx_unlock(&aio_job_mtx);
+ vmspace_switch_aio(myvm);
+ mtx_lock(&aio_job_mtx);
+ /*
+ * We have to restart to avoid a race; we only sleep if
+ * no job can be selected.
+ */
+ continue;
+ }
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+
+ TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
+ aiop->aioprocflags |= AIOP_FREE;
+
+ /*
+ * If daemon is inactive for a long time, allow it to exit,
+ * thereby freeing resources.
+ */
+ if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy",
+ aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) &&
+ (aiop->aioprocflags & AIOP_FREE) &&
+ num_aio_procs > target_aio_procs)
+ break;
+ }
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ num_aio_procs--;
+ mtx_unlock(&aio_job_mtx);
+ uma_zfree(aiop_zone, aiop);
+ free_unr(aiod_unr, id);
+ vmspace_free(myvm);
+
+ KASSERT(p->p_vmspace == myvm,
+ ("AIOD: bad vmspace for exiting daemon"));
+ KASSERT(myvm->vm_refcnt > 1,
+ ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt));
+ kproc_exit(0);
+}
+
+/*
+ * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
+ * AIO daemon modifies its environment itself.
+ */
+static int
+aio_newproc(int *start)
+{
+ int error;
+ struct proc *p;
+ int id;
+
+ id = alloc_unr(aiod_unr);
+ error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
+ RFNOWAIT, 0, "aiod%d", id);
+ if (error == 0) {
+ /*
+ * Wait until daemon is started.
+ */
+ sema_wait(&aio_newproc_sem);
+ mtx_lock(&aio_job_mtx);
+ num_aio_procs++;
+ if (start != NULL)
+ (*start)--;
+ mtx_unlock(&aio_job_mtx);
+ } else {
+ free_unr(aiod_unr, id);
+ }
+ return (error);
+}
+
+/*
+ * Try the high-performance, low-overhead bio method for eligible
+ * VCHR devices. This method doesn't use an aio helper thread, and
+ * thus has very low overhead.
+ *
+ * Assumes that the caller, aio_aqueue(), has incremented the file
+ * structure's reference count, preventing its deallocation for the
+ * duration of this call.
+ */
+static int
+aio_qbio(struct proc *p, struct kaiocb *job)
+{
+ struct aiocb *cb;
+ struct file *fp;
+ struct bio *bp;
+ struct buf *pbuf;
+ struct vnode *vp;
+ struct cdevsw *csw;
+ struct cdev *dev;
+ struct kaioinfo *ki;
+ int error, ref, poff;
+ vm_prot_t prot;
+
+ cb = &job->uaiocb;
+ fp = job->fd_file;
+
+ if (!(cb->aio_lio_opcode == LIO_WRITE ||
+ cb->aio_lio_opcode == LIO_READ))
+ return (-1);
+ if (fp == NULL || fp->f_type != DTYPE_VNODE)
+ return (-1);
+
+ vp = fp->f_vnode;
+ if (vp->v_type != VCHR)
+ return (-1);
+ if (vp->v_bufobj.bo_bsize == 0)
+ return (-1);
+ if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
+ return (-1);
+
+ ref = 0;
+ csw = devvn_refthread(vp, &dev, &ref);
+ if (csw == NULL)
+ return (ENXIO);
+
+ if ((csw->d_flags & D_DISK) == 0) {
+ error = -1;
+ goto unref;
+ }
+ if (cb->aio_nbytes > dev->si_iosize_max) {
+ error = -1;
+ goto unref;
+ }
+
+ ki = p->p_aioinfo;
+ poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
+ if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
+ if (cb->aio_nbytes > MAXPHYS) {
+ error = -1;
+ goto unref;
+ }
+
+ pbuf = NULL;
+ } else {
+ if (cb->aio_nbytes > MAXPHYS - poff) {
+ error = -1;
+ goto unref;
+ }
+ if (ki->kaio_buffer_count >= max_buf_aio) {
+ error = EAGAIN;
+ goto unref;
+ }
+
+ job->pbuf = pbuf = (struct buf *)getpbuf(NULL);
+ BUF_KERNPROC(pbuf);
+ AIO_LOCK(ki);
+ ki->kaio_buffer_count++;
+ AIO_UNLOCK(ki);
+ }
+ job->bp = bp = g_alloc_bio();
+
+ bp->bio_length = cb->aio_nbytes;
+ bp->bio_bcount = cb->aio_nbytes;
+ bp->bio_done = aio_biowakeup;
+ bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
+ bp->bio_offset = cb->aio_offset;
+ bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
+ bp->bio_dev = dev;
+ bp->bio_caller1 = (void *)job;
+
+ prot = VM_PROT_READ;
+ if (cb->aio_lio_opcode == LIO_READ)
+ prot |= VM_PROT_WRITE; /* Less backwards than it looks */
+ job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+ (vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages,
+ nitems(job->pages));
+ if (job->npages < 0) {
+ error = EFAULT;
+ goto doerror;
+ }
+ if (pbuf != NULL) {
+ pmap_qenter((vm_offset_t)pbuf->b_data,
+ job->pages, job->npages);
+ bp->bio_data = pbuf->b_data + poff;
+ atomic_add_int(&num_buf_aio, 1);
+ } else {
+ bp->bio_ma = job->pages;
+ bp->bio_ma_n = job->npages;
+ bp->bio_ma_offset = poff;
+ bp->bio_data = unmapped_buf;
+ bp->bio_flags |= BIO_UNMAPPED;
+ atomic_add_int(&num_unmapped_aio, 1);
+ }
+
+ /* Perform transfer. */
+ csw->d_strategy(bp);
+ dev_relthread(dev, ref);
+ return (0);
+
+doerror:
+ if (pbuf != NULL) {
+ AIO_LOCK(ki);
+ ki->kaio_buffer_count--;
+ AIO_UNLOCK(ki);
+ relpbuf(pbuf, NULL);
+ job->pbuf = NULL;
+ }
+ g_destroy_bio(bp);
+ job->bp = NULL;
+unref:
+ dev_relthread(dev, ref);
+ return (error);
+}
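+
+/*
+ * Summary of the fast path above: aio_qbio() returns -1, making the
+ * caller fall back to the threaded path, whenever the request is not a
+ * LIO_READ/LIO_WRITE on a VCHR vnode backed by a D_DISK device, or when
+ * the transfer size is not a multiple of the device block size or
+ * exceeds si_iosize_max.  Eligible requests use one of two mappings: an
+ * unmapped bio (when the device sets SI_UNMAPPED and
+ * unmapped_buf_allowed is set) or a pbuf mapped with pmap_qenter().
+ * Either way the user pages are held by vm_fault_quick_hold_pages()
+ * and unheld again in aio_biowakeup().
+ */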
+
+#ifdef COMPAT_FREEBSD6
+static int
+convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
+{
+
+ /*
+ * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
+ * supported by AIO with the old sigevent structure.
+ */
+ nsig->sigev_notify = osig->sigev_notify;
+ switch (nsig->sigev_notify) {
+ case SIGEV_NONE:
+ break;
+ case SIGEV_SIGNAL:
+ nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
+ break;
+ case SIGEV_KEVENT:
+ nsig->sigev_notify_kqueue =
+ osig->__sigev_u.__sigev_notify_kqueue;
+ nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
+{
+ struct oaiocb *ojob;
+ int error;
+
+ bzero(kjob, sizeof(struct aiocb));
+ error = copyin(ujob, kjob, sizeof(struct oaiocb));
+ if (error)
+ return (error);
+ ojob = (struct oaiocb *)kjob;
+ return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
+}
+#endif
+
+static int
+aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
+{
+
+ return (copyin(ujob, kjob, sizeof(struct aiocb)));
+}
+
+static long
+aiocb_fetch_status(struct aiocb *ujob)
+{
+
+ return (fuword(&ujob->_aiocb_private.status));
+}
+
+static long
+aiocb_fetch_error(struct aiocb *ujob)
+{
+
+ return (fuword(&ujob->_aiocb_private.error));
+}
+
+static int
+aiocb_store_status(struct aiocb *ujob, long status)
+{
+
+ return (suword(&ujob->_aiocb_private.status, status));
+}
+
+static int
+aiocb_store_error(struct aiocb *ujob, long error)
+{
+
+ return (suword(&ujob->_aiocb_private.error, error));
+}
+
+static int
+aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
+{
+
+ return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
+}
+
+static int
+aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
+{
+
+ return (suword(ujobp, (long)ujob));
+}
+
+static struct aiocb_ops aiocb_ops = {
+ .copyin = aiocb_copyin,
+ .fetch_status = aiocb_fetch_status,
+ .fetch_error = aiocb_fetch_error,
+ .store_status = aiocb_store_status,
+ .store_error = aiocb_store_error,
+ .store_kernelinfo = aiocb_store_kernelinfo,
+ .store_aiocb = aiocb_store_aiocb,
+};
+
+#ifdef COMPAT_FREEBSD6
+static struct aiocb_ops aiocb_ops_osigevent = {
+ .copyin = aiocb_copyin_old_sigevent,
+ .fetch_status = aiocb_fetch_status,
+ .fetch_error = aiocb_fetch_error,
+ .store_status = aiocb_store_status,
+ .store_error = aiocb_store_error,
+ .store_kernelinfo = aiocb_store_kernelinfo,
+ .store_aiocb = aiocb_store_aiocb,
+};
+#endif
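+
+/*
+ * The ops tables above decouple the AIO core from the user ABI: the
+ * kern_aio_*() routines never touch the user-supplied aiocb directly,
+ * they go through these copyin/fetch/store hooks instead.  The same
+ * core therefore serves the native ABI, the COMPAT_FREEBSD6 layout
+ * with the old sigevent, and the COMPAT_FREEBSD32 layouts defined
+ * further down in this file.
+ */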
+
+/*
+ * Queue a new AIO request.  The choice between the threaded technique and
+ * the direct bio technique for VCHR devices is made here.
+ */
+int
+aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,
+ int type, struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct file *fp;
+ struct kaiocb *job;
+ struct kaioinfo *ki;
+ struct kevent kev;
+ int opcode;
+ int error;
+ int fd, kqfd;
+ int jid;
+ u_short evflags;
+
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
+
+ ki = p->p_aioinfo;
+
+ ops->store_status(ujob, -1);
+ ops->store_error(ujob, 0);
+ ops->store_kernelinfo(ujob, -1);
+
+ if (num_queue_count >= max_queue_count ||
+ ki->kaio_count >= max_aio_queue_per_proc) {
+ ops->store_error(ujob, EAGAIN);
+ return (EAGAIN);
+ }
+
+ job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
+ knlist_init_mtx(&job->klist, AIO_MTX(ki));
+
+ error = ops->copyin(ujob, &job->uaiocb);
+ if (error) {
+ ops->store_error(ujob, error);
+ uma_zfree(aiocb_zone, job);
+ return (error);
+ }
+
+ if (job->uaiocb.aio_nbytes > IOSIZE_MAX) {
+ uma_zfree(aiocb_zone, job);
+ return (EINVAL);
+ }
+
+ if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
+ job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
+ job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
+ job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
+ ops->store_error(ujob, EINVAL);
+ uma_zfree(aiocb_zone, job);
+ return (EINVAL);
+ }
+
+ if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
+ job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
+ !_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) {
+ uma_zfree(aiocb_zone, job);
+ return (EINVAL);
+ }
+
+ ksiginfo_init(&job->ksi);
+
+ /* Save userspace address of the job info. */
+ job->ujob = ujob;
+
+ /* Get the opcode. */
+ if (type != LIO_NOP)
+ job->uaiocb.aio_lio_opcode = type;
+ opcode = job->uaiocb.aio_lio_opcode;
+
+ /*
+ * Validate the opcode and fetch the file object for the specified
+ * file descriptor.
+ *
+ * XXXRW: Moved the opcode validation up here so that we don't
+	 * retrieve a file descriptor without knowing what the capability
+ * should be.
+ */
+ fd = job->uaiocb.aio_fildes;
+ switch (opcode) {
+ case LIO_WRITE:
+ error = fget_write(td, fd, &cap_pwrite_rights, &fp);
+ break;
+ case LIO_READ:
+ error = fget_read(td, fd, &cap_pread_rights, &fp);
+ break;
+ case LIO_SYNC:
+ error = fget(td, fd, &cap_fsync_rights, &fp);
+ break;
+ case LIO_MLOCK:
+ fp = NULL;
+ break;
+ case LIO_NOP:
+ error = fget(td, fd, &cap_no_rights, &fp);
+ break;
+ default:
+ error = EINVAL;
+ }
+ if (error) {
+ uma_zfree(aiocb_zone, job);
+ ops->store_error(ujob, error);
+ return (error);
+ }
+
+ if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+
+ if ((opcode == LIO_READ || opcode == LIO_WRITE) &&
+ job->uaiocb.aio_offset < 0 &&
+ (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+
+ job->fd_file = fp;
+
+ mtx_lock(&aio_job_mtx);
+ jid = jobrefid++;
+ job->seqno = jobseqno++;
+ mtx_unlock(&aio_job_mtx);
+ error = ops->store_kernelinfo(ujob, jid);
+ if (error) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+ job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
+
+ if (opcode == LIO_NOP) {
+ fdrop(fp, td);
+ uma_zfree(aiocb_zone, job);
+ return (0);
+ }
+
+ if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
+ goto no_kqueue;
+ evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
+ if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+ kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;
+ memset(&kev, 0, sizeof(kev));
+ kev.ident = (uintptr_t)job->ujob;
+ kev.filter = EVFILT_AIO;
+ kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
+ kev.data = (intptr_t)job;
+ kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;
+ error = kqfd_register(kqfd, &kev, td, M_WAITOK);
+ if (error)
+ goto aqueue_fail;
+
+no_kqueue:
+
+ ops->store_error(ujob, EINPROGRESS);
+ job->uaiocb._aiocb_private.error = EINPROGRESS;
+ job->userproc = p;
+ job->cred = crhold(td->td_ucred);
+ job->jobflags = KAIOCB_QUEUEING;
+ job->lio = lj;
+
+ if (opcode == LIO_MLOCK) {
+ aio_schedule(job, aio_process_mlock);
+ error = 0;
+ } else if (fp->f_ops->fo_aio_queue == NULL)
+ error = aio_queue_file(fp, job);
+ else
+ error = fo_aio_queue(fp, job);
+ if (error)
+ goto aqueue_fail;
+
+ AIO_LOCK(ki);
+ job->jobflags &= ~KAIOCB_QUEUEING;
+ TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist);
+ ki->kaio_count++;
+ if (lj)
+ lj->lioj_count++;
+ atomic_add_int(&num_queue_count, 1);
+ if (job->jobflags & KAIOCB_FINISHED) {
+ /*
+ * The queue callback completed the request synchronously.
+ * The bulk of the completion is deferred in that case
+ * until this point.
+ */
+ aio_bio_done_notify(p, job);
+ } else
+ TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist);
+ AIO_UNLOCK(ki);
+ return (0);
+
+aqueue_fail:
+ knlist_delete(&job->klist, curthread, 0);
+ if (fp)
+ fdrop(fp, td);
+ uma_zfree(aiocb_zone, job);
+ ops->store_error(ujob, error);
+ return (error);
+}
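+
+/*
+ * Illustrative userland sketch (not part of this file) of the request
+ * shape that aio_aqueue() ends up handling; the helper name and the
+ * error handling are hypothetical and kept minimal:
+ *
+ *	#include <aio.h>
+ *	#include <errno.h>
+ *	#include <signal.h>
+ *	#include <string.h>
+ *
+ *	static char buf[512];
+ *
+ *	int
+ *	submit_read(int fd)
+ *	{
+ *		static struct aiocb acb;
+ *
+ *		memset(&acb, 0, sizeof(acb));
+ *		acb.aio_fildes = fd;			/* file to read */
+ *		acb.aio_buf = buf;			/* user buffer */
+ *		acb.aio_nbytes = sizeof(buf);
+ *		acb.aio_offset = 0;
+ *		acb.aio_sigevent.sigev_notify = SIGEV_NONE;
+ *		if (aio_read(&acb) == -1)		/* enqueue */
+ *			return (errno);
+ *		while (aio_error(&acb) == EINPROGRESS)	/* poll */
+ *			;
+ *		return (aio_return(&acb) == -1 ? errno : 0);
+ *	}
+ *
+ * aio_read(2) reaches sys_aio_read() further down in this file, which
+ * queues the request through aio_aqueue() with LIO_READ.
+ */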
+
+static void
+aio_cancel_daemon_job(struct kaiocb *job)
+{
+
+ mtx_lock(&aio_job_mtx);
+ if (!aio_cancel_cleared(job))
+ TAILQ_REMOVE(&aio_jobs, job, list);
+ mtx_unlock(&aio_job_mtx);
+ aio_cancel(job);
+}
+
+void
+aio_schedule(struct kaiocb *job, aio_handle_fn_t *func)
+{
+
+ mtx_lock(&aio_job_mtx);
+ if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) {
+ mtx_unlock(&aio_job_mtx);
+ aio_cancel(job);
+ return;
+ }
+ job->handle_fn = func;
+ TAILQ_INSERT_TAIL(&aio_jobs, job, list);
+ aio_kick_nowait(job->userproc);
+ mtx_unlock(&aio_job_mtx);
+}
+
+static void
+aio_cancel_sync(struct kaiocb *job)
+{
+ struct kaioinfo *ki;
+
+ ki = job->userproc->p_aioinfo;
+ AIO_LOCK(ki);
+ if (!aio_cancel_cleared(job))
+ TAILQ_REMOVE(&ki->kaio_syncqueue, job, list);
+ AIO_UNLOCK(ki);
+ aio_cancel(job);
+}
+
+int
+aio_queue_file(struct file *fp, struct kaiocb *job)
+{
+ struct kaioinfo *ki;
+ struct kaiocb *job2;
+ struct vnode *vp;
+ struct mount *mp;
+ int error;
+ bool safe;
+
+ ki = job->userproc->p_aioinfo;
+ error = aio_qbio(job->userproc, job);
+ if (error >= 0)
+ return (error);
+ safe = false;
+ if (fp->f_type == DTYPE_VNODE) {
+ vp = fp->f_vnode;
+ if (vp->v_type == VREG || vp->v_type == VDIR) {
+ mp = fp->f_vnode->v_mount;
+ if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0)
+ safe = true;
+ }
+ }
+ if (!(safe || enable_aio_unsafe)) {
+ counted_warning(&unsafe_warningcnt,
+ "is attempting to use unsafe AIO requests");
+ return (EOPNOTSUPP);
+ }
+
+ switch (job->uaiocb.aio_lio_opcode) {
+ case LIO_READ:
+ case LIO_WRITE:
+ aio_schedule(job, aio_process_rw);
+ error = 0;
+ break;
+ case LIO_SYNC:
+ AIO_LOCK(ki);
+ TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) {
+ if (job2->fd_file == job->fd_file &&
+ job2->uaiocb.aio_lio_opcode != LIO_SYNC &&
+ job2->seqno < job->seqno) {
+ job2->jobflags |= KAIOCB_CHECKSYNC;
+ job->pending++;
+ }
+ }
+ if (job->pending != 0) {
+ if (!aio_set_cancel_function_locked(job,
+ aio_cancel_sync)) {
+ AIO_UNLOCK(ki);
+ aio_cancel(job);
+ return (0);
+ }
+ TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list);
+ AIO_UNLOCK(ki);
+ return (0);
+ }
+ AIO_UNLOCK(ki);
+ aio_schedule(job, aio_process_sync);
+ error = 0;
+ break;
+ default:
+ error = EINVAL;
+ }
+ return (error);
+}
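+
+/*
+ * Note on the "safe" test above: when the bio fast path declines the
+ * request, it is only handed to the daemon pool if the file is a
+ * regular file or directory on a local filesystem (MNT_LOCAL) or has
+ * no mount point at all.  Anything else is refused with EOPNOTSUPP
+ * unless the enable_aio_unsafe knob is set, after emitting a counted
+ * warning.  LIO_SYNC jobs are additionally held back, via the pending
+ * count and KAIOCB_CHECKSYNC, until every earlier read/write job on
+ * the same file has completed.
+ */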
+
+static void
+aio_kick_nowait(struct proc *userp)
+{
+ struct kaioinfo *ki = userp->p_aioinfo;
+ struct aioproc *aiop;
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+ if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ aiop->aioprocflags &= ~AIOP_FREE;
+ wakeup(aiop->aioproc);
+ } else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
+ ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) {
+ taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task);
+ }
+}
+
+static int
+aio_kick(struct proc *userp)
+{
+ struct kaioinfo *ki = userp->p_aioinfo;
+ struct aioproc *aiop;
+ int error, ret = 0;
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+retryproc:
+ if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ aiop->aioprocflags &= ~AIOP_FREE;
+ wakeup(aiop->aioproc);
+ } else if (num_aio_resv_start + num_aio_procs < max_aio_procs &&
+ ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) {
+ num_aio_resv_start++;
+ mtx_unlock(&aio_job_mtx);
+ error = aio_newproc(&num_aio_resv_start);
+ mtx_lock(&aio_job_mtx);
+ if (error) {
+ num_aio_resv_start--;
+ goto retryproc;
+ }
+ } else {
+ ret = -1;
+ }
+ return (ret);
+}
+
+static void
+aio_kick_helper(void *context, int pending)
+{
+ struct proc *userp = context;
+
+ mtx_lock(&aio_job_mtx);
+ while (--pending >= 0) {
+ if (aio_kick(userp))
+ break;
+ }
+ mtx_unlock(&aio_job_mtx);
+}
+
+/*
+ * Support the aio_return system call; as a side effect, kernel resources
+ * are released.
+ */
+static int
+kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct kaiocb *job;
+ struct kaioinfo *ki;
+ long status, error;
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return (EINVAL);
+ AIO_LOCK(ki);
+ TAILQ_FOREACH(job, &ki->kaio_done, plist) {
+ if (job->ujob == ujob)
+ break;
+ }
+ if (job != NULL) {
+ MPASS(job->jobflags & KAIOCB_FINISHED);
+ status = job->uaiocb._aiocb_private.status;
+ error = job->uaiocb._aiocb_private.error;
+ td->td_retval[0] = status;
+ td->td_ru.ru_oublock += job->outblock;
+ td->td_ru.ru_inblock += job->inblock;
+ td->td_ru.ru_msgsnd += job->msgsnd;
+ td->td_ru.ru_msgrcv += job->msgrcv;
+ aio_free_entry(job);
+ AIO_UNLOCK(ki);
+ ops->store_error(ujob, error);
+ ops->store_status(ujob, status);
+ } else {
+ error = EINVAL;
+ AIO_UNLOCK(ki);
+ }
+ return (error);
+}
+
+int
+sys_aio_return(struct thread *td, struct aio_return_args *uap)
+{
+
+ return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
+}
+
+/*
+ * Allow a process to wake up when any of its I/O requests have completed.
+ */
+static int
+kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
+ struct timespec *ts)
+{
+ struct proc *p = td->td_proc;
+ struct timeval atv;
+ struct kaioinfo *ki;
+ struct kaiocb *firstjob, *job;
+ int error, i, timo;
+
+ timo = 0;
+ if (ts) {
+ if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
+ return (EINVAL);
+
+ TIMESPEC_TO_TIMEVAL(&atv, ts);
+ if (itimerfix(&atv))
+ return (EINVAL);
+ timo = tvtohz(&atv);
+ }
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return (EAGAIN);
+
+ if (njoblist == 0)
+ return (0);
+
+ AIO_LOCK(ki);
+ for (;;) {
+ firstjob = NULL;
+ error = 0;
+ TAILQ_FOREACH(job, &ki->kaio_all, allist) {
+ for (i = 0; i < njoblist; i++) {
+ if (job->ujob == ujoblist[i]) {
+ if (firstjob == NULL)
+ firstjob = job;
+ if (job->jobflags & KAIOCB_FINISHED)
+ goto RETURN;
+ }
+ }
+ }
+ /* All tasks were finished. */
+ if (firstjob == NULL)
+ break;
+
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
+ "aiospn", timo);
+ if (error == ERESTART)
+ error = EINTR;
+ if (error)
+ break;
+ }
+RETURN:
+ AIO_UNLOCK(ki);
+ return (error);
+}
+
+int
+sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
+{
+ struct timespec ts, *tsp;
+ struct aiocb **ujoblist;
+ int error;
+
+ if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
+ return (EINVAL);
+
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
+ return (error);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+ ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK);
+ error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
+ if (error == 0)
+ error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
+ free(ujoblist, M_AIOS);
+ return (error);
+}
+
+/*
+ * aio_cancel cancels any non-bio aio operations not currently in progress.
+ */
+int
+sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct kaioinfo *ki;
+ struct kaiocb *job, *jobn;
+ struct file *fp;
+ int error;
+ int cancelled = 0;
+ int notcancelled = 0;
+ struct vnode *vp;
+
+ /* Lookup file object. */
+ error = fget(td, uap->fd, &cap_no_rights, &fp);
+ if (error)
+ return (error);
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ goto done;
+
+ if (fp->f_type == DTYPE_VNODE) {
+ vp = fp->f_vnode;
+ if (vn_isdisk(vp, &error)) {
+ fdrop(fp, td);
+ td->td_retval[0] = AIO_NOTCANCELED;
+ return (0);
+ }
+ }
+
+ AIO_LOCK(ki);
+ TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) {
+ if ((uap->fd == job->uaiocb.aio_fildes) &&
+ ((uap->aiocbp == NULL) ||
+ (uap->aiocbp == job->ujob))) {
+ if (aio_cancel_job(p, ki, job)) {
+ cancelled++;
+ } else {
+ notcancelled++;
+ }
+ if (uap->aiocbp != NULL)
+ break;
+ }
+ }
+ AIO_UNLOCK(ki);
+
+done:
+ fdrop(fp, td);
+
+ if (uap->aiocbp != NULL) {
+ if (cancelled) {
+ td->td_retval[0] = AIO_CANCELED;
+ return (0);
+ }
+ }
+
+ if (notcancelled) {
+ td->td_retval[0] = AIO_NOTCANCELED;
+ return (0);
+ }
+
+ if (cancelled) {
+ td->td_retval[0] = AIO_CANCELED;
+ return (0);
+ }
+
+ td->td_retval[0] = AIO_ALLDONE;
+
+ return (0);
+}
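+
+/*
+ * Result summary for the cancel path above: requests on disk-backed
+ * vnodes are never cancelled (AIO_NOTCANCELED is returned right away),
+ * a specific aiocbp that was dequeued yields AIO_CANCELED, any matching
+ * job left running yields AIO_NOTCANCELED, and AIO_ALLDONE is reported
+ * when no matching job remains in the queue.
+ */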
+
+/*
+ * aio_error is implemented at the kernel level for compatibility purposes
+ * only.  For a user-mode async implementation, it would be best to do it
+ * in a userland subroutine.
+ */
+static int
+kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct kaiocb *job;
+ struct kaioinfo *ki;
+ int status;
+
+ ki = p->p_aioinfo;
+ if (ki == NULL) {
+ td->td_retval[0] = EINVAL;
+ return (0);
+ }
+
+ AIO_LOCK(ki);
+ TAILQ_FOREACH(job, &ki->kaio_all, allist) {
+ if (job->ujob == ujob) {
+ if (job->jobflags & KAIOCB_FINISHED)
+ td->td_retval[0] =
+ job->uaiocb._aiocb_private.error;
+ else
+ td->td_retval[0] = EINPROGRESS;
+ AIO_UNLOCK(ki);
+ return (0);
+ }
+ }
+ AIO_UNLOCK(ki);
+
+ /*
+ * Hack for failure of aio_aqueue.
+ */
+ status = ops->fetch_status(ujob);
+ if (status == -1) {
+ td->td_retval[0] = ops->fetch_error(ujob);
+ return (0);
+ }
+
+ td->td_retval[0] = EINVAL;
+ return (0);
+}
+
+int
+sys_aio_error(struct thread *td, struct aio_error_args *uap)
+{
+
+ return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
+}
+
+/* syscall - asynchronous read from a file (REALTIME) */
+#ifdef COMPAT_FREEBSD6
+int
+freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
+ &aiocb_ops_osigevent));
+}
+#endif
+
+int
+sys_aio_read(struct thread *td, struct aio_read_args *uap)
+{
+
+ return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
+}
+
+/* syscall - asynchronous write to a file (REALTIME) */
+#ifdef COMPAT_FREEBSD6
+int
+freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
+ &aiocb_ops_osigevent));
+}
+#endif
+
+int
+sys_aio_write(struct thread *td, struct aio_write_args *uap)
+{
+
+ return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
+}
+
+int
+sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
+{
+
+ return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
+}
+
+static int
+kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
+ struct aiocb **acb_list, int nent, struct sigevent *sig,
+ struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct aiocb *job;
+ struct kaioinfo *ki;
+ struct aioliojob *lj;
+ struct kevent kev;
+ int error;
+ int nagain, nerror;
+ int i;
+
+ if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
+ return (EINVAL);
+
+ if (nent < 0 || nent > max_aio_queue_per_proc)
+ return (EINVAL);
+
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
+
+ ki = p->p_aioinfo;
+
+ lj = uma_zalloc(aiolio_zone, M_WAITOK);
+ lj->lioj_flags = 0;
+ lj->lioj_count = 0;
+ lj->lioj_finished_count = 0;
+ knlist_init_mtx(&lj->klist, AIO_MTX(ki));
+ ksiginfo_init(&lj->lioj_ksi);
+
+ /*
+ * Setup signal.
+ */
+ if (sig && (mode == LIO_NOWAIT)) {
+ bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
+ if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+ /* Assume only new style KEVENT */
+ memset(&kev, 0, sizeof(kev));
+ kev.filter = EVFILT_LIO;
+ kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
+ kev.ident = (uintptr_t)uacb_list; /* something unique */
+ kev.data = (intptr_t)lj;
+ /* pass user defined sigval data */
+ kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
+ error = kqfd_register(
+ lj->lioj_signal.sigev_notify_kqueue, &kev, td,
+ M_WAITOK);
+ if (error) {
+ uma_zfree(aiolio_zone, lj);
+ return (error);
+ }
+ } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
+ ;
+ } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+ lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
+ if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
+ uma_zfree(aiolio_zone, lj);
+ return EINVAL;
+ }
+ lj->lioj_flags |= LIOJ_SIGNAL;
+ } else {
+ uma_zfree(aiolio_zone, lj);
+ return EINVAL;
+ }
+ }
+
+ AIO_LOCK(ki);
+ TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
+ /*
+	 * Add an extra aiocb count to keep the lio from being freed
+	 * by other threads doing aio_waitcomplete or aio_return, and
+	 * to prevent the event from being sent until we have queued
+	 * all tasks.
+ */
+ lj->lioj_count = 1;
+ AIO_UNLOCK(ki);
+
+ /*
+ * Get pointers to the list of I/O requests.
+ */
+ nagain = 0;
+ nerror = 0;
+ for (i = 0; i < nent; i++) {
+ job = acb_list[i];
+ if (job != NULL) {
+ error = aio_aqueue(td, job, lj, LIO_NOP, ops);
+ if (error == EAGAIN)
+ nagain++;
+ else if (error != 0)
+ nerror++;
+ }
+ }
+
+ error = 0;
+ AIO_LOCK(ki);
+ if (mode == LIO_WAIT) {
+ while (lj->lioj_count - 1 != lj->lioj_finished_count) {
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = msleep(&p->p_aioinfo, AIO_MTX(ki),
+ PRIBIO | PCATCH, "aiospn", 0);
+ if (error == ERESTART)
+ error = EINTR;
+ if (error)
+ break;
+ }
+ } else {
+ if (lj->lioj_count - 1 == lj->lioj_finished_count) {
+ if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+ lj->lioj_flags |= LIOJ_KEVENT_POSTED;
+ KNOTE_LOCKED(&lj->klist, 1);
+ }
+ if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
+ == LIOJ_SIGNAL
+ && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+ lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
+ aio_sendsig(p, &lj->lioj_signal,
+ &lj->lioj_ksi);
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ }
+ }
+ }
+ lj->lioj_count--;
+ if (lj->lioj_count == 0) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ knlist_delete(&lj->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&lj->lioj_ksi);
+ PROC_UNLOCK(p);
+ AIO_UNLOCK(ki);
+ uma_zfree(aiolio_zone, lj);
+ } else
+ AIO_UNLOCK(ki);
+
+ if (nerror)
+ return (EIO);
+ else if (nagain)
+ return (EAGAIN);
+ else
+ return (error);
+}
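+
+/*
+ * Illustrative userland sketch (not part of this file) of a blocking
+ * lio_listio(2) submission; the helper name is hypothetical and error
+ * handling is minimal:
+ *
+ *	#include <aio.h>
+ *	#include <string.h>
+ *
+ *	int
+ *	write_two(int fd, char *a, char *b, size_t len)
+ *	{
+ *		struct aiocb acb[2];
+ *		struct aiocb *list[2] = { &acb[0], &acb[1] };
+ *		int i;
+ *
+ *		memset(acb, 0, sizeof(acb));
+ *		for (i = 0; i < 2; i++) {
+ *			acb[i].aio_fildes = fd;
+ *			acb[i].aio_lio_opcode = LIO_WRITE;
+ *			acb[i].aio_nbytes = len;
+ *			acb[i].aio_offset = (off_t)i * len;
+ *		}
+ *		acb[0].aio_buf = a;
+ *		acb[1].aio_buf = b;
+ *		return (lio_listio(LIO_WAIT, list, 2, NULL));
+ *	}
+ *
+ * With LIO_WAIT the loop above sleeps on "aiospn" until
+ * lioj_finished_count catches up with lioj_count; with LIO_NOWAIT the
+ * optional sigevent is delivered once the whole batch completes.
+ */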
+
+/* syscall - list directed I/O (REALTIME) */
+#ifdef COMPAT_FREEBSD6
+int
+freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap)
+{
+ struct aiocb **acb_list;
+ struct sigevent *sigp, sig;
+ struct osigevent osig;
+ int error, nent;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return (EINVAL);
+
+ nent = uap->nent;
+ if (nent < 0 || nent > max_aio_queue_per_proc)
+ return (EINVAL);
+
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &osig, sizeof(osig));
+ if (error)
+ return (error);
+ error = convert_old_sigevent(&osig, &sig);
+ if (error)
+ return (error);
+ sigp = &sig;
+ } else
+ sigp = NULL;
+
+ acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
+ error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
+ if (error == 0)
+ error = kern_lio_listio(td, uap->mode,
+ (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
+ &aiocb_ops_osigevent);
+ free(acb_list, M_LIO);
+ return (error);
+}
+#endif
+
+/* syscall - list directed I/O (REALTIME) */
+int
+sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
+{
+ struct aiocb **acb_list;
+ struct sigevent *sigp, sig;
+ int error, nent;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return (EINVAL);
+
+ nent = uap->nent;
+ if (nent < 0 || nent > max_aio_queue_per_proc)
+ return (EINVAL);
+
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &sig, sizeof(sig));
+ if (error)
+ return (error);
+ sigp = &sig;
+ } else
+ sigp = NULL;
+
+ acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
+ error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
+ if (error == 0)
+ error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
+ nent, sigp, &aiocb_ops);
+ free(acb_list, M_LIO);
+ return (error);
+}
+
+static void
+aio_biowakeup(struct bio *bp)
+{
+ struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;
+ struct proc *userp;
+ struct kaioinfo *ki;
+ size_t nbytes;
+ int error, nblks;
+
+ /* Release mapping into kernel space. */
+ userp = job->userproc;
+ ki = userp->p_aioinfo;
+ if (job->pbuf) {
+ pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages);
+ relpbuf(job->pbuf, NULL);
+ job->pbuf = NULL;
+ atomic_subtract_int(&num_buf_aio, 1);
+ AIO_LOCK(ki);
+ ki->kaio_buffer_count--;
+ AIO_UNLOCK(ki);
+ } else
+ atomic_subtract_int(&num_unmapped_aio, 1);
+ vm_page_unhold_pages(job->pages, job->npages);
+
+ bp = job->bp;
+ job->bp = NULL;
+ nbytes = job->uaiocb.aio_nbytes - bp->bio_resid;
+ error = 0;
+ if (bp->bio_flags & BIO_ERROR)
+ error = bp->bio_error;
+ nblks = btodb(nbytes);
+ if (job->uaiocb.aio_lio_opcode == LIO_WRITE)
+ job->outblock += nblks;
+ else
+ job->inblock += nblks;
+
+ if (error)
+ aio_complete(job, -1, error);
+ else
+ aio_complete(job, nbytes, 0);
+
+ g_destroy_bio(bp);
+}
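+
+/*
+ * aio_biowakeup() is the bio completion hook registered in aio_qbio().
+ * The transferred byte count is derived from bio_resid, charged to the
+ * job as inblock/outblock, and later folded into the caller's rusage by
+ * kern_aio_return() or kern_aio_waitcomplete().
+ */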
+
+/* syscall - wait for the next completion of an aio request */
+static int
+kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp,
+ struct timespec *ts, struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct timeval atv;
+ struct kaioinfo *ki;
+ struct kaiocb *job;
+ struct aiocb *ujob;
+ long error, status;
+ int timo;
+
+ ops->store_aiocb(ujobp, NULL);
+
+ if (ts == NULL) {
+ timo = 0;
+ } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
+ timo = -1;
+ } else {
+ if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
+ return (EINVAL);
+
+ TIMESPEC_TO_TIMEVAL(&atv, ts);
+ if (itimerfix(&atv))
+ return (EINVAL);
+ timo = tvtohz(&atv);
+ }
+
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
+ ki = p->p_aioinfo;
+
+ error = 0;
+ job = NULL;
+ AIO_LOCK(ki);
+ while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
+ if (timo == -1) {
+ error = EWOULDBLOCK;
+ break;
+ }
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
+ "aiowc", timo);
+ if (timo && error == ERESTART)
+ error = EINTR;
+ if (error)
+ break;
+ }
+
+ if (job != NULL) {
+ MPASS(job->jobflags & KAIOCB_FINISHED);
+ ujob = job->ujob;
+ status = job->uaiocb._aiocb_private.status;
+ error = job->uaiocb._aiocb_private.error;
+ td->td_retval[0] = status;
+ td->td_ru.ru_oublock += job->outblock;
+ td->td_ru.ru_inblock += job->inblock;
+ td->td_ru.ru_msgsnd += job->msgsnd;
+ td->td_ru.ru_msgrcv += job->msgrcv;
+ aio_free_entry(job);
+ AIO_UNLOCK(ki);
+ ops->store_aiocb(ujobp, ujob);
+ ops->store_error(ujob, error);
+ ops->store_status(ujob, status);
+ } else
+ AIO_UNLOCK(ki);
+
+ return (error);
+}
+
+int
+sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
+{
+ struct timespec ts, *tsp;
+ int error;
+
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ error = copyin(uap->timeout, &ts, sizeof(ts));
+ if (error)
+ return (error);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+ return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
+}
+
+static int
+kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob,
+ struct aiocb_ops *ops)
+{
+
+ if (op != O_SYNC) /* XXX lack of O_DSYNC */
+ return (EINVAL);
+ return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops));
+}
+
+int
+sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
+{
+
+ return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
+}
+
+/* kqueue attach function */
+static int
+filt_aioattach(struct knote *kn)
+{
+ struct kaiocb *job;
+
+ job = (struct kaiocb *)(uintptr_t)kn->kn_sdata;
+
+ /*
+ * The job pointer must be validated before using it, so
+ * registration is restricted to the kernel; the user cannot
+ * set EV_FLAG1.
+ */
+ if ((kn->kn_flags & EV_FLAG1) == 0)
+ return (EPERM);
+ kn->kn_ptr.p_aio = job;
+ kn->kn_flags &= ~EV_FLAG1;
+
+ knlist_add(&job->klist, kn, 0);
+
+ return (0);
+}
+
+/* kqueue detach function */
+static void
+filt_aiodetach(struct knote *kn)
+{
+ struct knlist *knl;
+
+ knl = &kn->kn_ptr.p_aio->klist;
+ knl->kl_lock(knl->kl_lockarg);
+ if (!knlist_empty(knl))
+ knlist_remove(knl, kn, 1);
+ knl->kl_unlock(knl->kl_lockarg);
+}
+
+/* kqueue filter function */
+/*ARGSUSED*/
+static int
+filt_aio(struct knote *kn, long hint)
+{
+ struct kaiocb *job = kn->kn_ptr.p_aio;
+
+ kn->kn_data = job->uaiocb._aiocb_private.error;
+ if (!(job->jobflags & KAIOCB_FINISHED))
+ return (0);
+ kn->kn_flags |= EV_EOF;
+ return (1);
+}
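+
+/*
+ * EVFILT_AIO knotes are never attached directly by userland:
+ * filt_aioattach() insists on EV_FLAG1, which only the kqfd_register()
+ * call in aio_aqueue() sets.  The kevent returned to the application
+ * carries the user aiocb pointer as its ident and the sigev_value as
+ * udata, and filt_aio() reports the job, with the completion error in
+ * kn_data, as soon as KAIOCB_FINISHED is set.
+ */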
+
+/* kqueue attach function */
+static int
+filt_lioattach(struct knote *kn)
+{
+ struct aioliojob *lj;
+
+ lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata;
+
+ /*
+ * The aioliojob pointer must be validated before using it, so
+ * registration is restricted to the kernel; the user cannot
+ * set EV_FLAG1.
+ */
+ if ((kn->kn_flags & EV_FLAG1) == 0)
+ return (EPERM);
+ kn->kn_ptr.p_lio = lj;
+ kn->kn_flags &= ~EV_FLAG1;
+
+ knlist_add(&lj->klist, kn, 0);
+
+ return (0);
+}
+
+/* kqueue detach function */
+static void
+filt_liodetach(struct knote *kn)
+{
+ struct knlist *knl;
+
+ knl = &kn->kn_ptr.p_lio->klist;
+ knl->kl_lock(knl->kl_lockarg);
+ if (!knlist_empty(knl))
+ knlist_remove(knl, kn, 1);
+ knl->kl_unlock(knl->kl_lockarg);
+}
+
+/* kqueue filter function */
+/*ARGSUSED*/
+static int
+filt_lio(struct knote *kn, long hint)
+{
+ struct aioliojob * lj = kn->kn_ptr.p_lio;
+
+ return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
+}
+
+#ifdef COMPAT_FREEBSD32
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+struct __aiocb_private32 {
+ int32_t status;
+ int32_t error;
+ uint32_t kernelinfo;
+};
+
+#ifdef COMPAT_FREEBSD6
+typedef struct oaiocb32 {
+ int aio_fildes; /* File descriptor */
+ uint64_t aio_offset __packed; /* File offset for I/O */
+ uint32_t aio_buf; /* I/O buffer in process space */
+ uint32_t aio_nbytes; /* Number of bytes for I/O */
+ struct osigevent32 aio_sigevent; /* Signal to deliver */
+ int aio_lio_opcode; /* LIO opcode */
+ int aio_reqprio; /* Request priority -- ignored */
+ struct __aiocb_private32 _aiocb_private;
+} oaiocb32_t;
+#endif
+
+typedef struct aiocb32 {
+ int32_t aio_fildes; /* File descriptor */
+ uint64_t aio_offset __packed; /* File offset for I/O */
+ uint32_t aio_buf; /* I/O buffer in process space */
+ uint32_t aio_nbytes; /* Number of bytes for I/O */
+ int __spare__[2];
+ uint32_t __spare2__;
+ int aio_lio_opcode; /* LIO opcode */
+ int aio_reqprio; /* Request priority -- ignored */
+ struct __aiocb_private32 _aiocb_private;
+ struct sigevent32 aio_sigevent; /* Signal to deliver */
+} aiocb32_t;
+
+#ifdef COMPAT_FREEBSD6
+static int
+convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
+{
+
+ /*
+ * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
+ * supported by AIO with the old sigevent structure.
+ */
+ CP(*osig, *nsig, sigev_notify);
+ switch (nsig->sigev_notify) {
+ case SIGEV_NONE:
+ break;
+ case SIGEV_SIGNAL:
+ nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
+ break;
+ case SIGEV_KEVENT:
+ nsig->sigev_notify_kqueue =
+ osig->__sigev_u.__sigev_notify_kqueue;
+ PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
+{
+ struct oaiocb32 job32;
+ int error;
+
+ bzero(kjob, sizeof(struct aiocb));
+ error = copyin(ujob, &job32, sizeof(job32));
+ if (error)
+ return (error);
+
+ CP(job32, *kjob, aio_fildes);
+ CP(job32, *kjob, aio_offset);
+ PTRIN_CP(job32, *kjob, aio_buf);
+ CP(job32, *kjob, aio_nbytes);
+ CP(job32, *kjob, aio_lio_opcode);
+ CP(job32, *kjob, aio_reqprio);
+ CP(job32, *kjob, _aiocb_private.status);
+ CP(job32, *kjob, _aiocb_private.error);
+ PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
+ return (convert_old_sigevent32(&job32.aio_sigevent,
+ &kjob->aio_sigevent));
+}
+#endif
+
+static int
+aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
+{
+ struct aiocb32 job32;
+ int error;
+
+ error = copyin(ujob, &job32, sizeof(job32));
+ if (error)
+ return (error);
+ CP(job32, *kjob, aio_fildes);
+ CP(job32, *kjob, aio_offset);
+ PTRIN_CP(job32, *kjob, aio_buf);
+ CP(job32, *kjob, aio_nbytes);
+ CP(job32, *kjob, aio_lio_opcode);
+ CP(job32, *kjob, aio_reqprio);
+ CP(job32, *kjob, _aiocb_private.status);
+ CP(job32, *kjob, _aiocb_private.error);
+ PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
+ return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
+}
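+
+/*
+ * The 32-bit compat path only differs in how the control block reaches
+ * the kernel: aiocb32_copyin() reads the packed 32-bit layout above and
+ * widens it field by field, with CP() copying scalars and PTRIN_CP()
+ * converting 32-bit user pointers.  Everything after the copyin runs
+ * through the same aio_aqueue()/kern_aio_*() code as the native ABI.
+ */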
+
+static long
+aiocb32_fetch_status(struct aiocb *ujob)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (fuword32(&ujob32->_aiocb_private.status));
+}
+
+static long
+aiocb32_fetch_error(struct aiocb *ujob)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (fuword32(&ujob32->_aiocb_private.error));
+}
+
+static int
+aiocb32_store_status(struct aiocb *ujob, long status)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (suword32(&ujob32->_aiocb_private.status, status));
+}
+
+static int
+aiocb32_store_error(struct aiocb *ujob, long error)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (suword32(&ujob32->_aiocb_private.error, error));
+}
+
+static int
+aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
+}
+
+static int
+aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
+{
+
+ return (suword32(ujobp, (long)ujob));
+}
+
+static struct aiocb_ops aiocb32_ops = {
+ .copyin = aiocb32_copyin,
+ .fetch_status = aiocb32_fetch_status,
+ .fetch_error = aiocb32_fetch_error,
+ .store_status = aiocb32_store_status,
+ .store_error = aiocb32_store_error,
+ .store_kernelinfo = aiocb32_store_kernelinfo,
+ .store_aiocb = aiocb32_store_aiocb,
+};
+
+#ifdef COMPAT_FREEBSD6
+static struct aiocb_ops aiocb32_ops_osigevent = {
+ .copyin = aiocb32_copyin_old_sigevent,
+ .fetch_status = aiocb32_fetch_status,
+ .fetch_error = aiocb32_fetch_error,
+ .store_status = aiocb32_store_status,
+ .store_error = aiocb32_store_error,
+ .store_kernelinfo = aiocb32_store_kernelinfo,
+ .store_aiocb = aiocb32_store_aiocb,
+};
+#endif
+
+int
+freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
+{
+
+ return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
+}
+
+int
+freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
+{
+ struct timespec32 ts32;
+ struct timespec ts, *tsp;
+ struct aiocb **ujoblist;
+ uint32_t *ujoblist32;
+ int error, i;
+
+ if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc)
+ return (EINVAL);
+
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
+ return (error);
+ CP(ts32, ts, tv_sec);
+ CP(ts32, ts, tv_nsec);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+ ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK);
+ ujoblist32 = (uint32_t *)ujoblist;
+ error = copyin(uap->aiocbp, ujoblist32, uap->nent *
+ sizeof(ujoblist32[0]));
+ if (error == 0) {
+ for (i = uap->nent - 1; i >= 0; i--)
+ ujoblist[i] = PTRIN(ujoblist32[i]);
+
+ error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
+ }
+ free(ujoblist, M_AIOS);
+ return (error);
+}
+
+int
+freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
+{
+
+ return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
+}
+
+#ifdef COMPAT_FREEBSD6
+int
+freebsd6_freebsd32_aio_read(struct thread *td,
+ struct freebsd6_freebsd32_aio_read_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
+ &aiocb32_ops_osigevent));
+}
+#endif
+
+int
+freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
+ &aiocb32_ops));
+}
+
+#ifdef COMPAT_FREEBSD6
+int
+freebsd6_freebsd32_aio_write(struct thread *td,
+ struct freebsd6_freebsd32_aio_write_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
+ &aiocb32_ops_osigevent));
+}
+#endif
+
+int
+freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_aio_waitcomplete(struct thread *td,
+ struct freebsd32_aio_waitcomplete_args *uap)
+{
+ struct timespec32 ts32;
+ struct timespec ts, *tsp;
+ int error;
+
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ error = copyin(uap->timeout, &ts32, sizeof(ts32));
+ if (error)
+ return (error);
+ CP(ts32, ts, tv_sec);
+ CP(ts32, ts, tv_nsec);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+ return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
+{
+
+ return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
+ &aiocb32_ops));
+}
+
+#ifdef COMPAT_FREEBSD6
+int
+freebsd6_freebsd32_lio_listio(struct thread *td,
+ struct freebsd6_freebsd32_lio_listio_args *uap)
+{
+ struct aiocb **acb_list;
+ struct sigevent *sigp, sig;
+ struct osigevent32 osig;
+ uint32_t *acb_list32;
+ int error, i, nent;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return (EINVAL);
+
+ nent = uap->nent;
+ if (nent < 0 || nent > max_aio_queue_per_proc)
+ return (EINVAL);
+
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &osig, sizeof(osig));
+ if (error)
+ return (error);
+ error = convert_old_sigevent32(&osig, &sig);
+ if (error)
+ return (error);
+ sigp = &sig;
+ } else
+ sigp = NULL;
+
+ acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
+ error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
+ if (error) {
+ free(acb_list32, M_LIO);
+ return (error);
+ }
+ acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
+ for (i = 0; i < nent; i++)
+ acb_list[i] = PTRIN(acb_list32[i]);
+ free(acb_list32, M_LIO);
+
+ error = kern_lio_listio(td, uap->mode,
+ (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
+ &aiocb32_ops_osigevent);
+ free(acb_list, M_LIO);
+ return (error);
+}
+#endif
+
+int
+freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
+{
+ struct aiocb **acb_list;
+ struct sigevent *sigp, sig;
+ struct sigevent32 sig32;
+ uint32_t *acb_list32;
+ int error, i, nent;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return (EINVAL);
+
+ nent = uap->nent;
+ if (nent < 0 || nent > max_aio_queue_per_proc)
+ return (EINVAL);
+
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &sig32, sizeof(sig32));
+ if (error)
+ return (error);
+ error = convert_sigevent32(&sig32, &sig);
+ if (error)
+ return (error);
+ sigp = &sig;
+ } else
+ sigp = NULL;
+
+ acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
+ error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
+ if (error) {
+ free(acb_list32, M_LIO);
+ return (error);
+ }
+ acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
+ for (i = 0; i < nent; i++)
+ acb_list[i] = PTRIN(acb_list32[i]);
+ free(acb_list32, M_LIO);
+
+ error = kern_lio_listio(td, uap->mode,
+ (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
+ &aiocb32_ops);
+ free(acb_list, M_LIO);
+ return (error);
+}
+
+#endif
diff --git a/freebsd/sys/kern/vfs_bio.c b/freebsd/sys/kern/vfs_bio.c
new file mode 100644
index 00000000..2277bf67
--- /dev/null
+++ b/freebsd/sys/kern/vfs_bio.c
@@ -0,0 +1,5474 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2004 Poul-Henning Kamp
+ * Copyright (c) 1994,1997 John S. Dyson
+ * Copyright (c) 2013 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains a buffer I/O scheme implementing a coherent
+ * VM object and buffer cache.  Pains have been taken to make sure
+ * that the performance degradation associated with such schemes is
+ * not realized.
+ *
+ * Author: John S. Dyson
+ * Significant help during the development and debugging phases
+ * had been provided by David Greenman, also of the FreeBSD core team.
+ *
+ * See buf(9) for more information.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/bitset.h>
+#include <sys/conf.h>
+#include <sys/counter.h>
+#include <sys/buf.h>
+#include <sys/devicestat.h>
+#include <sys/eventhandler.h>
+#include <sys/fail.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/vmem.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+#include <sys/watchdog.h>
+#include <geom/geom.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/swap_pager.h>
+#include "opt_swap.h"
+
+static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
+
+struct bio_ops bioops; /* I/O operation notification */
+
+struct buf_ops buf_ops_bio = {
+ .bop_name = "buf_ops_bio",
+ .bop_write = bufwrite,
+ .bop_strategy = bufstrategy,
+ .bop_sync = bufsync,
+ .bop_bdflush = bufbdflush,
+};
+
+struct bufqueue {
+ struct mtx_padalign bq_lock;
+ TAILQ_HEAD(, buf) bq_queue;
+ uint8_t bq_index;
+ uint16_t bq_subqueue;
+ int bq_len;
+} __aligned(CACHE_LINE_SIZE);
+
+#define BQ_LOCKPTR(bq) (&(bq)->bq_lock)
+#define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq)))
+#define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq)))
+#define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
+
+struct bufdomain {
+ struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
+ struct bufqueue bd_dirtyq;
+ struct bufqueue *bd_cleanq;
+ struct mtx_padalign bd_run_lock;
+ /* Constants */
+ long bd_maxbufspace;
+ long bd_hibufspace;
+ long bd_lobufspace;
+ long bd_bufspacethresh;
+ int bd_hifreebuffers;
+ int bd_lofreebuffers;
+ int bd_hidirtybuffers;
+ int bd_lodirtybuffers;
+ int bd_dirtybufthresh;
+ int bd_lim;
+ /* atomics */
+ int bd_wanted;
+ int __aligned(CACHE_LINE_SIZE) bd_numdirtybuffers;
+ int __aligned(CACHE_LINE_SIZE) bd_running;
+ long __aligned(CACHE_LINE_SIZE) bd_bufspace;
+ int __aligned(CACHE_LINE_SIZE) bd_freebuffers;
+} __aligned(CACHE_LINE_SIZE);
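+
+/*
+ * Each buffer domain owns a per-CPU array of clean subqueues plus one
+ * global subqueue, a dirty queue, and its own copy of the space and
+ * count thresholds.  The counters in the trailing "atomics" group are
+ * each aligned to their own cache line; they are updated with atomics
+ * rather than under the domain lock (see, e.g., bdirtyadd()/bdirtysub()
+ * and bufspace_daemon_wakeup() below).
+ */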
+
+#define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock)
+#define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd)))
+#define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd)))
+#define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
+#define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock)
+#define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd)))
+#define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd)))
+#define BD_DOMAIN(bd) (bd - bdomain)
+
+static struct buf *buf; /* buffer header pool */
+extern struct buf *swbuf; /* Swap buffer header pool. */
+caddr_t __read_mostly unmapped_buf;
+
+/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
+struct proc *bufdaemonproc;
+
+static int inmem(struct vnode *vp, daddr_t blkno);
+static void vm_hold_free_pages(struct buf *bp, int newbsize);
+static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
+ vm_offset_t to);
+static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
+static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
+ vm_page_t m);
+static void vfs_clean_pages_dirty_buf(struct buf *bp);
+static void vfs_setdirty_locked_object(struct buf *bp);
+static void vfs_vmio_invalidate(struct buf *bp);
+static void vfs_vmio_truncate(struct buf *bp, int npages);
+static void vfs_vmio_extend(struct buf *bp, int npages, int size);
+static int vfs_bio_clcheck(struct vnode *vp, int size,
+ daddr_t lblkno, daddr_t blkno);
+static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
+ void (*)(struct buf *));
+static int buf_flush(struct vnode *vp, struct bufdomain *, int);
+static int flushbufqueues(struct vnode *, struct bufdomain *, int, int);
+static void buf_daemon(void);
+static __inline void bd_wakeup(void);
+static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
+static void bufkva_reclaim(vmem_t *, int);
+static void bufkva_free(struct buf *);
+static int buf_import(void *, void **, int, int, int);
+static void buf_release(void *, void **, int);
+static void maxbcachebuf_adjust(void);
+static inline struct bufdomain *bufdomain(struct buf *);
+static void bq_remove(struct bufqueue *bq, struct buf *bp);
+static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
+static int buf_recycle(struct bufdomain *, bool kva);
+static void bq_init(struct bufqueue *bq, int qindex, int cpu,
+ const char *lockname);
+static void bd_init(struct bufdomain *bd);
+static int bd_flushall(struct bufdomain *bd);
+static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS);
+static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS);
+
+static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
+int vmiodirenable = TRUE;
+SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
+ "Use the VM system for directory writes");
+long runningbufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
+ "Amount of presently outstanding async buffer io");
+SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
+ NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers");
+static counter_u64_t bufkvaspace;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
+ "Kernel virtual memory used for buffers");
+static long maxbufspace;
+SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace,
+ CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace,
+ __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L",
+ "Maximum allowed value of bufspace (including metadata)");
+static long bufmallocspace;
+SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
+ "Amount of malloced memory for buffers");
+static long maxbufmallocspace;
+SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
+ 0, "Maximum amount of malloced memory for buffers");
+static long lobufspace;
+SYSCTL_PROC(_vfs, OID_AUTO, lobufspace,
+ CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace,
+ __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L",
+ "Minimum amount of buffers we want to have");
+long hibufspace;
+SYSCTL_PROC(_vfs, OID_AUTO, hibufspace,
+ CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace,
+ __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L",
+ "Maximum allowed value of bufspace (excluding metadata)");
+long bufspacethresh;
+SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh,
+ CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh,
+ __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L",
+ "Bufspace consumed before waking the daemon to free some");
+static counter_u64_t buffreekvacnt;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
+ "Number of times we have freed the KVA space from some buffer");
+static counter_u64_t bufdefragcnt;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt,
+ "Number of times we have had to repeat buffer allocation to defragment");
+static long lorunningspace;
+SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
+ CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
+ "Minimum preferred space used for in-progress I/O");
+static long hirunningspace;
+SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
+ CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
+ "Maximum amount of space to use for in-progress I/O");
+int dirtybufferflushes;
+SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
+ 0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
+int bdwriteskip;
+SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
+ 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
+int altbufferflushes;
+SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
+ 0, "Number of fsync flushes to limit dirty buffers");
+static int recursiveflushes;
+SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
+ 0, "Number of flushes skipped due to being recursive");
+static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers,
+ CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I",
+ "Number of buffers that are dirty (has unwritten changes) at the moment");
+static int lodirtybuffers;
+SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers,
+ CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers,
+ __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I",
+ "How many buffers we want to have free before bufdaemon can sleep");
+static int hidirtybuffers;
+SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers,
+ CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers,
+ __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I",
+ "When the number of dirty buffers is considered severe");
+int dirtybufthresh;
+SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh,
+ CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh,
+ __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I",
+ "Number of bdwrite to bawrite conversions to clear dirty buffers");
+static int numfreebuffers;
+SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
+ "Number of free buffers");
+static int lofreebuffers;
+SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers,
+ CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers,
+ __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I",
+ "Target number of free buffers");
+static int hifreebuffers;
+SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers,
+ CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers,
+ __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I",
+ "Threshold for clean buffer recycling");
+static counter_u64_t getnewbufcalls;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
+ &getnewbufcalls, "Number of calls to getnewbuf");
+static counter_u64_t getnewbufrestarts;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD,
+ &getnewbufrestarts,
+ "Number of times getnewbuf has had to restart a buffer acquisition");
+static counter_u64_t mappingrestarts;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD,
+ &mappingrestarts,
+ "Number of times getblk has had to restart a buffer mapping for "
+ "unmapped buffer");
+static counter_u64_t numbufallocfails;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW,
+ &numbufallocfails, "Number of times buffer allocations failed");
+static int flushbufqtarget = 100;
+SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
+ "Amount of work to do in flushbufqueues when helping bufdaemon");
+static counter_u64_t notbufdflushes;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes,
+ "Number of dirty buffer flushes done by the bufdaemon helpers");
+static long barrierwrites;
+SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
+ "Number of barrier writes");
+SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
+ &unmapped_buf_allowed, 0,
+ "Permit the use of the unmapped i/o");
+int maxbcachebuf = MAXBCACHEBUF;
+SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0,
+ "Maximum size of a buffer cache block");
+
+/*
+ * This lock synchronizes access to bd_request.
+ */
+static struct mtx_padalign __exclusive_cache_line bdlock;
+
+/*
+ * This lock protects runningbufreq and synchronizes runningbufwakeup() and
+ * waitrunningbufspace().
+ */
+static struct mtx_padalign __exclusive_cache_line rbreqlock;
+
+/*
+ * Lock that protects bdirtywait.
+ */
+static struct mtx_padalign __exclusive_cache_line bdirtylock;
+
+/*
+ * Wakeup point for bufdaemon, as well as indicator of whether it is already
+ * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it
+ * is idling.
+ */
+static int bd_request;
+
+/*
+ * Request for the buf daemon to write more buffers than is indicated by
+ * lodirtybuf. This may be necessary to push out excess dependencies or
+ * defragment the address space where a simple count of the number of dirty
+ * buffers is insufficient to characterize the demand for flushing them.
+ */
+static int bd_speedupreq;
+
+/*
+ * Synchronization (sleep/wakeup) variable for active buffer space requests.
+ * Set when wait starts, cleared prior to wakeup().
+ * Used in runningbufwakeup() and waitrunningbufspace().
+ */
+static int runningbufreq;
+
+/*
+ * Synchronization for bwillwrite() waiters.
+ */
+static int bdirtywait;
+
+/*
+ * Definitions for the buffer free lists.
+ */
+#define QUEUE_NONE 0 /* on no queue */
+#define QUEUE_EMPTY 1 /* empty buffer headers */
+#define QUEUE_DIRTY 2 /* B_DELWRI buffers */
+#define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */
+#define QUEUE_SENTINEL 4 /* not a queue index, but a sentinel marker */
+
+/* Maximum number of buffer domains. */
+#define BUF_DOMAINS 8
+
+struct bufdomainset bdlodirty; /* Domains > lodirty */
+struct bufdomainset bdhidirty; /* Domains > hidirty */
+
+/* Configured number of clean queues. */
+static int __read_mostly buf_domains;
+
+BITSET_DEFINE(bufdomainset, BUF_DOMAINS);
+struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS];
+struct bufqueue __exclusive_cache_line bqempty;
+
+/*
+ * per-cpu empty buffer cache.
+ */
+uma_zone_t buf_zone;
+
+/*
+ * Single global constant for BUF_WMESG, to avoid getting multiple references.
+ * buf_wmesg is referred from macros.
+ */
+const char *buf_wmesg = BUF_WMESG;
+
+static int
+sysctl_runningspace(SYSCTL_HANDLER_ARGS)
+{
+ long value;
+ int error;
+
+ value = *(long *)arg1;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ mtx_lock(&rbreqlock);
+ if (arg1 == &hirunningspace) {
+ if (value < lorunningspace)
+ error = EINVAL;
+ else
+ hirunningspace = value;
+ } else {
+ KASSERT(arg1 == &lorunningspace,
+ ("%s: unknown arg1", __func__));
+ if (value > hirunningspace)
+ error = EINVAL;
+ else
+ lorunningspace = value;
+ }
+ mtx_unlock(&rbreqlock);
+ return (error);
+}
+
+static int
+sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int value;
+ int i;
+
+ value = *(int *)arg1;
+ error = sysctl_handle_int(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ *(int *)arg1 = value;
+ for (i = 0; i < buf_domains; i++)
+ *(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) =
+ value / buf_domains;
+
+ return (error);
+}
+
+static int
+sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS)
+{
+ long value;
+ int error;
+ int i;
+
+ value = *(long *)arg1;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ *(long *)arg1 = value;
+ for (i = 0; i < buf_domains; i++)
+ *(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) =
+ value / buf_domains;
+
+ return (error);
+}
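+
+/*
+ * The two handlers above back the legacy global tunables: the value the
+ * user reads or writes is the global one, and on a write it is divided
+ * evenly among the configured domains, with arg2 carrying the offset of
+ * the corresponding field inside struct bufdomain.
+ */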
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+static int
+sysctl_bufspace(SYSCTL_HANDLER_ARGS)
+{
+ long lvalue;
+ int ivalue;
+ int i;
+
+ lvalue = 0;
+ for (i = 0; i < buf_domains; i++)
+ lvalue += bdomain[i].bd_bufspace;
+ if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
+ return (sysctl_handle_long(oidp, &lvalue, 0, req));
+ if (lvalue > INT_MAX)
+ /* On overflow, still write out a long to trigger ENOMEM. */
+ return (sysctl_handle_long(oidp, &lvalue, 0, req));
+ ivalue = lvalue;
+ return (sysctl_handle_int(oidp, &ivalue, 0, req));
+}
+#else
+static int
+sysctl_bufspace(SYSCTL_HANDLER_ARGS)
+{
+ long lvalue;
+ int i;
+
+ lvalue = 0;
+ for (i = 0; i < buf_domains; i++)
+ lvalue += bdomain[i].bd_bufspace;
+ return (sysctl_handle_long(oidp, &lvalue, 0, req));
+}
+#endif
+
+static int
+sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS)
+{
+ int value;
+ int i;
+
+ value = 0;
+ for (i = 0; i < buf_domains; i++)
+ value += bdomain[i].bd_numdirtybuffers;
+ return (sysctl_handle_int(oidp, &value, 0, req));
+}
+
+/*
+ * bdirtywakeup:
+ *
+ * Wakeup any bwillwrite() waiters.
+ */
+static void
+bdirtywakeup(void)
+{
+ mtx_lock(&bdirtylock);
+ if (bdirtywait) {
+ bdirtywait = 0;
+ wakeup(&bdirtywait);
+ }
+ mtx_unlock(&bdirtylock);
+}
+
+/*
+ * bd_clear:
+ *
+ * Clear a domain from the appropriate bitsets when dirtybuffers
+ * is decremented.
+ */
+static void
+bd_clear(struct bufdomain *bd)
+{
+
+ mtx_lock(&bdirtylock);
+ if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers)
+ BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
+ if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers)
+ BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
+ mtx_unlock(&bdirtylock);
+}
+
+/*
+ * bd_set:
+ *
+ * Set a domain in the appropriate bitsets when dirtybuffers
+ * is incremented.
+ */
+static void
+bd_set(struct bufdomain *bd)
+{
+
+ mtx_lock(&bdirtylock);
+ if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers)
+ BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
+ if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers)
+ BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
+ mtx_unlock(&bdirtylock);
+}
+
+/*
+ * bdirtysub:
+ *
+ * Decrement the numdirtybuffers count by one and wakeup any
+ * threads blocked in bwillwrite().
+ */
+static void
+bdirtysub(struct buf *bp)
+{
+ struct bufdomain *bd;
+ int num;
+
+ bd = bufdomain(bp);
+ num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1);
+ if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
+ bdirtywakeup();
+ if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
+ bd_clear(bd);
+}
+
+/*
+ * bdirtyadd:
+ *
+ * Increment the numdirtybuffers count by one and wakeup the buf
+ * daemon if needed.
+ */
+static void
+bdirtyadd(struct buf *bp)
+{
+ struct bufdomain *bd;
+ int num;
+
+ /*
+ * Only do the wakeup once as we cross the boundary. The
+ * buf daemon will keep running until the condition clears.
+ */
+ bd = bufdomain(bp);
+ num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1);
+ if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
+ bd_wakeup();
+ if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
+ bd_set(bd);
+}
+
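+/*
+ * Worked example (hypothetical limits): with bd_lodirtybuffers == 100
+ * and bd_hidirtybuffers == 200, bdirtyadd() wakes the buf daemon only
+ * on the increment that crosses (100 + 200) / 2 == 150 dirty buffers,
+ * and bdirtysub() wakes bwillwrite() sleepers on the matching decrement.
+ */
+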
+/*
+ * bufspace_daemon_wakeup:
+ *
+ * Wakeup the daemons responsible for freeing clean bufs.
+ */
+static void
+bufspace_daemon_wakeup(struct bufdomain *bd)
+{
+
+ /*
+ * avoid the lock if the daemon is running.
+ */
+ if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) {
+ BD_RUN_LOCK(bd);
+ atomic_store_int(&bd->bd_running, 1);
+ wakeup(&bd->bd_running);
+ BD_RUN_UNLOCK(bd);
+ }
+}
+
+/*
+ * bufspace_daemon_wait:
+ *
+ * Sleep until the domain falls below a limit or one second passes.
+ */
+static void
+bufspace_daemon_wait(struct bufdomain *bd)
+{
+ /*
+ * Re-check our limits and sleep. bd_running must be
+ * cleared prior to checking the limits to avoid missed
+ * wakeups. The waker will adjust one of bufspace or
+ * freebuffers prior to checking bd_running.
+ */
+ BD_RUN_LOCK(bd);
+ atomic_store_int(&bd->bd_running, 0);
+ if (bd->bd_bufspace < bd->bd_bufspacethresh &&
+ bd->bd_freebuffers > bd->bd_lofreebuffers) {
+ msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP,
+ "-", hz);
+ } else {
+ /* Avoid spurious wakeups while running. */
+ atomic_store_int(&bd->bd_running, 1);
+ BD_RUN_UNLOCK(bd);
+ }
+}
+
+/*
+ * bufspace_adjust:
+ *
+ * Adjust the reported bufspace for a KVA managed buffer, possibly
+ * waking any waiters.
+ */
+static void
+bufspace_adjust(struct buf *bp, int bufsize)
+{
+ struct bufdomain *bd;
+ long space;
+ int diff;
+
+ KASSERT((bp->b_flags & B_MALLOC) == 0,
+ ("bufspace_adjust: malloc buf %p", bp));
+ bd = bufdomain(bp);
+ diff = bufsize - bp->b_bufsize;
+ if (diff < 0) {
+ atomic_subtract_long(&bd->bd_bufspace, -diff);
+ } else if (diff > 0) {
+ space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
+ /* Wake up the daemon on the transition. */
+ if (space < bd->bd_bufspacethresh &&
+ space + diff >= bd->bd_bufspacethresh)
+ bufspace_daemon_wakeup(bd);
+ }
+ bp->b_bufsize = bufsize;
+}
+
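+/*
+ * Worked example (hypothetical values): with bd_bufspacethresh == 1000,
+ * a grow where space == 900 before the add and diff == 200 satisfies
+ * space < thresh && space + diff >= thresh, so the bufspace daemon is
+ * woken exactly once, on the crossing.
+ */
+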
+/*
+ * bufspace_reserve:
+ *
+ * Reserve bufspace before calling allocbuf(). Metadata has a
+ * different space limit than data.
+ */
+static int
+bufspace_reserve(struct bufdomain *bd, int size, bool metadata)
+{
+ long limit, new;
+ long space;
+
+ if (metadata)
+ limit = bd->bd_maxbufspace;
+ else
+ limit = bd->bd_hibufspace;
+ space = atomic_fetchadd_long(&bd->bd_bufspace, size);
+ new = space + size;
+ if (new > limit) {
+ atomic_subtract_long(&bd->bd_bufspace, size);
+ return (ENOSPC);
+ }
+
+ /* Wake up the daemon on the transition. */
+ if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh)
+ bufspace_daemon_wakeup(bd);
+
+ return (0);
+}
+
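+/*
+ * Worked example (hypothetical values): with bd_hibufspace == 1000 and
+ * bd_bufspace == 950, a non-metadata reservation of size 100 yields
+ * new == 1050 > limit, so the add is backed out and ENOSPC returned;
+ * the same request against bd_maxbufspace == 1200 (metadata) succeeds.
+ */
+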
+/*
+ * bufspace_release:
+ *
+ * Release reserved bufspace after bufspace_adjust() has consumed it.
+ */
+static void
+bufspace_release(struct bufdomain *bd, int size)
+{
+
+ atomic_subtract_long(&bd->bd_bufspace, size);
+}
+
+/*
+ * bufspace_wait:
+ *
+ * Wait for bufspace, acting as the buf daemon if a locked vnode is
+ * supplied. bd_wanted must be set prior to polling for space. The
+ * operation must be re-tried on return.
+ */
+static void
+bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags,
+ int slpflag, int slptimeo)
+{
+ struct thread *td;
+ int error, fl, norunbuf;
+
+ if ((gbflags & GB_NOWAIT_BD) != 0)
+ return;
+
+ td = curthread;
+ BD_LOCK(bd);
+ while (bd->bd_wanted) {
+ if (vp != NULL && vp->v_type != VCHR &&
+ (td->td_pflags & TDP_BUFNEED) == 0) {
+ BD_UNLOCK(bd);
+ /*
+ * getblk() is called with a vnode locked, and
+ * some majority of the dirty buffers may as
+ * well belong to the vnode. Flushing the
+ * buffers there would make progress that
+ * cannot be achieved by the buf_daemon, which
+ * cannot lock the vnode.
+ */
+ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+ (td->td_pflags & TDP_NORUNNINGBUF);
+
+ /*
+ * Play bufdaemon. The getnewbuf() function
+ * may be called while the thread owns lock
+ * for another dirty buffer for the same
+ * vnode, which makes it impossible to use
+ * VOP_FSYNC() there, due to the buffer lock
+ * recursion.
+ */
+ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+ fl = buf_flush(vp, bd, flushbufqtarget);
+ td->td_pflags &= norunbuf;
+ BD_LOCK(bd);
+ if (fl != 0)
+ continue;
+ if (bd->bd_wanted == 0)
+ break;
+ }
+ error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
+ (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
+ if (error != 0)
+ break;
+ }
+ BD_UNLOCK(bd);
+}
+
+
+/*
+ * bufspace_daemon:
+ *
+ * buffer space management daemon. Tries to maintain some marginal
+ * amount of free buffer space so that requesting processes neither
+ * block nor work to reclaim buffers.
+ */
+static void
+bufspace_daemon(void *arg)
+{
+ struct bufdomain *bd;
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread,
+ SHUTDOWN_PRI_LAST + 100);
+
+ bd = arg;
+ for (;;) {
+ kthread_suspend_check();
+
+ /*
+ * Free buffers from the clean queue until we meet our
+ * targets.
+ *
+ * Theory of operation: The buffer cache is most efficient
+ * when some free buffer headers and space are always
+ * available to getnewbuf(). This daemon attempts to prevent
+ * the excessive blocking and synchronization associated
+ * with shortfall. It goes through three phases according
+ * to demand:
+ *
+ * 1) The daemon wakes up voluntarily once per second
+ * during idle periods when the counters are below
+ * the wakeup thresholds (bufspacethresh, lofreebuffers).
+ *
+ * 2) The daemon wakes up as we cross the thresholds
+ * ahead of any potential blocking. This may bounce
+ * slightly according to the rate of consumption and
+ * release.
+ *
+ * 3) The daemon and consumers are starved for working
+ * clean buffers. This is the 'bufspace' sleep below
+ * which will inefficiently trade bufs with bqrelse
+ * until we return to condition 2.
+ */
+ while (bd->bd_bufspace > bd->bd_lobufspace ||
+ bd->bd_freebuffers < bd->bd_hifreebuffers) {
+ if (buf_recycle(bd, false) != 0) {
+ if (bd_flushall(bd))
+ continue;
+ /*
+ * Speedup dirty if we've run out of clean
+ * buffers. This is possible in particular
+ * because softdep may hold many bufs locked
+ * pending writes to other bufs which are
+ * marked for delayed write, exhausting
+ * clean space until they are written.
+ */
+ bd_speedup();
+ BD_LOCK(bd);
+ if (bd->bd_wanted) {
+ msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
+ PRIBIO|PDROP, "bufspace", hz/10);
+ } else
+ BD_UNLOCK(bd);
+ }
+ maybe_yield();
+ }
+ bufspace_daemon_wait(bd);
+ }
+}
+
+/*
+ * bufmallocadjust:
+ *
+ * Adjust the reported bufspace for a malloc managed buffer, possibly
+ * waking any waiters.
+ */
+static void
+bufmallocadjust(struct buf *bp, int bufsize)
+{
+ int diff;
+
+ KASSERT((bp->b_flags & B_MALLOC) != 0,
+ ("bufmallocadjust: non-malloc buf %p", bp));
+ diff = bufsize - bp->b_bufsize;
+ if (diff < 0)
+ atomic_subtract_long(&bufmallocspace, -diff);
+ else
+ atomic_add_long(&bufmallocspace, diff);
+ bp->b_bufsize = bufsize;
+}
+
+/*
+ * runningwakeup:
+ *
+ * Wake up processes that are waiting on asynchronous writes to fall
+ * below lorunningspace.
+ */
+static void
+runningwakeup(void)
+{
+
+ mtx_lock(&rbreqlock);
+ if (runningbufreq) {
+ runningbufreq = 0;
+ wakeup(&runningbufreq);
+ }
+ mtx_unlock(&rbreqlock);
+}
+
+/*
+ * runningbufwakeup:
+ *
+ * Decrement the outstanding write count accordingly.
+ */
+void
+runningbufwakeup(struct buf *bp)
+{
+ long space, bspace;
+
+ bspace = bp->b_runningbufspace;
+ if (bspace == 0)
+ return;
+ space = atomic_fetchadd_long(&runningbufspace, -bspace);
+ KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
+ space, bspace));
+ bp->b_runningbufspace = 0;
+ /*
+ * Only acquire the lock and wakeup on the transition from exceeding
+ * the threshold to falling below it.
+ */
+ if (space < lorunningspace)
+ return;
+ if (space - bspace > lorunningspace)
+ return;
+ runningwakeup();
+}
+
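+/*
+ * Worked example (hypothetical values): with lorunningspace == 1 MiB,
+ * completing a 128 KiB write that drops runningbufspace from 1.1 MiB
+ * (the old value returned by the fetchadd) to below 1 MiB crosses the
+ * threshold, so runningwakeup() is called; completions that stay
+ * entirely above or entirely below the threshold skip the lock.
+ */
+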
+/*
+ * waitrunningbufspace()
+ *
+ * runningbufspace is a measure of the amount of I/O currently
+ * running. This routine is used in async-write situations to
+ * prevent creating huge backups of pending writes to a device.
+ * Only asynchronous writes are governed by this function.
+ *
+ * This does NOT turn an async write into a sync write. It waits
+ * for earlier writes to complete and generally returns before the
+ * caller's write has reached the device.
+ */
+void
+waitrunningbufspace(void)
+{
+
+ mtx_lock(&rbreqlock);
+ while (runningbufspace > hirunningspace) {
+ runningbufreq = 1;
+ msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
+ }
+ mtx_unlock(&rbreqlock);
+}
+
+
+/*
+ * vfs_buf_test_cache:
+ *
+ * Called when a buffer is extended. This function clears the B_CACHE
+ * bit if the newly extended portion of the buffer does not contain
+ * valid data.
+ */
+static __inline void
+vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
+ vm_offset_t size, vm_page_t m)
+{
+
+ VM_OBJECT_ASSERT_LOCKED(m->object);
+ if (bp->b_flags & B_CACHE) {
+ int base = (foff + off) & PAGE_MASK;
+ if (vm_page_is_valid(m, base, size) == 0)
+ bp->b_flags &= ~B_CACHE;
+ }
+}
+
+/* Wake up the buffer daemon if necessary */
+static void
+bd_wakeup(void)
+{
+
+ mtx_lock(&bdlock);
+ if (bd_request == 0) {
+ bd_request = 1;
+ wakeup(&bd_request);
+ }
+ mtx_unlock(&bdlock);
+}
+
+/*
+ * Adjust the maxbcachebuf tunable.
+ */
+static void
+maxbcachebuf_adjust(void)
+{
+ int i;
+
+ /*
+ * maxbcachebuf must be a power of 2 >= MAXBSIZE.
+ */
+ i = 2;
+ while (i * 2 <= maxbcachebuf)
+ i *= 2;
+ maxbcachebuf = i;
+ if (maxbcachebuf < MAXBSIZE)
+ maxbcachebuf = MAXBSIZE;
+ if (maxbcachebuf > MAXPHYS)
+ maxbcachebuf = MAXPHYS;
+ if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF)
+ printf("maxbcachebuf=%d\n", maxbcachebuf);
+}
+
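+/*
+ * Worked example (hypothetical tunable value): a maxbcachebuf of 96 KiB
+ * is rounded down by the loop above to 64 KiB (the largest power of 2
+ * not exceeding it) and then clamped to the [MAXBSIZE, MAXPHYS] range.
+ */
+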
+/*
+ * bd_speedup - speedup the buffer cache flushing code
+ */
+void
+bd_speedup(void)
+{
+ int needwake;
+
+ mtx_lock(&bdlock);
+ needwake = 0;
+ if (bd_speedupreq == 0 || bd_request == 0)
+ needwake = 1;
+ bd_speedupreq = 1;
+ bd_request = 1;
+ if (needwake)
+ wakeup(&bd_request);
+ mtx_unlock(&bdlock);
+}
+
+#ifndef NSWBUF_MIN
+#define NSWBUF_MIN 16
+#endif
+
+#ifdef __i386__
+#define TRANSIENT_DENOM 5
+#else
+#define TRANSIENT_DENOM 10
+#endif
+
+/*
+ * Calculate buffer cache scaling values and reserve space for buffer
+ * headers. This is called during low level kernel initialization and
+ * may be called more than once. We CANNOT write to the memory area
+ * being reserved at this time.
+ */
+caddr_t
+kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
+{
+ int tuned_nbuf;
+ long maxbuf, maxbuf_sz, buf_sz, biotmap_sz;
+
+ /*
+ * physmem_est is in pages. Convert it to kilobytes (assumes
+ * PAGE_SIZE is >= 1K)
+ */
+ physmem_est = physmem_est * (PAGE_SIZE / 1024);
+
+ maxbcachebuf_adjust();
+ /*
+ * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
+ * For the first 64MB of ram nominally allocate sufficient buffers to
+ * cover 1/4 of our ram. Beyond the first 64MB allocate additional
+ * buffers to cover 1/10 of our ram over 64MB. When auto-sizing
+ * the buffer cache we limit the eventual kva reservation to
+ * maxbcache bytes.
+ *
+ * factor represents the 1/4 x ram conversion.
+ */
+ if (nbuf == 0) {
+ int factor = 4 * BKVASIZE / 1024;
+
+ nbuf = 50;
+ if (physmem_est > 4096)
+ nbuf += min((physmem_est - 4096) / factor,
+ 65536 / factor);
+ if (physmem_est > 65536)
+ nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
+ 32 * 1024 * 1024 / (factor * 5));
+
+ if (maxbcache && nbuf > maxbcache / BKVASIZE)
+ nbuf = maxbcache / BKVASIZE;
+ tuned_nbuf = 1;
+ } else
+ tuned_nbuf = 0;
+
+ /* XXX Avoid unsigned long overflows later on with maxbufspace. */
+ maxbuf = (LONG_MAX / 3) / BKVASIZE;
+ if (nbuf > maxbuf) {
+ if (!tuned_nbuf)
+ printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
+ maxbuf);
+ nbuf = maxbuf;
+ }
+
+ /*
+ * Ideal allocation size for the transient bio submap is 10%
+ * of the maximum buffer map size. This roughly corresponds
+ * to the amount of the buffer mapped for typical UFS load.
+ *
+ * Clip the buffer map to reserve space for the transient
+ * BIOs, if its extent is bigger than 90% (80% on i386) of the
+ * maximum buffer map extent on the platform.
+ *
+ * The fall-back to maxbuf when maxbcache is unset allows the
+ * buffer KVA to be left untrimmed on architectures with ample
+ * KVA space.
+ */
+ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
+ maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
+ buf_sz = (long)nbuf * BKVASIZE;
+ if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
+ (TRANSIENT_DENOM - 1)) {
+ /*
+ * There is more KVA than memory. Do not
+ * adjust buffer map size, and assign the rest
+ * of maxbuf to transient map.
+ */
+ biotmap_sz = maxbuf_sz - buf_sz;
+ } else {
+ /*
+ * Buffer map spans all KVA we could afford on
+ * this platform. Give 10% (20% on i386) of
+ * the buffer map to the transient bio map.
+ */
+ biotmap_sz = buf_sz / TRANSIENT_DENOM;
+ buf_sz -= biotmap_sz;
+ }
+ if (biotmap_sz / INT_MAX > MAXPHYS)
+ bio_transient_maxcnt = INT_MAX;
+ else
+ bio_transient_maxcnt = biotmap_sz / MAXPHYS;
+ /*
+ * Artificially limit to 1024 simultaneous in-flight I/Os
+ * using the transient mapping.
+ */
+ if (bio_transient_maxcnt > 1024)
+ bio_transient_maxcnt = 1024;
+ if (tuned_nbuf)
+ nbuf = buf_sz / BKVASIZE;
+ }
+
+ /*
+ * swbufs are used as temporary holders for I/O, such as paging I/O.
+ * We have no fewer than 16 and no more than 256.
+ */
+ nswbuf = min(nbuf / 4, 256);
+ TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
+ if (nswbuf < NSWBUF_MIN)
+ nswbuf = NSWBUF_MIN;
+
+ /*
+ * Reserve space for the buffer cache buffers
+ */
+ swbuf = (void *)v;
+ v = (caddr_t)(swbuf + nswbuf);
+ buf = (void *)v;
+ v = (caddr_t)(buf + nbuf);
+
+ return(v);
+}
+
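+/*
+ * Worked example (illustrative; assumes BKVASIZE == 16 KiB, so
+ * factor == 64): with physmem_est == 256 MB == 262144 KiB,
+ * nbuf = 50 + min((262144 - 4096) / 64, 65536 / 64) == 50 + 1024,
+ * plus min((262144 - 65536) * 2 / 320, 32 MiB / 320) == 1228 for the
+ * memory above 64 MB, giving roughly 2302 buffers before clamping.
+ */
+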
+/* Initialize the buffer subsystem. Called before use of any buffers. */
+void
+bufinit(void)
+{
+ struct buf *bp;
+ int i;
+
+ KASSERT(maxbcachebuf >= MAXBSIZE,
+ ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
+ MAXBSIZE));
+ bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
+ mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
+ mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
+ mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
+
+ unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
+
+ /* finally, initialize each buffer header and stick on empty q */
+ for (i = 0; i < nbuf; i++) {
+ bp = &buf[i];
+ bzero(bp, sizeof *bp);
+ bp->b_flags = B_INVAL;
+ bp->b_rcred = NOCRED;
+ bp->b_wcred = NOCRED;
+ bp->b_qindex = QUEUE_NONE;
+ bp->b_domain = -1;
+ bp->b_subqueue = mp_maxid + 1;
+ bp->b_xflags = 0;
+ bp->b_data = bp->b_kvabase = unmapped_buf;
+ LIST_INIT(&bp->b_dep);
+ BUF_LOCKINIT(bp);
+ bq_insert(&bqempty, bp, false);
+ }
+
+ /*
+ * maxbufspace is the absolute maximum amount of buffer space we are
+ * allowed to reserve in KVM and in real terms. The absolute maximum
+ * is nominally used by metadata. hibufspace is the nominal maximum
+ * used by most other requests. The differential is required to
+ * ensure that metadata deadlocks don't occur.
+ *
+ * maxbufspace is based on BKVASIZE. Allocating buffers larger than
+ * this may result in KVM fragmentation which is not handled optimally
+ * by the system. XXX This is less true with vmem. We could use
+ * PAGE_SIZE.
+ */
+ maxbufspace = (long)nbuf * BKVASIZE;
+ hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10);
+ lobufspace = (hibufspace / 20) * 19; /* 95% */
+ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
+
+ /*
+ * Note: The 16 MiB upper limit for hirunningspace was chosen
+ * arbitrarily and may need further tuning. It corresponds to
+ * 128 outstanding write IO requests (if IO size is 128 KiB),
+ * which fits with many RAID controllers' tagged queuing limits.
+ * The lower 1 MiB limit is the historical upper limit for
+ * hirunningspace.
+ */
+ hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf),
+ 16 * 1024 * 1024), 1024 * 1024);
+ lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf);
+
+ /*
+ * Limit the amount of malloc memory since it is wired permanently into
+ * the kernel space. Even though this is accounted for in the buffer
+ * allocation, we don't want the malloced region to grow uncontrolled.
+ * The malloc scheme improves memory utilization significantly on
+ * average (small) directories.
+ */
+ maxbufmallocspace = hibufspace / 20;
+
+ /*
+ * Reduce the chance of a deadlock occurring by limiting the number
+ * of delayed-write dirty buffers we allow to stack up.
+ */
+ hidirtybuffers = nbuf / 4 + 20;
+ dirtybufthresh = hidirtybuffers * 9 / 10;
+ /*
+ * To support extreme low-memory systems, make sure hidirtybuffers
+ * cannot eat up all available buffer space. This occurs when our
+ * minimum cannot be met. We try to size hidirtybuffers to 3/4 our
+ * buffer space assuming BKVASIZE'd buffers.
+ */
+ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
+ hidirtybuffers >>= 1;
+ }
+ lodirtybuffers = hidirtybuffers / 2;
+
+ /*
+ * lofreebuffers should be sufficient to avoid stalling waiting on
+ * buf headers under heavy utilization. The bufs in per-cpu caches
+ * are counted as free but will be unavailable to threads executing
+ * on other cpus.
+ *
+ * hifreebuffers is the free target for the bufspace daemon. This
+ * should be set appropriately to limit work per-iteration.
+ */
+ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
+ hifreebuffers = (3 * lofreebuffers) / 2;
+ numfreebuffers = nbuf;
+
+ /* Setup the kva and free list allocators. */
+ vmem_set_reclaim(buffer_arena, bufkva_reclaim);
+ buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
+ NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
+
+ /*
+ * Size the clean queue according to the amount of buffer space.
+ * One queue per 256 MB up to the max. More queues give better
+ * concurrency but less accurate LRU.
+ */
+ buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS);
+ for (i = 0 ; i < buf_domains; i++) {
+ struct bufdomain *bd;
+
+ bd = &bdomain[i];
+ bd_init(bd);
+ bd->bd_freebuffers = nbuf / buf_domains;
+ bd->bd_hifreebuffers = hifreebuffers / buf_domains;
+ bd->bd_lofreebuffers = lofreebuffers / buf_domains;
+ bd->bd_bufspace = 0;
+ bd->bd_maxbufspace = maxbufspace / buf_domains;
+ bd->bd_hibufspace = hibufspace / buf_domains;
+ bd->bd_lobufspace = lobufspace / buf_domains;
+ bd->bd_bufspacethresh = bufspacethresh / buf_domains;
+ bd->bd_numdirtybuffers = 0;
+ bd->bd_hidirtybuffers = hidirtybuffers / buf_domains;
+ bd->bd_lodirtybuffers = lodirtybuffers / buf_domains;
+ bd->bd_dirtybufthresh = dirtybufthresh / buf_domains;
+ /* Don't allow more than 2% of bufs in the per-cpu caches. */
+ bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus;
+ }
+ getnewbufcalls = counter_u64_alloc(M_WAITOK);
+ getnewbufrestarts = counter_u64_alloc(M_WAITOK);
+ mappingrestarts = counter_u64_alloc(M_WAITOK);
+ numbufallocfails = counter_u64_alloc(M_WAITOK);
+ notbufdflushes = counter_u64_alloc(M_WAITOK);
+ buffreekvacnt = counter_u64_alloc(M_WAITOK);
+ bufdefragcnt = counter_u64_alloc(M_WAITOK);
+ bufkvaspace = counter_u64_alloc(M_WAITOK);
+}
+
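+/*
+ * Worked example for the domain setup above (hypothetical sizes): a
+ * maxbufspace of about 512 MB yields buf_domains == 2, so each domain
+ * gets half of every limit; with nbuf == 2000 and mp_ncpus == 4, the
+ * per-cpu clean queue cap is bd_lim = 2000 / 2 / 50 / 4 == 5 bufs.
+ */
+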
+#ifdef INVARIANTS
+static inline void
+vfs_buf_check_mapped(struct buf *bp)
+{
+
+ KASSERT(bp->b_kvabase != unmapped_buf,
+ ("mapped buf: b_kvabase was not updated %p", bp));
+ KASSERT(bp->b_data != unmapped_buf,
+ ("mapped buf: b_data was not updated %p", bp));
+ KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf +
+ MAXPHYS, ("b_data + b_offset unmapped %p", bp));
+}
+
+static inline void
+vfs_buf_check_unmapped(struct buf *bp)
+{
+
+ KASSERT(bp->b_data == unmapped_buf,
+ ("unmapped buf: corrupted b_data %p", bp));
+}
+
+#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
+#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
+#else
+#define BUF_CHECK_MAPPED(bp) do {} while (0)
+#define BUF_CHECK_UNMAPPED(bp) do {} while (0)
+#endif
+
+static int
+isbufbusy(struct buf *bp)
+{
+ if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) ||
+ ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
+ return (1);
+ return (0);
+}
+
+/*
+ * Shutdown the system cleanly to prepare for reboot, halt, or power off.
+ */
+void
+bufshutdown(int show_busybufs)
+{
+ static int first_buf_printf = 1;
+ struct buf *bp;
+ int iter, nbusy, pbusy;
+#ifndef PREEMPTION
+ int subiter;
+#endif
+
+ /*
+ * Sync filesystems for shutdown
+ */
+ wdog_kern_pat(WD_LASTVAL);
+ sys_sync(curthread, NULL);
+
+ /*
+ * With soft updates, some buffers that are
+ * written will be remarked as dirty until other
+ * buffers are written.
+ */
+ for (iter = pbusy = 0; iter < 20; iter++) {
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; )
+ if (isbufbusy(bp))
+ nbusy++;
+ if (nbusy == 0) {
+ if (first_buf_printf)
+ printf("All buffers synced.");
+ break;
+ }
+ if (first_buf_printf) {
+ printf("Syncing disks, buffers remaining... ");
+ first_buf_printf = 0;
+ }
+ printf("%d ", nbusy);
+ if (nbusy < pbusy)
+ iter = 0;
+ pbusy = nbusy;
+
+ wdog_kern_pat(WD_LASTVAL);
+ sys_sync(curthread, NULL);
+
+#ifdef PREEMPTION
+ /*
+ * Spin for a while to allow interrupt threads to run.
+ */
+ DELAY(50000 * iter);
+#else
+ /*
+ * Context switch several times to allow interrupt
+ * threads to run.
+ */
+ for (subiter = 0; subiter < 50 * iter; subiter++) {
+ thread_lock(curthread);
+ mi_switch(SW_VOL, NULL);
+ thread_unlock(curthread);
+ DELAY(1000);
+ }
+#endif
+ }
+ printf("\n");
+ /*
+ * Count only busy local buffers to prevent forcing
+ * a fsck if we're just a client of a wedged NFS server
+ */
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; ) {
+ if (isbufbusy(bp)) {
+#if 0
+/* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */
+ if (bp->b_dev == NULL) {
+ TAILQ_REMOVE(&mountlist,
+ bp->b_vp->v_mount, mnt_list);
+ continue;
+ }
+#endif
+ nbusy++;
+ if (show_busybufs > 0) {
+ printf(
+ "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
+ nbusy, bp, bp->b_vp, bp->b_flags,
+ (intmax_t)bp->b_blkno,
+ (intmax_t)bp->b_lblkno);
+ BUF_LOCKPRINTINFO(bp);
+ if (show_busybufs > 1)
+ vn_printf(bp->b_vp,
+ "vnode content: ");
+ }
+ }
+ }
+ if (nbusy) {
+ /*
+ * Failed to sync all blocks. Indicate this and don't
+ * unmount filesystems (thus forcing an fsck on reboot).
+ */
+ printf("Giving up on %d buffers\n", nbusy);
+ DELAY(5000000); /* 5 seconds */
+ } else {
+ if (!first_buf_printf)
+ printf("Final sync complete\n");
+ /*
+ * Unmount filesystems
+ */
+ if (panicstr == NULL)
+ vfs_unmountall();
+ }
+ swapoff_all();
+ DELAY(100000); /* wait for console output to finish */
+}
+
+static void
+bpmap_qenter(struct buf *bp)
+{
+
+ BUF_CHECK_MAPPED(bp);
+
+ /*
+ * bp->b_data is relative to bp->b_offset, but
+ * bp->b_offset may be offset into the first page.
+ */
+ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
+ pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
+ bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+ (vm_offset_t)(bp->b_offset & PAGE_MASK));
+}
+
+static inline struct bufdomain *
+bufdomain(struct buf *bp)
+{
+
+ return (&bdomain[bp->b_domain]);
+}
+
+static struct bufqueue *
+bufqueue(struct buf *bp)
+{
+
+ switch (bp->b_qindex) {
+ case QUEUE_NONE:
+ /* FALLTHROUGH */
+ case QUEUE_SENTINEL:
+ return (NULL);
+ case QUEUE_EMPTY:
+ return (&bqempty);
+ case QUEUE_DIRTY:
+ return (&bufdomain(bp)->bd_dirtyq);
+ case QUEUE_CLEAN:
+ return (&bufdomain(bp)->bd_subq[bp->b_subqueue]);
+ default:
+ break;
+ }
+ panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex);
+}
+
+/*
+ * Return the locked bufqueue that bp is a member of.
+ */
+static struct bufqueue *
+bufqueue_acquire(struct buf *bp)
+{
+ struct bufqueue *bq, *nbq;
+
+ /*
+ * bp can be pushed from a per-cpu queue to the
+ * cleanq while we're waiting on the lock. Retry
+ * if the queues don't match.
+ */
+ bq = bufqueue(bp);
+ BQ_LOCK(bq);
+ for (;;) {
+ nbq = bufqueue(bp);
+ if (bq == nbq)
+ break;
+ BQ_UNLOCK(bq);
+ BQ_LOCK(nbq);
+ bq = nbq;
+ }
+ return (bq);
+}
+
+/*
+ * binsfree:
+ *
+ * Insert the buffer into the appropriate free list. Requires a
+ * locked buffer on entry and buffer is unlocked before return.
+ */
+static void
+binsfree(struct buf *bp, int qindex)
+{
+ struct bufdomain *bd;
+ struct bufqueue *bq;
+
+ KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY,
+ ("binsfree: Invalid qindex %d", qindex));
+ BUF_ASSERT_XLOCKED(bp);
+
+ /*
+ * Handle delayed bremfree() processing.
+ */
+ if (bp->b_flags & B_REMFREE) {
+ if (bp->b_qindex == qindex) {
+ bp->b_flags |= B_REUSE;
+ bp->b_flags &= ~B_REMFREE;
+ BUF_UNLOCK(bp);
+ return;
+ }
+ bq = bufqueue_acquire(bp);
+ bq_remove(bq, bp);
+ BQ_UNLOCK(bq);
+ }
+ bd = bufdomain(bp);
+ if (qindex == QUEUE_CLEAN) {
+ if (bd->bd_lim != 0)
+ bq = &bd->bd_subq[PCPU_GET(cpuid)];
+ else
+ bq = bd->bd_cleanq;
+ } else
+ bq = &bd->bd_dirtyq;
+ bq_insert(bq, bp, true);
+}
+
+/*
+ * buf_free:
+ *
+ * Free a buffer to the buf zone once it no longer has valid contents.
+ */
+static void
+buf_free(struct buf *bp)
+{
+
+ if (bp->b_flags & B_REMFREE)
+ bremfreef(bp);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 1");
+ if (bp->b_rcred != NOCRED) {
+ crfree(bp->b_rcred);
+ bp->b_rcred = NOCRED;
+ }
+ if (bp->b_wcred != NOCRED) {
+ crfree(bp->b_wcred);
+ bp->b_wcred = NOCRED;
+ }
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_deallocate(bp);
+ bufkva_free(bp);
+ atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1);
+ BUF_UNLOCK(bp);
+ uma_zfree(buf_zone, bp);
+}
+
+/*
+ * buf_import:
+ *
+ * Import bufs into the uma cache from the buf list. The system still
+ * expects a static array of bufs and much of the synchronization
+ * around bufs assumes type stable storage. As a result, UMA is used
+ * only as a per-cpu cache of bufs still maintained on a global list.
+ */
+static int
+buf_import(void *arg, void **store, int cnt, int domain, int flags)
+{
+ struct buf *bp;
+ int i;
+
+ BQ_LOCK(&bqempty);
+ for (i = 0; i < cnt; i++) {
+ bp = TAILQ_FIRST(&bqempty.bq_queue);
+ if (bp == NULL)
+ break;
+ bq_remove(&bqempty, bp);
+ store[i] = bp;
+ }
+ BQ_UNLOCK(&bqempty);
+
+ return (i);
+}
+
+/*
+ * buf_release:
+ *
+ * Release bufs from the uma cache back to the buffer queues.
+ */
+static void
+buf_release(void *arg, void **store, int cnt)
+{
+ struct bufqueue *bq;
+ struct buf *bp;
+ int i;
+
+ bq = &bqempty;
+ BQ_LOCK(bq);
+ for (i = 0; i < cnt; i++) {
+ bp = store[i];
+ /* Inline bq_insert() to batch locking. */
+ TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
+ bp->b_flags &= ~(B_AGE | B_REUSE);
+ bq->bq_len++;
+ bp->b_qindex = bq->bq_index;
+ }
+ BQ_UNLOCK(bq);
+}
+
+/*
+ * buf_alloc:
+ *
+ * Allocate an empty buffer header.
+ */
+static struct buf *
+buf_alloc(struct bufdomain *bd)
+{
+ struct buf *bp;
+ int freebufs;
+
+ /*
+ * We can only run out of bufs in the buf zone if the average buf
+ * is less than BKVASIZE. In this case the actual wait/block will
+ * come from buf_recycle() failing to flush one of these small bufs.
+ */
+ bp = NULL;
+ freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1);
+ if (freebufs > 0)
+ bp = uma_zalloc(buf_zone, M_NOWAIT);
+ if (bp == NULL) {
+ atomic_add_int(&bd->bd_freebuffers, 1);
+ bufspace_daemon_wakeup(bd);
+ counter_u64_add(numbufallocfails, 1);
+ return (NULL);
+ }
+ /*
+ * Wake-up the bufspace daemon on transition below threshold.
+ */
+ if (freebufs == bd->bd_lofreebuffers)
+ bufspace_daemon_wakeup(bd);
+
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
+
+ KASSERT(bp->b_vp == NULL,
+ ("bp: %p still has vnode %p.", bp, bp->b_vp));
+ KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
+ ("invalid buffer %p flags %#x", bp, bp->b_flags));
+ KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
+ ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
+ KASSERT(bp->b_npages == 0,
+ ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
+ KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
+ KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
+
+ bp->b_domain = BD_DOMAIN(bd);
+ bp->b_flags = 0;
+ bp->b_ioflags = 0;
+ bp->b_xflags = 0;
+ bp->b_vflags = 0;
+ bp->b_vp = NULL;
+ bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_offset = NOOFFSET;
+ bp->b_iodone = 0;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_bcount = 0;
+ bp->b_npages = 0;
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ bp->b_bufobj = NULL;
+ bp->b_data = bp->b_kvabase = unmapped_buf;
+ bp->b_fsprivate1 = NULL;
+ bp->b_fsprivate2 = NULL;
+ bp->b_fsprivate3 = NULL;
+ LIST_INIT(&bp->b_dep);
+
+ return (bp);
+}
+
+/*
+ * buf_recycle:
+ *
+ * Free a buffer from the given bufqueue. kva controls whether the
+ * freed buf must own some kva resources. This is used for
+ * defragmenting.
+ */
+static int
+buf_recycle(struct bufdomain *bd, bool kva)
+{
+ struct bufqueue *bq;
+ struct buf *bp, *nbp;
+
+ if (kva)
+ counter_u64_add(bufdefragcnt, 1);
+ nbp = NULL;
+ bq = bd->bd_cleanq;
+ BQ_LOCK(bq);
+ KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd),
+ ("buf_recycle: Locks don't match"));
+ nbp = TAILQ_FIRST(&bq->bq_queue);
+
+ /*
+ * Run scan, possibly freeing data and/or kva mappings on the fly
+ * depending on whether we are reclaiming kva.
+ */
+ while ((bp = nbp) != NULL) {
+ /*
+ * Calculate next bp (we can only use it if we do not
+ * release the bqlock).
+ */
+ nbp = TAILQ_NEXT(bp, b_freelist);
+
+ /*
+ * If we are defragging then we need a buffer with
+ * some kva to reclaim.
+ */
+ if (kva && bp->b_kvasize == 0)
+ continue;
+
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ continue;
+
+ /*
+ * Implement a second chance algorithm for frequently
+ * accessed buffers.
+ */
+ if ((bp->b_flags & B_REUSE) != 0) {
+ TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
+ TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
+ bp->b_flags &= ~B_REUSE;
+ BUF_UNLOCK(bp);
+ continue;
+ }
+
+ /*
+ * Skip buffers with background writes in progress.
+ */
+ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+
+ KASSERT(bp->b_qindex == QUEUE_CLEAN,
+ ("buf_recycle: inconsistent queue %d bp %p",
+ bp->b_qindex, bp));
+ KASSERT(bp->b_domain == BD_DOMAIN(bd),
+ ("getnewbuf: queue domain %d doesn't match request %d",
+ bp->b_domain, (int)BD_DOMAIN(bd)));
+ /*
+ * NOTE: nbp is now entirely invalid. We can only restart
+ * the scan from this point on.
+ */
+ bq_remove(bq, bp);
+ BQ_UNLOCK(bq);
+
+ /*
+ * Requeue the background write buffer with error and
+ * restart the scan.
+ */
+ if ((bp->b_vflags & BV_BKGRDERR) != 0) {
+ bqrelse(bp);
+ BQ_LOCK(bq);
+ nbp = TAILQ_FIRST(&bq->bq_queue);
+ continue;
+ }
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ return (0);
+ }
+ bd->bd_wanted = 1;
+ BQ_UNLOCK(bq);
+
+ return (ENOBUFS);
+}
+
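+/*
+ * Illustrative trace of the second-chance scan above: a buf at the head
+ * of the clean queue with B_REUSE set is moved to the tail and skipped
+ * on this pass; the first lockable buf without B_REUSE and without a
+ * background write in progress is removed, marked B_INVAL and released
+ * via brelse(), which frees its resources for the caller to retry.
+ */
+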
+/*
+ * bremfree:
+ *
+ * Mark the buffer for removal from the appropriate free list.
+ *
+ */
+void
+bremfree(struct buf *bp)
+{
+
+ CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT((bp->b_flags & B_REMFREE) == 0,
+ ("bremfree: buffer %p already marked for delayed removal.", bp));
+ KASSERT(bp->b_qindex != QUEUE_NONE,
+ ("bremfree: buffer %p not on a queue.", bp));
+ BUF_ASSERT_XLOCKED(bp);
+
+ bp->b_flags |= B_REMFREE;
+}
+
+/*
+ * bremfreef:
+ *
+ * Force an immediate removal from a free list. Used only in nfs when
+ * it abuses the b_freelist pointer.
+ */
+void
+bremfreef(struct buf *bp)
+{
+ struct bufqueue *bq;
+
+ bq = bufqueue_acquire(bp);
+ bq_remove(bq, bp);
+ BQ_UNLOCK(bq);
+}
+
+static void
+bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname)
+{
+
+ mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF);
+ TAILQ_INIT(&bq->bq_queue);
+ bq->bq_len = 0;
+ bq->bq_index = qindex;
+ bq->bq_subqueue = subqueue;
+}
+
+static void
+bd_init(struct bufdomain *bd)
+{
+ int i;
+
+ bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1];
+ bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock");
+ bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock");
+ for (i = 0; i <= mp_maxid; i++)
+ bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i,
+ "bufq clean subqueue lock");
+ mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF);
+}
+
+/*
+ * bq_remove:
+ *
+ * Removes a buffer from the free list, must be called with the
+ * correct qlock held.
+ */
+static void
+bq_remove(struct bufqueue *bq, struct buf *bp)
+{
+
+ CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_qindex != QUEUE_NONE,
+ ("bq_remove: buffer %p not on a queue.", bp));
+ KASSERT(bufqueue(bp) == bq,
+ ("bq_remove: Remove buffer %p from wrong queue.", bp));
+
+ BQ_ASSERT_LOCKED(bq);
+ if (bp->b_qindex != QUEUE_EMPTY) {
+ BUF_ASSERT_XLOCKED(bp);
+ }
+ KASSERT(bq->bq_len >= 1,
+ ("queue %d underflow", bp->b_qindex));
+ TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
+ bq->bq_len--;
+ bp->b_qindex = QUEUE_NONE;
+ bp->b_flags &= ~(B_REMFREE | B_REUSE);
+}
+
+static void
+bd_flush(struct bufdomain *bd, struct bufqueue *bq)
+{
+ struct buf *bp;
+
+ BQ_ASSERT_LOCKED(bq);
+ if (bq != bd->bd_cleanq) {
+ BD_LOCK(bd);
+ while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) {
+ TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
+ TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp,
+ b_freelist);
+ bp->b_subqueue = bd->bd_cleanq->bq_subqueue;
+ }
+ bd->bd_cleanq->bq_len += bq->bq_len;
+ bq->bq_len = 0;
+ }
+ if (bd->bd_wanted) {
+ bd->bd_wanted = 0;
+ wakeup(&bd->bd_wanted);
+ }
+ if (bq != bd->bd_cleanq)
+ BD_UNLOCK(bd);
+}
+
+static int
+bd_flushall(struct bufdomain *bd)
+{
+ struct bufqueue *bq;
+ int flushed;
+ int i;
+
+ if (bd->bd_lim == 0)
+ return (0);
+ flushed = 0;
+ for (i = 0; i <= mp_maxid; i++) {
+ bq = &bd->bd_subq[i];
+ if (bq->bq_len == 0)
+ continue;
+ BQ_LOCK(bq);
+ bd_flush(bd, bq);
+ BQ_UNLOCK(bq);
+ flushed++;
+ }
+
+ return (flushed);
+}
+
+static void
+bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock)
+{
+ struct bufdomain *bd;
+
+ if (bp->b_qindex != QUEUE_NONE)
+ panic("bq_insert: free buffer %p onto another queue?", bp);
+
+ bd = bufdomain(bp);
+ if (bp->b_flags & B_AGE) {
+ /* Place this buf directly on the real queue. */
+ if (bq->bq_index == QUEUE_CLEAN)
+ bq = bd->bd_cleanq;
+ BQ_LOCK(bq);
+ TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist);
+ } else {
+ BQ_LOCK(bq);
+ TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
+ }
+ bp->b_flags &= ~(B_AGE | B_REUSE);
+ bq->bq_len++;
+ bp->b_qindex = bq->bq_index;
+ bp->b_subqueue = bq->bq_subqueue;
+
+ /*
+ * Unlock before we notify so that we don't wake up a waiter that
+ * fails a trylock on the buf and sleeps again.
+ */
+ if (unlock)
+ BUF_UNLOCK(bp);
+
+ if (bp->b_qindex == QUEUE_CLEAN) {
+ /*
+ * Flush the per-cpu queue and notify any waiters.
+ */
+ if (bd->bd_wanted || (bq != bd->bd_cleanq &&
+ bq->bq_len >= bd->bd_lim))
+ bd_flush(bd, bq);
+ }
+ BQ_UNLOCK(bq);
+}
+
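+/*
+ * Worked example (hypothetical bd_lim): with bd_lim == 5, the insert
+ * that brings a per-cpu clean subqueue to 5 bufs triggers bd_flush(),
+ * which migrates the subqueue onto the domain's shared cleanq and
+ * wakes any thread sleeping on bd_wanted.
+ */
+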
+/*
+ * bufkva_free:
+ *
+ * Free the kva allocation for a buffer.
+ *
+ */
+static void
+bufkva_free(struct buf *bp)
+{
+
+#ifdef INVARIANTS
+ if (bp->b_kvasize == 0) {
+ KASSERT(bp->b_kvabase == unmapped_buf &&
+ bp->b_data == unmapped_buf,
+ ("Leaked KVA space on %p", bp));
+ } else if (buf_mapped(bp))
+ BUF_CHECK_MAPPED(bp);
+ else
+ BUF_CHECK_UNMAPPED(bp);
+#endif
+ if (bp->b_kvasize == 0)
+ return;
+
+ vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
+ counter_u64_add(bufkvaspace, -bp->b_kvasize);
+ counter_u64_add(buffreekvacnt, 1);
+ bp->b_data = bp->b_kvabase = unmapped_buf;
+ bp->b_kvasize = 0;
+}
+
+/*
+ * bufkva_alloc:
+ *
+ * Allocate the buffer KVA and set b_kvasize and b_kvabase.
+ */
+static int
+bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
+{
+ vm_offset_t addr;
+ int error;
+
+ KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
+ ("Invalid gbflags 0x%x in %s", gbflags, __func__));
+
+ bufkva_free(bp);
+
+ addr = 0;
+ error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
+ if (error != 0) {
+ /*
+ * Buffer map is too fragmented. Request the caller
+ * to defragment the map.
+ */
+ return (error);
+ }
+ bp->b_kvabase = (caddr_t)addr;
+ bp->b_kvasize = maxsize;
+ counter_u64_add(bufkvaspace, bp->b_kvasize);
+ if ((gbflags & GB_UNMAPPED) != 0) {
+ bp->b_data = unmapped_buf;
+ BUF_CHECK_UNMAPPED(bp);
+ } else {
+ bp->b_data = bp->b_kvabase;
+ BUF_CHECK_MAPPED(bp);
+ }
+ return (0);
+}
+
+/*
+ * bufkva_reclaim:
+ *
+ * Reclaim buffer kva by freeing buffers holding kva. This is a vmem
+ * reclaim callback that fires before an allocation failure is returned.
+ */
+static void
+bufkva_reclaim(vmem_t *vmem, int flags)
+{
+ bool done;
+ int q;
+ int i;
+
+ done = false;
+ for (i = 0; i < 5; i++) {
+ for (q = 0; q < buf_domains; q++)
+ if (buf_recycle(&bdomain[q], true) != 0)
+ done = true;
+ if (done)
+ break;
+ }
+ return;
+}
+
+/*
+ * Attempt to initiate asynchronous I/O on read-ahead blocks. We must
+ * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
+ * the buffer is valid and we do not have to do anything.
+ */
+static void
+breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt,
+ struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *))
+{
+ struct buf *rabp;
+ struct thread *td;
+ int i;
+
+ td = curthread;
+
+ for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
+ if (inmem(vp, *rablkno))
+ continue;
+ rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
+ if ((rabp->b_flags & B_CACHE) != 0) {
+ brelse(rabp);
+ continue;
+ }
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, rabp, 0);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
+ td->td_ru.ru_inblock++;
+ rabp->b_flags |= B_ASYNC;
+ rabp->b_flags &= ~B_INVAL;
+ if ((flags & GB_CKHASH) != 0) {
+ rabp->b_flags |= B_CKHASH;
+ rabp->b_ckhashcalc = ckhashfunc;
+ }
+ rabp->b_ioflags &= ~BIO_ERROR;
+ rabp->b_iocmd = BIO_READ;
+ if (rabp->b_rcred == NOCRED && cred != NOCRED)
+ rabp->b_rcred = crhold(cred);
+ vfs_busy_pages(rabp, 0);
+ BUF_KERNPROC(rabp);
+ rabp->b_iooffset = dbtob(rabp->b_blkno);
+ bstrategy(rabp);
+ }
+}
+
+/*
+ * Entry point for bread() and breadn() via #defines in sys/buf.h.
+ *
+ * Get a buffer with the specified data. Look in the cache first. We
+ * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
+ * is set, the buffer is valid and we do not have to do anything, see
+ * getblk(). Also starts asynchronous I/O on read-ahead blocks.
+ *
+ * Always return a NULL buffer pointer (in bpp) when returning an error.
+ */
+int
+breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
+ int *rabsize, int cnt, struct ucred *cred, int flags,
+ void (*ckhashfunc)(struct buf *), struct buf **bpp)
+{
+ struct buf *bp;
+ struct thread *td;
+ int error, readwait, rv;
+
+ CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
+ td = curthread;
+ /*
+ * Can only return NULL if GB_LOCK_NOWAIT or GB_SPARSE flags
+ * are specified.
+ */
+ error = getblkx(vp, blkno, size, 0, 0, flags, &bp);
+ if (error != 0) {
+ *bpp = NULL;
+ return (error);
+ }
+ flags &= ~GB_NOSPARSE;
+ *bpp = bp;
+
+ /*
+ * If not found in cache, do some I/O
+ */
+ readwait = 0;
+ if ((bp->b_flags & B_CACHE) == 0) {
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(td->td_proc);
+ racct_add_buf(td->td_proc, bp, 0);
+ PROC_UNLOCK(td->td_proc);
+ }
+#endif /* RACCT */
+ td->td_ru.ru_inblock++;
+ bp->b_iocmd = BIO_READ;
+ bp->b_flags &= ~B_INVAL;
+ if ((flags & GB_CKHASH) != 0) {
+ bp->b_flags |= B_CKHASH;
+ bp->b_ckhashcalc = ckhashfunc;
+ }
+ bp->b_ioflags &= ~BIO_ERROR;
+ if (bp->b_rcred == NOCRED && cred != NOCRED)
+ bp->b_rcred = crhold(cred);
+ vfs_busy_pages(bp, 0);
+ bp->b_iooffset = dbtob(bp->b_blkno);
+ bstrategy(bp);
+ ++readwait;
+ }
+
+ /*
+ * Attempt to initiate asynchronous I/O on read-ahead blocks.
+ */
+ breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc);
+
+ rv = 0;
+ if (readwait) {
+ rv = bufwait(bp);
+ if (rv != 0) {
+ brelse(bp);
+ *bpp = NULL;
+ }
+ }
+ return (rv);
+}
+
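+/*
+ * Typical caller sketch for the bread() wrapper in sys/buf.h
+ * (illustrative; vp, lbn and bsize are hypothetical locals):
+ *
+ *	struct buf *bp;
+ *	int error;
+ *
+ *	error = bread(vp, lbn, bsize, NOCRED, &bp);
+ *	if (error != 0)
+ *		return (error);
+ *	... examine bp->b_data ...
+ *	brelse(bp);
+ */
+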
+/*
+ * Write, release buffer on completion. (Done by iodone
+ * if async). Do not bother writing anything if the buffer
+ * is invalid.
+ *
+ * Note that we set B_CACHE here, indicating that the buffer is
+ * fully valid and thus cacheable. This is true even of NFS
+ * now so we set it generally. This could be set either here
+ * or in biodone() since the I/O is synchronous. We put it
+ * here.
+ */
+int
+bufwrite(struct buf *bp)
+{
+ int oldflags;
+ struct vnode *vp;
+ long space;
+ int vp_md;
+
+ CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
+ bp->b_flags |= B_INVAL | B_RELBUF;
+ bp->b_flags &= ~B_CACHE;
+ brelse(bp);
+ return (ENXIO);
+ }
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return (0);
+ }
+
+ if (bp->b_flags & B_BARRIER)
+ atomic_add_long(&barrierwrites, 1);
+
+ oldflags = bp->b_flags;
+
+ BUF_ASSERT_HELD(bp);
+
+ KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
+ ("FFS background buffer should not get here %p", bp));
+
+ vp = bp->b_vp;
+ if (vp)
+ vp_md = vp->v_vflag & VV_MD;
+ else
+ vp_md = 0;
+
+ /*
+ * Mark the buffer clean. Increment the bufobj write count
+ * before bundirty() call, to prevent other thread from seeing
+ * empty dirty list and zero counter for writes in progress,
+ * falsely indicating that the bufobj is clean.
+ */
+ bufobj_wref(bp->b_bufobj);
+ bundirty(bp);
+
+ bp->b_flags &= ~B_DONE;
+ bp->b_ioflags &= ~BIO_ERROR;
+ bp->b_flags |= B_CACHE;
+ bp->b_iocmd = BIO_WRITE;
+
+ vfs_busy_pages(bp, 1);
+
+ /*
+ * Normal bwrites pipeline writes
+ */
+ bp->b_runningbufspace = bp->b_bufsize;
+ space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
+
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(curproc);
+ racct_add_buf(curproc, bp, 1);
+ PROC_UNLOCK(curproc);
+ }
+#endif /* RACCT */
+ curthread->td_ru.ru_oublock++;
+ if (oldflags & B_ASYNC)
+ BUF_KERNPROC(bp);
+ bp->b_iooffset = dbtob(bp->b_blkno);
+ buf_track(bp, __func__);
+ bstrategy(bp);
+
+ if ((oldflags & B_ASYNC) == 0) {
+ int rtval = bufwait(bp);
+ brelse(bp);
+ return (rtval);
+ } else if (space > hirunningspace) {
+ /*
+ * don't allow the async write to saturate the I/O
+ * system. We will not deadlock here because
+ * we are blocking waiting for I/O that is already in-progress
+ * to complete. We do not block here if it is the update
+ * or syncer daemon trying to clean up as that can lead
+ * to deadlock.
+ */
+ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
+ waitrunningbufspace();
+ }
+
+ return (0);
+}
+
+void
+bufbdflush(struct bufobj *bo, struct buf *bp)
+{
+ struct buf *nbp;
+
+ if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
+ (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
+ altbufferflushes++;
+ } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
+ BO_LOCK(bo);
+ /*
+ * Try to find a buffer to flush.
+ */
+ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
+ if ((nbp->b_vflags & BV_BKGRDINPROG) ||
+ BUF_LOCK(nbp,
+ LK_EXCLUSIVE | LK_NOWAIT, NULL))
+ continue;
+ if (bp == nbp)
+ panic("bdwrite: found ourselves");
+ BO_UNLOCK(bo);
+ /* Don't countdeps with the bo lock held. */
+ if (buf_countdeps(nbp, 0)) {
+ BO_LOCK(bo);
+ BUF_UNLOCK(nbp);
+ continue;
+ }
+ if (nbp->b_flags & B_CLUSTEROK) {
+ vfs_bio_awrite(nbp);
+ } else {
+ bremfree(nbp);
+ bawrite(nbp);
+ }
+ dirtybufferflushes++;
+ break;
+ }
+ if (nbp == NULL)
+ BO_UNLOCK(bo);
+ }
+}
+
+/*
+ * Delayed write. (Buffer is marked dirty). Do not bother writing
+ * anything if the buffer is marked invalid.
+ *
+ * Note that since the buffer must be completely valid, we can safely
+ * set B_CACHE. In fact, we have to set B_CACHE here rather then in
+ * biodone() in order to prevent getblk from writing the buffer
+ * out synchronously.
+ */
+void
+bdwrite(struct buf *bp)
+{
+ struct thread *td = curthread;
+ struct vnode *vp;
+ struct bufobj *bo;
+
+ CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
+ KASSERT((bp->b_flags & B_BARRIER) == 0,
+ ("Barrier request in delayed write %p", bp));
+ BUF_ASSERT_HELD(bp);
+
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return;
+ }
+
+ /*
+ * If we have too many dirty buffers, don't create any more.
+ * If we are wildly over our limit, then force a complete
+ * cleanup. Otherwise, just keep the situation from getting
+ * out of control. Note that we have to avoid a recursive
+ * disaster and not try to clean up after our own cleanup!
+ */
+ vp = bp->b_vp;
+ bo = bp->b_bufobj;
+ if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
+ td->td_pflags |= TDP_INBDFLUSH;
+ BO_BDFLUSH(bo, bp);
+ td->td_pflags &= ~TDP_INBDFLUSH;
+ } else
+ recursiveflushes++;
+
+ bdirty(bp);
+ /*
+ * Set B_CACHE, indicating that the buffer is fully valid. This is
+ * true even of NFS now.
+ */
+ bp->b_flags |= B_CACHE;
+
+ /*
+ * This bmap keeps the system from needing to do the bmap later,
+ * perhaps when the system is attempting to do a sync. Since it
+ * is likely that the indirect block -- or whatever other data structure
+ * the filesystem needs -- is still in memory now, it is a good
+ * thing to do this. Note also that if the pageout daemon is
+ * requesting a sync -- there might not be enough memory to do
+ * the bmap then... So, this is important to do.
+ */
+ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
+ VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
+ }
+
+ buf_track(bp, __func__);
+
+ /*
+ * Set the *dirty* buffer range based upon the VM system dirty
+ * pages.
+ *
+ * Mark the buffer pages as clean. We need to do this here to
+ * satisfy the vnode_pager and the pageout daemon, so that it
+ * thinks that the pages have been "cleaned". Note that since
+ * the pages are in a delayed write buffer -- the VFS layer
+ * "will" see that the pages get written out on the next sync,
+ * or perhaps the cluster will be completed.
+ */
+ vfs_clean_pages_dirty_buf(bp);
+ bqrelse(bp);
+
+ /*
+ * note: we cannot initiate I/O from a bdwrite even if we wanted to,
+ * due to the softdep code.
+ */
+}
+
+/*
+ * bdirty:
+ *
+ * Turn buffer into delayed write request. We must clear BIO_READ and
+ * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to
+ * itself to properly update it in the dirty/clean lists. We mark it
+ * B_DONE to ensure that any asynchronization of the buffer properly
+ * clears B_DONE ( else a panic will occur later ).
+ *
+ * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
+ * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty()
+ * should only be called if the buffer is known-good.
+ *
+ * Since the buffer is not on a queue, we do not update the numfreebuffers
+ * count.
+ *
+ * The buffer must be on QUEUE_NONE.
+ */
+void
+bdirty(struct buf *bp)
+{
+
+ CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
+ KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
+ ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
+ BUF_ASSERT_HELD(bp);
+ bp->b_flags &= ~(B_RELBUF);
+ bp->b_iocmd = BIO_WRITE;
+
+ if ((bp->b_flags & B_DELWRI) == 0) {
+ bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
+ reassignbuf(bp);
+ bdirtyadd(bp);
+ }
+}
+
+/*
+ * bundirty:
+ *
+ * Clear B_DELWRI for buffer.
+ *
+ * Since the buffer is not on a queue, we do not update the numfreebuffers
+ * count.
+ *
+ * The buffer must be on QUEUE_NONE.
+ */
+
+void
+bundirty(struct buf *bp)
+{
+
+ CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
+ KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
+ ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
+ BUF_ASSERT_HELD(bp);
+
+ if (bp->b_flags & B_DELWRI) {
+ bp->b_flags &= ~B_DELWRI;
+ reassignbuf(bp);
+ bdirtysub(bp);
+ }
+ /*
+ * Since it is now being written, we can clear its deferred write flag.
+ */
+ bp->b_flags &= ~B_DEFERRED;
+}
+
+/*
+ * bawrite:
+ *
+ * Asynchronous write. Start output on a buffer, but do not wait for
+ * it to complete. The buffer is released when the output completes.
+ *
+ * bwrite() ( or the VOP routine anyway ) is responsible for handling
+ * B_INVAL buffers. Not us.
+ */
+void
+bawrite(struct buf *bp)
+{
+
+ bp->b_flags |= B_ASYNC;
+ (void) bwrite(bp);
+}
+
+/*
+ * babarrierwrite:
+ *
+ * Asynchronous barrier write. Start output on a buffer, but do not
+ * wait for it to complete. Place a write barrier after this write so
+ * that this buffer and all buffers written before it are committed to
+ * the disk before any buffers written after this write are committed
+ * to the disk. The buffer is released when the output completes.
+ */
+void
+babarrierwrite(struct buf *bp)
+{
+
+ bp->b_flags |= B_ASYNC | B_BARRIER;
+ (void) bwrite(bp);
+}
+
+/*
+ * bbarrierwrite:
+ *
+ * Synchronous barrier write. Start output on a buffer and wait for
+ * it to complete. Place a write barrier after this write so that
+ * this buffer and all buffers written before it are committed to
+ * the disk before any buffers written after this write are committed
+ * to the disk. The buffer is released when the output completes.
+ */
+int
+bbarrierwrite(struct buf *bp)
+{
+
+ bp->b_flags |= B_BARRIER;
+ return (bwrite(bp));
+}
+
+/*
+ * bwillwrite:
+ *
+ * Called prior to the locking of any vnodes when we are expecting to
+ * write. We do not want to starve the buffer cache with too many
+ * dirty buffers so we block here. By blocking prior to the locking
+ * of any vnodes we attempt to avoid the situation where a locked vnode
+ * prevents the various system daemons from flushing related buffers.
+ */
+void
+bwillwrite(void)
+{
+
+ if (buf_dirty_count_severe()) {
+ mtx_lock(&bdirtylock);
+ while (buf_dirty_count_severe()) {
+ bdirtywait = 1;
+ msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
+ "flswai", 0);
+ }
+ mtx_unlock(&bdirtylock);
+ }
+}
+
+/*
+ * Return true if we have too many dirty buffers.
+ */
+int
+buf_dirty_count_severe(void)
+{
+
+ return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty));
+}
+
+/*
+ * brelse:
+ *
+ * Release a busy buffer and, if requested, free its resources. The
+ * buffer will be stashed in the appropriate bufqueue[] allowing it
+ * to be accessed later as a cache entity or reused for other purposes.
+ */
+void
+brelse(struct buf *bp)
+{
+ struct mount *v_mnt;
+ int qindex;
+
+ /*
+ * Many functions erroneously call brelse with a NULL bp under rare
+ * error conditions. Simply return when called with a NULL bp.
+ */
+ if (bp == NULL)
+ return;
+ CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
+ ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+ KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
+ ("brelse: non-VMIO buffer marked NOREUSE"));
+
+ if (BUF_LOCKRECURSED(bp)) {
+ /*
+ * Do not process, in particular, do not handle the
+ * B_INVAL/B_RELBUF and do not release to free list.
+ */
+ BUF_UNLOCK(bp);
+ return;
+ }
+
+ if (bp->b_flags & B_MANAGED) {
+ bqrelse(bp);
+ return;
+ }
+
+ if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
+ BO_LOCK(bp->b_bufobj);
+ bp->b_vflags &= ~BV_BKGRDERR;
+ BO_UNLOCK(bp->b_bufobj);
+ bdirty(bp);
+ }
+ if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
+ (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) &&
+ !(bp->b_flags & B_INVAL)) {
+ /*
+ * Failed write, redirty. All errors except ENXIO (which
+ * means the device is gone) are treated as being
+ * transient.
+ *
+ * XXX Treating EIO as transient is not correct; the
+ * contract with the local storage device drivers is that
+ * they will only return EIO once the I/O is no longer
+ * retriable. Network I/O also respects this through the
+ * guarantees of TCP and/or the internal retries of NFS.
+ * ENOMEM might be transient, but we also have no way of
+ * knowing when its ok to retry/reschedule. In general,
+ * this entire case should be made obsolete through better
+ * error handling/recovery and resource scheduling.
+ *
+ * Do this also for buffers that failed with ENXIO, but have
+ * non-empty dependencies - the soft updates code might need
+ * to access the buffer to untangle them.
+ *
+ * Must clear BIO_ERROR to prevent pages from being scrapped.
+ */
+ bp->b_ioflags &= ~BIO_ERROR;
+ bdirty(bp);
+ } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
+ (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
+ /*
+ * Either a failed read I/O, or we were asked to free or not
+ * cache the buffer, or we failed to write to a device that's
+ * no longer present.
+ */
+ bp->b_flags |= B_INVAL;
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_deallocate(bp);
+ if (bp->b_flags & B_DELWRI)
+ bdirtysub(bp);
+ bp->b_flags &= ~(B_DELWRI | B_CACHE);
+ if ((bp->b_flags & B_VMIO) == 0) {
+ allocbuf(bp, 0);
+ if (bp->b_vp)
+ brelvp(bp);
+ }
+ }
+
+ /*
+ * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate()
+ * is called with B_DELWRI set, the underlying pages may wind up
+ * getting freed causing a previous write (bdwrite()) to get 'lost'
+ * because pages associated with a B_DELWRI bp are marked clean.
+ *
+ * We still allow the B_INVAL case to call vfs_vmio_truncate(), even
+ * if B_DELWRI is set.
+ */
+ if (bp->b_flags & B_DELWRI)
+ bp->b_flags &= ~B_RELBUF;
+
+ /*
+ * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer
+ * constituted, not even NFS buffers now. Two flags affect this. If
+ * B_INVAL, the struct buf is invalidated but the VM object is kept
+ * around ( i.e. so it is trivial to reconstitute the buffer later ).
+ *
+ * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
+ * invalidated. BIO_ERROR cannot be set for a failed write unless the
+ * buffer is also B_INVAL because it hits the re-dirtying code above.
+ *
+ * Normally we can do this whether a buffer is B_DELWRI or not. If
+ * the buffer is an NFS buffer, it is tracking piecemeal writes or
+ * the commit state and we cannot afford to lose the buffer. If the
+ * buffer has a background write in progress, we need to keep it
+ * around to prevent it from being reconstituted and starting a second
+ * background write.
+ */
+
+ v_mnt = bp->b_vp != NULL ? bp->b_vp->v_mount : NULL;
+
+ if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE ||
+ (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) &&
+ (v_mnt == NULL || (v_mnt->mnt_vfc->vfc_flags & VFCF_NETWORK) == 0 ||
+ vn_isdisk(bp->b_vp, NULL) || (bp->b_flags & B_DELWRI) == 0)) {
+ vfs_vmio_invalidate(bp);
+ allocbuf(bp, 0);
+ }
+
+ if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
+ (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
+ allocbuf(bp, 0);
+ bp->b_flags &= ~B_NOREUSE;
+ if (bp->b_vp != NULL)
+ brelvp(bp);
+ }
+
+ /*
+	 * If the buffer has junk contents, signal it and eventually
+	 * clean up B_DELWRI and disassociate the vnode so that gbincore()
+ * doesn't find it.
+ */
+ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
+ (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
+ bp->b_flags |= B_INVAL;
+ if (bp->b_flags & B_INVAL) {
+ if (bp->b_flags & B_DELWRI)
+ bundirty(bp);
+ if (bp->b_vp)
+ brelvp(bp);
+ }
+
+ buf_track(bp, __func__);
+
+ /* buffers with no memory */
+ if (bp->b_bufsize == 0) {
+ buf_free(bp);
+ return;
+ }
+ /* buffers with junk contents */
+ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
+ (bp->b_ioflags & BIO_ERROR)) {
+ bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 2");
+ qindex = QUEUE_CLEAN;
+ bp->b_flags |= B_AGE;
+ /* remaining buffers */
+ } else if (bp->b_flags & B_DELWRI)
+ qindex = QUEUE_DIRTY;
+ else
+ qindex = QUEUE_CLEAN;
+
+ if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
+ panic("brelse: not dirty");
+
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT);
+ /* binsfree unlocks bp. */
+ binsfree(bp, qindex);
+}
+
+/*
+ * Release a buffer back to the appropriate queue but do not try to free
+ * it. The buffer is expected to be used again soon.
+ *
+ * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
+ * biodone() to requeue an async I/O on completion. It is also used when
+ * known good buffers need to be requeued but we think we may need the data
+ * again soon.
+ *
+ * XXX we should be able to leave the B_RELBUF hint set on completion.
+ */
+void
+bqrelse(struct buf *bp)
+{
+ int qindex;
+
+ CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
+ ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+
+ qindex = QUEUE_NONE;
+ if (BUF_LOCKRECURSED(bp)) {
+ /* do not release to free list */
+ BUF_UNLOCK(bp);
+ return;
+ }
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+
+ if (bp->b_flags & B_MANAGED) {
+ if (bp->b_flags & B_REMFREE)
+ bremfreef(bp);
+ goto out;
+ }
+
+ /* buffers with stale but valid contents */
+ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
+ BV_BKGRDERR)) == BV_BKGRDERR) {
+ BO_LOCK(bp->b_bufobj);
+ bp->b_vflags &= ~BV_BKGRDERR;
+ BO_UNLOCK(bp->b_bufobj);
+ qindex = QUEUE_DIRTY;
+ } else {
+ if ((bp->b_flags & B_DELWRI) == 0 &&
+ (bp->b_xflags & BX_VNDIRTY))
+ panic("bqrelse: not dirty");
+ if ((bp->b_flags & B_NOREUSE) != 0) {
+ brelse(bp);
+ return;
+ }
+ qindex = QUEUE_CLEAN;
+ }
+ buf_track(bp, __func__);
+ /* binsfree unlocks bp. */
+ binsfree(bp, qindex);
+ return;
+
+out:
+ buf_track(bp, __func__);
+ /* unlock */
+ BUF_UNLOCK(bp);
+}
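+
+/*
+ * Usage sketch (illustrative only): callers that finish with a buffer
+ * typically pick between brelse() and bqrelse() based on whether the
+ * contents are still worth caching, mirroring the policy described above:
+ *
+ *	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) != 0 ||
+ *	    (bp->b_ioflags & BIO_ERROR) != 0)
+ *		brelse(bp);		(discard or recycle the contents)
+ *	else
+ *		bqrelse(bp);		(requeue, expected to be reused soon)
+ */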
+
+/*
+ * Complete I/O to a VMIO backed page. Validate the pages as appropriate,
+ * restore bogus pages.
+ */
+static void
+vfs_vmio_iodone(struct buf *bp)
+{
+ vm_ooffset_t foff;
+ vm_page_t m;
+ vm_object_t obj;
+ struct vnode *vp __unused;
+ int i, iosize, resid;
+ bool bogus;
+
+ obj = bp->b_bufobj->bo_object;
+ KASSERT(obj->paging_in_progress >= bp->b_npages,
+ ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)",
+ obj->paging_in_progress, bp->b_npages));
+
+ vp = bp->b_vp;
+ KASSERT(vp->v_holdcnt > 0,
+ ("vfs_vmio_iodone: vnode %p has zero hold count", vp));
+ KASSERT(vp->v_object != NULL,
+ ("vfs_vmio_iodone: vnode %p has no vm_object", vp));
+
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("vfs_vmio_iodone: bp %p has no buffer offset", bp));
+
+ bogus = false;
+ iosize = bp->b_bcount - bp->b_resid;
+ VM_OBJECT_WLOCK(obj);
+ for (i = 0; i < bp->b_npages; i++) {
+ resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
+ if (resid > iosize)
+ resid = iosize;
+
+ /*
+ * cleanup bogus pages, restoring the originals
+ */
+ m = bp->b_pages[i];
+ if (m == bogus_page) {
+ bogus = true;
+ m = vm_page_lookup(obj, OFF_TO_IDX(foff));
+ if (m == NULL)
+ panic("biodone: page disappeared!");
+ bp->b_pages[i] = m;
+ } else if ((bp->b_iocmd == BIO_READ) && resid > 0) {
+ /*
+ * In the write case, the valid and clean bits are
+ * already changed correctly ( see bdwrite() ), so we
+ * only need to do this here in the read case.
+ */
+ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK,
+ resid)) == 0, ("vfs_vmio_iodone: page %p "
+ "has unexpected dirty bits", m));
+ vfs_page_set_valid(bp, foff, m);
+ }
+ KASSERT(OFF_TO_IDX(foff) == m->pindex,
+ ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch",
+ (intmax_t)foff, (uintmax_t)m->pindex));
+
+ vm_page_sunbusy(m);
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ iosize -= resid;
+ }
+ vm_object_pip_wakeupn(obj, bp->b_npages);
+ VM_OBJECT_WUNLOCK(obj);
+ if (bogus && buf_mapped(bp)) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ }
+}
+
+/*
+ * Perform page invalidation when a buffer is released. The fully invalid
+ * pages will be reclaimed later in vfs_vmio_truncate().
+ */
+static void
+vfs_vmio_invalidate(struct buf *bp)
+{
+ vm_object_t obj;
+ vm_page_t m;
+ int flags, i, resid, poffset, presid;
+
+ if (buf_mapped(bp)) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
+ /*
+ * Get the base offset and length of the buffer. Note that
+ * in the VMIO case if the buffer block size is not
+	 * page-aligned, then the b_data pointer may not be page-aligned.
+ * But our b_pages[] array *IS* page aligned.
+ *
+	 * block sizes less than DEV_BSIZE (usually 512) are not
+ * supported due to the page granularity bits (m->valid,
+ * m->dirty, etc...).
+ *
+ * See man buf(9) for more information
+ */
+ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0;
+ obj = bp->b_bufobj->bo_object;
+ resid = bp->b_bufsize;
+ poffset = bp->b_offset & PAGE_MASK;
+ VM_OBJECT_WLOCK(obj);
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ if (m == bogus_page)
+ panic("vfs_vmio_invalidate: Unexpected bogus page.");
+ bp->b_pages[i] = NULL;
+
+ presid = resid > (PAGE_SIZE - poffset) ?
+ (PAGE_SIZE - poffset) : resid;
+ KASSERT(presid >= 0, ("brelse: extra page"));
+ while (vm_page_xbusied(m)) {
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(obj);
+ vm_page_busy_sleep(m, "mbncsh", true);
+ VM_OBJECT_WLOCK(obj);
+ }
+ if (pmap_page_wired_mappings(m) == 0)
+ vm_page_set_invalid(m, poffset, presid);
+ vm_page_release_locked(m, flags);
+ resid -= presid;
+ poffset = 0;
+ }
+ VM_OBJECT_WUNLOCK(obj);
+ bp->b_npages = 0;
+}
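+
+/*
+ * Worked example of the offset arithmetic above (illustrative numbers,
+ * assuming 4 kB pages): with (b_offset & PAGE_MASK) == 1024 and
+ * b_bufsize == 6144, the first iteration covers bytes [1024, 4096) of
+ * page 0 (presid == 3072), leaving resid == 3072, and the second
+ * iteration covers bytes [0, 3072) of page 1, after which resid
+ * reaches 0.
+ */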
+
+/*
+ * Page-granular truncation of an existing VMIO buffer.
+ */
+static void
+vfs_vmio_truncate(struct buf *bp, int desiredpages)
+{
+ vm_object_t obj;
+ vm_page_t m;
+ int flags, i;
+
+ if (bp->b_npages == desiredpages)
+ return;
+
+ if (buf_mapped(bp)) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) +
+ (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
+
+ /*
+ * The object lock is needed only if we will attempt to free pages.
+ */
+ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0;
+ if ((bp->b_flags & B_DIRECT) != 0) {
+ flags |= VPR_TRYFREE;
+ obj = bp->b_bufobj->bo_object;
+ VM_OBJECT_WLOCK(obj);
+ } else {
+ obj = NULL;
+ }
+ for (i = desiredpages; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ KASSERT(m != bogus_page, ("allocbuf: bogus page found"));
+ bp->b_pages[i] = NULL;
+ if (obj != NULL)
+ vm_page_release_locked(m, flags);
+ else
+ vm_page_release(m, flags);
+ }
+ if (obj != NULL)
+ VM_OBJECT_WUNLOCK(obj);
+ bp->b_npages = desiredpages;
+}
+
+/*
+ * Byte granular extension of VMIO buffers.
+ */
+static void
+vfs_vmio_extend(struct buf *bp, int desiredpages, int size)
+{
+ /*
+ * We are growing the buffer, possibly in a
+ * byte-granular fashion.
+ */
+ vm_object_t obj;
+ vm_offset_t toff;
+ vm_offset_t tinc;
+ vm_page_t m;
+
+ /*
+ * Step 1, bring in the VM pages from the object, allocating
+ * them if necessary. We must clear B_CACHE if these pages
+ * are not valid for the range covered by the buffer.
+ */
+ obj = bp->b_bufobj->bo_object;
+ VM_OBJECT_WLOCK(obj);
+ if (bp->b_npages < desiredpages) {
+ /*
+ * We must allocate system pages since blocking
+ * here could interfere with paging I/O, no
+ * matter which process we are.
+ *
+ * Only exclusive busy can be tested here.
+ * Blocking on shared busy might lead to
+ * deadlocks once allocbuf() is called after
+ * pages are vfs_busy_pages().
+ */
+ (void)vm_page_grab_pages(obj,
+ OFF_TO_IDX(bp->b_offset) + bp->b_npages,
+ VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY |
+ VM_ALLOC_NOBUSY | VM_ALLOC_WIRED,
+ &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages);
+ bp->b_npages = desiredpages;
+ }
+
+ /*
+ * Step 2. We've loaded the pages into the buffer,
+ * we have to figure out if we can still have B_CACHE
+ * set. Note that B_CACHE is set according to the
+ * byte-granular range ( bcount and size ), not the
+ * aligned range ( newbsize ).
+ *
+ * The VM test is against m->valid, which is DEV_BSIZE
+ * aligned. Needless to say, the validity of the data
+ * needs to also be DEV_BSIZE aligned. Note that this
+ * fails with NFS if the server or some other client
+ * extends the file's EOF. If our buffer is resized,
+ * B_CACHE may remain set! XXX
+ */
+ toff = bp->b_bcount;
+ tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
+ while ((bp->b_flags & B_CACHE) && toff < size) {
+ vm_pindex_t pi;
+
+ if (tinc > (size - toff))
+ tinc = size - toff;
+ pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT;
+ m = bp->b_pages[pi];
+ vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m);
+ toff += tinc;
+ tinc = PAGE_SIZE;
+ }
+ VM_OBJECT_WUNLOCK(obj);
+
+ /*
+ * Step 3, fixup the KVA pmap.
+ */
+ if (buf_mapped(bp))
+ bpmap_qenter(bp);
+ else
+ BUF_CHECK_UNMAPPED(bp);
+}
+
+/*
+ * Check to see if a block at a particular lbn is available for a clustered
+ * write.
+ */
+static int
+vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
+{
+ struct buf *bpa;
+ int match;
+
+ match = 0;
+
+ /* If the buf isn't in core skip it */
+ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
+ return (0);
+
+ /* If the buf is busy we don't want to wait for it */
+ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ return (0);
+
+ /* Only cluster with valid clusterable delayed write buffers */
+ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
+ (B_DELWRI | B_CLUSTEROK))
+ goto done;
+
+ if (bpa->b_bufsize != size)
+ goto done;
+
+ /*
+ * Check to see if it is in the expected place on disk and that the
+ * block has been mapped.
+ */
+ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
+ match = 1;
+done:
+ BUF_UNLOCK(bpa);
+ return (match);
+}
+
+/*
+ * vfs_bio_awrite:
+ *
+ * Implement clustered async writes for clearing out B_DELWRI buffers.
+ *	This is much better than the old way of writing only one buffer at
+ * a time. Note that we may not be presented with the buffers in the
+ * correct order, so we search for the cluster in both directions.
+ */
+int
+vfs_bio_awrite(struct buf *bp)
+{
+ struct bufobj *bo;
+ int i;
+ int j;
+ daddr_t lblkno = bp->b_lblkno;
+ struct vnode *vp = bp->b_vp;
+ int ncl;
+ int nwritten;
+ int size;
+ int maxcl;
+ int gbflags;
+
+ bo = &vp->v_bufobj;
+ gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0;
+ /*
+ * right now we support clustered writing only to regular files. If
+ * we find a clusterable block we could be in the middle of a cluster
+	 * rather than at the beginning.
+ */
+ if ((vp->v_type == VREG) &&
+ (vp->v_mount != 0) && /* Only on nodes that have the size info */
+ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
+
+ size = vp->v_mount->mnt_stat.f_iosize;
+ maxcl = MAXPHYS / size;
+
+ BO_RLOCK(bo);
+ for (i = 1; i < maxcl; i++)
+ if (vfs_bio_clcheck(vp, size, lblkno + i,
+ bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
+ break;
+
+ for (j = 1; i + j <= maxcl && j <= lblkno; j++)
+ if (vfs_bio_clcheck(vp, size, lblkno - j,
+ bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
+ break;
+ BO_RUNLOCK(bo);
+ --j;
+ ncl = i + j;
+ /*
+ * this is a possible cluster write
+ */
+ if (ncl != 1) {
+ BUF_UNLOCK(bp);
+ nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
+ gbflags);
+ return (nwritten);
+ }
+ }
+ bremfree(bp);
+ bp->b_flags |= B_ASYNC;
+ /*
+ * default (old) behavior, writing out only one block
+ *
+ * XXX returns b_bufsize instead of b_bcount for nwritten?
+ */
+ nwritten = bp->b_bufsize;
+ (void) bwrite(bp);
+
+ return (nwritten);
+}
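+
+/*
+ * Worked example for the cluster scan above (illustrative numbers,
+ * assuming the default MAXPHYS of 128 kB): with f_iosize == 16384,
+ * maxcl == 8, so at most 8 file blocks are examined.  Each step of i
+ * (or j) advances one logical block, and (i * size) >> DEV_BSHIFT ==
+ * i * 32, i.e. adjacency on disk is checked in 512-byte sector units.
+ */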
+
+/*
+ * getnewbuf_kva:
+ *
+ * Allocate KVA for an empty buf header according to gbflags.
+ */
+static int
+getnewbuf_kva(struct buf *bp, int gbflags, int maxsize)
+{
+
+ if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) {
+ /*
+ * In order to keep fragmentation sane we only allocate kva
+ * in BKVASIZE chunks. XXX with vmem we can do page size.
+ */
+ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
+
+ if (maxsize != bp->b_kvasize &&
+ bufkva_alloc(bp, maxsize, gbflags))
+ return (ENOSPC);
+ }
+ return (0);
+}
+
+/*
+ * getnewbuf:
+ *
+ * Find and initialize a new buffer header, freeing up existing buffers
+ * in the bufqueues as necessary. The new buffer is returned locked.
+ *
+ * We block if:
+ * We have insufficient buffer headers
+ * We have insufficient buffer space
+ * buffer_arena is too fragmented ( space reservation fails )
+ * If we have to flush dirty buffers ( but we try to avoid this )
+ *
+ * The caller is responsible for releasing the reserved bufspace after
+ * allocbuf() is called.
+ */
+static struct buf *
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags)
+{
+ struct bufdomain *bd;
+ struct buf *bp;
+ bool metadata, reserved;
+
+ bp = NULL;
+ KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+ ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
+ if (!unmapped_buf_allowed)
+ gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
+
+ if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
+ vp->v_type == VCHR)
+ metadata = true;
+ else
+ metadata = false;
+ if (vp == NULL)
+ bd = &bdomain[0];
+ else
+ bd = &bdomain[vp->v_bufobj.bo_domain];
+
+ counter_u64_add(getnewbufcalls, 1);
+ reserved = false;
+ do {
+ if (reserved == false &&
+ bufspace_reserve(bd, maxsize, metadata) != 0) {
+ counter_u64_add(getnewbufrestarts, 1);
+ continue;
+ }
+ reserved = true;
+ if ((bp = buf_alloc(bd)) == NULL) {
+ counter_u64_add(getnewbufrestarts, 1);
+ continue;
+ }
+ if (getnewbuf_kva(bp, gbflags, maxsize) == 0)
+ return (bp);
+ break;
+ } while (buf_recycle(bd, false) == 0);
+
+ if (reserved)
+ bufspace_release(bd, maxsize);
+ if (bp != NULL) {
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ }
+ bufspace_wait(bd, vp, gbflags, slpflag, slptimeo);
+
+ return (NULL);
+}
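+
+/*
+ * Sketch of the bufspace reservation protocol noted above (illustrative
+ * only): a caller such as getblkx() or geteblk() holds the reservation
+ * of maxsize bytes until the buffer has been sized, e.g.
+ *
+ *	bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
+ *	...
+ *	allocbuf(bp, size);
+ *	bufspace_release(bufdomain(bp), maxsize);
+ */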
+
+/*
+ * buf_daemon:
+ *
+ * buffer flushing daemon. Buffers are normally flushed by the
+ * update daemon but if it cannot keep up this process starts to
+ * take the load in an attempt to prevent getnewbuf() from blocking.
+ */
+static struct kproc_desc buf_kp = {
+ "bufdaemon",
+ buf_daemon,
+ &bufdaemonproc
+};
+SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
+
+static int
+buf_flush(struct vnode *vp, struct bufdomain *bd, int target)
+{
+ int flushed;
+
+ flushed = flushbufqueues(vp, bd, target, 0);
+ if (flushed == 0) {
+ /*
+ * Could not find any buffers without rollback
+ * dependencies, so just write the first one
+ * in the hopes of eventually making progress.
+ */
+ if (vp != NULL && target > 2)
+ target /= 2;
+ flushbufqueues(vp, bd, target, 1);
+ }
+ return (flushed);
+}
+
+static void
+buf_daemon(void)
+{
+ struct bufdomain *bd;
+ int speedupreq;
+ int lodirty;
+ int i;
+
+ /*
+ * This process needs to be suspended prior to shutdown sync.
+ */
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread,
+ SHUTDOWN_PRI_LAST + 100);
+
+ /*
+ * Start the buf clean daemons as children threads.
+ */
+ for (i = 0 ; i < buf_domains; i++) {
+ int error;
+
+ error = kthread_add((void (*)(void *))bufspace_daemon,
+ &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i);
+ if (error)
+ panic("error %d spawning bufspace daemon", error);
+ }
+
+ /*
+ * This process is allowed to take the buffer cache to the limit
+ */
+ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
+ mtx_lock(&bdlock);
+ for (;;) {
+ bd_request = 0;
+ mtx_unlock(&bdlock);
+
+ kthread_suspend_check();
+
+ /*
+ * Save speedupreq for this pass and reset to capture new
+ * requests.
+ */
+ speedupreq = bd_speedupreq;
+ bd_speedupreq = 0;
+
+ /*
+ * Flush each domain sequentially according to its level and
+ * the speedup request.
+ */
+ for (i = 0; i < buf_domains; i++) {
+ bd = &bdomain[i];
+ if (speedupreq)
+ lodirty = bd->bd_numdirtybuffers / 2;
+ else
+ lodirty = bd->bd_lodirtybuffers;
+ while (bd->bd_numdirtybuffers > lodirty) {
+ if (buf_flush(NULL, bd,
+ bd->bd_numdirtybuffers - lodirty) == 0)
+ break;
+ kern_yield(PRI_USER);
+ }
+ }
+
+ /*
+ * Only clear bd_request if we have reached our low water
+ * mark. The buf_daemon normally waits 1 second and
+ * then incrementally flushes any dirty buffers that have
+ * built up, within reason.
+ *
+ * If we were unable to hit our low water mark and couldn't
+ * find any flushable buffers, we sleep for a short period
+ * to avoid endless loops on unlockable buffers.
+ */
+ mtx_lock(&bdlock);
+ if (!BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) {
+ /*
+ * We reached our low water mark, reset the
+ * request and sleep until we are needed again.
+ * The sleep is just so the suspend code works.
+ */
+ bd_request = 0;
+ /*
+ * Do an extra wakeup in case dirty threshold
+ * changed via sysctl and the explicit transition
+ * out of shortfall was missed.
+ */
+ bdirtywakeup();
+ if (runningbufspace <= lorunningspace)
+ runningwakeup();
+ msleep(&bd_request, &bdlock, PVM, "psleep", hz);
+ } else {
+ /*
+ * We couldn't find any flushable dirty buffers but
+ * still have too many dirty buffers, we
+ * have to sleep and try again. (rare)
+ */
+ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
+ }
+ }
+}
+
+/*
+ * flushbufqueues:
+ *
+ * Try to flush a buffer in the dirty queue. We must be careful to
+ * free up B_INVAL buffers instead of write them, which NFS is
+ *	free up B_INVAL buffers instead of writing them, which NFS is
+ */
+static int flushwithdeps = 0;
+SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
+    0, "Number of buffers flushed with dependencies that require rollbacks");
+
+static int
+flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target,
+ int flushdeps)
+{
+ struct bufqueue *bq;
+ struct buf *sentinel;
+ struct vnode *vp;
+ struct mount *mp;
+ struct buf *bp;
+ int hasdeps;
+ int flushed;
+ int error;
+ bool unlock;
+
+ flushed = 0;
+ bq = &bd->bd_dirtyq;
+ bp = NULL;
+ sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
+ sentinel->b_qindex = QUEUE_SENTINEL;
+ BQ_LOCK(bq);
+ TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist);
+ BQ_UNLOCK(bq);
+ while (flushed != target) {
+ maybe_yield();
+ BQ_LOCK(bq);
+ bp = TAILQ_NEXT(sentinel, b_freelist);
+ if (bp != NULL) {
+ TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist);
+ TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel,
+ b_freelist);
+ } else {
+ BQ_UNLOCK(bq);
+ break;
+ }
+ /*
+ * Skip sentinels inserted by other invocations of the
+ * flushbufqueues(), taking care to not reorder them.
+ *
+ * Only flush the buffers that belong to the
+ * vnode locked by the curthread.
+ */
+ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
+ bp->b_vp != lvp)) {
+ BQ_UNLOCK(bq);
+ continue;
+ }
+ error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
+ BQ_UNLOCK(bq);
+ if (error != 0)
+ continue;
+
+ /*
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
+ */
+ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
+ (bp->b_flags & B_DELWRI) == 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+ if (bp->b_flags & B_INVAL) {
+ bremfreef(bp);
+ brelse(bp);
+ flushed++;
+ continue;
+ }
+
+ if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
+ if (flushdeps == 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+ hasdeps = 1;
+ } else
+ hasdeps = 0;
+ /*
+ * We must hold the lock on a vnode before writing
+ * one of its buffers. Otherwise we may confuse, or
+ * in the case of a snapshot vnode, deadlock the
+ * system.
+ *
+		 * The lock order here is the reverse of the normal order
+		 * of vnode followed by buf lock. This is ok because
+ * the NOWAIT will prevent deadlock.
+ */
+ vp = bp->b_vp;
+ if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+ if (lvp == NULL) {
+ unlock = true;
+ error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
+ } else {
+ ASSERT_VOP_LOCKED(vp, "getbuf");
+ unlock = false;
+ error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
+ vn_lock(vp, LK_TRYUPGRADE);
+ }
+ if (error == 0) {
+ CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ if (curproc == bufdaemonproc) {
+ vfs_bio_awrite(bp);
+ } else {
+ bremfree(bp);
+ bwrite(bp);
+ counter_u64_add(notbufdflushes, 1);
+ }
+ vn_finished_write(mp);
+ if (unlock)
+ VOP_UNLOCK(vp, 0);
+ flushwithdeps += hasdeps;
+ flushed++;
+
+ /*
+ * Sleeping on runningbufspace while holding
+ * vnode lock leads to deadlock.
+ */
+ if (curproc == bufdaemonproc &&
+ runningbufspace > hirunningspace)
+ waitrunningbufspace();
+ continue;
+ }
+ vn_finished_write(mp);
+ BUF_UNLOCK(bp);
+ }
+ BQ_LOCK(bq);
+ TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist);
+ BQ_UNLOCK(bq);
+ free(sentinel, M_TEMP);
+ return (flushed);
+}
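+
+/*
+ * The sentinel technique above, in outline (illustrative only): a dummy
+ * buf with b_qindex == QUEUE_SENTINEL marks the scan position so the
+ * queue lock can be dropped while a candidate is flushed, and the scan
+ * later resumes after the sentinel instead of from the queue head:
+ *
+ *	TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist);
+ *	for (;;) {
+ *		bp = TAILQ_NEXT(sentinel, b_freelist);
+ *		if (bp == NULL)
+ *			break;
+ *		(move sentinel just after bp, drop BQ_LOCK, flush bp)
+ *	}
+ *	TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist);
+ */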
+
+/*
+ * Check to see if a block is currently memory resident.
+ */
+struct buf *
+incore(struct bufobj *bo, daddr_t blkno)
+{
+ struct buf *bp;
+
+ BO_RLOCK(bo);
+ bp = gbincore(bo, blkno);
+ BO_RUNLOCK(bo);
+ return (bp);
+}
+
+/*
+ * Returns true if no I/O is needed to access the
+ * associated VM object. This is like incore except
+ * it also hunts around in the VM system for the data.
+ */
+
+static int
+inmem(struct vnode *vp, daddr_t blkno)
+{
+ vm_object_t obj;
+ vm_offset_t toff, tinc, size;
+ vm_page_t m;
+ vm_ooffset_t off;
+
+ ASSERT_VOP_LOCKED(vp, "inmem");
+
+ if (incore(&vp->v_bufobj, blkno))
+		return (1);
+	if (vp->v_mount == NULL)
+		return (0);
+ obj = vp->v_object;
+ if (obj == NULL)
+ return (0);
+
+ size = PAGE_SIZE;
+ if (size > vp->v_mount->mnt_stat.f_iosize)
+ size = vp->v_mount->mnt_stat.f_iosize;
+ off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
+
+ VM_OBJECT_RLOCK(obj);
+ for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
+ m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
+ if (!m)
+ goto notinmem;
+ tinc = size;
+ if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
+ tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
+ if (vm_page_is_valid(m,
+ (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
+ goto notinmem;
+ }
+ VM_OBJECT_RUNLOCK(obj);
+	return (1);
+
+notinmem:
+ VM_OBJECT_RUNLOCK(obj);
+ return (0);
+}
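+
+/*
+ * Worked example for the offset arithmetic above (illustrative numbers,
+ * assuming 4 kB pages): with f_iosize == 32768 and blkno == 5, off ==
+ * 163840, so the loop inspects pages OFF_TO_IDX(163840) == 40 through
+ * 47, at most PAGE_SIZE bytes per iteration.
+ */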
+
+/*
+ * Set the dirty range for a buffer based on the status of the dirty
+ * bits in the pages comprising the buffer. The range is limited
+ * to the size of the buffer.
+ *
+ * Tell the VM system that the pages associated with this buffer
+ * are clean. This is used for delayed writes where the data is
+ * going to go to disk eventually without additional VM intervention.
+ *
+ * Note that while we only really need to clean through to b_bcount, we
+ * just go ahead and clean through to b_bufsize.
+ */
+static void
+vfs_clean_pages_dirty_buf(struct buf *bp)
+{
+ vm_ooffset_t foff, noff, eoff;
+ vm_page_t m;
+ int i;
+
+ if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
+ return;
+
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("vfs_clean_pages_dirty_buf: no buffer offset"));
+
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ vfs_drain_busy_pages(bp);
+ vfs_setdirty_locked_object(bp);
+ for (i = 0; i < bp->b_npages; i++) {
+ noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ eoff = noff;
+ if (eoff > bp->b_offset + bp->b_bufsize)
+ eoff = bp->b_offset + bp->b_bufsize;
+ m = bp->b_pages[i];
+ vfs_page_set_validclean(bp, foff, m);
+ /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
+ foff = noff;
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+}
+
+static void
+vfs_setdirty_locked_object(struct buf *bp)
+{
+ vm_object_t object;
+ int i;
+
+ object = bp->b_bufobj->bo_object;
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
+ /*
+ * We qualify the scan for modified pages on whether the
+ * object has been flushed yet.
+ */
+ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
+ vm_offset_t boffset;
+ vm_offset_t eoffset;
+
+ /*
+ * test the pages to see if they have been modified directly
+ * by users through the VM system.
+ */
+ for (i = 0; i < bp->b_npages; i++)
+ vm_page_test_dirty(bp->b_pages[i]);
+
+ /*
+ * Calculate the encompassing dirty range, boffset and eoffset,
+ * (eoffset - boffset) bytes.
+ */
+
+ for (i = 0; i < bp->b_npages; i++) {
+ if (bp->b_pages[i]->dirty)
+ break;
+ }
+ boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+
+ for (i = bp->b_npages - 1; i >= 0; --i) {
+ if (bp->b_pages[i]->dirty) {
+ break;
+ }
+ }
+ eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+
+ /*
+ * Fit it to the buffer.
+ */
+
+ if (eoffset > bp->b_bcount)
+ eoffset = bp->b_bcount;
+
+ /*
+ * If we have a good dirty range, merge with the existing
+ * dirty range.
+ */
+
+ if (boffset < eoffset) {
+ if (bp->b_dirtyoff > boffset)
+ bp->b_dirtyoff = boffset;
+ if (bp->b_dirtyend < eoffset)
+ bp->b_dirtyend = eoffset;
+ }
+ }
+}
+
+/*
+ * Allocate the KVA mapping for an existing buffer.
+ * If an unmapped buffer is provided but a mapped buffer is requested, also
+ * take care to properly set up mappings between pages and KVA.
+ */
+static void
+bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
+{
+ int bsize, maxsize, need_mapping, need_kva;
+ off_t offset;
+
+ need_mapping = bp->b_data == unmapped_buf &&
+ (gbflags & GB_UNMAPPED) == 0;
+ need_kva = bp->b_kvabase == unmapped_buf &&
+ bp->b_data == unmapped_buf &&
+ (gbflags & GB_KVAALLOC) != 0;
+ if (!need_mapping && !need_kva)
+ return;
+
+ BUF_CHECK_UNMAPPED(bp);
+
+ if (need_mapping && bp->b_kvabase != unmapped_buf) {
+ /*
+ * Buffer is not mapped, but the KVA was already
+ * reserved at the time of the instantiation. Use the
+ * allocated space.
+ */
+ goto has_addr;
+ }
+
+ /*
+ * Calculate the amount of the address space we would reserve
+ * if the buffer was mapped.
+ */
+ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
+ KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
+ offset = blkno * bsize;
+ maxsize = size + (offset & PAGE_MASK);
+ maxsize = imax(maxsize, bsize);
+
+ while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
+ if ((gbflags & GB_NOWAIT_BD) != 0) {
+ /*
+ * XXXKIB: defragmentation cannot
+ * succeed, not sure what else to do.
+ */
+ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
+ }
+ counter_u64_add(mappingrestarts, 1);
+ bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0);
+ }
+has_addr:
+ if (need_mapping) {
+ /* b_offset is handled by bpmap_qenter. */
+ bp->b_data = bp->b_kvabase;
+ BUF_CHECK_MAPPED(bp);
+ bpmap_qenter(bp);
+ }
+}
+
+struct buf *
+getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
+ int flags)
+{
+ struct buf *bp;
+ int error;
+
+ error = getblkx(vp, blkno, size, slpflag, slptimeo, flags, &bp);
+ if (error != 0)
+ return (NULL);
+ return (bp);
+}
+
+/*
+ * getblkx:
+ *
+ * Get a block given a specified block and offset into a file/device.
+ *	The buffer's B_DONE bit will be cleared on return, making it almost
+ * ready for an I/O initiation. B_INVAL may or may not be set on
+ * return. The caller should clear B_INVAL prior to initiating a
+ * READ.
+ *
+ * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
+ * an existing buffer.
+ *
+ * For a VMIO buffer, B_CACHE is modified according to the backing VM.
+ * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
+ * and then cleared based on the backing VM. If the previous buffer is
+ * non-0-sized but invalid, B_CACHE will be cleared.
+ *
+ * If getblk() must create a new buffer, the new buffer is returned with
+ * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
+ * case it is returned with B_INVAL clear and B_CACHE set based on the
+ * backing VM.
+ *
+ *	getblk() also forces a bwrite() for any B_DELWRI buffer whose
+ * B_CACHE bit is clear.
+ *
+ * What this means, basically, is that the caller should use B_CACHE to
+ * determine whether the buffer is fully valid or not and should clear
+ * B_INVAL prior to issuing a read. If the caller intends to validate
+ * the buffer by loading its data area with something, the caller needs
+ * to clear B_INVAL. If the caller does this without issuing an I/O,
+ * the caller should set B_CACHE ( as an optimization ), else the caller
+ * should issue the I/O and biodone() will set B_CACHE if the I/O was
+ * a write attempt or if it was a successful read. If the caller
+ * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
+ * prior to issuing the READ. biodone() will *not* clear B_INVAL.
+ */
+int
+getblkx(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
+ int flags, struct buf **bpp)
+{
+ struct buf *bp;
+ struct bufobj *bo;
+ daddr_t d_blkno;
+ int bsize, error, maxsize, vmio;
+ off_t offset;
+
+ CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
+ KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+ ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
+ ASSERT_VOP_LOCKED(vp, "getblk");
+ if (size > maxbcachebuf)
+ panic("getblk: size(%d) > maxbcachebuf(%d)\n", size,
+ maxbcachebuf);
+ if (!unmapped_buf_allowed)
+ flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
+
+ bo = &vp->v_bufobj;
+ d_blkno = blkno;
+loop:
+ BO_RLOCK(bo);
+ bp = gbincore(bo, blkno);
+ if (bp != NULL) {
+ int lockflags;
+ /*
+		 * Buffer is in-core. If the buffer is neither busy nor managed,
+ * it must be on a queue.
+ */
+ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
+
+ if ((flags & GB_LOCK_NOWAIT) != 0)
+ lockflags |= LK_NOWAIT;
+
+ error = BUF_TIMELOCK(bp, lockflags,
+ BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
+
+ /*
+ * If we slept and got the lock we have to restart in case
+ * the buffer changed identities.
+ */
+ if (error == ENOLCK)
+ goto loop;
+ /* We timed out or were interrupted. */
+ else if (error != 0)
+ return (error);
+ /* If recursed, assume caller knows the rules. */
+ else if (BUF_LOCKRECURSED(bp))
+ goto end;
+
+ /*
+ * The buffer is locked. B_CACHE is cleared if the buffer is
+ * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set
+ * and for a VMIO buffer B_CACHE is adjusted according to the
+ * backing VM cache.
+ */
+ if (bp->b_flags & B_INVAL)
+ bp->b_flags &= ~B_CACHE;
+ else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
+ bp->b_flags |= B_CACHE;
+ if (bp->b_flags & B_MANAGED)
+ MPASS(bp->b_qindex == QUEUE_NONE);
+ else
+ bremfree(bp);
+
+ /*
+		 * check for size inconsistencies for the non-VMIO case.
+ */
+ if (bp->b_bcount != size) {
+ if ((bp->b_flags & B_VMIO) == 0 ||
+ (size > bp->b_kvasize)) {
+ if (bp->b_flags & B_DELWRI) {
+ bp->b_flags |= B_NOCACHE;
+ bwrite(bp);
+ } else {
+ if (LIST_EMPTY(&bp->b_dep)) {
+ bp->b_flags |= B_RELBUF;
+ brelse(bp);
+ } else {
+ bp->b_flags |= B_NOCACHE;
+ bwrite(bp);
+ }
+ }
+ goto loop;
+ }
+ }
+
+ /*
+ * Handle the case of unmapped buffer which should
+ * become mapped, or the buffer for which KVA
+ * reservation is requested.
+ */
+ bp_unmapped_get_kva(bp, blkno, size, flags);
+
+ /*
+ * If the size is inconsistent in the VMIO case, we can resize
+ * the buffer. This might lead to B_CACHE getting set or
+ * cleared. If the size has not changed, B_CACHE remains
+ * unchanged from its previous state.
+ */
+ allocbuf(bp, size);
+
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("getblk: no buffer offset"));
+
+ /*
+ * A buffer with B_DELWRI set and B_CACHE clear must
+ * be committed before we can return the buffer in
+ * order to prevent the caller from issuing a read
+ * ( due to B_CACHE not being set ) and overwriting
+ * it.
+ *
+ * Most callers, including NFS and FFS, need this to
+ * operate properly either because they assume they
+ * can issue a read if B_CACHE is not set, or because
+ * ( for example ) an uncached B_DELWRI might loop due
+ * to softupdates re-dirtying the buffer. In the latter
+ * case, B_CACHE is set after the first write completes,
+ * preventing further loops.
+ * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
+ * above while extending the buffer, we cannot allow the
+ * buffer to remain with B_CACHE set after the write
+ * completes or it will represent a corrupt state. To
+ * deal with this we set B_NOCACHE to scrap the buffer
+ * after the write.
+ *
+ * We might be able to do something fancy, like setting
+ * B_CACHE in bwrite() except if B_DELWRI is already set,
+ * so the below call doesn't set B_CACHE, but that gets real
+ * confusing. This is much easier.
+ */
+
+ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
+ bp->b_flags |= B_NOCACHE;
+ bwrite(bp);
+ goto loop;
+ }
+ bp->b_flags &= ~B_DONE;
+ } else {
+ /*
+ * Buffer is not in-core, create new buffer. The buffer
+ * returned by getnewbuf() is locked. Note that the returned
+ * buffer is also considered valid (not marked B_INVAL).
+ */
+ BO_RUNLOCK(bo);
+ /*
+ * If the user does not want us to create the buffer, bail out
+ * here.
+ */
+ if (flags & GB_NOCREAT)
+ return (EEXIST);
+
+ bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
+ KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
+ offset = blkno * bsize;
+ vmio = vp->v_object != NULL;
+ if (vmio) {
+ maxsize = size + (offset & PAGE_MASK);
+ } else {
+ maxsize = size;
+			/* Do not allow non-VMIO unmapped buffers. */
+ flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
+ }
+ maxsize = imax(maxsize, bsize);
+ if ((flags & GB_NOSPARSE) != 0 && vmio &&
+ !vn_isdisk(vp, NULL)) {
+ error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0);
+ KASSERT(error != EOPNOTSUPP,
+ ("GB_NOSPARSE from fs not supporting bmap, vp %p",
+ vp));
+ if (error != 0)
+ return (error);
+ if (d_blkno == -1)
+ return (EJUSTRETURN);
+ }
+
+ bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
+ if (bp == NULL) {
+ if (slpflag || slptimeo)
+ return (ETIMEDOUT);
+ /*
+ * XXX This is here until the sleep path is diagnosed
+ * enough to work under very low memory conditions.
+ *
+ * There's an issue on low memory, 4BSD+non-preempt
+			 * systems (e.g. MIPS routers with 32MB RAM) where buffer
+			 * exhaustion occurs without sleeping for buffer
+			 * reclamation. This just sticks in a loop and
+ * constantly attempts to allocate a buffer, which
+ * hits exhaustion and tries to wakeup bufdaemon.
+ * This never happens because we never yield.
+ *
+ * The real solution is to identify and fix these cases
+ * so we aren't effectively busy-waiting in a loop
+			 * until the reclamation path has cycles to run.
+ */
+ kern_yield(PRI_USER);
+ goto loop;
+ }
+
+ /*
+ * This code is used to make sure that a buffer is not
+ * created while the getnewbuf routine is blocked.
+ * This can be a problem whether the vnode is locked or not.
+ * If the buffer is created out from under us, we have to
+ * throw away the one we just created.
+ *
+ * Note: this must occur before we associate the buffer
+ * with the vp especially considering limitations in
+ * the splay tree implementation when dealing with duplicate
+ * lblkno's.
+ */
+ BO_LOCK(bo);
+ if (gbincore(bo, blkno)) {
+ BO_UNLOCK(bo);
+ bp->b_flags |= B_INVAL;
+ bufspace_release(bufdomain(bp), maxsize);
+ brelse(bp);
+ goto loop;
+ }
+
+ /*
+ * Insert the buffer into the hash, so that it can
+ * be found by incore.
+ */
+ bp->b_lblkno = blkno;
+ bp->b_blkno = d_blkno;
+ bp->b_offset = offset;
+ bgetvp(vp, bp);
+ BO_UNLOCK(bo);
+
+ /*
+ * set B_VMIO bit. allocbuf() the buffer bigger. Since the
+ * buffer size starts out as 0, B_CACHE will be set by
+ * allocbuf() for the VMIO case prior to it testing the
+ * backing store for validity.
+ */
+
+ if (vmio) {
+ bp->b_flags |= B_VMIO;
+ KASSERT(vp->v_object == bp->b_bufobj->bo_object,
+ ("ARGH! different b_bufobj->bo_object %p %p %p\n",
+ bp, vp->v_object, bp->b_bufobj->bo_object));
+ } else {
+ bp->b_flags &= ~B_VMIO;
+ KASSERT(bp->b_bufobj->bo_object == NULL,
+ ("ARGH! has b_bufobj->bo_object %p %p\n",
+ bp, bp->b_bufobj->bo_object));
+ BUF_CHECK_MAPPED(bp);
+ }
+
+ allocbuf(bp, size);
+ bufspace_release(bufdomain(bp), maxsize);
+ bp->b_flags &= ~B_DONE;
+ }
+ CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
+ BUF_ASSERT_HELD(bp);
+end:
+ buf_track(bp, __func__);
+ KASSERT(bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+ *bpp = bp;
+ return (0);
+}
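+
+/*
+ * Usage sketch for the B_CACHE contract described above (illustrative
+ * only; roughly what a bread()-style caller does):
+ *
+ *	error = getblkx(vp, blkno, size, 0, 0, 0, &bp);
+ *	if (error == 0 && (bp->b_flags & B_CACHE) == 0) {
+ *		bp->b_iocmd = BIO_READ;
+ *		bp->b_flags &= ~B_INVAL;
+ *		bp->b_ioflags &= ~BIO_ERROR;
+ *		vfs_busy_pages(bp, 0);
+ *		bstrategy(bp);
+ *		error = bufwait(bp);
+ *	}
+ */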
+
+/*
+ * Get an empty, disassociated buffer of given size. The buffer is initially
+ * set to B_INVAL.
+ */
+struct buf *
+geteblk(int size, int flags)
+{
+ struct buf *bp;
+ int maxsize;
+
+ maxsize = (size + BKVAMASK) & ~BKVAMASK;
+ while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) {
+ if ((flags & GB_NOWAIT_BD) &&
+ (curthread->td_pflags & TDP_BUFNEED) != 0)
+ return (NULL);
+ }
+ allocbuf(bp, size);
+ bufspace_release(bufdomain(bp), maxsize);
+ bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
+ BUF_ASSERT_HELD(bp);
+ return (bp);
+}
+
+/*
+ * Truncate the backing store for a non-vmio buffer.
+ */
+static void
+vfs_nonvmio_truncate(struct buf *bp, int newbsize)
+{
+
+ if (bp->b_flags & B_MALLOC) {
+ /*
+ * malloced buffers are not shrunk
+ */
+ if (newbsize == 0) {
+ bufmallocadjust(bp, 0);
+ free(bp->b_data, M_BIOBUF);
+ bp->b_data = bp->b_kvabase;
+ bp->b_flags &= ~B_MALLOC;
+ }
+ return;
+ }
+ vm_hold_free_pages(bp, newbsize);
+ bufspace_adjust(bp, newbsize);
+}
+
+/*
+ * Extend the backing for a non-VMIO buffer.
+ */
+static void
+vfs_nonvmio_extend(struct buf *bp, int newbsize)
+{
+ caddr_t origbuf;
+ int origbufsize;
+
+ /*
+	 * We only use malloced memory on the first allocation,
+	 * and revert to page-allocated memory when the buffer
+ * grows.
+ *
+ * There is a potential smp race here that could lead
+ * to bufmallocspace slightly passing the max. It
+ * is probably extremely rare and not worth worrying
+ * over.
+ */
+ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 &&
+ bufmallocspace < maxbufmallocspace) {
+ bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK);
+ bp->b_flags |= B_MALLOC;
+ bufmallocadjust(bp, newbsize);
+ return;
+ }
+
+ /*
+ * If the buffer is growing on its other-than-first
+ * allocation then we revert to the page-allocation
+ * scheme.
+ */
+ origbuf = NULL;
+ origbufsize = 0;
+ if (bp->b_flags & B_MALLOC) {
+ origbuf = bp->b_data;
+ origbufsize = bp->b_bufsize;
+ bp->b_data = bp->b_kvabase;
+ bufmallocadjust(bp, 0);
+ bp->b_flags &= ~B_MALLOC;
+ newbsize = round_page(newbsize);
+ }
+ vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize,
+ (vm_offset_t) bp->b_data + newbsize);
+ if (origbuf != NULL) {
+ bcopy(origbuf, bp->b_data, origbufsize);
+ free(origbuf, M_BIOBUF);
+ }
+ bufspace_adjust(bp, newbsize);
+}
+
+/*
+ * This code constitutes the buffer memory from either anonymous system
+ * memory (in the case of non-VMIO operations) or from an associated
+ * VM object (in the case of VMIO operations). This code is able to
+ * resize a buffer up or down.
+ *
+ * Note that this code is tricky, and has many complications to resolve
+ * deadlock or inconsistent data situations. Tread lightly!!!
+ * There are B_CACHE and B_DELWRI interactions that must be dealt with by
+ * the caller. Calling this code willy nilly can result in the loss of data.
+ *
+ * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with
+ * B_CACHE for the non-VMIO case.
+ */
+int
+allocbuf(struct buf *bp, int size)
+{
+ int newbsize;
+
+ BUF_ASSERT_HELD(bp);
+
+ if (bp->b_bcount == size)
+ return (1);
+
+ if (bp->b_kvasize != 0 && bp->b_kvasize < size)
+ panic("allocbuf: buffer too small");
+
+ newbsize = roundup2(size, DEV_BSIZE);
+ if ((bp->b_flags & B_VMIO) == 0) {
+ if ((bp->b_flags & B_MALLOC) == 0)
+ newbsize = round_page(newbsize);
+ /*
+ * Just get anonymous memory from the kernel. Don't
+ * mess with B_CACHE.
+ */
+ if (newbsize < bp->b_bufsize)
+ vfs_nonvmio_truncate(bp, newbsize);
+ else if (newbsize > bp->b_bufsize)
+ vfs_nonvmio_extend(bp, newbsize);
+ } else {
+ int desiredpages;
+
+ desiredpages = (size == 0) ? 0 :
+ num_pages((bp->b_offset & PAGE_MASK) + newbsize);
+
+ if (bp->b_flags & B_MALLOC)
+ panic("allocbuf: VMIO buffer can't be malloced");
+ /*
+ * Set B_CACHE initially if buffer is 0 length or will become
+ * 0-length.
+ */
+ if (size == 0 || bp->b_bufsize == 0)
+ bp->b_flags |= B_CACHE;
+
+ if (newbsize < bp->b_bufsize)
+ vfs_vmio_truncate(bp, desiredpages);
+ /* XXX This looks as if it should be newbsize > b_bufsize */
+ else if (size > bp->b_bcount)
+ vfs_vmio_extend(bp, desiredpages, size);
+ bufspace_adjust(bp, newbsize);
+ }
+ bp->b_bcount = size; /* requested buffer size. */
+ return (1);
+}
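+
+/*
+ * Worked sizing example for allocbuf() (illustrative numbers, assuming
+ * 4 kB pages and DEV_BSIZE == 512): a VMIO buffer with size == 6000 and
+ * a page-aligned b_offset gives newbsize == roundup2(6000, DEV_BSIZE) ==
+ * 6144 and desiredpages == 2, so two backing pages are kept or
+ * instantiated while b_bcount records the requested 6000 bytes.
+ */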
+
+extern int inflight_transient_maps;
+
+static struct bio_queue nondump_bios;
+
+void
+biodone(struct bio *bp)
+{
+ struct mtx *mtxp;
+ void (*done)(struct bio *);
+ vm_offset_t start, end;
+
+ biotrack(bp, __func__);
+
+ /*
+ * Avoid completing I/O when dumping after a panic since that may
+ * result in a deadlock in the filesystem or pager code. Note that
+ * this doesn't affect dumps that were started manually since we aim
+ * to keep the system usable after it has been resumed.
+ */
+ if (__predict_false(dumping && SCHEDULER_STOPPED())) {
+ TAILQ_INSERT_HEAD(&nondump_bios, bp, bio_queue);
+ return;
+ }
+ if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
+ bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
+ bp->bio_flags |= BIO_UNMAPPED;
+ start = trunc_page((vm_offset_t)bp->bio_data);
+ end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
+ bp->bio_data = unmapped_buf;
+ pmap_qremove(start, atop(end - start));
+ vmem_free(transient_arena, start, end - start);
+ atomic_add_int(&inflight_transient_maps, -1);
+ }
+ done = bp->bio_done;
+ if (done == NULL) {
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ bp->bio_flags |= BIO_DONE;
+ wakeup(bp);
+ mtx_unlock(mtxp);
+ } else
+ done(bp);
+}
+
+/*
+ * Wait for a BIO to finish.
+ */
+int
+biowait(struct bio *bp, const char *wchan)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ while ((bp->bio_flags & BIO_DONE) == 0)
+ msleep(bp, mtxp, PRIBIO, wchan, 0);
+ mtx_unlock(mtxp);
+ if (bp->bio_error != 0)
+ return (bp->bio_error);
+ if (!(bp->bio_flags & BIO_ERROR))
+ return (0);
+ return (EIO);
+}
+
+void
+biofinish(struct bio *bp, struct devstat *stat, int error)
+{
+
+ if (error) {
+ bp->bio_error = error;
+ bp->bio_flags |= BIO_ERROR;
+ }
+ if (stat != NULL)
+ devstat_end_transaction_bio(stat, bp);
+ biodone(bp);
+}
+
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+void
+biotrack_buf(struct bio *bp, const char *location)
+{
+
+ buf_track(bp->bio_track_bp, location);
+}
+#endif
+
+/*
+ * bufwait:
+ *
+ * Wait for buffer I/O completion, returning error status. The buffer
+ * is left locked and B_DONE on return. B_EINTR is converted into an EINTR
+ * error and cleared.
+ */
+int
+bufwait(struct buf *bp)
+{
+ if (bp->b_iocmd == BIO_READ)
+ bwait(bp, PRIBIO, "biord");
+ else
+ bwait(bp, PRIBIO, "biowr");
+ if (bp->b_flags & B_EINTR) {
+ bp->b_flags &= ~B_EINTR;
+ return (EINTR);
+ }
+ if (bp->b_ioflags & BIO_ERROR) {
+ return (bp->b_error ? bp->b_error : EIO);
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * bufdone:
+ *
+ * Finish I/O on a buffer, optionally calling a completion function.
+ * This is usually called from an interrupt so process blocking is
+ * not allowed.
+ *
+ * biodone is also responsible for setting B_CACHE in a B_VMIO bp.
+ * In a non-VMIO bp, B_CACHE will be set on the next getblk()
+ * assuming B_INVAL is clear.
+ *
+ * For the VMIO case, we set B_CACHE if the op was a read and no
+ * read error occurred, or if the op was a write. B_CACHE is never
+ * set if the buffer is invalid or otherwise uncacheable.
+ *
+ * bufdone does not mess with B_INVAL, allowing the I/O routine or the
+ * initiator to leave B_INVAL set to brelse the buffer out of existence
+ * in the biodone routine.
+ */
+void
+bufdone(struct buf *bp)
+{
+ struct bufobj *dropobj;
+ void (*biodone)(struct buf *);
+
+ buf_track(bp, __func__);
+ CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ dropobj = NULL;
+
+ KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
+ BUF_ASSERT_HELD(bp);
+
+ runningbufwakeup(bp);
+ if (bp->b_iocmd == BIO_WRITE)
+ dropobj = bp->b_bufobj;
+ /* call optional completion function if requested */
+ if (bp->b_iodone != NULL) {
+ biodone = bp->b_iodone;
+ bp->b_iodone = NULL;
+ (*biodone) (bp);
+ if (dropobj)
+ bufobj_wdrop(dropobj);
+ return;
+ }
+ if (bp->b_flags & B_VMIO) {
+ /*
+ * Set B_CACHE if the op was a normal read and no error
+ * occurred. B_CACHE is set for writes in the b*write()
+ * routines.
+ */
+ if (bp->b_iocmd == BIO_READ &&
+ !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
+ !(bp->b_ioflags & BIO_ERROR))
+ bp->b_flags |= B_CACHE;
+ vfs_vmio_iodone(bp);
+ }
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_complete(bp);
+ if ((bp->b_flags & B_CKHASH) != 0) {
+ KASSERT(bp->b_iocmd == BIO_READ,
+ ("bufdone: b_iocmd %d not BIO_READ", bp->b_iocmd));
+ KASSERT(buf_mapped(bp), ("bufdone: bp %p not mapped", bp));
+ (*bp->b_ckhashcalc)(bp);
+ }
+ /*
+ * For asynchronous completions, release the buffer now. The brelse
+ * will do a wakeup there if necessary - so no need to do a wakeup
+ * here in the async case. The sync case always needs to do a wakeup.
+ */
+ if (bp->b_flags & B_ASYNC) {
+ if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) ||
+ (bp->b_ioflags & BIO_ERROR))
+ brelse(bp);
+ else
+ bqrelse(bp);
+ } else
+ bdone(bp);
+ if (dropobj)
+ bufobj_wdrop(dropobj);
+}
+
+/*
+ * This routine is called in lieu of iodone in the case of
+ * incomplete I/O. This keeps the busy status for pages
+ * consistent.
+ */
+void
+vfs_unbusy_pages(struct buf *bp)
+{
+ int i;
+ vm_object_t obj;
+ vm_page_t m;
+
+ runningbufwakeup(bp);
+ if (!(bp->b_flags & B_VMIO))
+ return;
+
+ obj = bp->b_bufobj->bo_object;
+ VM_OBJECT_WLOCK(obj);
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ if (m == bogus_page) {
+ m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
+ if (!m)
+ panic("vfs_unbusy_pages: page missing\n");
+ bp->b_pages[i] = m;
+ if (buf_mapped(bp)) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
+ }
+ vm_page_sunbusy(m);
+ }
+ vm_object_pip_wakeupn(obj, bp->b_npages);
+ VM_OBJECT_WUNLOCK(obj);
+}
+
+/*
+ * vfs_page_set_valid:
+ *
+ * Set the valid bits in a page based on the supplied offset. The
+ * range is restricted to the buffer's size.
+ *
+ * This routine is typically called after a read completes.
+ */
+static void
+vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
+{
+ vm_ooffset_t eoff;
+
+ /*
+ * Compute the end offset, eoff, such that [off, eoff) does not span a
+ * page boundary and eoff is not greater than the end of the buffer.
+ * The end of the buffer, in this case, is our file EOF, not the
+ * allocation size of the buffer.
+ */
+ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
+ if (eoff > bp->b_offset + bp->b_bcount)
+ eoff = bp->b_offset + bp->b_bcount;
+
+ /*
+ * Set valid range. This is typically the entire buffer and thus the
+ * entire page.
+ */
+ if (eoff > off)
+ vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
+}
+
+/*
+ * vfs_page_set_validclean:
+ *
+ * Set the valid bits and clear the dirty bits in a page based on the
+ * supplied offset. The range is restricted to the buffer's size.
+ */
+static void
+vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
+{
+ vm_ooffset_t soff, eoff;
+
+ /*
+ * Start and end offsets in buffer. eoff - soff may not cross a
+ * page boundary or cross the end of the buffer. The end of the
+ * buffer, in this case, is our file EOF, not the allocation size
+ * of the buffer.
+ */
+ soff = off;
+ eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ if (eoff > bp->b_offset + bp->b_bcount)
+ eoff = bp->b_offset + bp->b_bcount;
+
+ /*
+ * Set valid range. This is typically the entire buffer and thus the
+ * entire page.
+ */
+ if (eoff > soff) {
+ vm_page_set_validclean(
+ m,
+ (vm_offset_t) (soff & PAGE_MASK),
+ (vm_offset_t) (eoff - soff)
+ );
+ }
+}
+
+/*
+ * Ensure that all buffer pages are not exclusive busied. If any page is
+ * exclusive busy, drain it.
+ */
+void
+vfs_drain_busy_pages(struct buf *bp)
+{
+ vm_page_t m;
+ int i, last_busied;
+
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
+ last_busied = 0;
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ if (vm_page_xbusied(m)) {
+ for (; last_busied < i; last_busied++)
+ vm_page_sbusy(bp->b_pages[last_busied]);
+ while (vm_page_xbusied(m)) {
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+ vm_page_busy_sleep(m, "vbpage", true);
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ }
+ }
+ }
+ for (i = 0; i < last_busied; i++)
+ vm_page_sunbusy(bp->b_pages[i]);
+}
+
+/*
+ * This routine is called before a device strategy routine.
+ * It is used to tell the VM system that paging I/O is in
+ * progress, and treat the pages associated with the buffer
+ * almost as being exclusive busy. Also the object paging_in_progress
+ * flag is handled to make sure that the object doesn't become
+ * inconsistent.
+ *
+ * Since I/O has not been initiated yet, certain buffer flags
+ * such as BIO_ERROR or B_INVAL may be in an inconsistent state
+ * and should be ignored.
+ */
+void
+vfs_busy_pages(struct buf *bp, int clear_modify)
+{
+ vm_object_t obj;
+ vm_ooffset_t foff;
+ vm_page_t m;
+ int i;
+ bool bogus;
+
+ if (!(bp->b_flags & B_VMIO))
+ return;
+
+ obj = bp->b_bufobj->bo_object;
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("vfs_busy_pages: no buffer offset"));
+ VM_OBJECT_WLOCK(obj);
+ vfs_drain_busy_pages(bp);
+ if (bp->b_bufsize != 0)
+ vfs_setdirty_locked_object(bp);
+ bogus = false;
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+
+ if ((bp->b_flags & B_CLUSTER) == 0) {
+ vm_object_pip_add(obj, 1);
+ vm_page_sbusy(m);
+ }
+ /*
+		 * When readying a buffer for a read ( i.e.
+ * clear_modify == 0 ), it is important to do
+ * bogus_page replacement for valid pages in
+ * partially instantiated buffers. Partially
+ * instantiated buffers can, in turn, occur when
+ * reconstituting a buffer from its VM backing store
+ * base. We only have to do this if B_CACHE is
+ * clear ( which causes the I/O to occur in the
+ * first place ). The replacement prevents the read
+ * I/O from overwriting potentially dirty VM-backed
+ * pages. XXX bogus page replacement is, uh, bogus.
+ * It may not work properly with small-block devices.
+ * We need to find a better way.
+ */
+ if (clear_modify) {
+ pmap_remove_write(m);
+ vfs_page_set_validclean(bp, foff, m);
+ } else if (m->valid == VM_PAGE_BITS_ALL &&
+ (bp->b_flags & B_CACHE) == 0) {
+ bp->b_pages[i] = bogus_page;
+ bogus = true;
+ }
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ }
+ VM_OBJECT_WUNLOCK(obj);
+ if (bogus && buf_mapped(bp)) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ }
+}
+
+/*
+ * vfs_bio_set_valid:
+ *
+ * Set the range within the buffer to valid. The range is
+ * relative to the beginning of the buffer, b_offset. Note that
+ * b_offset itself may be offset from the beginning of the first
+ * page.
+ */
+void
+vfs_bio_set_valid(struct buf *bp, int base, int size)
+{
+ int i, n;
+ vm_page_t m;
+
+ if (!(bp->b_flags & B_VMIO))
+ return;
+
+ /*
+ * Fixup base to be relative to beginning of first page.
+ * Set initial n to be the maximum number of bytes in the
+ * first page that can be validated.
+ */
+ base += (bp->b_offset & PAGE_MASK);
+ n = PAGE_SIZE - (base & PAGE_MASK);
+
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+ m = bp->b_pages[i];
+ if (n > size)
+ n = size;
+ vm_page_set_valid_range(m, base & PAGE_MASK, n);
+ base += n;
+ size -= n;
+ n = PAGE_SIZE;
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+}
+
+/*
+ * vfs_bio_clrbuf:
+ *
+ * If the specified buffer is a non-VMIO buffer, clear the entire
+ * buffer. If the specified buffer is a VMIO buffer, clear and
+ * validate only the previously invalid portions of the buffer.
+ * This routine essentially fakes an I/O, so we need to clear
+ * BIO_ERROR and B_INVAL.
+ *
+ * Note that while we only theoretically need to clear through b_bcount,
+ * we go ahead and clear through b_bufsize.
+ */
+void
+vfs_bio_clrbuf(struct buf *bp)
+{
+ int i, j, mask, sa, ea, slide;
+
+ if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
+ clrbuf(bp);
+ return;
+ }
+ bp->b_flags &= ~B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
+ (bp->b_offset & PAGE_MASK) == 0) {
+ if (bp->b_pages[0] == bogus_page)
+ goto unlock;
+ mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
+ if ((bp->b_pages[0]->valid & mask) == mask)
+ goto unlock;
+ if ((bp->b_pages[0]->valid & mask) == 0) {
+ pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
+ bp->b_pages[0]->valid |= mask;
+ goto unlock;
+ }
+ }
+ sa = bp->b_offset & PAGE_MASK;
+ slide = 0;
+ for (i = 0; i < bp->b_npages; i++, sa = 0) {
+ slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
+ ea = slide & PAGE_MASK;
+ if (ea == 0)
+ ea = PAGE_SIZE;
+ if (bp->b_pages[i] == bogus_page)
+ continue;
+ j = sa / DEV_BSIZE;
+ mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
+ if ((bp->b_pages[i]->valid & mask) == mask)
+ continue;
+ if ((bp->b_pages[i]->valid & mask) == 0)
+ pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
+ else {
+ for (; sa < ea; sa += DEV_BSIZE, j++) {
+ if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
+ pmap_zero_page_area(bp->b_pages[i],
+ sa, DEV_BSIZE);
+ }
+ }
+ }
+ bp->b_pages[i]->valid |= mask;
+ }
+unlock:
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+ bp->b_resid = 0;
+}
+
+void
+vfs_bio_bzero_buf(struct buf *bp, int base, int size)
+{
+ vm_page_t m;
+ int i, n;
+
+ if (buf_mapped(bp)) {
+ BUF_CHECK_MAPPED(bp);
+ bzero(bp->b_data + base, size);
+ } else {
+ BUF_CHECK_UNMAPPED(bp);
+ n = PAGE_SIZE - (base & PAGE_MASK);
+ for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+ m = bp->b_pages[i];
+ if (n > size)
+ n = size;
+ pmap_zero_page_area(m, base & PAGE_MASK, n);
+ base += n;
+ size -= n;
+ n = PAGE_SIZE;
+ }
+ }
+}
+
+/*
+ * Update buffer flags based on I/O request parameters, optionally releasing the
+ * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM,
+ * where they may be placed on a page queue (VMIO) or freed immediately (direct
+ * I/O). Otherwise the buffer is released to the cache.
+ */
+static void
+b_io_dismiss(struct buf *bp, int ioflag, bool release)
+{
+
+ KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0,
+ ("buf %p non-VMIO noreuse", bp));
+
+ if ((ioflag & IO_DIRECT) != 0)
+ bp->b_flags |= B_DIRECT;
+ if ((ioflag & IO_EXT) != 0)
+ bp->b_xflags |= BX_ALTDATA;
+ if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) {
+ bp->b_flags |= B_RELBUF;
+ if ((ioflag & IO_NOREUSE) != 0)
+ bp->b_flags |= B_NOREUSE;
+ if (release)
+ brelse(bp);
+ } else if (release)
+ bqrelse(bp);
+}
+
+void
+vfs_bio_brelse(struct buf *bp, int ioflag)
+{
+
+ b_io_dismiss(bp, ioflag, true);
+}
+
+void
+vfs_bio_set_flags(struct buf *bp, int ioflag)
+{
+
+ b_io_dismiss(bp, ioflag, false);
+}
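+
+/*
+ * Usage sketch (illustrative only): a filesystem read path typically
+ * lets the ioflag policy above decide how the buffer is released, e.g.
+ *
+ *	error = bread(vp, lbn, size, NOCRED, &bp);
+ *	(copy the data out to the caller)
+ *	vfs_bio_brelse(bp, ioflag);
+ */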
+
+/*
+ * vm_hold_load_pages and vm_hold_free_pages get pages into
+ * a buffer's address space. The pages are anonymous and are
+ * not associated with a file object.
+ */
+static void
+vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
+{
+ vm_offset_t pg;
+ vm_page_t p;
+ int index;
+
+ BUF_CHECK_MAPPED(bp);
+
+ to = round_page(to);
+ from = round_page(from);
+ index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+
+ for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
+ /*
+ * note: must allocate system pages since blocking here
+ * could interfere with paging I/O, no matter which
+ * process we are.
+ */
+ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
+ VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) |
+ VM_ALLOC_WAITOK);
+ pmap_qenter(pg, &p, 1);
+ bp->b_pages[index] = p;
+ }
+ bp->b_npages = index;
+}
+
+/* Return pages associated with this buf to the vm system */
+static void
+vm_hold_free_pages(struct buf *bp, int newbsize)
+{
+ vm_offset_t from;
+ vm_page_t p;
+ int index, newnpages;
+
+ BUF_CHECK_MAPPED(bp);
+
+ from = round_page((vm_offset_t)bp->b_data + newbsize);
+ newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+ if (bp->b_npages > newnpages)
+ pmap_qremove(from, bp->b_npages - newnpages);
+ for (index = newnpages; index < bp->b_npages; index++) {
+ p = bp->b_pages[index];
+ bp->b_pages[index] = NULL;
+ p->wire_count--;
+ vm_page_free(p);
+ }
+ vm_wire_sub(bp->b_npages - newnpages);
+ bp->b_npages = newnpages;
+}
+
+/*
+ * Map an IO request into kernel virtual address space.
+ *
+ * All requests are (re)mapped into kernel VA space.
+ * Notice that we use b_bufsize for the size of the buffer
+ * to be mapped. b_bcount might be modified by the driver.
+ *
+ * Note that even if the caller determines that the address space should
+ * be valid, a race or a smaller file mapped into a larger space may
+ * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
+ * check the return value.
+ *
+ * This function only works with pager buffers.
+ */
+int
+vmapbuf(struct buf *bp, int mapbuf)
+{
+ vm_prot_t prot;
+ int pidx;
+
+ if (bp->b_bufsize < 0)
+ return (-1);
+ prot = VM_PROT_READ;
+ if (bp->b_iocmd == BIO_READ)
+ prot |= VM_PROT_WRITE; /* Less backwards than it looks */
+ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+ (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
+ btoc(MAXPHYS))) < 0)
+ return (-1);
+ bp->b_npages = pidx;
+ bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
+ if (mapbuf || !unmapped_buf_allowed) {
+ pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx);
+ bp->b_data = bp->b_kvabase + bp->b_offset;
+ } else
+ bp->b_data = unmapped_buf;
+ return(0);
+}
+
+/*
+ * Free the io map PTEs associated with this IO operation.
+ * We also invalidate the TLB entries and restore the original b_addr.
+ *
+ * This function only works with pager buffers.
+ */
+void
+vunmapbuf(struct buf *bp)
+{
+ int npages;
+
+ npages = bp->b_npages;
+ if (buf_mapped(bp))
+ pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
+ vm_page_unhold_pages(bp->b_pages, npages);
+
+ bp->b_data = unmapped_buf;
+}
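+
+/*
+ * Sketch (an assumption, not part of the import) of the classic physio-style
+ * use of vmapbuf()/vunmapbuf() around a transfer of a user buffer.  Note the
+ * mandatory check of the vmapbuf() return value, as stated above; the
+ * mydev_transfer() helper is hypothetical.
+ */
+static int
+mydev_transfer(struct buf *bp)
+{
+ int error;
+
+ /* Wire the user pages backing bp->b_data and optionally map them. */
+ if (vmapbuf(bp, 1) < 0)
+ return (EFAULT);
+ /* ... start the I/O on bp and wait for it to complete ... */
+ error = (bp->b_ioflags & BIO_ERROR) != 0 ? bp->b_error : 0;
+ vunmapbuf(bp);
+ return (error);
+}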
+
+void
+bdone(struct buf *bp)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ bp->b_flags |= B_DONE;
+ wakeup(bp);
+ mtx_unlock(mtxp);
+}
+
+void
+bwait(struct buf *bp, u_char pri, const char *wchan)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ while ((bp->b_flags & B_DONE) == 0)
+ msleep(bp, mtxp, pri, wchan, 0);
+ mtx_unlock(mtxp);
+}
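+
+/*
+ * Sketch of the handshake that bdone() and bwait() implement (the mydrv_*
+ * names are assumptions): the completion path marks the buffer done and
+ * wakes the waiter, while the submitter sleeps until B_DONE is set.
+ */
+static void
+mydrv_io_done(struct buf *bp)
+{
+
+ /* Called from the driver's completion path. */
+ bdone(bp);
+}
+
+static int
+mydrv_io_wait(struct buf *bp)
+{
+
+ /* Called by the submitter after handing bp to the hardware. */
+ bwait(bp, PRIBIO, "mydrvio");
+ return ((bp->b_ioflags & BIO_ERROR) != 0 ? bp->b_error : 0);
+}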
+
+int
+bufsync(struct bufobj *bo, int waitfor)
+{
+
+ return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread));
+}
+
+void
+bufstrategy(struct bufobj *bo, struct buf *bp)
+{
+ int i __unused;
+ struct vnode *vp;
+
+ vp = bp->b_vp;
+ KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
+ KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
+ ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
+ i = VOP_STRATEGY(vp, bp);
+ KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
+}
+
+/*
+ * Initialize a struct bufobj before use. Memory is assumed zero filled.
+ */
+void
+bufobj_init(struct bufobj *bo, void *private)
+{
+ static volatile int bufobj_cleanq;
+
+ bo->bo_domain =
+ atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains;
+ rw_init(BO_LOCKPTR(bo), "bufobj interlock");
+ bo->bo_private = private;
+ TAILQ_INIT(&bo->bo_clean.bv_hd);
+ TAILQ_INIT(&bo->bo_dirty.bv_hd);
+}
+
+void
+bufobj_wrefl(struct bufobj *bo)
+{
+
+ KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
+ ASSERT_BO_WLOCKED(bo);
+ bo->bo_numoutput++;
+}
+
+void
+bufobj_wref(struct bufobj *bo)
+{
+
+ KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
+ BO_LOCK(bo);
+ bo->bo_numoutput++;
+ BO_UNLOCK(bo);
+}
+
+void
+bufobj_wdrop(struct bufobj *bo)
+{
+
+ KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
+ BO_LOCK(bo);
+ KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
+ if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
+ bo->bo_flag &= ~BO_WWAIT;
+ wakeup(&bo->bo_numoutput);
+ }
+ BO_UNLOCK(bo);
+}
+
+int
+bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
+{
+ int error;
+
+ KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
+ ASSERT_BO_WLOCKED(bo);
+ error = 0;
+ while (bo->bo_numoutput) {
+ bo->bo_flag |= BO_WWAIT;
+ error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
+ slpflag | (PRIBIO + 1), "bo_wwait", timeo);
+ if (error)
+ break;
+ }
+ return (error);
+}
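+
+/*
+ * Sketch (assumption) of the write-counting protocol the bufobj_w*()
+ * helpers implement: each write in flight is announced with bufobj_wref()
+ * and retired with bufobj_wdrop(), so a syncer can drain the object by
+ * calling bufobj_wwait() with the bufobj lock held.  The mydrain() name
+ * is hypothetical.
+ */
+static int
+mydrain(struct bufobj *bo)
+{
+ int error;
+
+ BO_LOCK(bo);
+ error = bufobj_wwait(bo, 0, 0); /* sleep until bo_numoutput == 0 */
+ BO_UNLOCK(bo);
+ return (error);
+}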
+
+/*
+ * Set bio_data or bio_ma for struct bio from the struct buf.
+ */
+void
+bdata2bio(struct buf *bp, struct bio *bip)
+{
+
+ if (!buf_mapped(bp)) {
+ KASSERT(unmapped_buf_allowed, ("unmapped"));
+ bip->bio_ma = bp->b_pages;
+ bip->bio_ma_n = bp->b_npages;
+ bip->bio_data = unmapped_buf;
+ bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
+ bip->bio_flags |= BIO_UNMAPPED;
+ KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
+ PAGE_SIZE == bp->b_npages,
+ ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
+ (long long)bip->bio_length, bip->bio_ma_n));
+ } else {
+ bip->bio_data = bp->b_data;
+ bip->bio_ma = NULL;
+ }
+}
+
+/*
+ * The MIPS pmap code currently doesn't handle aliased pages.
+ * The VIPT caches may not handle page aliasing themselves, leading
+ * to data corruption.
+ *
+ * As such, this code makes a system extremely unhappy if said
+ * system doesn't support unaliasing the above situation in hardware.
+ * Some "recent" systems (e.g. some mips24k/mips74k cores) don't enable
+ * this feature at build time, so it has to be handled in software.
+ *
+ * Once the MIPS pmap/cache code grows to support this function on
+ * earlier chips, it should be flipped back off.
+ */
+#ifdef __mips__
+static int buf_pager_relbuf = 1;
+#else
+static int buf_pager_relbuf = 0;
+#endif
+SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
+ &buf_pager_relbuf, 0,
+ "Make buffer pager release buffers after reading");
+
+/*
+ * The buffer pager. It uses buffer reads to validate pages.
+ *
+ * In contrast to the generic local pager from vm/vnode_pager.c, this
+ * pager correctly and easily handles volumes where the underlying
+ * device block size is greater than the machine page size. The
+ * buffer cache transparently extends the requested page run to be
+ * aligned at the block boundary, and does the necessary bogus page
+ * replacements in the addends to avoid obliterating already valid
+ * pages.
+ *
+ * The only non-trivial issue is that the exclusive busy state for
+ * pages, which is assumed by the vm_pager_getpages() interface, is
+ * incompatible with the VMIO buffer cache's desire to share-busy the
+ * pages. This function performs a trivial downgrade of the pages'
+ * state before reading buffers, and a less trivial upgrade from the
+ * shared-busy to excl-busy state after the read.
+ */
+int
+vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count,
+ int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno,
+ vbg_get_blksize_t get_blksize)
+{
+ vm_page_t m;
+ vm_object_t object;
+ struct buf *bp;
+ struct mount *mp;
+ daddr_t lbn, lbnp;
+ vm_ooffset_t la, lb, poff, poffe;
+ long bsize;
+ int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b;
+ bool redo, lpart;
+
+ object = vp->v_object;
+ mp = vp->v_mount;
+ error = 0;
+ la = IDX_TO_OFF(ma[count - 1]->pindex);
+ if (la >= object->un_pager.vnp.vnp_size)
+ return (VM_PAGER_BAD);
+
+ /*
+ * Change the meaning of la from where the last requested page starts
+ * to where it ends, because that's the end of the requested region
+ * and the start of the potential read-ahead region.
+ */
+ la += PAGE_SIZE;
+ lpart = la > object->un_pager.vnp.vnp_size;
+ bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex)));
+
+ /*
+ * Calculate read-ahead, behind and total pages.
+ */
+ pgsin = count;
+ lb = IDX_TO_OFF(ma[0]->pindex);
+ pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
+ pgsin += pgsin_b;
+ if (rbehind != NULL)
+ *rbehind = pgsin_b;
+ pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la);
+ if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size)
+ pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size,
+ PAGE_SIZE) - la);
+ pgsin += pgsin_a;
+ if (rahead != NULL)
+ *rahead = pgsin_a;
+ VM_CNT_INC(v_vnodein);
+ VM_CNT_ADD(v_vnodepgsin, pgsin);
+
+ br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS)
+ != 0) ? GB_UNMAPPED : 0;
+ VM_OBJECT_WLOCK(object);
+again:
+ for (i = 0; i < count; i++)
+ vm_page_busy_downgrade(ma[i]);
+ VM_OBJECT_WUNLOCK(object);
+
+ lbnp = -1;
+ for (i = 0; i < count; i++) {
+ m = ma[i];
+
+ /*
+ * Pages are shared busy and the object lock is not
+ * owned, which together allow for the pages'
+ * invalidation. The racy test for validity avoids
+ * useless creation of the buffer for the most typical
+ * case when invalidation is not used in redo or for
+ * parallel read. The shared->excl upgrade loop at
+ * the end of the function catches the race in a
+ * reliable way (protected by the object lock).
+ */
+ if (m->valid == VM_PAGE_BITS_ALL)
+ continue;
+
+ poff = IDX_TO_OFF(m->pindex);
+ poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size);
+ for (; poff < poffe; poff += bsize) {
+ lbn = get_lblkno(vp, poff);
+ if (lbn == lbnp)
+ goto next_page;
+ lbnp = lbn;
+
+ bsize = get_blksize(vp, lbn);
+ error = bread_gb(vp, lbn, bsize, curthread->td_ucred,
+ br_flags, &bp);
+ if (error != 0)
+ goto end_pages;
+ if (LIST_EMPTY(&bp->b_dep)) {
+ /*
+ * Invalidation clears m->valid, but
+ * may leave B_CACHE flag if the
+ * buffer existed at the invalidation
+ * time. In this case, recycle the
+ * buffer to do real read on next
+ * bread() after redo.
+ *
+ * Otherwise B_RELBUF is not strictly
+ * necessary, but enable it to reduce
+ * buffer cache pressure.
+ */
+ if (buf_pager_relbuf ||
+ m->valid != VM_PAGE_BITS_ALL)
+ bp->b_flags |= B_RELBUF;
+
+ bp->b_flags &= ~B_NOCACHE;
+ brelse(bp);
+ } else {
+ bqrelse(bp);
+ }
+ }
+ KASSERT(1 /* racy, enable for debugging */ ||
+ m->valid == VM_PAGE_BITS_ALL || i == count - 1,
+ ("buf %d %p invalid", i, m));
+ if (i == count - 1 && lpart) {
+ VM_OBJECT_WLOCK(object);
+ if (m->valid != 0 &&
+ m->valid != VM_PAGE_BITS_ALL)
+ vm_page_zero_invalid(m, TRUE);
+ VM_OBJECT_WUNLOCK(object);
+ }
+next_page:;
+ }
+end_pages:
+
+ VM_OBJECT_WLOCK(object);
+ redo = false;
+ for (i = 0; i < count; i++) {
+ vm_page_sunbusy(ma[i]);
+ ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
+
+ /*
+ * Since the pages were only sbusy while neither the
+ * buffer nor the object lock was held by us, or were
+ * reallocated while vm_page_grab() slept waiting for
+ * the busy state to be relinquished, they could have
+ * been invalidated.  Recheck the valid bits and
+ * re-read as needed.
+ *
+ * Note that the last page is made fully valid in the
+ * read loop, and partial validity for the page at
+ * index count - 1 could mean that the page was
+ * invalidated or removed, so we must restart for
+ * safety as well.
+ */
+ if (ma[i]->valid != VM_PAGE_BITS_ALL)
+ redo = true;
+ }
+ if (redo && error == 0)
+ goto again;
+ VM_OBJECT_WUNLOCK(object);
+ return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
+}
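+
+/*
+ * Sketch (not part of the import) of how a file system can implement its
+ * VOP_GETPAGES on top of vfs_bio_getpages(): it only needs to provide the
+ * logical-block and block-size callbacks.  The myfs_* names and the fixed
+ * f_iosize block size are assumptions for illustration.
+ */
+static daddr_t
+myfs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
+{
+
+ return (off / vp->v_mount->mnt_stat.f_iosize);
+}
+
+static int
+myfs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
+{
+
+ return (vp->v_mount->mnt_stat.f_iosize);
+}
+
+static int
+myfs_getpages(struct vop_getpages_args *ap)
+{
+
+ return (vfs_bio_getpages(ap->a_vp, ap->a_m, ap->a_count,
+ ap->a_rbehind, ap->a_rahead, myfs_gbp_getblkno,
+ myfs_gbp_getblksz));
+}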
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+/* DDB command to show buffer data */
+DB_SHOW_COMMAND(buffer, db_show_buffer)
+{
+ /* get args */
+ struct buf *bp = (struct buf *)addr;
+#ifdef FULL_BUF_TRACKING
+ uint32_t i, j;
+#endif
+
+ if (!have_addr) {
+ db_printf("usage: show buffer <addr>\n");
+ return;
+ }
+
+ db_printf("buf at %p\n", bp);
+ db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
+ (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
+ PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
+ db_printf(
+ "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
+ "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
+ "b_dep = %p\n",
+ bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
+ bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
+ (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
+ db_printf("b_kvabase = %p, b_kvasize = %d\n",
+ bp->b_kvabase, bp->b_kvasize);
+ if (bp->b_npages) {
+ int i;
+ db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m;
+ m = bp->b_pages[i];
+ if (m != NULL)
+ db_printf("(%p, 0x%lx, 0x%lx)", m->object,
+ (u_long)m->pindex,
+ (u_long)VM_PAGE_TO_PHYS(m));
+ else
+ db_printf("( ??? )");
+ if ((i + 1) < bp->b_npages)
+ db_printf(",");
+ }
+ db_printf("\n");
+ }
+ BUF_LOCKPRINTINFO(bp);
+#if defined(FULL_BUF_TRACKING)
+ db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt);
+
+ i = bp->b_io_tcnt % BUF_TRACKING_SIZE;
+ for (j = 1; j <= BUF_TRACKING_SIZE; j++) {
+ if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL)
+ continue;
+ db_printf(" %2u: %s\n", j,
+ bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]);
+ }
+#elif defined(BUF_TRACKING)
+ db_printf("b_io_tracking: %s\n", bp->b_io_tracking);
+#endif
+ db_printf(" ");
+}
+
+DB_SHOW_COMMAND(bufqueues, bufqueues)
+{
+ struct bufdomain *bd;
+ struct buf *bp;
+ long total;
+ int i, j, cnt;
+
+ db_printf("bqempty: %d\n", bqempty.bq_len);
+
+ for (i = 0; i < buf_domains; i++) {
+ bd = &bdomain[i];
+ db_printf("Buf domain %d\n", i);
+ db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers);
+ db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers);
+ db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers);
+ db_printf("\n");
+ db_printf("\tbufspace\t%ld\n", bd->bd_bufspace);
+ db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace);
+ db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace);
+ db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace);
+ db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh);
+ db_printf("\n");
+ db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers);
+ db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers);
+ db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers);
+ db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh);
+ db_printf("\n");
+ total = 0;
+ TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist)
+ total += bp->b_bufsize;
+ db_printf("\tcleanq count\t%d (%ld)\n",
+ bd->bd_cleanq->bq_len, total);
+ total = 0;
+ TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist)
+ total += bp->b_bufsize;
+ db_printf("\tdirtyq count\t%d (%ld)\n",
+ bd->bd_dirtyq.bq_len, total);
+ db_printf("\twakeup\t\t%d\n", bd->bd_wanted);
+ db_printf("\tlim\t\t%d\n", bd->bd_lim);
+ db_printf("\tCPU ");
+ for (j = 0; j <= mp_maxid; j++)
+ db_printf("%d, ", bd->bd_subq[j].bq_len);
+ db_printf("\n");
+ cnt = 0;
+ total = 0;
+ for (j = 0; j < nbuf; j++)
+ if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) {
+ cnt++;
+ total += buf[j].b_bufsize;
+ }
+ db_printf("\tLocked buffers: %d space %ld\n", cnt, total);
+ cnt = 0;
+ total = 0;
+ for (j = 0; j < nbuf; j++)
+ if (buf[j].b_domain == i) {
+ cnt++;
+ total += buf[j].b_bufsize;
+ }
+ db_printf("\tTotal buffers: %d space %ld\n", cnt, total);
+ }
+}
+
+DB_SHOW_COMMAND(lockedbufs, lockedbufs)
+{
+ struct buf *bp;
+ int i;
+
+ for (i = 0; i < nbuf; i++) {
+ bp = &buf[i];
+ if (BUF_ISLOCKED(bp)) {
+ db_show_buffer((uintptr_t)bp, 1, 0, NULL);
+ db_printf("\n");
+ if (db_pager_quit)
+ break;
+ }
+ }
+}
+
+DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
+{
+ struct vnode *vp;
+ struct buf *bp;
+
+ if (!have_addr) {
+ db_printf("usage: show vnodebufs <addr>\n");
+ return;
+ }
+ vp = (struct vnode *)addr;
+ db_printf("Clean buffers:\n");
+ TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
+ db_show_buffer((uintptr_t)bp, 1, 0, NULL);
+ db_printf("\n");
+ }
+ db_printf("Dirty buffers:\n");
+ TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
+ db_show_buffer((uintptr_t)bp, 1, 0, NULL);
+ db_printf("\n");
+ }
+}
+
+DB_COMMAND(countfreebufs, db_coundfreebufs)
+{
+ struct buf *bp;
+ int i, used = 0, nfree = 0;
+
+ if (have_addr) {
+ db_printf("usage: countfreebufs\n");
+ return;
+ }
+
+ for (i = 0; i < nbuf; i++) {
+ bp = &buf[i];
+ if (bp->b_qindex == QUEUE_EMPTY)
+ nfree++;
+ else
+ used++;
+ }
+
+ db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
+ nfree + used);
+ db_printf("numfreebuffers is %d\n", numfreebuffers);
+}
+#endif /* DDB */
diff --git a/freebsd/sys/kern/vfs_cache.c b/freebsd/sys/kern/vfs_cache.c
new file mode 100644
index 00000000..7c14b080
--- /dev/null
+++ b/freebsd/sys/kern/vfs_cache.c
@@ -0,0 +1,2604 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1989, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Poul-Henning Kamp of the FreeBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/filedesc.h>
+#include <sys/fnv_hash.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/fcntl.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/vnode.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <vm/uma.h>
+
+SDT_PROVIDER_DECLARE(vfs);
+SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
+ "struct vnode *");
+SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
+ "char *");
+SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
+SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
+ "char *", "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
+SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
+ "struct vnode *", "char *");
+SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
+ "struct vnode *");
+SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
+ "struct vnode *", "char *");
+SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
+ "char *");
+SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
+SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
+ "struct vnode *");
+SDT_PROBE_DEFINE3(vfs, namecache, zap_negative, done, "struct vnode *",
+ "char *", "int");
+SDT_PROBE_DEFINE3(vfs, namecache, shrink_negative, done, "struct vnode *",
+ "char *", "int");
+
+/*
+ * This structure describes the elements in the cache of recent
+ * names looked up by namei.
+ */
+
+struct namecache {
+ LIST_ENTRY(namecache) nc_hash; /* hash chain */
+ LIST_ENTRY(namecache) nc_src; /* source vnode list */
+ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
+ struct vnode *nc_dvp; /* vnode of parent of name */
+ union {
+ struct vnode *nu_vp; /* vnode the name refers to */
+ u_int nu_neghits; /* negative entry hits */
+ } n_un;
+ u_char nc_flag; /* flag bits */
+ u_char nc_nlen; /* length of name */
+ char nc_name[0]; /* segment name + nul */
+};
+
+/*
+ * struct namecache_ts repeats struct namecache layout up to the
+ * nc_nlen member.
+ * struct namecache_ts is used in place of struct namecache when time(s) need
+ * to be stored. The nc_dotdottime field is used when a cache entry is mapping
+ * both a non-dotdot directory name plus dotdot for the directory's
+ * parent.
+ */
+struct namecache_ts {
+ struct timespec nc_time; /* timespec provided by fs */
+ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
+ int nc_ticks; /* ticks value when entry was added */
+ struct namecache nc_nc;
+};
+
+#define nc_vp n_un.nu_vp
+#define nc_neghits n_un.nu_neghits
+
+/*
+ * Flags in namecache.nc_flag
+ */
+#define NCF_WHITE 0x01
+#define NCF_ISDOTDOT 0x02
+#define NCF_TS 0x04
+#define NCF_DTS 0x08
+#define NCF_DVDROP 0x10
+#define NCF_NEGATIVE 0x20
+#define NCF_HOTNEGATIVE 0x40
+
+/*
+ * Name caching works as follows:
+ *
+ * Names found by directory scans are retained in a cache
+ * for future reference. It is managed LRU, so frequently
+ * used names will hang around. Cache is indexed by hash value
+ * obtained from (dvp, name) where dvp refers to the directory
+ * containing name.
+ *
+ * If it is a "negative" entry (i.e. for a name that is known NOT to
+ * exist), the vnode pointer will be NULL.
+ *
+ * Upon reaching the last segment of a path, if the reference
+ * is for DELETE, or NOCACHE is set (rewrite), and the
+ * name is located in the cache, it will be dropped.
+ *
+ * These locks are used (in the order in which they can be taken):
+ * NAME TYPE ROLE
+ * vnodelock mtx vnode lists and v_cache_dd field protection
+ * bucketlock rwlock for access to given set of hash buckets
+ * neglist mtx negative entry LRU management
+ *
+ * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
+ * shrinking the LRU list.
+ *
+ * It is legal to take multiple vnodelock and bucketlock locks. The locking
+ * order is lower address first. Both are recursive.
+ *
+ * "." lookups are lockless.
+ *
+ * ".." and vnode -> name lookups require vnodelock.
+ *
+ * name -> vnode lookup requires the relevant bucketlock to be held for reading.
+ *
+ * Insertions and removals of entries require involved vnodes and bucketlocks
+ * to be write-locked to prevent other threads from seeing the entry.
+ *
+ * Some lookups result in removal of the found entry (e.g. getting rid of a
+ * negative entry with the intent to create a positive one), which poses a
+ * problem when multiple threads reach that state at the same time. Similarly,
+ * two different threads can purge two different vnodes and try to remove the
+ * same name.
+ *
+ * If the vnode lock already held is lower in the locking order than the second
+ * required lock, we can just take the other lock. However, in the opposite
+ * case this could deadlock. As such, the situation is resolved by trylocking
+ * the second lock and, if that fails, unlocking the first lock, locking
+ * everything in order and revalidating the state.
+ */
+
+/*
+ * Structures associated with name caching.
+ */
+#define NCHHASH(hash) \
+ (&nchashtbl[(hash) & nchash])
+static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
+static u_long __read_mostly nchash; /* size of hash table */
+SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
+ "Size of namecache hash table");
+static u_long __read_mostly ncnegfactor = 12; /* ratio of negative entries */
+SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
+ "Ratio of negative namecache entries");
+static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */
+SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
+ "Number of negative entries in namecache");
+static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */
+SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
+ "Number of namecache entries");
+static u_long __exclusive_cache_line numcachehv;/* number of cache entries with vnodes held */
+SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
+ "Number of namecache entries with vnodes held");
+u_int __read_mostly ncsizefactor = 2;
+SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
+ "Size factor for namecache");
+static u_int __read_mostly ncpurgeminvnodes;
+SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
+ "Number of vnodes below which purgevfs ignores the request");
+static u_int __read_mostly ncneghitsrequeue = 8;
+SYSCTL_UINT(_vfs, OID_AUTO, ncneghitsrequeue, CTLFLAG_RW, &ncneghitsrequeue, 0,
+ "Number of hits to requeue a negative entry in the LRU list");
+
+struct nchstats nchstats; /* cache effectiveness statistics */
+
+static struct mtx ncneg_shrink_lock;
+static int shrink_list_turn;
+
+struct neglist {
+ struct mtx nl_lock;
+ TAILQ_HEAD(, namecache) nl_list;
+} __aligned(CACHE_LINE_SIZE);
+
+static struct neglist __read_mostly *neglists;
+static struct neglist ncneg_hot;
+
+#define numneglists (ncneghash + 1)
+static u_int __read_mostly ncneghash;
+static inline struct neglist *
+NCP2NEGLIST(struct namecache *ncp)
+{
+
+ return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
+}
+
+#define numbucketlocks (ncbuckethash + 1)
+static u_int __read_mostly ncbuckethash;
+static struct rwlock_padalign __read_mostly *bucketlocks;
+#define HASH2BUCKETLOCK(hash) \
+ ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
+
+#define numvnodelocks (ncvnodehash + 1)
+static u_int __read_mostly ncvnodehash;
+static struct mtx __read_mostly *vnodelocks;
+static inline struct mtx *
+VP2VNODELOCK(struct vnode *vp)
+{
+
+ return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
+}
+
+/*
+ * UMA zones for the VFS cache.
+ *
+ * The small cache is used for entries with short names, which are the
+ * most common. The large cache is used for entries which are too big to
+ * fit in the small cache.
+ */
+static uma_zone_t __read_mostly cache_zone_small;
+static uma_zone_t __read_mostly cache_zone_small_ts;
+static uma_zone_t __read_mostly cache_zone_large;
+static uma_zone_t __read_mostly cache_zone_large_ts;
+
+#define CACHE_PATH_CUTOFF 35
+
+static struct namecache *
+cache_alloc(int len, int ts)
+{
+ struct namecache_ts *ncp_ts;
+ struct namecache *ncp;
+
+ if (__predict_false(ts)) {
+ if (len <= CACHE_PATH_CUTOFF)
+ ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK);
+ else
+ ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK);
+ ncp = &ncp_ts->nc_nc;
+ } else {
+ if (len <= CACHE_PATH_CUTOFF)
+ ncp = uma_zalloc(cache_zone_small, M_WAITOK);
+ else
+ ncp = uma_zalloc(cache_zone_large, M_WAITOK);
+ }
+ return (ncp);
+}
+
+static void
+cache_free(struct namecache *ncp)
+{
+ struct namecache_ts *ncp_ts;
+
+ if (ncp == NULL)
+ return;
+ if ((ncp->nc_flag & NCF_DVDROP) != 0)
+ vdrop(ncp->nc_dvp);
+ if (__predict_false(ncp->nc_flag & NCF_TS)) {
+ ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
+ if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
+ uma_zfree(cache_zone_small_ts, ncp_ts);
+ else
+ uma_zfree(cache_zone_large_ts, ncp_ts);
+ } else {
+ if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
+ uma_zfree(cache_zone_small, ncp);
+ else
+ uma_zfree(cache_zone_large, ncp);
+ }
+}
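+
+/*
+ * Sketch (assumption) of how the small/large zones above are presumably
+ * sized: the "small" zones reserve room for names up to CACHE_PATH_CUTOFF
+ * plus the terminating nul, while the "large" zones cover NAME_MAX.  The
+ * real initialization is done elsewhere in this file; the zone names and
+ * flags below are illustrative only.
+ */
+static void
+cache_zones_sketch(void)
+{
+
+ cache_zone_small = uma_zcreate("S VFS Cache",
+ sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+ cache_zone_large = uma_zcreate("L VFS Cache",
+ sizeof(struct namecache) + NAME_MAX + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+}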
+
+static void
+cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
+{
+ struct namecache_ts *ncp_ts;
+
+ KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
+ (tsp == NULL && ticksp == NULL),
+ ("No NCF_TS"));
+
+ if (tsp == NULL && ticksp == NULL)
+ return;
+
+ ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
+ if (tsp != NULL)
+ *tsp = ncp_ts->nc_time;
+ if (ticksp != NULL)
+ *ticksp = ncp_ts->nc_ticks;
+}
+
+static int __read_mostly doingcache = 1; /* 1 => enable the cache */
+SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
+ "VFS namecache enabled");
+
+/* Export size information to userland */
+SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
+ sizeof(struct namecache), "sizeof(struct namecache)");
+
+/*
+ * The new name cache statistics
+ */
+static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
+ "Name cache statistics");
+#define STATNODE_ULONG(name, descr) \
+ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
+#define STATNODE_COUNTER(name, descr) \
+ static counter_u64_t __read_mostly name; \
+ SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
+STATNODE_ULONG(numneg, "Number of negative cache entries");
+STATNODE_ULONG(numcache, "Number of cache entries");
+STATNODE_COUNTER(numcalls, "Number of cache lookups");
+STATNODE_COUNTER(dothits, "Number of '.' hits");
+STATNODE_COUNTER(dotdothits, "Number of '..' hits");
+STATNODE_COUNTER(numchecks, "Number of checks in lookup");
+STATNODE_COUNTER(nummiss, "Number of cache misses");
+STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
+STATNODE_COUNTER(numposzaps,
+ "Number of cache hits (positive) we do not want to cache");
+STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
+STATNODE_COUNTER(numnegzaps,
+ "Number of cache hits (negative) we do not want to cache");
+STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
+/* These count for kern___getcwd(), too. */
+STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
+STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
+STATNODE_COUNTER(numfullpathfail2,
+ "Number of fullpath search errors (VOP_VPTOCNP failures)");
+STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
+STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
+static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
+ "Number of times zap_and_exit failed to lock");
+static long cache_lock_vnodes_cel_3_failures;
+STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
+ "Number of times 3-way vnode locking failed");
+
+static void cache_zap_locked(struct namecache *ncp, bool neg_locked);
+static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
+ char *buf, char **retbuf, u_int buflen);
+
+static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
+
+static int cache_yield;
+SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
+ "Number of times cache called yield");
+
+static void
+cache_maybe_yield(void)
+{
+
+ if (should_yield()) {
+ cache_yield++;
+ kern_yield(PRI_USER);
+ }
+}
+
+static inline void
+cache_assert_vlp_locked(struct mtx *vlp)
+{
+
+ if (vlp != NULL)
+ mtx_assert(vlp, MA_OWNED);
+}
+
+static inline void
+cache_assert_vnode_locked(struct vnode *vp)
+{
+ struct mtx *vlp;
+
+ vlp = VP2VNODELOCK(vp);
+ cache_assert_vlp_locked(vlp);
+}
+
+static uint32_t
+cache_get_hash(char *name, u_char len, struct vnode *dvp)
+{
+ uint32_t hash;
+
+ hash = fnv_32_buf(name, len, FNV1_32_INIT);
+ hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
+ return (hash);
+}
+
+static inline struct rwlock *
+NCP2BUCKETLOCK(struct namecache *ncp)
+{
+ uint32_t hash;
+
+ hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
+ return (HASH2BUCKETLOCK(hash));
+}
+
+#ifdef INVARIANTS
+static void
+cache_assert_bucket_locked(struct namecache *ncp, int mode)
+{
+ struct rwlock *blp;
+
+ blp = NCP2BUCKETLOCK(ncp);
+ rw_assert(blp, mode);
+}
+#else
+#define cache_assert_bucket_locked(x, y) do { } while (0)
+#endif
+
+#define cache_sort(x, y) _cache_sort((void **)(x), (void **)(y))
+static void
+_cache_sort(void **p1, void **p2)
+{
+ void *tmp;
+
+ if (*p1 > *p2) {
+ tmp = *p2;
+ *p2 = *p1;
+ *p1 = tmp;
+ }
+}
+
+static void
+cache_lock_all_buckets(void)
+{
+ u_int i;
+
+ for (i = 0; i < numbucketlocks; i++)
+ rw_wlock(&bucketlocks[i]);
+}
+
+static void
+cache_unlock_all_buckets(void)
+{
+ u_int i;
+
+ for (i = 0; i < numbucketlocks; i++)
+ rw_wunlock(&bucketlocks[i]);
+}
+
+static void
+cache_lock_all_vnodes(void)
+{
+ u_int i;
+
+ for (i = 0; i < numvnodelocks; i++)
+ mtx_lock(&vnodelocks[i]);
+}
+
+static void
+cache_unlock_all_vnodes(void)
+{
+ u_int i;
+
+ for (i = 0; i < numvnodelocks; i++)
+ mtx_unlock(&vnodelocks[i]);
+}
+
+static int
+cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
+{
+
+ cache_sort(&vlp1, &vlp2);
+ MPASS(vlp2 != NULL);
+
+ if (vlp1 != NULL) {
+ if (!mtx_trylock(vlp1))
+ return (EAGAIN);
+ }
+ if (!mtx_trylock(vlp2)) {
+ if (vlp1 != NULL)
+ mtx_unlock(vlp1);
+ return (EAGAIN);
+ }
+
+ return (0);
+}
+
+static void
+cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
+{
+
+ MPASS(vlp1 != NULL || vlp2 != NULL);
+
+ if (vlp1 != NULL)
+ mtx_unlock(vlp1);
+ if (vlp2 != NULL)
+ mtx_unlock(vlp2);
+}
+
+static int
+sysctl_nchstats(SYSCTL_HANDLER_ARGS)
+{
+ struct nchstats snap;
+
+ if (req->oldptr == NULL)
+ return (SYSCTL_OUT(req, 0, sizeof(snap)));
+
+ snap = nchstats;
+ snap.ncs_goodhits = counter_u64_fetch(numposhits);
+ snap.ncs_neghits = counter_u64_fetch(numneghits);
+ snap.ncs_badhits = counter_u64_fetch(numposzaps) +
+ counter_u64_fetch(numnegzaps);
+ snap.ncs_miss = counter_u64_fetch(nummisszap) +
+ counter_u64_fetch(nummiss);
+
+ return (SYSCTL_OUT(req, &snap, sizeof(snap)));
+}
+SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
+ "VFS cache effectiveness statistics");
+
+#ifdef DIAGNOSTIC
+/*
+ * Grab an atomic snapshot of the name cache hash chain lengths
+ */
+static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
+ "hash table stats");
+
+static int
+sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
+{
+ struct nchashhead *ncpp;
+ struct namecache *ncp;
+ int i, error, n_nchash, *cntbuf;
+
+retry:
+ n_nchash = nchash + 1; /* nchash is max index, not count */
+ if (req->oldptr == NULL)
+ return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
+ cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
+ cache_lock_all_buckets();
+ if (n_nchash != nchash + 1) {
+ cache_unlock_all_buckets();
+ free(cntbuf, M_TEMP);
+ goto retry;
+ }
+ /* Scan hash tables counting entries */
+ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
+ LIST_FOREACH(ncp, ncpp, nc_hash)
+ cntbuf[i]++;
+ cache_unlock_all_buckets();
+ for (error = 0, i = 0; i < n_nchash; i++)
+ if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
+ break;
+ free(cntbuf, M_TEMP);
+ return (error);
+}
+SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
+ CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
+ "nchash chain lengths");
+
+static int
+sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct nchashhead *ncpp;
+ struct namecache *ncp;
+ int n_nchash;
+ int count, maxlength, used, pct;
+
+ if (!req->oldptr)
+ return SYSCTL_OUT(req, 0, 4 * sizeof(int));
+
+ cache_lock_all_buckets();
+ n_nchash = nchash + 1; /* nchash is max index, not count */
+ used = 0;
+ maxlength = 0;
+
+ /* Scan hash tables for applicable entries */
+ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
+ count = 0;
+ LIST_FOREACH(ncp, ncpp, nc_hash) {
+ count++;
+ }
+ if (count)
+ used++;
+ if (maxlength < count)
+ maxlength = count;
+ }
+ n_nchash = nchash + 1;
+ cache_unlock_all_buckets();
+ pct = (used * 100) / (n_nchash / 100);
+ error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, &used, sizeof(used));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, &pct, sizeof(pct));
+ if (error)
+ return (error);
+ return (0);
+}
+SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
+ CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
+ "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
+#endif
+
+/*
+ * Negative entries management
+ *
+ * A variation of the LRU scheme is used. New entries are hashed into one of
+ * numneglists cold lists. Entries get promoted to the hot list on their first
+ * hit. Partial LRU ordering for the hot list is maintained by requeueing an
+ * entry every ncneghitsrequeue hits.
+ *
+ * The shrinker will demote hot list head and evict from the cold list in a
+ * round-robin manner.
+ */
+static void
+cache_negative_hit(struct namecache *ncp)
+{
+ struct neglist *neglist;
+ u_int hits;
+
+ MPASS(ncp->nc_flag & NCF_NEGATIVE);
+ hits = atomic_fetchadd_int(&ncp->nc_neghits, 1);
+ if (ncp->nc_flag & NCF_HOTNEGATIVE) {
+ if ((hits % ncneghitsrequeue) != 0)
+ return;
+ mtx_lock(&ncneg_hot.nl_lock);
+ if (ncp->nc_flag & NCF_HOTNEGATIVE) {
+ TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
+ TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
+ mtx_unlock(&ncneg_hot.nl_lock);
+ return;
+ }
+ /*
+ * The shrinker cleared the flag and removed the entry from
+ * the hot list. Put it back.
+ */
+ } else {
+ mtx_lock(&ncneg_hot.nl_lock);
+ }
+ neglist = NCP2NEGLIST(ncp);
+ mtx_lock(&neglist->nl_lock);
+ if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
+ TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
+ TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
+ ncp->nc_flag |= NCF_HOTNEGATIVE;
+ }
+ mtx_unlock(&neglist->nl_lock);
+ mtx_unlock(&ncneg_hot.nl_lock);
+}
+
+static void
+cache_negative_insert(struct namecache *ncp, bool neg_locked)
+{
+ struct neglist *neglist;
+
+ MPASS(ncp->nc_flag & NCF_NEGATIVE);
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
+ neglist = NCP2NEGLIST(ncp);
+ if (!neg_locked) {
+ mtx_lock(&neglist->nl_lock);
+ } else {
+ mtx_assert(&neglist->nl_lock, MA_OWNED);
+ }
+ TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
+ if (!neg_locked)
+ mtx_unlock(&neglist->nl_lock);
+ atomic_add_rel_long(&numneg, 1);
+}
+
+static void
+cache_negative_remove(struct namecache *ncp, bool neg_locked)
+{
+ struct neglist *neglist;
+ bool hot_locked = false;
+ bool list_locked = false;
+
+ MPASS(ncp->nc_flag & NCF_NEGATIVE);
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
+ neglist = NCP2NEGLIST(ncp);
+ if (!neg_locked) {
+ if (ncp->nc_flag & NCF_HOTNEGATIVE) {
+ hot_locked = true;
+ mtx_lock(&ncneg_hot.nl_lock);
+ if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
+ list_locked = true;
+ mtx_lock(&neglist->nl_lock);
+ }
+ } else {
+ list_locked = true;
+ mtx_lock(&neglist->nl_lock);
+ }
+ }
+ if (ncp->nc_flag & NCF_HOTNEGATIVE) {
+ mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
+ TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
+ } else {
+ mtx_assert(&neglist->nl_lock, MA_OWNED);
+ TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
+ }
+ if (list_locked)
+ mtx_unlock(&neglist->nl_lock);
+ if (hot_locked)
+ mtx_unlock(&ncneg_hot.nl_lock);
+ atomic_subtract_rel_long(&numneg, 1);
+}
+
+static void
+cache_negative_shrink_select(int start, struct namecache **ncpp,
+ struct neglist **neglistpp)
+{
+ struct neglist *neglist;
+ struct namecache *ncp;
+ int i;
+
+ *ncpp = ncp = NULL;
+ neglist = NULL;
+
+ for (i = start; i < numneglists; i++) {
+ neglist = &neglists[i];
+ if (TAILQ_FIRST(&neglist->nl_list) == NULL)
+ continue;
+ mtx_lock(&neglist->nl_lock);
+ ncp = TAILQ_FIRST(&neglist->nl_list);
+ if (ncp != NULL)
+ break;
+ mtx_unlock(&neglist->nl_lock);
+ }
+
+ *neglistpp = neglist;
+ *ncpp = ncp;
+}
+
+static void
+cache_negative_zap_one(void)
+{
+ struct namecache *ncp, *ncp2;
+ struct neglist *neglist;
+ struct mtx *dvlp;
+ struct rwlock *blp;
+
+ if (!mtx_trylock(&ncneg_shrink_lock))
+ return;
+
+ mtx_lock(&ncneg_hot.nl_lock);
+ ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
+ if (ncp != NULL) {
+ neglist = NCP2NEGLIST(ncp);
+ mtx_lock(&neglist->nl_lock);
+ TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
+ TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
+ ncp->nc_flag &= ~NCF_HOTNEGATIVE;
+ mtx_unlock(&neglist->nl_lock);
+ }
+
+ cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
+ shrink_list_turn++;
+ if (shrink_list_turn == numneglists)
+ shrink_list_turn = 0;
+ if (ncp == NULL && shrink_list_turn == 0)
+ cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
+ if (ncp == NULL) {
+ mtx_unlock(&ncneg_hot.nl_lock);
+ goto out;
+ }
+
+ MPASS(ncp->nc_flag & NCF_NEGATIVE);
+ dvlp = VP2VNODELOCK(ncp->nc_dvp);
+ blp = NCP2BUCKETLOCK(ncp);
+ mtx_unlock(&neglist->nl_lock);
+ mtx_unlock(&ncneg_hot.nl_lock);
+ mtx_lock(dvlp);
+ rw_wlock(blp);
+ mtx_lock(&neglist->nl_lock);
+ ncp2 = TAILQ_FIRST(&neglist->nl_list);
+ if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
+ blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) {
+ ncp = NULL;
+ goto out_unlock_all;
+ }
+ SDT_PROBE3(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
+ ncp->nc_name, ncp->nc_neghits);
+
+ cache_zap_locked(ncp, true);
+out_unlock_all:
+ mtx_unlock(&neglist->nl_lock);
+ rw_wunlock(blp);
+ mtx_unlock(dvlp);
+out:
+ mtx_unlock(&ncneg_shrink_lock);
+ cache_free(ncp);
+}
+
+/*
+ * cache_zap_locked():
+ *
+ * Removes a namecache entry from cache, whether it contains an actual
+ * pointer to a vnode or if it is just a negative cache entry.
+ */
+static void
+cache_zap_locked(struct namecache *ncp, bool neg_locked)
+{
+
+ if (!(ncp->nc_flag & NCF_NEGATIVE))
+ cache_assert_vnode_locked(ncp->nc_vp);
+ cache_assert_vnode_locked(ncp->nc_dvp);
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
+
+ CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
+ (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
+ if (!(ncp->nc_flag & NCF_NEGATIVE)) {
+ SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
+ ncp->nc_name, ncp->nc_vp);
+ } else {
+ SDT_PROBE3(vfs, namecache, zap_negative, done, ncp->nc_dvp,
+ ncp->nc_name, ncp->nc_neghits);
+ }
+ LIST_REMOVE(ncp, nc_hash);
+ if (!(ncp->nc_flag & NCF_NEGATIVE)) {
+ TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
+ if (ncp == ncp->nc_vp->v_cache_dd)
+ ncp->nc_vp->v_cache_dd = NULL;
+ } else {
+ cache_negative_remove(ncp, neg_locked);
+ }
+ if (ncp->nc_flag & NCF_ISDOTDOT) {
+ if (ncp == ncp->nc_dvp->v_cache_dd)
+ ncp->nc_dvp->v_cache_dd = NULL;
+ } else {
+ LIST_REMOVE(ncp, nc_src);
+ if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
+ ncp->nc_flag |= NCF_DVDROP;
+ atomic_subtract_rel_long(&numcachehv, 1);
+ }
+ }
+ atomic_subtract_rel_long(&numcache, 1);
+}
+
+static void
+cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
+{
+ struct rwlock *blp;
+
+ MPASS(ncp->nc_dvp == vp);
+ MPASS(ncp->nc_flag & NCF_NEGATIVE);
+ cache_assert_vnode_locked(vp);
+
+ blp = NCP2BUCKETLOCK(ncp);
+ rw_wlock(blp);
+ cache_zap_locked(ncp, false);
+ rw_wunlock(blp);
+}
+
+static bool
+cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
+ struct mtx **vlpp)
+{
+ struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
+ struct rwlock *blp;
+
+ MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
+ cache_assert_vnode_locked(vp);
+
+ if (ncp->nc_flag & NCF_NEGATIVE) {
+ if (*vlpp != NULL) {
+ mtx_unlock(*vlpp);
+ *vlpp = NULL;
+ }
+ cache_zap_negative_locked_vnode_kl(ncp, vp);
+ return (true);
+ }
+
+ pvlp = VP2VNODELOCK(vp);
+ blp = NCP2BUCKETLOCK(ncp);
+ vlp1 = VP2VNODELOCK(ncp->nc_dvp);
+ vlp2 = VP2VNODELOCK(ncp->nc_vp);
+
+ if (*vlpp == vlp1 || *vlpp == vlp2) {
+ to_unlock = *vlpp;
+ *vlpp = NULL;
+ } else {
+ if (*vlpp != NULL) {
+ mtx_unlock(*vlpp);
+ *vlpp = NULL;
+ }
+ cache_sort(&vlp1, &vlp2);
+ if (vlp1 == pvlp) {
+ mtx_lock(vlp2);
+ to_unlock = vlp2;
+ } else {
+ if (!mtx_trylock(vlp1))
+ goto out_relock;
+ to_unlock = vlp1;
+ }
+ }
+ rw_wlock(blp);
+ cache_zap_locked(ncp, false);
+ rw_wunlock(blp);
+ if (to_unlock != NULL)
+ mtx_unlock(to_unlock);
+ return (true);
+
+out_relock:
+ mtx_unlock(vlp2);
+ mtx_lock(vlp1);
+ mtx_lock(vlp2);
+ MPASS(*vlpp == NULL);
+ *vlpp = vlp1;
+ return (false);
+}
+
+static int
+cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
+{
+ struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
+ struct rwlock *blp;
+ int error = 0;
+
+ MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
+ cache_assert_vnode_locked(vp);
+
+ pvlp = VP2VNODELOCK(vp);
+ if (ncp->nc_flag & NCF_NEGATIVE) {
+ cache_zap_negative_locked_vnode_kl(ncp, vp);
+ goto out;
+ }
+
+ blp = NCP2BUCKETLOCK(ncp);
+ vlp1 = VP2VNODELOCK(ncp->nc_dvp);
+ vlp2 = VP2VNODELOCK(ncp->nc_vp);
+ cache_sort(&vlp1, &vlp2);
+ if (vlp1 == pvlp) {
+ mtx_lock(vlp2);
+ to_unlock = vlp2;
+ } else {
+ if (!mtx_trylock(vlp1)) {
+ error = EAGAIN;
+ goto out;
+ }
+ to_unlock = vlp1;
+ }
+ rw_wlock(blp);
+ cache_zap_locked(ncp, false);
+ rw_wunlock(blp);
+ mtx_unlock(to_unlock);
+out:
+ mtx_unlock(pvlp);
+ return (error);
+}
+
+static int
+cache_zap_wlocked_bucket(struct namecache *ncp, struct rwlock *blp)
+{
+ struct mtx *dvlp, *vlp;
+
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
+
+ dvlp = VP2VNODELOCK(ncp->nc_dvp);
+ vlp = NULL;
+ if (!(ncp->nc_flag & NCF_NEGATIVE))
+ vlp = VP2VNODELOCK(ncp->nc_vp);
+ if (cache_trylock_vnodes(dvlp, vlp) == 0) {
+ cache_zap_locked(ncp, false);
+ rw_wunlock(blp);
+ cache_unlock_vnodes(dvlp, vlp);
+ return (0);
+ }
+
+ rw_wunlock(blp);
+ return (EAGAIN);
+}
+
+static int
+cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp)
+{
+ struct mtx *dvlp, *vlp;
+
+ cache_assert_bucket_locked(ncp, RA_RLOCKED);
+
+ dvlp = VP2VNODELOCK(ncp->nc_dvp);
+ vlp = NULL;
+ if (!(ncp->nc_flag & NCF_NEGATIVE))
+ vlp = VP2VNODELOCK(ncp->nc_vp);
+ if (cache_trylock_vnodes(dvlp, vlp) == 0) {
+ rw_runlock(blp);
+ rw_wlock(blp);
+ cache_zap_locked(ncp, false);
+ rw_wunlock(blp);
+ cache_unlock_vnodes(dvlp, vlp);
+ return (0);
+ }
+
+ rw_runlock(blp);
+ return (EAGAIN);
+}
+
+static int
+cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
+ struct mtx **vlpp1, struct mtx **vlpp2)
+{
+ struct mtx *dvlp, *vlp;
+
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
+
+ dvlp = VP2VNODELOCK(ncp->nc_dvp);
+ vlp = NULL;
+ if (!(ncp->nc_flag & NCF_NEGATIVE))
+ vlp = VP2VNODELOCK(ncp->nc_vp);
+ cache_sort(&dvlp, &vlp);
+
+ if (*vlpp1 == dvlp && *vlpp2 == vlp) {
+ cache_zap_locked(ncp, false);
+ cache_unlock_vnodes(dvlp, vlp);
+ *vlpp1 = NULL;
+ *vlpp2 = NULL;
+ return (0);
+ }
+
+ if (*vlpp1 != NULL)
+ mtx_unlock(*vlpp1);
+ if (*vlpp2 != NULL)
+ mtx_unlock(*vlpp2);
+ *vlpp1 = NULL;
+ *vlpp2 = NULL;
+
+ if (cache_trylock_vnodes(dvlp, vlp) == 0) {
+ cache_zap_locked(ncp, false);
+ cache_unlock_vnodes(dvlp, vlp);
+ return (0);
+ }
+
+ rw_wunlock(blp);
+ *vlpp1 = dvlp;
+ *vlpp2 = vlp;
+ if (*vlpp1 != NULL)
+ mtx_lock(*vlpp1);
+ mtx_lock(*vlpp2);
+ rw_wlock(blp);
+ return (EAGAIN);
+}
+
+static void
+cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
+{
+
+ if (blp != NULL) {
+ rw_runlock(blp);
+ } else {
+ mtx_unlock(vlp);
+ }
+}
+
+static int __noinline
+cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
+ struct timespec *tsp, int *ticksp)
+{
+ int ltype;
+
+ *vpp = dvp;
+ CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
+ dvp, cnp->cn_nameptr);
+ counter_u64_add(dothits, 1);
+ SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
+ if (tsp != NULL)
+ timespecclear(tsp);
+ if (ticksp != NULL)
+ *ticksp = ticks;
+ vrefact(*vpp);
+ /*
+ * When we look up "." we can still be asked to lock it
+ * differently...
+ */
+ ltype = cnp->cn_lkflags & LK_TYPE_MASK;
+ if (ltype != VOP_ISLOCKED(*vpp)) {
+ if (ltype == LK_EXCLUSIVE) {
+ vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
+ if ((*vpp)->v_iflag & VI_DOOMED) {
+ /* forced unmount */
+ vrele(*vpp);
+ *vpp = NULL;
+ return (ENOENT);
+ }
+ } else
+ vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
+ }
+ return (-1);
+}
+
+static __noinline int
+cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
+ struct componentname *cnp, struct timespec *tsp, int *ticksp)
+{
+ struct namecache *ncp;
+ struct rwlock *blp;
+ struct mtx *dvlp, *dvlp2;
+ uint32_t hash;
+ int error;
+
+ if (cnp->cn_namelen == 2 &&
+ cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
+ counter_u64_add(dotdothits, 1);
+ dvlp = VP2VNODELOCK(dvp);
+ dvlp2 = NULL;
+ mtx_lock(dvlp);
+retry_dotdot:
+ ncp = dvp->v_cache_dd;
+ if (ncp == NULL) {
+ SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
+ "..", NULL);
+ mtx_unlock(dvlp);
+ if (dvlp2 != NULL)
+ mtx_unlock(dvlp2);
+ return (0);
+ }
+ if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
+ if (ncp->nc_dvp != dvp)
+ panic("dvp %p v_cache_dd %p\n", dvp, ncp);
+ if (!cache_zap_locked_vnode_kl2(ncp,
+ dvp, &dvlp2))
+ goto retry_dotdot;
+ MPASS(dvp->v_cache_dd == NULL);
+ mtx_unlock(dvlp);
+ if (dvlp2 != NULL)
+ mtx_unlock(dvlp2);
+ cache_free(ncp);
+ } else {
+ dvp->v_cache_dd = NULL;
+ mtx_unlock(dvlp);
+ if (dvlp2 != NULL)
+ mtx_unlock(dvlp2);
+ }
+ return (0);
+ }
+
+ hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+ blp = HASH2BUCKETLOCK(hash);
+retry:
+ if (LIST_EMPTY(NCHHASH(hash)))
+ goto out_no_entry;
+
+ rw_wlock(blp);
+
+ LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+ counter_u64_add(numchecks, 1);
+ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
+ !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
+ break;
+ }
+
+ /* We failed to find an entry */
+ if (ncp == NULL) {
+ rw_wunlock(blp);
+ goto out_no_entry;
+ }
+
+ counter_u64_add(numposzaps, 1);
+
+ error = cache_zap_wlocked_bucket(ncp, blp);
+ if (error != 0) {
+ zap_and_exit_bucket_fail++;
+ cache_maybe_yield();
+ goto retry;
+ }
+ cache_free(ncp);
+ return (0);
+out_no_entry:
+ SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
+ counter_u64_add(nummisszap, 1);
+ return (0);
+}
+
+/**
+ * Lookup a name in the name cache
+ *
+ * # Arguments
+ *
+ * - dvp: Parent directory in which to search.
+ * - vpp: Return argument. Will contain desired vnode on cache hit.
+ * - cnp: Parameters of the name search. The most interesting bits of
+ * the cn_flags field have the following meanings:
+ * - MAKEENTRY: If clear, free an entry from the cache rather than look
+ * it up.
+ * - ISDOTDOT: Must be set if and only if cn_nameptr == ".."
+ * - tsp: Return storage for cache timestamp. On a successful (positive
+ * or negative) lookup, tsp will be filled with any timespec that
+ * was stored when this cache entry was created. However, it will
+ * be clear for "." entries.
+ * - ticksp: Return storage for alternate cache timestamp. On a successful
+ * (positive or negative) lookup, it will contain the ticks value
+ * that was current when the cache entry was created, unless cnp
+ * was ".".
+ *
+ * # Returns
+ *
+ * - -1: A positive cache hit. vpp will contain the desired vnode.
+ * - ENOENT: A negative cache hit, or dvp was recycled out from under us due
+ * to a forced unmount. vpp will not be modified. If the entry
+ * is a whiteout, then the ISWHITEOUT flag will be set in
+ * cnp->cn_flags.
+ * - 0: A cache miss. vpp will not be modified.
+ *
+ * # Locking
+ *
+ * On a cache hit, vpp will be returned locked and ref'd. If we're looking up
+ * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the
+ * lock is not recursively acquired.
+ */
+int
+cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
+ struct timespec *tsp, int *ticksp)
+{
+ struct namecache_ts *ncp_ts;
+ struct namecache *ncp;
+ struct rwlock *blp;
+ struct mtx *dvlp;
+ uint32_t hash;
+ int error, ltype;
+
+ if (__predict_false(!doingcache)) {
+ cnp->cn_flags &= ~MAKEENTRY;
+ return (0);
+ }
+
+ counter_u64_add(numcalls, 1);
+
+ if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
+ return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
+
+ if ((cnp->cn_flags & MAKEENTRY) == 0)
+ return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
+
+retry:
+ blp = NULL;
+ dvlp = NULL;
+ error = 0;
+ if (cnp->cn_namelen == 2 &&
+ cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
+ counter_u64_add(dotdothits, 1);
+ dvlp = VP2VNODELOCK(dvp);
+ mtx_lock(dvlp);
+ ncp = dvp->v_cache_dd;
+ if (ncp == NULL) {
+ SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
+ "..", NULL);
+ mtx_unlock(dvlp);
+ return (0);
+ }
+ if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
+ if (ncp->nc_flag & NCF_NEGATIVE)
+ *vpp = NULL;
+ else
+ *vpp = ncp->nc_vp;
+ } else
+ *vpp = ncp->nc_dvp;
+ /* Return failure if negative entry was found. */
+ if (*vpp == NULL)
+ goto negative_success;
+ CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
+ dvp, cnp->cn_nameptr, *vpp);
+ SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
+ *vpp);
+ cache_out_ts(ncp, tsp, ticksp);
+ if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
+ NCF_DTS && tsp != NULL) {
+ ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
+ *tsp = ncp_ts->nc_dotdottime;
+ }
+ goto success;
+ }
+
+ hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+ blp = HASH2BUCKETLOCK(hash);
+ rw_rlock(blp);
+
+ LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+ counter_u64_add(numchecks, 1);
+ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
+ !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
+ break;
+ }
+
+ /* We failed to find an entry */
+ if (ncp == NULL) {
+ rw_runlock(blp);
+ SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
+ NULL);
+ counter_u64_add(nummiss, 1);
+ return (0);
+ }
+
+ /* We found a "positive" match, return the vnode */
+ if (!(ncp->nc_flag & NCF_NEGATIVE)) {
+ counter_u64_add(numposhits, 1);
+ *vpp = ncp->nc_vp;
+ CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
+ dvp, cnp->cn_nameptr, *vpp, ncp);
+ SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
+ *vpp);
+ cache_out_ts(ncp, tsp, ticksp);
+ goto success;
+ }
+
+negative_success:
+ /* We found a negative match, and want to create it, so purge */
+ if (cnp->cn_nameiop == CREATE) {
+ counter_u64_add(numnegzaps, 1);
+ goto zap_and_exit;
+ }
+
+ counter_u64_add(numneghits, 1);
+ cache_negative_hit(ncp);
+ if (ncp->nc_flag & NCF_WHITE)
+ cnp->cn_flags |= ISWHITEOUT;
+ SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
+ ncp->nc_name);
+ cache_out_ts(ncp, tsp, ticksp);
+ cache_lookup_unlock(blp, dvlp);
+ return (ENOENT);
+
+success:
+ /*
+ * On success we return a locked and ref'd vnode as per the lookup
+ * protocol.
+ */
+ MPASS(dvp != *vpp);
+ ltype = 0; /* silence gcc warning */
+ if (cnp->cn_flags & ISDOTDOT) {
+ ltype = VOP_ISLOCKED(dvp);
+ VOP_UNLOCK(dvp, 0);
+ }
+ vhold(*vpp);
+ cache_lookup_unlock(blp, dvlp);
+ error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
+ if (cnp->cn_flags & ISDOTDOT) {
+ vn_lock(dvp, ltype | LK_RETRY);
+ if (dvp->v_iflag & VI_DOOMED) {
+ if (error == 0)
+ vput(*vpp);
+ *vpp = NULL;
+ return (ENOENT);
+ }
+ }
+ if (error) {
+ *vpp = NULL;
+ goto retry;
+ }
+ if ((cnp->cn_flags & ISLASTCN) &&
+ (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
+ ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
+ }
+ return (-1);
+
+zap_and_exit:
+ if (blp != NULL)
+ error = cache_zap_rlocked_bucket(ncp, blp);
+ else
+ error = cache_zap_locked_vnode(ncp, dvp);
+ if (error != 0) {
+ zap_and_exit_bucket_fail++;
+ cache_maybe_yield();
+ goto retry;
+ }
+ cache_free(ncp);
+ return (0);
+}
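+
+/*
+ * Sketch (assumption) of how a lookup VOP typically consumes the return
+ * values documented above: -1 signals a positive hit, ENOENT a negative
+ * hit, and 0 a miss that must fall through to a real directory scan.
+ * The myfs_lookup() name is hypothetical; the stock path in FreeBSD is
+ * vfs_cache_lookup() together with VOP_CACHEDLOOKUP().
+ */
+static int
+myfs_lookup(struct vop_lookup_args *ap)
+{
+ int error;
+
+ error = cache_lookup(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL, NULL);
+ switch (error) {
+ case -1: /* Positive hit, *a_vpp is locked and referenced. */
+ return (0);
+ case ENOENT: /* Negative hit (or dvp was doomed). */
+ return (ENOENT);
+ default: /* Miss: scan the directory for real. */
+ return (VOP_CACHEDLOOKUP(ap->a_dvp, ap->a_vpp, ap->a_cnp));
+ }
+}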
+
+struct celockstate {
+ struct mtx *vlp[3];
+ struct rwlock *blp[2];
+};
+CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
+CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
+
+static inline void
+cache_celockstate_init(struct celockstate *cel)
+{
+
+ bzero(cel, sizeof(*cel));
+}
+
+static void
+cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
+ struct vnode *dvp)
+{
+ struct mtx *vlp1, *vlp2;
+
+ MPASS(cel->vlp[0] == NULL);
+ MPASS(cel->vlp[1] == NULL);
+ MPASS(cel->vlp[2] == NULL);
+
+ MPASS(vp != NULL || dvp != NULL);
+
+ vlp1 = VP2VNODELOCK(vp);
+ vlp2 = VP2VNODELOCK(dvp);
+ cache_sort(&vlp1, &vlp2);
+
+ if (vlp1 != NULL) {
+ mtx_lock(vlp1);
+ cel->vlp[0] = vlp1;
+ }
+ mtx_lock(vlp2);
+ cel->vlp[1] = vlp2;
+}
+
+static void
+cache_unlock_vnodes_cel(struct celockstate *cel)
+{
+
+ MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
+
+ if (cel->vlp[0] != NULL)
+ mtx_unlock(cel->vlp[0]);
+ if (cel->vlp[1] != NULL)
+ mtx_unlock(cel->vlp[1]);
+ if (cel->vlp[2] != NULL)
+ mtx_unlock(cel->vlp[2]);
+}
+
+static bool
+cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
+{
+ struct mtx *vlp;
+ bool ret;
+
+ cache_assert_vlp_locked(cel->vlp[0]);
+ cache_assert_vlp_locked(cel->vlp[1]);
+ MPASS(cel->vlp[2] == NULL);
+
+ MPASS(vp != NULL);
+ vlp = VP2VNODELOCK(vp);
+
+ ret = true;
+ if (vlp >= cel->vlp[1]) {
+ mtx_lock(vlp);
+ } else {
+ if (mtx_trylock(vlp))
+ goto out;
+ cache_lock_vnodes_cel_3_failures++;
+ cache_unlock_vnodes_cel(cel);
+ if (vlp < cel->vlp[0]) {
+ mtx_lock(vlp);
+ mtx_lock(cel->vlp[0]);
+ mtx_lock(cel->vlp[1]);
+ } else {
+ if (cel->vlp[0] != NULL)
+ mtx_lock(cel->vlp[0]);
+ mtx_lock(vlp);
+ mtx_lock(cel->vlp[1]);
+ }
+ ret = false;
+ }
+out:
+ cel->vlp[2] = vlp;
+ return (ret);
+}
+
+static void
+cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
+ struct rwlock *blp2)
+{
+
+ MPASS(cel->blp[0] == NULL);
+ MPASS(cel->blp[1] == NULL);
+
+ cache_sort(&blp1, &blp2);
+
+ if (blp1 != NULL) {
+ rw_wlock(blp1);
+ cel->blp[0] = blp1;
+ }
+ rw_wlock(blp2);
+ cel->blp[1] = blp2;
+}
+
+static void
+cache_unlock_buckets_cel(struct celockstate *cel)
+{
+
+ if (cel->blp[0] != NULL)
+ rw_wunlock(cel->blp[0]);
+ rw_wunlock(cel->blp[1]);
+}
+
+/*
+ * Lock part of the cache affected by the insertion.
+ *
+ * This means vnodelocks for dvp, vp and the relevant bucketlock.
+ * However, insertion can result in removal of an old entry. In this
+ * case we have an additional vnode and bucketlock pair to lock. If the
+ * entry is negative, ncelock is locked instead of the vnode.
+ *
+ * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
+ * preserving the locking order (smaller address first).
+ */
+static void
+cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
+ uint32_t hash)
+{
+ struct namecache *ncp;
+ struct rwlock *blps[2];
+
+ blps[0] = HASH2BUCKETLOCK(hash);
+ for (;;) {
+ blps[1] = NULL;
+ cache_lock_vnodes_cel(cel, dvp, vp);
+ if (vp == NULL || vp->v_type != VDIR)
+ break;
+ ncp = vp->v_cache_dd;
+ if (ncp == NULL)
+ break;
+ if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
+ break;
+ MPASS(ncp->nc_dvp == vp);
+ blps[1] = NCP2BUCKETLOCK(ncp);
+ if (ncp->nc_flag & NCF_NEGATIVE)
+ break;
+ if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
+ break;
+ /*
+ * All vnodes got re-locked. Re-validate the state and if
+ * nothing changed we are done. Otherwise restart.
+ */
+ if (ncp == vp->v_cache_dd &&
+ (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
+ blps[1] == NCP2BUCKETLOCK(ncp) &&
+ VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
+ break;
+ cache_unlock_vnodes_cel(cel);
+ cel->vlp[0] = NULL;
+ cel->vlp[1] = NULL;
+ cel->vlp[2] = NULL;
+ }
+ cache_lock_buckets_cel(cel, blps[0], blps[1]);
+}
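
The locking scheme above only works because every path acquires the vnode and bucket locks in one global order, lower address first. A standalone user-space sketch of that idea (pthreads; the helper name lock_two_sorted is invented for illustration and is not part of this import):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical helper mirroring cache_sort(): order the two lock
 * pointers by address so that every caller acquires them in the same
 * global order and cannot deadlock against another caller.
 */
static void
lock_two_sorted(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_t *t;

	if ((uintptr_t)a > (uintptr_t)b) {	/* smaller address first */
		t = a;
		a = b;
		b = t;
	}
	if (a != NULL)
		pthread_mutex_lock(a);
	if (b != NULL && b != a)
		pthread_mutex_lock(b);
}

static pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

int
main(void)
{
	/* Both call orders end up locking in the same address order. */
	lock_two_sorted(&m1, &m2);
	pthread_mutex_unlock(&m2);
	pthread_mutex_unlock(&m1);

	lock_two_sorted(&m2, &m1);
	pthread_mutex_unlock(&m2);
	pthread_mutex_unlock(&m1);

	printf("locks always taken lower-address first\n");
	return (0);
}
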
+
+static void
+cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
+ uint32_t hash)
+{
+ struct namecache *ncp;
+ struct rwlock *blps[2];
+
+ blps[0] = HASH2BUCKETLOCK(hash);
+ for (;;) {
+ blps[1] = NULL;
+ cache_lock_vnodes_cel(cel, dvp, vp);
+ ncp = dvp->v_cache_dd;
+ if (ncp == NULL)
+ break;
+ if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
+ break;
+ MPASS(ncp->nc_dvp == dvp);
+ blps[1] = NCP2BUCKETLOCK(ncp);
+ if (ncp->nc_flag & NCF_NEGATIVE)
+ break;
+ if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
+ break;
+ if (ncp == dvp->v_cache_dd &&
+ (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
+ blps[1] == NCP2BUCKETLOCK(ncp) &&
+ VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
+ break;
+ cache_unlock_vnodes_cel(cel);
+ cel->vlp[0] = NULL;
+ cel->vlp[1] = NULL;
+ cel->vlp[2] = NULL;
+ }
+ cache_lock_buckets_cel(cel, blps[0], blps[1]);
+}
+
+static void
+cache_enter_unlock(struct celockstate *cel)
+{
+
+ cache_unlock_buckets_cel(cel);
+ cache_unlock_vnodes_cel(cel);
+}
+
+/*
+ * Add an entry to the cache.
+ */
+void
+cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
+ struct timespec *tsp, struct timespec *dtsp)
+{
+ struct celockstate cel;
+ struct namecache *ncp, *n2, *ndd;
+ struct namecache_ts *ncp_ts, *n2_ts;
+ struct nchashhead *ncpp;
+ struct neglist *neglist;
+ uint32_t hash;
+ int flag;
+ int len;
+ bool neg_locked;
+ int lnumcache;
+
+ CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
+ VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
+ ("cache_enter: Adding a doomed vnode"));
+ VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
+ ("cache_enter: Doomed vnode used as src"));
+
+ if (__predict_false(!doingcache))
+ return;
+
+ /*
+ * Avoid blowout in namecache entries.
+ */
+ if (__predict_false(numcache >= desiredvnodes * ncsizefactor))
+ return;
+
+ cache_celockstate_init(&cel);
+ ndd = NULL;
+ ncp_ts = NULL;
+ flag = 0;
+ if (cnp->cn_nameptr[0] == '.') {
+ if (cnp->cn_namelen == 1)
+ return;
+ if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
+ len = cnp->cn_namelen;
+ hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
+ cache_enter_lock_dd(&cel, dvp, vp, hash);
+ /*
+ * If dotdot entry already exists, just retarget it
+ * to new parent vnode, otherwise continue with new
+ * namecache entry allocation.
+ */
+ if ((ncp = dvp->v_cache_dd) != NULL &&
+ ncp->nc_flag & NCF_ISDOTDOT) {
+ KASSERT(ncp->nc_dvp == dvp,
+ ("wrong isdotdot parent"));
+ neg_locked = false;
+ if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) {
+ neglist = NCP2NEGLIST(ncp);
+ mtx_lock(&ncneg_hot.nl_lock);
+ mtx_lock(&neglist->nl_lock);
+ neg_locked = true;
+ }
+ if (!(ncp->nc_flag & NCF_NEGATIVE)) {
+ TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
+ ncp, nc_dst);
+ } else {
+ cache_negative_remove(ncp, true);
+ }
+ if (vp != NULL) {
+ TAILQ_INSERT_HEAD(&vp->v_cache_dst,
+ ncp, nc_dst);
+ ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE);
+ } else {
+ ncp->nc_flag &= ~(NCF_HOTNEGATIVE);
+ ncp->nc_flag |= NCF_NEGATIVE;
+ cache_negative_insert(ncp, true);
+ }
+ if (neg_locked) {
+ mtx_unlock(&neglist->nl_lock);
+ mtx_unlock(&ncneg_hot.nl_lock);
+ }
+ ncp->nc_vp = vp;
+ cache_enter_unlock(&cel);
+ return;
+ }
+ dvp->v_cache_dd = NULL;
+ cache_enter_unlock(&cel);
+ cache_celockstate_init(&cel);
+ SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp);
+ flag = NCF_ISDOTDOT;
+ }
+ }
+
+ /*
+ * Calculate the hash key and setup as much of the new
+ * namecache entry as possible before acquiring the lock.
+ */
+ ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
+ ncp->nc_flag = flag;
+ ncp->nc_vp = vp;
+ if (vp == NULL)
+ ncp->nc_flag |= NCF_NEGATIVE;
+ ncp->nc_dvp = dvp;
+ if (tsp != NULL) {
+ ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
+ ncp_ts->nc_time = *tsp;
+ ncp_ts->nc_ticks = ticks;
+ ncp_ts->nc_nc.nc_flag |= NCF_TS;
+ if (dtsp != NULL) {
+ ncp_ts->nc_dotdottime = *dtsp;
+ ncp_ts->nc_nc.nc_flag |= NCF_DTS;
+ }
+ }
+ len = ncp->nc_nlen = cnp->cn_namelen;
+ hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
+ strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
+ cache_enter_lock(&cel, dvp, vp, hash);
+
+ /*
+ * See if this vnode or negative entry is already in the cache
+ * with this name. This can happen with concurrent lookups of
+ * the same path name.
+ */
+ ncpp = NCHHASH(hash);
+ LIST_FOREACH(n2, ncpp, nc_hash) {
+ if (n2->nc_dvp == dvp &&
+ n2->nc_nlen == cnp->cn_namelen &&
+ !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
+ if (tsp != NULL) {
+ KASSERT((n2->nc_flag & NCF_TS) != 0,
+ ("no NCF_TS"));
+ n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
+ n2_ts->nc_time = ncp_ts->nc_time;
+ n2_ts->nc_ticks = ncp_ts->nc_ticks;
+ if (dtsp != NULL) {
+ n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
+ if (ncp->nc_flag & NCF_NEGATIVE)
+ mtx_lock(&ncneg_hot.nl_lock);
+ n2_ts->nc_nc.nc_flag |= NCF_DTS;
+ if (ncp->nc_flag & NCF_NEGATIVE)
+ mtx_unlock(&ncneg_hot.nl_lock);
+ }
+ }
+ goto out_unlock_free;
+ }
+ }
+
+ if (flag == NCF_ISDOTDOT) {
+ /*
+ * See if we are trying to add .. entry, but some other lookup
+ * has populated v_cache_dd pointer already.
+ */
+ if (dvp->v_cache_dd != NULL)
+ goto out_unlock_free;
+ KASSERT(vp == NULL || vp->v_type == VDIR,
+ ("wrong vnode type %p", vp));
+ dvp->v_cache_dd = ncp;
+ }
+
+ if (vp != NULL) {
+ if (vp->v_type == VDIR) {
+ if (flag != NCF_ISDOTDOT) {
+ /*
+ * For this case, the cache entry maps both the
+ * directory name in it and the name ".." for the
+ * directory's parent.
+ */
+ if ((ndd = vp->v_cache_dd) != NULL) {
+ if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
+ cache_zap_locked(ndd, false);
+ else
+ ndd = NULL;
+ }
+ vp->v_cache_dd = ncp;
+ }
+ } else {
+ vp->v_cache_dd = NULL;
+ }
+ }
+
+ if (flag != NCF_ISDOTDOT) {
+ if (LIST_EMPTY(&dvp->v_cache_src)) {
+ vhold(dvp);
+ atomic_add_rel_long(&numcachehv, 1);
+ }
+ LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
+ }
+
+ /*
+ * Insert the new namecache entry into the appropriate chain
+ * within the cache entries table.
+ */
+ LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+
+ /*
+ * If the entry is "negative", we place it into the
+ * "negative" cache queue, otherwise, we place it into the
+ * destination vnode's cache entries queue.
+ */
+ if (vp != NULL) {
+ TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
+ SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
+ vp);
+ } else {
+ if (cnp->cn_flags & ISWHITEOUT)
+ ncp->nc_flag |= NCF_WHITE;
+ cache_negative_insert(ncp, false);
+ SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
+ ncp->nc_name);
+ }
+ cache_enter_unlock(&cel);
+ lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
+ if (numneg * ncnegfactor > lnumcache)
+ cache_negative_zap_one();
+ cache_free(ndd);
+ return;
+out_unlock_free:
+ cache_enter_unlock(&cel);
+ cache_free(ncp);
+ return;
+}
+
+static u_int
+cache_roundup_2(u_int val)
+{
+ u_int res;
+
+ for (res = 1; res <= val; res <<= 1)
+ continue;
+
+ return (res);
+}
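
cache_roundup_2() returns the smallest power of two strictly greater than its argument, so subtracting one yields an all-ones hash mask; that is how nchinit() below sizes ncbuckethash and ncvnodehash. A quick user-space check of that behaviour (the CPU count of 4 is an assumed example):

#include <stdio.h>

/* User-space copy of cache_roundup_2() for illustration only. */
static unsigned int
roundup_2(unsigned int val)
{
	unsigned int res;

	for (res = 1; res <= val; res <<= 1)
		continue;
	return (res);
}

int
main(void)
{
	unsigned int ncpus = 4;		/* assumed CPU count */
	unsigned int mask;

	/* Mirrors "ncbuckethash = cache_roundup_2(mp_ncpus * 64) - 1". */
	mask = roundup_2(ncpus * 64) - 1;
	printf("roundup_2(%u) = %u, mask = 0x%x\n",
	    ncpus * 64, roundup_2(ncpus * 64), mask);
	/* Prints: roundup_2(256) = 512, mask = 0x1ff */
	return (0);
}
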
+
+/*
+ * Name cache initialization, from vfs_init() when we are booting
+ */
+static void
+nchinit(void *dummy __unused)
+{
+ u_int i;
+
+ cache_zone_small = uma_zcreate("S VFS Cache",
+ sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
+ UMA_ZONE_ZINIT);
+ cache_zone_small_ts = uma_zcreate("STS VFS Cache",
+ sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
+ UMA_ZONE_ZINIT);
+ cache_zone_large = uma_zcreate("L VFS Cache",
+ sizeof(struct namecache) + NAME_MAX + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
+ UMA_ZONE_ZINIT);
+ cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
+ sizeof(struct namecache_ts) + NAME_MAX + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
+ UMA_ZONE_ZINIT);
+
+ nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
+ ncbuckethash = cache_roundup_2(mp_ncpus * 64) - 1;
+ if (ncbuckethash > nchash)
+ ncbuckethash = nchash;
+ bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
+ M_WAITOK | M_ZERO);
+ for (i = 0; i < numbucketlocks; i++)
+ rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
+ ncvnodehash = cache_roundup_2(mp_ncpus * 64) - 1;
+ vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
+ M_WAITOK | M_ZERO);
+ for (i = 0; i < numvnodelocks; i++)
+ mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
+ ncpurgeminvnodes = numbucketlocks;
+
+ ncneghash = 3;
+ neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
+ M_WAITOK | M_ZERO);
+ for (i = 0; i < numneglists; i++) {
+ mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
+ TAILQ_INIT(&neglists[i].nl_list);
+ }
+ mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
+ TAILQ_INIT(&ncneg_hot.nl_list);
+
+ mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
+
+ numcalls = counter_u64_alloc(M_WAITOK);
+ dothits = counter_u64_alloc(M_WAITOK);
+ dotdothits = counter_u64_alloc(M_WAITOK);
+ numchecks = counter_u64_alloc(M_WAITOK);
+ nummiss = counter_u64_alloc(M_WAITOK);
+ nummisszap = counter_u64_alloc(M_WAITOK);
+ numposzaps = counter_u64_alloc(M_WAITOK);
+ numposhits = counter_u64_alloc(M_WAITOK);
+ numnegzaps = counter_u64_alloc(M_WAITOK);
+ numneghits = counter_u64_alloc(M_WAITOK);
+ numfullpathcalls = counter_u64_alloc(M_WAITOK);
+ numfullpathfail1 = counter_u64_alloc(M_WAITOK);
+ numfullpathfail2 = counter_u64_alloc(M_WAITOK);
+ numfullpathfail4 = counter_u64_alloc(M_WAITOK);
+ numfullpathfound = counter_u64_alloc(M_WAITOK);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
+
+void
+cache_changesize(int newmaxvnodes)
+{
+ struct nchashhead *new_nchashtbl, *old_nchashtbl;
+ u_long new_nchash, old_nchash;
+ struct namecache *ncp;
+ uint32_t hash;
+ int i;
+
+ newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
+ if (newmaxvnodes < numbucketlocks)
+ newmaxvnodes = numbucketlocks;
+
+ new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
+ /* If same hash table size, nothing to do */
+ if (nchash == new_nchash) {
+ free(new_nchashtbl, M_VFSCACHE);
+ return;
+ }
+ /*
+ * Move everything from the old hash table to the new table.
+ * None of the namecache entries in the table can be removed
+ * because to do so, they have to be removed from the hash table.
+ */
+ cache_lock_all_vnodes();
+ cache_lock_all_buckets();
+ old_nchashtbl = nchashtbl;
+ old_nchash = nchash;
+ nchashtbl = new_nchashtbl;
+ nchash = new_nchash;
+ for (i = 0; i <= old_nchash; i++) {
+ while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
+ hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
+ ncp->nc_dvp);
+ LIST_REMOVE(ncp, nc_hash);
+ LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
+ }
+ }
+ cache_unlock_all_buckets();
+ cache_unlock_all_vnodes();
+ free(old_nchashtbl, M_VFSCACHE);
+}
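
The rehash above is a plain "unhook from the old chain, compute the new bucket, push onto the new chain" loop over power-of-two tables. A minimal user-space sketch of that pattern (struct entry and rehash() are invented names; the real code uses the LIST_* macros and cache_get_hash()):

#include <stdio.h>
#include <stdlib.h>

struct entry {
	struct entry	*next;
	unsigned int	 key;
};

/*
 * Move every entry from an old power-of-two table into a new one,
 * mirroring the loop in cache_changesize().
 */
static void
rehash(struct entry **oldtbl, unsigned int oldmask,
    struct entry **newtbl, unsigned int newmask)
{
	struct entry *e;
	unsigned int i, h;

	for (i = 0; i <= oldmask; i++) {
		while ((e = oldtbl[i]) != NULL) {
			oldtbl[i] = e->next;	/* LIST_REMOVE */
			h = e->key & newmask;	/* NCHHASH with the new mask */
			e->next = newtbl[h];	/* LIST_INSERT_HEAD */
			newtbl[h] = e;
		}
	}
}

int
main(void)
{
	struct entry **oldtbl, **newtbl;
	struct entry e = { NULL, 13 };

	oldtbl = calloc(4, sizeof(*oldtbl));
	newtbl = calloc(8, sizeof(*newtbl));
	if (oldtbl == NULL || newtbl == NULL)
		return (1);
	oldtbl[e.key & 3] = &e;
	rehash(oldtbl, 3, newtbl, 7);
	printf("key 13 moved to new bucket %u\n", e.key & 7);
	free(oldtbl);
	free(newtbl);
	return (0);
}
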
+
+/*
+ * Invalidate all entries from and to a particular vnode.
+ */
+void
+cache_purge(struct vnode *vp)
+{
+ TAILQ_HEAD(, namecache) ncps;
+ struct namecache *ncp, *nnp;
+ struct mtx *vlp, *vlp2;
+
+ CTR1(KTR_VFS, "cache_purge(%p)", vp);
+ SDT_PROBE1(vfs, namecache, purge, done, vp);
+ if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
+ vp->v_cache_dd == NULL)
+ return;
+ TAILQ_INIT(&ncps);
+ vlp = VP2VNODELOCK(vp);
+ vlp2 = NULL;
+ mtx_lock(vlp);
+retry:
+ while (!LIST_EMPTY(&vp->v_cache_src)) {
+ ncp = LIST_FIRST(&vp->v_cache_src);
+ if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
+ goto retry;
+ TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
+ }
+ while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
+ ncp = TAILQ_FIRST(&vp->v_cache_dst);
+ if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
+ goto retry;
+ TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
+ }
+ ncp = vp->v_cache_dd;
+ if (ncp != NULL) {
+ KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
+ ("lost dotdot link"));
+ if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
+ goto retry;
+ TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
+ }
+ KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
+ mtx_unlock(vlp);
+ if (vlp2 != NULL)
+ mtx_unlock(vlp2);
+ TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
+ cache_free(ncp);
+ }
+}
+
+/*
+ * Invalidate all negative entries for a particular directory vnode.
+ */
+void
+cache_purge_negative(struct vnode *vp)
+{
+ TAILQ_HEAD(, namecache) ncps;
+ struct namecache *ncp, *nnp;
+ struct mtx *vlp;
+
+ CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
+ SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
+ if (LIST_EMPTY(&vp->v_cache_src))
+ return;
+ TAILQ_INIT(&ncps);
+ vlp = VP2VNODELOCK(vp);
+ mtx_lock(vlp);
+ LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
+ if (!(ncp->nc_flag & NCF_NEGATIVE))
+ continue;
+ cache_zap_negative_locked_vnode_kl(ncp, vp);
+ TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
+ }
+ mtx_unlock(vlp);
+ TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
+ cache_free(ncp);
+ }
+}
+
+/*
+ * Flush all entries referencing a particular filesystem.
+ */
+void
+cache_purgevfs(struct mount *mp, bool force)
+{
+ TAILQ_HEAD(, namecache) ncps;
+ struct mtx *vlp1, *vlp2;
+ struct rwlock *blp;
+ struct nchashhead *bucket;
+ struct namecache *ncp, *nnp;
+ u_long i, j, n_nchash;
+ int error;
+
+ /* Scan hash tables for applicable entries */
+ SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
+ if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
+ return;
+ TAILQ_INIT(&ncps);
+ n_nchash = nchash + 1;
+ vlp1 = vlp2 = NULL;
+ for (i = 0; i < numbucketlocks; i++) {
+ blp = (struct rwlock *)&bucketlocks[i];
+ rw_wlock(blp);
+ for (j = i; j < n_nchash; j += numbucketlocks) {
+retry:
+ bucket = &nchashtbl[j];
+ LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
+ if (ncp->nc_dvp->v_mount != mp)
+ continue;
+ error = cache_zap_wlocked_bucket_kl(ncp, blp,
+ &vlp1, &vlp2);
+ if (error != 0)
+ goto retry;
+ TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
+ }
+ }
+ rw_wunlock(blp);
+ if (vlp1 == NULL && vlp2 == NULL)
+ cache_maybe_yield();
+ }
+ if (vlp1 != NULL)
+ mtx_unlock(vlp1);
+ if (vlp2 != NULL)
+ mtx_unlock(vlp2);
+
+ TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
+ cache_free(ncp);
+ }
+}
+
+/*
+ * Perform canonical checks and cache lookup and pass on to filesystem
+ * through the vop_cachedlookup only if needed.
+ */
+
+int
+vfs_cache_lookup(struct vop_lookup_args *ap)
+{
+ struct vnode *dvp;
+ int error;
+ struct vnode **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ int flags = cnp->cn_flags;
+
+ *vpp = NULL;
+ dvp = ap->a_dvp;
+
+ if (dvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
+ return (EROFS);
+
+ error = vn_dir_check_exec(dvp, cnp);
+ if (error != 0)
+ return (error);
+
+ error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
+ if (error == 0)
+ return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
+ if (error == -1)
+ return (0);
+ return (error);
+}
+
+/*
+ * XXX All of these sysctls would probably be more productive dead.
+ */
+static int __read_mostly disablecwd;
+SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
+ "Disable the getcwd syscall");
+
+/* Implementation of the getcwd syscall. */
+int
+sys___getcwd(struct thread *td, struct __getcwd_args *uap)
+{
+
+ return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
+ MAXPATHLEN));
+}
+
+int
+kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen,
+ size_t path_max)
+{
+ char *bp, *tmpbuf;
+ struct filedesc *fdp;
+ struct vnode *cdir, *rdir;
+ int error;
+
+ if (__predict_false(disablecwd))
+ return (ENODEV);
+ if (__predict_false(buflen < 2))
+ return (EINVAL);
+ if (buflen > path_max)
+ buflen = path_max;
+
+ tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
+ fdp = td->td_proc->p_fd;
+ FILEDESC_SLOCK(fdp);
+ cdir = fdp->fd_cdir;
+ vrefact(cdir);
+ rdir = fdp->fd_rdir;
+ vrefact(rdir);
+ FILEDESC_SUNLOCK(fdp);
+ error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
+ vrele(rdir);
+ vrele(cdir);
+
+ if (!error) {
+ if (bufseg == UIO_SYSSPACE)
+ bcopy(bp, buf, strlen(bp) + 1);
+ else
+ error = copyout(bp, buf, strlen(bp) + 1);
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_NAMEI))
+ ktrnamei(bp);
+#endif
+ }
+ free(tmpbuf, M_TEMP);
+ return (error);
+}
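
For context, this is the kernel half of getcwd(3): libc calls __getcwd(), which lands in kern___getcwd() and ultimately vn_fullpath1() below. A minimal user-space caller (plain POSIX; nothing specific to this import is assumed):

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[PATH_MAX];

	/* Serviced in the kernel by kern___getcwd()/vn_fullpath1(). */
	if (getcwd(buf, sizeof(buf)) == NULL) {
		perror("getcwd");
		return (1);
	}
	printf("cwd: %s\n", buf);
	return (0);
}
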
+
+/*
+ * Thus begins the fullpath magic.
+ */
+
+static int __read_mostly disablefullpath;
+SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
+ "Disable the vn_fullpath function");
+
+/*
+ * Retrieve the full filesystem path that corresponds to a vnode from the
+ * name cache (if available).
+ */
+int
+vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
+{
+ char *buf;
+ struct filedesc *fdp;
+ struct vnode *rdir;
+ int error;
+
+ if (__predict_false(disablefullpath))
+ return (ENODEV);
+ if (__predict_false(vn == NULL))
+ return (EINVAL);
+
+ buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ fdp = td->td_proc->p_fd;
+ FILEDESC_SLOCK(fdp);
+ rdir = fdp->fd_rdir;
+ vrefact(rdir);
+ FILEDESC_SUNLOCK(fdp);
+ error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
+ vrele(rdir);
+
+ if (!error)
+ *freebuf = buf;
+ else
+ free(buf, M_TEMP);
+ return (error);
+}
+
+/*
+ * This function is similar to vn_fullpath, but it attempts to lookup the
+ * pathname relative to the global root mount point. This is required for the
+ * auditing sub-system, as audited pathnames must be absolute, relative to the
+ * global root mount point.
+ */
+int
+vn_fullpath_global(struct thread *td, struct vnode *vn,
+ char **retbuf, char **freebuf)
+{
+ char *buf;
+ int error;
+
+ if (__predict_false(disablefullpath))
+ return (ENODEV);
+ if (__predict_false(vn == NULL))
+ return (EINVAL);
+ buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
+ if (!error)
+ *freebuf = buf;
+ else
+ free(buf, M_TEMP);
+ return (error);
+}
+
+int
+vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
+{
+ struct vnode *dvp;
+ struct namecache *ncp;
+ struct mtx *vlp;
+ int error;
+
+ vlp = VP2VNODELOCK(*vp);
+ mtx_lock(vlp);
+ TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
+ if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
+ break;
+ }
+ if (ncp != NULL) {
+ if (*buflen < ncp->nc_nlen) {
+ mtx_unlock(vlp);
+ vrele(*vp);
+ counter_u64_add(numfullpathfail4, 1);
+ error = ENOMEM;
+ SDT_PROBE3(vfs, namecache, fullpath, return, error,
+ vp, NULL);
+ return (error);
+ }
+ *buflen -= ncp->nc_nlen;
+ memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
+ SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
+ ncp->nc_name, vp);
+ dvp = *vp;
+ *vp = ncp->nc_dvp;
+ vref(*vp);
+ mtx_unlock(vlp);
+ vrele(dvp);
+ return (0);
+ }
+ SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
+
+ mtx_unlock(vlp);
+ vn_lock(*vp, LK_SHARED | LK_RETRY);
+ error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
+ vput(*vp);
+ if (error) {
+ counter_u64_add(numfullpathfail2, 1);
+ SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
+ return (error);
+ }
+
+ *vp = dvp;
+ if (dvp->v_iflag & VI_DOOMED) {
+ /* forced unmount */
+ vrele(dvp);
+ error = ENOENT;
+ SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
+ return (error);
+ }
+ /*
+ * *vp has its use count incremented still.
+ */
+
+ return (0);
+}
+
+/*
+ * The magic behind kern___getcwd() and vn_fullpath().
+ */
+static int
+vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
+ char *buf, char **retbuf, u_int buflen)
+{
+ int error, slash_prefixed;
+#ifdef KDTRACE_HOOKS
+ struct vnode *startvp = vp;
+#endif
+ struct vnode *vp1;
+
+ buflen--;
+ buf[buflen] = '\0';
+ error = 0;
+ slash_prefixed = 0;
+
+ SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
+ counter_u64_add(numfullpathcalls, 1);
+ vref(vp);
+ if (vp->v_type != VDIR) {
+ error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
+ if (error)
+ return (error);
+ if (buflen == 0) {
+ vrele(vp);
+ return (ENOMEM);
+ }
+ buf[--buflen] = '/';
+ slash_prefixed = 1;
+ }
+ while (vp != rdir && vp != rootvnode) {
+ /*
+ * The vp vnode must be already fully constructed,
+ * since it is either found in namecache or obtained
+ * from VOP_VPTOCNP(). We may test for VV_ROOT safely
+ * without obtaining the vnode lock.
+ */
+ if ((vp->v_vflag & VV_ROOT) != 0) {
+ vn_lock(vp, LK_RETRY | LK_SHARED);
+
+ /*
+ * With the vnode locked, check for races with
+ * unmount, forced or not. Note that we
+ * already verified that vp is not equal to
+ * the root vnode, which means that
+ * mnt_vnodecovered can be NULL only for the
+ * case of unmount.
+ */
+ if ((vp->v_iflag & VI_DOOMED) != 0 ||
+ (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
+ vp1->v_mountedhere != vp->v_mount) {
+ vput(vp);
+ error = ENOENT;
+ SDT_PROBE3(vfs, namecache, fullpath, return,
+ error, vp, NULL);
+ break;
+ }
+
+ vref(vp1);
+ vput(vp);
+ vp = vp1;
+ continue;
+ }
+ if (vp->v_type != VDIR) {
+ vrele(vp);
+ counter_u64_add(numfullpathfail1, 1);
+ error = ENOTDIR;
+ SDT_PROBE3(vfs, namecache, fullpath, return,
+ error, vp, NULL);
+ break;
+ }
+ error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
+ if (error)
+ break;
+ if (buflen == 0) {
+ vrele(vp);
+ error = ENOMEM;
+ SDT_PROBE3(vfs, namecache, fullpath, return, error,
+ startvp, NULL);
+ break;
+ }
+ buf[--buflen] = '/';
+ slash_prefixed = 1;
+ }
+ if (error)
+ return (error);
+ if (!slash_prefixed) {
+ if (buflen == 0) {
+ vrele(vp);
+ counter_u64_add(numfullpathfail4, 1);
+ SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
+ startvp, NULL);
+ return (ENOMEM);
+ }
+ buf[--buflen] = '/';
+ }
+ counter_u64_add(numfullpathfound, 1);
+ vrele(vp);
+
+ SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
+ *retbuf = buf + buflen;
+ return (0);
+}
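
vn_fullpath1() assembles the result right to left: each name found via the namecache (or VOP_VPTOCNP()) is copied in front of the current position and a '/' is prepended, so the finished string ends up at buf + buflen. A standalone sketch of just that buffer technique (the component list is hard-coded purely for illustration):

#include <stdio.h>
#include <string.h>

/*
 * Prepend "name" plus a leading '/' in front of buf[*pos].
 * Returns 0 on success, -1 if the buffer would overflow (the kernel
 * code returns ENOMEM in that situation).
 */
static int
prepend(char *buf, size_t *pos, const char *name)
{
	size_t len = strlen(name);

	if (*pos < len + 1)
		return (-1);
	*pos -= len;
	memcpy(buf + *pos, name, len);
	buf[--(*pos)] = '/';
	return (0);
}

int
main(void)
{
	/* Walk from the leaf towards the root, as the lookups above do. */
	const char *components[] = { "motd", "etc" };
	char buf[64];
	size_t pos, i;

	pos = sizeof(buf) - 1;
	buf[pos] = '\0';
	for (i = 0; i < sizeof(components) / sizeof(components[0]); i++) {
		if (prepend(buf, &pos, components[i]) != 0)
			return (1);
	}
	printf("%s\n", buf + pos);	/* prints "/etc/motd" */
	return (0);
}
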
+
+struct vnode *
+vn_dir_dd_ino(struct vnode *vp)
+{
+ struct namecache *ncp;
+ struct vnode *ddvp;
+ struct mtx *vlp;
+
+ ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
+ vlp = VP2VNODELOCK(vp);
+ mtx_lock(vlp);
+ TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
+ if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
+ continue;
+ ddvp = ncp->nc_dvp;
+ vhold(ddvp);
+ mtx_unlock(vlp);
+ if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread))
+ return (NULL);
+ return (ddvp);
+ }
+ mtx_unlock(vlp);
+ return (NULL);
+}
+
+int
+vn_commname(struct vnode *vp, char *buf, u_int buflen)
+{
+ struct namecache *ncp;
+ struct mtx *vlp;
+ int l;
+
+ vlp = VP2VNODELOCK(vp);
+ mtx_lock(vlp);
+ TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
+ if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
+ break;
+ if (ncp == NULL) {
+ mtx_unlock(vlp);
+ return (ENOENT);
+ }
+ l = min(ncp->nc_nlen, buflen - 1);
+ memcpy(buf, ncp->nc_name, l);
+ mtx_unlock(vlp);
+ buf[l] = '\0';
+ return (0);
+}
+
+/* ABI compat shims for old kernel modules. */
+#undef cache_enter
+
+void cache_enter(struct vnode *dvp, struct vnode *vp,
+ struct componentname *cnp);
+
+void
+cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
+{
+
+ cache_enter_time(dvp, vp, cnp, NULL, NULL);
+}
+
+/*
+ * This function updates the path string to the vnode's full global path
+ * and checks the size of the new path string against the pathlen argument.
+ *
+ * Requires a locked, referenced vnode.
+ * Vnode is re-locked on success or ENODEV, otherwise unlocked.
+ *
+ * If the sysctl debug.disablefullpath is set, ENODEV is returned, the
+ * vnode is left locked and the path remains untouched.
+ *
+ * If vp is a directory, the call to vn_fullpath_global() always succeeds
+ * because it falls back to the ".." lookup if the namecache lookup fails.
+ */
+int
+vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
+ u_int pathlen)
+{
+ struct nameidata nd;
+ struct vnode *vp1;
+ char *rpath, *fbuf;
+ int error;
+
+ ASSERT_VOP_ELOCKED(vp, __func__);
+
+ /* Return ENODEV if sysctl debug.disablefullpath==1 */
+ if (__predict_false(disablefullpath))
+ return (ENODEV);
+
+ /* Construct global filesystem path from vp. */
+ VOP_UNLOCK(vp, 0);
+ error = vn_fullpath_global(td, vp, &rpath, &fbuf);
+
+ if (error != 0) {
+ vrele(vp);
+ return (error);
+ }
+
+ if (strlen(rpath) >= pathlen) {
+ vrele(vp);
+ error = ENAMETOOLONG;
+ goto out;
+ }
+
+ /*
+ * Re-lookup the vnode by path to detect a possible rename.
+ * As a side effect, the vnode is relocked.
+ * If vnode was renamed, return ENOENT.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
+ UIO_SYSSPACE, path, td);
+ error = namei(&nd);
+ if (error != 0) {
+ vrele(vp);
+ goto out;
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp1 = nd.ni_vp;
+ vrele(vp);
+ if (vp1 == vp)
+ strcpy(path, rpath);
+ else {
+ vput(vp1);
+ error = ENOENT;
+ }
+
+out:
+ free(fbuf, M_TEMP);
+ return (error);
+}
+
+#ifdef DDB
+static void
+db_print_vpath(struct vnode *vp)
+{
+
+ while (vp != NULL) {
+ db_printf("%p: ", vp);
+ if (vp == rootvnode) {
+ db_printf("/");
+ vp = NULL;
+ } else {
+ if (vp->v_vflag & VV_ROOT) {
+ db_printf("<mount point>");
+ vp = vp->v_mount->mnt_vnodecovered;
+ } else {
+ struct namecache *ncp;
+ char *ncn;
+ int i;
+
+ ncp = TAILQ_FIRST(&vp->v_cache_dst);
+ if (ncp != NULL) {
+ ncn = ncp->nc_name;
+ for (i = 0; i < ncp->nc_nlen; i++)
+ db_printf("%c", *ncn++);
+ vp = ncp->nc_dvp;
+ } else {
+ vp = NULL;
+ }
+ }
+ }
+ db_printf("\n");
+ }
+
+ return;
+}
+
+DB_SHOW_COMMAND(vpath, db_show_vpath)
+{
+ struct vnode *vp;
+
+ if (!have_addr) {
+ db_printf("usage: show vpath <struct vnode *>\n");
+ return;
+ }
+
+ vp = (struct vnode *)addr;
+ db_print_vpath(vp);
+}
+
+#endif
diff --git a/freebsd/sys/kern/vfs_cluster.c b/freebsd/sys/kern/vfs_cluster.c
new file mode 100644
index 00000000..1ebe4a56
--- /dev/null
+++ b/freebsd/sys/kern/vfs_cluster.c
@@ -0,0 +1,1086 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1993
+ * The Regents of the University of California. All rights reserved.
+ * Modifications/enhancements:
+ * Copyright (c) 1995 John S. Dyson. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_debug_cluster.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/vmmeter.h>
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <sys/sysctl.h>
+
+#if defined(CLUSTERDEBUG)
+static int rcluster= 0;
+SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
+ "Debug VFS clustering code");
+#endif
+
+static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
+
+static struct cluster_save *cluster_collectbufs(struct vnode *vp,
+ struct buf *last_bp, int gbflags);
+static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
+ daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
+ struct buf *fbp);
+static void cluster_callback(struct buf *);
+
+static int write_behind = 1;
+SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
+ "Cluster write-behind; 0: disable, 1: enable, 2: backed off");
+
+static int read_max = 64;
+SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
+ "Cluster read-ahead max block count");
+
+static int read_min = 1;
+SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0,
+ "Cluster read min block count");
+
+/*
+ * Read data to a buf, including read-ahead if we find this to be beneficial.
+ * cluster_read replaces bread.
+ */
+int
+cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
+ struct ucred *cred, long totread, int seqcount, int gbflags,
+ struct buf **bpp)
+{
+ struct buf *bp, *rbp, *reqbp;
+ struct bufobj *bo;
+ struct thread *td;
+ daddr_t blkno, origblkno;
+ int maxra, racluster;
+ int error, ncontig;
+ int i;
+
+ error = 0;
+ td = curthread;
+ bo = &vp->v_bufobj;
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
+ /*
+ * Try to limit the amount of read-ahead by a few
+ * ad-hoc parameters. This needs work!!!
+ */
+ racluster = vp->v_mount->mnt_iosize_max / size;
+ maxra = seqcount;
+ maxra = min(read_max, maxra);
+ maxra = min(nbuf/8, maxra);
+ if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
+ maxra = (filesize / size) - lblkno;
+
+ /*
+ * get the requested block
+ */
+ error = getblkx(vp, lblkno, size, 0, 0, gbflags, &bp);
+ if (error != 0) {
+ *bpp = NULL;
+ return (error);
+ }
+ gbflags &= ~GB_NOSPARSE;
+ origblkno = lblkno;
+ *bpp = reqbp = bp;
+
+ /*
+ * if it is in the cache, then check to see if the reads have been
+ * sequential. If they have, then try some read-ahead, otherwise
+ * back-off on prospective read-aheads.
+ */
+ if (bp->b_flags & B_CACHE) {
+ if (!seqcount) {
+ return 0;
+ } else if ((bp->b_flags & B_RAM) == 0) {
+ return 0;
+ } else {
+ bp->b_flags &= ~B_RAM;
+ BO_RLOCK(bo);
+ for (i = 1; i < maxra; i++) {
+ /*
+ * Stop if the buffer does not exist or it
+ * is invalid (about to go away?)
+ */
+ rbp = gbincore(&vp->v_bufobj, lblkno+i);
+ if (rbp == NULL || (rbp->b_flags & B_INVAL))
+ break;
+
+ /*
+ * Set another read-ahead mark so we know
+ * to check again. (If we can lock the
+ * buffer without waiting)
+ */
+ if ((((i % racluster) == (racluster - 1)) ||
+ (i == (maxra - 1)))
+ && (0 == BUF_LOCK(rbp,
+ LK_EXCLUSIVE | LK_NOWAIT, NULL))) {
+ rbp->b_flags |= B_RAM;
+ BUF_UNLOCK(rbp);
+ }
+ }
+ BO_RUNLOCK(bo);
+ if (i >= maxra) {
+ return 0;
+ }
+ lblkno += i;
+ }
+ reqbp = bp = NULL;
+ /*
+ * If it isn't in the cache, then get a chunk from
+ * disk if sequential, otherwise just get the block.
+ */
+ } else {
+ off_t firstread = bp->b_offset;
+ int nblks;
+ long minread;
+
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("cluster_read: no buffer offset"));
+
+ ncontig = 0;
+
+ /*
+ * Adjust totread if needed
+ */
+ minread = read_min * size;
+ if (minread > totread)
+ totread = minread;
+
+ /*
+ * Compute the total number of blocks that we should read
+ * synchronously.
+ */
+ if (firstread + totread > filesize)
+ totread = filesize - firstread;
+ nblks = howmany(totread, size);
+ if (nblks > racluster)
+ nblks = racluster;
+
+ /*
+ * Now compute the number of contiguous blocks.
+ */
+ if (nblks > 1) {
+ error = VOP_BMAP(vp, lblkno, NULL,
+ &blkno, &ncontig, NULL);
+ /*
+ * If this failed to map just do the original block.
+ */
+ if (error || blkno == -1)
+ ncontig = 0;
+ }
+
+ /*
+ * If we have contiguous data available do a cluster
+ * otherwise just read the requested block.
+ */
+ if (ncontig) {
+ /* Account for our first block. */
+ ncontig = min(ncontig + 1, nblks);
+ if (ncontig < nblks)
+ nblks = ncontig;
+ bp = cluster_rbuild(vp, filesize, lblkno,
+ blkno, size, nblks, gbflags, bp);
+ lblkno += (bp->b_bufsize / size);
+ } else {
+ bp->b_flags |= B_RAM;
+ bp->b_iocmd = BIO_READ;
+ lblkno += 1;
+ }
+ }
+
+ /*
+ * handle the synchronous read so that it is available ASAP.
+ */
+ if (bp) {
+ if ((bp->b_flags & B_CLUSTER) == 0) {
+ vfs_busy_pages(bp, 0);
+ }
+ bp->b_flags &= ~B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
+ BUF_KERNPROC(bp);
+ bp->b_iooffset = dbtob(bp->b_blkno);
+ bstrategy(bp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(td->td_proc);
+ racct_add_buf(td->td_proc, bp, 0);
+ PROC_UNLOCK(td->td_proc);
+ }
+#endif /* RACCT */
+ td->td_ru.ru_inblock++;
+ }
+
+ /*
+ * If we have been doing sequential I/O, then do some read-ahead.
+ */
+ while (lblkno < (origblkno + maxra)) {
+ error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
+ if (error)
+ break;
+
+ if (blkno == -1)
+ break;
+
+ /*
+ * We could throttle ncontig here by maxra but we might as
+ * well read the data if it is contiguous. We're throttled
+ * by racluster anyway.
+ */
+ if (ncontig) {
+ ncontig = min(ncontig + 1, racluster);
+ rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
+ size, ncontig, gbflags, NULL);
+ lblkno += (rbp->b_bufsize / size);
+ if (rbp->b_flags & B_DELWRI) {
+ bqrelse(rbp);
+ continue;
+ }
+ } else {
+ rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
+ lblkno += 1;
+ if (rbp->b_flags & B_DELWRI) {
+ bqrelse(rbp);
+ continue;
+ }
+ rbp->b_flags |= B_ASYNC | B_RAM;
+ rbp->b_iocmd = BIO_READ;
+ rbp->b_blkno = blkno;
+ }
+ if (rbp->b_flags & B_CACHE) {
+ rbp->b_flags &= ~B_ASYNC;
+ bqrelse(rbp);
+ continue;
+ }
+ if ((rbp->b_flags & B_CLUSTER) == 0) {
+ vfs_busy_pages(rbp, 0);
+ }
+ rbp->b_flags &= ~B_INVAL;
+ rbp->b_ioflags &= ~BIO_ERROR;
+ if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
+ BUF_KERNPROC(rbp);
+ rbp->b_iooffset = dbtob(rbp->b_blkno);
+ bstrategy(rbp);
+#ifdef RACCT
+ if (racct_enable) {
+ PROC_LOCK(td->td_proc);
+ racct_add_buf(td->td_proc, rbp, 0);
+ PROC_UNLOCK(td->td_proc);
+ }
+#endif /* RACCT */
+ td->td_ru.ru_inblock++;
+ }
+
+ if (reqbp) {
+ /*
+ * Like bread, always brelse() the buffer when
+ * returning an error.
+ */
+ error = bufwait(reqbp);
+ if (error != 0) {
+ brelse(reqbp);
+ *bpp = NULL;
+ }
+ }
+ return (error);
+}
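
The size of the read-ahead window chosen at the top of cluster_read() is just the sequential-access hint clamped by vfs.read_max, by nbuf/8 and finally by the file size. A worked example with assumed numbers (none of these values are taken from this import):

#include <stdio.h>

static long
min_l(long a, long b)
{

	return (a < b ? a : b);
}

int
main(void)
{
	/* Assumed example values, not taken from the kernel. */
	long mnt_iosize_max = 128 * 1024;	/* 128 KiB max I/O size */
	long size = 32 * 1024;			/* 32 KiB filesystem block */
	long seqcount = 16;			/* sequential-access hint */
	long read_max = 64;			/* vfs.read_max */
	long nbuf = 1024;			/* buffer headers */
	long racluster, maxra;

	racluster = mnt_iosize_max / size;	/* 4 blocks per cluster */
	maxra = min_l(seqcount, read_max);	/* 16 */
	maxra = min_l(nbuf / 8, maxra);		/* still 16 */

	printf("racluster = %ld blocks, read-ahead window = %ld blocks\n",
	    racluster, maxra);
	return (0);
}
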
+
+/*
+ * If blocks are contiguous on disk, use this to provide clustered
+ * read ahead. We will read as many blocks as possible sequentially
+ * and then parcel them up into logical blocks in the buffer hash table.
+ */
+static struct buf *
+cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
+ daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
+{
+ struct buf *bp, *tbp;
+ daddr_t bn;
+ off_t off;
+ long tinc, tsize;
+ int i, inc, j, k, toff;
+
+ KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
+ ("cluster_rbuild: size %ld != f_iosize %jd\n",
+ size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));
+
+ /*
+ * avoid a division
+ */
+ while ((u_quad_t) size * (lbn + run) > filesize) {
+ --run;
+ }
+
+ if (fbp) {
+ tbp = fbp;
+ tbp->b_iocmd = BIO_READ;
+ } else {
+ tbp = getblk(vp, lbn, size, 0, 0, gbflags);
+ if (tbp->b_flags & B_CACHE)
+ return tbp;
+ tbp->b_flags |= B_ASYNC | B_RAM;
+ tbp->b_iocmd = BIO_READ;
+ }
+ tbp->b_blkno = blkno;
+ if( (tbp->b_flags & B_MALLOC) ||
+ ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
+ return tbp;
+
+ bp = trypbuf(&cluster_pbuf_freecnt);
+ if (bp == NULL)
+ return tbp;
+
+ /*
+ * We are synthesizing a buffer out of vm_page_t's, but
+ * if the block size is not page aligned then the starting
+ * address may not be either. Inherit the b_data offset
+ * from the original buffer.
+ */
+ bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
+ if ((gbflags & GB_UNMAPPED) != 0) {
+ bp->b_data = unmapped_buf;
+ } else {
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ }
+ bp->b_iocmd = BIO_READ;
+ bp->b_iodone = cluster_callback;
+ bp->b_blkno = blkno;
+ bp->b_lblkno = lbn;
+ bp->b_offset = tbp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
+ pbgetvp(vp, bp);
+
+ TAILQ_INIT(&bp->b_cluster.cluster_head);
+
+ bp->b_bcount = 0;
+ bp->b_bufsize = 0;
+ bp->b_npages = 0;
+
+ inc = btodb(size);
+ for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
+ if (i == 0) {
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ vfs_drain_busy_pages(tbp);
+ vm_object_pip_add(tbp->b_bufobj->bo_object,
+ tbp->b_npages);
+ for (k = 0; k < tbp->b_npages; k++)
+ vm_page_sbusy(tbp->b_pages[k]);
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ } else {
+ if ((bp->b_npages * PAGE_SIZE) +
+ round_page(size) > vp->v_mount->mnt_iosize_max) {
+ break;
+ }
+
+ tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
+ (gbflags & GB_UNMAPPED));
+
+ /* Don't wait around for locked bufs. */
+ if (tbp == NULL)
+ break;
+
+ /*
+ * Stop scanning if the buffer is fully valid
+ * (marked B_CACHE), or locked (may be doing a
+ * background write), or if the buffer is not
+ * VMIO backed. The clustering code can only deal
+ * with VMIO-backed buffers. The bo lock is not
+ * required for the BKGRDINPROG check since it
+ * can not be set without the buf lock.
+ */
+ if ((tbp->b_vflags & BV_BKGRDINPROG) ||
+ (tbp->b_flags & B_CACHE) ||
+ (tbp->b_flags & B_VMIO) == 0) {
+ bqrelse(tbp);
+ break;
+ }
+
+ /*
+ * The buffer must be completely invalid in order to
+ * take part in the cluster. If it is partially valid
+ * then we stop.
+ */
+ off = tbp->b_offset;
+ tsize = size;
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ for (j = 0; tsize > 0; j++) {
+ toff = off & PAGE_MASK;
+ tinc = tsize;
+ if (toff + tinc > PAGE_SIZE)
+ tinc = PAGE_SIZE - toff;
+ VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object);
+ if ((tbp->b_pages[j]->valid &
+ vm_page_bits(toff, tinc)) != 0)
+ break;
+ if (vm_page_xbusied(tbp->b_pages[j]))
+ break;
+ vm_object_pip_add(tbp->b_bufobj->bo_object, 1);
+ vm_page_sbusy(tbp->b_pages[j]);
+ off += tinc;
+ tsize -= tinc;
+ }
+ if (tsize > 0) {
+clean_sbusy:
+ vm_object_pip_add(tbp->b_bufobj->bo_object, -j);
+ for (k = 0; k < j; k++)
+ vm_page_sunbusy(tbp->b_pages[k]);
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ bqrelse(tbp);
+ break;
+ }
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+
+ /*
+ * Set a read-ahead mark as appropriate
+ */
+ if ((fbp && (i == 1)) || (i == (run - 1)))
+ tbp->b_flags |= B_RAM;
+
+ /*
+ * Set the buffer up for an async read (XXX should
+ * we do this only if we do not wind up brelse()ing?).
+ * Set the block number if it isn't set, otherwise
+ * if it is make sure it matches the block number we
+ * expect.
+ */
+ tbp->b_flags |= B_ASYNC;
+ tbp->b_iocmd = BIO_READ;
+ if (tbp->b_blkno == tbp->b_lblkno) {
+ tbp->b_blkno = bn;
+ } else if (tbp->b_blkno != bn) {
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ goto clean_sbusy;
+ }
+ }
+ /*
+ * XXX fbp from caller may not be B_ASYNC, but we are going
+ * to biodone() it in cluster_callback() anyway
+ */
+ BUF_KERNPROC(tbp);
+ TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
+ tbp, b_cluster.cluster_entry);
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ vm_page_t m;
+ m = tbp->b_pages[j];
+ if ((bp->b_npages == 0) ||
+ (bp->b_pages[bp->b_npages-1] != m)) {
+ bp->b_pages[bp->b_npages] = m;
+ bp->b_npages++;
+ }
+ if (m->valid == VM_PAGE_BITS_ALL)
+ tbp->b_pages[j] = bogus_page;
+ }
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ /*
+ * Don't inherit tbp->b_bufsize as it may be larger due to
+ * a non-page-aligned size. Instead just aggregate using
+ * 'size'.
+ */
+ if (tbp->b_bcount != size)
+ printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size);
+ if (tbp->b_bufsize != size)
+ printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size);
+ bp->b_bcount += size;
+ bp->b_bufsize += size;
+ }
+
+ /*
+ * Fully valid pages in the cluster are already good and do not need
+ * to be re-read from disk. Replace the page with bogus_page
+ */
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ for (j = 0; j < bp->b_npages; j++) {
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object);
+ if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL)
+ bp->b_pages[j] = bogus_page;
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+ if (bp->b_bufsize > bp->b_kvasize)
+ panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
+
+ if (buf_mapped(bp)) {
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ }
+ return (bp);
+}
+
+/*
+ * Cleanup after a clustered read or write.
+ * This is complicated by the fact that any of the buffers might have
+ * extra memory (if there were no empty buffer headers at allocbuf time)
+ * that we will need to shift around.
+ */
+static void
+cluster_callback(struct buf *bp)
+{
+ struct buf *nbp, *tbp;
+ int error = 0;
+
+ /*
+ * Must propagate errors to all the components.
+ */
+ if (bp->b_ioflags & BIO_ERROR)
+ error = bp->b_error;
+
+ if (buf_mapped(bp)) {
+ pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
+ bp->b_npages);
+ }
+ /*
+ * Move memory from the large cluster buffer into the component
+ * buffers and mark IO as done on these.
+ */
+ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
+ tbp; tbp = nbp) {
+ nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
+ if (error) {
+ tbp->b_ioflags |= BIO_ERROR;
+ tbp->b_error = error;
+ } else {
+ tbp->b_dirtyoff = tbp->b_dirtyend = 0;
+ tbp->b_flags &= ~B_INVAL;
+ tbp->b_ioflags &= ~BIO_ERROR;
+ /*
+ * XXX the bdwrite()/bqrelse() issued during
+ * cluster building clears B_RELBUF (see bqrelse()
+ * comment). If direct I/O was specified, we have
+ * to restore it here to allow the buffer and VM
+ * to be freed.
+ */
+ if (tbp->b_flags & B_DIRECT)
+ tbp->b_flags |= B_RELBUF;
+ }
+ bufdone(tbp);
+ }
+ pbrelvp(bp);
+ relpbuf(bp, &cluster_pbuf_freecnt);
+}
+
+/*
+ * cluster_wbuild_wb:
+ *
+ * Implement modified write build for cluster.
+ *
+ * write_behind = 0 write behind disabled
+ * write_behind = 1 write behind normal (default)
+ * write_behind = 2 write behind backed-off
+ */
+
+static __inline int
+cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
+ int gbflags)
+{
+ int r = 0;
+
+ switch (write_behind) {
+ case 2:
+ if (start_lbn < len)
+ break;
+ start_lbn -= len;
+ /* FALLTHROUGH */
+ case 1:
+ r = cluster_wbuild(vp, size, start_lbn, len, gbflags);
+ /* FALLTHROUGH */
+ default:
+ /* FALLTHROUGH */
+ break;
+ }
+ return(r);
+}
+
+/*
+ * Do clustered write for FFS.
+ *
+ * Four cases:
+ * 1. Write is not sequential (write asynchronously)
+ * Write is sequential:
+ * 2. beginning of cluster - begin cluster
+ * 3. middle of a cluster - add to cluster
+ * 4. end of a cluster - asynchronously write cluster
+ */
+void
+cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
+ int gbflags)
+{
+ daddr_t lbn;
+ int maxclen, cursize;
+ int lblocksize;
+ int async;
+
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
+ if (vp->v_type == VREG) {
+ async = DOINGASYNC(vp);
+ lblocksize = vp->v_mount->mnt_stat.f_iosize;
+ } else {
+ async = 0;
+ lblocksize = bp->b_bufsize;
+ }
+ lbn = bp->b_lblkno;
+ KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));
+
+ /* Initialize vnode to beginning of file. */
+ if (lbn == 0)
+ vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+
+ if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
+ (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
+ maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
+ if (vp->v_clen != 0) {
+ /*
+ * Next block is not sequential.
+ *
+ * If we are not writing at end of file, the process
+ * seeked to another point in the file since its last
+ * write, or we have reached our maximum cluster size,
+ * then push the previous cluster. Otherwise try
+ * reallocating to make it sequential.
+ *
+ * Change to algorithm: only push previous cluster if
+ * it was sequential from the point of view of the
+ * seqcount heuristic, otherwise leave the buffer
+ * intact so we can potentially optimize the I/O
+ * later on in the buf_daemon or update daemon
+ * flush.
+ */
+ cursize = vp->v_lastw - vp->v_cstart + 1;
+ if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
+ lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
+ if (!async && seqcount > 0) {
+ cluster_wbuild_wb(vp, lblocksize,
+ vp->v_cstart, cursize, gbflags);
+ }
+ } else {
+ struct buf **bpp, **endbp;
+ struct cluster_save *buflist;
+
+ buflist = cluster_collectbufs(vp, bp, gbflags);
+ if (buflist == NULL) {
+ /*
+ * Cluster build failed so just write
+ * it now.
+ */
+ bawrite(bp);
+ return;
+ }
+ endbp = &buflist->bs_children
+ [buflist->bs_nchildren - 1];
+ if (VOP_REALLOCBLKS(vp, buflist)) {
+ /*
+ * Failed, push the previous cluster
+ * if *really* writing sequentially
+ * in the logical file (seqcount > 1),
+ * otherwise delay it in the hopes that
+ * the low level disk driver can
+ * optimize the write ordering.
+ */
+ for (bpp = buflist->bs_children;
+ bpp < endbp; bpp++)
+ brelse(*bpp);
+ free(buflist, M_SEGMENT);
+ if (seqcount > 1) {
+ cluster_wbuild_wb(vp,
+ lblocksize, vp->v_cstart,
+ cursize, gbflags);
+ }
+ } else {
+ /*
+ * Succeeded, keep building cluster.
+ */
+ for (bpp = buflist->bs_children;
+ bpp <= endbp; bpp++)
+ bdwrite(*bpp);
+ free(buflist, M_SEGMENT);
+ vp->v_lastw = lbn;
+ vp->v_lasta = bp->b_blkno;
+ return;
+ }
+ }
+ }
+ /*
+ * Consider beginning a cluster. If at end of file, make
+ * cluster as large as possible, otherwise find size of
+ * existing cluster.
+ */
+ if ((vp->v_type == VREG) &&
+ ((u_quad_t) bp->b_offset + lblocksize) != filesize &&
+ (bp->b_blkno == bp->b_lblkno) &&
+ (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
+ bp->b_blkno == -1)) {
+ bawrite(bp);
+ vp->v_clen = 0;
+ vp->v_lasta = bp->b_blkno;
+ vp->v_cstart = lbn + 1;
+ vp->v_lastw = lbn;
+ return;
+ }
+ vp->v_clen = maxclen;
+ if (!async && maxclen == 0) { /* I/O not contiguous */
+ vp->v_cstart = lbn + 1;
+ bawrite(bp);
+ } else { /* Wait for rest of cluster */
+ vp->v_cstart = lbn;
+ bdwrite(bp);
+ }
+ } else if (lbn == vp->v_cstart + vp->v_clen) {
+ /*
+ * At end of cluster, write it out if seqcount tells us we
+ * are operating sequentially, otherwise let the buf or
+ * update daemon handle it.
+ */
+ bdwrite(bp);
+ if (seqcount > 1) {
+ cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
+ vp->v_clen + 1, gbflags);
+ }
+ vp->v_clen = 0;
+ vp->v_cstart = lbn + 1;
+ } else if (vm_page_count_severe()) {
+ /*
+ * We are low on memory, get it going NOW
+ */
+ bawrite(bp);
+ } else {
+ /*
+ * In the middle of a cluster, so just delay the I/O for now.
+ */
+ bdwrite(bp);
+ }
+ vp->v_lastw = lbn;
+ vp->v_lasta = bp->b_blkno;
+}
+
+
+/*
+ * This is an awful lot like cluster_rbuild...wish they could be combined.
+ * The last lbn argument is the current block on which I/O is being
+ * performed. Check to see that it doesn't fall in the middle of
+ * the current block (if last_bp == NULL).
+ */
+int
+cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
+ int gbflags)
+{
+ struct buf *bp, *tbp;
+ struct bufobj *bo;
+ int i, j;
+ int totalwritten = 0;
+ int dbsize = btodb(size);
+
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
+ bo = &vp->v_bufobj;
+ while (len > 0) {
+ /*
+ * If the buffer is not delayed-write (i.e. dirty), or it
+ * is delayed-write but either locked or inval, it cannot
+ * partake in the clustered write.
+ */
+ BO_LOCK(bo);
+ if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL ||
+ (tbp->b_vflags & BV_BKGRDINPROG)) {
+ BO_UNLOCK(bo);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+ if (BUF_LOCK(tbp,
+ LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) {
+ ++start_lbn;
+ --len;
+ continue;
+ }
+ if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) {
+ BUF_UNLOCK(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+ bremfree(tbp);
+ tbp->b_flags &= ~B_DONE;
+
+ /*
+ * Extra memory in the buffer, punt on this buffer.
+ * XXX we could handle this in most cases, but we would
+ * have to push the extra memory down to after our max
+ * possible cluster size and then potentially pull it back
+ * up if the cluster was terminated prematurely--too much
+ * hassle.
+ */
+ if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) !=
+ (B_CLUSTEROK | B_VMIO)) ||
+ (tbp->b_bcount != tbp->b_bufsize) ||
+ (tbp->b_bcount != size) ||
+ (len == 1) ||
+ ((bp = (vp->v_vflag & VV_MD) != 0 ?
+ trypbuf(&cluster_pbuf_freecnt) :
+ getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
+ totalwritten += tbp->b_bufsize;
+ bawrite(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+
+ /*
+		 * We got a pbuf to make the cluster in,
+		 * so initialise it.
+ */
+ TAILQ_INIT(&bp->b_cluster.cluster_head);
+ bp->b_bcount = 0;
+ bp->b_bufsize = 0;
+ bp->b_npages = 0;
+ if (tbp->b_wcred != NOCRED)
+ bp->b_wcred = crhold(tbp->b_wcred);
+
+ bp->b_blkno = tbp->b_blkno;
+ bp->b_lblkno = tbp->b_lblkno;
+ bp->b_offset = tbp->b_offset;
+
+ /*
+ * We are synthesizing a buffer out of vm_page_t's, but
+ * if the block size is not page aligned then the starting
+ * address may not be either. Inherit the b_data offset
+ * from the original buffer.
+ */
+ if ((gbflags & GB_UNMAPPED) == 0 ||
+ (tbp->b_flags & B_VMIO) == 0) {
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ } else {
+ bp->b_data = unmapped_buf;
+ }
+ bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
+ B_NEEDCOMMIT));
+ bp->b_iodone = cluster_callback;
+ pbgetvp(vp, bp);
+ /*
+ * From this location in the file, scan forward to see
+ * if there are buffers with adjacent data that need to
+ * be written as well.
+ */
+ for (i = 0; i < len; ++i, ++start_lbn) {
+ if (i != 0) { /* If not the first buffer */
+ /*
+ * If the adjacent data is not even in core it
+ * can't need to be written.
+ */
+ BO_LOCK(bo);
+ if ((tbp = gbincore(bo, start_lbn)) == NULL ||
+ (tbp->b_vflags & BV_BKGRDINPROG)) {
+ BO_UNLOCK(bo);
+ break;
+ }
+
+ /*
+ * If it IS in core, but has different
+ * characteristics, or is locked (which
+ * means it could be undergoing a background
+ * I/O or be in a weird state), then don't
+ * cluster with it.
+ */
+ if (BUF_LOCK(tbp,
+ LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
+ BO_LOCKPTR(bo)))
+ break;
+
+ if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
+ B_INVAL | B_DELWRI | B_NEEDCOMMIT))
+ != (B_DELWRI | B_CLUSTEROK |
+ (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
+ tbp->b_wcred != bp->b_wcred) {
+ BUF_UNLOCK(tbp);
+ break;
+ }
+
+ /*
+ * Check that the combined cluster
+ * would make sense with regard to pages
+ * and would not be too large
+ */
+ if ((tbp->b_bcount != size) ||
+ ((bp->b_blkno + (dbsize * i)) !=
+ tbp->b_blkno) ||
+ ((tbp->b_npages + bp->b_npages) >
+ (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
+ BUF_UNLOCK(tbp);
+ break;
+ }
+
+ /*
+ * Ok, it's passed all the tests,
+ * so remove it from the free list
+ * and mark it busy. We will use it.
+ */
+ bremfree(tbp);
+ tbp->b_flags &= ~B_DONE;
+ } /* end of code for non-first buffers only */
+ /*
+ * If the IO is via the VM then we do some
+ * special VM hackery (yuck). Since the buffer's
+ * block size may not be page-aligned it is possible
+ * for a page to be shared between two buffers. We
+ * have to get rid of the duplication when building
+ * the cluster.
+ */
+ if (tbp->b_flags & B_VMIO) {
+ vm_page_t m;
+
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ if (i == 0) {
+ vfs_drain_busy_pages(tbp);
+ } else { /* if not first buffer */
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ m = tbp->b_pages[j];
+ if (vm_page_xbusied(m)) {
+ VM_OBJECT_WUNLOCK(
+							    tbp->b_bufobj->bo_object);
+ bqrelse(tbp);
+ goto finishcluster;
+ }
+ }
+ }
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ m = tbp->b_pages[j];
+ vm_page_sbusy(m);
+ vm_object_pip_add(m->object, 1);
+ if ((bp->b_npages == 0) ||
+ (bp->b_pages[bp->b_npages - 1] != m)) {
+ bp->b_pages[bp->b_npages] = m;
+ bp->b_npages++;
+ }
+ }
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ }
+ bp->b_bcount += size;
+ bp->b_bufsize += size;
+ /*
+ * If any of the clustered buffers have their
+ * B_BARRIER flag set, transfer that request to
+ * the cluster.
+ */
+ bp->b_flags |= (tbp->b_flags & B_BARRIER);
+ tbp->b_flags &= ~(B_DONE | B_BARRIER);
+ tbp->b_flags |= B_ASYNC;
+ tbp->b_ioflags &= ~BIO_ERROR;
+ tbp->b_iocmd = BIO_WRITE;
+ bundirty(tbp);
+ reassignbuf(tbp); /* put on clean list */
+ bufobj_wref(tbp->b_bufobj);
+ BUF_KERNPROC(tbp);
+ buf_track(tbp, __func__);
+ TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
+ tbp, b_cluster.cluster_entry);
+ }
+ finishcluster:
+ if (buf_mapped(bp)) {
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ }
+ if (bp->b_bufsize > bp->b_kvasize)
+ panic(
+ "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
+ totalwritten += bp->b_bufsize;
+ bp->b_dirtyoff = 0;
+ bp->b_dirtyend = bp->b_bufsize;
+ bawrite(bp);
+
+ len -= i;
+ }
+ return totalwritten;
+}
+
+/*
+ * Collect together all the buffers in a cluster.
+ * Plus add one additional buffer.
+ */
+static struct cluster_save *
+cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags)
+{
+ struct cluster_save *buflist;
+ struct buf *bp;
+ daddr_t lbn;
+ int i, j, len, error;
+
+ len = vp->v_lastw - vp->v_cstart + 1;
+ buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
+ M_SEGMENT, M_WAITOK);
+ buflist->bs_nchildren = 0;
+ buflist->bs_children = (struct buf **) (buflist + 1);
+ for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
+ error = bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
+ gbflags, &bp);
+ if (error != 0) {
+ /*
+ * If read fails, release collected buffers
+ * and return failure.
+ */
+ for (j = 0; j < i; j++)
+ brelse(buflist->bs_children[j]);
+ free(buflist, M_SEGMENT);
+ return (NULL);
+ }
+ buflist->bs_children[i] = bp;
+ if (bp->b_blkno == bp->b_lblkno)
+ VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
+ NULL, NULL);
+ }
+ buflist->bs_children[i] = bp = last_bp;
+ if (bp->b_blkno == bp->b_lblkno)
+ VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
+ buflist->bs_nchildren = i + 1;
+ return (buflist);
+}
diff --git a/freebsd/sys/kern/vfs_default.c b/freebsd/sys/kern/vfs_default.c
new file mode 100644
index 00000000..40041c9d
--- /dev/null
+++ b/freebsd/sys/kern/vfs_default.c
@@ -0,0 +1,1286 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed
+ * to Berkeley by John Heidemann of the UCLA Ficus project.
+ *
+ * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/event.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/lockf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/rwlock.h>
+#include <sys/fcntl.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/dirent.h>
+#include <sys/poll.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vnode_pager.h>
+
+static int vop_nolookup(struct vop_lookup_args *);
+static int vop_norename(struct vop_rename_args *);
+static int vop_nostrategy(struct vop_strategy_args *);
+static int get_next_dirent(struct vnode *vp, struct dirent **dpp,
+ char *dirbuf, int dirbuflen, off_t *off,
+ char **cpos, int *len, int *eofflag,
+ struct thread *td);
+static int dirent_exists(struct vnode *vp, const char *dirname,
+ struct thread *td);
+
+#define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4)
+
+static int vop_stdis_text(struct vop_is_text_args *ap);
+static int vop_stdunset_text(struct vop_unset_text_args *ap);
+static int vop_stdadd_writecount(struct vop_add_writecount_args *ap);
+static int vop_stdfdatasync(struct vop_fdatasync_args *ap);
+static int vop_stdgetpages_async(struct vop_getpages_async_args *ap);
+
+/*
+ * This vnode table stores what we want to do if the filesystem doesn't
+ * implement a particular VOP.
+ *
+ * If there is no specific entry here, we will return EOPNOTSUPP.
+ *
+ * Note that every filesystem has to implement either vop_access
+ * or vop_accessx; failing to do so will result in immediate crash
+ * due to stack overflow, as vop_stdaccess() calls vop_stdaccessx(),
+ * which calls vop_stdaccess() etc.
+ */
+
+struct vop_vector default_vnodeops = {
+ .vop_default = NULL,
+ .vop_bypass = VOP_EOPNOTSUPP,
+
+ .vop_access = vop_stdaccess,
+ .vop_accessx = vop_stdaccessx,
+ .vop_advise = vop_stdadvise,
+ .vop_advlock = vop_stdadvlock,
+ .vop_advlockasync = vop_stdadvlockasync,
+ .vop_advlockpurge = vop_stdadvlockpurge,
+ .vop_allocate = vop_stdallocate,
+ .vop_bmap = vop_stdbmap,
+ .vop_close = VOP_NULL,
+ .vop_fsync = VOP_NULL,
+ .vop_fdatasync = vop_stdfdatasync,
+ .vop_getpages = vop_stdgetpages,
+ .vop_getpages_async = vop_stdgetpages_async,
+ .vop_getwritemount = vop_stdgetwritemount,
+ .vop_inactive = VOP_NULL,
+ .vop_ioctl = VOP_ENOTTY,
+ .vop_kqfilter = vop_stdkqfilter,
+ .vop_islocked = vop_stdislocked,
+ .vop_lock1 = vop_stdlock,
+ .vop_lookup = vop_nolookup,
+ .vop_open = VOP_NULL,
+ .vop_pathconf = VOP_EINVAL,
+ .vop_poll = vop_nopoll,
+ .vop_putpages = vop_stdputpages,
+ .vop_readlink = VOP_EINVAL,
+ .vop_rename = vop_norename,
+ .vop_revoke = VOP_PANIC,
+ .vop_strategy = vop_nostrategy,
+ .vop_unlock = vop_stdunlock,
+ .vop_vptocnp = vop_stdvptocnp,
+ .vop_vptofh = vop_stdvptofh,
+ .vop_unp_bind = vop_stdunp_bind,
+ .vop_unp_connect = vop_stdunp_connect,
+ .vop_unp_detach = vop_stdunp_detach,
+ .vop_is_text = vop_stdis_text,
+ .vop_set_text = vop_stdset_text,
+ .vop_unset_text = vop_stdunset_text,
+ .vop_add_writecount = vop_stdadd_writecount,
+};
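For context on how this table is consumed: a filesystem points .vop_default at default_vnodeops in its own vop_vector and lists only the operations it actually implements. A minimal sketch, assuming a hypothetical filesystem "myfs" whose myfs_* handlers are placeholders and not part of this import:

static struct vop_vector myfs_vnodeops = {
	.vop_default	= &default_vnodeops,	/* unlisted VOPs fall back to the table above */
	.vop_lookup	= myfs_lookup,		/* hypothetical */
	.vop_getattr	= myfs_getattr,		/* hypothetical */
	.vop_read	= myfs_read,		/* hypothetical */
	.vop_reclaim	= myfs_reclaim,		/* hypothetical */
};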
+
+/*
+ * Series of placeholder functions for various error returns for
+ * VOPs.
+ */
+
+int
+vop_eopnotsupp(struct vop_generic_args *ap)
+{
+ /*
+ printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name);
+ */
+
+ return (EOPNOTSUPP);
+}
+
+int
+vop_ebadf(struct vop_generic_args *ap)
+{
+
+ return (EBADF);
+}
+
+int
+vop_enotty(struct vop_generic_args *ap)
+{
+
+ return (ENOTTY);
+}
+
+int
+vop_einval(struct vop_generic_args *ap)
+{
+
+ return (EINVAL);
+}
+
+int
+vop_enoent(struct vop_generic_args *ap)
+{
+
+ return (ENOENT);
+}
+
+int
+vop_null(struct vop_generic_args *ap)
+{
+
+ return (0);
+}
+
+/*
+ * Helper function to panic on some bad VOPs in some filesystems.
+ */
+int
+vop_panic(struct vop_generic_args *ap)
+{
+
+ panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name);
+}
+
+/*
+ * vop_std<something> and vop_no<something> are default functions for use by
+ * filesystems that need the "default reasonable" implementation for a
+ * particular operation.
+ *
+ * The documentation for the operations they implement exists (if it exists)
+ * in the VOP_<SOMETHING>(9) manpage (all uppercase).
+ */
+
+/*
+ * Default vop for filesystems that do not support name lookup
+ */
+static int
+vop_nolookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+
+ *ap->a_vpp = NULL;
+ return (ENOTDIR);
+}
+
+/*
+ * vop_norename:
+ *
+ * Handle unlock and reference counting for arguments of vop_rename
+ * for filesystems that do not implement rename operation.
+ */
+static int
+vop_norename(struct vop_rename_args *ap)
+{
+
+ vop_rename_fail(ap);
+ return (EOPNOTSUPP);
+}
+
+/*
+ * vop_nostrategy:
+ *
+ * Strategy routine for VFS devices that have none.
+ *
+ * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy
+ * routine. Typically this is done for a BIO_READ strategy call.
+ * Typically B_INVAL is assumed to already be clear prior to a write
+ * and should not be cleared manually unless you just made the buffer
+ * invalid. BIO_ERROR should be cleared either way.
+ */
+
+static int
+vop_nostrategy (struct vop_strategy_args *ap)
+{
+ printf("No strategy for buffer at %p\n", ap->a_bp);
+ vn_printf(ap->a_vp, "vnode ");
+ ap->a_bp->b_ioflags |= BIO_ERROR;
+ ap->a_bp->b_error = EOPNOTSUPP;
+ bufdone(ap->a_bp);
+ return (EOPNOTSUPP);
+}
+
+static int
+get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf,
+ int dirbuflen, off_t *off, char **cpos, int *len,
+ int *eofflag, struct thread *td)
+{
+ int error, reclen;
+ struct uio uio;
+ struct iovec iov;
+ struct dirent *dp;
+
+ KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
+ KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
+
+ if (*len == 0) {
+ iov.iov_base = dirbuf;
+ iov.iov_len = dirbuflen;
+
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = *off;
+ uio.uio_resid = dirbuflen;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_rw = UIO_READ;
+ uio.uio_td = td;
+
+ *eofflag = 0;
+
+#ifdef MAC
+ error = mac_vnode_check_readdir(td->td_ucred, vp);
+ if (error == 0)
+#endif
+ error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag,
+ NULL, NULL);
+ if (error)
+ return (error);
+
+ *off = uio.uio_offset;
+
+ *cpos = dirbuf;
+ *len = (dirbuflen - uio.uio_resid);
+
+ if (*len == 0)
+ return (ENOENT);
+ }
+
+ dp = (struct dirent *)(*cpos);
+ reclen = dp->d_reclen;
+ *dpp = dp;
+
+ /* check for malformed directory.. */
+ if (reclen < DIRENT_MINSIZE)
+ return (EINVAL);
+
+ *cpos += reclen;
+ *len -= reclen;
+
+ return (0);
+}
+
+/*
+ * Check if a named file exists in a given directory vnode.
+ */
+static int
+dirent_exists(struct vnode *vp, const char *dirname, struct thread *td)
+{
+ char *dirbuf, *cpos;
+ int error, eofflag, dirbuflen, len, found;
+ off_t off;
+ struct dirent *dp;
+ struct vattr va;
+
+ KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
+ KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
+
+ found = 0;
+
+ error = VOP_GETATTR(vp, &va, td->td_ucred);
+ if (error)
+ return (found);
+
+ dirbuflen = DEV_BSIZE;
+ if (dirbuflen < va.va_blocksize)
+ dirbuflen = va.va_blocksize;
+ dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
+
+ off = 0;
+ len = 0;
+ do {
+ error = get_next_dirent(vp, &dp, dirbuf, dirbuflen, &off,
+ &cpos, &len, &eofflag, td);
+ if (error)
+ goto out;
+
+ if (dp->d_type != DT_WHT && dp->d_fileno != 0 &&
+ strcmp(dp->d_name, dirname) == 0) {
+ found = 1;
+ goto out;
+ }
+ } while (len > 0 || !eofflag);
+
+out:
+ free(dirbuf, M_TEMP);
+ return (found);
+}
+
+int
+vop_stdaccess(struct vop_access_args *ap)
+{
+
+ KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN |
+ VAPPEND)) == 0, ("invalid bit in accmode"));
+
+ return (VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred, ap->a_td));
+}
+
+int
+vop_stdaccessx(struct vop_accessx_args *ap)
+{
+ int error;
+ accmode_t accmode = ap->a_accmode;
+
+ error = vfs_unixify_accmode(&accmode);
+ if (error != 0)
+ return (error);
+
+ if (accmode == 0)
+ return (0);
+
+ return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td));
+}
+
+/*
+ * Advisory record locking support
+ */
+int
+vop_stdadvlock(struct vop_advlock_args *ap)
+{
+ struct vnode *vp;
+ struct vattr vattr;
+ int error;
+
+ vp = ap->a_vp;
+ if (ap->a_fl->l_whence == SEEK_END) {
+ /*
+ * The NFSv4 server must avoid doing a vn_lock() here, since it
+ * can deadlock the nfsd threads, due to a LOR. Fortunately
+ * the NFSv4 server always uses SEEK_SET and this code is
+ * only required for the SEEK_END case.
+ */
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ if (error)
+ return (error);
+ } else
+ vattr.va_size = 0;
+
+ return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size));
+}
+
+int
+vop_stdadvlockasync(struct vop_advlockasync_args *ap)
+{
+ struct vnode *vp;
+ struct vattr vattr;
+ int error;
+
+ vp = ap->a_vp;
+ if (ap->a_fl->l_whence == SEEK_END) {
+ /* The size argument is only needed for SEEK_END. */
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ if (error)
+ return (error);
+ } else
+ vattr.va_size = 0;
+
+ return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size));
+}
+
+int
+vop_stdadvlockpurge(struct vop_advlockpurge_args *ap)
+{
+ struct vnode *vp;
+
+ vp = ap->a_vp;
+ lf_purgelocks(vp, &vp->v_lockf);
+ return (0);
+}
+
+/*
+ * vop_stdpathconf:
+ *
+ * Standard implementation of POSIX pathconf, to get information about limits
+ * for a filesystem.
+ * Override per filesystem for the case where the filesystem has smaller
+ * limits.
+ */
+int
+vop_stdpathconf(ap)
+ struct vop_pathconf_args /* {
+ struct vnode *a_vp;
+ int a_name;
+ int *a_retval;
+ } */ *ap;
+{
+
+ switch (ap->a_name) {
+ case _PC_ASYNC_IO:
+ *ap->a_retval = _POSIX_ASYNCHRONOUS_IO;
+ return (0);
+ case _PC_PATH_MAX:
+ *ap->a_retval = PATH_MAX;
+ return (0);
+ case _PC_ACL_EXTENDED:
+ case _PC_ACL_NFS4:
+ case _PC_CAP_PRESENT:
+ case _PC_INF_PRESENT:
+ case _PC_MAC_PRESENT:
+ *ap->a_retval = 0;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+ /* NOTREACHED */
+}
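Filesystems with tighter limits usually answer only the names they care about and hand everything else back to this default, as the comment above suggests. A hedged sketch with a hypothetical myfs_pathconf:

static int
myfs_pathconf(struct vop_pathconf_args *ap)
{

	switch (ap->a_name) {
	case _PC_NAME_MAX:
		*ap->a_retval = 255;		/* hypothetical on-disk name limit */
		return (0);
	default:
		return (vop_stdpathconf(ap));	/* defer to the defaults above */
	}
}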
+
+/*
+ * Standard lock, unlock and islocked functions.
+ */
+int
+vop_stdlock(ap)
+ struct vop_lock1_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ char *file;
+ int line;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ struct mtx *ilk;
+
+ ilk = VI_MTX(vp);
+ return (lockmgr_lock_fast_path(vp->v_vnlock, ap->a_flags,
+ &ilk->lock_object, ap->a_file, ap->a_line));
+}
+
+/* See above. */
+int
+vop_stdunlock(ap)
+ struct vop_unlock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ struct mtx *ilk;
+
+ ilk = VI_MTX(vp);
+ return (lockmgr_unlock_fast_path(vp->v_vnlock, ap->a_flags,
+ &ilk->lock_object));
+}
+
+/* See above. */
+int
+vop_stdislocked(ap)
+ struct vop_islocked_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+
+ return (lockstatus(ap->a_vp->v_vnlock));
+}
+
+/*
+ * Return true for select/poll.
+ */
+int
+vop_nopoll(ap)
+ struct vop_poll_args /* {
+ struct vnode *a_vp;
+ int a_events;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+
+ return (poll_no_poll(ap->a_events));
+}
+
+/*
+ * Implement poll for local filesystems that support it.
+ */
+int
+vop_stdpoll(ap)
+ struct vop_poll_args /* {
+ struct vnode *a_vp;
+ int a_events;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ if (ap->a_events & ~POLLSTANDARD)
+ return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
+ return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
+}
+
+/*
+ * Return our mount point, as we will take charge of the writes.
+ */
+int
+vop_stdgetwritemount(ap)
+ struct vop_getwritemount_args /* {
+ struct vnode *a_vp;
+ struct mount **a_mpp;
+ } */ *ap;
+{
+ struct mount *mp;
+
+ /*
+ * XXX Since this is called unlocked we may be recycled while
+ * attempting to ref the mount. If this is the case our mountpoint
+ * will be set to NULL. We only have to prevent this call from
+ * returning with a ref to an incorrect mountpoint. It is not
+ * harmful to return with a ref to our previous mountpoint.
+ */
+ mp = ap->a_vp->v_mount;
+ if (mp != NULL) {
+ vfs_ref(mp);
+ if (mp != ap->a_vp->v_mount) {
+ vfs_rel(mp);
+ mp = NULL;
+ }
+ }
+ *(ap->a_mpp) = mp;
+ return (0);
+}
+
+/*
+ * If the file system doesn't implement VOP_BMAP, then return sensible defaults:
+ * - Return the vnode's bufobj instead of any underlying device's bufobj
+ * - Calculate the physical block number as if there were equal size
+ * consecutive blocks, but
+ * - Report no contiguous runs of blocks.
+ */
+int
+vop_stdbmap(ap)
+ struct vop_bmap_args /* {
+ struct vnode *a_vp;
+ daddr_t a_bn;
+ struct bufobj **a_bop;
+ daddr_t *a_bnp;
+ int *a_runp;
+ int *a_runb;
+ } */ *ap;
+{
+
+ if (ap->a_bop != NULL)
+ *ap->a_bop = &ap->a_vp->v_bufobj;
+ if (ap->a_bnp != NULL)
+ *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize);
+ if (ap->a_runp != NULL)
+ *ap->a_runp = 0;
+ if (ap->a_runb != NULL)
+ *ap->a_runb = 0;
+ return (0);
+}
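A worked example of the mapping above, assuming mnt_stat.f_iosize is 4096 and DEV_BSIZE is 512, so btodb(4096) == 8 (vp and error handling elided):

	daddr_t pbn;
	int runp, runb;

	error = VOP_BMAP(vp, 10, NULL, &pbn, &runp, &runb);
	/* pbn == 10 * btodb(4096) == 80; runp == runb == 0, no runs reported */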
+
+int
+vop_stdfsync(ap)
+ struct vop_fsync_args /* {
+ struct vnode *a_vp;
+ int a_waitfor;
+ struct thread *a_td;
+ } */ *ap;
+{
+
+ return (vn_fsync_buf(ap->a_vp, ap->a_waitfor));
+}
+
+static int
+vop_stdfdatasync(struct vop_fdatasync_args *ap)
+{
+
+ return (VOP_FSYNC(ap->a_vp, MNT_WAIT, ap->a_td));
+}
+
+int
+vop_stdfdatasync_buf(struct vop_fdatasync_args *ap)
+{
+
+ return (vn_fsync_buf(ap->a_vp, MNT_WAIT));
+}
+
+/* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */
+int
+vop_stdgetpages(ap)
+ struct vop_getpages_args /* {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int *a_rbehind;
+ int *a_rahead;
+ } */ *ap;
+{
+
+ return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
+ ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL);
+}
+
+static int
+vop_stdgetpages_async(struct vop_getpages_async_args *ap)
+{
+ int error;
+
+ error = VOP_GETPAGES(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
+ ap->a_rahead);
+ ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
+ return (error);
+}
+
+int
+vop_stdkqfilter(struct vop_kqfilter_args *ap)
+{
+ return vfs_kqfilter(ap);
+}
+
+/* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */
+int
+vop_stdputpages(ap)
+ struct vop_putpages_args /* {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int a_sync;
+ int *a_rtvals;
+ } */ *ap;
+{
+
+ return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
+ ap->a_sync, ap->a_rtvals);
+}
+
+int
+vop_stdvptofh(struct vop_vptofh_args *ap)
+{
+ return (EOPNOTSUPP);
+}
+
+int
+vop_stdvptocnp(struct vop_vptocnp_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct vnode **dvp = ap->a_vpp;
+ struct ucred *cred = ap->a_cred;
+ char *buf = ap->a_buf;
+ int *buflen = ap->a_buflen;
+ char *dirbuf, *cpos;
+ int i, error, eofflag, dirbuflen, flags, locked, len, covered;
+ off_t off;
+ ino_t fileno;
+ struct vattr va;
+ struct nameidata nd;
+ struct thread *td;
+ struct dirent *dp;
+ struct vnode *mvp;
+
+ i = *buflen;
+ error = 0;
+ covered = 0;
+ td = curthread;
+
+ if (vp->v_type != VDIR)
+ return (ENOENT);
+
+ error = VOP_GETATTR(vp, &va, cred);
+ if (error)
+ return (error);
+
+ VREF(vp);
+ locked = VOP_ISLOCKED(vp);
+ VOP_UNLOCK(vp, 0);
+ NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE,
+ "..", vp, td);
+ flags = FREAD;
+ error = vn_open_cred(&nd, &flags, 0, VN_OPEN_NOAUDIT, cred, NULL);
+ if (error) {
+ vn_lock(vp, locked | LK_RETRY);
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ mvp = *dvp = nd.ni_vp;
+
+ if (vp->v_mount != (*dvp)->v_mount &&
+ ((*dvp)->v_vflag & VV_ROOT) &&
+ ((*dvp)->v_mount->mnt_flag & MNT_UNION)) {
+ *dvp = (*dvp)->v_mount->mnt_vnodecovered;
+ VREF(mvp);
+ VOP_UNLOCK(mvp, 0);
+ vn_close(mvp, FREAD, cred, td);
+ VREF(*dvp);
+ vn_lock(*dvp, LK_SHARED | LK_RETRY);
+ covered = 1;
+ }
+
+ fileno = va.va_fileid;
+
+ dirbuflen = DEV_BSIZE;
+ if (dirbuflen < va.va_blocksize)
+ dirbuflen = va.va_blocksize;
+ dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
+
+ if ((*dvp)->v_type != VDIR) {
+ error = ENOENT;
+ goto out;
+ }
+
+ off = 0;
+ len = 0;
+ do {
+ /* call VOP_READDIR of parent */
+ error = get_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off,
+ &cpos, &len, &eofflag, td);
+ if (error)
+ goto out;
+
+ if ((dp->d_type != DT_WHT) &&
+ (dp->d_fileno == fileno)) {
+ if (covered) {
+ VOP_UNLOCK(*dvp, 0);
+ vn_lock(mvp, LK_SHARED | LK_RETRY);
+ if (dirent_exists(mvp, dp->d_name, td)) {
+ error = ENOENT;
+ VOP_UNLOCK(mvp, 0);
+ vn_lock(*dvp, LK_SHARED | LK_RETRY);
+ goto out;
+ }
+ VOP_UNLOCK(mvp, 0);
+ vn_lock(*dvp, LK_SHARED | LK_RETRY);
+ }
+ i -= dp->d_namlen;
+
+ if (i < 0) {
+ error = ENOMEM;
+ goto out;
+ }
+ if (dp->d_namlen == 1 && dp->d_name[0] == '.') {
+ error = ENOENT;
+ } else {
+ bcopy(dp->d_name, buf + i, dp->d_namlen);
+ error = 0;
+ }
+ goto out;
+ }
+ } while (len > 0 || !eofflag);
+ error = ENOENT;
+
+out:
+ free(dirbuf, M_TEMP);
+ if (!error) {
+ *buflen = i;
+ vref(*dvp);
+ }
+ if (covered) {
+ vput(*dvp);
+ vrele(mvp);
+ } else {
+ VOP_UNLOCK(mvp, 0);
+ vn_close(mvp, FREAD, cred, td);
+ }
+ vn_lock(vp, locked | LK_RETRY);
+ return (error);
+}
+
+int
+vop_stdallocate(struct vop_allocate_args *ap)
+{
+#ifdef __notyet__
+ struct statfs *sfs;
+ off_t maxfilesize = 0;
+#endif
+ struct iovec aiov;
+ struct vattr vattr, *vap;
+ struct uio auio;
+ off_t fsize, len, cur, offset;
+ uint8_t *buf;
+ struct thread *td;
+ struct vnode *vp;
+ size_t iosize;
+ int error;
+
+ buf = NULL;
+ error = 0;
+ td = curthread;
+ vap = &vattr;
+ vp = ap->a_vp;
+ len = *ap->a_len;
+ offset = *ap->a_offset;
+
+ error = VOP_GETATTR(vp, vap, td->td_ucred);
+ if (error != 0)
+ goto out;
+ fsize = vap->va_size;
+ iosize = vap->va_blocksize;
+ if (iosize == 0)
+ iosize = BLKDEV_IOSIZE;
+ if (iosize > MAXPHYS)
+ iosize = MAXPHYS;
+ buf = malloc(iosize, M_TEMP, M_WAITOK);
+
+#ifdef __notyet__
+ /*
+ * Check if the filesystem sets f_maxfilesize; if not use
+ * VOP_SETATTR to perform the check.
+ */
+ sfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
+ error = VFS_STATFS(vp->v_mount, sfs, td);
+ if (error == 0)
+ maxfilesize = sfs->f_maxfilesize;
+ free(sfs, M_STATFS);
+ if (error != 0)
+ goto out;
+ if (maxfilesize) {
+ if (offset > maxfilesize || len > maxfilesize ||
+ offset + len > maxfilesize) {
+ error = EFBIG;
+ goto out;
+ }
+ } else
+#endif
+ if (offset + len > vap->va_size) {
+ /*
+ * Test offset + len against the filesystem's maxfilesize.
+ */
+ VATTR_NULL(vap);
+ vap->va_size = offset + len;
+ error = VOP_SETATTR(vp, vap, td->td_ucred);
+ if (error != 0)
+ goto out;
+ VATTR_NULL(vap);
+ vap->va_size = fsize;
+ error = VOP_SETATTR(vp, vap, td->td_ucred);
+ if (error != 0)
+ goto out;
+ }
+
+ for (;;) {
+ /*
+ * Read and write back anything below the nominal file
+ * size. There's currently no way outside the filesystem
+ * to know whether this area is sparse or not.
+ */
+ cur = iosize;
+ if ((offset % iosize) != 0)
+ cur -= (offset % iosize);
+ if (cur > len)
+ cur = len;
+ if (offset < fsize) {
+ aiov.iov_base = buf;
+ aiov.iov_len = cur;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset;
+ auio.uio_resid = cur;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = td;
+ error = VOP_READ(vp, &auio, 0, td->td_ucred);
+ if (error != 0)
+ break;
+ if (auio.uio_resid > 0) {
+ bzero(buf + cur - auio.uio_resid,
+ auio.uio_resid);
+ }
+ } else {
+ bzero(buf, cur);
+ }
+
+ aiov.iov_base = buf;
+ aiov.iov_len = cur;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset;
+ auio.uio_resid = cur;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+
+ error = VOP_WRITE(vp, &auio, 0, td->td_ucred);
+ if (error != 0)
+ break;
+
+ len -= cur;
+ offset += cur;
+ if (len == 0)
+ break;
+ if (should_yield())
+ break;
+ }
+
+ out:
+ *ap->a_len = len;
+ *ap->a_offset = offset;
+ free(buf, M_TEMP);
+ return (error);
+}
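On filesystems without a native VOP_ALLOCATE this default is roughly what services posix_fallocate(2): the loop above reads back and rewrites (or zero-fills) each block in the requested range. A userland sketch with a hypothetical path, error handling abbreviated:

	int fd = open("/tmp/example", O_RDWR | O_CREAT, 0644);
	if (fd != -1 && posix_fallocate(fd, 0, 1024 * 1024) == 0) {
		/* the first 1 MiB of the file is now backed by written blocks */
	}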
+
+int
+vop_stdadvise(struct vop_advise_args *ap)
+{
+ struct vnode *vp;
+ struct bufobj *bo;
+ daddr_t startn, endn;
+ off_t start, end;
+ int bsize, error;
+
+ vp = ap->a_vp;
+ switch (ap->a_advice) {
+ case POSIX_FADV_WILLNEED:
+ /*
+ * Do nothing for now. Filesystems should provide a
+ * custom method which starts an asynchronous read of
+ * the requested region.
+ */
+ error = 0;
+ break;
+ case POSIX_FADV_DONTNEED:
+ error = 0;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (vp->v_iflag & VI_DOOMED) {
+ VOP_UNLOCK(vp, 0);
+ break;
+ }
+
+ /*
+ * Deactivate pages in the specified range from the backing VM
+ * object. Pages that are resident in the buffer cache will
+ * remain wired until their corresponding buffers are released
+ * below.
+ */
+ if (vp->v_object != NULL) {
+ start = trunc_page(ap->a_start);
+ end = round_page(ap->a_end);
+ VM_OBJECT_RLOCK(vp->v_object);
+ vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start),
+ OFF_TO_IDX(end));
+ VM_OBJECT_RUNLOCK(vp->v_object);
+ }
+
+ bo = &vp->v_bufobj;
+ BO_RLOCK(bo);
+ bsize = vp->v_bufobj.bo_bsize;
+ startn = ap->a_start / bsize;
+ endn = ap->a_end / bsize;
+ error = bnoreuselist(&bo->bo_clean, bo, startn, endn);
+ if (error == 0)
+ error = bnoreuselist(&bo->bo_dirty, bo, startn, endn);
+ BO_RUNLOCK(bo);
+ VOP_UNLOCK(vp, 0);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
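The POSIX_FADV_DONTNEED branch above is normally reached from userland through posix_fadvise(2); a minimal sketch, assuming fd is an open file descriptor:

	int error = posix_fadvise(fd, 0, 1024 * 1024, POSIX_FADV_DONTNEED);
	/* error == 0 on success; the advice may legitimately have no effect */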
+
+int
+vop_stdunp_bind(struct vop_unp_bind_args *ap)
+{
+
+ ap->a_vp->v_unpcb = ap->a_unpcb;
+ return (0);
+}
+
+int
+vop_stdunp_connect(struct vop_unp_connect_args *ap)
+{
+
+ *ap->a_unpcb = ap->a_vp->v_unpcb;
+ return (0);
+}
+
+int
+vop_stdunp_detach(struct vop_unp_detach_args *ap)
+{
+
+ ap->a_vp->v_unpcb = NULL;
+ return (0);
+}
+
+static int
+vop_stdis_text(struct vop_is_text_args *ap)
+{
+
+ return (ap->a_vp->v_writecount < 0);
+}
+
+int
+vop_stdset_text(struct vop_set_text_args *ap)
+{
+ struct vnode *vp;
+ struct mount *mp;
+ int error;
+
+ vp = ap->a_vp;
+ VI_LOCK(vp);
+ if (vp->v_writecount > 0) {
+ error = ETXTBSY;
+ } else {
+ /*
+ * If requested by fs, keep a use reference to the
+ * vnode until the last text reference is released.
+ */
+ mp = vp->v_mount;
+ if (mp != NULL && (mp->mnt_kern_flag & MNTK_TEXT_REFS) != 0 &&
+ vp->v_writecount == 0) {
+ vp->v_iflag |= VI_TEXT_REF;
+ vrefl(vp);
+ }
+
+ vp->v_writecount--;
+ error = 0;
+ }
+ VI_UNLOCK(vp);
+ return (error);
+}
+
+static int
+vop_stdunset_text(struct vop_unset_text_args *ap)
+{
+ struct vnode *vp;
+ int error;
+ bool last;
+
+ vp = ap->a_vp;
+ last = false;
+ VI_LOCK(vp);
+ if (vp->v_writecount < 0) {
+ if ((vp->v_iflag & VI_TEXT_REF) != 0 &&
+ vp->v_writecount == -1) {
+ last = true;
+ vp->v_iflag &= ~VI_TEXT_REF;
+ }
+ vp->v_writecount++;
+ error = 0;
+ } else {
+ error = EINVAL;
+ }
+ VI_UNLOCK(vp);
+ if (last)
+ vunref(vp);
+ return (error);
+}
+
+static int
+vop_stdadd_writecount(struct vop_add_writecount_args *ap)
+{
+ struct vnode *vp;
+ int error;
+
+ vp = ap->a_vp;
+ VI_LOCK_FLAGS(vp, MTX_DUPOK);
+ if (vp->v_writecount < 0) {
+ error = ETXTBSY;
+ } else {
+ VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp,
+ ("neg writecount increment %d", ap->a_inc));
+ vp->v_writecount += ap->a_inc;
+ error = 0;
+ }
+ VI_UNLOCK(vp);
+ return (error);
+}
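The three VOPs above share one sign convention on v_writecount: positive values count writers, negative values count text (executable) references, and the two exclude each other. A sketch of the interplay, vnode locking elided and vp hypothetical:

	error = VOP_SET_TEXT(vp);		/* v_writecount: 0 -> -1 */
	error = VOP_ADD_WRITECOUNT(vp, 1);	/* ETXTBSY while v_writecount < 0 */
	error = VOP_UNSET_TEXT(vp);		/* v_writecount: -1 -> 0 */
	error = VOP_ADD_WRITECOUNT(vp, 1);	/* succeeds: 0 -> 1; VOP_SET_TEXT() would now fail */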
+
+/*
+ * vfs default ops
+ * used to fill the vfs function table to get reasonable default return values.
+ */
+int
+vfs_stdroot (mp, flags, vpp)
+ struct mount *mp;
+ int flags;
+ struct vnode **vpp;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdstatfs (mp, sbp)
+ struct mount *mp;
+ struct statfs *sbp;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdquotactl (mp, cmds, uid, arg)
+ struct mount *mp;
+ int cmds;
+ uid_t uid;
+ void *arg;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdsync(mp, waitfor)
+ struct mount *mp;
+ int waitfor;
+{
+ struct vnode *vp, *mvp;
+ struct thread *td;
+ int error, lockreq, allerror = 0;
+
+ td = curthread;
+ lockreq = LK_EXCLUSIVE | LK_INTERLOCK;
+ if (waitfor != MNT_WAIT)
+ lockreq |= LK_NOWAIT;
+ /*
+ * Force stale buffer cache information to be flushed.
+ */
+loop:
+ MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
+ if (vp->v_bufobj.bo_dirty.bv_cnt == 0) {
+ VI_UNLOCK(vp);
+ continue;
+ }
+ if ((error = vget(vp, lockreq, td)) != 0) {
+ if (error == ENOENT) {
+ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
+ goto loop;
+ }
+ continue;
+ }
+ error = VOP_FSYNC(vp, waitfor, td);
+ if (error)
+ allerror = error;
+ vput(vp);
+ }
+ return (allerror);
+}
+
+int
+vfs_stdnosync (mp, waitfor)
+ struct mount *mp;
+ int waitfor;
+{
+
+ return (0);
+}
+
+int
+vfs_stdvget (mp, ino, flags, vpp)
+ struct mount *mp;
+ ino_t ino;
+ int flags;
+ struct vnode **vpp;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdfhtovp (mp, fhp, flags, vpp)
+ struct mount *mp;
+ struct fid *fhp;
+ int flags;
+ struct vnode **vpp;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdinit (vfsp)
+ struct vfsconf *vfsp;
+{
+
+ return (0);
+}
+
+int
+vfs_stduninit (vfsp)
+ struct vfsconf *vfsp;
+{
+
+ return(0);
+}
+
+int
+vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)
+ struct mount *mp;
+ int cmd;
+ struct vnode *filename_vp;
+ int attrnamespace;
+ const char *attrname;
+{
+
+ if (filename_vp != NULL)
+ VOP_UNLOCK(filename_vp, 0);
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdsysctl(mp, op, req)
+ struct mount *mp;
+ fsctlop_t op;
+ struct sysctl_req *req;
+{
+
+ return (EOPNOTSUPP);
+}
+
+/* end of vfs default ops */
diff --git a/freebsd/sys/kern/vfs_export.c b/freebsd/sys/kern/vfs_export.c
new file mode 100644
index 00000000..669d4e9f
--- /dev/null
+++ b/freebsd/sys/kern/vfs_export.c
@@ -0,0 +1,528 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/dirent.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/rmlock.h>
+#include <sys/refcount.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/vnode.h>
+
+#include <netinet/in.h>
+#include <net/radix.h>
+
+static MALLOC_DEFINE(M_NETADDR, "export_host", "Export host address structure");
+
+#if defined(INET) || defined(INET6)
+static struct radix_node_head *vfs_create_addrlist_af(
+ struct radix_node_head **prnh, int off);
+#endif
+static void vfs_free_addrlist(struct netexport *nep);
+static int vfs_free_netcred(struct radix_node *rn, void *w);
+static void vfs_free_addrlist_af(struct radix_node_head **prnh);
+static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
+ struct export_args *argp);
+static struct netcred *vfs_export_lookup(struct mount *, struct sockaddr *);
+
+/*
+ * Network address lookup element
+ */
+struct netcred {
+ struct radix_node netc_rnodes[2];
+ int netc_exflags;
+ struct ucred *netc_anon;
+ int netc_numsecflavors;
+ int netc_secflavors[MAXSECFLAVORS];
+};
+
+/*
+ * Network export information
+ */
+struct netexport {
+ struct netcred ne_defexported; /* Default export */
+ struct radix_node_head *ne4;
+ struct radix_node_head *ne6;
+};
+
+/*
+ * Build hash lists of net addresses and hang them off the mount point.
+ * Called by vfs_export() to set up the lists of export addresses.
+ */
+static int
+vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
+ struct export_args *argp)
+{
+ struct netcred *np;
+ struct radix_node_head *rnh;
+ int i;
+ struct radix_node *rn;
+ struct sockaddr *saddr, *smask = NULL;
+#if defined(INET6) || defined(INET)
+ int off;
+#endif
+ int error;
+
+ /*
+ * XXX: This routine converts from a `struct xucred'
+ * (argp->ex_anon) to a `struct ucred' (np->netc_anon). This
+ * operation is questionable; for example, what should be done
+ * with fields like cr_uidinfo and cr_prison? Currently, this
+ * routine does not touch them (leaves them as NULL).
+ */
+ if (argp->ex_anon.cr_version != XUCRED_VERSION) {
+ vfs_mount_error(mp, "ex_anon.cr_version: %d != %d",
+ argp->ex_anon.cr_version, XUCRED_VERSION);
+ return (EINVAL);
+ }
+
+ if (argp->ex_addrlen == 0) {
+ if (mp->mnt_flag & MNT_DEFEXPORTED) {
+ vfs_mount_error(mp,
+ "MNT_DEFEXPORTED already set for mount %p", mp);
+ return (EPERM);
+ }
+ np = &nep->ne_defexported;
+ np->netc_exflags = argp->ex_flags;
+ np->netc_anon = crget();
+ np->netc_anon->cr_uid = argp->ex_anon.cr_uid;
+ crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups,
+ argp->ex_anon.cr_groups);
+ np->netc_anon->cr_prison = &prison0;
+ prison_hold(np->netc_anon->cr_prison);
+ np->netc_numsecflavors = argp->ex_numsecflavors;
+ bcopy(argp->ex_secflavors, np->netc_secflavors,
+ sizeof(np->netc_secflavors));
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_DEFEXPORTED;
+ MNT_IUNLOCK(mp);
+ return (0);
+ }
+
+#if MSIZE <= 256
+ if (argp->ex_addrlen > MLEN) {
+ vfs_mount_error(mp, "ex_addrlen %d is greater than %d",
+ argp->ex_addrlen, MLEN);
+ return (EINVAL);
+ }
+#endif
+
+ i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
+ np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO);
+ saddr = (struct sockaddr *) (np + 1);
+ if ((error = copyin(argp->ex_addr, saddr, argp->ex_addrlen)))
+ goto out;
+ if (saddr->sa_family == AF_UNSPEC || saddr->sa_family > AF_MAX) {
+ error = EINVAL;
+ vfs_mount_error(mp, "Invalid saddr->sa_family: %d");
+ goto out;
+ }
+ if (saddr->sa_len > argp->ex_addrlen)
+ saddr->sa_len = argp->ex_addrlen;
+ if (argp->ex_masklen) {
+ smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
+ error = copyin(argp->ex_mask, smask, argp->ex_masklen);
+ if (error)
+ goto out;
+ if (smask->sa_len > argp->ex_masklen)
+ smask->sa_len = argp->ex_masklen;
+ }
+ rnh = NULL;
+ switch (saddr->sa_family) {
+#ifdef INET
+ case AF_INET:
+ if ((rnh = nep->ne4) == NULL) {
+ off = offsetof(struct sockaddr_in, sin_addr) << 3;
+ rnh = vfs_create_addrlist_af(&nep->ne4, off);
+ }
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ if ((rnh = nep->ne6) == NULL) {
+ off = offsetof(struct sockaddr_in6, sin6_addr) << 3;
+ rnh = vfs_create_addrlist_af(&nep->ne6, off);
+ }
+ break;
+#endif
+ }
+ if (rnh == NULL) {
+ error = ENOBUFS;
+ vfs_mount_error(mp, "%s %s %d",
+ "Unable to initialize radix node head ",
+ "for address family", saddr->sa_family);
+ goto out;
+ }
+ RADIX_NODE_HEAD_LOCK(rnh);
+ rn = (*rnh->rnh_addaddr)(saddr, smask, &rnh->rh, np->netc_rnodes);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ if (rn == NULL || np != (struct netcred *)rn) { /* already exists */
+ error = EPERM;
+ vfs_mount_error(mp,
+ "netcred already exists for given addr/mask");
+ goto out;
+ }
+ np->netc_exflags = argp->ex_flags;
+ np->netc_anon = crget();
+ np->netc_anon->cr_uid = argp->ex_anon.cr_uid;
+ crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups,
+ argp->ex_anon.cr_groups);
+ np->netc_anon->cr_prison = &prison0;
+ prison_hold(np->netc_anon->cr_prison);
+ np->netc_numsecflavors = argp->ex_numsecflavors;
+ bcopy(argp->ex_secflavors, np->netc_secflavors,
+ sizeof(np->netc_secflavors));
+ return (0);
+out:
+ free(np, M_NETADDR);
+ return (error);
+}
+
+/* Helper for vfs_free_addrlist. */
+/* ARGSUSED */
+static int
+vfs_free_netcred(struct radix_node *rn, void *w)
+{
+ struct radix_node_head *rnh = (struct radix_node_head *) w;
+ struct ucred *cred;
+
+ (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, &rnh->rh);
+ cred = ((struct netcred *)rn)->netc_anon;
+ if (cred != NULL)
+ crfree(cred);
+ free(rn, M_NETADDR);
+ return (0);
+}
+
+#if defined(INET) || defined(INET6)
+static struct radix_node_head *
+vfs_create_addrlist_af(struct radix_node_head **prnh, int off)
+{
+
+ if (rn_inithead((void **)prnh, off) == 0)
+ return (NULL);
+ RADIX_NODE_HEAD_LOCK_INIT(*prnh);
+ return (*prnh);
+}
+#endif
+
+static void
+vfs_free_addrlist_af(struct radix_node_head **prnh)
+{
+ struct radix_node_head *rnh;
+
+ rnh = *prnh;
+ RADIX_NODE_HEAD_LOCK(rnh);
+ (*rnh->rnh_walktree)(&rnh->rh, vfs_free_netcred, rnh);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ RADIX_NODE_HEAD_DESTROY(rnh);
+ rn_detachhead((void **)prnh);
+ prnh = NULL;
+}
+
+/*
+ * Free the net address hash lists that are hanging off the mount points.
+ */
+static void
+vfs_free_addrlist(struct netexport *nep)
+{
+ struct ucred *cred;
+
+ if (nep->ne4 != NULL)
+ vfs_free_addrlist_af(&nep->ne4);
+ if (nep->ne6 != NULL)
+ vfs_free_addrlist_af(&nep->ne6);
+
+ cred = nep->ne_defexported.netc_anon;
+ if (cred != NULL)
+ crfree(cred);
+
+}
+
+/*
+ * High level function to manipulate export options on a mount point
+ * and the passed in netexport.
+ * Struct export_args *argp is the variable used to twiddle options,
+ * the structure is described in sys/mount.h
+ */
+int
+vfs_export(struct mount *mp, struct export_args *argp)
+{
+ struct netexport *nep;
+ int error;
+
+ if (argp->ex_numsecflavors < 0
+ || argp->ex_numsecflavors >= MAXSECFLAVORS)
+ return (EINVAL);
+
+ error = 0;
+ lockmgr(&mp->mnt_explock, LK_EXCLUSIVE, NULL);
+ nep = mp->mnt_export;
+ if (argp->ex_flags & MNT_DELEXPORT) {
+ if (nep == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ if (mp->mnt_flag & MNT_EXPUBLIC) {
+ vfs_setpublicfs(NULL, NULL, NULL);
+ MNT_ILOCK(mp);
+ mp->mnt_flag &= ~MNT_EXPUBLIC;
+ MNT_IUNLOCK(mp);
+ }
+ vfs_free_addrlist(nep);
+ mp->mnt_export = NULL;
+ free(nep, M_MOUNT);
+ nep = NULL;
+ MNT_ILOCK(mp);
+ mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
+ MNT_IUNLOCK(mp);
+ }
+ if (argp->ex_flags & MNT_EXPORTED) {
+ if (nep == NULL) {
+ nep = malloc(sizeof(struct netexport), M_MOUNT, M_WAITOK | M_ZERO);
+ mp->mnt_export = nep;
+ }
+ if (argp->ex_flags & MNT_EXPUBLIC) {
+ if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
+ goto out;
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_EXPUBLIC;
+ MNT_IUNLOCK(mp);
+ }
+ if ((error = vfs_hang_addrlist(mp, nep, argp)))
+ goto out;
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_EXPORTED;
+ MNT_IUNLOCK(mp);
+ }
+
+out:
+ lockmgr(&mp->mnt_explock, LK_RELEASE, NULL);
+ /*
+ * Once we have executed the vfs_export() command, we do
+ * not want to keep the "export" option around in the
+ * options list, since that will cause subsequent MNT_UPDATE
+ * calls to fail. The export information is saved in
+ * mp->mnt_export, so we can safely delete the "export" mount option
+ * here.
+ */
+ vfs_deleteopt(mp->mnt_optnew, "export");
+ vfs_deleteopt(mp->mnt_opt, "export");
+ return (error);
+}
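A sketch of the caller side (roughly how the mount update path driven by mountd uses this interface; all field values are illustrative): clear any existing exports, then install a default read-only export.

	struct export_args ea;

	bzero(&ea, sizeof(ea));
	ea.ex_flags = MNT_DELEXPORT;
	(void)vfs_export(mp, &ea);		/* ENOENT if nothing was exported; ignored here */

	bzero(&ea, sizeof(ea));
	ea.ex_flags = MNT_EXPORTED | MNT_EXRDONLY;
	ea.ex_anon.cr_version = XUCRED_VERSION;	/* required by vfs_hang_addrlist() */
	error = vfs_export(mp, &ea);		/* ex_addrlen == 0 => default export */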
+
+/*
+ * Set the publicly exported filesystem (WebNFS). Currently, only
+ * one public filesystem is possible in the spec (RFC 2054 and 2055)
+ */
+int
+vfs_setpublicfs(struct mount *mp, struct netexport *nep,
+ struct export_args *argp)
+{
+ int error;
+ struct vnode *rvp;
+ char *cp;
+
+ /*
+ * mp == NULL -> invalidate the current info, the FS is
+ * no longer exported. May be called from either vfs_export
+ * or unmount, so check if it hasn't already been done.
+ */
+ if (mp == NULL) {
+ if (nfs_pub.np_valid) {
+ nfs_pub.np_valid = 0;
+ if (nfs_pub.np_index != NULL) {
+ free(nfs_pub.np_index, M_TEMP);
+ nfs_pub.np_index = NULL;
+ }
+ }
+ return (0);
+ }
+
+ /*
+ * Only one allowed at a time.
+ */
+ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
+ return (EBUSY);
+
+ /*
+ * Get real filehandle for root of exported FS.
+ */
+ bzero(&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
+ nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
+
+ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp)))
+ return (error);
+
+ if ((error = VOP_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
+ return (error);
+
+ vput(rvp);
+
+ /*
+ * If an indexfile was specified, pull it in.
+ */
+ if (argp->ex_indexfile != NULL) {
+ if (nfs_pub.np_index == NULL)
+ nfs_pub.np_index = malloc(MAXNAMLEN + 1, M_TEMP,
+ M_WAITOK);
+ error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
+ MAXNAMLEN, (size_t *)0);
+ if (!error) {
+ /*
+ * Check for illegal filenames.
+ */
+ for (cp = nfs_pub.np_index; *cp; cp++) {
+ if (*cp == '/') {
+ error = EINVAL;
+ break;
+ }
+ }
+ }
+ if (error) {
+ free(nfs_pub.np_index, M_TEMP);
+ nfs_pub.np_index = NULL;
+ return (error);
+ }
+ }
+
+ nfs_pub.np_mount = mp;
+ nfs_pub.np_valid = 1;
+ return (0);
+}
+
+/*
+ * Used by the filesystems to determine if a given network address
+ * (passed in 'nam') is present in their exports list, returns a pointer
+ * to struct netcred so that the filesystem can examine it for
+ * access rights (read/write/etc).
+ */
+static struct netcred *
+vfs_export_lookup(struct mount *mp, struct sockaddr *nam)
+{
+ RADIX_NODE_HEAD_RLOCK_TRACKER;
+ struct netexport *nep;
+ struct netcred *np = NULL;
+ struct radix_node_head *rnh;
+ struct sockaddr *saddr;
+
+ nep = mp->mnt_export;
+ if (nep == NULL)
+ return (NULL);
+ if ((mp->mnt_flag & MNT_EXPORTED) == 0)
+ return (NULL);
+
+ /*
+ * Lookup in the export list
+ */
+ if (nam != NULL) {
+ saddr = nam;
+ rnh = NULL;
+ switch (saddr->sa_family) {
+ case AF_INET:
+ rnh = nep->ne4;
+ break;
+ case AF_INET6:
+ rnh = nep->ne6;
+ break;
+ }
+ if (rnh != NULL) {
+ RADIX_NODE_HEAD_RLOCK(rnh);
+ np = (struct netcred *) (*rnh->rnh_matchaddr)(saddr, &rnh->rh);
+ RADIX_NODE_HEAD_RUNLOCK(rnh);
+ if (np != NULL && (np->netc_rnodes->rn_flags & RNF_ROOT) != 0)
+ return (NULL);
+ }
+ }
+
+ /*
+ * If no address match, use the default if it exists.
+ */
+ if (np == NULL && (mp->mnt_flag & MNT_DEFEXPORTED) != 0)
+ return (&nep->ne_defexported);
+
+ return (np);
+}
+
+/*
+ * XXX: This comment comes from the deprecated ufs_check_export()
+ * XXX: and may not entirely apply, but lacking something better:
+ * This is the generic part of fhtovp called after the underlying
+ * filesystem has validated the file handle.
+ *
+ * Verify that a host should have access to a filesystem.
+ */
+
+int
+vfs_stdcheckexp(struct mount *mp, struct sockaddr *nam, int *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int **secflavors)
+{
+ struct netcred *np;
+
+ lockmgr(&mp->mnt_explock, LK_SHARED, NULL);
+ np = vfs_export_lookup(mp, nam);
+ if (np == NULL) {
+ lockmgr(&mp->mnt_explock, LK_RELEASE, NULL);
+ *credanonp = NULL;
+ return (EACCES);
+ }
+ *extflagsp = np->netc_exflags;
+ if ((*credanonp = np->netc_anon) != NULL)
+ crhold(*credanonp);
+ if (numsecflavors)
+ *numsecflavors = np->netc_numsecflavors;
+ if (secflavors)
+ *secflavors = np->netc_secflavors;
+ lockmgr(&mp->mnt_explock, LK_RELEASE, NULL);
+ return (0);
+}
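A sketch of how a caller such as an NFS service would consult the export list for a client address nam (names illustrative; a real caller goes through the VFS_CHECKEXP() wrapper):

	struct ucred *credanon;
	int exflags, numsec, *secflavors;

	error = VFS_CHECKEXP(mp, nam, &exflags, &credanon, &numsec, &secflavors);
	if (error == 0) {
		if (exflags & MNT_EXRDONLY) {
			/* grant read-only access only */
		}
		if (credanon != NULL)
			crfree(credanon);	/* drop the reference taken by crhold() above */
	}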
+
diff --git a/freebsd/sys/kern/vfs_extattr.c b/freebsd/sys/kern/vfs_extattr.c
new file mode 100644
index 00000000..2903fd37
--- /dev/null
+++ b/freebsd/sys/kern/vfs_extattr.c
@@ -0,0 +1,757 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 1999-2001 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/capsicum.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/fcntl.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/limits.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/extattr.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+static int kern_extattr_set_path(struct thread *td, const char *path,
+ int attrnamespace, const char *attrname, void *data,
+ size_t nbytes, int follow);
+static int kern_extattr_get_path(struct thread *td, const char *path,
+ int attrnamespace, const char *attrname, void *data,
+ size_t nbytes, int follow);
+static int kern_extattr_delete_path(struct thread *td, const char *path,
+ int attrnamespace, const char *attrname, int follow);
+static int kern_extattr_list_path(struct thread *td, const char *path,
+ int attrnamespace, void *data, size_t nbytes, int follow);
+
+/*
+ * Syscall to push extended attribute configuration information into the VFS.
+ * Accepts a path, which it converts to a mountpoint, as well as a command
+ * (int cmd), and attribute name and misc data.
+ *
+ * Currently this is used only by UFS1 extended attributes.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct extattrctl_args {
+ const char *path;
+ int cmd;
+ const char *filename;
+ int attrnamespace;
+ const char *attrname;
+};
+#endif
+int
+sys_extattrctl(struct thread *td, struct extattrctl_args *uap)
+{
+ struct vnode *filename_vp;
+ struct nameidata nd;
+ struct mount *mp, *mp_writable;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_CMD(uap->cmd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ /*
+ * uap->attrname is not always defined. We check again later when we
+ * invoke the VFS call so as to pass in NULL there if needed.
+ */
+ if (uap->attrname != NULL) {
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
+ NULL);
+ if (error)
+ return (error);
+ }
+ AUDIT_ARG_TEXT(attrname);
+
+ mp = NULL;
+ filename_vp = NULL;
+ if (uap->filename != NULL) {
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE2,
+ UIO_USERSPACE, uap->filename, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ filename_vp = nd.ni_vp;
+ NDFREE(&nd, NDF_NO_VP_RELE);
+ }
+
+ /* uap->path is always defined. */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error)
+ goto out;
+ mp = nd.ni_vp->v_mount;
+ error = vfs_busy(mp, 0);
+ if (error) {
+ NDFREE(&nd, 0);
+ mp = NULL;
+ goto out;
+ }
+ VOP_UNLOCK(nd.ni_vp, 0);
+ error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH);
+ NDFREE(&nd, NDF_NO_VP_UNLOCK);
+ if (error)
+ goto out;
+ if (filename_vp != NULL) {
+ /*
+ * uap->filename is not always defined. If it is,
+ * grab a vnode lock, which VFS_EXTATTRCTL() will
+ * later release.
+ */
+ error = vn_lock(filename_vp, LK_EXCLUSIVE);
+ if (error) {
+ vn_finished_write(mp_writable);
+ goto out;
+ }
+ }
+
+ error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace,
+ uap->attrname != NULL ? attrname : NULL);
+
+ vn_finished_write(mp_writable);
+out:
+ if (mp != NULL)
+ vfs_unbusy(mp);
+
+ /*
+ * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp,
+ * so vrele it if it is defined.
+ */
+ if (filename_vp != NULL)
+ vrele(filename_vp);
+ return (error);
+}
+
+/*-
+ * Set a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", userspace buffer
+ * pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ void *data, size_t nbytes, struct thread *td)
+{
+ struct mount *mp;
+ struct uio auio;
+ struct iovec aiov;
+ ssize_t cnt;
+ int error;
+
+ if (nbytes > IOSIZE_MAX)
+ return (EINVAL);
+
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ cnt = nbytes;
+
+#ifdef MAC
+ error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace,
+ attrname);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
+ td->td_ucred, td);
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+
+#ifdef MAC
+done:
+#endif
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct extattr_set_fd_args {
+ int fd;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+};
+#endif
+int
+sys_extattr_set_fd(struct thread *td, struct extattr_set_fd_args *uap)
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ error = getvnode(td, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_SET), &fp);
+ if (error)
+ return (error);
+
+ error = extattr_set_vp(fp->f_vnode, uap->attrnamespace,
+ attrname, uap->data, uap->nbytes, td);
+ fdrop(fp, td);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct extattr_set_file_args {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+};
+#endif
+int
+sys_extattr_set_file(struct thread *td, struct extattr_set_file_args *uap)
+{
+
+ return (kern_extattr_set_path(td, uap->path, uap->attrnamespace,
+ uap->attrname, uap->data, uap->nbytes, FOLLOW));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct extattr_set_link_args {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+};
+#endif
+int
+sys_extattr_set_link(struct thread *td, struct extattr_set_link_args *uap)
+{
+
+ return (kern_extattr_set_path(td, uap->path, uap->attrnamespace,
+ uap->attrname, uap->data, uap->nbytes, NOFOLLOW));
+}
+
+static int
+kern_extattr_set_path(struct thread *td, const char *path, int attrnamespace,
+ const char *uattrname, void *data, size_t nbytes, int follow)
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(attrnamespace);
+ error = copyinstr(uattrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_set_vp(nd.ni_vp, attrnamespace, attrname, data,
+ nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
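At the libc level the path-based variants above surface as extattr_set_file(2) and extattr_set_link(2); a minimal userland sketch with a hypothetical path and attribute name:

	#include <sys/types.h>
	#include <sys/extattr.h>

	ssize_t n = extattr_set_file("/tmp/example", EXTATTR_NAMESPACE_USER,
	    "comment", "hello", 5);
	/* n == 5 (bytes written) on success, -1 with errno set on failure */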
+
+/*-
+ * Get a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", userspace buffer
+ * pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ void *data, size_t nbytes, struct thread *td)
+{
+ struct uio auio, *auiop;
+ struct iovec aiov;
+ ssize_t cnt;
+ size_t size, *sizep;
+ int error;
+
+ if (nbytes > IOSIZE_MAX)
+ return (EINVAL);
+
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+
+ /*
+ * Slightly unusual semantics: if the user provides a NULL data
+ * pointer, they don't want to receive the data, just the maximum
+ * read length.
+ */
+ auiop = NULL;
+ sizep = NULL;
+ cnt = 0;
+ if (data != NULL) {
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auiop = &auio;
+ cnt = nbytes;
+ } else
+ sizep = &size;
+
+#ifdef MAC
+ error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace,
+ attrname);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
+ td->td_ucred, td);
+
+ if (auiop != NULL) {
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+ } else
+ td->td_retval[0] = size;
+#ifdef MAC
+done:
+#endif
+ VOP_UNLOCK(vp, 0);
+ return (error);
+}
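The NULL-data convention noted inside extattr_get_vp() shows up in userland as the usual two-call pattern: size the attribute first, then fetch it. A sketch, error handling abbreviated and names hypothetical:

	ssize_t len;
	char *buf;

	len = extattr_get_file("/tmp/example", EXTATTR_NAMESPACE_USER,
	    "comment", NULL, 0);		/* NULL buffer: report required size only */
	if (len >= 0 && (buf = malloc(len)) != NULL)
		len = extattr_get_file("/tmp/example", EXTATTR_NAMESPACE_USER,
		    "comment", buf, len);	/* second call copies out the data */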
+
+#ifndef _SYS_SYSPROTO_H_
+struct extattr_get_fd_args {
+ int fd;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+};
+#endif
+int
+sys_extattr_get_fd(struct thread *td, struct extattr_get_fd_args *uap)
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ error = getvnode(td, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_GET), &fp);
+ if (error)
+ return (error);
+
+ error = extattr_get_vp(fp->f_vnode, uap->attrnamespace,
+ attrname, uap->data, uap->nbytes, td);
+
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct extattr_get_file_args {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+};
+#endif
+int
+sys_extattr_get_file(struct thread *td, struct extattr_get_file_args *uap)
+{
+ return (kern_extattr_get_path(td, uap->path, uap->attrnamespace,
+ uap->attrname, uap->data, uap->nbytes, FOLLOW));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct extattr_get_link_args {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+};
+#endif
+int
+sys_extattr_get_link(struct thread *td, struct extattr_get_link_args *uap)
+{
+ return (kern_extattr_get_path(td, uap->path, uap->attrnamespace,
+ uap->attrname, uap->data, uap->nbytes, NOFOLLOW));
+}
+
+static int
+kern_extattr_get_path(struct thread *td, const char *path, int attrnamespace,
+ const char *uattrname, void *data, size_t nbytes, int follow)
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(attrnamespace);
+ error = copyinstr(uattrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_get_vp(nd.ni_vp, attrnamespace, attrname, data,
+ nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * extattr_delete_vp(): Delete a named extended attribute on a file or
+ * directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", proc "p"
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ struct thread *td)
+{
+ struct mount *mp;
+ int error;
+
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
+#ifdef MAC
+ error = mac_vnode_check_deleteextattr(td->td_ucred, vp, attrnamespace,
+ attrname);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred,
+ td);
+ if (error == EOPNOTSUPP)
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
+ td->td_ucred, td);
+#ifdef MAC
+done:
+#endif
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct extattr_delete_fd_args {
+ int fd;
+ int attrnamespace;
+ const char *attrname;
+};
+#endif
+int
+sys_extattr_delete_fd(struct thread *td, struct extattr_delete_fd_args *uap)
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ error = getvnode(td, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_DELETE), &fp);
+ if (error)
+ return (error);
+
+ error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace,
+ attrname, td);
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct extattr_delete_file_args {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+};
+#endif
+int
+sys_extattr_delete_file(struct thread *td, struct extattr_delete_file_args *uap)
+{
+
+ return (kern_extattr_delete_path(td, uap->path, uap->attrnamespace,
+ uap->attrname, FOLLOW));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct extattr_delete_link_args {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+};
+#endif
+int
+sys_extattr_delete_link(struct thread *td, struct extattr_delete_link_args *uap)
+{
+
+ return (kern_extattr_delete_path(td, uap->path, uap->attrnamespace,
+ uap->attrname, NOFOLLOW));
+}
+
+static int
+kern_extattr_delete_path(struct thread *td, const char *path, int attrnamespace,
+ const char *uattrname, int follow)
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(attrnamespace);
+ error = copyinstr(uattrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_delete_vp(nd.ni_vp, attrnamespace, attrname, td);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*-
+ * Retrieve a list of extended attributes on a file or directory.
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * userspace buffer pointer "data", buffer length "nbytes",
+ * thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_list_vp(struct vnode *vp, int attrnamespace, void *data,
+ size_t nbytes, struct thread *td)
+{
+ struct uio auio, *auiop;
+ size_t size, *sizep;
+ struct iovec aiov;
+ ssize_t cnt;
+ int error;
+
+ if (nbytes > IOSIZE_MAX)
+ return (EINVAL);
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
+ auiop = NULL;
+ sizep = NULL;
+ cnt = 0;
+ if (data != NULL) {
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auiop = &auio;
+ cnt = nbytes;
+ } else
+ sizep = &size;
+
+#ifdef MAC
+ error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep,
+ td->td_ucred, td);
+
+ if (auiop != NULL) {
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+ } else
+ td->td_retval[0] = size;
+#ifdef MAC
+done:
+#endif
+ VOP_UNLOCK(vp, 0);
+ return (error);
+}
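
For illustration only (not part of the imported commit): extattr_list_vp() above returns the required size when the caller passes a NULL buffer (the sizep branch) and the number of bytes written otherwise, and the returned list is a sequence of entries each prefixed by a one-byte name length, with names not NUL-terminated. A hedged sketch of the usual two-pass pattern with extattr_list_file(2); the path is a made-up example.

#include <sys/types.h>
#include <sys/extattr.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	const char *path = "/tmp/example.txt";	/* hypothetical file */
	ssize_t need, got;
	char *buf;
	ssize_t i;

	/* Pass NULL to learn the required buffer size. */
	need = extattr_list_file(path, EXTATTR_NAMESPACE_USER, NULL, 0);
	if (need < 0)
		err(1, "extattr_list_file (size probe)");
	buf = malloc(need);
	if (buf == NULL)
		err(1, "malloc");
	/* Second pass fills the buffer with length-prefixed names. */
	got = extattr_list_file(path, EXTATTR_NAMESPACE_USER, buf, need);
	if (got < 0)
		err(1, "extattr_list_file");
	for (i = 0; i < got; i += (unsigned char)buf[i] + 1)
		printf("%.*s\n", (int)(unsigned char)buf[i], buf + i + 1);
	free(buf);
	return (0);
}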
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct extattr_list_fd_args {
+ int fd;
+ int attrnamespace;
+ void *data;
+ size_t nbytes;
+};
+#endif
+int
+sys_extattr_list_fd(struct thread *td, struct extattr_list_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = getvnode(td, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_LIST), &fp);
+ if (error)
+ return (error);
+
+ error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data,
+ uap->nbytes, td);
+
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct extattr_list_file_args {
+ const char *path;
+ int attrnamespace;
+ void *data;
+ size_t nbytes;
+};
+#endif
+int
+sys_extattr_list_file(struct thread *td, struct extattr_list_file_args *uap)
+{
+
+ return (kern_extattr_list_path(td, uap->path, uap->attrnamespace,
+ uap->data, uap->nbytes, FOLLOW));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct extattr_list_link_args {
+ const char *path;
+ int attrnamespace;
+ void *data;
+ size_t nbytes;
+};
+#endif
+int
+sys_extattr_list_link(struct thread *td, struct extattr_list_link_args *uap)
+{
+
+ return (kern_extattr_list_path(td, uap->path, uap->attrnamespace,
+ uap->data, uap->nbytes, NOFOLLOW));
+}
+
+static int
+kern_extattr_list_path(struct thread *td, const char *path, int attrnamespace,
+ void *data, size_t nbytes, int follow)
+{
+ struct nameidata nd;
+ int error;
+
+ AUDIT_ARG_VALUE(attrnamespace);
+ NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_list_vp(nd.ni_vp, attrnamespace, data, nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
diff --git a/freebsd/sys/kern/vfs_hash.c b/freebsd/sys/kern/vfs_hash.c
new file mode 100644
index 00000000..b938f485
--- /dev/null
+++ b/freebsd/sys/kern/vfs_hash.c
@@ -0,0 +1,234 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2005 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/rwlock.h>
+#include <sys/vnode.h>
+
+static MALLOC_DEFINE(M_VFS_HASH, "vfs_hash", "VFS hash table");
+
+static LIST_HEAD(vfs_hash_head, vnode) *vfs_hash_tbl;
+static LIST_HEAD(,vnode) vfs_hash_side;
+static u_long vfs_hash_mask;
+static struct rwlock vfs_hash_lock;
+
+static void
+vfs_hashinit(void *dummy __unused)
+{
+
+ vfs_hash_tbl = hashinit(desiredvnodes, M_VFS_HASH, &vfs_hash_mask);
+ rw_init(&vfs_hash_lock, "vfs hash");
+ LIST_INIT(&vfs_hash_side);
+}
+
+/* Must be SI_ORDER_SECOND so desiredvnodes is available */
+SYSINIT(vfs_hash, SI_SUB_VFS, SI_ORDER_SECOND, vfs_hashinit, NULL);
+
+u_int
+vfs_hash_index(struct vnode *vp)
+{
+
+ return (vp->v_hash + vp->v_mount->mnt_hashseed);
+}
+
+static struct vfs_hash_head *
+vfs_hash_bucket(const struct mount *mp, u_int hash)
+{
+
+ return (&vfs_hash_tbl[(hash + mp->mnt_hashseed) & vfs_hash_mask]);
+}
+
+int
+vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td,
+ struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+{
+ struct vnode *vp;
+ int error;
+
+ while (1) {
+ rw_rlock(&vfs_hash_lock);
+ LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) {
+ if (vp->v_hash != hash)
+ continue;
+ if (vp->v_mount != mp)
+ continue;
+ if (fn != NULL && fn(vp, arg))
+ continue;
+ vhold(vp);
+ rw_runlock(&vfs_hash_lock);
+ error = vget(vp, flags | LK_VNHELD, td);
+ if (error == ENOENT && (flags & LK_NOWAIT) == 0)
+ break;
+ if (error)
+ return (error);
+ *vpp = vp;
+ return (0);
+ }
+ if (vp == NULL) {
+ rw_runlock(&vfs_hash_lock);
+ *vpp = NULL;
+ return (0);
+ }
+ }
+}
+
+void
+vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td,
+ struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+{
+ struct vnode *vp;
+
+ while (1) {
+ rw_rlock(&vfs_hash_lock);
+ LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) {
+ if (vp->v_hash != hash)
+ continue;
+ if (vp->v_mount != mp)
+ continue;
+ if (fn != NULL && fn(vp, arg))
+ continue;
+ vhold(vp);
+ rw_runlock(&vfs_hash_lock);
+ vref(vp);
+ vdrop(vp);
+ *vpp = vp;
+ return;
+ }
+ if (vp == NULL) {
+ rw_runlock(&vfs_hash_lock);
+ *vpp = NULL;
+ return;
+ }
+ }
+}
+
+void
+vfs_hash_remove(struct vnode *vp)
+{
+
+ rw_wlock(&vfs_hash_lock);
+ LIST_REMOVE(vp, v_hashlist);
+ rw_wunlock(&vfs_hash_lock);
+}
+
+int
+vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td,
+ struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+{
+ struct vnode *vp2;
+ int error;
+
+ *vpp = NULL;
+ while (1) {
+ rw_wlock(&vfs_hash_lock);
+ LIST_FOREACH(vp2,
+ vfs_hash_bucket(vp->v_mount, hash), v_hashlist) {
+ if (vp2->v_hash != hash)
+ continue;
+ if (vp2->v_mount != vp->v_mount)
+ continue;
+ if (fn != NULL && fn(vp2, arg))
+ continue;
+ vhold(vp2);
+ rw_wunlock(&vfs_hash_lock);
+ error = vget(vp2, flags | LK_VNHELD, td);
+ if (error == ENOENT && (flags & LK_NOWAIT) == 0)
+ break;
+ rw_wlock(&vfs_hash_lock);
+ LIST_INSERT_HEAD(&vfs_hash_side, vp, v_hashlist);
+ rw_wunlock(&vfs_hash_lock);
+ vput(vp);
+ if (!error)
+ *vpp = vp2;
+ return (error);
+ }
+ if (vp2 == NULL)
+ break;
+
+ }
+ vp->v_hash = hash;
+ LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist);
+ rw_wunlock(&vfs_hash_lock);
+ return (0);
+}
+
+void
+vfs_hash_rehash(struct vnode *vp, u_int hash)
+{
+
+ rw_wlock(&vfs_hash_lock);
+ LIST_REMOVE(vp, v_hashlist);
+ LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist);
+ vp->v_hash = hash;
+ rw_wunlock(&vfs_hash_lock);
+}
+
+void
+vfs_hash_changesize(int newmaxvnodes)
+{
+ struct vfs_hash_head *vfs_hash_newtbl, *vfs_hash_oldtbl;
+ u_long vfs_hash_newmask, vfs_hash_oldmask;
+ struct vnode *vp;
+ int i;
+
+ vfs_hash_newtbl = hashinit(newmaxvnodes, M_VFS_HASH,
+ &vfs_hash_newmask);
+ /* If same hash table size, nothing to do */
+ if (vfs_hash_mask == vfs_hash_newmask) {
+ free(vfs_hash_newtbl, M_VFS_HASH);
+ return;
+ }
+ /*
+ * Move everything from the old hash table to the new table.
+ * None of the vnodes in the table can be recycled because to
+ * do so, they have to be removed from the hash table.
+ */
+ rw_wlock(&vfs_hash_lock);
+ vfs_hash_oldtbl = vfs_hash_tbl;
+ vfs_hash_oldmask = vfs_hash_mask;
+ vfs_hash_tbl = vfs_hash_newtbl;
+ vfs_hash_mask = vfs_hash_newmask;
+ for (i = 0; i <= vfs_hash_oldmask; i++) {
+ while ((vp = LIST_FIRST(&vfs_hash_oldtbl[i])) != NULL) {
+ LIST_REMOVE(vp, v_hashlist);
+ LIST_INSERT_HEAD(
+ vfs_hash_bucket(vp->v_mount, vp->v_hash),
+ vp, v_hashlist);
+ }
+ }
+ rw_wunlock(&vfs_hash_lock);
+ free(vfs_hash_oldtbl, M_VFS_HASH);
+}
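
For illustration only (not part of the imported commit): the vfs_hash API above gives filesystems a shared (mount, hash) -> vnode table. The sketch below shows the conventional VFS_VGET-style usage, built from the signatures in this file; all "myfs_*" names, VTOI(), and the elided vnode initialization are hypothetical.

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/vnode.h>

static struct vop_vector myfs_vnodeops;	/* hypothetical vnode operations vector */

/* Comparison callback: return 0 when vp matches the inode number in *arg. */
static int
myfs_cmp(struct vnode *vp, void *arg)
{
	return (VTOI(vp)->i_number != *(ino_t *)arg);	/* VTOI() is hypothetical */
}

static int
myfs_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
{
	struct vnode *vp;
	u_int hash;
	int error;

	hash = (u_int)ino;
	/* Fast path: the vnode may already sit in the global VFS hash. */
	error = vfs_hash_get(mp, hash, flags, curthread, vpp, myfs_cmp, &ino);
	if (error != 0 || *vpp != NULL)
		return (error);

	/* Slow path: allocate a fresh vnode (v_data setup, insmntque(), etc. elided). */
	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	/* Publish it; on a lost race vfs_hash_insert() releases vp and returns the winner. */
	error = vfs_hash_insert(vp, hash, flags, curthread, vpp, myfs_cmp, &ino);
	if (error != 0 || *vpp != NULL)
		return (error);
	*vpp = vp;
	return (0);
}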
diff --git a/freebsd/sys/kern/vfs_init.c b/freebsd/sys/kern/vfs_init.c
new file mode 100644
index 00000000..5eb38e6d
--- /dev/null
+++ b/freebsd/sys/kern/vfs_init.c
@@ -0,0 +1,376 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed
+ * to Berkeley by John Heidemann of the UCLA Ficus project.
+ *
+ * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fnv_hash.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/sx.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+
+static int vfs_register(struct vfsconf *);
+static int vfs_unregister(struct vfsconf *);
+
+MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
+
+/*
+ * The highest defined VFS number.
+ */
+int maxvfsconf = VFS_GENERIC + 1;
+
+/*
+ * Single-linked list of configured VFSes.
+ * New entries are added/deleted by vfs_register()/vfs_unregister()
+ */
+struct vfsconfhead vfsconf = TAILQ_HEAD_INITIALIZER(vfsconf);
+struct sx vfsconf_sx;
+SX_SYSINIT(vfsconf, &vfsconf_sx, "vfsconf");
+
+/*
+ * Loader.conf variable vfs.typenumhash enables setting vfc_typenum using a hash
+ * calculation on vfc_name, so that it doesn't change when file systems are
+ * loaded in a different order. This keeps NFS server file handles from
+ * changing for file systems that use vfc_typenum in their fsid.
+ */
+static int vfs_typenumhash = 1;
+SYSCTL_INT(_vfs, OID_AUTO, typenumhash, CTLFLAG_RDTUN, &vfs_typenumhash, 0,
+ "Set vfc_typenum using a hash calculation on vfc_name, so that it does not"
+ "change when file systems are loaded in a different order.");
+
+/*
+ * A Zen vnode attribute structure.
+ *
+ * Initialized when the first filesystem registers by vfs_register().
+ */
+struct vattr va_null;
+
+/*
+ * vfs_init.c
+ *
+ * Allocate and fill in operations vectors.
+ *
+ * An undocumented feature of this approach to defining operations is that
+ * there can be multiple entries in vfs_opv_descs for the same operations
+ * vector. This allows third parties to extend the set of operations
+ * supported by another layer in a binary compatible way. For example,
+ * assume that NFS needed to be modified to support Ficus. NFS has an entry
+ * (probably nfs_vnodeop_decls) declaring all the operations NFS supports by
+ * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_extensions)
+ * listing those new operations Ficus adds to NFS, all without modifying the
+ * NFS code. (Of course, the OTW NFS protocol still needs to be munged, but
+ * that is a(whole)nother story.) This is a feature.
+ */
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+
+static struct vfsconf *
+vfs_byname_locked(const char *name)
+{
+ struct vfsconf *vfsp;
+
+ sx_assert(&vfsconf_sx, SA_LOCKED);
+ if (!strcmp(name, "ffs"))
+ name = "ufs";
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
+ if (!strcmp(name, vfsp->vfc_name))
+ return (vfsp);
+ }
+ return (NULL);
+}
+
+struct vfsconf *
+vfs_byname(const char *name)
+{
+ struct vfsconf *vfsp;
+
+ vfsconf_slock();
+ vfsp = vfs_byname_locked(name);
+ vfsconf_sunlock();
+ return (vfsp);
+}
+
+struct vfsconf *
+vfs_byname_kld(const char *fstype, struct thread *td, int *error)
+{
+ struct vfsconf *vfsp;
+ int fileid, loaded;
+
+ vfsp = vfs_byname(fstype);
+ if (vfsp != NULL)
+ return (vfsp);
+
+ /* Try to load the respective module. */
+ *error = kern_kldload(td, fstype, &fileid);
+ loaded = (*error == 0);
+ if (*error == EEXIST)
+ *error = 0;
+ if (*error)
+ return (NULL);
+
+ /* Look up again to see if the VFS was loaded. */
+ vfsp = vfs_byname(fstype);
+ if (vfsp == NULL) {
+ if (loaded)
+ (void)kern_kldunload(td, fileid, LINKER_UNLOAD_FORCE);
+ *error = ENODEV;
+ return (NULL);
+ }
+ return (vfsp);
+}
+
+
+/* Register a new filesystem type in the global table */
+static int
+vfs_register(struct vfsconf *vfc)
+{
+ struct sysctl_oid *oidp;
+ struct vfsops *vfsops;
+ static int once;
+ struct vfsconf *tvfc;
+ uint32_t hashval;
+ int secondpass;
+
+ if (!once) {
+ vattr_null(&va_null);
+ once = 1;
+ }
+
+ if (vfc->vfc_version != VFS_VERSION) {
+ printf("ERROR: filesystem %s, unsupported ABI version %x\n",
+ vfc->vfc_name, vfc->vfc_version);
+ return (EINVAL);
+ }
+ vfsconf_lock();
+ if (vfs_byname_locked(vfc->vfc_name) != NULL) {
+ vfsconf_unlock();
+ return (EEXIST);
+ }
+
+ if (vfs_typenumhash != 0) {
+ /*
+ * Calculate a hash on vfc_name to use for vfc_typenum. Unless
+ * all of 1<->255 are assigned, it is limited to 8 bits since
+ * that is what ZFS uses from vfc_typenum and is also the
+ * preferred range for vfs_getnewfsid().
+ */
+ hashval = fnv_32_str(vfc->vfc_name, FNV1_32_INIT);
+ hashval &= 0xff;
+ secondpass = 0;
+ do {
+ /* Look for and fix any collision. */
+ TAILQ_FOREACH(tvfc, &vfsconf, vfc_list) {
+ if (hashval == tvfc->vfc_typenum) {
+ if (hashval == 255 && secondpass == 0) {
+ hashval = 1;
+ secondpass = 1;
+ } else
+ hashval++;
+ break;
+ }
+ }
+ } while (tvfc != NULL);
+ vfc->vfc_typenum = hashval;
+ if (vfc->vfc_typenum >= maxvfsconf)
+ maxvfsconf = vfc->vfc_typenum + 1;
+ } else
+ vfc->vfc_typenum = maxvfsconf++;
+ TAILQ_INSERT_TAIL(&vfsconf, vfc, vfc_list);
+
+ /*
+ * Initialise unused ``struct vfsops'' fields, to use
+ * the vfs_std*() functions. Note, we need the mount
+ * and unmount operations, at the least. The check
+ * for vfsops available is just a debugging aid.
+ */
+ KASSERT(vfc->vfc_vfsops != NULL,
+ ("Filesystem %s has no vfsops", vfc->vfc_name));
+ /*
+ * Check the mount and unmount operations.
+ */
+ vfsops = vfc->vfc_vfsops;
+ KASSERT(vfsops->vfs_mount != NULL,
+ ("Filesystem %s has no mount op", vfc->vfc_name));
+ KASSERT(vfsops->vfs_unmount != NULL,
+ ("Filesystem %s has no unmount op", vfc->vfc_name));
+
+ if (vfsops->vfs_root == NULL)
+ /* return file system's root vnode */
+ vfsops->vfs_root = vfs_stdroot;
+ if (vfsops->vfs_quotactl == NULL)
+ /* quota control */
+ vfsops->vfs_quotactl = vfs_stdquotactl;
+ if (vfsops->vfs_statfs == NULL)
+ /* return file system's status */
+ vfsops->vfs_statfs = vfs_stdstatfs;
+ if (vfsops->vfs_sync == NULL)
+ /*
+ * flush unwritten data (nosync)
+ * file systems can use vfs_stdsync
+ * explicitly by setting it in the
+ * vfsop vector.
+ */
+ vfsops->vfs_sync = vfs_stdnosync;
+ if (vfsops->vfs_vget == NULL)
+ /* convert an inode number to a vnode */
+ vfsops->vfs_vget = vfs_stdvget;
+ if (vfsops->vfs_fhtovp == NULL)
+ /* turn an NFS file handle into a vnode */
+ vfsops->vfs_fhtovp = vfs_stdfhtovp;
+ if (vfsops->vfs_checkexp == NULL)
+ /* check if file system is exported */
+ vfsops->vfs_checkexp = vfs_stdcheckexp;
+ if (vfsops->vfs_init == NULL)
+ /* file system specific initialisation */
+ vfsops->vfs_init = vfs_stdinit;
+ if (vfsops->vfs_uninit == NULL)
+ /* file system specific uninitialisation */
+ vfsops->vfs_uninit = vfs_stduninit;
+ if (vfsops->vfs_extattrctl == NULL)
+ /* extended attribute control */
+ vfsops->vfs_extattrctl = vfs_stdextattrctl;
+ if (vfsops->vfs_sysctl == NULL)
+ vfsops->vfs_sysctl = vfs_stdsysctl;
+
+ if (vfc->vfc_flags & VFCF_JAIL)
+ prison_add_vfs(vfc);
+
+ /*
+ * Call init function for this VFS...
+ */
+ (*(vfc->vfc_vfsops->vfs_init))(vfc);
+ vfsconf_unlock();
+
+ /*
+ * If this filesystem has a sysctl node under vfs
+ * (i.e. vfs.xxfs), then change the oid number of that node to
+ * match the filesystem's type number. This allows user code
+ * which uses the type number to read sysctl variables defined
+ * by the filesystem to continue working. Since the oids are
+ * in a sorted list, we need to make sure the order is
+ * preserved by re-registering the oid after modifying its
+ * number.
+ */
+ sysctl_wlock();
+ SLIST_FOREACH(oidp, SYSCTL_CHILDREN(&sysctl___vfs), oid_link) {
+ if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) {
+ sysctl_unregister_oid(oidp);
+ oidp->oid_number = vfc->vfc_typenum;
+ sysctl_register_oid(oidp);
+ break;
+ }
+ }
+ sysctl_wunlock();
+
+ return (0);
+}
+
+
+/* Remove registration of a filesystem type */
+static int
+vfs_unregister(struct vfsconf *vfc)
+{
+ struct vfsconf *vfsp;
+ int error, maxtypenum;
+
+ vfsconf_lock();
+ vfsp = vfs_byname_locked(vfc->vfc_name);
+ if (vfsp == NULL) {
+ vfsconf_unlock();
+ return (EINVAL);
+ }
+ if (vfsp->vfc_refcount != 0) {
+ vfsconf_unlock();
+ return (EBUSY);
+ }
+ if (vfc->vfc_vfsops->vfs_uninit != NULL) {
+ error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp);
+ if (error != 0) {
+ vfsconf_unlock();
+ return (error);
+ }
+ }
+ TAILQ_REMOVE(&vfsconf, vfsp, vfc_list);
+ maxtypenum = VFS_GENERIC;
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
+ if (maxtypenum < vfsp->vfc_typenum)
+ maxtypenum = vfsp->vfc_typenum;
+ maxvfsconf = maxtypenum + 1;
+ vfsconf_unlock();
+ return (0);
+}
+
+/*
+ * Standard kernel module handling code for filesystem modules.
+ * Referenced from VFS_SET().
+ */
+int
+vfs_modevent(module_t mod, int type, void *data)
+{
+ struct vfsconf *vfc;
+ int error = 0;
+
+ vfc = (struct vfsconf *)data;
+
+ switch (type) {
+ case MOD_LOAD:
+ if (vfc)
+ error = vfs_register(vfc);
+ break;
+
+ case MOD_UNLOAD:
+ if (vfc)
+ error = vfs_unregister(vfc);
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return (error);
+}
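
For illustration only (not part of the imported commit): vfs_modevent() above is reached through the VFS_SET() macro that filesystem modules use to declare themselves, after which vfs_register() fills in any missing vfs_std*() defaults. A hedged sketch of the module side, with hypothetical "myfs_*" handlers:

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/mount.h>

static vfs_mount_t	myfs_mount;	/* hypothetical; mount and unmount are mandatory */
static vfs_unmount_t	myfs_unmount;
static vfs_statfs_t	myfs_statfs;	/* optional; vfs_register() would default to vfs_stdstatfs */

static struct vfsops myfs_vfsops = {
	.vfs_mount =	myfs_mount,
	.vfs_unmount =	myfs_unmount,
	.vfs_statfs =	myfs_statfs,
	/* vfs_root, vfs_vget, ... left NULL: filled with vfs_std*() by vfs_register(). */
};

/* Expands to a vfsconf plus a module declaration wired to vfs_modevent(). */
VFS_SET(myfs_vfsops, myfs, 0);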
diff --git a/freebsd/sys/kern/vfs_lookup.c b/freebsd/sys/kern/vfs_lookup.c
new file mode 100644
index 00000000..5ee3f219
--- /dev/null
+++ b/freebsd/sys/kern/vfs_lookup.c
@@ -0,0 +1,1450 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/capsicum.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#include <sys/sdt.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/uma.h>
+
+#define NAMEI_DIAGNOSTIC 1
+#undef NAMEI_DIAGNOSTIC
+
+SDT_PROVIDER_DECLARE(vfs);
+SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *",
+ "unsigned long");
+SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *");
+
+/* Allocation zone for namei. */
+uma_zone_t namei_zone;
+
+/* Placeholder vnode for mp traversal. */
+static struct vnode *vp_crossmp;
+
+static int
+crossmp_vop_islocked(struct vop_islocked_args *ap)
+{
+
+ return (LK_SHARED);
+}
+
+static int
+crossmp_vop_lock1(struct vop_lock1_args *ap)
+{
+ struct vnode *vp;
+ struct lock *lk __unused;
+ const char *file __unused;
+ int flags, line __unused;
+
+ vp = ap->a_vp;
+ lk = vp->v_vnlock;
+ flags = ap->a_flags;
+ file = ap->a_file;
+ line = ap->a_line;
+
+ if ((flags & LK_SHARED) == 0)
+ panic("invalid lock request for crossmp");
+
+ WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line,
+ flags & LK_INTERLOCK ? &VI_MTX(vp)->lock_object : NULL);
+ WITNESS_LOCK(&lk->lock_object, 0, file, line);
+ if ((flags & LK_INTERLOCK) != 0)
+ VI_UNLOCK(vp);
+ LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, line);
+ return (0);
+}
+
+static int
+crossmp_vop_unlock(struct vop_unlock_args *ap)
+{
+ struct vnode *vp;
+ struct lock *lk __unused;
+ int flags;
+
+ vp = ap->a_vp;
+ lk = vp->v_vnlock;
+ flags = ap->a_flags;
+
+ if ((flags & LK_INTERLOCK) != 0)
+ VI_UNLOCK(vp);
+ WITNESS_UNLOCK(&lk->lock_object, 0, LOCK_FILE, LOCK_LINE);
+ LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE,
+ LOCK_LINE);
+ return (0);
+}
+
+static struct vop_vector crossmp_vnodeops = {
+ .vop_default = &default_vnodeops,
+ .vop_islocked = crossmp_vop_islocked,
+ .vop_lock1 = crossmp_vop_lock1,
+ .vop_unlock = crossmp_vop_unlock,
+};
+
+struct nameicap_tracker {
+ struct vnode *dp;
+ TAILQ_ENTRY(nameicap_tracker) nm_link;
+};
+
+/* Zone for cap mode tracker elements used for dotdot capability checks. */
+static uma_zone_t nt_zone;
+
+static void
+nameiinit(void *dummy __unused)
+{
+
+ namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ nt_zone = uma_zcreate("rentr", sizeof(struct nameicap_tracker),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ getnewvnode("crossmp", NULL, &crossmp_vnodeops, &vp_crossmp);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
+
+static int lookup_cap_dotdot = 1;
+SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN,
+ &lookup_cap_dotdot, 0,
+ "enables \"..\" components in path lookup in capability mode");
+static int lookup_cap_dotdot_nonlocal = 1;
+SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN,
+ &lookup_cap_dotdot_nonlocal, 0,
+ "enables \"..\" components in path lookup in capability mode "
+ "on non-local mount");
+
+static void
+nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp)
+{
+ struct nameicap_tracker *nt;
+
+ if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR)
+ return;
+ nt = uma_zalloc(nt_zone, M_WAITOK);
+ vhold(dp);
+ nt->dp = dp;
+ TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
+}
+
+static void
+nameicap_cleanup(struct nameidata *ndp)
+{
+ struct nameicap_tracker *nt, *nt1;
+
+ KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) ||
+ (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative"));
+ TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
+ TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link);
+ vdrop(nt->dp);
+ uma_zfree(nt_zone, nt);
+ }
+}
+
+/*
+ * For dotdot lookups in capability mode, only allow the component
+ * lookup to succeed if the resulting directory was already traversed
+ * during the operation. Also fail dotdot lookups for non-local
+ * filesystems, where external agents might assist local lookups to
+ * escape the compartment.
+ */
+static int
+nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
+{
+ struct nameicap_tracker *nt;
+ struct mount *mp;
+
+ if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp == NULL ||
+ dp->v_type != VDIR)
+ return (0);
+ mp = dp->v_mount;
+ if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL &&
+ (mp->mnt_flag & MNT_LOCAL) == 0)
+ return (ENOTCAPABLE);
+ TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head,
+ nm_link) {
+ if (dp == nt->dp)
+ return (0);
+ }
+ return (ENOTCAPABLE);
+}
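
For illustration only (not part of the imported commit): a userspace sketch of the policy enforced by nameicap_check_dotdot() above. With the vfs.lookup_cap_dotdot sysctl enabled, a ".." component that resolves to a directory already traversed during the same lookup is allowed, while one that would escape the starting directory is expected to fail with ENOTCAPABLE (per the code above). The paths are made up.

#include <sys/capsicum.h>
#include <err.h>
#include <fcntl.h>

int
main(void)
{
	int dfd, fd;

	dfd = open("/tmp/sandbox", O_RDONLY | O_DIRECTORY);	/* hypothetical tree */
	if (dfd == -1)
		err(1, "open");
	if (cap_enter() == -1)
		err(1, "cap_enter");

	/* "sub/.." resolves back to a directory we already traversed: allowed. */
	fd = openat(dfd, "sub/../file", O_RDONLY);
	if (fd == -1)
		warn("openat(sub/../file)");

	/* ".." straight out of dfd would escape the compartment: ENOTCAPABLE. */
	if (openat(dfd, "../file", O_RDONLY) == -1)
		warn("openat(../file)");
	return (0);
}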
+
+static void
+namei_cleanup_cnp(struct componentname *cnp)
+{
+
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+}
+
+static int
+namei_handle_root(struct nameidata *ndp, struct vnode **dpp)
+{
+ struct componentname *cnp;
+
+ cnp = &ndp->ni_cnd;
+ if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0) {
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
+ return (ENOTCAPABLE);
+ }
+ while (*(cnp->cn_nameptr) == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+ *dpp = ndp->ni_rootdir;
+ vrefact(*dpp);
+ return (0);
+}
+
+/*
+ * Convert a pathname into a pointer to a locked vnode.
+ *
+ * The FOLLOW flag is set when symbolic links are to be followed
+ * when they occur at the end of the name translation process.
+ * Symbolic links are always followed for all other pathname
+ * components other than the last.
+ *
+ * The segflg defines whether the name is to be copied from user
+ * space or kernel space.
+ *
+ * Overall outline of namei:
+ *
+ * copy in name
+ * get starting directory
+ * while (!done && !error) {
+ * call lookup to search path.
+ * if symbolic link, massage name in buffer and continue
+ * }
+ */
+int
+namei(struct nameidata *ndp)
+{
+ struct filedesc *fdp; /* pointer to file descriptor state */
+ char *cp; /* pointer into pathname argument */
+ struct vnode *dp; /* the directory we are searching */
+ struct iovec aiov; /* uio for reading symbolic links */
+ struct componentname *cnp;
+ struct file *dfp;
+ struct thread *td;
+ struct proc *p;
+ cap_rights_t rights;
+ struct uio auio;
+ int error, linklen, startdir_used;
+
+ cnp = &ndp->ni_cnd;
+ td = cnp->cn_thread;
+ p = td->td_proc;
+ ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
+ KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
+ KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
+ ("namei: nameiop contaminated with flags"));
+ KASSERT((cnp->cn_flags & OPMASK) == 0,
+ ("namei: flags contaminated with nameiops"));
+ MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
+ ndp->ni_startdir->v_type == VBAD);
+ fdp = p->p_fd;
+ TAILQ_INIT(&ndp->ni_cap_tracker);
+ ndp->ni_lcf = 0;
+
+ /* We will set this ourselves if we need it. */
+ cnp->cn_flags &= ~TRAILINGSLASH;
+
+ /*
+ * Get a buffer for the name to be translated, and copy the
+ * name into the buffer.
+ */
+ if ((cnp->cn_flags & HASBUF) == 0)
+ cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
+ if (ndp->ni_segflg == UIO_SYSSPACE)
+ error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+ &ndp->ni_pathlen);
+ else
+ error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+ &ndp->ni_pathlen);
+
+ /*
+ * Don't allow empty pathnames.
+ */
+ if (error == 0 && *cnp->cn_pnbuf == '\0')
+ error = ENOENT;
+
+#ifdef CAPABILITY_MODE
+ /*
+ * In capability mode, lookups must be restricted to happen in
+ * the subtree with the root specified by the file descriptor:
+ * - The root must be a real file descriptor, not the pseudo-descriptor
+ *   AT_FDCWD.
+ * - The passed path must be relative and not absolute.
+ * - If lookup_cap_dotdot is disabled, the path must not contain any
+ *   '..' components.
+ * - If lookup_cap_dotdot is enabled, we verify that every '..'
+ *   component lookup resolves to a directory that was previously
+ *   walked by us, which prevents an escape from the relative root.
+ */
+ if (error == 0 && IN_CAPABILITY_MODE(td) &&
+ (cnp->cn_flags & NOCAPCHECK) == 0) {
+ ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
+ if (ndp->ni_dirfd == AT_FDCWD) {
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
+ error = ECAPMODE;
+ }
+ }
+#endif
+ if (error != 0) {
+ namei_cleanup_cnp(cnp);
+ ndp->ni_vp = NULL;
+ return (error);
+ }
+ ndp->ni_loopcnt = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_NAMEI)) {
+ KASSERT(cnp->cn_thread == curthread,
+ ("namei not using curthread"));
+ ktrnamei(cnp->cn_pnbuf);
+ }
+#endif
+ /*
+ * Get starting point for the translation.
+ */
+ FILEDESC_SLOCK(fdp);
+ ndp->ni_rootdir = fdp->fd_rdir;
+ vrefact(ndp->ni_rootdir);
+ ndp->ni_topdir = fdp->fd_jdir;
+
+ /*
+ * If we are auditing the kernel pathname, save the user pathname.
+ */
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG_UPATH1(td, ndp->ni_dirfd, cnp->cn_pnbuf);
+ if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG_UPATH2(td, ndp->ni_dirfd, cnp->cn_pnbuf);
+
+ startdir_used = 0;
+ dp = NULL;
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+ if (cnp->cn_pnbuf[0] == '/') {
+ ndp->ni_resflags |= NIRES_ABS;
+ error = namei_handle_root(ndp, &dp);
+ } else {
+ if (ndp->ni_startdir != NULL) {
+ dp = ndp->ni_startdir;
+ startdir_used = 1;
+ } else if (ndp->ni_dirfd == AT_FDCWD) {
+ dp = fdp->fd_cdir;
+ vrefact(dp);
+ } else {
+ rights = ndp->ni_rightsneeded;
+ cap_rights_set(&rights, CAP_LOOKUP);
+
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG_ATFD1(ndp->ni_dirfd);
+ if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG_ATFD2(ndp->ni_dirfd);
+ /*
+ * Effectively inlined fgetvp_rights, because we need to
+ * inspect the file as well as grabbing the vnode.
+ */
+ error = fget_cap_locked(fdp, ndp->ni_dirfd, &rights,
+ &dfp, &ndp->ni_filecaps);
+ if (error != 0) {
+ /*
+ * Preserve the error; it should either be EBADF
+ * or capability-related, both of which can be
+ * safely returned to the caller.
+ */
+ } else if (dfp->f_ops == &badfileops) {
+ error = EBADF;
+ } else if (dfp->f_vnode == NULL) {
+ error = ENOTDIR;
+ } else {
+ dp = dfp->f_vnode;
+ vrefact(dp);
+
+ if ((dfp->f_flag & FSEARCH) != 0)
+ cnp->cn_flags |= NOEXECCHECK;
+ }
+#ifdef CAPABILITIES
+ /*
+ * If file descriptor doesn't have all rights,
+ * all lookups relative to it must also be
+ * strictly relative.
+ */
+ CAP_ALL(&rights);
+ if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights,
+ &rights) ||
+ ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
+ ndp->ni_filecaps.fc_nioctls != -1) {
+ ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
+ }
+#endif
+ }
+ if (error == 0 && dp->v_type != VDIR)
+ error = ENOTDIR;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ if (ndp->ni_startdir != NULL && !startdir_used)
+ vrele(ndp->ni_startdir);
+ if (error != 0) {
+ if (dp != NULL)
+ vrele(dp);
+ goto out;
+ }
+ if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 &&
+ lookup_cap_dotdot != 0)
+ ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
+ SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
+ cnp->cn_flags);
+ for (;;) {
+ ndp->ni_startdir = dp;
+ error = lookup(ndp);
+ if (error != 0)
+ goto out;
+ /*
+ * If not a symbolic link, we're done.
+ */
+ if ((cnp->cn_flags & ISSYMLINK) == 0) {
+ vrele(ndp->ni_rootdir);
+ if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
+ namei_cleanup_cnp(cnp);
+ } else
+ cnp->cn_flags |= HASBUF;
+ nameicap_cleanup(ndp);
+ SDT_PROBE2(vfs, namei, lookup, return, 0, ndp->ni_vp);
+ return (0);
+ }
+ if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
+ error = ELOOP;
+ break;
+ }
+#ifdef MAC
+ if ((cnp->cn_flags & NOMACCHECK) == 0) {
+ error = mac_vnode_check_readlink(td->td_ucred,
+ ndp->ni_vp);
+ if (error != 0)
+ break;
+ }
+#endif
+ if (ndp->ni_pathlen > 1)
+ cp = uma_zalloc(namei_zone, M_WAITOK);
+ else
+ cp = cnp->cn_pnbuf;
+ aiov.iov_base = cp;
+ aiov.iov_len = MAXPATHLEN;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = MAXPATHLEN;
+ error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
+ if (error != 0) {
+ if (ndp->ni_pathlen > 1)
+ uma_zfree(namei_zone, cp);
+ break;
+ }
+ linklen = MAXPATHLEN - auio.uio_resid;
+ if (linklen == 0) {
+ if (ndp->ni_pathlen > 1)
+ uma_zfree(namei_zone, cp);
+ error = ENOENT;
+ break;
+ }
+ if (linklen + ndp->ni_pathlen > MAXPATHLEN) {
+ if (ndp->ni_pathlen > 1)
+ uma_zfree(namei_zone, cp);
+ error = ENAMETOOLONG;
+ break;
+ }
+ if (ndp->ni_pathlen > 1) {
+ bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+ cnp->cn_pnbuf = cp;
+ } else
+ cnp->cn_pnbuf[linklen] = '\0';
+ ndp->ni_pathlen += linklen;
+ vput(ndp->ni_vp);
+ dp = ndp->ni_dvp;
+ /*
+ * Check if root directory should replace current directory.
+ */
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+ if (*(cnp->cn_nameptr) == '/') {
+ vrele(dp);
+ error = namei_handle_root(ndp, &dp);
+ if (error != 0)
+ goto out;
+ }
+ }
+ vput(ndp->ni_vp);
+ ndp->ni_vp = NULL;
+ vrele(ndp->ni_dvp);
+out:
+ vrele(ndp->ni_rootdir);
+ namei_cleanup_cnp(cnp);
+ nameicap_cleanup(ndp);
+ SDT_PROBE2(vfs, namei, lookup, return, error, NULL);
+ return (error);
+}
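
For illustration only (not part of the imported commit): the conventional in-kernel calling sequence for namei(), mirroring the kern_extattr_*_path() callers earlier in this import; the path is a made-up example.

#include <sys/param.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/vnode.h>

static int
example_lookup(struct thread *td)
{
	struct nameidata nd;
	int error;

	/* Follow symlinks and return the leaf locked; path copied from kernel space. */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE,
	    "/etc/motd", td);
	error = namei(&nd);
	if (error != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);	/* free only the pathname buffer */

	/* ... use nd.ni_vp here; it is locked because LOCKLEAF was requested ... */

	vput(nd.ni_vp);			/* unlock and drop the reference */
	return (0);
}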
+
+static int
+compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags)
+{
+
+ if (mp == NULL || ((lkflags & LK_SHARED) &&
+ (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) ||
+ ((cnflags & ISDOTDOT) &&
+ (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) {
+ lkflags &= ~LK_SHARED;
+ lkflags |= LK_EXCLUSIVE;
+ }
+ lkflags |= LK_NODDLKTREAT;
+ return (lkflags);
+}
+
+static __inline int
+needs_exclusive_leaf(struct mount *mp, int flags)
+{
+
+ /*
+ * Intermediate nodes can use shared locks; we only need to
+ * force an exclusive lock for leaf nodes.
+ */
+ if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF))
+ return (0);
+
+ /* Always use exclusive locks if LOCKSHARED isn't set. */
+ if (!(flags & LOCKSHARED))
+ return (1);
+
+ /*
+ * For lookups during open(), if the mount point supports
+ * extended shared operations, then use a shared lock for the
+ * leaf node, otherwise use an exclusive lock.
+ */
+ if ((flags & ISOPEN) != 0)
+ return (!MNT_EXTENDED_SHARED(mp));
+
+ /*
+ * Lookup requests outside of open() that specify LOCKSHARED
+ * only need a shared lock on the leaf vnode.
+ */
+ return (0);
+}
+
+/*
+ * Search a pathname.
+ * This is a very central and rather complicated routine.
+ *
+ * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
+ * The starting directory is taken from ni_startdir. The pathname is
+ * descended until done, or a symbolic link is encountered. The variable
+ * ni_more is clear if the path is completed; it is set to one if a
+ * symbolic link needing interpretation is encountered.
+ *
+ * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
+ * whether the name is to be looked up, created, renamed, or deleted.
+ * When CREATE, RENAME, or DELETE is specified, information usable in
+ * creating, renaming, or deleting a directory entry may be calculated.
+ * If flag has LOCKPARENT or'ed into it, the parent directory is returned
+ * locked. If flag has WANTPARENT or'ed into it, the parent directory is
+ * returned unlocked. Otherwise the parent directory is not returned. If
+ * the target of the pathname exists and LOCKLEAF is or'ed into the flag
+ * the target is returned locked, otherwise it is returned unlocked.
+ * When creating or renaming and LOCKPARENT is specified, the target may not
+ * be ".". When deleting and LOCKPARENT is specified, the target may be ".".
+ *
+ * Overall outline of lookup:
+ *
+ * dirloop:
+ * identify next component of name at ndp->ni_ptr
+ * handle degenerate case where name is null string
+ * if .. and crossing mount points and on mounted filesys, find parent
+ * call VOP_LOOKUP routine for next component name
+ * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
+ * component vnode returned in ni_vp (if it exists), locked.
+ * if result vnode is mounted on and crossing mount points,
+ * find mounted on vnode
+ * if more components of name, do next level at dirloop
+ * return the answer in ni_vp, locked if LOCKLEAF set
+ * if LOCKPARENT set, return locked parent in ni_dvp
+ * if WANTPARENT set, return unlocked parent in ni_dvp
+ */
+int
+lookup(struct nameidata *ndp)
+{
+ char *cp; /* pointer into pathname argument */
+ char *prev_ni_next; /* saved ndp->ni_next */
+ struct vnode *dp = NULL; /* the directory we are searching */
+ struct vnode *tdp; /* saved dp */
+ struct mount *mp; /* mount table entry */
+ struct prison *pr;
+ size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */
+ int docache; /* == 0 do not cache last component */
+ int wantparent; /* 1 => wantparent or lockparent flag */
+ int rdonly; /* lookup read-only flag bit */
+ int error = 0;
+ int dpunlocked = 0; /* dp has already been unlocked */
+ int relookup = 0; /* do not consume the path component */
+ struct componentname *cnp = &ndp->ni_cnd;
+ int lkflags_save;
+ int ni_dvp_unlocked;
+
+ /*
+ * Setup: break out flag bits into variables.
+ */
+ ni_dvp_unlocked = 0;
+ wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
+ KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
+ ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
+ docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
+ if (cnp->cn_nameiop == DELETE ||
+ (wantparent && cnp->cn_nameiop != CREATE &&
+ cnp->cn_nameiop != LOOKUP))
+ docache = 0;
+ rdonly = cnp->cn_flags & RDONLY;
+ cnp->cn_flags &= ~ISSYMLINK;
+ ndp->ni_dvp = NULL;
+ /*
+ * We use shared locks until we hit the parent of the last cn then
+ * we adjust based on the requesting flags.
+ */
+ cnp->cn_lkflags = LK_SHARED;
+ dp = ndp->ni_startdir;
+ ndp->ni_startdir = NULLVP;
+ vn_lock(dp,
+ compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY,
+ cnp->cn_flags));
+
+dirloop:
+ /*
+ * Search a new directory.
+ *
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+ continue;
+ cnp->cn_namelen = cp - cnp->cn_nameptr;
+ if (cnp->cn_namelen > NAME_MAX) {
+ error = ENAMETOOLONG;
+ goto bad;
+ }
+#ifdef NAMEI_DIAGNOSTIC
+ { char c = *cp;
+ *cp = '\0';
+ printf("{%s}: ", cnp->cn_nameptr);
+ *cp = c; }
+#endif
+ prev_ni_pathlen = ndp->ni_pathlen;
+ ndp->ni_pathlen -= cnp->cn_namelen;
+ KASSERT(ndp->ni_pathlen <= PATH_MAX,
+ ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
+ prev_ni_next = ndp->ni_next;
+ ndp->ni_next = cp;
+
+ /*
+ * Replace multiple slashes by a single slash and trailing slashes
+ * by a null. This must be done before VOP_LOOKUP() because some
+ * fs's don't know about trailing slashes. Remember if there were
+ * trailing slashes to handle symlinks, existing non-directories
+ * and non-existing files that won't be directories specially later.
+ */
+ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
+ cp++;
+ ndp->ni_pathlen--;
+ if (*cp == '\0') {
+ *ndp->ni_next = '\0';
+ cnp->cn_flags |= TRAILINGSLASH;
+ }
+ }
+ ndp->ni_next = cp;
+
+ cnp->cn_flags |= MAKEENTRY;
+ if (*cp == '\0' && docache == 0)
+ cnp->cn_flags &= ~MAKEENTRY;
+ if (cnp->cn_namelen == 2 &&
+ cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
+ cnp->cn_flags |= ISDOTDOT;
+ else
+ cnp->cn_flags &= ~ISDOTDOT;
+ if (*ndp->ni_next == 0)
+ cnp->cn_flags |= ISLASTCN;
+ else
+ cnp->cn_flags &= ~ISLASTCN;
+
+ if ((cnp->cn_flags & ISLASTCN) != 0 &&
+ cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EINVAL;
+ goto bad;
+ }
+
+ nameicap_tracker_add(ndp, dp);
+
+ /*
+ * Check for degenerate name (e.g. / or "")
+ * which is a way of talking about a directory,
+ * e.g. like "/." or ".".
+ */
+ if (cnp->cn_nameptr[0] == '\0') {
+ if (dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad;
+ }
+ if (cnp->cn_nameiop != LOOKUP) {
+ error = EISDIR;
+ goto bad;
+ }
+ if (wantparent) {
+ ndp->ni_dvp = dp;
+ VREF(dp);
+ }
+ ndp->ni_vp = dp;
+
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG_VNODE1(dp);
+ else if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG_VNODE2(dp);
+
+ if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
+ VOP_UNLOCK(dp, 0);
+ /* XXX This should probably move to the top of function. */
+ if (cnp->cn_flags & SAVESTART)
+ panic("lookup: SAVESTART");
+ goto success;
+ }
+
+ /*
+ * Handle "..": five special cases.
+ * 0. If doing a capability lookup and lookup_cap_dotdot is
+ * disabled, return ENOTCAPABLE.
+ * 1. Return an error if this is the last component of
+ * the name and the operation is DELETE or RENAME.
+ * 2. If at root directory (e.g. after chroot)
+ * or at absolute root directory
+ * then ignore it so can't get out.
+ * 3. If this vnode is the root of a mounted
+ * filesystem, then replace it with the
+ * vnode which was mounted on so we take the
+ * .. in the other filesystem.
+ * 4. If the vnode is the top directory of
+ * the jail or chroot, don't let them out.
+ * 5. If doing a capability lookup and lookup_cap_dotdot is
+ * enabled, return ENOTCAPABLE if the lookup would escape
+ * from the initial file descriptor directory. Checks are
+ * done by ensuring that namei() already traversed the
+ * result of dotdot lookup.
+ */
+ if (cnp->cn_flags & ISDOTDOT) {
+ if ((ndp->ni_lcf & (NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT))
+ == NI_LCF_STRICTRELATIVE) {
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
+ error = ENOTCAPABLE;
+ goto bad;
+ }
+ if ((cnp->cn_flags & ISLASTCN) != 0 &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EINVAL;
+ goto bad;
+ }
+ for (;;) {
+ for (pr = cnp->cn_cred->cr_prison; pr != NULL;
+ pr = pr->pr_parent)
+ if (dp == pr->pr_root)
+ break;
+ if (dp == ndp->ni_rootdir ||
+ dp == ndp->ni_topdir ||
+ dp == rootvnode ||
+ pr != NULL ||
+ ((dp->v_vflag & VV_ROOT) != 0 &&
+ (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
+ ndp->ni_dvp = dp;
+ ndp->ni_vp = dp;
+ VREF(dp);
+ goto nextname;
+ }
+ if ((dp->v_vflag & VV_ROOT) == 0)
+ break;
+ if (dp->v_iflag & VI_DOOMED) { /* forced unmount */
+ error = ENOENT;
+ goto bad;
+ }
+ tdp = dp;
+ dp = dp->v_mount->mnt_vnodecovered;
+ VREF(dp);
+ vput(tdp);
+ vn_lock(dp,
+ compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
+ LK_RETRY, ISDOTDOT));
+ error = nameicap_check_dotdot(ndp, dp);
+ if (error != 0) {
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
+ goto bad;
+ }
+ }
+ }
+
+ /*
+ * We now have a segment name to search for, and a directory to search.
+ */
+unionlookup:
+#ifdef MAC
+ if ((cnp->cn_flags & NOMACCHECK) == 0) {
+ error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp,
+ cnp);
+ if (error)
+ goto bad;
+ }
+#endif
+ ndp->ni_dvp = dp;
+ ndp->ni_vp = NULL;
+ ASSERT_VOP_LOCKED(dp, "lookup");
+ /*
+ * If we have a shared lock we may need to upgrade the lock for the
+ * last operation.
+ */
+ if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) &&
+ dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED)
+ vn_lock(dp, LK_UPGRADE|LK_RETRY);
+ if ((dp->v_iflag & VI_DOOMED) != 0) {
+ error = ENOENT;
+ goto bad;
+ }
+ /*
+ * If we're looking up the last component and we need an exclusive
+ * lock, adjust our lkflags.
+ */
+ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags))
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+#ifdef NAMEI_DIAGNOSTIC
+ vn_printf(dp, "lookup in ");
+#endif
+ lkflags_save = cnp->cn_lkflags;
+ cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags,
+ cnp->cn_flags);
+ error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp);
+ cnp->cn_lkflags = lkflags_save;
+ if (error != 0) {
+ KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
+#ifdef NAMEI_DIAGNOSTIC
+ printf("not found\n");
+#endif
+ if ((error == ENOENT) &&
+ (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
+ (dp->v_mount->mnt_flag & MNT_UNION)) {
+ tdp = dp;
+ dp = dp->v_mount->mnt_vnodecovered;
+ VREF(dp);
+ vput(tdp);
+ vn_lock(dp,
+ compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
+ LK_RETRY, cnp->cn_flags));
+ nameicap_tracker_add(ndp, dp);
+ goto unionlookup;
+ }
+
+ if (error == ERELOOKUP) {
+ vref(dp);
+ ndp->ni_vp = dp;
+ error = 0;
+ relookup = 1;
+ goto good;
+ }
+
+ if (error != EJUSTRETURN)
+ goto bad;
+ /*
+ * At this point, we know we're at the end of the
+ * pathname. If creating / renaming, we can consider
+ * allowing the file or directory to be created / renamed,
+ * provided we're not on a read-only filesystem.
+ */
+ if (rdonly) {
+ error = EROFS;
+ goto bad;
+ }
+ /* trailing slash only allowed for directories */
+ if ((cnp->cn_flags & TRAILINGSLASH) &&
+ !(cnp->cn_flags & WILLBEDIR)) {
+ error = ENOENT;
+ goto bad;
+ }
+ if ((cnp->cn_flags & LOCKPARENT) == 0)
+ VOP_UNLOCK(dp, 0);
+ /*
+ * We return with ni_vp NULL to indicate that the entry
+ * doesn't currently exist, leaving a pointer to the
+ * (possibly locked) directory vnode in ndp->ni_dvp.
+ */
+ if (cnp->cn_flags & SAVESTART) {
+ ndp->ni_startdir = ndp->ni_dvp;
+ VREF(ndp->ni_startdir);
+ }
+ goto success;
+ }
+
+good:
+#ifdef NAMEI_DIAGNOSTIC
+ printf("found\n");
+#endif
+ dp = ndp->ni_vp;
+
+ /*
+ * Check to see if the vnode has been mounted on;
+ * if so find the root of the mounted filesystem.
+ */
+ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
+ (cnp->cn_flags & NOCROSSMOUNT) == 0) {
+ if (vfs_busy(mp, 0))
+ continue;
+ vput(dp);
+ if (dp != ndp->ni_dvp)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ vrefact(vp_crossmp);
+ ndp->ni_dvp = vp_crossmp;
+ error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags,
+ cnp->cn_flags), &tdp);
+ vfs_unbusy(mp);
+ if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
+ panic("vp_crossmp exclusively locked or reclaimed");
+ if (error) {
+ dpunlocked = 1;
+ goto bad2;
+ }
+ ndp->ni_vp = dp = tdp;
+ }
+
+ /*
+ * Check for symbolic link
+ */
+ if ((dp->v_type == VLNK) &&
+ ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) ||
+ *ndp->ni_next == '/')) {
+ cnp->cn_flags |= ISSYMLINK;
+ if (dp->v_iflag & VI_DOOMED) {
+ /*
+ * We can't know whether the directory was mounted with
+ * NOSYMFOLLOW, so we can't follow safely.
+ */
+ error = ENOENT;
+ goto bad2;
+ }
+ if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
+ error = EACCES;
+ goto bad2;
+ }
+ /*
+ * Symlink code always expects an unlocked dvp.
+ */
+ if (ndp->ni_dvp != ndp->ni_vp) {
+ VOP_UNLOCK(ndp->ni_dvp, 0);
+ ni_dvp_unlocked = 1;
+ }
+ goto success;
+ }
+
+nextname:
+ /*
+ * Not a symbolic link that we will follow. Continue with the
+ * next component if there is any; otherwise, we're done.
+ */
+ KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
+ ("lookup: invalid path state."));
+ if (relookup) {
+ relookup = 0;
+ ndp->ni_pathlen = prev_ni_pathlen;
+ ndp->ni_next = prev_ni_next;
+ if (ndp->ni_dvp != dp)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ goto dirloop;
+ }
+ if (cnp->cn_flags & ISDOTDOT) {
+ error = nameicap_check_dotdot(ndp, ndp->ni_vp);
+ if (error != 0) {
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
+ goto bad2;
+ }
+ }
+ if (*ndp->ni_next == '/') {
+ cnp->cn_nameptr = ndp->ni_next;
+ while (*cnp->cn_nameptr == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+ if (ndp->ni_dvp != dp)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ goto dirloop;
+ }
+ /*
+ * If we're processing a path with a trailing slash,
+ * check that the end result is a directory.
+ */
+ if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad2;
+ }
+ /*
+ * Disallow directory write attempts on read-only filesystems.
+ */
+ if (rdonly &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EROFS;
+ goto bad2;
+ }
+ if (cnp->cn_flags & SAVESTART) {
+ ndp->ni_startdir = ndp->ni_dvp;
+ VREF(ndp->ni_startdir);
+ }
+ if (!wantparent) {
+ ni_dvp_unlocked = 2;
+ if (ndp->ni_dvp != dp)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ } else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
+ VOP_UNLOCK(ndp->ni_dvp, 0);
+ ni_dvp_unlocked = 1;
+ }
+
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG_VNODE1(dp);
+ else if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG_VNODE2(dp);
+
+ if ((cnp->cn_flags & LOCKLEAF) == 0)
+ VOP_UNLOCK(dp, 0);
+success:
+ /*
+ * Because of shared lookup we may have the vnode shared locked, but
+ * the caller may want it to be exclusively locked.
+ */
+ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) &&
+ VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
+ vn_lock(dp, LK_UPGRADE | LK_RETRY);
+ if (dp->v_iflag & VI_DOOMED) {
+ error = ENOENT;
+ goto bad2;
+ }
+ }
+ return (0);
+
+bad2:
+ if (ni_dvp_unlocked != 2) {
+ if (dp != ndp->ni_dvp && !ni_dvp_unlocked)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ }
+bad:
+ if (!dpunlocked)
+ vput(dp);
+ ndp->ni_vp = NULL;
+ return (error);
+}
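
For illustration only (not part of the imported commit): a hedged sketch of the CREATE-side contract documented above. When the last component does not exist, namei()/lookup() return success with ni_vp == NULL and, with LOCKPARENT, the locked parent directory in ni_dvp, ready for VOP_CREATE(). Write-suspension handling (vn_start_write()) and several edge cases are elided; the attribute values are made-up examples.

#include <sys/param.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/vnode.h>

static int
example_create(struct thread *td, const char *path)
{
	struct nameidata nd;
	struct vattr va;
	int error;

	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME, UIO_SYSSPACE, path, td);
	error = namei(&nd);
	if (error != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		/* Name exists: leaf is referenced (unlocked), parent is locked. */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vrele(nd.ni_vp);
		vput(nd.ni_dvp);
		return (EEXIST);
	}
	VATTR_NULL(&va);
	va.va_type = VREG;
	va.va_mode = 0644;
	/* SAVENAME keeps the component name valid for the VOP. */
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &va);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	if (error == 0)
		vput(nd.ni_vp);		/* the new vnode comes back locked */
	return (error);
}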
+
+/*
+ * relookup - lookup a path name component
+ * Used by lookup to re-acquire things.
+ */
+int
+relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
+{
+ struct vnode *dp = NULL; /* the directory we are searching */
+ int wantparent; /* 1 => wantparent or lockparent flag */
+ int rdonly; /* lookup read-only flag bit */
+ int error = 0;
+
+ KASSERT(cnp->cn_flags & ISLASTCN,
+ ("relookup: Not given last component."));
+ /*
+ * Setup: break out flag bits into variables.
+ */
+ wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
+ KASSERT(wantparent, ("relookup: parent not wanted."));
+ rdonly = cnp->cn_flags & RDONLY;
+ cnp->cn_flags &= ~ISSYMLINK;
+ dp = dvp;
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+ vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
+
+ /*
+ * Search a new directory.
+ *
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+#ifdef NAMEI_DIAGNOSTIC
+ printf("{%s}: ", cnp->cn_nameptr);
+#endif
+
+ /*
+ * Check for "" which represents the root directory after slash
+ * removal.
+ */
+ if (cnp->cn_nameptr[0] == '\0') {
+ /*
+ * Support only LOOKUP for "/" because lookup()
+ * can't succeed for CREATE, DELETE and RENAME.
+ */
+ KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP"));
+ KASSERT(dp->v_type == VDIR, ("dp is not a directory"));
+
+ if (!(cnp->cn_flags & LOCKLEAF))
+ VOP_UNLOCK(dp, 0);
+ *vpp = dp;
+ /* XXX This should probably move to the top of function. */
+ if (cnp->cn_flags & SAVESTART)
+ panic("lookup: SAVESTART");
+ return (0);
+ }
+
+ if (cnp->cn_flags & ISDOTDOT)
+ panic("relookup: lookup on dot-dot");
+
+ /*
+ * We now have a segment name to search for, and a directory to search.
+ */
+#ifdef NAMEI_DIAGNOSTIC
+ vn_printf(dp, "search in ");
+#endif
+ if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
+ KASSERT(*vpp == NULL, ("leaf should be empty"));
+ if (error != EJUSTRETURN)
+ goto bad;
+ /*
+ * If creating and at end of pathname, then can consider
+ * allowing file to be created.
+ */
+ if (rdonly) {
+ error = EROFS;
+ goto bad;
+ }
+ /* ASSERT(dvp == ndp->ni_startdir) */
+ if (cnp->cn_flags & SAVESTART)
+ VREF(dvp);
+ if ((cnp->cn_flags & LOCKPARENT) == 0)
+ VOP_UNLOCK(dp, 0);
+ /*
+ * We return with ni_vp NULL to indicate that the entry
+ * doesn't currently exist, leaving a pointer to the
+ * (possibly locked) directory vnode in ndp->ni_dvp.
+ */
+ return (0);
+ }
+
+ dp = *vpp;
+
+ /*
+ * Disallow directory write attempts on read-only filesystems.
+ */
+ if (rdonly &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ if (dvp == dp)
+ vrele(dvp);
+ else
+ vput(dvp);
+ error = EROFS;
+ goto bad;
+ }
+ /*
+ * Set the parent lock/ref state to the requested state.
+ */
+ if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
+ if (wantparent)
+ VOP_UNLOCK(dvp, 0);
+ else
+ vput(dvp);
+ } else if (!wantparent)
+ vrele(dvp);
+ /*
+ * Check for symbolic link
+ */
+ KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
+ ("relookup: symlink found.\n"));
+
+ /* ASSERT(dvp == ndp->ni_startdir) */
+ if (cnp->cn_flags & SAVESTART)
+ VREF(dvp);
+
+ if ((cnp->cn_flags & LOCKLEAF) == 0)
+ VOP_UNLOCK(dp, 0);
+ return (0);
+bad:
+ vput(dp);
+ *vpp = NULL;
+ return (error);
+}
+
+void
+NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, enum uio_seg segflg,
+ const char *namep, int dirfd, struct vnode *startdir, cap_rights_t *rightsp,
+ struct thread *td)
+{
+
+ ndp->ni_cnd.cn_nameiop = op;
+ ndp->ni_cnd.cn_flags = flags;
+ ndp->ni_segflg = segflg;
+ ndp->ni_dirp = namep;
+ ndp->ni_dirfd = dirfd;
+ ndp->ni_startdir = startdir;
+ ndp->ni_resflags = 0;
+ if (rightsp != NULL)
+ ndp->ni_rightsneeded = *rightsp;
+ else
+ cap_rights_init(&ndp->ni_rightsneeded);
+ filecaps_init(&ndp->ni_filecaps);
+ ndp->ni_cnd.cn_thread = td;
+}
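+
+/*
+ * Illustrative usage: NDINIT_ALL() is normally reached through the NDINIT()
+ * family of macros in sys/sys/namei.h, which typically supply AT_FDCWD for
+ * dirfd, NULL for startdir and no extra capability rights, as the mount code
+ * later in this import does:
+ *
+ *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE,
+ *	    fspath, td);
+ *	error = namei(&nd);
+ */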
+
+/*
+ * Free data allocated by namei(); see namei(9) for details.
+ */
+void
+NDFREE(struct nameidata *ndp, const u_int flags)
+{
+ int unlock_dvp;
+ int unlock_vp;
+
+ unlock_dvp = 0;
+ unlock_vp = 0;
+
+ if (!(flags & NDF_NO_FREE_PNBUF) &&
+ (ndp->ni_cnd.cn_flags & HASBUF)) {
+ uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
+ ndp->ni_cnd.cn_flags &= ~HASBUF;
+ }
+ if (!(flags & NDF_NO_VP_UNLOCK) &&
+ (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
+ unlock_vp = 1;
+ if (!(flags & NDF_NO_DVP_UNLOCK) &&
+ (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
+ ndp->ni_dvp != ndp->ni_vp)
+ unlock_dvp = 1;
+ if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
+ if (unlock_vp) {
+ vput(ndp->ni_vp);
+ unlock_vp = 0;
+ } else
+ vrele(ndp->ni_vp);
+ ndp->ni_vp = NULL;
+ }
+ if (unlock_vp)
+ VOP_UNLOCK(ndp->ni_vp, 0);
+ if (!(flags & NDF_NO_DVP_RELE) &&
+ (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
+ if (unlock_dvp) {
+ vput(ndp->ni_dvp);
+ unlock_dvp = 0;
+ } else
+ vrele(ndp->ni_dvp);
+ ndp->ni_dvp = NULL;
+ }
+ if (unlock_dvp)
+ VOP_UNLOCK(ndp->ni_dvp, 0);
+ if (!(flags & NDF_NO_STARTDIR_RELE) &&
+ (ndp->ni_cnd.cn_flags & SAVESTART)) {
+ vrele(ndp->ni_startdir);
+ ndp->ni_startdir = NULL;
+ }
+}
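+
+/*
+ * Illustrative usage: callers that only want the pathname buffer released,
+ * and manage the vnode references themselves, pass NDF_ONLY_PNBUF, as the
+ * mount code below does after a successful namei():
+ *
+ *	NDFREE(&nd, NDF_ONLY_PNBUF);
+ *	vp = nd.ni_vp;
+ */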
+
+/*
+ * Determine if there is a suitable alternate filename under the specified
+ * prefix for the specified path. If the create flag is set, then the
+ * alternate prefix will be used so long as the parent directory exists.
+ * This is used by the various compatibility ABIs so that Linux binaries prefer
+ * files under /compat/linux for example. The chosen path (whether under
+ * the prefix or under /) is returned in a kernel malloc'd buffer pointed
+ * to by pathbuf. The caller is responsible for freeing the buffer from
+ * the M_TEMP bucket if one is returned.
+ */
+int
+kern_alternate_path(struct thread *td, const char *prefix, const char *path,
+ enum uio_seg pathseg, char **pathbuf, int create, int dirfd)
+{
+ struct nameidata nd, ndroot;
+ char *ptr, *buf, *cp;
+ size_t len, sz;
+ int error;
+
+ buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ *pathbuf = buf;
+
+ /* Copy the prefix into the new pathname as a starting point. */
+ len = strlcpy(buf, prefix, MAXPATHLEN);
+ if (len >= MAXPATHLEN) {
+ *pathbuf = NULL;
+ free(buf, M_TEMP);
+ return (EINVAL);
+ }
+ sz = MAXPATHLEN - len;
+ ptr = buf + len;
+
+ /* Append the filename to the prefix. */
+ if (pathseg == UIO_SYSSPACE)
+ error = copystr(path, ptr, sz, &len);
+ else
+ error = copyinstr(path, ptr, sz, &len);
+
+ if (error) {
+ *pathbuf = NULL;
+ free(buf, M_TEMP);
+ return (error);
+ }
+
+ /* Only use a prefix with absolute pathnames. */
+ if (*ptr != '/') {
+ error = EINVAL;
+ goto keeporig;
+ }
+
+ if (dirfd != AT_FDCWD) {
+ /*
+ * We want the original because the "prefix" is
+ * included in the already opened dirfd.
+ */
+ bcopy(ptr, buf, len);
+ return (0);
+ }
+
+ /*
+ * We know that there is a / somewhere in this pathname.
+ * Search backwards for it, to find the file's parent directory
+ * and check whether it exists in the alternate tree. If it does,
+ * and we want to create a file (create is set), use the alternate
+ * path. We don't need to worry about the root comparison in this
+ * case.
+ */
+
+ if (create) {
+ for (cp = &ptr[len] - 1; *cp != '/'; cp--);
+ *cp = '\0';
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td);
+ error = namei(&nd);
+ *cp = '/';
+ if (error != 0)
+ goto keeporig;
+ } else {
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td);
+
+ error = namei(&nd);
+ if (error != 0)
+ goto keeporig;
+
+ /*
+ * We now compare the vnode of the prefix to the vnode that was
+ * asked for. If they resolve to the same vnode, then we
+ * ignore the match so that the real root gets used.
+ * This avoids the problem of traversing "../.." to find the
+ * root directory and never finding it, because "/" resolves
+ * to the emulation root directory. This is expensive :-(
+ */
+ NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix,
+ td);
+
+ /* We shouldn't ever get an error from this namei(). */
+ error = namei(&ndroot);
+ if (error == 0) {
+ if (nd.ni_vp == ndroot.ni_vp)
+ error = ENOENT;
+
+ NDFREE(&ndroot, NDF_ONLY_PNBUF);
+ vrele(ndroot.ni_vp);
+ }
+ }
+
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(nd.ni_vp);
+
+keeporig:
+ /* If there was an error, use the original path name. */
+ if (error)
+ bcopy(ptr, buf, len);
+ return (error);
+}
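+
+/*
+ * Illustrative usage (hypothetical path values): for a Linux binary opening
+ * "/etc/osrelease" with the prefix "/compat/linux", a call such as
+ *
+ *	error = kern_alternate_path(td, "/compat/linux", path, UIO_SYSSPACE,
+ *	    &buf, 0, AT_FDCWD);
+ *
+ * returns "/compat/linux/etc/osrelease" in buf when that file exists under
+ * the prefix and falls back to the original "/etc/osrelease" otherwise; the
+ * caller releases the buffer with free(buf, M_TEMP).
+ */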
diff --git a/freebsd/sys/kern/vfs_mount.c b/freebsd/sys/kern/vfs_mount.c
new file mode 100644
index 00000000..3610763f
--- /dev/null
+++ b/freebsd/sys/kern/vfs_mount.c
@@ -0,0 +1,2052 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1999-2004 Poul-Henning Kamp
+ * Copyright (c) 1999 Michael Smith
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/eventhandler.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/filedesc.h>
+#include <sys/reboot.h>
+#include <sys/sbuf.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <vm/uma.h>
+
+#include <geom/geom.h>
+
+#include <machine/stdarg.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#define VFS_MOUNTARG_SIZE_MAX (1024 * 64)
+
+static int vfs_domount(struct thread *td, const char *fstype, char *fspath,
+ uint64_t fsflags, struct vfsoptlist **optlist);
+static void free_mntarg(struct mntarg *ma);
+
+static int usermount = 0;
+SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
+ "Unprivileged users may mount and unmount file systems");
+
+static bool default_autoro = false;
+SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0,
+ "Retry failed r/w mount as r/o if no explicit ro/rw option is specified");
+
+MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
+MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure");
+static uma_zone_t mount_zone;
+
+/* List of mounted filesystems. */
+struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
+
+/* For any iteration/modification of mountlist */
+struct mtx mountlist_mtx;
+MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
+
+EVENTHANDLER_LIST_DEFINE(vfs_mounted);
+EVENTHANDLER_LIST_DEFINE(vfs_unmounted);
+
+/*
+ * Global opts, taken by all filesystems
+ */
+static const char *global_opts[] = {
+ "errmsg",
+ "fstype",
+ "fspath",
+ "ro",
+ "rw",
+ "nosuid",
+ "noexec",
+ NULL
+};
+
+static int
+mount_init(void *mem, int size, int flags)
+{
+ struct mount *mp;
+
+ mp = (struct mount *)mem;
+ mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
+ mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
+ lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
+ return (0);
+}
+
+static void
+mount_fini(void *mem, int size)
+{
+ struct mount *mp;
+
+ mp = (struct mount *)mem;
+ lockdestroy(&mp->mnt_explock);
+ mtx_destroy(&mp->mnt_listmtx);
+ mtx_destroy(&mp->mnt_mtx);
+}
+
+static void
+vfs_mount_init(void *dummy __unused)
+{
+
+ mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
+ NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+}
+SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
+
+/*
+ * ---------------------------------------------------------------------
+ * Functions for building and sanitizing the mount options
+ */
+
+/* Remove one mount option. */
+static void
+vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
+{
+
+ TAILQ_REMOVE(opts, opt, link);
+ free(opt->name, M_MOUNT);
+ if (opt->value != NULL)
+ free(opt->value, M_MOUNT);
+ free(opt, M_MOUNT);
+}
+
+/* Release all resources related to the mount options. */
+void
+vfs_freeopts(struct vfsoptlist *opts)
+{
+ struct vfsopt *opt;
+
+ while (!TAILQ_EMPTY(opts)) {
+ opt = TAILQ_FIRST(opts);
+ vfs_freeopt(opts, opt);
+ }
+ free(opts, M_MOUNT);
+}
+
+void
+vfs_deleteopt(struct vfsoptlist *opts, const char *name)
+{
+ struct vfsopt *opt, *temp;
+
+ if (opts == NULL)
+ return;
+ TAILQ_FOREACH_SAFE(opt, opts, link, temp) {
+ if (strcmp(opt->name, name) == 0)
+ vfs_freeopt(opts, opt);
+ }
+}
+
+static int
+vfs_isopt_ro(const char *opt)
+{
+
+ if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 ||
+ strcmp(opt, "norw") == 0)
+ return (1);
+ return (0);
+}
+
+static int
+vfs_isopt_rw(const char *opt)
+{
+
+ if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0)
+ return (1);
+ return (0);
+}
+
+/*
+ * Check if options are equal (with or without the "no" prefix).
+ */
+static int
+vfs_equalopts(const char *opt1, const char *opt2)
+{
+ char *p;
+
+ /* "opt" vs. "opt" or "noopt" vs. "noopt" */
+ if (strcmp(opt1, opt2) == 0)
+ return (1);
+ /* "noopt" vs. "opt" */
+ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
+ return (1);
+ /* "opt" vs. "noopt" */
+ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
+ return (1);
+ while ((p = strchr(opt1, '.')) != NULL &&
+ !strncmp(opt1, opt2, ++p - opt1)) {
+ opt2 += p - opt1;
+ opt1 = p;
+ /* "foo.noopt" vs. "foo.opt" */
+ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
+ return (1);
+ /* "foo.opt" vs. "foo.noopt" */
+ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
+ return (1);
+ }
+ /* "ro" / "rdonly" / "norw" / "rw" / "noro" */
+ if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) &&
+ (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2)))
+ return (1);
+ return (0);
+}
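+
+/*
+ * Illustrative examples of pairs this function treats as equal:
+ * "opt"/"noopt" (e.g. "exec" and "noexec"), dotted variants such as
+ * "foo.opt" and "foo.noopt", and any two members of the read/write
+ * family "ro", "rdonly", "norw", "rw", "noro".
+ */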
+
+/*
+ * If a mount option is specified several times
+ * (with or without the "no" prefix), only keep
+ * the last occurrence of it.
+ */
+static void
+vfs_sanitizeopts(struct vfsoptlist *opts)
+{
+ struct vfsopt *opt, *opt2, *tmp;
+
+ TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
+ opt2 = TAILQ_PREV(opt, vfsoptlist, link);
+ while (opt2 != NULL) {
+ if (vfs_equalopts(opt->name, opt2->name)) {
+ tmp = TAILQ_PREV(opt2, vfsoptlist, link);
+ vfs_freeopt(opts, opt2);
+ opt2 = tmp;
+ } else {
+ opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
+ }
+ }
+ }
+}
+
+/*
+ * Build a linked list of mount options from a struct uio.
+ */
+int
+vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
+{
+ struct vfsoptlist *opts;
+ struct vfsopt *opt;
+ size_t memused, namelen, optlen;
+ unsigned int i, iovcnt;
+ int error;
+
+ opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
+ TAILQ_INIT(opts);
+ memused = 0;
+ iovcnt = auio->uio_iovcnt;
+ for (i = 0; i < iovcnt; i += 2) {
+ namelen = auio->uio_iov[i].iov_len;
+ optlen = auio->uio_iov[i + 1].iov_len;
+ memused += sizeof(struct vfsopt) + optlen + namelen;
+ /*
+ * Avoid consuming too much memory, and guard against attempts
+ * to overflow memused.
+ */
+ if (memused > VFS_MOUNTARG_SIZE_MAX ||
+ optlen > VFS_MOUNTARG_SIZE_MAX ||
+ namelen > VFS_MOUNTARG_SIZE_MAX) {
+ error = EINVAL;
+ goto bad;
+ }
+
+ opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
+ opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
+ opt->value = NULL;
+ opt->len = 0;
+ opt->pos = i / 2;
+ opt->seen = 0;
+
+ /*
+ * Do this early, so jumps to "bad" will free the current
+ * option.
+ */
+ TAILQ_INSERT_TAIL(opts, opt, link);
+
+ if (auio->uio_segflg == UIO_SYSSPACE) {
+ bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
+ } else {
+ error = copyin(auio->uio_iov[i].iov_base, opt->name,
+ namelen);
+ if (error)
+ goto bad;
+ }
+ /* Ensure names are null-terminated strings. */
+ if (namelen == 0 || opt->name[namelen - 1] != '\0') {
+ error = EINVAL;
+ goto bad;
+ }
+ if (optlen != 0) {
+ opt->len = optlen;
+ opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
+ if (auio->uio_segflg == UIO_SYSSPACE) {
+ bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
+ optlen);
+ } else {
+ error = copyin(auio->uio_iov[i + 1].iov_base,
+ opt->value, optlen);
+ if (error)
+ goto bad;
+ }
+ }
+ }
+ vfs_sanitizeopts(opts);
+ *options = opts;
+ return (0);
+bad:
+ vfs_freeopts(opts);
+ return (error);
+}
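+
+/*
+ * Illustrative layout of the uio consumed above (hypothetical values):
+ * option names sit at even iovec indices (and must be NUL-terminated
+ * strings) with their values at odd indices, which is also why sys_nmount()
+ * below insists on an even iovcnt of at least four:
+ *
+ *	iov[0] = "fstype\0"   iov[1] = "tmpfs\0"
+ *	iov[2] = "fspath\0"   iov[3] = "/mnt\0"
+ */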
+
+/*
+ * Merge the old mount options with the new ones passed
+ * in the MNT_UPDATE case.
+ *
+ * XXX: This function will keep a "nofoo" option in the new
+ * options. E.g., if the option's canonical name is "foo",
+ * "nofoo" ends up in the mount point's active options.
+ */
+static void
+vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts)
+{
+ struct vfsopt *opt, *new;
+
+ TAILQ_FOREACH(opt, oldopts, link) {
+ new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
+ new->name = strdup(opt->name, M_MOUNT);
+ if (opt->len != 0) {
+ new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
+ bcopy(opt->value, new->value, opt->len);
+ } else
+ new->value = NULL;
+ new->len = opt->len;
+ new->seen = opt->seen;
+ TAILQ_INSERT_HEAD(toopts, new, link);
+ }
+ vfs_sanitizeopts(toopts);
+}
+
+/*
+ * Mount a filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nmount_args {
+ struct iovec *iovp;
+ unsigned int iovcnt;
+ int flags;
+};
+#endif
+int
+sys_nmount(struct thread *td, struct nmount_args *uap)
+{
+ struct uio *auio;
+ int error;
+ u_int iovcnt;
+ uint64_t flags;
+
+ /*
+ * Mount flags are now 64-bits. On 32-bit architectures only
+ * 32-bits are passed in, but from here on everything handles
+ * 64-bit flags correctly.
+ */
+ flags = uap->flags;
+
+ AUDIT_ARG_FFLAGS(flags);
+ CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
+ uap->iovp, uap->iovcnt, flags);
+
+ /*
+ * Filter out MNT_ROOTFS. We do not want clients of nmount() in
+ * userspace to set this flag, but we must filter it out if we want
+ * MNT_UPDATE on the root file system to work.
+ * MNT_ROOTFS should only be set by the kernel when mounting its
+ * root file system.
+ */
+ flags &= ~MNT_ROOTFS;
+
+ iovcnt = uap->iovcnt;
+ /*
+ * Check that we have an even number of iovecs
+ * and that we have at least two options.
+ */
+ if ((iovcnt & 1) || (iovcnt < 4)) {
+ CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
+ uap->iovcnt);
+ return (EINVAL);
+ }
+
+ error = copyinuio(uap->iovp, iovcnt, &auio);
+ if (error) {
+ CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
+ __func__, error);
+ return (error);
+ }
+ error = vfs_donmount(td, flags, auio);
+
+ free(auio, M_IOV);
+ return (error);
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * Various utility functions
+ */
+
+void
+vfs_ref(struct mount *mp)
+{
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ MNT_ILOCK(mp);
+ MNT_REF(mp);
+ MNT_IUNLOCK(mp);
+}
+
+void
+vfs_rel(struct mount *mp)
+{
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+}
+
+/*
+ * Allocate and initialize the mount point struct.
+ */
+struct mount *
+vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
+ struct ucred *cred)
+{
+ struct mount *mp;
+
+ mp = uma_zalloc(mount_zone, M_WAITOK);
+ bzero(&mp->mnt_startzero,
+ __rangeof(struct mount, mnt_startzero, mnt_endzero));
+ TAILQ_INIT(&mp->mnt_nvnodelist);
+ mp->mnt_nvnodelistsize = 0;
+ TAILQ_INIT(&mp->mnt_activevnodelist);
+ mp->mnt_activevnodelistsize = 0;
+ TAILQ_INIT(&mp->mnt_tmpfreevnodelist);
+ mp->mnt_tmpfreevnodelistsize = 0;
+ mp->mnt_ref = 0;
+ (void) vfs_busy(mp, MBF_NOWAIT);
+ atomic_add_acq_int(&vfsp->vfc_refcount, 1);
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_vfc = vfsp;
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_gen++;
+ strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
+ mp->mnt_vnodecovered = vp;
+ mp->mnt_cred = crdup(cred);
+ mp->mnt_stat.f_owner = cred->cr_uid;
+ strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
+ mp->mnt_iosize_max = DFLTPHYS;
+#ifdef MAC
+ mac_mount_init(mp);
+ mac_mount_create(cred, mp);
+#endif
+ arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
+ TAILQ_INIT(&mp->mnt_uppers);
+ return (mp);
+}
+
+/*
+ * Destroy the mount struct previously allocated by vfs_mount_alloc().
+ */
+void
+vfs_mount_destroy(struct mount *mp)
+{
+
+ MNT_ILOCK(mp);
+ mp->mnt_kern_flag |= MNTK_REFEXPIRE;
+ if (mp->mnt_kern_flag & MNTK_MWAIT) {
+ mp->mnt_kern_flag &= ~MNTK_MWAIT;
+ wakeup(mp);
+ }
+ while (mp->mnt_ref)
+ msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
+ KASSERT(mp->mnt_ref == 0,
+ ("%s: invalid refcount in the drain path @ %s:%d", __func__,
+ __FILE__, __LINE__));
+ if (mp->mnt_writeopcount != 0)
+ panic("vfs_mount_destroy: nonzero writeopcount");
+ if (mp->mnt_secondary_writes != 0)
+ panic("vfs_mount_destroy: nonzero secondary_writes");
+ atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1);
+ if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
+ struct vnode *vp;
+
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
+ vn_printf(vp, "dangling vnode ");
+ panic("unmount: dangling vnode");
+ }
+ KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers"));
+ if (mp->mnt_nvnodelistsize != 0)
+ panic("vfs_mount_destroy: nonzero nvnodelistsize");
+ if (mp->mnt_activevnodelistsize != 0)
+ panic("vfs_mount_destroy: nonzero activevnodelistsize");
+ if (mp->mnt_tmpfreevnodelistsize != 0)
+ panic("vfs_mount_destroy: nonzero tmpfreevnodelistsize");
+ if (mp->mnt_lockref != 0)
+ panic("vfs_mount_destroy: nonzero lock refcount");
+ MNT_IUNLOCK(mp);
+ if (mp->mnt_vnodecovered != NULL)
+ vrele(mp->mnt_vnodecovered);
+#ifdef MAC
+ mac_mount_destroy(mp);
+#endif
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ crfree(mp->mnt_cred);
+ uma_zfree(mount_zone, mp);
+}
+
+static bool
+vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error)
+{
+ /* This is an upgrade of an existing mount. */
+ if ((fsflags & MNT_UPDATE) != 0)
+ return (false);
+ /* This is already an R/O mount. */
+ if ((fsflags & MNT_RDONLY) != 0)
+ return (false);
+
+ switch (error) {
+ case ENODEV: /* generic, geom, ... */
+ case EACCES: /* cam/scsi, ... */
+ case EROFS: /* md, mmcsd, ... */
+ /*
+ * These errors can be returned by the storage layer to signal
+ * that the media is read-only. No harm in the R/O mount
+ * attempt if the error was returned for some other reason.
+ */
+ return (true);
+ default:
+ return (false);
+ }
+}
+
+int
+vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions)
+{
+ struct vfsoptlist *optlist;
+ struct vfsopt *opt, *tmp_opt;
+ char *fstype, *fspath, *errmsg;
+ int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
+ bool autoro;
+
+ errmsg = fspath = NULL;
+ errmsg_len = fspathlen = 0;
+ errmsg_pos = -1;
+ autoro = default_autoro;
+
+ error = vfs_buildopts(fsoptions, &optlist);
+ if (error)
+ return (error);
+
+ if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
+ errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
+
+ /*
+ * We need these two options before the others,
+ * and they are mandatory for any filesystem.
+ * Ensure they are NUL terminated as well.
+ */
+ fstypelen = 0;
+ error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
+ if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') {
+ error = EINVAL;
+ if (errmsg != NULL)
+ strncpy(errmsg, "Invalid fstype", errmsg_len);
+ goto bail;
+ }
+ fspathlen = 0;
+ error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
+ if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') {
+ error = EINVAL;
+ if (errmsg != NULL)
+ strncpy(errmsg, "Invalid fspath", errmsg_len);
+ goto bail;
+ }
+
+ /*
+ * We need to see if we have the "update" option
+ * before we call vfs_domount(), since vfs_domount() has special
+ * logic based on MNT_UPDATE. This is very important
+ * when we want to update the root filesystem.
+ */
+ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
+ if (strcmp(opt->name, "update") == 0) {
+ fsflags |= MNT_UPDATE;
+ vfs_freeopt(optlist, opt);
+ }
+ else if (strcmp(opt->name, "async") == 0)
+ fsflags |= MNT_ASYNC;
+ else if (strcmp(opt->name, "force") == 0) {
+ fsflags |= MNT_FORCE;
+ vfs_freeopt(optlist, opt);
+ }
+ else if (strcmp(opt->name, "reload") == 0) {
+ fsflags |= MNT_RELOAD;
+ vfs_freeopt(optlist, opt);
+ }
+ else if (strcmp(opt->name, "multilabel") == 0)
+ fsflags |= MNT_MULTILABEL;
+ else if (strcmp(opt->name, "noasync") == 0)
+ fsflags &= ~MNT_ASYNC;
+ else if (strcmp(opt->name, "noatime") == 0)
+ fsflags |= MNT_NOATIME;
+ else if (strcmp(opt->name, "atime") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoatime", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noclusterr") == 0)
+ fsflags |= MNT_NOCLUSTERR;
+ else if (strcmp(opt->name, "clusterr") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoclusterr", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noclusterw") == 0)
+ fsflags |= MNT_NOCLUSTERW;
+ else if (strcmp(opt->name, "clusterw") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoclusterw", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noexec") == 0)
+ fsflags |= MNT_NOEXEC;
+ else if (strcmp(opt->name, "exec") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoexec", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "nosuid") == 0)
+ fsflags |= MNT_NOSUID;
+ else if (strcmp(opt->name, "suid") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonosuid", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "nosymfollow") == 0)
+ fsflags |= MNT_NOSYMFOLLOW;
+ else if (strcmp(opt->name, "symfollow") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonosymfollow", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noro") == 0) {
+ fsflags &= ~MNT_RDONLY;
+ autoro = false;
+ }
+ else if (strcmp(opt->name, "rw") == 0) {
+ fsflags &= ~MNT_RDONLY;
+ autoro = false;
+ }
+ else if (strcmp(opt->name, "ro") == 0) {
+ fsflags |= MNT_RDONLY;
+ autoro = false;
+ }
+ else if (strcmp(opt->name, "rdonly") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("ro", M_MOUNT);
+ fsflags |= MNT_RDONLY;
+ autoro = false;
+ }
+ else if (strcmp(opt->name, "autoro") == 0) {
+ vfs_freeopt(optlist, opt);
+ autoro = true;
+ }
+ else if (strcmp(opt->name, "suiddir") == 0)
+ fsflags |= MNT_SUIDDIR;
+ else if (strcmp(opt->name, "sync") == 0)
+ fsflags |= MNT_SYNCHRONOUS;
+ else if (strcmp(opt->name, "union") == 0)
+ fsflags |= MNT_UNION;
+ else if (strcmp(opt->name, "automounted") == 0) {
+ fsflags |= MNT_AUTOMOUNTED;
+ vfs_freeopt(optlist, opt);
+ }
+ }
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) {
+ error = ENAMETOOLONG;
+ goto bail;
+ }
+
+ error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
+
+ /*
+ * See if we can mount in read-only mode if the error code suggests
+ * that it could be possible and the mount options allow for that.
+ * Never try it if "[no]{ro|rw}" has been explicitly requested and not
+ * overridden by "autoro".
+ */
+ if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) {
+ printf("%s: R/W mount failed, possibly R/O media,"
+ " trying R/O mount\n", __func__);
+ fsflags |= MNT_RDONLY;
+ error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
+ }
+bail:
+ /* copyout the errmsg */
+ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
+ && errmsg_len > 0 && errmsg != NULL) {
+ if (fsoptions->uio_segflg == UIO_SYSSPACE) {
+ bcopy(errmsg,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
+ } else {
+ copyout(errmsg,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
+ }
+ }
+
+ if (optlist != NULL)
+ vfs_freeopts(optlist);
+ return (error);
+}
+
+/*
+ * Old mount API.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mount_args {
+ char *type;
+ char *path;
+ int flags;
+ caddr_t data;
+};
+#endif
+/* ARGSUSED */
+int
+sys_mount(struct thread *td, struct mount_args *uap)
+{
+ char *fstype;
+ struct vfsconf *vfsp = NULL;
+ struct mntarg *ma = NULL;
+ uint64_t flags;
+ int error;
+
+ /*
+ * Mount flags are now 64-bits. On 32-bit architectures only
+ * 32-bits are passed in, but from here on everything handles
+ * 64-bit flags correctly.
+ */
+ flags = uap->flags;
+
+ AUDIT_ARG_FFLAGS(flags);
+
+ /*
+ * Filter out MNT_ROOTFS. We do not want clients of mount() in
+ * userspace to set this flag, but we must filter it out if we want
+ * MNT_UPDATE on the root file system to work.
+ * MNT_ROOTFS should only be set by the kernel when mounting its
+ * root file system.
+ */
+ flags &= ~MNT_ROOTFS;
+
+ fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
+ if (error) {
+ free(fstype, M_TEMP);
+ return (error);
+ }
+
+ AUDIT_ARG_TEXT(fstype);
+ vfsp = vfs_byname_kld(fstype, td, &error);
+ free(fstype, M_TEMP);
+ if (vfsp == NULL)
+ return (ENOENT);
+ if (vfsp->vfc_vfsops->vfs_cmount == NULL)
+ return (EOPNOTSUPP);
+
+ ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN);
+ ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
+ ma = mount_argb(ma, flags & MNT_RDONLY, "noro");
+ ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid");
+ ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec");
+
+ error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags);
+ return (error);
+}
+
+/*
+ * vfs_domount_first(): first file system mount (not update)
+ */
+static int
+vfs_domount_first(
+ struct thread *td, /* Calling thread. */
+ struct vfsconf *vfsp, /* File system type. */
+ char *fspath, /* Mount path. */
+ struct vnode *vp, /* Vnode to be covered. */
+ uint64_t fsflags, /* Flags common to all filesystems. */
+ struct vfsoptlist **optlist /* Options local to the filesystem. */
+ )
+{
+ struct vattr va;
+ struct mount *mp;
+ struct vnode *newdp;
+ int error, error1;
+
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
+
+ /*
+ * If the jail of the calling thread lacks permission for this type of
+ * file system, deny immediately.
+ */
+ if (jailed(td->td_ucred) && !prison_allow(td->td_ucred,
+ vfsp->vfc_prison_flag)) {
+ vput(vp);
+ return (EPERM);
+ }
+
+ /*
+ * If the user is not root, ensure that they own the directory
+ * onto which we are attempting to mount.
+ */
+ error = VOP_GETATTR(vp, &va, td->td_ucred);
+ if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
+ error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0);
+ if (error == 0)
+ error = vinvalbuf(vp, V_SAVE, 0, 0);
+ if (error == 0 && vp->v_type != VDIR)
+ error = ENOTDIR;
+ if (error == 0) {
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
+ vp->v_iflag |= VI_MOUNT;
+ else
+ error = EBUSY;
+ VI_UNLOCK(vp);
+ }
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0);
+
+ /* Allocate and initialize the filesystem. */
+ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
+ /* XXXMAC: pass to vfs_mount_alloc? */
+ mp->mnt_optnew = *optlist;
+ /* Set the mount level flags. */
+ mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY));
+
+ /*
+ * Mount the filesystem.
+ * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
+ * get. No freeing of cn_pnbuf.
+ */
+ error1 = 0;
+ if ((error = VFS_MOUNT(mp)) != 0 ||
+ (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 ||
+ (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) {
+ if (error1 != 0) {
+ error = error1;
+ if ((error1 = VFS_UNMOUNT(mp, 0)) != 0)
+ printf("VFS_UNMOUNT returned %d\n", error1);
+ }
+ vfs_unbusy(mp);
+ mp->mnt_vnodecovered = NULL;
+ vfs_mount_destroy(mp);
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+ vrele(vp);
+ return (error);
+ }
+ VOP_UNLOCK(newdp, 0);
+
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ mp->mnt_opt = mp->mnt_optnew;
+ *optlist = NULL;
+
+ /*
+ * Prevent external consumers of mount options from reading mnt_optnew.
+ */
+ mp->mnt_optnew = NULL;
+
+ MNT_ILOCK(mp);
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
+ (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ else
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ cache_purge(vp);
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+ vp->v_mountedhere = mp;
+ /* Place the new filesystem at the end of the mount list. */
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ vfs_event_signal(NULL, VQ_MOUNT, 0);
+ vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY);
+ VOP_UNLOCK(vp, 0);
+ EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td);
+ VOP_UNLOCK(newdp, 0);
+ mountcheckdirs(vp, newdp);
+ vrele(newdp);
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ vfs_allocate_syncvnode(mp);
+ vfs_unbusy(mp);
+ return (0);
+}
+
+/*
+ * vfs_domount_update(): update of mounted file system
+ */
+static int
+vfs_domount_update(
+ struct thread *td, /* Calling thread. */
+ struct vnode *vp, /* Mount point vnode. */
+ uint64_t fsflags, /* Flags common to all filesystems. */
+ struct vfsoptlist **optlist /* Options local to the filesystem. */
+ )
+{
+ struct export_args export;
+ void *bufp;
+ struct mount *mp;
+ int error, export_error, len;
+ uint64_t flag;
+
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
+ mp = vp->v_mount;
+
+ if ((vp->v_vflag & VV_ROOT) == 0) {
+ if (vfs_copyopt(*optlist, "export", &export, sizeof(export))
+ == 0)
+ error = EXDEV;
+ else
+ error = EINVAL;
+ vput(vp);
+ return (error);
+ }
+
+ /*
+ * We only allow the filesystem to be reloaded if it
+ * is currently mounted read-only.
+ */
+ flag = mp->mnt_flag;
+ if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
+ vput(vp);
+ return (EOPNOTSUPP); /* Needs translation */
+ }
+ /*
+ * Only privileged root, or (if MNT_USER is set) the user that
+ * did the original mount is permitted to update it.
+ */
+ error = vfs_suser(mp, td);
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ if (vfs_busy(mp, MBF_NOWAIT)) {
+ vput(vp);
+ return (EBUSY);
+ }
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
+ VI_UNLOCK(vp);
+ vfs_unbusy(mp);
+ vput(vp);
+ return (EBUSY);
+ }
+ vp->v_iflag |= VI_MOUNT;
+ VI_UNLOCK(vp);
+ VOP_UNLOCK(vp, 0);
+
+ MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
+ MNT_IUNLOCK(mp);
+ error = EBUSY;
+ goto end;
+ }
+ mp->mnt_flag &= ~MNT_UPDATEMASK;
+ mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
+ MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
+ if ((mp->mnt_flag & MNT_ASYNC) == 0)
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
+ mp->mnt_optnew = *optlist;
+ vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
+
+ /*
+ * Mount the filesystem.
+ * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
+ * get. No freeing of cn_pnbuf.
+ */
+ error = VFS_MOUNT(mp);
+
+ export_error = 0;
+ /* Process the export option. */
+ if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp,
+ &len) == 0) {
+ /* Assume that there is only 1 ABI for each length. */
+ switch (len) {
+ case (sizeof(struct oexport_args)):
+ bzero(&export, sizeof(export));
+ /* FALLTHROUGH */
+ case (sizeof(export)):
+ bcopy(bufp, &export, len);
+ export_error = vfs_export(mp, &export);
+ break;
+ default:
+ export_error = EINVAL;
+ break;
+ }
+ }
+
+ MNT_ILOCK(mp);
+ if (error == 0) {
+ mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
+ MNT_SNAPSHOT);
+ } else {
+ /*
+ * If we fail, restore old mount flags. MNT_QUOTA is special,
+ * because it is not part of MNT_UPDATEMASK, but it could have
+ * changed in the meantime if quotactl(2) was called.
+ * All in all we want current value of MNT_QUOTA, not the old
+ * one.
+ */
+ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
+ }
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
+ (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ else
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
+
+ if (error != 0)
+ goto end;
+
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ mp->mnt_opt = mp->mnt_optnew;
+ *optlist = NULL;
+ (void)VFS_STATFS(mp, &mp->mnt_stat);
+ /*
+ * Prevent external consumers of mount options from reading
+ * mnt_optnew.
+ */
+ mp->mnt_optnew = NULL;
+
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ vfs_allocate_syncvnode(mp);
+ else
+ vfs_deallocate_syncvnode(mp);
+end:
+ vfs_unbusy(mp);
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+ vrele(vp);
+ return (error != 0 ? error : export_error);
+}
+
+/*
+ * vfs_domount(): actually attempt a filesystem mount.
+ */
+static int
+vfs_domount(
+ struct thread *td, /* Calling thread. */
+ const char *fstype, /* Filesystem type. */
+ char *fspath, /* Mount path. */
+ uint64_t fsflags, /* Flags common to all filesystems. */
+ struct vfsoptlist **optlist /* Options local to the filesystem. */
+ )
+{
+ struct vfsconf *vfsp;
+ struct nameidata nd;
+ struct vnode *vp;
+ char *pathbuf;
+ int error;
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
+ return (ENAMETOOLONG);
+
+ if (jailed(td->td_ucred) || usermount == 0) {
+ if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
+ return (error);
+ }
+
+ /*
+ * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
+ */
+ if (fsflags & MNT_EXPORTED) {
+ error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
+ if (error)
+ return (error);
+ }
+ if (fsflags & MNT_SUIDDIR) {
+ error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
+ if (error)
+ return (error);
+ }
+ /*
+ * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
+ */
+ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
+ if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
+ fsflags |= MNT_NOSUID | MNT_USER;
+ }
+
+ /* Load KLDs before we lock the covered vnode to avoid reversals. */
+ vfsp = NULL;
+ if ((fsflags & MNT_UPDATE) == 0) {
+ /* Don't try to load KLDs if we're mounting the root. */
+ if (fsflags & MNT_ROOTFS)
+ vfsp = vfs_byname(fstype);
+ else
+ vfsp = vfs_byname_kld(fstype, td, &error);
+ if (vfsp == NULL)
+ return (ENODEV);
+ }
+
+ /*
+ * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
+ UIO_SYSSPACE, fspath, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if ((fsflags & MNT_UPDATE) == 0) {
+ pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
+ strcpy(pathbuf, fspath);
+ error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN);
+ /* debug.disablefullpath == 1 results in ENODEV */
+ if (error == 0 || error == ENODEV) {
+ error = vfs_domount_first(td, vfsp, pathbuf, vp,
+ fsflags, optlist);
+ }
+ free(pathbuf, M_TEMP);
+ } else
+ error = vfs_domount_update(td, vp, fsflags, optlist);
+
+ return (error);
+}
+
+/*
+ * Unmount a filesystem.
+ *
+ * Note: unmount takes a path to the vnode mounted on as its
+ * argument, not the special file (as before).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unmount_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+sys_unmount(struct thread *td, struct unmount_args *uap)
+{
+ struct nameidata nd;
+ struct mount *mp;
+ char *pathbuf;
+ int error, id0, id1;
+
+ AUDIT_ARG_VALUE(uap->flags);
+ if (jailed(td->td_ucred) || usermount == 0) {
+ error = priv_check(td, PRIV_VFS_UNMOUNT);
+ if (error)
+ return (error);
+ }
+
+ pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
+ if (error) {
+ free(pathbuf, M_TEMP);
+ return (error);
+ }
+ if (uap->flags & MNT_BYFSID) {
+ AUDIT_ARG_TEXT(pathbuf);
+ /* Decode the filesystem ID. */
+ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
+ free(pathbuf, M_TEMP);
+ return (EINVAL);
+ }
+
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
+ if (mp->mnt_stat.f_fsid.val[0] == id0 &&
+ mp->mnt_stat.f_fsid.val[1] == id1) {
+ vfs_ref(mp);
+ break;
+ }
+ }
+ mtx_unlock(&mountlist_mtx);
+ } else {
+ /*
+ * Try to find global path for path argument.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
+ UIO_SYSSPACE, pathbuf, td);
+ if (namei(&nd) == 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_path_to_global_path(td, nd.ni_vp, pathbuf,
+ MNAMELEN);
+ if (error == 0 || error == ENODEV)
+ vput(nd.ni_vp);
+ }
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
+ if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) {
+ vfs_ref(mp);
+ break;
+ }
+ }
+ mtx_unlock(&mountlist_mtx);
+ }
+ free(pathbuf, M_TEMP);
+ if (mp == NULL) {
+ /*
+ * Previously we returned ENOENT for a nonexistent path and
+ * EINVAL for a non-mountpoint. We cannot tell these apart
+ * now, so in the !MNT_BYFSID case return the more likely
+ * EINVAL for compatibility.
+ */
+ return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
+ }
+
+ /*
+ * Don't allow unmounting the root filesystem.
+ */
+ if (mp->mnt_flag & MNT_ROOTFS) {
+ vfs_rel(mp);
+ return (EINVAL);
+ }
+ error = dounmount(mp, uap->flags, td);
+ return (error);
+}
+
+/*
+ * Return error if any of the vnodes, ignoring the root vnode
+ * and the syncer vnode, have non-zero usecount.
+ *
+ * This function is purely advisory - it can return false positives
+ * and negatives.
+ */
+static int
+vfs_check_usecounts(struct mount *mp)
+{
+ struct vnode *vp, *mvp;
+
+ MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
+ if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON &&
+ vp->v_usecount != 0) {
+ VI_UNLOCK(vp);
+ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
+ return (EBUSY);
+ }
+ VI_UNLOCK(vp);
+ }
+
+ return (0);
+}
+
+static void
+dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags)
+{
+
+ mtx_assert(MNT_MTX(mp), MA_OWNED);
+ mp->mnt_kern_flag &= ~mntkflags;
+ if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) {
+ mp->mnt_kern_flag &= ~MNTK_MWAIT;
+ wakeup(mp);
+ }
+ MNT_IUNLOCK(mp);
+ if (coveredvp != NULL) {
+ VOP_UNLOCK(coveredvp, 0);
+ vdrop(coveredvp);
+ }
+ vn_finished_write(mp);
+}
+
+/*
+ * Do the actual filesystem unmount.
+ */
+int
+dounmount(struct mount *mp, int flags, struct thread *td)
+{
+ struct vnode *coveredvp;
+ int error;
+ uint64_t async_flag;
+ int mnt_gen_r;
+
+ if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
+ mnt_gen_r = mp->mnt_gen;
+ VI_LOCK(coveredvp);
+ vholdl(coveredvp);
+ vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
+ /*
+ * Check for mp being unmounted while waiting for the
+ * covered vnode lock.
+ */
+ if (coveredvp->v_mountedhere != mp ||
+ coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
+ VOP_UNLOCK(coveredvp, 0);
+ vdrop(coveredvp);
+ vfs_rel(mp);
+ return (EBUSY);
+ }
+ }
+
+ /*
+ * Only privileged root, or (if MNT_USER is set) the user that did the
+ * original mount is permitted to unmount this filesystem.
+ */
+ error = vfs_suser(mp, td);
+ if (error != 0) {
+ if (coveredvp != NULL) {
+ VOP_UNLOCK(coveredvp, 0);
+ vdrop(coveredvp);
+ }
+ vfs_rel(mp);
+ return (error);
+ }
+
+ vn_start_write(NULL, &mp, V_WAIT | V_MNTREF);
+ MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
+ (mp->mnt_flag & MNT_UPDATE) != 0 ||
+ !TAILQ_EMPTY(&mp->mnt_uppers)) {
+ dounmount_cleanup(mp, coveredvp, 0);
+ return (EBUSY);
+ }
+ mp->mnt_kern_flag |= MNTK_UNMOUNT;
+ if (flags & MNT_NONBUSY) {
+ MNT_IUNLOCK(mp);
+ error = vfs_check_usecounts(mp);
+ MNT_ILOCK(mp);
+ if (error != 0) {
+ dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT);
+ return (error);
+ }
+ }
+ /* Allow filesystems to detect that a forced unmount is in progress. */
+ if (flags & MNT_FORCE) {
+ mp->mnt_kern_flag |= MNTK_UNMOUNTF;
+ MNT_IUNLOCK(mp);
+ /*
+ * Must be done after setting MNTK_UNMOUNTF and before
+ * waiting for mnt_lockref to become 0.
+ */
+ VFS_PURGE(mp);
+ MNT_ILOCK(mp);
+ }
+ error = 0;
+ if (mp->mnt_lockref) {
+ mp->mnt_kern_flag |= MNTK_DRAINING;
+ error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
+ "mount drain", 0);
+ }
+ MNT_IUNLOCK(mp);
+ KASSERT(mp->mnt_lockref == 0,
+ ("%s: invalid lock refcount in the drain path @ %s:%d",
+ __func__, __FILE__, __LINE__));
+ KASSERT(error == 0,
+ ("%s: invalid return value for msleep in the drain path @ %s:%d",
+ __func__, __FILE__, __LINE__));
+
+ if (mp->mnt_flag & MNT_EXPUBLIC)
+ vfs_setpublicfs(NULL, NULL, NULL);
+
+ /*
+ * From now on, we can claim that the use reference on the
+ * coveredvp is ours, and the ref can be released only by a
+ * successful unmount by us, or left for a later unmount
+ * attempt. The previously acquired hold reference is no
+ * longer needed to protect the vnode from reuse.
+ */
+ if (coveredvp != NULL)
+ vdrop(coveredvp);
+
+ vfs_msync(mp, MNT_WAIT);
+ MNT_ILOCK(mp);
+ async_flag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &= ~MNT_ASYNC;
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
+ cache_purgevfs(mp, false); /* remove cache entries for this file sys */
+ vfs_deallocate_syncvnode(mp);
+ error = VFS_UNMOUNT(mp, flags);
+ vn_finished_write(mp);
+ /*
+ * If we failed to flush the dirty blocks for this mount point,
+ * undo all the cdir/rdir and rootvnode changes we made above.
+ * Unless we failed to do so because the device is reporting that
+ * it doesn't exist anymore.
+ */
+ if (error && error != ENXIO) {
+ MNT_ILOCK(mp);
+ if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+ MNT_IUNLOCK(mp);
+ vfs_allocate_syncvnode(mp);
+ MNT_ILOCK(mp);
+ }
+ mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
+ mp->mnt_flag |= async_flag;
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
+ (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ if (mp->mnt_kern_flag & MNTK_MWAIT) {
+ mp->mnt_kern_flag &= ~MNTK_MWAIT;
+ wakeup(mp);
+ }
+ MNT_IUNLOCK(mp);
+ if (coveredvp)
+ VOP_UNLOCK(coveredvp, 0);
+ return (error);
+ }
+ mtx_lock(&mountlist_mtx);
+ TAILQ_REMOVE(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td);
+ if (coveredvp != NULL) {
+ coveredvp->v_mountedhere = NULL;
+ VOP_UNLOCK(coveredvp, 0);
+ }
+ vfs_event_signal(NULL, VQ_UNMOUNT, 0);
+ if (rootvnode != NULL && mp == rootvnode->v_mount) {
+ vrele(rootvnode);
+ rootvnode = NULL;
+ }
+ if (mp == rootdevmp)
+ rootdevmp = NULL;
+ vfs_mount_destroy(mp);
+ return (0);
+}
+
+/*
+ * Report errors during filesystem mounting.
+ */
+void
+vfs_mount_error(struct mount *mp, const char *fmt, ...)
+{
+ struct vfsoptlist *moptlist = mp->mnt_optnew;
+ va_list ap;
+ int error, len;
+ char *errmsg;
+
+ error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
+ if (error || errmsg == NULL || len <= 0)
+ return;
+
+ va_start(ap, fmt);
+ vsnprintf(errmsg, (size_t)len, fmt, ap);
+ va_end(ap);
+}
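+
+/*
+ * Illustrative usage (hypothetical message): a filesystem can report a
+ * human-readable mount failure with
+ *
+ *	vfs_mount_error(mp, "unknown value for option %s", name);
+ *
+ * The text lands in the "errmsg" option buffer supplied by the caller of
+ * nmount(2), which vfs_donmount() copies back to userspace.
+ */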
+
+void
+vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
+{
+ va_list ap;
+ int error, len;
+ char *errmsg;
+
+ error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len);
+ if (error || errmsg == NULL || len <= 0)
+ return;
+
+ va_start(ap, fmt);
+ vsnprintf(errmsg, (size_t)len, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * Functions for querying mount options/arguments from filesystems.
+ */
+
+/*
+ * Check that no unknown options are given
+ */
+int
+vfs_filteropt(struct vfsoptlist *opts, const char **legal)
+{
+ struct vfsopt *opt;
+ char errmsg[255];
+ const char **t, *p, *q;
+ int ret = 0;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ p = opt->name;
+ q = NULL;
+ if (p[0] == 'n' && p[1] == 'o')
+ q = p + 2;
+ for(t = global_opts; *t != NULL; t++) {
+ if (strcmp(*t, p) == 0)
+ break;
+ if (q != NULL) {
+ if (strcmp(*t, q) == 0)
+ break;
+ }
+ }
+ if (*t != NULL)
+ continue;
+ for(t = legal; *t != NULL; t++) {
+ if (strcmp(*t, p) == 0)
+ break;
+ if (q != NULL) {
+ if (strcmp(*t, q) == 0)
+ break;
+ }
+ }
+ if (*t != NULL)
+ continue;
+ snprintf(errmsg, sizeof(errmsg),
+ "mount option <%s> is unknown", p);
+ ret = EINVAL;
+ }
+ if (ret != 0) {
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(opt->name, "errmsg") == 0) {
+ strncpy((char *)opt->value, errmsg, opt->len);
+ break;
+ }
+ }
+ if (opt == NULL)
+ printf("%s\n", errmsg);
+ }
+ return (ret);
+}
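+
+/*
+ * Illustrative usage (hypothetical option list): a filesystem's VFS_MOUNT
+ * routine typically rejects unknown options early with something like
+ *
+ *	static const char *myfs_opts[] = { "from", "export", NULL };
+ *
+ *	error = vfs_filteropt(mp->mnt_optnew, myfs_opts);
+ *	if (error != 0)
+ *		return (error);
+ *
+ * Options listed in global_opts above, and "no"-prefixed forms of the legal
+ * names, are always accepted.
+ */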
+
+/*
+ * Get a mount option by its name.
+ *
+ * Return 0 if the option was found, ENOENT otherwise.
+ * If len is non-NULL it will be filled with the length
+ * of the option. If buf is non-NULL, it will be filled
+ * with the address of the option.
+ */
+int
+vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len)
+{
+ struct vfsopt *opt;
+
+ KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ opt->seen = 1;
+ if (len != NULL)
+ *len = opt->len;
+ if (buf != NULL)
+ *buf = opt->value;
+ return (0);
+ }
+ }
+ return (ENOENT);
+}
+
+int
+vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
+{
+ struct vfsopt *opt;
+
+ if (opts == NULL)
+ return (-1);
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ opt->seen = 1;
+ return (opt->pos);
+ }
+ }
+ return (-1);
+}
+
+int
+vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value)
+{
+ char *opt_value, *vtp;
+ quad_t iv;
+ int error, opt_len;
+
+ error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len);
+ if (error != 0)
+ return (error);
+ if (opt_len == 0 || opt_value == NULL)
+ return (EINVAL);
+ if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0')
+ return (EINVAL);
+ iv = strtoq(opt_value, &vtp, 0);
+ if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0'))
+ return (EINVAL);
+ if (iv < 0)
+ return (EINVAL);
+ switch (vtp[0]) {
+ case 't': case 'T':
+ iv *= 1024;
+ /* FALLTHROUGH */
+ case 'g': case 'G':
+ iv *= 1024;
+ /* FALLTHROUGH */
+ case 'm': case 'M':
+ iv *= 1024;
+ /* FALLTHROUGH */
+ case 'k': case 'K':
+ iv *= 1024;
+ case '\0':
+ break;
+ default:
+ return (EINVAL);
+ }
+ *value = iv;
+
+ return (0);
+}
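+
+/*
+ * Illustrative example: for an option whose string value is "16m",
+ * vfs_getopt_size() returns 0 and stores 16 * 1024 * 1024 = 16777216 in
+ * *value; a single trailing k/K, m/M, g/G or t/T suffix scales the number
+ * accordingly.
+ */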
+
+char *
+vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
+{
+ struct vfsopt *opt;
+
+ *error = 0;
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->len == 0 ||
+ ((char *)opt->value)[opt->len - 1] != '\0') {
+ *error = EINVAL;
+ return (NULL);
+ }
+ return (opt->value);
+ }
+ *error = ENOENT;
+ return (NULL);
+}
+
+int
+vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w,
+ uint64_t val)
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ opt->seen = 1;
+ if (w != NULL)
+ *w |= val;
+ return (1);
+ }
+ }
+ if (w != NULL)
+ *w &= ~val;
+ return (0);
+}
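+
+/*
+ * Illustrative usage (hypothetical flag choice): a filesystem's VFS_MOUNT
+ * routine might map an option onto a mount flag with
+ *
+ *	vfs_flagopt(mp->mnt_optnew, "union", &mp->mnt_flag, MNT_UNION);
+ *
+ * which sets MNT_UNION when the option is present and clears it otherwise,
+ * returning 1 or 0 respectively.
+ */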
+
+int
+vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
+{
+ va_list ap;
+ struct vfsopt *opt;
+ int ret;
+
+ KASSERT(opts != NULL, ("vfs_scanopt: caller passed 'opts' as NULL"));
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->len == 0 || opt->value == NULL)
+ return (0);
+ if (((char *)opt->value)[opt->len - 1] != '\0')
+ return (0);
+ va_start(ap, fmt);
+ ret = vsscanf(opt->value, fmt, ap);
+ va_end(ap);
+ return (ret);
+ }
+ return (0);
+}
+
+int
+vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->value == NULL)
+ opt->len = len;
+ else {
+ if (opt->len != len)
+ return (EINVAL);
+ bcopy(value, opt->value, len);
+ }
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->value == NULL)
+ opt->len = len;
+ else {
+ if (opt->len < len)
+ return (EINVAL);
+ opt->len = len;
+ bcopy(value, opt->value, len);
+ }
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->value == NULL)
+ opt->len = strlen(value) + 1;
+ else if (strlcpy(opt->value, value, opt->len) >= opt->len)
+ return (EINVAL);
+ return (0);
+ }
+ return (ENOENT);
+}
+
+/*
+ * Find and copy a mount option.
+ *
+ * The size of the buffer has to be specified
+ * in len; if it is not the same length as the
+ * mount option, EINVAL is returned.
+ * Returns ENOENT if the option is not found.
+ */
+int
+vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len)
+{
+ struct vfsopt *opt;
+
+ KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ opt->seen = 1;
+ if (len != opt->len)
+ return (EINVAL);
+ bcopy(opt->value, dest, opt->len);
+ return (0);
+ }
+ }
+ return (ENOENT);
+}
+
+int
+__vfs_statfs(struct mount *mp, struct statfs *sbp)
+{
+ int error;
+
+ error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat);
+ if (sbp != &mp->mnt_stat)
+ *sbp = mp->mnt_stat;
+ return (error);
+}
+
+void
+vfs_mountedfrom(struct mount *mp, const char *from)
+{
+
+ bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
+ strlcpy(mp->mnt_stat.f_mntfromname, from,
+ sizeof mp->mnt_stat.f_mntfromname);
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * This is the API for building mount args and mounting filesystems from
+ * inside the kernel.
+ *
+ * The API works by accumulating individual arguments. The first error
+ * encountered is latched.
+ *
+ * XXX: should be documented in new manpage kernel_mount(9)
+ */
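+
+/*
+ * Illustrative sketch of the accumulation pattern (argument values
+ * hypothetical): sys_mount() above seeds a struct mntarg with the common
+ * options and hands it to the filesystem's vfs_cmount(), which typically
+ * appends its own arguments and finishes with kernel_mount():
+ *
+ *	ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN);
+ *	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
+ *	ma = mount_argb(ma, flags & MNT_RDONLY, "noro");
+ *	...
+ *	return (kernel_mount(ma, flags));
+ *
+ * Any failure is latched in ma->error; subsequent calls become no-ops and
+ * kernel_mount() reports the first error.
+ */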
+
+/* A memory allocation which must be freed when we are done */
+struct mntaarg {
+ SLIST_ENTRY(mntaarg) next;
+};
+
+/* The header for the mount arguments */
+struct mntarg {
+ struct iovec *v;
+ int len;
+ int error;
+ SLIST_HEAD(, mntaarg) list;
+};
+
+/*
+ * Add a boolean argument.
+ *
+ * flag is the boolean value.
+ * name must start with "no".
+ */
+struct mntarg *
+mount_argb(struct mntarg *ma, int flag, const char *name)
+{
+
+ KASSERT(name[0] == 'n' && name[1] == 'o',
+ ("mount_argb(...,%s): name must start with 'no'", name));
+
+ return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
+}
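+
+/*
+ * Illustrative example: mount_argb(ma, flags & MNT_RDONLY, "noro") adds the
+ * option "ro" when MNT_RDONLY is set and "noro" when it is clear, which is
+ * exactly how sys_mount() above translates the old flag bits.
+ */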
+
+/*
+ * Add an argument printf style
+ */
+struct mntarg *
+mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
+{
+ va_list ap;
+ struct mntaarg *maa;
+ struct sbuf *sb;
+ int len;
+
+ if (ma == NULL) {
+ ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INIT(&ma->list);
+ }
+ if (ma->error)
+ return (ma);
+
+ ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
+ M_MOUNT, M_WAITOK);
+ ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
+ ma->v[ma->len].iov_len = strlen(name) + 1;
+ ma->len++;
+
+ sb = sbuf_new_auto();
+ va_start(ap, fmt);
+ sbuf_vprintf(sb, fmt, ap);
+ va_end(ap);
+ sbuf_finish(sb);
+ len = sbuf_len(sb) + 1;
+ maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INSERT_HEAD(&ma->list, maa, next);
+ bcopy(sbuf_data(sb), maa + 1, len);
+ sbuf_delete(sb);
+
+ ma->v[ma->len].iov_base = maa + 1;
+ ma->v[ma->len].iov_len = len;
+ ma->len++;
+
+ return (ma);
+}
+
+/*
+ * Add an argument which is a userland string.
+ */
+struct mntarg *
+mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
+{
+ struct mntaarg *maa;
+ char *tbuf;
+
+ if (val == NULL)
+ return (ma);
+ if (ma == NULL) {
+ ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INIT(&ma->list);
+ }
+ if (ma->error)
+ return (ma);
+ maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INSERT_HEAD(&ma->list, maa, next);
+ tbuf = (void *)(maa + 1);
+ ma->error = copyinstr(val, tbuf, len, NULL);
+ return (mount_arg(ma, name, tbuf, -1));
+}
+
+/*
+ * Plain argument.
+ *
+ * If length is -1, treat value as a C string.
+ */
+struct mntarg *
+mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
+{
+
+ if (ma == NULL) {
+ ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INIT(&ma->list);
+ }
+ if (ma->error)
+ return (ma);
+
+ ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
+ M_MOUNT, M_WAITOK);
+ ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
+ ma->v[ma->len].iov_len = strlen(name) + 1;
+ ma->len++;
+
+ ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
+ if (len < 0)
+ ma->v[ma->len].iov_len = strlen(val) + 1;
+ else
+ ma->v[ma->len].iov_len = len;
+ ma->len++;
+ return (ma);
+}
+
+/*
+ * Free a mntarg structure
+ */
+static void
+free_mntarg(struct mntarg *ma)
+{
+ struct mntaarg *maa;
+
+ while (!SLIST_EMPTY(&ma->list)) {
+ maa = SLIST_FIRST(&ma->list);
+ SLIST_REMOVE_HEAD(&ma->list, next);
+ free(maa, M_MOUNT);
+ }
+ free(ma->v, M_MOUNT);
+ free(ma, M_MOUNT);
+}
+
+/*
+ * Mount a filesystem
+ */
+int
+kernel_mount(struct mntarg *ma, uint64_t flags)
+{
+ struct uio auio;
+ int error;
+
+ KASSERT(ma != NULL, ("kernel_mount NULL ma"));
+ KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
+ KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
+
+ auio.uio_iov = ma->v;
+ auio.uio_iovcnt = ma->len;
+ auio.uio_segflg = UIO_SYSSPACE;
+
+ error = ma->error;
+ if (!error)
+ error = vfs_donmount(curthread, flags, &auio);
+ free_mntarg(ma);
+ return (error);
+}
+
+/*
+ * A printflike function to mount a filesystem.
+ */
+int
+kernel_vmount(int flags, ...)
+{
+ struct mntarg *ma = NULL;
+ va_list ap;
+ const char *cp;
+ const void *vp;
+ int error;
+
+ va_start(ap, flags);
+ for (;;) {
+ cp = va_arg(ap, const char *);
+ if (cp == NULL)
+ break;
+ vp = va_arg(ap, const void *);
+ ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0));
+ }
+ va_end(ap);
+
+ error = kernel_mount(ma, flags);
+ return (error);
+}
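+
+/*
+ * Illustrative usage (hypothetical filesystem and paths): arguments are
+ * passed as name/value pairs terminated by a NULL name, e.g.
+ *
+ *	error = kernel_vmount(MNT_RDONLY,
+ *	    "fstype", "cd9660",
+ *	    "fspath", "/mnt",
+ *	    "from", "/dev/cd0",
+ *	    NULL);
+ */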
+
+void
+vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp)
+{
+
+ bcopy(oexp, exp, sizeof(*oexp));
+ exp->ex_numsecflavors = 0;
+}
diff --git a/freebsd/sys/kern/vfs_subr.c b/freebsd/sys/kern/vfs_subr.c
new file mode 100644
index 00000000..f84caac0
--- /dev/null
+++ b/freebsd/sys/kern/vfs_subr.c
@@ -0,0 +1,5719 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
+ */
+
+/*
+ * External virtual filesystem routines
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_watchdog.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/capsicum.h>
+#include <sys/condvar.h>
+#include <sys/conf.h>
+#include <sys/counter.h>
+#include <sys/dirent.h>
+#include <sys/event.h>
+#include <sys/eventhandler.h>
+#include <sys/extattr.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lockf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/pctrie.h>
+#include <sys/priv.h>
+#include <sys/reboot.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/sleepqueue.h>
+#include <sys/smp.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+#include <sys/watchdog.h>
+
+#include <machine/stdarg.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_kern.h>
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+static void delmntque(struct vnode *vp);
+static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
+ int slpflag, int slptimeo);
+static void syncer_shutdown(void *arg, int howto);
+static int vtryrecycle(struct vnode *vp);
+static void v_init_counters(struct vnode *);
+static void v_incr_usecount(struct vnode *);
+static void v_incr_usecount_locked(struct vnode *);
+static void v_incr_devcount(struct vnode *);
+static void v_decr_devcount(struct vnode *);
+static void vgonel(struct vnode *);
+static void vfs_knllock(void *arg);
+static void vfs_knlunlock(void *arg);
+static void vfs_knl_assert_locked(void *arg);
+static void vfs_knl_assert_unlocked(void *arg);
+static void vnlru_return_batches(struct vfsops *mnt_op);
+static void destroy_vpollinfo(struct vpollinfo *vi);
+static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
+ daddr_t startlbn, daddr_t endlbn);
+
+/*
+ * These fences are intended for cases where some synchronization is
+ * needed between access of v_iflags and lockless vnode refcount (v_holdcnt
+ * and v_usecount) updates. Access to v_iflags is generally synchronized
+ * by the interlock, but we have some internal assertions that check vnode
+ * flags without acquiring the lock. Thus, these fences are INVARIANTS-only
+ * for now.
+ */
+#ifdef INVARIANTS
+#define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq()
+#define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel()
+#else
+#define VNODE_REFCOUNT_FENCE_ACQ()
+#define VNODE_REFCOUNT_FENCE_REL()
+#endif
+
+/*
+ * Number of vnodes in existence. Increased whenever getnewvnode()
+ * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
+ */
+static unsigned long numvnodes;
+
+SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
+ "Number of vnodes in existence");
+
+static counter_u64_t vnodes_created;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
+ "Number of vnodes created by getnewvnode");
+
+static u_long mnt_free_list_batch = 128;
+SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW,
+ &mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list");
+
+/*
+ * Conversion tables for conversion from vnode types to inode formats
+ * and back.
+ */
+enum vtype iftovt_tab[16] = {
+ VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+ VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
+};
+int vttoif_tab[10] = {
+ 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
+ S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
+};
+
+/*
+ * List of vnodes that are ready for recycling.
+ */
+static TAILQ_HEAD(freelst, vnode) vnode_free_list;
+
+/*
+ * "Free" vnode target. Free vnodes are rarely completely free, but are
+ * just ones that are cheap to recycle. Usually they are for files which
+ * have been stat'd but not read; these usually have inode and namecache
+ * data attached to them. This target is the preferred minimum size of a
+ * sub-cache consisting mostly of such files. The system balances the size
+ * of this sub-cache with its complement to try to prevent either from
+ * thrashing while the other is relatively inactive. The targets express
+ * a preference for the best balance.
+ *
+ * "Above" this target there are 2 further targets (watermarks) related
+ * to recycling of free vnodes. In the best-operating case, the cache is
+ * exactly full, the free list has size between vlowat and vhiwat above the
+ * free target, and recycling from it and normal use maintains this state.
+ * Sometimes the free list is below vlowat or even empty, but this state
+ * is even better for immediate use provided the cache is not full.
+ * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
+ * ones) to reach one of these states. The watermarks are currently hard-
+ * coded as 4% and 9% of the available space higher. These and the default
+ * of 25% for wantfreevnodes are too large if the memory size is large.
+ * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
+ * whenever vnlru_proc() becomes active.
+ */
+static u_long wantfreevnodes;
+SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
+ &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
+static u_long freevnodes;
+SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
+ &freevnodes, 0, "Number of \"free\" vnodes");
+
+static counter_u64_t recycles_count;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
+ "Number of vnodes recycled to meet vnode cache targets");
+
+/*
+ * Various variables used for debugging the new implementation of
+ * reassignbuf().
+ * XXX these are probably of (very) limited utility now.
+ */
+static int reassignbufcalls;
+SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
+ "Number of calls to reassignbuf");
+
+static counter_u64_t free_owe_inact;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact,
+ "Number of times free vnodes kept on active list due to VFS "
+ "owing inactivation");
+
+/* To keep more than one thread at a time from running vfs_getnewfsid */
+static struct mtx mntid_mtx;
+
+/*
+ * Lock for any access to the following:
+ * vnode_free_list
+ * numvnodes
+ * freevnodes
+ */
+static struct mtx vnode_free_list_mtx;
+
+/* Publicly exported FS */
+struct nfs_public nfs_pub;
+
+static uma_zone_t buf_trie_zone;
+
+/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
+static uma_zone_t vnode_zone;
+static uma_zone_t vnodepoll_zone;
+
+/*
+ * The workitem queue.
+ *
+ * It is useful to delay writes of file data and filesystem metadata
+ * for tens of seconds so that quickly created and deleted files need
+ * not waste disk bandwidth being created and removed. To realize this,
+ * we append vnodes to a "workitem" queue. When running with a soft
+ * updates implementation, most pending metadata dependencies should
+ * not wait for more than a few seconds. Thus, metadata updates on
+ * block devices are delayed only about half the time that file data
+ * is delayed. Similarly, directory updates are more critical, so they
+ * are delayed only about a third of the time that file data is
+ * delayed. Thus, there are
+ * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
+ * one each second (driven off the filesystem syncer process). The
+ * syncer_delayno variable indicates the next queue that is to be processed.
+ * Items that need to be processed soon are placed in this queue:
+ *
+ * syncer_workitem_pending[syncer_delayno]
+ *
+ * A delay of fifteen seconds is done by placing the request fifteen
+ * entries later in the queue:
+ *
+ * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
+ *
+ */
+static int syncer_delayno;
+static long syncer_mask;
+LIST_HEAD(synclist, bufobj);
+static struct synclist *syncer_workitem_pending;
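The slot arithmetic described above is simple modular indexing into the array just declared. A sketch of the insertion step (hypothetical helper; the real code, vn_syncer_add_to_worklist() later in this file, additionally maintains BO_ONWORKLST and the worklist length):

	static void
	example_add_to_worklist(struct bufobj *bo, int delay)
	{
		int slot;

		mtx_lock(&sync_mtx);
		if (delay > syncer_maxdelay - 2)
			delay = syncer_maxdelay - 2;
		/* Pick the queue 'delay' seconds ahead of the current one. */
		slot = (syncer_delayno + delay) & syncer_mask;
		LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo,
		    bo_synclist);
		mtx_unlock(&sync_mtx);
	}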
+/*
+ * The sync_mtx protects:
+ * bo->bo_synclist
+ * sync_vnode_count
+ * syncer_delayno
+ * syncer_state
+ * syncer_workitem_pending
+ * syncer_worklist_len
+ * rushjob
+ */
+static struct mtx sync_mtx;
+static struct cv sync_wakeup;
+
+#define SYNCER_MAXDELAY 32
+static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
+static int syncdelay = 30; /* max time to delay syncing data */
+static int filedelay = 30; /* time to delay syncing files */
+SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
+ "Time to delay syncing files (in seconds)");
+static int dirdelay = 29; /* time to delay syncing directories */
+SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
+ "Time to delay syncing directories (in seconds)");
+static int metadelay = 28; /* time to delay syncing metadata */
+SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
+ "Time to delay syncing metadata (in seconds)");
+static int rushjob; /* number of slots to run ASAP */
+static int stat_rush_requests; /* number of times I/O speeded up */
+SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
+ "Number of times I/O speeded up (rush requests)");
+
+/*
+ * When shutting down the syncer, run it at four times normal speed.
+ */
+#define SYNCER_SHUTDOWN_SPEEDUP 4
+static int sync_vnode_count;
+static int syncer_worklist_len;
+static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
+ syncer_state;
+
+/* Target for maximum number of vnodes. */
+int desiredvnodes;
+static int gapvnodes; /* gap between wanted and desired */
+static int vhiwat; /* enough extras after expansion */
+static int vlowat; /* minimal extras before expansion */
+static int vstir; /* nonzero to stir non-free vnodes */
+static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */
+
+static int
+sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
+{
+ int error, old_desiredvnodes;
+
+ old_desiredvnodes = desiredvnodes;
+ if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
+ return (error);
+ if (old_desiredvnodes != desiredvnodes) {
+ wantfreevnodes = desiredvnodes / 4;
+ /* XXX locking seems to be incomplete. */
+ vfs_hash_changesize(desiredvnodes);
+ cache_changesize(desiredvnodes);
+ }
+ return (0);
+}
+
+SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
+ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
+ sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
+SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
+ &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
+static int vnlru_nowhere;
+SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
+ &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
+
+static int
+sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
+{
+ struct vnode *vp;
+ struct nameidata nd;
+ char *buf;
+ unsigned long ndflags;
+ int error;
+
+ if (req->newptr == NULL)
+ return (EINVAL);
+ if (req->newlen >= PATH_MAX)
+ return (E2BIG);
+
+ buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
+ error = SYSCTL_IN(req, buf, req->newlen);
+ if (error != 0)
+ goto out;
+
+ buf[req->newlen] = '\0';
+
+ ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | NOCACHE | SAVENAME;
+ NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
+ if ((error = namei(&nd)) != 0)
+ goto out;
+ vp = nd.ni_vp;
+
+ if ((vp->v_iflag & VI_DOOMED) != 0) {
+ /*
+ * This vnode is being recycled. Return != 0 to let the caller
+ * know that the sysctl had no effect. Return EAGAIN because a
+ * subsequent call will likely succeed (since namei will create
+ * a new vnode if necessary)
+ */
+ error = EAGAIN;
+ goto putvnode;
+ }
+
+ counter_u64_add(recycles_count, 1);
+ vgone(vp);
+putvnode:
+ NDFREE(&nd, 0);
+out:
+ free(buf, M_TEMP);
+ return (error);
+}
+
+static int
+sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
+{
+ struct thread *td = curthread;
+ struct vnode *vp;
+ struct file *fp;
+ int error;
+ int fd;
+
+ if (req->newptr == NULL)
+ return (EBADF);
+
+ error = sysctl_handle_int(oidp, &fd, 0, req);
+ if (error != 0)
+ return (error);
+ error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
+ if (error != 0)
+ return (error);
+ vp = fp->f_vnode;
+
+ error = vn_lock(vp, LK_EXCLUSIVE);
+ if (error != 0)
+ goto drop;
+
+ counter_u64_add(recycles_count, 1);
+ vgone(vp);
+ VOP_UNLOCK(vp, 0);
+drop:
+ fdrop(fp, td);
+ return (error);
+}
+
+SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
+ CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
+ sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
+SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
+ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
+ sysctl_ftry_reclaim_vnode, "I",
+ "Try to reclaim a vnode by its file descriptor");
+
+/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
+static int vnsz2log;
+
+/*
+ * Support for the bufobj clean & dirty pctrie.
+ */
+static void *
+buf_trie_alloc(struct pctrie *ptree)
+{
+
+ return uma_zalloc(buf_trie_zone, M_NOWAIT);
+}
+
+static void
+buf_trie_free(struct pctrie *ptree, void *node)
+{
+
+ uma_zfree(buf_trie_zone, node);
+}
+PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
+
+/*
+ * Initialize the vnode management data structures.
+ *
+ * Reevaluate the following cap on the number of vnodes after the physical
+ * memory size exceeds 512GB. In the limit, as the physical memory size
+ * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
+ */
+#ifndef MAXVNODES_MAX
+#define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */
+#endif
+
+/*
+ * Initialize a vnode as it first enters the zone.
+ */
+static int
+vnode_init(void *mem, int size, int flags)
+{
+ struct vnode *vp;
+
+ vp = mem;
+ bzero(vp, size);
+ /*
+ * Setup locks.
+ */
+ vp->v_vnlock = &vp->v_lock;
+ mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
+ /*
+ * By default, don't allow shared locks unless filesystems opt-in.
+ */
+ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
+ LK_NOSHARE | LK_IS_VNODE);
+ /*
+ * Initialize bufobj.
+ */
+ bufobj_init(&vp->v_bufobj, vp);
+ /*
+ * Initialize namecache.
+ */
+ LIST_INIT(&vp->v_cache_src);
+ TAILQ_INIT(&vp->v_cache_dst);
+ /*
+ * Initialize rangelocks.
+ */
+ rangelock_init(&vp->v_rl);
+ return (0);
+}
+
+/*
+ * Free a vnode when it is cleared from the zone.
+ */
+static void
+vnode_fini(void *mem, int size)
+{
+ struct vnode *vp;
+ struct bufobj *bo;
+
+ vp = mem;
+ rangelock_destroy(&vp->v_rl);
+ lockdestroy(vp->v_vnlock);
+ mtx_destroy(&vp->v_interlock);
+ bo = &vp->v_bufobj;
+ rw_destroy(BO_LOCKPTR(bo));
+}
+
+/*
+ * Provide the size of NFS nclnode and NFS fh for calculation of the
+ * vnode memory consumption. The size is specified directly to
+ * eliminate dependency on NFS-private header.
+ *
+ * Other filesystems may use bigger or smaller (like UFS and ZFS)
+ * private inode data, but the NFS-based estimation is ample enough.
+ * Still, we care about differences in the size between 64- and 32-bit
+ * platforms.
+ *
+ * Namecache structure size is heuristically
+ * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
+ */
+#ifdef _LP64
+#define NFS_NCLNODE_SZ (528 + 64)
+#define NC_SZ 148
+#else
+#define NFS_NCLNODE_SZ (360 + 32)
+#define NC_SZ 92
+#endif
+
+static void
+vntblinit(void *dummy __unused)
+{
+ u_int i;
+ int physvnodes, virtvnodes;
+
+ /*
+ * Desiredvnodes is a function of the physical memory size and the
+ * kernel's heap size. Generally speaking, it scales with the
+ * physical memory size. The ratio of desiredvnodes to the physical
+ * memory size is 1:16 until desiredvnodes exceeds 98,304. Thereafter,
+ * the marginal ratio of desiredvnodes to the physical memory size is
+ * 1:64. However, desiredvnodes is limited by the kernel's heap
+ * size. The memory required by desiredvnodes vnodes and vm objects
+ * must not exceed 1/10th of the kernel's heap size.
+ */
+ physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
+ 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
+ virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
+ sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
+ desiredvnodes = min(physvnodes, virtvnodes);
+ if (desiredvnodes > MAXVNODES_MAX) {
+ if (bootverbose)
+ printf("Reducing kern.maxvnodes %d -> %d\n",
+ desiredvnodes, MAXVNODES_MAX);
+ desiredvnodes = MAXVNODES_MAX;
+ }
+ wantfreevnodes = desiredvnodes / 4;
+ mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
+ TAILQ_INIT(&vnode_free_list);
+ mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
+ vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
+ vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
+ vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ /*
+ * Preallocate enough nodes to support one node per buf so that
+ * we cannot fail an insert. reassignbuf() callers cannot
+ * tolerate the insertion failure.
+ */
+ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
+ NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
+ UMA_ZONE_NOFREE | UMA_ZONE_VM);
+ uma_prealloc(buf_trie_zone, nbuf);
+
+ vnodes_created = counter_u64_alloc(M_WAITOK);
+ recycles_count = counter_u64_alloc(M_WAITOK);
+ free_owe_inact = counter_u64_alloc(M_WAITOK);
+
+ /*
+ * Initialize the filesystem syncer.
+ */
+ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
+ &syncer_mask);
+ syncer_maxdelay = syncer_mask + 1;
+ mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
+ cv_init(&sync_wakeup, "syncer");
+ for (i = 1; i <= sizeof(struct vnode); i <<= 1)
+ vnsz2log++;
+ vnsz2log--;
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
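To make the desiredvnodes formula in vntblinit() concrete, a rough worked example (assuming 4 GB of RAM with 4 KB pages and ignoring the maxproc term): pgtok(vm_cnt.v_page_count) is about 4,194,304 KB, so physvnodes is roughly 4,194,304/64 + 3 * min(98304 * 16, 4,194,304)/64 = 65,536 + 73,728, i.e. about 139,000 vnodes, which is then capped by virtvnodes (one tenth of the kernel heap divided by the per-vnode footprint) and by MAXVNODES_MAX.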
+
+
+/*
+ * Mark a mount point as busy. Used to synchronize access and to delay
+ * unmounting. Note that mountlist_mtx is not released on failure.
+ *
+ * vfs_busy() is a custom lock; it can block the caller.
+ * vfs_busy() only sleeps if an unmount is active on the mount point.
+ * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
+ * lock of any vnode belonging to mp.
+ *
+ * Lookup uses vfs_busy() to traverse mount points.
+ * root fs var fs
+ * / vnode lock A / vnode lock (/var) D
+ * /var vnode lock B /log vnode lock(/var/log) E
+ * vfs_busy lock C vfs_busy lock F
+ *
+ * Within each file system, the lock order is C->A->B and F->D->E.
+ *
+ * When traversing across mounts, the system follows that lock order:
+ *
+ * C->A->B
+ * |
+ * +->F->D->E
+ *
+ * The lookup() process for namei("/var") illustrates the process:
+ * VOP_LOOKUP() obtains B while A is held
+ * vfs_busy() obtains a shared lock on F while A and B are held
+ * vput() releases lock on B
+ * vput() releases lock on A
+ * VFS_ROOT() obtains lock on D while shared lock on F is held
+ * vfs_unbusy() releases shared lock on F
+ * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
+ * Attempt to lock A (instead of vp_crossmp) while D is held would
+ * violate the global order, causing deadlocks.
+ *
+ * dounmount() locks B while F is drained.
+ */
+int
+vfs_busy(struct mount *mp, int flags)
+{
+
+ MPASS((flags & ~MBF_MASK) == 0);
+ CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
+
+ MNT_ILOCK(mp);
+ MNT_REF(mp);
+ /*
+ * If the mount point is currently being unmounted, sleep until its
+ * fate is decided. If the thread doing the unmounting fails, it
+ * will clear the MNTK_UNMOUNT flag before waking us up, indicating
+ * that this mount point has survived the unmount attempt and vfs_busy
+ * should retry. Otherwise the unmounter thread will set the
+ * MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating that the
+ * mount point is about to be really destroyed. vfs_busy needs to
+ * release its reference on the mount point in this case and return
+ * with ENOENT, telling the caller that the mount it tried to busy is
+ * no longer valid.
+ */
+ while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
+ if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ CTR1(KTR_VFS, "%s: failed busying before sleeping",
+ __func__);
+ return (ENOENT);
+ }
+ if (flags & MBF_MNTLSTLOCK)
+ mtx_unlock(&mountlist_mtx);
+ mp->mnt_kern_flag |= MNTK_MWAIT;
+ msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
+ if (flags & MBF_MNTLSTLOCK)
+ mtx_lock(&mountlist_mtx);
+ MNT_ILOCK(mp);
+ }
+ if (flags & MBF_MNTLSTLOCK)
+ mtx_unlock(&mountlist_mtx);
+ mp->mnt_lockref++;
+ MNT_IUNLOCK(mp);
+ return (0);
+}
+
+/*
+ * Free a busy filesystem.
+ */
+void
+vfs_unbusy(struct mount *mp)
+{
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
+ mp->mnt_lockref--;
+ if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
+ MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
+ CTR1(KTR_VFS, "%s: waking up waiters", __func__);
+ mp->mnt_kern_flag &= ~MNTK_DRAINING;
+ wakeup(&mp->mnt_lockref);
+ }
+ MNT_IUNLOCK(mp);
+}
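The usual consumers of vfs_busy()/vfs_unbusy() are mountlist walks that skip mount points being unmounted; vnlru_return_batches() and vnlru_proc() below follow this pattern. A condensed sketch of the idiom (the per-mount work is a placeholder):

	struct mount *mp, *nmp;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* ... operate on the busied mount point ... */
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);

Note that vfs_busy() with MBF_MNTLSTLOCK drops mountlist_mtx on success, which is why the sketch re-acquires it before advancing the iterator.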
+
+/*
+ * Lookup a mount point by filesystem identifier.
+ */
+struct mount *
+vfs_getvfs(fsid_t *fsid)
+{
+ struct mount *mp;
+
+ CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+ vfs_ref(mp);
+ mtx_unlock(&mountlist_mtx);
+ return (mp);
+ }
+ }
+ mtx_unlock(&mountlist_mtx);
+ CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
+ return ((struct mount *) 0);
+}
+
+/*
+ * Lookup a mount point by filesystem identifier, busying it before
+ * returning.
+ *
+ * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
+ * cache of popular filesystem identifiers. The cache is lockless, relying
+ * on the fact that struct mounts are never freed. In the worst case we
+ * may get a pointer to an unmounted or even a different filesystem, so
+ * we have to check what we got and take the slow path if so.
+ */
+struct mount *
+vfs_busyfs(fsid_t *fsid)
+{
+#define FSID_CACHE_SIZE 256
+ typedef struct mount * volatile vmp_t;
+ static vmp_t cache[FSID_CACHE_SIZE];
+ struct mount *mp;
+ int error;
+ uint32_t hash;
+
+ CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
+ hash = fsid->val[0] ^ fsid->val[1];
+ hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
+ mp = cache[hash];
+ if (mp == NULL ||
+ mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
+ mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
+ goto slow;
+ if (vfs_busy(mp, 0) != 0) {
+ cache[hash] = NULL;
+ goto slow;
+ }
+ if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
+ return (mp);
+ else
+ vfs_unbusy(mp);
+
+slow:
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+ error = vfs_busy(mp, MBF_MNTLSTLOCK);
+ if (error) {
+ cache[hash] = NULL;
+ mtx_unlock(&mountlist_mtx);
+ return (NULL);
+ }
+ cache[hash] = mp;
+ return (mp);
+ }
+ }
+ CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
+ mtx_unlock(&mountlist_mtx);
+ return ((struct mount *) 0);
+}
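A typical caller treats the return value of vfs_busyfs() as a reference that must be released with vfs_unbusy(); a small sketch (error handling reduced to the essentials):

	struct mount *mp;

	mp = vfs_busyfs(&fsid);		/* fsid from e.g. a statfs result */
	if (mp == NULL)
		return (ENOENT);
	/* The mount point cannot finish unmounting while it is busied. */
	vfs_unbusy(mp);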
+
+/*
+ * Check if a user can access privileged mount options.
+ */
+int
+vfs_suser(struct mount *mp, struct thread *td)
+{
+ int error;
+
+ if (jailed(td->td_ucred)) {
+ /*
+ * If the jail of the calling thread lacks permission for
+ * this type of file system, deny immediately.
+ */
+ if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
+ return (EPERM);
+
+ /*
+ * If the file system was mounted outside the jail of the
+ * calling thread, deny immediately.
+ */
+ if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
+ return (EPERM);
+ }
+
+ /*
+ * If file system supports delegated administration, we don't check
+ * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
+ * by the file system itself.
+ * If this is not the user that did original mount, we check for
+ * the PRIV_VFS_MOUNT_OWNER privilege.
+ */
+ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
+ mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
+ if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
+ return (error);
+ }
+ return (0);
+}
+
+/*
+ * Get a new unique fsid. Try to make its val[0] unique, since this value
+ * will be used to create fake device numbers for stat(). Also try (but
+ * not so hard) to make its val[0] unique mod 2^16, since some emulators only
+ * support 16-bit device numbers. We end up with unique val[0]'s for the
+ * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
+ *
+ * Keep in mind that several mounts may be running in parallel. Starting
+ * the search one past where the previous search terminated is both a
+ * micro-optimization and a defense against returning the same fsid to
+ * different mounts.
+ */
+void
+vfs_getnewfsid(struct mount *mp)
+{
+ static uint16_t mntid_base;
+ struct mount *nmp;
+ fsid_t tfsid;
+ int mtype;
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ mtx_lock(&mntid_mtx);
+ mtype = mp->mnt_vfc->vfc_typenum;
+ tfsid.val[1] = mtype;
+ mtype = (mtype & 0xFF) << 24;
+ for (;;) {
+ tfsid.val[0] = makedev(255,
+ mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
+ mntid_base++;
+ if ((nmp = vfs_getvfs(&tfsid)) == NULL)
+ break;
+ vfs_rel(nmp);
+ }
+ mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
+ mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
+ mtx_unlock(&mntid_mtx);
+}
+
+/*
+ * Knob to control the precision of file timestamps:
+ *
+ * 0 = seconds only; nanoseconds zeroed.
+ * 1 = seconds and nanoseconds, accurate within 1/HZ.
+ * 2 = seconds and nanoseconds, truncated to microseconds.
+ * >=3 = seconds and nanoseconds, maximum precision.
+ */
+enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
+
+static int timestamp_precision = TSP_USEC;
+SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
+ &timestamp_precision, 0, "File timestamp precision (0: seconds, "
+ "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
+ "3+: sec + ns (max. precision))");
+
+/*
+ * Get a current timestamp.
+ */
+void
+vfs_timestamp(struct timespec *tsp)
+{
+ struct timeval tv;
+
+ switch (timestamp_precision) {
+ case TSP_SEC:
+ tsp->tv_sec = time_second;
+ tsp->tv_nsec = 0;
+ break;
+ case TSP_HZ:
+ getnanotime(tsp);
+ break;
+ case TSP_USEC:
+ microtime(&tv);
+ TIMEVAL_TO_TIMESPEC(&tv, tsp);
+ break;
+ case TSP_NSEC:
+ default:
+ nanotime(tsp);
+ break;
+ }
+}
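Filesystems normally call vfs_timestamp() when marking files modified, and the precision of the result is governed by the vfs.timestamp_precision sysctl declared above. A minimal sketch (the inode pointer and its time fields are hypothetical):

	struct timespec ts;

	vfs_timestamp(&ts);
	ip->i_mtime = ts;	/* granularity set by vfs.timestamp_precision */
	ip->i_ctime = ts;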
+
+/*
+ * Set vnode attributes to VNOVAL
+ */
+void
+vattr_null(struct vattr *vap)
+{
+
+ vap->va_type = VNON;
+ vap->va_size = VNOVAL;
+ vap->va_bytes = VNOVAL;
+ vap->va_mode = VNOVAL;
+ vap->va_nlink = VNOVAL;
+ vap->va_uid = VNOVAL;
+ vap->va_gid = VNOVAL;
+ vap->va_fsid = VNOVAL;
+ vap->va_fileid = VNOVAL;
+ vap->va_blocksize = VNOVAL;
+ vap->va_rdev = VNOVAL;
+ vap->va_atime.tv_sec = VNOVAL;
+ vap->va_atime.tv_nsec = VNOVAL;
+ vap->va_mtime.tv_sec = VNOVAL;
+ vap->va_mtime.tv_nsec = VNOVAL;
+ vap->va_ctime.tv_sec = VNOVAL;
+ vap->va_ctime.tv_nsec = VNOVAL;
+ vap->va_birthtime.tv_sec = VNOVAL;
+ vap->va_birthtime.tv_nsec = VNOVAL;
+ vap->va_flags = VNOVAL;
+ vap->va_gen = VNOVAL;
+ vap->va_vaflags = 0;
+}
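vattr_null() is typically used to build a sparse attribute set before VOP_SETATTR(): fields left at VNOVAL are ignored by the filesystem, so only the explicitly assigned attributes change. A small sketch (vnode locking elided):

	struct vattr va;
	int error;

	vattr_null(&va);
	va.va_size = 0;		/* truncate; all other fields stay VNOVAL */
	error = VOP_SETATTR(vp, &va, curthread->td_ucred);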
+
+/*
+ * This routine is called when we have too many vnodes. It attempts
+ * to free <count> vnodes and will potentially free vnodes that still
+ * have VM backing store (VM backing store is typically the cause
+ * of a vnode blowout so we want to do this). Therefore, this operation
+ * is not considered cheap.
+ *
+ * A number of conditions may prevent a vnode from being reclaimed:
+ * the buffer cache may have references on the vnode, a directory
+ * vnode may still have references due to the namei cache representing
+ * underlying files, or the vnode may be in active use. It is not
+ * desirable to reuse such vnodes. These conditions may cause the
+ * number of vnodes to reach some minimum value regardless of what
+ * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
+ *
+ * @param mp Try to reclaim vnodes from this mountpoint
+ * @param reclaim_nc_src Only reclaim directories with outgoing namecache
+ * entries if this argument is true
+ * @param trigger Only reclaim vnodes with fewer than this many resident
+ * pages.
+ * @return The number of vnodes that were reclaimed.
+ */
+static int
+vlrureclaim(struct mount *mp, bool reclaim_nc_src, int trigger)
+{
+ struct vnode *vp;
+ int count, done, target;
+
+ done = 0;
+ vn_start_write(NULL, &mp, V_WAIT);
+ MNT_ILOCK(mp);
+ count = mp->mnt_nvnodelistsize;
+ target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
+ target = target / 10 + 1;
+ while (count != 0 && done < target) {
+ vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
+ while (vp != NULL && vp->v_type == VMARKER)
+ vp = TAILQ_NEXT(vp, v_nmntvnodes);
+ if (vp == NULL)
+ break;
+ /*
+ * XXX LRU is completely broken for non-free vnodes. First
+ * by calling here in mountpoint order, then by moving
+ * unselected vnodes to the end here, and most grossly by
+ * removing the vlruvp() function that was supposed to
+ * maintain the order. (This function was born broken
+ * since syncer problems prevented it doing anything.) The
+ * order is closer to LRC (C = Created).
+ *
+ * LRU reclaiming of vnodes seems to have last worked in
+ * FreeBSD-3 where LRU wasn't mentioned under any spelling.
+ * Then there was no hold count, and inactive vnodes were
+ * simply put on the free list in LRU order. The separate
+ * lists also break LRU. We prefer to reclaim from the
+ * free list for technical reasons. This tends to thrash
+ * the free list to keep very unrecently used held vnodes.
+ * The problem is mitigated by keeping the free list large.
+ */
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ --count;
+ if (!VI_TRYLOCK(vp))
+ goto next_iter;
+ /*
+ * If it's been deconstructed already, it's still
+ * referenced, or it exceeds the trigger, skip it.
+ * Also skip free vnodes. We are trying to make space
+ * to expand the free list, not reduce it.
+ */
+ if (vp->v_usecount ||
+ (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
+ ((vp->v_iflag & VI_FREE) != 0) ||
+ (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
+ vp->v_object->resident_page_count > trigger)) {
+ VI_UNLOCK(vp);
+ goto next_iter;
+ }
+ MNT_IUNLOCK(mp);
+ vholdl(vp);
+ if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
+ vdrop(vp);
+ goto next_iter_mntunlocked;
+ }
+ VI_LOCK(vp);
+ /*
+ * v_usecount may have been bumped after VOP_LOCK() dropped
+ * the vnode interlock and before it was locked again.
+ *
+ * It is not necessary to recheck VI_DOOMED because it can
+ * only be set by another thread that holds both the vnode
+ * lock and vnode interlock. If another thread has the
+ * vnode lock before we get to VOP_LOCK() and obtains the
+ * vnode interlock after VOP_LOCK() drops the vnode
+ * interlock, the other thread will be unable to drop the
+ * vnode lock before our VOP_LOCK() call fails.
+ */
+ if (vp->v_usecount ||
+ (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
+ (vp->v_iflag & VI_FREE) != 0 ||
+ (vp->v_object != NULL &&
+ vp->v_object->resident_page_count > trigger)) {
+ VOP_UNLOCK(vp, LK_INTERLOCK);
+ vdrop(vp);
+ goto next_iter_mntunlocked;
+ }
+ KASSERT((vp->v_iflag & VI_DOOMED) == 0,
+ ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
+ counter_u64_add(recycles_count, 1);
+ vgonel(vp);
+ VOP_UNLOCK(vp, 0);
+ vdropl(vp);
+ done++;
+next_iter_mntunlocked:
+ if (!should_yield())
+ goto relock_mnt;
+ goto yield;
+next_iter:
+ if (!should_yield())
+ continue;
+ MNT_IUNLOCK(mp);
+yield:
+ kern_yield(PRI_USER);
+relock_mnt:
+ MNT_ILOCK(mp);
+ }
+ MNT_IUNLOCK(mp);
+ vn_finished_write(mp);
+ return done;
+}
+
+static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
+SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
+ 0,
+ "limit on vnode free requests per call to the vnlru_free routine");
+
+/*
+ * Attempt to reduce the free list by the requested amount.
+ */
+static void
+vnlru_free_locked(int count, struct vfsops *mnt_op)
+{
+ struct vnode *vp;
+ struct mount *mp;
+ bool tried_batches;
+
+ tried_batches = false;
+ mtx_assert(&vnode_free_list_mtx, MA_OWNED);
+ if (count > max_vnlru_free)
+ count = max_vnlru_free;
+ for (; count > 0; count--) {
+ vp = TAILQ_FIRST(&vnode_free_list);
+ /*
+ * The list can be modified while the free_list_mtx
+ * has been dropped and vp could be NULL here.
+ */
+ if (vp == NULL) {
+ if (tried_batches)
+ break;
+ mtx_unlock(&vnode_free_list_mtx);
+ vnlru_return_batches(mnt_op);
+ tried_batches = true;
+ mtx_lock(&vnode_free_list_mtx);
+ continue;
+ }
+
+ VNASSERT(vp->v_op != NULL, vp,
+ ("vnlru_free: vnode already reclaimed."));
+ KASSERT((vp->v_iflag & VI_FREE) != 0,
+ ("Removing vnode not on freelist"));
+ KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
+ ("Mangling active vnode"));
+ TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
+
+ /*
+ * Don't recycle if our vnode is from a different type
+ * of mount point. Note that mp is type-stable, so the
+ * check does not touch an unmapped address even if the
+ * vnode is reclaimed.
+ * Don't recycle if we can't get the interlock without
+ * blocking.
+ */
+ if ((mnt_op != NULL && (mp = vp->v_mount) != NULL &&
+ mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
+ continue;
+ }
+ VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
+ vp, ("vp inconsistent on freelist"));
+
+ /*
+ * The clear of VI_FREE prevents activation of the
+ * vnode. There is no sense in putting the vnode on
+ * the mount point active list, only to remove it
+ * later during recycling. Inline the relevant part
+ * of vholdl(), to avoid triggering assertions or
+ * activating.
+ */
+ freevnodes--;
+ vp->v_iflag &= ~VI_FREE;
+ VNODE_REFCOUNT_FENCE_REL();
+ refcount_acquire(&vp->v_holdcnt);
+
+ mtx_unlock(&vnode_free_list_mtx);
+ VI_UNLOCK(vp);
+ vtryrecycle(vp);
+ /*
+ * If the recycle succeeded, this vdrop will actually free
+ * the vnode. If not, it will simply place it back on
+ * the free list.
+ */
+ vdrop(vp);
+ mtx_lock(&vnode_free_list_mtx);
+ }
+}
+
+void
+vnlru_free(int count, struct vfsops *mnt_op)
+{
+
+ mtx_lock(&vnode_free_list_mtx);
+ vnlru_free_locked(count, mnt_op);
+ mtx_unlock(&vnode_free_list_mtx);
+}
+
+
+/* XXX some names and initialization are bad for limits and watermarks. */
+static int
+vspace(void)
+{
+ int space;
+
+ gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
+ vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
+ vlowat = vhiwat / 2;
+ if (numvnodes > desiredvnodes)
+ return (0);
+ space = desiredvnodes - numvnodes;
+ if (freevnodes > wantfreevnodes)
+ space += freevnodes - wantfreevnodes;
+ return (space);
+}
+
+static void
+vnlru_return_batch_locked(struct mount *mp)
+{
+ struct vnode *vp;
+
+ mtx_assert(&mp->mnt_listmtx, MA_OWNED);
+
+ if (mp->mnt_tmpfreevnodelistsize == 0)
+ return;
+
+ TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) {
+ VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp,
+ ("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist"));
+ vp->v_mflag &= ~VMP_TMPMNTFREELIST;
+ }
+ mtx_lock(&vnode_free_list_mtx);
+ TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist);
+ freevnodes += mp->mnt_tmpfreevnodelistsize;
+ mtx_unlock(&vnode_free_list_mtx);
+ mp->mnt_tmpfreevnodelistsize = 0;
+}
+
+static void
+vnlru_return_batch(struct mount *mp)
+{
+
+ mtx_lock(&mp->mnt_listmtx);
+ vnlru_return_batch_locked(mp);
+ mtx_unlock(&mp->mnt_listmtx);
+}
+
+static void
+vnlru_return_batches(struct vfsops *mnt_op)
+{
+ struct mount *mp, *nmp;
+ bool need_unbusy;
+
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ need_unbusy = false;
+ if (mnt_op != NULL && mp->mnt_op != mnt_op)
+ goto next;
+ if (mp->mnt_tmpfreevnodelistsize == 0)
+ goto next;
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) {
+ vnlru_return_batch(mp);
+ need_unbusy = true;
+ mtx_lock(&mountlist_mtx);
+ }
+next:
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ if (need_unbusy)
+ vfs_unbusy(mp);
+ }
+ mtx_unlock(&mountlist_mtx);
+}
+
+/*
+ * Attempt to recycle vnodes in a context that is always safe to block.
+ * Calling vlrureclaim() from the bowels of filesystem code has some
+ * interesting deadlock problems.
+ */
+static struct proc *vnlruproc;
+static int vnlruproc_sig;
+
+static void
+vnlru_proc(void)
+{
+ struct mount *mp, *nmp;
+ unsigned long onumvnodes;
+ int done, force, trigger, usevnodes, vsp;
+ bool reclaim_nc_src;
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
+ SHUTDOWN_PRI_FIRST);
+
+ force = 0;
+ for (;;) {
+ kproc_suspend_check(vnlruproc);
+ mtx_lock(&vnode_free_list_mtx);
+ /*
+ * If numvnodes is too large (due to desiredvnodes being
+ * adjusted using its sysctl, or emergency growth), first
+ * try to reduce it by discarding from the free list.
+ */
+ if (numvnodes > desiredvnodes)
+ vnlru_free_locked(numvnodes - desiredvnodes, NULL);
+ /*
+ * Sleep if the vnode cache is in a good state. This is
+ * when it is not over-full and has space for about a 4%
+ * or 9% expansion (by growing its size or by moderately
+ * reducing its free list). Otherwise, try to reclaim
+ * space for a 10% expansion.
+ */
+ if (vstir && force == 0) {
+ force = 1;
+ vstir = 0;
+ }
+ vsp = vspace();
+ if (vsp >= vlowat && force == 0) {
+ vnlruproc_sig = 0;
+ wakeup(&vnlruproc_sig);
+ msleep(vnlruproc, &vnode_free_list_mtx,
+ PVFS|PDROP, "vlruwt", hz);
+ continue;
+ }
+ mtx_unlock(&vnode_free_list_mtx);
+ done = 0;
+ onumvnodes = numvnodes;
+ /*
+ * Calculate parameters for recycling. These are the same
+ * throughout the loop to give some semblance of fairness.
+ * The trigger point is to avoid recycling vnodes with lots
+ * of resident pages. We aren't trying to free memory; we
+ * are trying to recycle or at least free vnodes.
+ */
+ if (numvnodes <= desiredvnodes)
+ usevnodes = numvnodes - freevnodes;
+ else
+ usevnodes = numvnodes;
+ if (usevnodes <= 0)
+ usevnodes = 1;
+ /*
+ * The trigger value is chosen to be conservatively
+ * large to ensure that it alone doesn't prevent
+ * making progress. The value can easily be so large that
+ * it is effectively infinite in some congested and
+ * misconfigured cases, and this is necessary. Normally
+ * it is about 8 to 100 (pages), which is quite large.
+ */
+ trigger = vm_cnt.v_page_count * 2 / usevnodes;
+ if (force < 2)
+ trigger = vsmalltrigger;
+ reclaim_nc_src = force >= 3;
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ done += vlrureclaim(mp, reclaim_nc_src, trigger);
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp);
+ }
+ mtx_unlock(&mountlist_mtx);
+ if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
+ uma_reclaim();
+ if (done == 0) {
+ if (force == 0 || force == 1) {
+ force = 2;
+ continue;
+ }
+ if (force == 2) {
+ force = 3;
+ continue;
+ }
+ force = 0;
+ vnlru_nowhere++;
+ tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
+ } else
+ kern_yield(PRI_USER);
+ /*
+ * After becoming active to expand above low water, keep
+ * active until above high water.
+ */
+ vsp = vspace();
+ force = vsp < vhiwat;
+ }
+}
+
+static struct kproc_desc vnlru_kp = {
+ "vnlru",
+ vnlru_proc,
+ &vnlruproc
+};
+SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
+ &vnlru_kp);
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+
+/*
+ * Try to recycle a freed vnode. We abort if anyone picks up a reference
+ * before we actually vgone(). This function must be called with the vnode
+ * held to prevent the vnode from being returned to the free list midway
+ * through vgone().
+ */
+static int
+vtryrecycle(struct vnode *vp)
+{
+ struct mount *vnmp;
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ VNASSERT(vp->v_holdcnt, vp,
+ ("vtryrecycle: Recycling vp %p without a reference.", vp));
+ /*
+ * This vnode may be found and locked via some other list; if so we
+ * can't recycle it yet.
+ */
+ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
+ CTR2(KTR_VFS,
+ "%s: impossible to recycle, vp %p lock is already held",
+ __func__, vp);
+ return (EWOULDBLOCK);
+ }
+ /*
+ * Don't recycle if its filesystem is being suspended.
+ */
+ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
+ VOP_UNLOCK(vp, 0);
+ CTR2(KTR_VFS,
+ "%s: impossible to recycle, cannot start the write for %p",
+ __func__, vp);
+ return (EBUSY);
+ }
+ /*
+ * If we got this far, we need to acquire the interlock and see if
+ * anyone picked up this vnode from another list. If not, we will
+ * mark it with DOOMED via vgonel() so that anyone who does find it
+ * will skip over it.
+ */
+ VI_LOCK(vp);
+ if (vp->v_usecount) {
+ VOP_UNLOCK(vp, LK_INTERLOCK);
+ vn_finished_write(vnmp);
+ CTR2(KTR_VFS,
+ "%s: impossible to recycle, %p is already referenced",
+ __func__, vp);
+ return (EBUSY);
+ }
+ if ((vp->v_iflag & VI_DOOMED) == 0) {
+ counter_u64_add(recycles_count, 1);
+ vgonel(vp);
+ }
+ VOP_UNLOCK(vp, LK_INTERLOCK);
+ vn_finished_write(vnmp);
+ return (0);
+}
+
+static void
+vcheckspace(void)
+{
+ int vsp;
+
+ vsp = vspace();
+ if (vsp < vlowat && vnlruproc_sig == 0) {
+ vnlruproc_sig = 1;
+ wakeup(vnlruproc);
+ }
+}
+
+/*
+ * Wait if necessary for space for a new vnode.
+ */
+static int
+getnewvnode_wait(int suspended)
+{
+
+ mtx_assert(&vnode_free_list_mtx, MA_OWNED);
+ if (numvnodes >= desiredvnodes) {
+ if (suspended) {
+ /*
+ * The file system is being suspended. We cannot
+ * risk a deadlock here, so allow allocation of
+ * another vnode even if this would give too many.
+ */
+ return (0);
+ }
+ if (vnlruproc_sig == 0) {
+ vnlruproc_sig = 1; /* avoid unnecessary wakeups */
+ wakeup(vnlruproc);
+ }
+ msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
+ "vlruwk", hz);
+ }
+ /* Post-adjust like the pre-adjust in getnewvnode(). */
+ if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
+ vnlru_free_locked(1, NULL);
+ return (numvnodes >= desiredvnodes ? ENFILE : 0);
+}
+
+/*
+ * This hack is fragile, and probably not needed any more now that the
+ * watermark handling works.
+ */
+void
+getnewvnode_reserve(u_int count)
+{
+ struct thread *td;
+
+ /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
+ /* XXX no longer so quick, but this part is not racy. */
+ mtx_lock(&vnode_free_list_mtx);
+ if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
+ vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes,
+ freevnodes - wantfreevnodes), NULL);
+ mtx_unlock(&vnode_free_list_mtx);
+
+ td = curthread;
+ /* First try to be quick and racy. */
+ if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
+ td->td_vp_reserv += count;
+ vcheckspace(); /* XXX no longer so quick, but more racy */
+ return;
+ } else
+ atomic_subtract_long(&numvnodes, count);
+
+ mtx_lock(&vnode_free_list_mtx);
+ while (count > 0) {
+ if (getnewvnode_wait(0) == 0) {
+ count--;
+ td->td_vp_reserv++;
+ atomic_add_long(&numvnodes, 1);
+ }
+ }
+ vcheckspace();
+ mtx_unlock(&vnode_free_list_mtx);
+}
+
+/*
+ * This hack is fragile, especially if desiredvnodes or wantfreevnodes are
+ * misconfigured or changed significantly. Reducing desiredvnodes below
+ * the reserved amount should cause bizarre behaviour like reducing it
+ * below the number of active vnodes -- the system will try to reduce
+ * numvnodes to match, but should fail, so the subtraction below should
+ * not overflow.
+ */
+void
+getnewvnode_drop_reserve(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ atomic_subtract_long(&numvnodes, td->td_vp_reserv);
+ td->td_vp_reserv = 0;
+}
+
+/*
+ * Return the next vnode from the free list.
+ */
+int
+getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
+ struct vnode **vpp)
+{
+ struct vnode *vp;
+ struct thread *td;
+ struct lock_object *lo;
+ static int cyclecount;
+ int error __unused;
+
+ CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
+ vp = NULL;
+ td = curthread;
+ if (td->td_vp_reserv > 0) {
+ td->td_vp_reserv -= 1;
+ goto alloc;
+ }
+ mtx_lock(&vnode_free_list_mtx);
+ if (numvnodes < desiredvnodes)
+ cyclecount = 0;
+ else if (cyclecount++ >= freevnodes) {
+ cyclecount = 0;
+ vstir = 1;
+ }
+ /*
+ * Grow the vnode cache if it will not be above its target max
+ * after growing. Otherwise, if the free list is nonempty, try
+ * to reclaim 1 item from it before growing the cache (possibly
+ * above its target max if the reclamation failed or is delayed).
+ * Otherwise, wait for some space. In all cases, schedule
+ * vnlru_proc() if we are getting short of space. The watermarks
+ * should be chosen so that we never wait or even reclaim from
+ * the free list to below its target minimum.
+ */
+ if (numvnodes + 1 <= desiredvnodes)
+ ;
+ else if (freevnodes > 0)
+ vnlru_free_locked(1, NULL);
+ else {
+ error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
+ MNTK_SUSPEND));
+#if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
+ if (error != 0) {
+ mtx_unlock(&vnode_free_list_mtx);
+ return (error);
+ }
+#endif
+ }
+ vcheckspace();
+ atomic_add_long(&numvnodes, 1);
+ mtx_unlock(&vnode_free_list_mtx);
+alloc:
+ counter_u64_add(vnodes_created, 1);
+ vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
+ /*
+ * Locks are given the generic name "vnode" when created.
+ * Follow the historic practice of using the filesystem
+ * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
+ *
+ * Locks live in a witness group keyed on their name. Thus,
+ * when a lock is renamed, it must also move from the witness
+ * group of its old name to the witness group of its new name.
+ *
+ * The change only needs to be made when the vnode moves
+ * from one filesystem type to another. We ensure that each
+ * filesystem uses a single static name pointer for its tag so
+ * that we can compare pointers rather than doing a strcmp().
+ */
+ lo = &vp->v_vnlock->lock_object;
+ if (lo->lo_name != tag) {
+ lo->lo_name = tag;
+ WITNESS_DESTROY(lo);
+ WITNESS_INIT(lo, tag);
+ }
+ /*
+ * By default, don't allow shared locks unless filesystems opt-in.
+ */
+ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
+ /*
+ * Finalize various vnode identity bits.
+ */
+ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
+ KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
+ KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
+ vp->v_type = VNON;
+ vp->v_tag = tag;
+ vp->v_op = vops;
+ v_init_counters(vp);
+ vp->v_bufobj.bo_ops = &buf_ops_bio;
+#ifdef DIAGNOSTIC
+ if (mp == NULL && vops != &dead_vnodeops)
+ printf("NULL mp in getnewvnode(9), tag %s\n", tag);
+#endif
+#ifdef MAC
+ mac_vnode_init(vp);
+ if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
+ mac_vnode_associate_singlelabel(mp, vp);
+#endif
+ if (mp != NULL) {
+ vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
+ if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
+ vp->v_vflag |= VV_NOKNOTE;
+ }
+
+ /*
+ * For the filesystems which do not use vfs_hash_insert(),
+ * still initialize v_hash so that vfs_hash_index() is useful.
+ * E.g., nullfs uses vfs_hash_index() on the lower vnode for
+ * its own hashing.
+ */
+ vp->v_hash = (uintptr_t)vp >> vnsz2log;
+
+ *vpp = vp;
+ return (0);
+}
+
+/*
+ * Delete from old mount point vnode list, if on one.
+ */
+static void
+delmntque(struct vnode *vp)
+{
+ struct mount *mp;
+ int active;
+
+ mp = vp->v_mount;
+ if (mp == NULL)
+ return;
+ MNT_ILOCK(mp);
+ VI_LOCK(vp);
+ KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
+ ("Active vnode list size %d > Vnode list size %d",
+ mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
+ active = vp->v_iflag & VI_ACTIVE;
+ vp->v_iflag &= ~VI_ACTIVE;
+ if (active) {
+ mtx_lock(&mp->mnt_listmtx);
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
+ mp->mnt_activevnodelistsize--;
+ mtx_unlock(&mp->mnt_listmtx);
+ }
+ vp->v_mount = NULL;
+ VI_UNLOCK(vp);
+ VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
+ ("bad mount point vnode list size"));
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ mp->mnt_nvnodelistsize--;
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+}
+
+static void
+insmntque_stddtr(struct vnode *vp, void *dtr_arg)
+{
+
+ vp->v_data = NULL;
+ vp->v_op = &dead_vnodeops;
+ vgone(vp);
+ vput(vp);
+}
+
+/*
+ * Insert into list of vnodes for the new mount point, if available.
+ */
+int
+insmntque1(struct vnode *vp, struct mount *mp,
+ void (*dtr)(struct vnode *, void *), void *dtr_arg)
+{
+
+ KASSERT(vp->v_mount == NULL,
+ ("insmntque: vnode already on per mount vnode list"));
+ VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
+ ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
+
+ /*
+ * We acquire the vnode interlock early to ensure that the
+ * vnode cannot be recycled by another process releasing a
+ * holdcnt on it before we get it on both the vnode list
+ * and the active vnode list. The mount mutex protects only
+ * manipulation of the vnode list and the vnode freelist
+ * mutex protects only manipulation of the active vnode list.
+ * Hence the need to hold the vnode interlock throughout.
+ */
+ MNT_ILOCK(mp);
+ VI_LOCK(vp);
+ if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 &&
+ ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
+ mp->mnt_nvnodelistsize == 0)) &&
+ (vp->v_vflag & VV_FORCEINSMQ) == 0) {
+ VI_UNLOCK(vp);
+ MNT_IUNLOCK(mp);
+ if (dtr != NULL)
+ dtr(vp, dtr_arg);
+ return (EBUSY);
+ }
+ vp->v_mount = mp;
+ MNT_REF(mp);
+ TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
+ ("neg mount point vnode list size"));
+ mp->mnt_nvnodelistsize++;
+ KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
+ ("Activating already active vnode"));
+ vp->v_iflag |= VI_ACTIVE;
+ mtx_lock(&mp->mnt_listmtx);
+ TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
+ mp->mnt_activevnodelistsize++;
+ mtx_unlock(&mp->mnt_listmtx);
+ VI_UNLOCK(vp);
+ MNT_IUNLOCK(mp);
+ return (0);
+}
+
+int
+insmntque(struct vnode *vp, struct mount *mp)
+{
+
+ return (insmntque1(vp, mp, insmntque_stddtr, NULL));
+}
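A filesystem's vnode-creation path typically pairs getnewvnode() with insmntque(): allocate the vnode, attach per-filesystem state, lock it exclusively (insmntque() asserts this), and then place it on the mount's vnode list, relying on insmntque_stddtr() to dispose of the vnode if the mount is being torn down. A condensed sketch (myfs_vnodeops and the node pointer are hypothetical):

	struct vnode *vp;
	int error;

	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
	if (error != 0)
		return (error);
	vp->v_data = node;		/* per-filesystem inode data */
	vp->v_type = VREG;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = insmntque(vp, mp);	/* on failure vp is vgone'd and vput */
	if (error != 0)
		return (error);
	*vpp = vp;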
+
+/*
+ * Flush out and invalidate all buffers associated with a bufobj
+ * Called with the underlying object locked.
+ */
+int
+bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
+{
+ int error;
+
+ BO_LOCK(bo);
+ if (flags & V_SAVE) {
+ error = bufobj_wwait(bo, slpflag, slptimeo);
+ if (error) {
+ BO_UNLOCK(bo);
+ return (error);
+ }
+ if (bo->bo_dirty.bv_cnt > 0) {
+ BO_UNLOCK(bo);
+ if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
+ return (error);
+ /*
+ * XXX We could save a lock/unlock if this was only
+ * enabled under INVARIANTS
+ */
+ BO_LOCK(bo);
+ if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
+ panic("vinvalbuf: dirty bufs");
+ }
+ }
+ /*
+ * If you alter this loop please notice that interlock is dropped and
+ * reacquired in flushbuflist. Special care is needed to ensure that
+ * no race conditions occur from this.
+ */
+ do {
+ error = flushbuflist(&bo->bo_clean,
+ flags, bo, slpflag, slptimeo);
+ if (error == 0 && !(flags & V_CLEANONLY))
+ error = flushbuflist(&bo->bo_dirty,
+ flags, bo, slpflag, slptimeo);
+ if (error != 0 && error != EAGAIN) {
+ BO_UNLOCK(bo);
+ return (error);
+ }
+ } while (error != 0);
+
+ /*
+ * Wait for I/O to complete. XXX needs cleaning up. The vnode can
+ * have write I/O in-progress but if there is a VM object then the
+ * VM object can also have read-I/O in-progress.
+ */
+ do {
+ bufobj_wwait(bo, 0, 0);
+ if ((flags & V_VMIO) == 0) {
+ BO_UNLOCK(bo);
+ if (bo->bo_object != NULL) {
+ VM_OBJECT_WLOCK(bo->bo_object);
+ vm_object_pip_wait(bo->bo_object, "bovlbx");
+ VM_OBJECT_WUNLOCK(bo->bo_object);
+ }
+ BO_LOCK(bo);
+ }
+ } while (bo->bo_numoutput > 0);
+ BO_UNLOCK(bo);
+
+ /*
+ * Destroy the copy in the VM cache, too.
+ */
+ if (bo->bo_object != NULL &&
+ (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
+ VM_OBJECT_WLOCK(bo->bo_object);
+ vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
+ OBJPR_CLEANONLY : 0);
+ VM_OBJECT_WUNLOCK(bo->bo_object);
+ }
+
+#ifdef INVARIANTS
+ BO_LOCK(bo);
+ if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
+ V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
+ bo->bo_clean.bv_cnt > 0))
+ panic("vinvalbuf: flush failed");
+ if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
+ bo->bo_dirty.bv_cnt > 0)
+ panic("vinvalbuf: flush dirty failed");
+ BO_UNLOCK(bo);
+#endif
+ return (0);
+}
+
+/*
+ * Flush out and invalidate all buffers associated with a vnode.
+ * Called with the underlying object locked.
+ */
+int
+vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
+{
+
+ CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
+ ASSERT_VOP_LOCKED(vp, "vinvalbuf");
+ if (vp->v_object != NULL && vp->v_object->handle != vp)
+ return (0);
+ return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
+}
+
+/*
+ * Flush out buffers on the specified list.
+ *
+ */
+static int
+flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
+ int slptimeo)
+{
+ struct buf *bp, *nbp;
+ int retval, error;
+ daddr_t lblkno;
+ b_xflags_t xflags;
+
+ ASSERT_BO_WLOCKED(bo);
+
+ retval = 0;
+ TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
+ if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
+ ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
+ continue;
+ }
+ if (nbp != NULL) {
+ lblkno = nbp->b_lblkno;
+ xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
+ }
+ retval = EAGAIN;
+ error = BUF_TIMELOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
+ "flushbuf", slpflag, slptimeo);
+ if (error) {
+ BO_LOCK(bo);
+ return (error != ENOLCK ? error : EAGAIN);
+ }
+ KASSERT(bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p",
+ bp, bp->b_bufobj, bo));
+ /*
+ * XXX Since there are no node locks for NFS, I
+ * believe there is a slight chance that a delayed
+ * write will occur while sleeping just above, so
+ * check for it.
+ */
+ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
+ (flags & V_SAVE)) {
+ bremfree(bp);
+ bp->b_flags |= B_ASYNC;
+ bwrite(bp);
+ BO_LOCK(bo);
+ return (EAGAIN); /* XXX: why not loop ? */
+ }
+ bremfree(bp);
+ bp->b_flags |= (B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ BO_LOCK(bo);
+ if (nbp == NULL)
+ break;
+ nbp = gbincore(bo, lblkno);
+ if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
+ != xflags)
+ break; /* nbp invalid */
+ }
+ return (retval);
+}
+
+int
+bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
+{
+ struct buf *bp;
+ int error;
+ daddr_t lblkno;
+
+ ASSERT_BO_LOCKED(bo);
+
+ for (lblkno = startn;;) {
+again:
+ bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
+ if (bp == NULL || bp->b_lblkno >= endn ||
+ bp->b_lblkno < startn)
+ break;
+ error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
+ LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
+ if (error != 0) {
+ BO_RLOCK(bo);
+ if (error == ENOLCK)
+ goto again;
+ return (error);
+ }
+ KASSERT(bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p",
+ bp, bp->b_bufobj, bo));
+ lblkno = bp->b_lblkno + 1;
+ if ((bp->b_flags & B_MANAGED) == 0)
+ bremfree(bp);
+ bp->b_flags |= B_RELBUF;
+ /*
+ * In the VMIO case, use the B_NOREUSE flag to hint that the
+ * pages backing each buffer in the range are unlikely to be
+ * reused. Dirty buffers will have the hint applied once
+ * they've been written.
+ */
+ if ((bp->b_flags & B_VMIO) != 0)
+ bp->b_flags |= B_NOREUSE;
+ brelse(bp);
+ BO_RLOCK(bo);
+ }
+ return (0);
+}
+
+/*
+ * Truncate a file's buffers and pages to a specified length. This
+ * is in lieu of the old vinvalbuf mechanism, which performed unneeded
+ * sync activity.
+ */
+int
+vtruncbuf(struct vnode *vp, off_t length, int blksize)
+{
+ struct buf *bp, *nbp;
+ struct bufobj *bo;
+ daddr_t startlbn;
+
+ CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
+ vp, blksize, (uintmax_t)length);
+
+ /*
+ * Round up to the *next* lbn.
+ */
+ startlbn = howmany(length, blksize);
+
+ ASSERT_VOP_LOCKED(vp, "vtruncbuf");
+
+ bo = &vp->v_bufobj;
+restart_unlocked:
+ BO_LOCK(bo);
+
+ while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN)
+ ;
+
+ if (length > 0) {
+restartsync:
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ if (bp->b_lblkno > 0)
+ continue;
+ /*
+ * Since we hold the vnode lock this should only
+ * fail if we're racing with the buf daemon.
+ */
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+ BO_LOCKPTR(bo)) == ENOLCK)
+ goto restart_unlocked;
+
+ VNASSERT((bp->b_flags & B_DELWRI), vp,
+ ("buf(%p) on dirty queue without DELWRI", bp));
+
+ bremfree(bp);
+ bawrite(bp);
+ BO_LOCK(bo);
+ goto restartsync;
+ }
+ }
+
+ bufobj_wwait(bo, 0, 0);
+ BO_UNLOCK(bo);
+ vnode_pager_setsize(vp, length);
+
+ return (0);
+}
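+
+/*
+ * Illustrative sketch: a filesystem truncation path would call vtruncbuf()
+ * with the new length and the bufobj block size.  example_truncate() is a
+ * hypothetical helper, not upstream code.
+ */
+#if 0
+static int
+example_truncate(struct vnode *vp, off_t newsize, int blksize)
+{
+
+	ASSERT_VOP_LOCKED(vp, "example_truncate");
+	/* Drop buffers and pages beyond the new end of file. */
+	return (vtruncbuf(vp, newsize, blksize));
+}
+#endif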
+
+/*
+ * Invalidate the cached pages of a file's buffers within the range of block
+ * numbers [startlbn, endlbn).
+ */
+void
+v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
+ int blksize)
+{
+ struct bufobj *bo;
+ off_t start, end;
+
+ ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
+
+ start = blksize * startlbn;
+ end = blksize * endlbn;
+
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+ MPASS(blksize == bo->bo_bsize);
+
+ while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN)
+ ;
+
+ BO_UNLOCK(bo);
+ vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1));
+}
+
+static int
+v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
+ daddr_t startlbn, daddr_t endlbn)
+{
+ struct buf *bp, *nbp;
+ bool anyfreed;
+
+ ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked");
+ ASSERT_BO_LOCKED(bo);
+
+ do {
+ anyfreed = false;
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
+ if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
+ continue;
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+ BO_LOCKPTR(bo)) == ENOLCK) {
+ BO_LOCK(bo);
+ return (EAGAIN);
+ }
+
+ bremfree(bp);
+ bp->b_flags |= B_INVAL | B_RELBUF;
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ anyfreed = true;
+
+ BO_LOCK(bo);
+ if (nbp != NULL &&
+ (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
+ nbp->b_vp != vp ||
+ (nbp->b_flags & B_DELWRI) != 0))
+ return (EAGAIN);
+ }
+
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
+ continue;
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+ BO_LOCKPTR(bo)) == ENOLCK) {
+ BO_LOCK(bo);
+ return (EAGAIN);
+ }
+ bremfree(bp);
+ bp->b_flags |= B_INVAL | B_RELBUF;
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ anyfreed = true;
+
+ BO_LOCK(bo);
+ if (nbp != NULL &&
+ (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
+ (nbp->b_vp != vp) ||
+ (nbp->b_flags & B_DELWRI) == 0))
+ return (EAGAIN);
+ }
+ } while (anyfreed);
+ return (0);
+}
+
+static void
+buf_vlist_remove(struct buf *bp)
+{
+ struct bufv *bv;
+
+ KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
+ ASSERT_BO_WLOCKED(bp->b_bufobj);
+ KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
+ (BX_VNDIRTY|BX_VNCLEAN),
+ ("buf_vlist_remove: Buf %p is on two lists", bp));
+ if (bp->b_xflags & BX_VNDIRTY)
+ bv = &bp->b_bufobj->bo_dirty;
+ else
+ bv = &bp->b_bufobj->bo_clean;
+ BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
+ TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
+ bv->bv_cnt--;
+ bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
+}
+
+/*
+ * Add the buffer to the sorted clean or dirty block list.
+ *
+ * NOTE: xflags is passed as a constant, optimizing this inline function!
+ */
+static void
+buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
+{
+ struct bufv *bv;
+ struct buf *n;
+ int error;
+
+ ASSERT_BO_WLOCKED(bo);
+ KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
+ ("dead bo %p", bo));
+ KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
+ ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
+ bp->b_xflags |= xflags;
+ if (xflags & BX_VNDIRTY)
+ bv = &bo->bo_dirty;
+ else
+ bv = &bo->bo_clean;
+
+ /*
+ * Keep the list ordered. Optimize empty list insertion. Assume
+ * we tend to grow at the tail so lookup_le should usually be cheaper
+ * than _ge.
+ */
+ if (bv->bv_cnt == 0 ||
+ bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
+ TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
+ else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
+ TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
+ else
+ TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
+ error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
+ if (error)
+ panic("buf_vlist_add: Preallocated nodes insufficient.");
+ bv->bv_cnt++;
+}
+
+/*
+ * Look up a buffer using the buffer tries.
+ */
+struct buf *
+gbincore(struct bufobj *bo, daddr_t lblkno)
+{
+ struct buf *bp;
+
+ ASSERT_BO_LOCKED(bo);
+ bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
+ if (bp != NULL)
+ return (bp);
+ return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
+}
+
+/*
+ * Associate a buffer with a vnode.
+ */
+void
+bgetvp(struct vnode *vp, struct buf *bp)
+{
+ struct bufobj *bo;
+
+ bo = &vp->v_bufobj;
+ ASSERT_BO_WLOCKED(bo);
+ VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
+
+ CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
+ VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
+ ("bgetvp: bp already attached! %p", bp));
+
+ vhold(vp);
+ bp->b_vp = vp;
+ bp->b_bufobj = bo;
+ /*
+ * Insert onto list for new vnode.
+ */
+ buf_vlist_add(bp, bo, BX_VNCLEAN);
+}
+
+/*
+ * Disassociate a buffer from a vnode.
+ */
+void
+brelvp(struct buf *bp)
+{
+ struct bufobj *bo;
+ struct vnode *vp;
+
+ CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
+
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ vp = bp->b_vp; /* XXX */
+ bo = bp->b_bufobj;
+ BO_LOCK(bo);
+ if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
+ buf_vlist_remove(bp);
+ else
+ panic("brelvp: Buffer %p not on queue.", bp);
+ if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
+ bo->bo_flag &= ~BO_ONWORKLST;
+ mtx_lock(&sync_mtx);
+ LIST_REMOVE(bo, bo_synclist);
+ syncer_worklist_len--;
+ mtx_unlock(&sync_mtx);
+ }
+ bp->b_vp = NULL;
+ bp->b_bufobj = NULL;
+ BO_UNLOCK(bo);
+ vdrop(vp);
+}
+
+/*
+ * Add an item to the syncer work queue.
+ */
+static void
+vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
+{
+ int slot;
+
+ ASSERT_BO_WLOCKED(bo);
+
+ mtx_lock(&sync_mtx);
+ if (bo->bo_flag & BO_ONWORKLST)
+ LIST_REMOVE(bo, bo_synclist);
+ else {
+ bo->bo_flag |= BO_ONWORKLST;
+ syncer_worklist_len++;
+ }
+
+ if (delay > syncer_maxdelay - 2)
+ delay = syncer_maxdelay - 2;
+ slot = (syncer_delayno + delay) & syncer_mask;
+
+ LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
+ mtx_unlock(&sync_mtx);
+}
+
+static int
+sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
+{
+ int error, len;
+
+ mtx_lock(&sync_mtx);
+ len = syncer_worklist_len - sync_vnode_count;
+ mtx_unlock(&sync_mtx);
+ error = SYSCTL_OUT(req, &len, sizeof(len));
+ return (error);
+}
+
+SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
+ sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
+
+static struct proc *updateproc;
+static void sched_sync(void);
+static struct kproc_desc up_kp = {
+ "syncer",
+ sched_sync,
+ &updateproc
+};
+SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
+
+static int
+sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
+{
+ struct vnode *vp;
+ struct mount *mp;
+
+ *bo = LIST_FIRST(slp);
+ if (*bo == NULL)
+ return (0);
+ vp = bo2vnode(*bo);
+ if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
+ return (1);
+ /*
+ * We use vhold in case the vnode does not
+ * successfully sync. vhold prevents the vnode from
+ * going away when we unlock the sync_mtx so that
+ * we can acquire the vnode interlock.
+ */
+ vholdl(vp);
+ mtx_unlock(&sync_mtx);
+ VI_UNLOCK(vp);
+ if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+ vdrop(vp);
+ mtx_lock(&sync_mtx);
+ return (*bo == LIST_FIRST(slp));
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ (void) VOP_FSYNC(vp, MNT_LAZY, td);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ BO_LOCK(*bo);
+ if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
+ /*
+ * Put us back on the worklist. The worklist
+ * routine will remove us from our current
+ * position and then add us back in at a later
+ * position.
+ */
+ vn_syncer_add_to_worklist(*bo, syncdelay);
+ }
+ BO_UNLOCK(*bo);
+ vdrop(vp);
+ mtx_lock(&sync_mtx);
+ return (0);
+}
+
+static int first_printf = 1;
+
+/*
+ * System filesystem synchronizer daemon.
+ */
+static void
+sched_sync(void)
+{
+ struct synclist *next, *slp;
+ struct bufobj *bo;
+ long starttime;
+ struct thread *td = curthread;
+ int last_work_seen;
+ int net_worklist_len;
+ int syncer_final_iter;
+ int error;
+
+ last_work_seen = 0;
+ syncer_final_iter = 0;
+ syncer_state = SYNCER_RUNNING;
+ starttime = time_uptime;
+ td->td_pflags |= TDP_NORUNNINGBUF;
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
+ SHUTDOWN_PRI_LAST);
+
+ mtx_lock(&sync_mtx);
+ for (;;) {
+ if (syncer_state == SYNCER_FINAL_DELAY &&
+ syncer_final_iter == 0) {
+ mtx_unlock(&sync_mtx);
+ kproc_suspend_check(td->td_proc);
+ mtx_lock(&sync_mtx);
+ }
+ net_worklist_len = syncer_worklist_len - sync_vnode_count;
+ if (syncer_state != SYNCER_RUNNING &&
+ starttime != time_uptime) {
+ if (first_printf) {
+ printf("\nSyncing disks, vnodes remaining... ");
+ first_printf = 0;
+ }
+ printf("%d ", net_worklist_len);
+ }
+ starttime = time_uptime;
+
+ /*
+ * Push files whose dirty time has expired. Be careful
+ * of interrupt race on slp queue.
+ *
+ * Skip over empty worklist slots when shutting down.
+ */
+ do {
+ slp = &syncer_workitem_pending[syncer_delayno];
+ syncer_delayno += 1;
+ if (syncer_delayno == syncer_maxdelay)
+ syncer_delayno = 0;
+ next = &syncer_workitem_pending[syncer_delayno];
+ /*
+ * If the worklist has wrapped since it
+ * was emptied of all but syncer vnodes,
+ * switch to the FINAL_DELAY state and run
+ * for one more second.
+ */
+ if (syncer_state == SYNCER_SHUTTING_DOWN &&
+ net_worklist_len == 0 &&
+ last_work_seen == syncer_delayno) {
+ syncer_state = SYNCER_FINAL_DELAY;
+ syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
+ }
+ } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
+ syncer_worklist_len > 0);
+
+ /*
+ * Keep track of the last time there was anything
+ * on the worklist other than syncer vnodes.
+ * Return to the SHUTTING_DOWN state if any
+ * new work appears.
+ */
+ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
+ last_work_seen = syncer_delayno;
+ if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
+ syncer_state = SYNCER_SHUTTING_DOWN;
+ while (!LIST_EMPTY(slp)) {
+ error = sync_vnode(slp, &bo, td);
+ if (error == 1) {
+ LIST_REMOVE(bo, bo_synclist);
+ LIST_INSERT_HEAD(next, bo, bo_synclist);
+ continue;
+ }
+
+ if (first_printf == 0) {
+ /*
+ * Drop the sync mutex, because some watchdog
+ * drivers need to sleep while patting the watchdog.
+ */
+ mtx_unlock(&sync_mtx);
+ wdog_kern_pat(WD_LASTVAL);
+ mtx_lock(&sync_mtx);
+ }
+
+ }
+ if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
+ syncer_final_iter--;
+ /*
+ * The variable rushjob allows the kernel to speed up the
+ * processing of the filesystem syncer process. A rushjob
+ * value of N tells the filesystem syncer to process the next
+ * N seconds worth of work on its queue ASAP. Currently rushjob
+ * is used by the soft update code to speed up the filesystem
+ * syncer process when the incore state is getting so far
+ * ahead of the disk that the kernel memory pool is being
+ * threatened with exhaustion.
+ */
+ if (rushjob > 0) {
+ rushjob -= 1;
+ continue;
+ }
+ /*
+ * Just sleep for a short period of time between
+ * iterations when shutting down to allow some I/O
+ * to happen.
+ *
+ * If it has taken us less than a second to process the
+ * current work, then wait. Otherwise start right over
+ * again. We can still lose time if any single round
+ * takes more than two seconds, but it does not really
+ * matter as we are just trying to generally pace the
+ * filesystem activity.
+ */
+ if (syncer_state != SYNCER_RUNNING ||
+ time_uptime == starttime) {
+ thread_lock(td);
+ sched_prio(td, PPAUSE);
+ thread_unlock(td);
+ }
+ if (syncer_state != SYNCER_RUNNING)
+ cv_timedwait(&sync_wakeup, &sync_mtx,
+ hz / SYNCER_SHUTDOWN_SPEEDUP);
+ else if (time_uptime == starttime)
+ cv_timedwait(&sync_wakeup, &sync_mtx, hz);
+ }
+}
+
+/*
+ * Request the syncer daemon to speed up its work.
+ * We never push it to speed up more than half of its
+ * normal turn time, otherwise it could take over the cpu.
+ */
+int
+speedup_syncer(void)
+{
+ int ret = 0;
+
+ mtx_lock(&sync_mtx);
+ if (rushjob < syncdelay / 2) {
+ rushjob += 1;
+ stat_rush_requests += 1;
+ ret = 1;
+ }
+ mtx_unlock(&sync_mtx);
+ cv_broadcast(&sync_wakeup);
+ return (ret);
+}
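+
+/*
+ * Illustrative sketch: speedup_syncer() is how a subsystem consumes the
+ * rushjob mechanism described in sched_sync() above.  The caller and its
+ * threshold here are hypothetical; historically the soft updates code is
+ * the consumer.
+ */
+#if 0
+static void
+example_request_faster_sync(int dirty, int dirty_limit)
+{
+
+	/* Ask the syncer to work ahead when dirty state piles up. */
+	if (dirty > dirty_limit && speedup_syncer() == 0)
+		printf("syncer already at maximum speedup\n");
+}
+#endif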
+
+/*
+ * Tell the syncer to speed up its work and run though its work
+ * list several times, then tell it to shut down.
+ */
+static void
+syncer_shutdown(void *arg, int howto)
+{
+
+ if (howto & RB_NOSYNC)
+ return;
+ mtx_lock(&sync_mtx);
+ syncer_state = SYNCER_SHUTTING_DOWN;
+ rushjob = 0;
+ mtx_unlock(&sync_mtx);
+ cv_broadcast(&sync_wakeup);
+ kproc_shutdown(arg, howto);
+}
+
+void
+syncer_suspend(void)
+{
+
+ syncer_shutdown(updateproc, 0);
+}
+
+void
+syncer_resume(void)
+{
+
+ mtx_lock(&sync_mtx);
+ first_printf = 1;
+ syncer_state = SYNCER_RUNNING;
+ mtx_unlock(&sync_mtx);
+ cv_broadcast(&sync_wakeup);
+ kproc_resume(updateproc);
+}
+
+/*
+ * Reassign a buffer from one vnode to another.
+ * Used to assign file specific control information
+ * (indirect blocks) to the vnode to which they belong.
+ */
+void
+reassignbuf(struct buf *bp)
+{
+ struct vnode *vp;
+ struct bufobj *bo;
+ int delay;
+#ifdef INVARIANTS
+ struct bufv *bv;
+#endif
+
+ vp = bp->b_vp;
+ bo = bp->b_bufobj;
+ ++reassignbufcalls;
+
+ CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ /*
+ * B_PAGING flagged buffers cannot be reassigned because their vp
+ * is not fully linked in.
+ */
+ if (bp->b_flags & B_PAGING)
+ panic("cannot reassign paging buffer");
+
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ BO_LOCK(bo);
+ if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
+ buf_vlist_remove(bp);
+ else
+ panic("reassignbuf: Buffer %p not on queue.", bp);
+ /*
+ * If dirty, put on list of dirty buffers; otherwise insert onto list
+ * of clean buffers.
+ */
+ if (bp->b_flags & B_DELWRI) {
+ if ((bo->bo_flag & BO_ONWORKLST) == 0) {
+ switch (vp->v_type) {
+ case VDIR:
+ delay = dirdelay;
+ break;
+ case VCHR:
+ delay = metadelay;
+ break;
+ default:
+ delay = filedelay;
+ }
+ vn_syncer_add_to_worklist(bo, delay);
+ }
+ buf_vlist_add(bp, bo, BX_VNDIRTY);
+ } else {
+ buf_vlist_add(bp, bo, BX_VNCLEAN);
+
+ if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
+ mtx_lock(&sync_mtx);
+ LIST_REMOVE(bo, bo_synclist);
+ syncer_worklist_len--;
+ mtx_unlock(&sync_mtx);
+ bo->bo_flag &= ~BO_ONWORKLST;
+ }
+ }
+#ifdef INVARIANTS
+ bv = &bo->bo_clean;
+ bp = TAILQ_FIRST(&bv->bv_hd);
+ KASSERT(bp == NULL || bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+ bp = TAILQ_LAST(&bv->bv_hd, buflists);
+ KASSERT(bp == NULL || bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+ bv = &bo->bo_dirty;
+ bp = TAILQ_FIRST(&bv->bv_hd);
+ KASSERT(bp == NULL || bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+ bp = TAILQ_LAST(&bv->bv_hd, buflists);
+ KASSERT(bp == NULL || bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+#endif
+ BO_UNLOCK(bo);
+}
+
+static void
+v_init_counters(struct vnode *vp)
+{
+
+ VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
+ vp, ("%s called for an initialized vnode", __FUNCTION__));
+ ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
+
+ refcount_init(&vp->v_holdcnt, 1);
+ refcount_init(&vp->v_usecount, 1);
+}
+
+static void
+v_incr_usecount_locked(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __func__);
+ if ((vp->v_iflag & VI_OWEINACT) != 0) {
+ VNASSERT(vp->v_usecount == 0, vp,
+ ("vnode with usecount and VI_OWEINACT set"));
+ vp->v_iflag &= ~VI_OWEINACT;
+ }
+ refcount_acquire(&vp->v_usecount);
+ v_incr_devcount(vp);
+}
+
+/*
+ * Increment the use count on the vnode, taking care to reference
+ * the driver's usecount if this is a chardev.
+ */
+static void
+v_incr_usecount(struct vnode *vp)
+{
+
+ ASSERT_VI_UNLOCKED(vp, __func__);
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+
+ if (vp->v_type != VCHR &&
+ refcount_acquire_if_not_zero(&vp->v_usecount)) {
+ VNODE_REFCOUNT_FENCE_ACQ();
+ VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
+ ("vnode with usecount and VI_OWEINACT set"));
+ } else {
+ VI_LOCK(vp);
+ v_incr_usecount_locked(vp);
+ VI_UNLOCK(vp);
+ }
+}
+
+/*
+ * Increment si_usecount of the associated device, if any.
+ */
+static void
+v_incr_devcount(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __FUNCTION__);
+ if (vp->v_type == VCHR && vp->v_rdev != NULL) {
+ dev_lock();
+ vp->v_rdev->si_usecount++;
+ dev_unlock();
+ }
+}
+
+/*
+ * Decrement si_usecount of the associated device, if any.
+ */
+static void
+v_decr_devcount(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __FUNCTION__);
+ if (vp->v_type == VCHR && vp->v_rdev != NULL) {
+ dev_lock();
+ vp->v_rdev->si_usecount--;
+ dev_unlock();
+ }
+}
+
+/*
+ * Grab a particular vnode from the free list, increment its
+ * reference count and lock it. VI_DOOMED is set if the vnode
+ * is being destroyed. Only callers who specify LK_RETRY will
+ * see doomed vnodes. If inactive processing was delayed in
+ * vput try to do it here.
+ *
+ * Notes on lockless counter manipulation:
+ * _vhold, vputx and other routines make various decisions based
+ * on either holdcnt or usecount being 0. As long as either counter
+ * is not transitioning 0->1 or 1->0, the manipulation can be done
+ * with atomic operations. Otherwise the interlock is taken covering
+ * both the atomic and additional actions.
+ */
+int
+vget(struct vnode *vp, int flags, struct thread *td)
+{
+ int error, oweinact;
+
+ VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
+ ("vget: invalid lock operation"));
+
+ if ((flags & LK_INTERLOCK) != 0)
+ ASSERT_VI_LOCKED(vp, __func__);
+ else
+ ASSERT_VI_UNLOCKED(vp, __func__);
+ if ((flags & LK_VNHELD) != 0)
+ VNASSERT((vp->v_holdcnt > 0), vp,
+ ("vget: LK_VNHELD passed but vnode not held"));
+
+ CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
+
+ if ((flags & LK_VNHELD) == 0)
+ _vhold(vp, (flags & LK_INTERLOCK) != 0);
+
+ if ((error = vn_lock(vp, flags)) != 0) {
+ vdrop(vp);
+ CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
+ vp);
+ return (error);
+ }
+ if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
+ panic("vget: vn_lock failed to return ENOENT\n");
+ /*
+ * We don't guarantee that any particular close will
+ * trigger inactive processing so just make a best effort
+ * here at preventing a reference to a removed file. If
+ * we don't succeed no harm is done.
+ *
+ * Upgrade our holdcnt to a usecount.
+ */
+ if (vp->v_type == VCHR ||
+ !refcount_acquire_if_not_zero(&vp->v_usecount)) {
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_OWEINACT) == 0) {
+ oweinact = 0;
+ } else {
+ oweinact = 1;
+ vp->v_iflag &= ~VI_OWEINACT;
+ VNODE_REFCOUNT_FENCE_REL();
+ }
+ refcount_acquire(&vp->v_usecount);
+ v_incr_devcount(vp);
+ if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
+ (flags & LK_NOWAIT) == 0)
+ vinactive(vp, td);
+ VI_UNLOCK(vp);
+ }
+ return (0);
+}
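+
+/*
+ * Illustrative sketch: the common pattern for taking a temporary reference
+ * is vget() followed by vput(), similar to the vfs_msync() loop later in
+ * this file.  example_visit() is a hypothetical helper, not upstream code.
+ */
+#if 0
+static void
+example_visit(struct vnode *vp)
+{
+
+	/* Lock and reference the vnode; give up if that fails. */
+	if (vget(vp, LK_EXCLUSIVE | LK_RETRY, curthread) != 0)
+		return;
+	/* ... operate on the locked, referenced vnode ... */
+	vput(vp);	/* drops the lock and the references taken by vget() */
+}
+#endif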
+
+/*
+ * Increase the reference (use) and hold count of a vnode.
+ * This will also remove the vnode from the free list if it is presently free.
+ */
+void
+vref(struct vnode *vp)
+{
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ _vhold(vp, false);
+ v_incr_usecount(vp);
+}
+
+void
+vrefl(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __func__);
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ _vhold(vp, true);
+ v_incr_usecount_locked(vp);
+}
+
+void
+vrefact(struct vnode *vp)
+{
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ if (__predict_false(vp->v_type == VCHR)) {
+ VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp,
+ ("%s: wrong ref counts", __func__));
+ vref(vp);
+ return;
+ }
+#ifdef INVARIANTS
+ int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
+ VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__));
+ old = atomic_fetchadd_int(&vp->v_usecount, 1);
+ VNASSERT(old > 0, vp, ("%s: wrong use count", __func__));
+#else
+ refcount_acquire(&vp->v_holdcnt);
+ refcount_acquire(&vp->v_usecount);
+#endif
+}
+
+/*
+ * Return reference count of a vnode.
+ *
+ * The results of this call are only guaranteed when some mechanism is used to
+ * stop other processes from gaining references to the vnode. This may be the
+ * case if the caller holds the only reference. This is also useful when stale
+ * data is acceptable as race conditions may be accounted for by some other
+ * means.
+ */
+int
+vrefcnt(struct vnode *vp)
+{
+
+ return (vp->v_usecount);
+}
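+
+/*
+ * Illustrative sketch: per the comment above, vrefcnt() is only dependable
+ * when new references cannot appear, for example when the caller believes
+ * it holds the sole use reference.  example_is_sole_user() is hypothetical.
+ */
+#if 0
+static bool
+example_is_sole_user(struct vnode *vp)
+{
+
+	ASSERT_VOP_LOCKED(vp, "example_is_sole_user");
+	/* Treat the result as a hint; stale reads are tolerated. */
+	return (vrefcnt(vp) == 1);
+}
+#endif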
+
+#define VPUTX_VRELE 1
+#define VPUTX_VPUT 2
+#define VPUTX_VUNREF 3
+
+/*
+ * Decrement the use and hold counts for a vnode.
+ *
+ * See an explanation near vget() as to why atomic operation is safe.
+ */
+static void
+vputx(struct vnode *vp, int func)
+{
+ int error;
+
+ KASSERT(vp != NULL, ("vputx: null vp"));
+ if (func == VPUTX_VUNREF)
+ ASSERT_VOP_LOCKED(vp, "vunref");
+ else if (func == VPUTX_VPUT)
+ ASSERT_VOP_LOCKED(vp, "vput");
+ else
+ KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
+ ASSERT_VI_UNLOCKED(vp, __func__);
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+
+ if (vp->v_type != VCHR &&
+ refcount_release_if_not_last(&vp->v_usecount)) {
+ if (func == VPUTX_VPUT)
+ VOP_UNLOCK(vp, 0);
+ vdrop(vp);
+ return;
+ }
+
+ VI_LOCK(vp);
+
+ /*
+ * We want to hold the vnode until the inactive finishes to
+ * prevent vgone() races. We drop the use count here and the
+ * hold count below when we're done.
+ */
+ if (!refcount_release(&vp->v_usecount) ||
+ (vp->v_iflag & VI_DOINGINACT)) {
+ if (func == VPUTX_VPUT)
+ VOP_UNLOCK(vp, 0);
+ v_decr_devcount(vp);
+ vdropl(vp);
+ return;
+ }
+
+ v_decr_devcount(vp);
+
+ error = 0;
+
+ if (vp->v_usecount != 0) {
+ vn_printf(vp, "vputx: usecount not zero for vnode ");
+ panic("vputx: usecount not zero");
+ }
+
+ CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
+
+ /*
+ * We must call VOP_INACTIVE with the node locked. Mark
+ * as VI_DOINGINACT to avoid recursion.
+ */
+ vp->v_iflag |= VI_OWEINACT;
+ switch (func) {
+ case VPUTX_VRELE:
+ error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
+ VI_LOCK(vp);
+ break;
+ case VPUTX_VPUT:
+ if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
+ error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
+ LK_NOWAIT);
+ VI_LOCK(vp);
+ }
+ break;
+ case VPUTX_VUNREF:
+ if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
+ error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
+ VI_LOCK(vp);
+ }
+ break;
+ }
+ VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
+ ("vnode with usecount and VI_OWEINACT set"));
+ if (error == 0) {
+ if (vp->v_iflag & VI_OWEINACT)
+ vinactive(vp, curthread);
+ if (func != VPUTX_VUNREF)
+ VOP_UNLOCK(vp, 0);
+ }
+ vdropl(vp);
+}
+
+/*
+ * Vnode put/release.
+ * If count drops to zero, call inactive routine and return to freelist.
+ */
+void
+vrele(struct vnode *vp)
+{
+
+ vputx(vp, VPUTX_VRELE);
+}
+
+/*
+ * Release an already locked vnode. This gives the same effect as
+ * unlock+vrele(), but takes less time and avoids releasing and
+ * re-acquiring the lock (as vrele() acquires the lock internally).
+ */
+void
+vput(struct vnode *vp)
+{
+
+ vputx(vp, VPUTX_VPUT);
+}
+
+/*
+ * Release an exclusively locked vnode. Do not unlock the vnode lock.
+ */
+void
+vunref(struct vnode *vp)
+{
+
+ vputx(vp, VPUTX_VUNREF);
+}
+
+/*
+ * Increase the hold count and activate if this is the first reference.
+ */
+void
+_vhold(struct vnode *vp, bool locked)
+{
+ struct mount *mp;
+
+ if (locked)
+ ASSERT_VI_LOCKED(vp, __func__);
+ else
+ ASSERT_VI_UNLOCKED(vp, __func__);
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ if (!locked) {
+ if (refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
+ VNODE_REFCOUNT_FENCE_ACQ();
+ VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
+ ("_vhold: vnode with holdcnt is free"));
+ return;
+ }
+ VI_LOCK(vp);
+ }
+ if ((vp->v_iflag & VI_FREE) == 0) {
+ refcount_acquire(&vp->v_holdcnt);
+ if (!locked)
+ VI_UNLOCK(vp);
+ return;
+ }
+ VNASSERT(vp->v_holdcnt == 0, vp,
+ ("%s: wrong hold count", __func__));
+ VNASSERT(vp->v_op != NULL, vp,
+ ("%s: vnode already reclaimed.", __func__));
+ /*
+ * Remove a vnode from the free list, mark it as in use,
+ * and put it on the active list.
+ */
+ VNASSERT(vp->v_mount != NULL, vp,
+ ("_vhold: vnode not on per mount vnode list"));
+ mp = vp->v_mount;
+ mtx_lock(&mp->mnt_listmtx);
+ if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) {
+ TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
+ mp->mnt_tmpfreevnodelistsize--;
+ vp->v_mflag &= ~VMP_TMPMNTFREELIST;
+ } else {
+ mtx_lock(&vnode_free_list_mtx);
+ TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
+ freevnodes--;
+ mtx_unlock(&vnode_free_list_mtx);
+ }
+ KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
+ ("Activating already active vnode"));
+ vp->v_iflag &= ~VI_FREE;
+ vp->v_iflag |= VI_ACTIVE;
+ TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
+ mp->mnt_activevnodelistsize++;
+ mtx_unlock(&mp->mnt_listmtx);
+ refcount_acquire(&vp->v_holdcnt);
+ if (!locked)
+ VI_UNLOCK(vp);
+}
+
+/*
+ * Drop the hold count of the vnode. If this is the last reference to
+ * the vnode, we place it on the free list unless it has been vgone'd
+ * (marked VI_DOOMED), in which case we free it.
+ *
+ * Because the vnode vm object keeps a hold reference on the vnode if
+ * there is at least one resident non-cached page, the vnode cannot
+ * leave the active list without the page cleanup done.
+ */
+void
+_vdrop(struct vnode *vp, bool locked)
+{
+ struct bufobj *bo;
+ struct mount *mp;
+ int active;
+
+ if (locked)
+ ASSERT_VI_LOCKED(vp, __func__);
+ else
+ ASSERT_VI_UNLOCKED(vp, __func__);
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ if ((int)vp->v_holdcnt <= 0)
+ panic("vdrop: holdcnt %d", vp->v_holdcnt);
+ if (!locked) {
+ if (refcount_release_if_not_last(&vp->v_holdcnt))
+ return;
+ VI_LOCK(vp);
+ }
+ if (refcount_release(&vp->v_holdcnt) == 0) {
+ VI_UNLOCK(vp);
+ return;
+ }
+ if ((vp->v_iflag & VI_DOOMED) == 0) {
+ /*
+ * Mark a vnode as free: remove it from its active list
+ * and put it up for recycling on the freelist.
+ */
+ VNASSERT(vp->v_op != NULL, vp,
+ ("vdropl: vnode already reclaimed."));
+ VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
+ ("vnode already free"));
+ VNASSERT(vp->v_holdcnt == 0, vp,
+ ("vdropl: freeing when we shouldn't"));
+ active = vp->v_iflag & VI_ACTIVE;
+ if ((vp->v_iflag & VI_OWEINACT) == 0) {
+ vp->v_iflag &= ~VI_ACTIVE;
+ mp = vp->v_mount;
+ if (mp != NULL) {
+ mtx_lock(&mp->mnt_listmtx);
+ if (active) {
+ TAILQ_REMOVE(&mp->mnt_activevnodelist,
+ vp, v_actfreelist);
+ mp->mnt_activevnodelistsize--;
+ }
+ TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist,
+ vp, v_actfreelist);
+ mp->mnt_tmpfreevnodelistsize++;
+ vp->v_iflag |= VI_FREE;
+ vp->v_mflag |= VMP_TMPMNTFREELIST;
+ VI_UNLOCK(vp);
+ if (mp->mnt_tmpfreevnodelistsize >=
+ mnt_free_list_batch)
+ vnlru_return_batch_locked(mp);
+ mtx_unlock(&mp->mnt_listmtx);
+ } else {
+ VNASSERT(active == 0, vp,
+ ("vdropl: active vnode not on per mount "
+ "vnode list"));
+ mtx_lock(&vnode_free_list_mtx);
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp,
+ v_actfreelist);
+ freevnodes++;
+ vp->v_iflag |= VI_FREE;
+ VI_UNLOCK(vp);
+ mtx_unlock(&vnode_free_list_mtx);
+ }
+ } else {
+ VI_UNLOCK(vp);
+ counter_u64_add(free_owe_inact, 1);
+ }
+ return;
+ }
+ /*
+ * The vnode has been marked for destruction, so free it.
+ *
+ * The vnode will be returned to the zone where it will
+ * normally remain until it is needed for another vnode. We
+ * need to cleanup (or verify that the cleanup has already
+ * been done) any residual data left from its current use
+ * so as not to contaminate the freshly allocated vnode.
+ */
+ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
+ atomic_subtract_long(&numvnodes, 1);
+ bo = &vp->v_bufobj;
+ VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
+ ("cleaned vnode still on the free list."));
+ VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
+ VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
+ VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
+ VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
+ VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
+ VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
+ VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
+ ("clean blk trie not empty"));
+ VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
+ VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
+ ("dirty blk trie not empty"));
+ VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
+ VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
+ VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
+ VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
+ ("Dangling rangelock waiters"));
+ VI_UNLOCK(vp);
+#ifdef MAC
+ mac_vnode_destroy(vp);
+#endif
+ if (vp->v_pollinfo != NULL) {
+ destroy_vpollinfo(vp->v_pollinfo);
+ vp->v_pollinfo = NULL;
+ }
+#ifdef INVARIANTS
+ /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
+ vp->v_op = NULL;
+#endif
+ vp->v_mountedhere = NULL;
+ vp->v_unpcb = NULL;
+ vp->v_rdev = NULL;
+ vp->v_fifoinfo = NULL;
+ vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+ vp->v_iflag = 0;
+ vp->v_vflag = 0;
+ bo->bo_flag = 0;
+ uma_zfree(vnode_zone, vp);
+}
+
+/*
+ * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
+ * flags. DOINGINACT prevents us from recursing in calls to vinactive.
+ * OWEINACT tracks whether a vnode missed a call to inactive due to a
+ * failed lock upgrade.
+ */
+void
+vinactive(struct vnode *vp, struct thread *td)
+{
+ struct vm_object *obj;
+
+ ASSERT_VOP_ELOCKED(vp, "vinactive");
+ ASSERT_VI_LOCKED(vp, "vinactive");
+ VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
+ ("vinactive: recursed on VI_DOINGINACT"));
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_iflag |= VI_DOINGINACT;
+ vp->v_iflag &= ~VI_OWEINACT;
+ VI_UNLOCK(vp);
+ /*
+ * Before moving off the active list, we must be sure that any
+ * modified pages are converted into the vnode's dirty
+ * buffers, since these will no longer be checked once the
+ * vnode is on the inactive list.
+ *
+ * The write-out of the dirty pages is asynchronous. At the
+ * point that VOP_INACTIVE() is called, there could still be
+ * pending I/O and dirty pages in the object.
+ */
+ if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
+ (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
+ VM_OBJECT_WLOCK(obj);
+ vm_object_page_clean(obj, 0, 0, 0);
+ VM_OBJECT_WUNLOCK(obj);
+ }
+ VOP_INACTIVE(vp, td);
+ VI_LOCK(vp);
+ VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
+ ("vinactive: lost VI_DOINGINACT"));
+ vp->v_iflag &= ~VI_DOINGINACT;
+}
+
+/*
+ * Remove any vnodes in the vnode table belonging to mount point mp.
+ *
+ * If FORCECLOSE is not specified, there should not be any active ones,
+ * return error if any are found (nb: this is a user error, not a
+ * system error). If FORCECLOSE is specified, detach any active vnodes
+ * that are found.
+ *
+ * If WRITECLOSE is set, only flush out regular file vnodes open for
+ * writing.
+ *
+ * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
+ *
+ * `rootrefs' specifies the base reference count for the root vnode
+ * of this filesystem. The root vnode is considered busy if its
+ * v_usecount exceeds this value. On a successful return, vflush(, td)
+ * will call vrele() on the root vnode exactly rootrefs times.
+ * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
+ * be zero.
+ */
+#ifdef DIAGNOSTIC
+static int busyprt = 0; /* print out busy vnodes */
+SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
+#endif
+
+int
+vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
+{
+ struct vnode *vp, *mvp, *rootvp = NULL;
+ struct vattr vattr;
+ int busy = 0, error;
+
+ CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
+ rootrefs, flags);
+ if (rootrefs > 0) {
+ KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
+ ("vflush: bad args"));
+ /*
+ * Get the filesystem root vnode. We can vput() it
+ * immediately, since with rootrefs > 0, it won't go away.
+ */
+ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
+ CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
+ __func__, error);
+ return (error);
+ }
+ vput(rootvp);
+ }
+loop:
+ MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
+ vholdl(vp);
+ error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
+ if (error) {
+ vdrop(vp);
+ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
+ goto loop;
+ }
+ /*
+ * Skip over vnodes marked VV_SYSTEM.
+ */
+ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
+ VOP_UNLOCK(vp, 0);
+ vdrop(vp);
+ continue;
+ }
+ /*
+ * If WRITECLOSE is set, flush out unlinked but still open
+ * files (even if open only for reading) and regular file
+ * vnodes open for writing.
+ */
+ if (flags & WRITECLOSE) {
+ if (vp->v_object != NULL) {
+ VM_OBJECT_WLOCK(vp->v_object);
+ vm_object_page_clean(vp->v_object, 0, 0, 0);
+ VM_OBJECT_WUNLOCK(vp->v_object);
+ }
+ error = VOP_FSYNC(vp, MNT_WAIT, td);
+ if (error != 0) {
+ VOP_UNLOCK(vp, 0);
+ vdrop(vp);
+ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
+ return (error);
+ }
+ error = VOP_GETATTR(vp, &vattr, td->td_ucred);
+ VI_LOCK(vp);
+
+ if ((vp->v_type == VNON ||
+ (error == 0 && vattr.va_nlink > 0)) &&
+ (vp->v_writecount <= 0 || vp->v_type != VREG)) {
+ VOP_UNLOCK(vp, 0);
+ vdropl(vp);
+ continue;
+ }
+ } else
+ VI_LOCK(vp);
+ /*
+ * With v_usecount == 0, all we need to do is clear out the
+ * vnode data structures and we are done.
+ *
+ * If FORCECLOSE is set, forcibly close the vnode.
+ */
+ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
+ vgonel(vp);
+ } else {
+ busy++;
+#ifdef DIAGNOSTIC
+ if (busyprt)
+ vn_printf(vp, "vflush: busy vnode ");
+#endif
+ }
+ VOP_UNLOCK(vp, 0);
+ vdropl(vp);
+ }
+ if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
+ /*
+ * If just the root vnode is busy, and if its refcount
+ * is equal to `rootrefs', then go ahead and kill it.
+ */
+ VI_LOCK(rootvp);
+ KASSERT(busy > 0, ("vflush: not busy"));
+ VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
+ ("vflush: usecount %d < rootrefs %d",
+ rootvp->v_usecount, rootrefs));
+ if (busy == 1 && rootvp->v_usecount == rootrefs) {
+ VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
+ vgone(rootvp);
+ VOP_UNLOCK(rootvp, 0);
+ busy = 0;
+ } else
+ VI_UNLOCK(rootvp);
+ }
+ if (busy) {
+ CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
+ busy);
+ return (EBUSY);
+ }
+ for (; rootrefs > 0; rootrefs--)
+ vrele(rootvp);
+ return (0);
+}
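+
+/*
+ * Illustrative sketch: an unmount path drives vflush() as described in the
+ * comment above.  example_unmount_flush() is a hypothetical helper; forced
+ * unmounts pass FORCECLOSE so active vnodes are detached instead of being
+ * reported as busy.
+ */
+#if 0
+static int
+example_unmount_flush(struct mount *mp, int mntflags, struct thread *td)
+{
+	int flags;
+
+	flags = (mntflags & MNT_FORCE) != 0 ? FORCECLOSE : 0;
+	/* rootrefs == 0: no extra references on the root vnode expected. */
+	return (vflush(mp, 0, flags, td));
+}
+#endif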
+
+/*
+ * Recycle an unused vnode to the front of the free list.
+ */
+int
+vrecycle(struct vnode *vp)
+{
+ int recycled;
+
+ VI_LOCK(vp);
+ recycled = vrecyclel(vp);
+ VI_UNLOCK(vp);
+ return (recycled);
+}
+
+/*
+ * vrecycle, with the vp interlock held.
+ */
+int
+vrecyclel(struct vnode *vp)
+{
+ int recycled;
+
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ ASSERT_VI_LOCKED(vp, __func__);
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ recycled = 0;
+ if (vp->v_usecount == 0) {
+ recycled = 1;
+ vgonel(vp);
+ }
+ return (recycled);
+}
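+
+/*
+ * Illustrative sketch: the typical consumer of vrecycle() is a filesystem
+ * inactive routine, which recycles the vnode once its file can no longer
+ * be reached.  example_inactive() and file_is_unlinked() are hypothetical.
+ */
+#if 0
+static int
+example_inactive(struct vop_inactive_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+
+	/* Recycle the vnode if the file backing it has been removed. */
+	if (file_is_unlinked(vp))
+		(void)vrecycle(vp);
+	return (0);
+}
+#endif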
+
+/*
+ * Eliminate all activity associated with a vnode
+ * in preparation for reuse.
+ */
+void
+vgone(struct vnode *vp)
+{
+ VI_LOCK(vp);
+ vgonel(vp);
+ VI_UNLOCK(vp);
+}
+
+static void
+notify_lowervp_vfs_dummy(struct mount *mp __unused,
+ struct vnode *lowervp __unused)
+{
+}
+
+/*
+ * Notify upper mounts about reclaimed or unlinked vnode.
+ */
+void
+vfs_notify_upper(struct vnode *vp, int event)
+{
+ static struct vfsops vgonel_vfsops = {
+ .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
+ .vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
+ };
+ struct mount *mp, *ump, *mmp;
+
+ mp = vp->v_mount;
+ if (mp == NULL)
+ return;
+
+ MNT_ILOCK(mp);
+ if (TAILQ_EMPTY(&mp->mnt_uppers))
+ goto unlock;
+ MNT_IUNLOCK(mp);
+ mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
+ mmp->mnt_op = &vgonel_vfsops;
+ mmp->mnt_kern_flag |= MNTK_MARKER;
+ MNT_ILOCK(mp);
+ mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
+ for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
+ if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
+ ump = TAILQ_NEXT(ump, mnt_upper_link);
+ continue;
+ }
+ TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
+ MNT_IUNLOCK(mp);
+ switch (event) {
+ case VFS_NOTIFY_UPPER_RECLAIM:
+ VFS_RECLAIM_LOWERVP(ump, vp);
+ break;
+ case VFS_NOTIFY_UPPER_UNLINK:
+ VFS_UNLINK_LOWERVP(ump, vp);
+ break;
+ default:
+ KASSERT(0, ("invalid event %d", event));
+ break;
+ }
+ MNT_ILOCK(mp);
+ ump = TAILQ_NEXT(mmp, mnt_upper_link);
+ TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
+ }
+ free(mmp, M_TEMP);
+ mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
+ if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
+ mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
+ wakeup(&mp->mnt_uppers);
+ }
+unlock:
+ MNT_IUNLOCK(mp);
+}
+
+/*
+ * vgone, with the vp interlock held.
+ */
+static void
+vgonel(struct vnode *vp)
+{
+ struct thread *td;
+ int oweinact;
+ int active;
+ struct mount *mp;
+
+ ASSERT_VOP_ELOCKED(vp, "vgonel");
+ ASSERT_VI_LOCKED(vp, "vgonel");
+ VNASSERT(vp->v_holdcnt, vp,
+ ("vgonel: vp %p has no reference.", vp));
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ td = curthread;
+
+ /*
+ * Don't vgonel if we're already doomed.
+ */
+ if (vp->v_iflag & VI_DOOMED)
+ return;
+ vp->v_iflag |= VI_DOOMED;
+
+ /*
+ * Check to see if the vnode is in use. If so, we have to call
+ * VOP_CLOSE() and VOP_INACTIVE().
+ */
+ active = vp->v_usecount;
+ oweinact = (vp->v_iflag & VI_OWEINACT);
+ VI_UNLOCK(vp);
+ vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
+
+ /*
+ * If purging an active vnode, it must be closed and
+ * deactivated before being reclaimed.
+ */
+ if (active)
+ VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
+ if (oweinact || active) {
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_DOINGINACT) == 0)
+ vinactive(vp, td);
+ VI_UNLOCK(vp);
+ }
+ if (vp->v_type == VSOCK)
+ vfs_unp_reclaim(vp);
+
+ /*
+ * Clean out any buffers associated with the vnode.
+ * If the flush fails, just toss the buffers.
+ */
+ mp = NULL;
+ if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
+ (void) vn_start_secondary_write(vp, &mp, V_WAIT);
+ if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
+ while (vinvalbuf(vp, 0, 0, 0) != 0)
+ ;
+ }
+
+ BO_LOCK(&vp->v_bufobj);
+ KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
+ vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
+ TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
+ vp->v_bufobj.bo_clean.bv_cnt == 0,
+ ("vp %p bufobj not invalidated", vp));
+
+ /*
+ * For VMIO bufobj, BO_DEAD is set in vm_object_terminate()
+ * after the object's page queue is flushed.
+ */
+ if (vp->v_bufobj.bo_object == NULL)
+ vp->v_bufobj.bo_flag |= BO_DEAD;
+ BO_UNLOCK(&vp->v_bufobj);
+
+ /*
+ * Reclaim the vnode.
+ */
+ if (VOP_RECLAIM(vp, td))
+ panic("vgone: cannot reclaim");
+ if (mp != NULL)
+ vn_finished_secondary_write(mp);
+ VNASSERT(vp->v_object == NULL, vp,
+ ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
+ /*
+ * Clear the advisory locks and wake up waiting threads.
+ */
+ (void)VOP_ADVLOCKPURGE(vp);
+ vp->v_lockf = NULL;
+ /*
+ * Delete from old mount point vnode list.
+ */
+ delmntque(vp);
+ cache_purge(vp);
+ /*
+ * Done with purge, reset to the standard lock and invalidate
+ * the vnode.
+ */
+ VI_LOCK(vp);
+ vp->v_vnlock = &vp->v_lock;
+ vp->v_op = &dead_vnodeops;
+ vp->v_tag = "none";
+ vp->v_type = VBAD;
+}
+
+/*
+ * Calculate the total number of references to a special device.
+ */
+int
+vcount(struct vnode *vp)
+{
+ int count;
+
+ dev_lock();
+ count = vp->v_rdev->si_usecount;
+ dev_unlock();
+ return (count);
+}
+
+/*
+ * Same as above, but using the struct cdev * as the argument.
+ */
+int
+count_dev(struct cdev *dev)
+{
+ int count;
+
+ dev_lock();
+ count = dev->si_usecount;
+ dev_unlock();
+ return(count);
+}
+
+/*
+ * Print out a description of a vnode.
+ */
+static char *typename[] =
+{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
+ "VMARKER"};
+
+void
+vn_printf(struct vnode *vp, const char *fmt, ...)
+{
+ va_list ap;
+ char buf[256], buf2[16];
+ u_long flags;
+
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ va_end(ap);
+ printf("%p: ", (void *)vp);
+ printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
+ printf(" usecount %d, writecount %d, refcount %d",
+ vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
+ switch (vp->v_type) {
+ case VDIR:
+ printf(" mountedhere %p\n", vp->v_mountedhere);
+ break;
+ case VCHR:
+ printf(" rdev %p\n", vp->v_rdev);
+ break;
+ case VSOCK:
+ printf(" socket %p\n", vp->v_unpcb);
+ break;
+ case VFIFO:
+ printf(" fifoinfo %p\n", vp->v_fifoinfo);
+ break;
+ default:
+ printf("\n");
+ break;
+ }
+ buf[0] = '\0';
+ buf[1] = '\0';
+ if (vp->v_vflag & VV_ROOT)
+ strlcat(buf, "|VV_ROOT", sizeof(buf));
+ if (vp->v_vflag & VV_ISTTY)
+ strlcat(buf, "|VV_ISTTY", sizeof(buf));
+ if (vp->v_vflag & VV_NOSYNC)
+ strlcat(buf, "|VV_NOSYNC", sizeof(buf));
+ if (vp->v_vflag & VV_ETERNALDEV)
+ strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
+ if (vp->v_vflag & VV_CACHEDLABEL)
+ strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
+ if (vp->v_vflag & VV_COPYONWRITE)
+ strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
+ if (vp->v_vflag & VV_SYSTEM)
+ strlcat(buf, "|VV_SYSTEM", sizeof(buf));
+ if (vp->v_vflag & VV_PROCDEP)
+ strlcat(buf, "|VV_PROCDEP", sizeof(buf));
+ if (vp->v_vflag & VV_NOKNOTE)
+ strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
+ if (vp->v_vflag & VV_DELETED)
+ strlcat(buf, "|VV_DELETED", sizeof(buf));
+ if (vp->v_vflag & VV_MD)
+ strlcat(buf, "|VV_MD", sizeof(buf));
+ if (vp->v_vflag & VV_FORCEINSMQ)
+ strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
+ flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
+ VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
+ VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
+ if (flags != 0) {
+ snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
+ strlcat(buf, buf2, sizeof(buf));
+ }
+ if (vp->v_iflag & VI_MOUNT)
+ strlcat(buf, "|VI_MOUNT", sizeof(buf));
+ if (vp->v_iflag & VI_DOOMED)
+ strlcat(buf, "|VI_DOOMED", sizeof(buf));
+ if (vp->v_iflag & VI_FREE)
+ strlcat(buf, "|VI_FREE", sizeof(buf));
+ if (vp->v_iflag & VI_ACTIVE)
+ strlcat(buf, "|VI_ACTIVE", sizeof(buf));
+ if (vp->v_iflag & VI_DOINGINACT)
+ strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
+ if (vp->v_iflag & VI_OWEINACT)
+ strlcat(buf, "|VI_OWEINACT", sizeof(buf));
+ if (vp->v_iflag & VI_TEXT_REF)
+ strlcat(buf, "|VI_TEXT_REF", sizeof(buf));
+ flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE |
+ VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT | VI_TEXT_REF);
+ if (flags != 0) {
+ snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
+ strlcat(buf, buf2, sizeof(buf));
+ }
+ printf(" flags (%s)\n", buf + 1);
+ if (mtx_owned(VI_MTX(vp)))
+ printf(" VI_LOCKed");
+ if (vp->v_object != NULL)
+ printf(" v_object %p ref %d pages %d "
+ "cleanbuf %d dirtybuf %d\n",
+ vp->v_object, vp->v_object->ref_count,
+ vp->v_object->resident_page_count,
+ vp->v_bufobj.bo_clean.bv_cnt,
+ vp->v_bufobj.bo_dirty.bv_cnt);
+ printf(" ");
+ lockmgr_printinfo(vp->v_vnlock);
+ if (vp->v_data != NULL)
+ VOP_PRINT(vp);
+}
+
+#ifdef DDB
+/*
+ * List all of the locked vnodes in the system.
+ * Called when debugging the kernel.
+ */
+DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
+{
+ struct mount *mp;
+ struct vnode *vp;
+
+ /*
+ * Note: because this is DDB, we can't obey the locking semantics
+ * for these structures, which means we could catch an inconsistent
+ * state and dereference a nasty pointer. Not much to be done
+ * about that.
+ */
+ db_printf("Locked vnodes\n");
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+ if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
+ vn_printf(vp, "vnode ");
+ }
+ }
+}
+
+/*
+ * Show details about the given vnode.
+ */
+DB_SHOW_COMMAND(vnode, db_show_vnode)
+{
+ struct vnode *vp;
+
+ if (!have_addr)
+ return;
+ vp = (struct vnode *)addr;
+ vn_printf(vp, "vnode ");
+}
+
+/*
+ * Show details about the given mount point.
+ */
+DB_SHOW_COMMAND(mount, db_show_mount)
+{
+ struct mount *mp;
+ struct vfsopt *opt;
+ struct statfs *sp;
+ struct vnode *vp;
+ char buf[512];
+ uint64_t mflags;
+ u_int flags;
+
+ if (!have_addr) {
+ /* No address given, print short info about all mount points. */
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ db_printf("%p %s on %s (%s)\n", mp,
+ mp->mnt_stat.f_mntfromname,
+ mp->mnt_stat.f_mntonname,
+ mp->mnt_stat.f_fstypename);
+ if (db_pager_quit)
+ break;
+ }
+ db_printf("\nMore info: show mount <addr>\n");
+ return;
+ }
+
+ mp = (struct mount *)addr;
+ db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
+ mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
+
+ buf[0] = '\0';
+ mflags = mp->mnt_flag;
+#define MNT_FLAG(flag) do { \
+ if (mflags & (flag)) { \
+ if (buf[0] != '\0') \
+ strlcat(buf, ", ", sizeof(buf)); \
+ strlcat(buf, (#flag) + 4, sizeof(buf)); \
+ mflags &= ~(flag); \
+ } \
+} while (0)
+ MNT_FLAG(MNT_RDONLY);
+ MNT_FLAG(MNT_SYNCHRONOUS);
+ MNT_FLAG(MNT_NOEXEC);
+ MNT_FLAG(MNT_NOSUID);
+ MNT_FLAG(MNT_NFS4ACLS);
+ MNT_FLAG(MNT_UNION);
+ MNT_FLAG(MNT_ASYNC);
+ MNT_FLAG(MNT_SUIDDIR);
+ MNT_FLAG(MNT_SOFTDEP);
+ MNT_FLAG(MNT_NOSYMFOLLOW);
+ MNT_FLAG(MNT_GJOURNAL);
+ MNT_FLAG(MNT_MULTILABEL);
+ MNT_FLAG(MNT_ACLS);
+ MNT_FLAG(MNT_NOATIME);
+ MNT_FLAG(MNT_NOCLUSTERR);
+ MNT_FLAG(MNT_NOCLUSTERW);
+ MNT_FLAG(MNT_SUJ);
+ MNT_FLAG(MNT_EXRDONLY);
+ MNT_FLAG(MNT_EXPORTED);
+ MNT_FLAG(MNT_DEFEXPORTED);
+ MNT_FLAG(MNT_EXPORTANON);
+ MNT_FLAG(MNT_EXKERB);
+ MNT_FLAG(MNT_EXPUBLIC);
+ MNT_FLAG(MNT_LOCAL);
+ MNT_FLAG(MNT_QUOTA);
+ MNT_FLAG(MNT_ROOTFS);
+ MNT_FLAG(MNT_USER);
+ MNT_FLAG(MNT_IGNORE);
+ MNT_FLAG(MNT_UPDATE);
+ MNT_FLAG(MNT_DELEXPORT);
+ MNT_FLAG(MNT_RELOAD);
+ MNT_FLAG(MNT_FORCE);
+ MNT_FLAG(MNT_SNAPSHOT);
+ MNT_FLAG(MNT_BYFSID);
+#undef MNT_FLAG
+ if (mflags != 0) {
+ if (buf[0] != '\0')
+ strlcat(buf, ", ", sizeof(buf));
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
+ "0x%016jx", mflags);
+ }
+ db_printf(" mnt_flag = %s\n", buf);
+
+ buf[0] = '\0';
+ flags = mp->mnt_kern_flag;
+#define MNT_KERN_FLAG(flag) do { \
+ if (flags & (flag)) { \
+ if (buf[0] != '\0') \
+ strlcat(buf, ", ", sizeof(buf)); \
+ strlcat(buf, (#flag) + 5, sizeof(buf)); \
+ flags &= ~(flag); \
+ } \
+} while (0)
+ MNT_KERN_FLAG(MNTK_UNMOUNTF);
+ MNT_KERN_FLAG(MNTK_ASYNC);
+ MNT_KERN_FLAG(MNTK_SOFTDEP);
+ MNT_KERN_FLAG(MNTK_DRAINING);
+ MNT_KERN_FLAG(MNTK_REFEXPIRE);
+ MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
+ MNT_KERN_FLAG(MNTK_SHARED_WRITES);
+ MNT_KERN_FLAG(MNTK_NO_IOPF);
+ MNT_KERN_FLAG(MNTK_VGONE_UPPER);
+ MNT_KERN_FLAG(MNTK_VGONE_WAITER);
+ MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
+ MNT_KERN_FLAG(MNTK_MARKER);
+ MNT_KERN_FLAG(MNTK_USES_BCACHE);
+ MNT_KERN_FLAG(MNTK_NOASYNC);
+ MNT_KERN_FLAG(MNTK_UNMOUNT);
+ MNT_KERN_FLAG(MNTK_MWAIT);
+ MNT_KERN_FLAG(MNTK_SUSPEND);
+ MNT_KERN_FLAG(MNTK_SUSPEND2);
+ MNT_KERN_FLAG(MNTK_SUSPENDED);
+ MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
+ MNT_KERN_FLAG(MNTK_NOKNOTE);
+#undef MNT_KERN_FLAG
+ if (flags != 0) {
+ if (buf[0] != '\0')
+ strlcat(buf, ", ", sizeof(buf));
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
+ "0x%08x", flags);
+ }
+ db_printf(" mnt_kern_flag = %s\n", buf);
+
+ db_printf(" mnt_opt = ");
+ opt = TAILQ_FIRST(mp->mnt_opt);
+ if (opt != NULL) {
+ db_printf("%s", opt->name);
+ opt = TAILQ_NEXT(opt, link);
+ while (opt != NULL) {
+ db_printf(", %s", opt->name);
+ opt = TAILQ_NEXT(opt, link);
+ }
+ }
+ db_printf("\n");
+
+ sp = &mp->mnt_stat;
+ db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx "
+ "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
+ "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
+ "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
+ (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
+ (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
+ (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
+ (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
+ (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
+ (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
+ (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
+ (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
+
+ db_printf(" mnt_cred = { uid=%u ruid=%u",
+ (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
+ if (jailed(mp->mnt_cred))
+ db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
+ db_printf(" }\n");
+ db_printf(" mnt_ref = %d\n", mp->mnt_ref);
+ db_printf(" mnt_gen = %d\n", mp->mnt_gen);
+ db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
+ db_printf(" mnt_activevnodelistsize = %d\n",
+ mp->mnt_activevnodelistsize);
+ db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount);
+ db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
+ db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
+ db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
+ db_printf(" mnt_lockref = %d\n", mp->mnt_lockref);
+ db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
+ db_printf(" mnt_secondary_accwrites = %d\n",
+ mp->mnt_secondary_accwrites);
+ db_printf(" mnt_gjprovider = %s\n",
+ mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
+
+ db_printf("\n\nList of active vnodes\n");
+ TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
+ if (vp->v_type != VMARKER) {
+ vn_printf(vp, "vnode ");
+ if (db_pager_quit)
+ break;
+ }
+ }
+ db_printf("\n\nList of inactive vnodes\n");
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+ if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
+ vn_printf(vp, "vnode ");
+ if (db_pager_quit)
+ break;
+ }
+ }
+}
+#endif /* DDB */
+
+/*
+ * Fill in a struct xvfsconf based on a struct vfsconf.
+ */
+static int
+vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
+{
+ struct xvfsconf xvfsp;
+
+ bzero(&xvfsp, sizeof(xvfsp));
+ strcpy(xvfsp.vfc_name, vfsp->vfc_name);
+ xvfsp.vfc_typenum = vfsp->vfc_typenum;
+ xvfsp.vfc_refcount = vfsp->vfc_refcount;
+ xvfsp.vfc_flags = vfsp->vfc_flags;
+ /*
+ * These are unused in userland, we keep them
+ * to not break binary compatibility.
+ */
+ xvfsp.vfc_vfsops = NULL;
+ xvfsp.vfc_next = NULL;
+ return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
+}
+
+#ifdef COMPAT_FREEBSD32
+struct xvfsconf32 {
+ uint32_t vfc_vfsops;
+ char vfc_name[MFSNAMELEN];
+ int32_t vfc_typenum;
+ int32_t vfc_refcount;
+ int32_t vfc_flags;
+ uint32_t vfc_next;
+};
+
+static int
+vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
+{
+ struct xvfsconf32 xvfsp;
+
+ bzero(&xvfsp, sizeof(xvfsp));
+ strcpy(xvfsp.vfc_name, vfsp->vfc_name);
+ xvfsp.vfc_typenum = vfsp->vfc_typenum;
+ xvfsp.vfc_refcount = vfsp->vfc_refcount;
+ xvfsp.vfc_flags = vfsp->vfc_flags;
+ return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
+}
+#endif
+
+/*
+ * Top level filesystem related information gathering.
+ */
+static int
+sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
+{
+ struct vfsconf *vfsp;
+ int error;
+
+ error = 0;
+ vfsconf_slock();
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
+#ifdef COMPAT_FREEBSD32
+ if (req->flags & SCTL_MASK32)
+ error = vfsconf2x32(req, vfsp);
+ else
+#endif
+ error = vfsconf2x(req, vfsp);
+ if (error)
+ break;
+ }
+ vfsconf_sunlock();
+ return (error);
+}
+
+SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
+ "S,xvfsconf", "List of all configured filesystems");
+
+#ifndef BURN_BRIDGES
+static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
+
+static int
+vfs_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1 - 1; /* XXX */
+ u_int namelen = arg2 + 1; /* XXX */
+ struct vfsconf *vfsp;
+
+ log(LOG_WARNING, "userland calling deprecated sysctl, "
+ "please rebuild world\n");
+
+#if 1 || defined(COMPAT_PRELITE2)
+ /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
+ if (namelen == 1)
+ return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
+#endif
+
+ switch (name[1]) {
+ case VFS_MAXTYPENUM:
+ if (namelen != 2)
+ return (ENOTDIR);
+ return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
+ case VFS_CONF:
+ if (namelen != 3)
+ return (ENOTDIR); /* overloaded */
+ vfsconf_slock();
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
+ if (vfsp->vfc_typenum == name[2])
+ break;
+ }
+ vfsconf_sunlock();
+ if (vfsp == NULL)
+ return (EOPNOTSUPP);
+#ifdef COMPAT_FREEBSD32
+ if (req->flags & SCTL_MASK32)
+ return (vfsconf2x32(req, vfsp));
+ else
+#endif
+ return (vfsconf2x(req, vfsp));
+ }
+ return (EOPNOTSUPP);
+}
+
+static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
+ CTLFLAG_MPSAFE, vfs_sysctl,
+ "Generic filesystem");
+
+#if 1 || defined(COMPAT_PRELITE2)
+
+static int
+sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct vfsconf *vfsp;
+ struct ovfsconf ovfs;
+
+ vfsconf_slock();
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
+ bzero(&ovfs, sizeof(ovfs));
+ ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
+ strcpy(ovfs.vfc_name, vfsp->vfc_name);
+ ovfs.vfc_index = vfsp->vfc_typenum;
+ ovfs.vfc_refcount = vfsp->vfc_refcount;
+ ovfs.vfc_flags = vfsp->vfc_flags;
+ error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
+ if (error != 0) {
+ vfsconf_sunlock();
+ return (error);
+ }
+ }
+ vfsconf_sunlock();
+ return (0);
+}
+
+#endif /* 1 || COMPAT_PRELITE2 */
+#endif /* !BURN_BRIDGES */
+
+#define KINFO_VNODESLOP 10
+#ifdef notyet
+/*
+ * Dump vnode list (via sysctl).
+ */
+/* ARGSUSED */
+static int
+sysctl_vnode(SYSCTL_HANDLER_ARGS)
+{
+ struct xvnode *xvn;
+ struct mount *mp;
+ struct vnode *vp;
+ int error, len, n;
+
+ /*
+ * Stale numvnodes access is not fatal here.
+ */
+ req->lock = 0;
+ len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
+ if (!req->oldptr)
+ /* Make an estimate */
+ return (SYSCTL_OUT(req, 0, len));
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
+ n = 0;
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
+ continue;
+ MNT_ILOCK(mp);
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+ if (n == len)
+ break;
+ vref(vp);
+ xvn[n].xv_size = sizeof *xvn;
+ xvn[n].xv_vnode = vp;
+ xvn[n].xv_id = 0; /* XXX compat */
+#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
+ XV_COPY(usecount);
+ XV_COPY(writecount);
+ XV_COPY(holdcnt);
+ XV_COPY(mount);
+ XV_COPY(numoutput);
+ XV_COPY(type);
+#undef XV_COPY
+ xvn[n].xv_flag = vp->v_vflag;
+
+ switch (vp->v_type) {
+ case VREG:
+ case VDIR:
+ case VLNK:
+ break;
+ case VBLK:
+ case VCHR:
+ if (vp->v_rdev == NULL) {
+ vrele(vp);
+ continue;
+ }
+ xvn[n].xv_dev = dev2udev(vp->v_rdev);
+ break;
+ case VSOCK:
+ xvn[n].xv_socket = vp->v_socket;
+ break;
+ case VFIFO:
+ xvn[n].xv_fifo = vp->v_fifoinfo;
+ break;
+ case VNON:
+ case VBAD:
+ default:
+ /* shouldn't happen? */
+ vrele(vp);
+ continue;
+ }
+ vrele(vp);
+ ++n;
+ }
+ MNT_IUNLOCK(mp);
+ mtx_lock(&mountlist_mtx);
+ vfs_unbusy(mp);
+ if (n == len)
+ break;
+ }
+ mtx_unlock(&mountlist_mtx);
+
+ error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
+ free(xvn, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
+ "");
+#endif
+
+static void
+unmount_or_warn(struct mount *mp)
+{
+ int error;
+
+ error = dounmount(mp, MNT_FORCE, curthread);
+ if (error != 0) {
+ printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
+ if (error == EBUSY)
+ printf("BUSY)\n");
+ else
+ printf("%d)\n", error);
+ }
+}
+
+/*
+ * Unmount all filesystems. The list is traversed in reverse order
+ * of mounting to avoid dependencies.
+ */
+void
+vfs_unmountall(void)
+{
+ struct mount *mp, *tmp;
+
+ CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
+
+ /*
+ * Since this only runs when rebooting, it is not interlocked.
+ */
+ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
+ vfs_ref(mp);
+
+ /*
+ * Forcibly unmounting "/dev" before "/" would prevent clean
+ * unmount of the latter.
+ */
+ if (mp == rootdevmp)
+ continue;
+
+ unmount_or_warn(mp);
+ }
+
+ if (rootdevmp != NULL)
+ unmount_or_warn(rootdevmp);
+}
+
+/*
+ * Perform msync on all vnodes under a mount point.
+ * The mount point must be locked.
+ */
+void
+vfs_msync(struct mount *mp, int flags)
+{
+ struct vnode *vp, *mvp;
+ struct vm_object *obj;
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+
+ vnlru_return_batch(mp);
+
+ MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
+ obj = vp->v_object;
+ if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
+ (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
+ if (!vget(vp,
+ LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
+ curthread)) {
+ if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
+ vput(vp);
+ continue;
+ }
+
+ obj = vp->v_object;
+ if (obj != NULL) {
+ VM_OBJECT_WLOCK(obj);
+ vm_object_page_clean(obj, 0, 0,
+ flags == MNT_WAIT ?
+ OBJPC_SYNC : OBJPC_NOSYNC);
+ VM_OBJECT_WUNLOCK(obj);
+ }
+ vput(vp);
+ }
+ } else
+ VI_UNLOCK(vp);
+ }
+}
+
+static void
+destroy_vpollinfo_free(struct vpollinfo *vi)
+{
+
+ knlist_destroy(&vi->vpi_selinfo.si_note);
+ mtx_destroy(&vi->vpi_lock);
+ uma_zfree(vnodepoll_zone, vi);
+}
+
+static void
+destroy_vpollinfo(struct vpollinfo *vi)
+{
+
+ knlist_clear(&vi->vpi_selinfo.si_note, 1);
+ seldrain(&vi->vpi_selinfo);
+ destroy_vpollinfo_free(vi);
+}
+
+/*
+ * Initialize per-vnode helper structure to hold poll-related state.
+ */
+void
+v_addpollinfo(struct vnode *vp)
+{
+ struct vpollinfo *vi;
+
+ if (vp->v_pollinfo != NULL)
+ return;
+ vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
+ mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
+ knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
+ vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
+ VI_LOCK(vp);
+ if (vp->v_pollinfo != NULL) {
+ VI_UNLOCK(vp);
+ destroy_vpollinfo_free(vi);
+ return;
+ }
+ vp->v_pollinfo = vi;
+ VI_UNLOCK(vp);
+}
+
+/*
+ * Record a process's interest in events which might happen to
+ * a vnode. Because poll uses the historic select-style interface
+ * internally, this routine serves as both the ``check for any
+ * pending events'' and the ``record my interest in future events''
+ * functions. (These are done together, while the lock is held,
+ * to avoid race conditions.)
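+ *
+ * Concretely: if any of the requested events are already pending in
+ * vpi_revents, they are returned (and cleared) immediately; otherwise
+ * they are recorded in vpi_events and the thread is registered with
+ * selrecord() so it can be woken when an event arrives.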
+ */
+int
+vn_pollrecord(struct vnode *vp, struct thread *td, int events)
+{
+
+ v_addpollinfo(vp);
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ if (vp->v_pollinfo->vpi_revents & events) {
+ /*
+ * This leaves events we are not interested
+		 * in available for the other process, which
+		 * presumably had requested them
+ * (otherwise they would never have been
+ * recorded).
+ */
+ events &= vp->v_pollinfo->vpi_revents;
+ vp->v_pollinfo->vpi_revents &= ~events;
+
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ return (events);
+ }
+ vp->v_pollinfo->vpi_events |= events;
+ selrecord(td, &vp->v_pollinfo->vpi_selinfo);
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ return (0);
+}
+
+/*
+ * Routine to create and manage a filesystem syncer vnode.
+ */
+#define sync_close ((int (*)(struct vop_close_args *))nullop)
+static int sync_fsync(struct vop_fsync_args *);
+static int sync_inactive(struct vop_inactive_args *);
+static int sync_reclaim(struct vop_reclaim_args *);
+
+static struct vop_vector sync_vnodeops = {
+ .vop_bypass = VOP_EOPNOTSUPP,
+ .vop_close = sync_close, /* close */
+ .vop_fsync = sync_fsync, /* fsync */
+ .vop_inactive = sync_inactive, /* inactive */
+ .vop_reclaim = sync_reclaim, /* reclaim */
+ .vop_lock1 = vop_stdlock, /* lock */
+ .vop_unlock = vop_stdunlock, /* unlock */
+ .vop_islocked = vop_stdislocked, /* islocked */
+};
+
+/*
+ * Create a new filesystem syncer vnode for the specified mount point.
+ */
+void
+vfs_allocate_syncvnode(struct mount *mp)
+{
+ struct vnode *vp;
+ struct bufobj *bo;
+ static long start, incr, next;
+ int error;
+
+ /* Allocate a new vnode */
+ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
+ if (error != 0)
+ panic("vfs_allocate_syncvnode: getnewvnode() failed");
+ vp->v_type = VNON;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ vp->v_vflag |= VV_FORCEINSMQ;
+ error = insmntque(vp, mp);
+ if (error != 0)
+ panic("vfs_allocate_syncvnode: insmntque() failed");
+ vp->v_vflag &= ~VV_FORCEINSMQ;
+ VOP_UNLOCK(vp, 0);
+ /*
+ * Place the vnode onto the syncer worklist. We attempt to
+ * scatter them about on the list so that they will go off
+ * at evenly distributed times even if all the filesystems
+ * are mounted at once.
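+	 *
+	 * For example, with a syncer_maxdelay of 32 the successive values
+	 * of "next" are 16, 8, 24, 4, 12, 20, 28, 2, ..., so the syncer
+	 * vnodes end up spread roughly evenly across the delay range.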
+ */
+ next += incr;
+ if (next == 0 || next > syncer_maxdelay) {
+ start /= 2;
+ incr /= 2;
+ if (start == 0) {
+ start = syncer_maxdelay / 2;
+ incr = syncer_maxdelay;
+ }
+ next = start;
+ }
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+ vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
+ /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
+ mtx_lock(&sync_mtx);
+ sync_vnode_count++;
+ if (mp->mnt_syncer == NULL) {
+ mp->mnt_syncer = vp;
+ vp = NULL;
+ }
+ mtx_unlock(&sync_mtx);
+ BO_UNLOCK(bo);
+ if (vp != NULL) {
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ vgone(vp);
+ vput(vp);
+ }
+}
+
+void
+vfs_deallocate_syncvnode(struct mount *mp)
+{
+ struct vnode *vp;
+
+ mtx_lock(&sync_mtx);
+ vp = mp->mnt_syncer;
+ if (vp != NULL)
+ mp->mnt_syncer = NULL;
+ mtx_unlock(&sync_mtx);
+ if (vp != NULL)
+ vrele(vp);
+}
+
+/*
+ * Do a lazy sync of the filesystem.
+ */
+static int
+sync_fsync(struct vop_fsync_args *ap)
+{
+ struct vnode *syncvp = ap->a_vp;
+ struct mount *mp = syncvp->v_mount;
+ int error, save;
+ struct bufobj *bo;
+
+ /*
+ * We only need to do something if this is a lazy evaluation.
+ */
+ if (ap->a_waitfor != MNT_LAZY)
+ return (0);
+
+ /*
+ * Move ourselves to the back of the sync list.
+ */
+ bo = &syncvp->v_bufobj;
+ BO_LOCK(bo);
+ vn_syncer_add_to_worklist(bo, syncdelay);
+ BO_UNLOCK(bo);
+
+ /*
+ * Walk the list of vnodes pushing all that are dirty and
+ * not already on the sync list.
+ */
+ if (vfs_busy(mp, MBF_NOWAIT) != 0)
+ return (0);
+ if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
+ vfs_unbusy(mp);
+ return (0);
+ }
+ save = curthread_pflags_set(TDP_SYNCIO);
+ vfs_msync(mp, MNT_NOWAIT);
+ error = VFS_SYNC(mp, MNT_LAZY);
+ curthread_pflags_restore(save);
+ vn_finished_write(mp);
+ vfs_unbusy(mp);
+ return (error);
+}
+
+/*
+ * The syncer vnode is no longer referenced.
+ */
+static int
+sync_inactive(struct vop_inactive_args *ap)
+{
+
+ vgone(ap->a_vp);
+ return (0);
+}
+
+/*
+ * The syncer vnode is no longer needed and is being decommissioned.
+ *
+ * Modifications to the worklist must be protected by sync_mtx.
+ */
+static int
+sync_reclaim(struct vop_reclaim_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct bufobj *bo;
+
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+ mtx_lock(&sync_mtx);
+ if (vp->v_mount->mnt_syncer == vp)
+ vp->v_mount->mnt_syncer = NULL;
+ if (bo->bo_flag & BO_ONWORKLST) {
+ LIST_REMOVE(bo, bo_synclist);
+ syncer_worklist_len--;
+ sync_vnode_count--;
+ bo->bo_flag &= ~BO_ONWORKLST;
+ }
+ mtx_unlock(&sync_mtx);
+ BO_UNLOCK(bo);
+
+ return (0);
+}
+
+/*
+ * Check if vnode represents a disk device
+ */
+int
+vn_isdisk(struct vnode *vp, int *errp)
+{
+ int error;
+
+ if (vp->v_type != VCHR) {
+ error = ENOTBLK;
+ goto out;
+ }
+ error = 0;
+ dev_lock();
+ if (vp->v_rdev == NULL)
+ error = ENXIO;
+ else if (vp->v_rdev->si_devsw == NULL)
+ error = ENXIO;
+ else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
+ error = ENOTBLK;
+ dev_unlock();
+out:
+ if (errp != NULL)
+ *errp = error;
+ return (error == 0);
+}
+
+/*
+ * Common filesystem object access control check routine. Accepts a
+ * vnode's type, "mode", uid and gid, requested access mode, credentials,
+ * and optional call-by-reference privused argument allowing vaccess()
+ * to indicate to the caller whether privilege was used to satisfy the
+ * request (obsoleted). Returns 0 on success, or an errno on failure.
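+ *
+ * For example, a VREAD | VWRITE request made by the file's owner on a
+ * 0644 file is satisfied by the DAC checks alone: S_IRUSR grants VREAD
+ * and S_IWUSR grants VWRITE | VAPPEND, so no privilege is consulted.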
+ */
+int
+vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
+ accmode_t accmode, struct ucred *cred, int *privused)
+{
+ accmode_t dac_granted;
+ accmode_t priv_granted;
+
+ KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
+ ("invalid bit in accmode"));
+ KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
+ ("VAPPEND without VWRITE"));
+
+ /*
+ * Look for a normal, non-privileged way to access the file/directory
+ * as requested. If it exists, go with that.
+ */
+
+ if (privused != NULL)
+ *privused = 0;
+
+ dac_granted = 0;
+
+ /* Check the owner. */
+ if (cred->cr_uid == file_uid) {
+ dac_granted |= VADMIN;
+ if (file_mode & S_IXUSR)
+ dac_granted |= VEXEC;
+ if (file_mode & S_IRUSR)
+ dac_granted |= VREAD;
+ if (file_mode & S_IWUSR)
+ dac_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+ goto privcheck;
+ }
+
+ /* Otherwise, check the groups (first match) */
+ if (groupmember(file_gid, cred)) {
+ if (file_mode & S_IXGRP)
+ dac_granted |= VEXEC;
+ if (file_mode & S_IRGRP)
+ dac_granted |= VREAD;
+ if (file_mode & S_IWGRP)
+ dac_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+ goto privcheck;
+ }
+
+ /* Otherwise, check everyone else. */
+ if (file_mode & S_IXOTH)
+ dac_granted |= VEXEC;
+ if (file_mode & S_IROTH)
+ dac_granted |= VREAD;
+ if (file_mode & S_IWOTH)
+ dac_granted |= (VWRITE | VAPPEND);
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+privcheck:
+ /*
+ * Build a privilege mask to determine if the set of privileges
+ * satisfies the requirements when combined with the granted mask
+ * from above. For each privilege, if the privilege is required,
+ * bitwise or the request type onto the priv_granted mask.
+ */
+ priv_granted = 0;
+
+ if (type == VDIR) {
+ /*
+ * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
+ * requests, instead of PRIV_VFS_EXEC.
+ */
+ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
+ !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
+ priv_granted |= VEXEC;
+ } else {
+ /*
+ * Ensure that at least one execute bit is on. Otherwise,
+ * a privileged user will always succeed, and we don't want
+ * this to happen unless the file really is executable.
+ */
+ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
+ (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
+ !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
+ priv_granted |= VEXEC;
+ }
+
+ if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
+ !priv_check_cred(cred, PRIV_VFS_READ, 0))
+ priv_granted |= VREAD;
+
+ if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
+ !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
+ priv_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
+ !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
+ priv_granted |= VADMIN;
+
+ if ((accmode & (priv_granted | dac_granted)) == accmode) {
+ /* XXX audit: privilege used */
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+
+ return ((accmode & VADMIN) ? EPERM : EACCES);
+}
+
+/*
+ * Credential check based on process requesting service, and per-attribute
+ * permissions.
+ */
+int
+extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
+ struct thread *td, accmode_t accmode)
+{
+
+ /*
+	 * Kernel-invoked requests always succeed.
+ */
+ if (cred == NOCRED)
+ return (0);
+
+ /*
+ * Do not allow privileged processes in jail to directly manipulate
+ * system attributes.
+ */
+ switch (attrnamespace) {
+ case EXTATTR_NAMESPACE_SYSTEM:
+ /* Potentially should be: return (EPERM); */
+ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
+ case EXTATTR_NAMESPACE_USER:
+ return (VOP_ACCESS(vp, accmode, cred, td));
+ default:
+ return (EPERM);
+ }
+}
+
+#ifdef DEBUG_VFS_LOCKS
+/*
+ * This only exists to suppress warnings from unlocked specfs accesses. It is
+ * no longer ok to have an unlocked VFS.
+ */
+#define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \
+ (vp)->v_type == VCHR || (vp)->v_type == VBAD)
+
+int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
+ "Drop into debugger on lock violation");
+
+int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
+ 0, "Check for interlock across VOPs");
+
+int vfs_badlock_print = 1; /* Print lock violations. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
+ 0, "Print lock violations");
+
+int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
+ 0, "Print vnode details on lock violations");
+
+#ifdef KDB
+int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
+ &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
+#endif
+
+static void
+vfs_badlock(const char *msg, const char *str, struct vnode *vp)
+{
+
+#ifdef KDB
+ if (vfs_badlock_backtrace)
+ kdb_backtrace();
+#endif
+ if (vfs_badlock_vnode)
+ vn_printf(vp, "vnode ");
+ if (vfs_badlock_print)
+ printf("%s: %p %s\n", str, (void *)vp, msg);
+ if (vfs_badlock_ddb)
+ kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
+}
+
+void
+assert_vi_locked(struct vnode *vp, const char *str)
+{
+
+ if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
+ vfs_badlock("interlock is not locked but should be", str, vp);
+}
+
+void
+assert_vi_unlocked(struct vnode *vp, const char *str)
+{
+
+ if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
+ vfs_badlock("interlock is locked but should not be", str, vp);
+}
+
+void
+assert_vop_locked(struct vnode *vp, const char *str)
+{
+ int locked;
+
+ if (!IGNORE_LOCK(vp)) {
+ locked = VOP_ISLOCKED(vp);
+ if (locked == 0 || locked == LK_EXCLOTHER)
+ vfs_badlock("is not locked but should be", str, vp);
+ }
+}
+
+void
+assert_vop_unlocked(struct vnode *vp, const char *str)
+{
+
+ if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
+ vfs_badlock("is locked but should not be", str, vp);
+}
+
+void
+assert_vop_elocked(struct vnode *vp, const char *str)
+{
+
+ if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
+ vfs_badlock("is not exclusive locked but should be", str, vp);
+}
+#endif /* DEBUG_VFS_LOCKS */
+
+void
+vop_rename_fail(struct vop_rename_args *ap)
+{
+
+ if (ap->a_tvp != NULL)
+ vput(ap->a_tvp);
+ if (ap->a_tdvp == ap->a_tvp)
+ vrele(ap->a_tdvp);
+ else
+ vput(ap->a_tdvp);
+ vrele(ap->a_fdvp);
+ vrele(ap->a_fvp);
+}
+
+void
+vop_rename_pre(void *ap)
+{
+ struct vop_rename_args *a = ap;
+
+#ifdef DEBUG_VFS_LOCKS
+ if (a->a_tvp)
+ ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
+ ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
+ ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
+ ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
+
+ /* Check the source (from). */
+ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
+ (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
+ ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
+ if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
+ ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
+
+ /* Check the target. */
+ if (a->a_tvp)
+ ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
+ ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
+#endif
+ if (a->a_tdvp != a->a_fdvp)
+ vhold(a->a_fdvp);
+ if (a->a_tvp != a->a_fvp)
+ vhold(a->a_fvp);
+ vhold(a->a_tdvp);
+ if (a->a_tvp)
+ vhold(a->a_tvp);
+}
+
+#ifdef DEBUG_VFS_LOCKS
+void
+vop_strategy_pre(void *ap)
+{
+ struct vop_strategy_args *a;
+ struct buf *bp;
+
+ a = ap;
+ bp = a->a_bp;
+
+ /*
+ * Cluster ops lock their component buffers but not the IO container.
+ */
+ if ((bp->b_flags & B_CLUSTER) != 0)
+ return;
+
+ if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
+ if (vfs_badlock_print)
+ printf(
+ "VOP_STRATEGY: bp is not locked but should be\n");
+ if (vfs_badlock_ddb)
+ kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
+ }
+}
+
+void
+vop_lock_pre(void *ap)
+{
+ struct vop_lock1_args *a = ap;
+
+ if ((a->a_flags & LK_INTERLOCK) == 0)
+ ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
+ else
+ ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
+}
+
+void
+vop_lock_post(void *ap, int rc)
+{
+ struct vop_lock1_args *a = ap;
+
+ ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
+ if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
+ ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
+}
+
+void
+vop_unlock_pre(void *ap)
+{
+ struct vop_unlock_args *a = ap;
+
+ if (a->a_flags & LK_INTERLOCK)
+ ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
+ ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
+}
+
+void
+vop_unlock_post(void *ap, int rc)
+{
+ struct vop_unlock_args *a = ap;
+
+ if (a->a_flags & LK_INTERLOCK)
+ ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
+}
+#endif
+
+void
+vop_create_post(void *ap, int rc)
+{
+ struct vop_create_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+}
+
+void
+vop_deleteextattr_post(void *ap, int rc)
+{
+ struct vop_deleteextattr_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+}
+
+void
+vop_link_post(void *ap, int rc)
+{
+ struct vop_link_args *a = ap;
+
+ if (!rc) {
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
+ VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
+ }
+}
+
+void
+vop_mkdir_post(void *ap, int rc)
+{
+ struct vop_mkdir_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
+}
+
+void
+vop_mknod_post(void *ap, int rc)
+{
+ struct vop_mknod_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+}
+
+void
+vop_reclaim_post(void *ap, int rc)
+{
+ struct vop_reclaim_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE);
+}
+
+void
+vop_remove_post(void *ap, int rc)
+{
+ struct vop_remove_args *a = ap;
+
+ if (!rc) {
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
+ }
+}
+
+void
+vop_rename_post(void *ap, int rc)
+{
+ struct vop_rename_args *a = ap;
+ long hint;
+
+ if (!rc) {
+ hint = NOTE_WRITE;
+ if (a->a_fdvp == a->a_tdvp) {
+ if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
+ hint |= NOTE_LINK;
+ VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
+ VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
+ } else {
+ hint |= NOTE_EXTEND;
+ if (a->a_fvp->v_type == VDIR)
+ hint |= NOTE_LINK;
+ VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
+
+ if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
+ a->a_tvp->v_type == VDIR)
+ hint &= ~NOTE_LINK;
+ VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
+ }
+
+ VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
+ if (a->a_tvp)
+ VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
+ }
+ if (a->a_tdvp != a->a_fdvp)
+ vdrop(a->a_fdvp);
+ if (a->a_tvp != a->a_fvp)
+ vdrop(a->a_fvp);
+ vdrop(a->a_tdvp);
+ if (a->a_tvp)
+ vdrop(a->a_tvp);
+}
+
+void
+vop_rmdir_post(void *ap, int rc)
+{
+ struct vop_rmdir_args *a = ap;
+
+ if (!rc) {
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
+ }
+}
+
+void
+vop_setattr_post(void *ap, int rc)
+{
+ struct vop_setattr_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+}
+
+void
+vop_setextattr_post(void *ap, int rc)
+{
+ struct vop_setextattr_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+}
+
+void
+vop_symlink_post(void *ap, int rc)
+{
+ struct vop_symlink_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+}
+
+void
+vop_open_post(void *ap, int rc)
+{
+ struct vop_open_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
+}
+
+void
+vop_close_post(void *ap, int rc)
+{
+ struct vop_close_args *a = ap;
+
+ if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
+ (a->a_vp->v_iflag & VI_DOOMED) == 0)) {
+ VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
+ NOTE_CLOSE_WRITE : NOTE_CLOSE);
+ }
+}
+
+void
+vop_read_post(void *ap, int rc)
+{
+ struct vop_read_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
+}
+
+void
+vop_readdir_post(void *ap, int rc)
+{
+ struct vop_readdir_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
+}
+
+static struct knlist fs_knlist;
+
+static void
+vfs_event_init(void *arg)
+{
+ knlist_init_mtx(&fs_knlist, NULL);
+}
+/* XXX - correct order? */
+SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
+
+void
+vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
+{
+
+ KNOTE_UNLOCKED(&fs_knlist, event);
+}
+
+static int filt_fsattach(struct knote *kn);
+static void filt_fsdetach(struct knote *kn);
+static int filt_fsevent(struct knote *kn, long hint);
+
+struct filterops fs_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_fsattach,
+ .f_detach = filt_fsdetach,
+ .f_event = filt_fsevent
+};
+
+static int
+filt_fsattach(struct knote *kn)
+{
+
+ kn->kn_flags |= EV_CLEAR;
+ knlist_add(&fs_knlist, kn, 0);
+ return (0);
+}
+
+static void
+filt_fsdetach(struct knote *kn)
+{
+
+ knlist_remove(&fs_knlist, kn, 0);
+}
+
+static int
+filt_fsevent(struct knote *kn, long hint)
+{
+
+ kn->kn_fflags |= hint;
+ return (kn->kn_fflags != 0);
+}
+
+static int
+sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
+{
+ struct vfsidctl vc;
+ int error;
+ struct mount *mp;
+
+ error = SYSCTL_IN(req, &vc, sizeof(vc));
+ if (error)
+ return (error);
+ if (vc.vc_vers != VFS_CTL_VERS1)
+ return (EINVAL);
+ mp = vfs_getvfs(&vc.vc_fsid);
+ if (mp == NULL)
+ return (ENOENT);
+ /* ensure that a specific sysctl goes to the right filesystem. */
+ if (strcmp(vc.vc_fstypename, "*") != 0 &&
+ strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
+ vfs_rel(mp);
+ return (EINVAL);
+ }
+ VCTLTOREQ(&vc, req);
+ error = VFS_SYSCTL(mp, vc.vc_op, req);
+ vfs_rel(mp);
+ return (error);
+}
+
+SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
+ NULL, 0, sysctl_vfs_ctl, "",
+ "Sysctl by fsid");
+
+/*
+ * Function to initialize a va_filerev field sensibly.
+ * XXX: Wouldn't a random number make a lot more sense ??
+ */
+u_quad_t
+init_va_filerev(void)
+{
+ struct bintime bt;
+
+ getbinuptime(&bt);
+ return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
+}
+
+static int filt_vfsread(struct knote *kn, long hint);
+static int filt_vfswrite(struct knote *kn, long hint);
+static int filt_vfsvnode(struct knote *kn, long hint);
+static void filt_vfsdetach(struct knote *kn);
+static struct filterops vfsread_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_vfsdetach,
+ .f_event = filt_vfsread
+};
+static struct filterops vfswrite_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_vfsdetach,
+ .f_event = filt_vfswrite
+};
+static struct filterops vfsvnode_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_vfsdetach,
+ .f_event = filt_vfsvnode
+};
+
+static void
+vfs_knllock(void *arg)
+{
+ struct vnode *vp = arg;
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+}
+
+static void
+vfs_knlunlock(void *arg)
+{
+ struct vnode *vp = arg;
+
+ VOP_UNLOCK(vp, 0);
+}
+
+static void
+vfs_knl_assert_locked(void *arg)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vnode *vp = arg;
+
+ ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
+#endif
+}
+
+static void
+vfs_knl_assert_unlocked(void *arg)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vnode *vp = arg;
+
+ ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
+#endif
+}
+
+int
+vfs_kqfilter(struct vop_kqfilter_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct knote *kn = ap->a_kn;
+ struct knlist *knl;
+
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ kn->kn_fop = &vfsread_filtops;
+ break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &vfswrite_filtops;
+ break;
+ case EVFILT_VNODE:
+ kn->kn_fop = &vfsvnode_filtops;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ kn->kn_hook = (caddr_t)vp;
+
+ v_addpollinfo(vp);
+ if (vp->v_pollinfo == NULL)
+ return (ENOMEM);
+ knl = &vp->v_pollinfo->vpi_selinfo.si_note;
+ vhold(vp);
+ knlist_add(knl, kn, 0);
+
+ return (0);
+}
+
+/*
+ * Detach knote from vnode
+ */
+static void
+filt_vfsdetach(struct knote *kn)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+
+ KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
+ knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
+ vdrop(vp);
+}
+
+/*ARGSUSED*/
+static int
+filt_vfsread(struct knote *kn, long hint)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+ struct vattr va;
+ int res;
+
+ /*
+ * filesystem is gone, so set the EOF flag and schedule
+ * the knote for deletion.
+ */
+ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
+ VI_LOCK(vp);
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ VI_UNLOCK(vp);
+ return (1);
+ }
+
+ if (VOP_GETATTR(vp, &va, curthread->td_ucred))
+ return (0);
+
+ VI_LOCK(vp);
+ kn->kn_data = va.va_size - kn->kn_fp->f_offset;
+ res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
+ VI_UNLOCK(vp);
+ return (res);
+}
+
+/*ARGSUSED*/
+static int
+filt_vfswrite(struct knote *kn, long hint)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+
+ VI_LOCK(vp);
+
+ /*
+ * filesystem is gone, so set the EOF flag and schedule
+ * the knote for deletion.
+ */
+ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+
+ kn->kn_data = 0;
+ VI_UNLOCK(vp);
+ return (1);
+}
+
+static int
+filt_vfsvnode(struct knote *kn, long hint)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+ int res;
+
+ VI_LOCK(vp);
+ if (kn->kn_sfflags & hint)
+ kn->kn_fflags |= hint;
+ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
+ kn->kn_flags |= EV_EOF;
+ VI_UNLOCK(vp);
+ return (1);
+ }
+ res = (kn->kn_fflags != 0);
+ VI_UNLOCK(vp);
+ return (res);
+}
+
+int
+vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
+{
+ int error;
+
+ if (dp->d_reclen > ap->a_uio->uio_resid)
+ return (ENAMETOOLONG);
+ error = uiomove(dp, dp->d_reclen, ap->a_uio);
+ if (error) {
+ if (ap->a_ncookies != NULL) {
+ if (ap->a_cookies != NULL)
+ free(ap->a_cookies, M_TEMP);
+ ap->a_cookies = NULL;
+ *ap->a_ncookies = 0;
+ }
+ return (error);
+ }
+ if (ap->a_ncookies == NULL)
+ return (0);
+
+ KASSERT(ap->a_cookies,
+ ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
+
+ *ap->a_cookies = realloc(*ap->a_cookies,
+ (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
+ (*ap->a_cookies)[*ap->a_ncookies] = off;
+ *ap->a_ncookies += 1;
+ return (0);
+}
+
+/*
+ * Mark for update the access time of the file if the filesystem
+ * supports VOP_MARKATIME. This functionality is used by execve and
+ * mmap, so we want to avoid the I/O implied by directly setting
+ * va_atime for the sake of efficiency.
+ */
+void
+vfs_mark_atime(struct vnode *vp, struct ucred *cred)
+{
+ struct mount *mp;
+
+ mp = vp->v_mount;
+ ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
+ if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
+ (void)VOP_MARKATIME(vp);
+}
+
+/*
+ * The purpose of this routine is to remove granularity from accmode_t,
+ * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
+ * VADMIN and VAPPEND.
+ *
+ * If it returns 0, the caller is supposed to continue with the usual
+ * access checks using 'accmode' as modified by this routine. If it
+ * returns a nonzero value, the caller is supposed to return that value
+ * as errno.
+ *
+ * Note that after this routine runs, accmode may be zero.
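+ *
+ * For example, an accmode of VADMIN_PERMS | VSYNCHRONIZE is rewritten
+ * to plain VADMIN, an accmode containing VDELETE yields EPERM, and an
+ * accmode of VEXPLICIT_DENY alone is cleared to zero with a return
+ * value of 0.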
+ */
+int
+vfs_unixify_accmode(accmode_t *accmode)
+{
+ /*
+ * There is no way to specify explicit "deny" rule using
+ * file mode or POSIX.1e ACLs.
+ */
+ if (*accmode & VEXPLICIT_DENY) {
+ *accmode = 0;
+ return (0);
+ }
+
+ /*
+ * None of these can be translated into usual access bits.
+ * Also, the common case for NFSv4 ACLs is to not contain
+ * either of these bits. Caller should check for VWRITE
+ * on the containing directory instead.
+ */
+ if (*accmode & (VDELETE_CHILD | VDELETE))
+ return (EPERM);
+
+ if (*accmode & VADMIN_PERMS) {
+ *accmode &= ~VADMIN_PERMS;
+ *accmode |= VADMIN;
+ }
+
+ /*
+ * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
+ * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
+ */
+ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
+
+ return (0);
+}
+
+/*
+ * These are helper functions for filesystems to traverse all
+ * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
+ *
+ * This interface replaces MNT_VNODE_FOREACH.
+ */
+
+MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
+
+struct vnode *
+__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
+{
+ struct vnode *vp;
+
+ if (should_yield())
+ kern_yield(PRI_USER);
+ MNT_ILOCK(mp);
+ KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+ for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
+ vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
+ /* Allow a racy peek at VI_DOOMED to save a lock acquisition. */
+ if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)
+ continue;
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_DOOMED) != 0) {
+ VI_UNLOCK(vp);
+ continue;
+ }
+ break;
+ }
+ if (vp == NULL) {
+ __mnt_vnode_markerfree_all(mvp, mp);
+ /* MNT_IUNLOCK(mp); -- done in above function */
+ mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
+ return (NULL);
+ }
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
+ TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
+ MNT_IUNLOCK(mp);
+ return (vp);
+}
+
+struct vnode *
+__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
+{
+ struct vnode *vp;
+
+ *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
+ MNT_ILOCK(mp);
+ MNT_REF(mp);
+ (*mvp)->v_mount = mp;
+ (*mvp)->v_type = VMARKER;
+
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+ /* Allow a racy peek at VI_DOOMED to save a lock acquisition. */
+ if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)
+ continue;
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_DOOMED) != 0) {
+ VI_UNLOCK(vp);
+ continue;
+ }
+ break;
+ }
+ if (vp == NULL) {
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ free(*mvp, M_VNODE_MARKER);
+ *mvp = NULL;
+ return (NULL);
+ }
+ TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
+ MNT_IUNLOCK(mp);
+ return (vp);
+}
+
+void
+__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
+{
+
+ if (*mvp == NULL) {
+ MNT_IUNLOCK(mp);
+ return;
+ }
+
+ mtx_assert(MNT_MTX(mp), MA_OWNED);
+
+ KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ free(*mvp, M_VNODE_MARKER);
+ *mvp = NULL;
+}
+
+/*
+ * These are helper functions for filesystems to traverse their
+ * active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
+ */
+static void
+mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
+{
+
+ KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ free(*mvp, M_VNODE_MARKER);
+ *mvp = NULL;
+}
+
+/*
+ * Relock the mp mount vnode list lock with the vp vnode interlock in the
+ * conventional lock order during mnt_vnode_next_active iteration.
+ *
+ * On entry, the mount vnode list lock is held and the vnode interlock is not.
+ * The list lock is dropped and reacquired. On success, both locks are held.
+ * On failure, the mount vnode list lock is held but the vnode interlock is
+ * not, and the procedure may have yielded.
+ */
+static bool
+mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp,
+ struct vnode *vp)
+{
+ const struct vnode *tmp;
+ bool held, ret;
+
+ VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
+ TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp,
+ ("%s: bad marker", __func__));
+ VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
+ ("%s: inappropriate vnode", __func__));
+ ASSERT_VI_UNLOCKED(vp, __func__);
+ mtx_assert(&mp->mnt_listmtx, MA_OWNED);
+
+ ret = false;
+
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist);
+ TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist);
+
+ /*
+ * Use a hold to prevent vp from disappearing while the mount vnode
+ * list lock is dropped and reacquired. Normally a hold would be
+ * acquired with vhold(), but that might try to acquire the vnode
+ * interlock, which would be a LOR with the mount vnode list lock.
+ */
+ held = refcount_acquire_if_not_zero(&vp->v_holdcnt);
+ mtx_unlock(&mp->mnt_listmtx);
+ if (!held)
+ goto abort;
+ VI_LOCK(vp);
+ if (!refcount_release_if_not_last(&vp->v_holdcnt)) {
+ vdropl(vp);
+ goto abort;
+ }
+ mtx_lock(&mp->mnt_listmtx);
+
+ /*
+ * Determine whether the vnode is still the next one after the marker,
+ * excepting any other markers. If the vnode has not been doomed by
+ * vgone() then the hold should have ensured that it remained on the
+ * active list. If it has been doomed but is still on the active list,
+ * don't abort, but rather skip over it (avoid spinning on doomed
+ * vnodes).
+ */
+ tmp = mvp;
+ do {
+ tmp = TAILQ_NEXT(tmp, v_actfreelist);
+ } while (tmp != NULL && tmp->v_type == VMARKER);
+ if (tmp != vp) {
+ mtx_unlock(&mp->mnt_listmtx);
+ VI_UNLOCK(vp);
+ goto abort;
+ }
+
+ ret = true;
+ goto out;
+abort:
+ maybe_yield();
+ mtx_lock(&mp->mnt_listmtx);
+out:
+ if (ret)
+ ASSERT_VI_LOCKED(vp, __func__);
+ else
+ ASSERT_VI_UNLOCKED(vp, __func__);
+ mtx_assert(&mp->mnt_listmtx, MA_OWNED);
+ return (ret);
+}
+
+static struct vnode *
+mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
+{
+ struct vnode *vp, *nvp;
+
+ mtx_assert(&mp->mnt_listmtx, MA_OWNED);
+ KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+restart:
+ vp = TAILQ_NEXT(*mvp, v_actfreelist);
+ while (vp != NULL) {
+ if (vp->v_type == VMARKER) {
+ vp = TAILQ_NEXT(vp, v_actfreelist);
+ continue;
+ }
+ /*
+ * Try-lock because this is the wrong lock order. If that does
+ * not succeed, drop the mount vnode list lock and try to
+ * reacquire it and the vnode interlock in the right order.
+ */
+ if (!VI_TRYLOCK(vp) &&
+ !mnt_vnode_next_active_relock(*mvp, mp, vp))
+ goto restart;
+ KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
+ KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
+ ("alien vnode on the active list %p %p", vp, mp));
+ if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
+ break;
+ nvp = TAILQ_NEXT(vp, v_actfreelist);
+ VI_UNLOCK(vp);
+ vp = nvp;
+ }
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
+
+ /* Check if we are done */
+ if (vp == NULL) {
+ mtx_unlock(&mp->mnt_listmtx);
+ mnt_vnode_markerfree_active(mvp, mp);
+ return (NULL);
+ }
+ TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
+ mtx_unlock(&mp->mnt_listmtx);
+ ASSERT_VI_LOCKED(vp, "active iter");
+ KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
+ return (vp);
+}
+
+struct vnode *
+__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
+{
+
+ if (should_yield())
+ kern_yield(PRI_USER);
+ mtx_lock(&mp->mnt_listmtx);
+ return (mnt_vnode_next_active(mvp, mp));
+}
+
+struct vnode *
+__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
+{
+ struct vnode *vp;
+
+ *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
+ MNT_ILOCK(mp);
+ MNT_REF(mp);
+ MNT_IUNLOCK(mp);
+ (*mvp)->v_type = VMARKER;
+ (*mvp)->v_mount = mp;
+
+ mtx_lock(&mp->mnt_listmtx);
+ vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
+ if (vp == NULL) {
+ mtx_unlock(&mp->mnt_listmtx);
+ mnt_vnode_markerfree_active(mvp, mp);
+ return (NULL);
+ }
+ TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
+ return (mnt_vnode_next_active(mvp, mp));
+}
+
+void
+__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
+{
+
+ if (*mvp == NULL)
+ return;
+
+ mtx_lock(&mp->mnt_listmtx);
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
+ mtx_unlock(&mp->mnt_listmtx);
+ mnt_vnode_markerfree_active(mvp, mp);
+}
+
+int
+vn_dir_check_exec(struct vnode *vp, struct componentname *cnp)
+{
+
+ if ((cnp->cn_flags & NOEXECCHECK) != 0) {
+ cnp->cn_flags &= ~NOEXECCHECK;
+ return (0);
+ }
+
+ return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread));
+}
diff --git a/freebsd/sys/kern/vfs_syscalls.c b/freebsd/sys/kern/vfs_syscalls.c
new file mode 100644
index 00000000..06aaa935
--- /dev/null
+++ b/freebsd/sys/kern/vfs_syscalls.c
@@ -0,0 +1,4748 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/capsicum.h>
+#include <sys/disk.h>
+#include <sys/sysent.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/limits.h>
+#include <sys/linker.h>
+#include <sys/rwlock.h>
+#include <sys/sdt.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/dirent.h>
+#include <sys/jail.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <machine/stdarg.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+#include <ufs/ufs/quota.h>
+
+MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
+
+SDT_PROVIDER_DEFINE(vfs);
+SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
+SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
+
+static int kern_chflagsat(struct thread *td, int fd, const char *path,
+ enum uio_seg pathseg, u_long flags, int atflag);
+static int setfflags(struct thread *td, struct vnode *, u_long);
+static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
+static int getutimens(const struct timespec *, enum uio_seg,
+ struct timespec *, int *);
+static int setutimes(struct thread *td, struct vnode *,
+ const struct timespec *, int, int);
+static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
+ struct thread *td);
+static int kern_fhlinkat(struct thread *td, int fd, const char *path,
+ enum uio_seg pathseg, fhandle_t *fhp);
+static int kern_getfhat(struct thread *td, int flags, int fd,
+ const char *path, enum uio_seg pathseg, fhandle_t *fhp);
+static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg,
+ size_t count, struct thread *td);
+static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd,
+ const char *path, enum uio_seg segflag);
+
+/*
+ * Sync each mounted filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sync_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_sync(struct thread *td, struct sync_args *uap)
+{
+ struct mount *mp, *nmp;
+ int save;
+
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
+ vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
+ save = curthread_pflags_set(TDP_SYNCIO);
+ vfs_msync(mp, MNT_NOWAIT);
+ VFS_SYNC(mp, MNT_NOWAIT);
+ curthread_pflags_restore(save);
+ vn_finished_write(mp);
+ }
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp);
+ }
+ mtx_unlock(&mountlist_mtx);
+ return (0);
+}
+
+/*
+ * Change filesystem quotas.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct quotactl_args {
+ char *path;
+ int cmd;
+ int uid;
+ caddr_t arg;
+};
+#endif
+int
+sys_quotactl(struct thread *td, struct quotactl_args *uap)
+{
+ struct mount *mp;
+ struct nameidata nd;
+ int error;
+
+ AUDIT_ARG_CMD(uap->cmd);
+ AUDIT_ARG_UID(uap->uid);
+ if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
+ return (EPERM);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ mp = nd.ni_vp->v_mount;
+ vfs_ref(mp);
+ vput(nd.ni_vp);
+ error = vfs_busy(mp, 0);
+ vfs_rel(mp);
+ if (error != 0)
+ return (error);
+ error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
+
+ /*
+	 * Since the quota-on operation typically needs to open the quota
+	 * file, the Q_QUOTAON handler needs to unbusy the mount point
+	 * before calling into namei.  Otherwise, unmount might be
+	 * started between two vfs_busy() invocations (the first is ours,
+	 * the second is from the mount point cross-walk code in lookup()),
+	 * causing deadlock.
+	 *
+	 * Require that Q_QUOTAON handles the vfs_busy() reference on
+	 * its own, always returning with the mount point unbusied.
+ */
+ if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON &&
+ (uap->cmd >> SUBCMDSHIFT) != Q_QUOTAOFF)
+ vfs_unbusy(mp);
+ return (error);
+}
+
+/*
+ * Used by statfs conversion routines to scale the block size up if
+ * necessary so that all of the block counts are <= 'max_size'. Note
+ * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
+ * value of 'n'.
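+ *
+ * For example, with max_size = 0x7fffffff and a largest block count of
+ * 1 << 40, the loop below computes shift = 10, so the block counts are
+ * divided by 1024 while f_bsize is multiplied by 1024.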
+ */
+void
+statfs_scale_blocks(struct statfs *sf, long max_size)
+{
+ uint64_t count;
+ int shift;
+
+ KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
+
+ /*
+ * Attempt to scale the block counts to give a more accurate
+ * overview to userland of the ratio of free space to used
+ * space. To do this, find the largest block count and compute
+ * a divisor that lets it fit into a signed integer <= max_size.
+ */
+ if (sf->f_bavail < 0)
+ count = -sf->f_bavail;
+ else
+ count = sf->f_bavail;
+ count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
+ if (count <= max_size)
+ return;
+
+ count >>= flsl(max_size);
+ shift = 0;
+ while (count > 0) {
+ shift++;
+		count >>= 1;
+ }
+
+ sf->f_bsize <<= shift;
+ sf->f_blocks >>= shift;
+ sf->f_bfree >>= shift;
+ sf->f_bavail >>= shift;
+}
+
+static int
+kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf)
+{
+ struct statfs *sp;
+ int error;
+
+ if (mp == NULL)
+ return (EBADF);
+ error = vfs_busy(mp, 0);
+ vfs_rel(mp);
+ if (error != 0)
+ return (error);
+#ifdef MAC
+ error = mac_mount_check_stat(td->td_ucred, mp);
+ if (error != 0)
+ goto out;
+#endif
+ /*
+ * Set these in case the underlying filesystem fails to do so.
+ */
+ sp = &mp->mnt_stat;
+ sp->f_version = STATFS_VERSION;
+ sp->f_namemax = NAME_MAX;
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = VFS_STATFS(mp, sp);
+ if (error != 0)
+ goto out;
+ *buf = *sp;
+ if (priv_check(td, PRIV_VFS_GENERATION)) {
+ buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
+ prison_enforce_statfs(td->td_ucred, mp, buf);
+ }
+out:
+ vfs_unbusy(mp);
+ return (error);
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct statfs_args {
+ char *path;
+ struct statfs *buf;
+};
+#endif
+int
+sys_statfs(struct thread *td, struct statfs_args *uap)
+{
+ struct statfs *sfp;
+ int error;
+
+ sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
+ error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
+ if (error == 0)
+ error = copyout(sfp, uap->buf, sizeof(struct statfs));
+ free(sfp, M_STATFS);
+ return (error);
+}
+
+int
+kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
+ struct statfs *buf)
+{
+ struct mount *mp;
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ mp = nd.ni_vp->v_mount;
+ vfs_ref(mp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_vp);
+ return (kern_do_statfs(td, mp, buf));
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstatfs_args {
+ int fd;
+ struct statfs *buf;
+};
+#endif
+int
+sys_fstatfs(struct thread *td, struct fstatfs_args *uap)
+{
+ struct statfs *sfp;
+ int error;
+
+ sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
+ error = kern_fstatfs(td, uap->fd, sfp);
+ if (error == 0)
+ error = copyout(sfp, uap->buf, sizeof(struct statfs));
+ free(sfp, M_STATFS);
+ return (error);
+}
+
+int
+kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
+{
+ struct file *fp;
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ error = getvnode(td, fd, &cap_fstatfs_rights, &fp);
+ if (error != 0)
+ return (error);
+ vp = fp->f_vnode;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+#ifdef AUDIT
+ AUDIT_ARG_VNODE1(vp);
+#endif
+ mp = vp->v_mount;
+ if (mp != NULL)
+ vfs_ref(mp);
+ VOP_UNLOCK(vp, 0);
+ fdrop(fp, td);
+ return (kern_do_statfs(td, mp, buf));
+}
+
+/*
+ * Get statistics on all filesystems.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getfsstat_args {
+ struct statfs *buf;
+ long bufsize;
+ int mode;
+};
+#endif
+int
+sys_getfsstat(struct thread *td, struct getfsstat_args *uap)
+{
+ size_t count;
+ int error;
+
+ if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX)
+ return (EINVAL);
+ error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count,
+ UIO_USERSPACE, uap->mode);
+ if (error == 0)
+ td->td_retval[0] = count;
+ return (error);
+}
+
+/*
+ * If (bufsize > 0 && bufseg == UIO_SYSSPACE), the caller is responsible
+ * for freeing the memory that will be allocated in '*buf'.
+ */
+int
+kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
+ size_t *countp, enum uio_seg bufseg, int mode)
+{
+ struct mount *mp, *nmp;
+ struct statfs *sfsp, *sp, *sptmp, *tofree;
+ size_t count, maxcount;
+ int error;
+
+ switch (mode) {
+ case MNT_WAIT:
+ case MNT_NOWAIT:
+ break;
+ default:
+ if (bufseg == UIO_SYSSPACE)
+ *buf = NULL;
+ return (EINVAL);
+ }
+restart:
+ maxcount = bufsize / sizeof(struct statfs);
+ if (bufsize == 0) {
+ sfsp = NULL;
+ tofree = NULL;
+ } else if (bufseg == UIO_USERSPACE) {
+ sfsp = *buf;
+ tofree = NULL;
+ } else /* if (bufseg == UIO_SYSSPACE) */ {
+ count = 0;
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ count++;
+ }
+ mtx_unlock(&mountlist_mtx);
+ if (maxcount > count)
+ maxcount = count;
+ tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs),
+ M_STATFS, M_WAITOK);
+ }
+ count = 0;
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (prison_canseemount(td->td_ucred, mp) != 0) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+#ifdef MAC
+ if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+#endif
+ if (mode == MNT_WAIT) {
+ if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) {
+ /*
+ * If vfs_busy() failed, and MBF_NOWAIT
+ * wasn't passed, then the mp is gone.
+ * Furthermore, because of MBF_MNTLSTLOCK,
+ * the mountlist_mtx was dropped. We have
+ * no other choice than to start over.
+ */
+ mtx_unlock(&mountlist_mtx);
+ free(tofree, M_STATFS);
+ goto restart;
+ }
+ } else {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ }
+ if (sfsp != NULL && count < maxcount) {
+ sp = &mp->mnt_stat;
+ /*
+ * Set these in case the underlying filesystem
+ * fails to do so.
+ */
+ sp->f_version = STATFS_VERSION;
+ sp->f_namemax = NAME_MAX;
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ /*
+ * If MNT_NOWAIT is specified, do not refresh
+ * the fsstat cache.
+ */
+ if (mode != MNT_NOWAIT) {
+ error = VFS_STATFS(mp, sp);
+ if (error != 0) {
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp);
+ continue;
+ }
+ }
+ if (priv_check(td, PRIV_VFS_GENERATION)) {
+ sptmp = malloc(sizeof(struct statfs), M_STATFS,
+ M_WAITOK);
+ *sptmp = *sp;
+ sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0;
+ prison_enforce_statfs(td->td_ucred, mp, sptmp);
+ sp = sptmp;
+ } else
+ sptmp = NULL;
+ if (bufseg == UIO_SYSSPACE) {
+ bcopy(sp, sfsp, sizeof(*sp));
+ free(sptmp, M_STATFS);
+ } else /* if (bufseg == UIO_USERSPACE) */ {
+ error = copyout(sp, sfsp, sizeof(*sp));
+ free(sptmp, M_STATFS);
+ if (error != 0) {
+ vfs_unbusy(mp);
+ return (error);
+ }
+ }
+ sfsp++;
+ }
+ count++;
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp);
+ }
+ mtx_unlock(&mountlist_mtx);
+ if (sfsp != NULL && count > maxcount)
+ *countp = maxcount;
+ else
+ *countp = count;
+ return (0);
+}
+
+#ifdef COMPAT_FREEBSD4
+/*
+ * Get old format filesystem statistics.
+ */
+static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *);
+
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_statfs_args {
+ char *path;
+ struct ostatfs *buf;
+};
+#endif
+int
+freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap)
+{
+ struct ostatfs osb;
+ struct statfs *sfp;
+ int error;
+
+ sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
+ error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
+ if (error == 0) {
+ freebsd4_cvtstatfs(sfp, &osb);
+ error = copyout(&osb, uap->buf, sizeof(osb));
+ }
+ free(sfp, M_STATFS);
+ return (error);
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_fstatfs_args {
+ int fd;
+ struct ostatfs *buf;
+};
+#endif
+int
+freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap)
+{
+ struct ostatfs osb;
+ struct statfs *sfp;
+ int error;
+
+ sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
+ error = kern_fstatfs(td, uap->fd, sfp);
+ if (error == 0) {
+ freebsd4_cvtstatfs(sfp, &osb);
+ error = copyout(&osb, uap->buf, sizeof(osb));
+ }
+ free(sfp, M_STATFS);
+ return (error);
+}
+
+/*
+ * Get statistics on all filesystems.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_getfsstat_args {
+ struct ostatfs *buf;
+ long bufsize;
+ int mode;
+};
+#endif
+int
+freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap)
+{
+ struct statfs *buf, *sp;
+ struct ostatfs osb;
+ size_t count, size;
+ int error;
+
+ if (uap->bufsize < 0)
+ return (EINVAL);
+ count = uap->bufsize / sizeof(struct ostatfs);
+ if (count > SIZE_MAX / sizeof(struct statfs))
+ return (EINVAL);
+ size = count * sizeof(struct statfs);
+ error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
+ uap->mode);
+ if (error == 0)
+ td->td_retval[0] = count;
+ if (size != 0) {
+ sp = buf;
+ while (count != 0 && error == 0) {
+ freebsd4_cvtstatfs(sp, &osb);
+ error = copyout(&osb, uap->buf, sizeof(osb));
+ sp++;
+ uap->buf++;
+ count--;
+ }
+ free(buf, M_STATFS);
+ }
+ return (error);
+}
+
+/*
+ * Implement fstatfs() for (NFS) file handles.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_fhstatfs_args {
+ struct fhandle *u_fhp;
+ struct ostatfs *buf;
+};
+#endif
+int
+freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap)
+{
+ struct ostatfs osb;
+ struct statfs *sfp;
+ fhandle_t fh;
+ int error;
+
+ error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
+ if (error != 0)
+ return (error);
+ sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
+ error = kern_fhstatfs(td, fh, sfp);
+ if (error == 0) {
+ freebsd4_cvtstatfs(sfp, &osb);
+ error = copyout(&osb, uap->buf, sizeof(osb));
+ }
+ free(sfp, M_STATFS);
+ return (error);
+}
+
+/*
+ * Convert a new format statfs structure to an old format statfs structure.
+ */
+static void
+freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp)
+{
+
+ statfs_scale_blocks(nsp, LONG_MAX);
+ bzero(osp, sizeof(*osp));
+ osp->f_bsize = nsp->f_bsize;
+ osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
+ osp->f_blocks = nsp->f_blocks;
+ osp->f_bfree = nsp->f_bfree;
+ osp->f_bavail = nsp->f_bavail;
+ osp->f_files = MIN(nsp->f_files, LONG_MAX);
+ osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
+ osp->f_owner = nsp->f_owner;
+ osp->f_type = nsp->f_type;
+ osp->f_flags = nsp->f_flags;
+ osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
+ osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
+ osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
+ osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
+ strlcpy(osp->f_fstypename, nsp->f_fstypename,
+ MIN(MFSNAMELEN, OMFSNAMELEN));
+ strlcpy(osp->f_mntonname, nsp->f_mntonname,
+ MIN(MNAMELEN, OMNAMELEN));
+ strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
+ MIN(MNAMELEN, OMNAMELEN));
+ osp->f_fsid = nsp->f_fsid;
+}
+#endif /* COMPAT_FREEBSD4 */
+
+#if defined(COMPAT_FREEBSD11)
+/*
+ * Get old format filesystem statistics.
+ */
+static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *);
+
+int
+freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap)
+{
+ struct freebsd11_statfs osb;
+ struct statfs *sfp;
+ int error;
+
+ sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
+ error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
+ if (error == 0) {
+ freebsd11_cvtstatfs(sfp, &osb);
+ error = copyout(&osb, uap->buf, sizeof(osb));
+ }
+ free(sfp, M_STATFS);
+ return (error);
+}
+
+/*
+ * Get filesystem statistics.
+ */
+int
+freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap)
+{
+ struct freebsd11_statfs osb;
+ struct statfs *sfp;
+ int error;
+
+ sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
+ error = kern_fstatfs(td, uap->fd, sfp);
+ if (error == 0) {
+ freebsd11_cvtstatfs(sfp, &osb);
+ error = copyout(&osb, uap->buf, sizeof(osb));
+ }
+ free(sfp, M_STATFS);
+ return (error);
+}
+
+/*
+ * Get statistics on all filesystems.
+ */
+int
+freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap)
+{
+ struct freebsd11_statfs osb;
+ struct statfs *buf, *sp;
+ size_t count, size;
+ int error;
+
+ count = uap->bufsize / sizeof(struct ostatfs);
+ size = count * sizeof(struct statfs);
+ error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
+ uap->mode);
+ if (error == 0)
+ td->td_retval[0] = count;
+ if (size > 0) {
+ sp = buf;
+ while (count > 0 && error == 0) {
+ freebsd11_cvtstatfs(sp, &osb);
+ error = copyout(&osb, uap->buf, sizeof(osb));
+ sp++;
+ uap->buf++;
+ count--;
+ }
+ free(buf, M_STATFS);
+ }
+ return (error);
+}
+
+/*
+ * Implement fstatfs() for (NFS) file handles.
+ */
+int
+freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap)
+{
+ struct freebsd11_statfs osb;
+ struct statfs *sfp;
+ fhandle_t fh;
+ int error;
+
+ error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
+ if (error)
+ return (error);
+ sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
+ error = kern_fhstatfs(td, fh, sfp);
+ if (error == 0) {
+ freebsd11_cvtstatfs(sfp, &osb);
+ error = copyout(&osb, uap->buf, sizeof(osb));
+ }
+ free(sfp, M_STATFS);
+ return (error);
+}
+
+/*
+ * Convert a new format statfs structure to an old format statfs structure.
+ */
+static void
+freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp)
+{
+
+ bzero(osp, sizeof(*osp));
+ osp->f_version = FREEBSD11_STATFS_VERSION;
+ osp->f_type = nsp->f_type;
+ osp->f_flags = nsp->f_flags;
+ osp->f_bsize = nsp->f_bsize;
+ osp->f_iosize = nsp->f_iosize;
+ osp->f_blocks = nsp->f_blocks;
+ osp->f_bfree = nsp->f_bfree;
+ osp->f_bavail = nsp->f_bavail;
+ osp->f_files = nsp->f_files;
+ osp->f_ffree = nsp->f_ffree;
+ osp->f_syncwrites = nsp->f_syncwrites;
+ osp->f_asyncwrites = nsp->f_asyncwrites;
+ osp->f_syncreads = nsp->f_syncreads;
+ osp->f_asyncreads = nsp->f_asyncreads;
+ osp->f_namemax = nsp->f_namemax;
+ osp->f_owner = nsp->f_owner;
+ osp->f_fsid = nsp->f_fsid;
+ strlcpy(osp->f_fstypename, nsp->f_fstypename,
+ MIN(MFSNAMELEN, sizeof(osp->f_fstypename)));
+ strlcpy(osp->f_mntonname, nsp->f_mntonname,
+ MIN(MNAMELEN, sizeof(osp->f_mntonname)));
+ strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
+ MIN(MNAMELEN, sizeof(osp->f_mntfromname)));
+}
+#endif /* COMPAT_FREEBSD11 */
+
+/*
+ * Change current working directory to a given file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchdir_args {
+ int fd;
+};
+#endif
+int
+sys_fchdir(struct thread *td, struct fchdir_args *uap)
+{
+ struct vnode *vp, *tdp;
+ struct mount *mp;
+ struct file *fp;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ error = getvnode(td, uap->fd, &cap_fchdir_rights,
+ &fp);
+ if (error != 0)
+ return (error);
+ vp = fp->f_vnode;
+ vrefact(vp);
+ fdrop(fp, td);
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ error = change_dir(vp, td);
+ while (!error && (mp = vp->v_mountedhere) != NULL) {
+ if (vfs_busy(mp, 0))
+ continue;
+ error = VFS_ROOT(mp, LK_SHARED, &tdp);
+ vfs_unbusy(mp);
+ if (error != 0)
+ break;
+ vput(vp);
+ vp = tdp;
+ }
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0);
+ pwd_chdir(td, vp);
+ return (0);
+}
+
+/*
+ * Change current working directory (``.'').
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chdir_args {
+ char *path;
+};
+#endif
+int
+sys_chdir(struct thread *td, struct chdir_args *uap)
+{
+
+ return (kern_chdir(td, uap->path, UIO_USERSPACE));
+}
+
+int
+kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ if ((error = change_dir(nd.ni_vp, td)) != 0) {
+ vput(nd.ni_vp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+ }
+ VOP_UNLOCK(nd.ni_vp, 0);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ pwd_chdir(td, nd.ni_vp);
+ return (0);
+}
+
+/*
+ * Change notion of root (``/'') directory.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chroot_args {
+ char *path;
+};
+#endif
+int
+sys_chroot(struct thread *td, struct chroot_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_CHROOT);
+ if (error != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error != 0)
+ goto error;
+ error = change_dir(nd.ni_vp, td);
+ if (error != 0)
+ goto e_vunlock;
+#ifdef MAC
+ error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
+ if (error != 0)
+ goto e_vunlock;
+#endif
+ VOP_UNLOCK(nd.ni_vp, 0);
+ error = pwd_chroot(td, nd.ni_vp);
+ vrele(nd.ni_vp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+e_vunlock:
+ vput(nd.ni_vp);
+error:
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+}
+
+/*
+ * Common routine for chroot and chdir. Callers must provide a locked vnode
+ * instance.
+ */
+int
+change_dir(struct vnode *vp, struct thread *td)
+{
+#ifdef MAC
+ int error;
+#endif
+
+ ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+#ifdef MAC
+ error = mac_vnode_check_chdir(td->td_ucred, vp);
+ if (error != 0)
+ return (error);
+#endif
+ return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
+}
+
+static __inline void
+flags_to_rights(int flags, cap_rights_t *rightsp)
+{
+
+ if (flags & O_EXEC) {
+ cap_rights_set(rightsp, CAP_FEXECVE);
+ } else {
+ switch ((flags & O_ACCMODE)) {
+ case O_RDONLY:
+ cap_rights_set(rightsp, CAP_READ);
+ break;
+ case O_RDWR:
+ cap_rights_set(rightsp, CAP_READ);
+ /* FALLTHROUGH */
+ case O_WRONLY:
+ cap_rights_set(rightsp, CAP_WRITE);
+ if (!(flags & (O_APPEND | O_TRUNC)))
+ cap_rights_set(rightsp, CAP_SEEK);
+ break;
+ }
+ }
+
+ if (flags & O_CREAT)
+ cap_rights_set(rightsp, CAP_CREATE);
+
+ if (flags & O_TRUNC)
+ cap_rights_set(rightsp, CAP_FTRUNCATE);
+
+ if (flags & (O_SYNC | O_FSYNC))
+ cap_rights_set(rightsp, CAP_FSYNC);
+
+ if (flags & (O_EXLOCK | O_SHLOCK))
+ cap_rights_set(rightsp, CAP_FLOCK);
+}
+
+/*
+ * Check permissions, allocate an open file structure, and call the device
+ * open routine if any.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct open_args {
+ char *path;
+ int flags;
+ int mode;
+};
+#endif
+int
+sys_open(struct thread *td, struct open_args *uap)
+{
+
+ return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->flags, uap->mode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct openat_args {
+ int fd;
+ char *path;
+ int flag;
+ int mode;
+};
+#endif
+int
+sys_openat(struct thread *td, struct openat_args *uap)
+{
+
+ AUDIT_ARG_FD(uap->fd);
+ return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
+ uap->mode));
+}
+
+int
+kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int flags, int mode)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = p->p_fd;
+ struct file *fp;
+ struct vnode *vp;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int cmode, error, indx;
+
+ indx = -1;
+
+ AUDIT_ARG_FFLAGS(flags);
+ AUDIT_ARG_MODE(mode);
+ cap_rights_init(&rights, CAP_LOOKUP);
+ flags_to_rights(flags, &rights);
+ /*
+ * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
+ * may be specified.
+ */
+ if (flags & O_EXEC) {
+ if (flags & O_ACCMODE)
+ return (EINVAL);
+ } else if ((flags & O_ACCMODE) == O_ACCMODE) {
+ return (EINVAL);
+ } else {
+ flags = FFLAGS(flags);
+ }
+
+ /*
+ * Allocate a file structure. The descriptor to reference it
+ * is allocated and set by finstall() below.
+ */
+ error = falloc_noinstall(td, &fp);
+ if (error != 0)
+ return (error);
+ /*
+ * An extra reference on `fp' has been held for us by
+ * falloc_noinstall().
+ */
+ /* Set the flags early so the finit in devfs can pick them up. */
+ fp->f_flag = flags & FMASK;
+ cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
+ &rights, td);
+ td->td_dupfd = -1; /* XXX check for fdopen */
+ error = vn_open(&nd, &flags, cmode, fp);
+ if (error != 0) {
+ /*
+ * If the vn_open replaced the method vector, something
+		 * wondrous happened deep below and we just pass it up
+ * pretending we know what we do.
+ */
+ if (error == ENXIO && fp->f_ops != &badfileops)
+ goto success;
+
+ /*
+ * Handle special fdopen() case. bleh.
+ *
+ * Don't do this for relative (capability) lookups; we don't
+ * understand exactly what would happen, and we don't think
+ * that it ever should.
+ */
+ if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
+ (error == ENODEV || error == ENXIO) &&
+ td->td_dupfd >= 0) {
+ error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
+ &indx);
+ if (error == 0)
+ goto success;
+ }
+
+ goto bad;
+ }
+ td->td_dupfd = 0;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+
+ /*
+ * Store the vnode, for any f_type. Typically, the vnode use
+ * count is decremented by direct call to vn_closefile() for
+ * files that switched type in the cdevsw fdopen() method.
+ */
+ fp->f_vnode = vp;
+ /*
+ * If the file wasn't claimed by devfs bind it to the normal
+ * vnode operations here.
+ */
+ if (fp->f_ops == &badfileops) {
+ KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
+ fp->f_seqcount = 1;
+ finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
+ DTYPE_VNODE, vp, &vnops);
+ }
+
+ VOP_UNLOCK(vp, 0);
+ if (flags & O_TRUNC) {
+ error = fo_truncate(fp, 0, td->td_ucred, td);
+ if (error != 0)
+ goto bad;
+ }
+success:
+ /*
+ * If we haven't already installed the FD (for dupfdopen), do so now.
+ */
+ if (indx == -1) {
+ struct filecaps *fcaps;
+
+#ifdef CAPABILITIES
+ if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) != 0)
+ fcaps = &nd.ni_filecaps;
+ else
+#endif
+ fcaps = NULL;
+ error = finstall(td, fp, &indx, flags, fcaps);
+ /* On success finstall() consumes fcaps. */
+ if (error != 0) {
+ filecaps_free(&nd.ni_filecaps);
+ goto bad;
+ }
+ } else {
+ filecaps_free(&nd.ni_filecaps);
+ }
+
+ /*
+ * Release our private reference, leaving the one associated with
+ * the descriptor table intact.
+ */
+ fdrop(fp, td);
+ td->td_retval[0] = indx;
+ return (0);
+bad:
+ KASSERT(indx == -1, ("indx=%d, should be -1", indx));
+ fdrop(fp, td);
+ return (error);
+}
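
[Editor's note] kern_openat() demands CAP_LOOKUP plus whatever flags_to_rights()
derives from the open flags before the lookup runs. A minimal userspace sketch of
the visible effect, assuming the standard Capsicum API from <sys/capsicum.h>; the
helper name and file layout are illustrative only, not part of the imported code:

/* Illustrative sketch only. */
#include <sys/capsicum.h>
#include <fcntl.h>
#include <unistd.h>

int
open_readonly_in(int dirfd, const char *name)
{
	cap_rights_t rights;

	/* Allow lookups and reads through dirfd, nothing else. */
	cap_rights_init(&rights, CAP_LOOKUP, CAP_READ);
	if (cap_rights_limit(dirfd, &rights) == -1)
		return (-1);

	/*
	 * O_RDONLY maps to CAP_READ in flags_to_rights(), so this open is
	 * allowed; an O_RDWR open through the same dirfd would fail with
	 * ENOTCAPABLE because CAP_WRITE was not granted.
	 */
	return (openat(dirfd, name, O_RDONLY));
}
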
+
+#ifdef COMPAT_43
+/*
+ * Create a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ocreat_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+ocreat(struct thread *td, struct ocreat_args *uap)
+{
+
+ return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Create a special file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mknodat_args {
+ int fd;
+ char *path;
+ mode_t mode;
+ dev_t dev;
+};
+#endif
+int
+sys_mknodat(struct thread *td, struct mknodat_args *uap)
+{
+
+ return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
+ uap->dev));
+}
+
+#if defined(COMPAT_FREEBSD11)
+int
+freebsd11_mknod(struct thread *td,
+ struct freebsd11_mknod_args *uap)
+{
+
+ return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->mode, uap->dev));
+}
+
+int
+freebsd11_mknodat(struct thread *td,
+ struct freebsd11_mknodat_args *uap)
+{
+
+ return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
+ uap->dev));
+}
+#endif /* COMPAT_FREEBSD11 */
+
+int
+kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int mode, dev_t dev)
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct vattr vattr;
+ struct nameidata nd;
+ int error, whiteout = 0;
+
+ AUDIT_ARG_MODE(mode);
+ AUDIT_ARG_DEV(dev);
+ switch (mode & S_IFMT) {
+ case S_IFCHR:
+ case S_IFBLK:
+ error = priv_check(td, PRIV_VFS_MKNOD_DEV);
+ if (error == 0 && dev == VNOVAL)
+ error = EINVAL;
+ break;
+ case S_IFWHT:
+ error = priv_check(td, PRIV_VFS_MKNOD_WHT);
+ break;
+ case S_IFIFO:
+ if (dev == 0)
+ return (kern_mkfifoat(td, fd, path, pathseg, mode));
+ /* FALLTHROUGH */
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (error != 0)
+ return (error);
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
+ NOCACHE, pathseg, path, fd, &cap_mknodat_rights,
+ td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(vp);
+ return (EEXIST);
+ } else {
+ VATTR_NULL(&vattr);
+ vattr.va_mode = (mode & ALLPERMS) &
+ ~td->td_proc->p_fd->fd_cmask;
+ vattr.va_rdev = dev;
+ whiteout = 0;
+
+ switch (mode & S_IFMT) {
+ case S_IFCHR:
+ vattr.va_type = VCHR;
+ break;
+ case S_IFBLK:
+ vattr.va_type = VBLK;
+ break;
+ case S_IFWHT:
+ whiteout = 1;
+ break;
+ default:
+ panic("kern_mknod: invalid mode");
+ }
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+#ifdef MAC
+ if (error == 0 && !whiteout)
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
+ &nd.ni_cnd, &vattr);
+#endif
+ if (error == 0) {
+ if (whiteout)
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
+ else {
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
+ &nd.ni_cnd, &vattr);
+ if (error == 0)
+ vput(nd.ni_vp);
+ }
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Create a named pipe.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkfifo_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+sys_mkfifo(struct thread *td, struct mkfifo_args *uap)
+{
+
+ return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->mode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct mkfifoat_args {
+ int fd;
+ char *path;
+ mode_t mode;
+};
+#endif
+int
+sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
+{
+
+ return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
+ uap->mode));
+}
+
+int
+kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int mode)
+{
+ struct mount *mp;
+ struct vattr vattr;
+ struct nameidata nd;
+ int error;
+
+ AUDIT_ARG_MODE(mode);
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
+ NOCACHE, pathseg, path, fd, &cap_mkfifoat_rights,
+ td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ if (nd.ni_vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VFIFO;
+ vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
+#ifdef MAC
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+ &vattr);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ if (error == 0)
+ vput(nd.ni_vp);
+#ifdef MAC
+out:
+#endif
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+}
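
[Editor's note] A small userspace counterpart of the path above, assuming the
POSIX mkfifoat() prototype; the FIFO name is illustrative:

/* Illustrative sketch only. */
#include <sys/stat.h>
#include <fcntl.h>

/*
 * Create a FIFO relative to a directory descriptor.  The effective mode is
 * 0644 & ~umask, mirroring the fd_cmask masking in kern_mkfifoat().
 */
int
make_fifo(int dirfd)
{
	return (mkfifoat(dirfd, "queue.fifo", 0644));
}
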
+
+/*
+ * Make a hard file link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct link_args {
+ char *path;
+ char *link;
+};
+#endif
+int
+sys_link(struct thread *td, struct link_args *uap)
+{
+
+ return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link,
+ UIO_USERSPACE, FOLLOW));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct linkat_args {
+ int fd1;
+ char *path1;
+ int fd2;
+ char *path2;
+ int flag;
+};
+#endif
+int
+sys_linkat(struct thread *td, struct linkat_args *uap)
+{
+ int flag;
+
+ flag = uap->flag;
+ if (flag & ~AT_SYMLINK_FOLLOW)
+ return (EINVAL);
+
+ return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
+ UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
+}
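
[Editor's note] sys_linkat() rejects any flag other than AT_SYMLINK_FOLLOW and
maps it onto FOLLOW/NOFOLLOW for the source lookup. A hedged userspace sketch;
the helper name is illustrative:

/* Illustrative sketch only. */
#include <fcntl.h>
#include <unistd.h>

/*
 * Hard-link the target of a symbolic link rather than the link itself.
 * Without AT_SYMLINK_FOLLOW the source lookup uses NOFOLLOW and the new
 * name would reference the symlink.
 */
int
link_symlink_target(const char *sympath, const char *newpath)
{
	return (linkat(AT_FDCWD, sympath, AT_FDCWD, newpath,
	    AT_SYMLINK_FOLLOW));
}
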
+
+int hardlink_check_uid = 0;
+SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
+ &hardlink_check_uid, 0,
+ "Unprivileged processes cannot create hard links to files owned by other "
+ "users");
+static int hardlink_check_gid = 0;
+SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
+ &hardlink_check_gid, 0,
+ "Unprivileged processes cannot create hard links to files owned by other "
+ "groups");
+
+static int
+can_hardlink(struct vnode *vp, struct ucred *cred)
+{
+ struct vattr va;
+ int error;
+
+ if (!hardlink_check_uid && !hardlink_check_gid)
+ return (0);
+
+ error = VOP_GETATTR(vp, &va, cred);
+ if (error != 0)
+ return (error);
+
+ if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
+ error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
+ if (error != 0)
+ return (error);
+ }
+
+ if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
+ error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
+ if (error != 0)
+ return (error);
+ }
+
+ return (0);
+}
+
+int
+kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
+ enum uio_seg segflag, int follow)
+{
+ struct nameidata nd;
+ int error;
+
+ do {
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, segflag,
+ path1, fd1, &cap_linkat_source_rights, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag);
+ } while (error == EAGAIN);
+ return (error);
+}
+
+static int
+kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path,
+ enum uio_seg segflag)
+{
+ struct nameidata nd;
+ struct mount *mp;
+ int error;
+
+ if (vp->v_type == VDIR) {
+ vrele(vp);
+ return (EPERM); /* POSIX */
+ }
+ NDINIT_ATRIGHTS(&nd, CREATE,
+ LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflag, path, fd,
+ &cap_linkat_target_rights, td);
+ if ((error = namei(&nd)) == 0) {
+ if (nd.ni_vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ vrele(vp);
+ return (EEXIST);
+ } else if (nd.ni_dvp->v_mount != vp->v_mount) {
+ /*
+ * Cross-device link. No need to recheck
+ * vp->v_type, since it cannot change, except
+ * to VBAD.
+ */
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vrele(vp);
+ return (EXDEV);
+ } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
+ error = can_hardlink(vp, td->td_ucred);
+#ifdef MAC
+ if (error == 0)
+ error = mac_vnode_check_link(td->td_ucred,
+ nd.ni_dvp, vp, &nd.ni_cnd);
+#endif
+ if (error != 0) {
+ vput(vp);
+ vput(nd.ni_dvp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+ }
+ error = vn_start_write(vp, &mp, V_NOWAIT);
+ if (error != 0) {
+ vput(vp);
+ vput(nd.ni_dvp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_start_write(NULL, &mp,
+ V_XSLEEP | PCATCH);
+ if (error != 0)
+ return (error);
+ return (EAGAIN);
+ }
+ error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+ VOP_UNLOCK(vp, 0);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ } else {
+ vput(nd.ni_dvp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(vp);
+ return (EAGAIN);
+ }
+ }
+ vrele(vp);
+ return (error);
+}
+
+/*
+ * Make a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct symlink_args {
+ char *path;
+ char *link;
+};
+#endif
+int
+sys_symlink(struct thread *td, struct symlink_args *uap)
+{
+
+ return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link,
+ UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct symlinkat_args {
+	char	*path1;
+ int fd;
+ char *path2;
+};
+#endif
+int
+sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
+{
+
+ return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
+ UIO_USERSPACE));
+}
+
+int
+kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
+ enum uio_seg segflg)
+{
+ struct mount *mp;
+ struct vattr vattr;
+ char *syspath;
+ struct nameidata nd;
+ int error;
+
+ if (segflg == UIO_SYSSPACE) {
+ syspath = path1;
+ } else {
+ syspath = uma_zalloc(namei_zone, M_WAITOK);
+ if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
+ goto out;
+ }
+ AUDIT_ARG_TEXT(syspath);
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
+ NOCACHE, segflg, path2, fd, &cap_symlinkat_rights,
+ td);
+ if ((error = namei(&nd)) != 0)
+ goto out;
+ if (nd.ni_vp) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ goto out;
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ goto out;
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
+#ifdef MAC
+ vattr.va_type = VLNK;
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+ &vattr);
+ if (error != 0)
+ goto out2;
+#endif
+ error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
+ if (error == 0)
+ vput(nd.ni_vp);
+#ifdef MAC
+out2:
+#endif
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+out:
+ if (segflg != UIO_SYSSPACE)
+ uma_zfree(namei_zone, syspath);
+ return (error);
+}
+
+/*
+ * Delete a whiteout from the filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct undelete_args {
+ char *path;
+};
+#endif
+int
+sys_undelete(struct thread *td, struct undelete_args *uap)
+{
+ struct mount *mp;
+ struct nameidata nd;
+ int error;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+
+ if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Delete a name from the filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unlink_args {
+ char *path;
+};
+#endif
+int
+sys_unlink(struct thread *td, struct unlink_args *uap)
+{
+
+ return (kern_unlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct unlinkat_args {
+ int fd;
+ char *path;
+ int flag;
+};
+#endif
+int
+sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
+{
+ int flag = uap->flag;
+ int fd = uap->fd;
+ char *path = uap->path;
+
+ if (flag & ~AT_REMOVEDIR)
+ return (EINVAL);
+
+ if (flag & AT_REMOVEDIR)
+ return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
+ else
+ return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
+}
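
[Editor's note] sys_unlinkat() dispatches on AT_REMOVEDIR: directories go to
kern_rmdirat(), everything else to kern_unlinkat(). A short sketch of the
userspace side; the helper name is illustrative:

/* Illustrative sketch only. */
#include <fcntl.h>
#include <unistd.h>

/* Remove either a plain file or an empty directory through one entry point. */
int
remove_at(int dirfd, const char *name, int is_dir)
{
	return (unlinkat(dirfd, name, is_dir ? AT_REMOVEDIR : 0));
}
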
+
+int
+kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ ino_t oldinum)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct nameidata nd;
+ struct stat sb;
+ int error;
+
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, fd, &cap_unlinkat_rights, td);
+ if ((error = namei(&nd)) != 0)
+ return (error == EINVAL ? EPERM : error);
+ vp = nd.ni_vp;
+ if (vp->v_type == VDIR && oldinum == 0) {
+ error = EPERM; /* POSIX */
+ } else if (oldinum != 0 &&
+ ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
+ sb.st_ino != oldinum) {
+ error = EIDRM; /* Identifier removed */
+ } else {
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ *
+ * XXX: can this only be a VDIR case?
+ */
+ if (vp->v_vflag & VV_ROOT)
+ error = EBUSY;
+ }
+ if (error == 0) {
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (vp == nd.ni_dvp)
+ vrele(vp);
+ else
+ vput(vp);
+ if ((error = vn_start_write(NULL, &mp,
+ V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+#ifdef MAC
+ error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
+ &nd.ni_cnd);
+ if (error != 0)
+ goto out;
+#endif
+ vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
+ error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+#ifdef MAC
+out:
+#endif
+ vn_finished_write(mp);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (vp == nd.ni_dvp)
+ vrele(vp);
+ else
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lseek_args {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+};
+#endif
+int
+sys_lseek(struct thread *td, struct lseek_args *uap)
+{
+
+ return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
+}
+
+int
+kern_lseek(struct thread *td, int fd, off_t offset, int whence)
+{
+ struct file *fp;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ error = fget(td, fd, &cap_seek_rights, &fp);
+ if (error != 0)
+ return (error);
+ error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
+ fo_seek(fp, offset, whence, td) : ESPIPE;
+ fdrop(fp, td);
+ return (error);
+}
+
+#if defined(COMPAT_43)
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olseek_args {
+ int fd;
+ long offset;
+ int whence;
+};
+#endif
+int
+olseek(struct thread *td, struct olseek_args *uap)
+{
+
+ return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
+}
+#endif /* COMPAT_43 */
+
+#if defined(COMPAT_FREEBSD6)
+/* Version with the 'pad' argument */
+int
+freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap)
+{
+
+ return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
+}
+#endif
+
+/*
+ * Check access permissions using passed credentials.
+ */
+static int
+vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
+ struct thread *td)
+{
+ accmode_t accmode;
+ int error;
+
+ /* Flags == 0 means only check for existence. */
+ if (user_flags == 0)
+ return (0);
+
+ accmode = 0;
+ if (user_flags & R_OK)
+ accmode |= VREAD;
+ if (user_flags & W_OK)
+ accmode |= VWRITE;
+ if (user_flags & X_OK)
+ accmode |= VEXEC;
+#ifdef MAC
+ error = mac_vnode_check_access(cred, vp, accmode);
+ if (error != 0)
+ return (error);
+#endif
+ if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
+ error = VOP_ACCESS(vp, accmode, cred, td);
+ return (error);
+}
+
+/*
+ * Check access permissions using "real" credentials.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct access_args {
+ char *path;
+ int amode;
+};
+#endif
+int
+sys_access(struct thread *td, struct access_args *uap)
+{
+
+ return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ 0, uap->amode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct faccessat_args {
+	int	fd;
+ char *path;
+ int amode;
+ int flag;
+};
+#endif
+int
+sys_faccessat(struct thread *td, struct faccessat_args *uap)
+{
+
+ return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
+ uap->amode));
+}
+
+int
+kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int flag, int amode)
+{
+ struct ucred *cred, *usecred;
+ struct vnode *vp;
+ struct nameidata nd;
+ int error;
+
+ if (flag & ~AT_EACCESS)
+ return (EINVAL);
+ if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0)
+ return (EINVAL);
+
+ /*
+ * Create and modify a temporary credential instead of one that
+ * is potentially shared (if we need one).
+ */
+ cred = td->td_ucred;
+ if ((flag & AT_EACCESS) == 0 &&
+ ((cred->cr_uid != cred->cr_ruid ||
+ cred->cr_rgid != cred->cr_groups[0]))) {
+ usecred = crdup(cred);
+ usecred->cr_uid = cred->cr_ruid;
+ usecred->cr_groups[0] = cred->cr_rgid;
+ td->td_ucred = usecred;
+ } else
+ usecred = cred;
+ AUDIT_ARG_VALUE(amode);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
+ AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights,
+ td);
+ if ((error = namei(&nd)) != 0)
+ goto out;
+ vp = nd.ni_vp;
+
+ error = vn_access(vp, amode, usecred, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+out:
+ if (usecred != cred) {
+ td->td_ucred = cred;
+ crfree(usecred);
+ }
+ return (error);
+}
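
[Editor's note] Unless AT_EACCESS is given, kern_accessat() swaps in a
temporary credential carrying the real uid/gid (the crdup() branch above). A
sketch of how a set-uid program might use both modes; the helper names are
illustrative:

/* Illustrative sketch only. */
#include <fcntl.h>
#include <unistd.h>

/* Would the invoking (real) user be allowed to write the file? */
int
real_user_may_write(const char *path)
{
	return (faccessat(AT_FDCWD, path, W_OK, 0) == 0);
}

/* Would the effective credentials (e.g. the set-uid owner) be allowed? */
int
effective_user_may_write(const char *path)
{
	return (faccessat(AT_FDCWD, path, W_OK, AT_EACCESS) == 0);
}
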
+
+/*
+ * Check access permissions using "effective" credentials.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct eaccess_args {
+ char *path;
+ int amode;
+};
+#endif
+int
+sys_eaccess(struct thread *td, struct eaccess_args *uap)
+{
+
+ return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ AT_EACCESS, uap->amode));
+}
+
+#if defined(COMPAT_43)
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ostat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+int
+ostat(struct thread *td, struct ostat_args *uap)
+{
+ struct stat sb;
+ struct ostat osb;
+ int error;
+
+ error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
+ &sb, NULL);
+ if (error != 0)
+ return (error);
+ cvtstat(&sb, &osb);
+ return (copyout(&osb, uap->ub, sizeof (osb)));
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olstat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+int
+olstat(struct thread *td, struct olstat_args *uap)
+{
+ struct stat sb;
+ struct ostat osb;
+ int error;
+
+ error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
+ UIO_USERSPACE, &sb, NULL);
+ if (error != 0)
+ return (error);
+ cvtstat(&sb, &osb);
+ return (copyout(&osb, uap->ub, sizeof (osb)));
+}
+
+/*
+ * Convert from an old to a new stat structure.
+ * XXX: many values are blindly truncated.
+ */
+void
+cvtstat(struct stat *st, struct ostat *ost)
+{
+
+ bzero(ost, sizeof(*ost));
+ ost->st_dev = st->st_dev;
+ ost->st_ino = st->st_ino;
+ ost->st_mode = st->st_mode;
+ ost->st_nlink = st->st_nlink;
+ ost->st_uid = st->st_uid;
+ ost->st_gid = st->st_gid;
+ ost->st_rdev = st->st_rdev;
+ ost->st_size = MIN(st->st_size, INT32_MAX);
+ ost->st_atim = st->st_atim;
+ ost->st_mtim = st->st_mtim;
+ ost->st_ctim = st->st_ctim;
+ ost->st_blksize = st->st_blksize;
+ ost->st_blocks = st->st_blocks;
+ ost->st_flags = st->st_flags;
+ ost->st_gen = st->st_gen;
+}
+#endif /* COMPAT_43 */
+
+#if defined(COMPAT_43) || defined(COMPAT_FREEBSD11)
+int ino64_trunc_error;
+SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW,
+ &ino64_trunc_error, 0,
+ "Error on truncation of device, file or inode number, or link count");
+
+int
+freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost)
+{
+
+ ost->st_dev = st->st_dev;
+ if (ost->st_dev != st->st_dev) {
+ switch (ino64_trunc_error) {
+ default:
+ /*
+ * Since dev_t is almost raw, don't clamp to the
+ * maximum for case 2, but ignore the error.
+ */
+ break;
+ case 1:
+ return (EOVERFLOW);
+ }
+ }
+ ost->st_ino = st->st_ino;
+ if (ost->st_ino != st->st_ino) {
+ switch (ino64_trunc_error) {
+ default:
+ case 0:
+ break;
+ case 1:
+ return (EOVERFLOW);
+ case 2:
+ ost->st_ino = UINT32_MAX;
+ break;
+ }
+ }
+ ost->st_mode = st->st_mode;
+ ost->st_nlink = st->st_nlink;
+ if (ost->st_nlink != st->st_nlink) {
+ switch (ino64_trunc_error) {
+ default:
+ case 0:
+ break;
+ case 1:
+ return (EOVERFLOW);
+ case 2:
+ ost->st_nlink = UINT16_MAX;
+ break;
+ }
+ }
+ ost->st_uid = st->st_uid;
+ ost->st_gid = st->st_gid;
+ ost->st_rdev = st->st_rdev;
+ if (ost->st_rdev != st->st_rdev) {
+ switch (ino64_trunc_error) {
+ default:
+ break;
+ case 1:
+ return (EOVERFLOW);
+ }
+ }
+ ost->st_atim = st->st_atim;
+ ost->st_mtim = st->st_mtim;
+ ost->st_ctim = st->st_ctim;
+ ost->st_size = st->st_size;
+ ost->st_blocks = st->st_blocks;
+ ost->st_blksize = st->st_blksize;
+ ost->st_flags = st->st_flags;
+ ost->st_gen = st->st_gen;
+ ost->st_lspare = 0;
+ ost->st_birthtim = st->st_birthtim;
+ bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim),
+ sizeof(*ost) - offsetof(struct freebsd11_stat,
+ st_birthtim) - sizeof(ost->st_birthtim));
+ return (0);
+}
+
+int
+freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap)
+{
+ struct stat sb;
+ struct freebsd11_stat osb;
+ int error;
+
+ error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
+ &sb, NULL);
+ if (error != 0)
+ return (error);
+ error = freebsd11_cvtstat(&sb, &osb);
+ if (error == 0)
+ error = copyout(&osb, uap->ub, sizeof(osb));
+ return (error);
+}
+
+int
+freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap)
+{
+ struct stat sb;
+ struct freebsd11_stat osb;
+ int error;
+
+ error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
+ UIO_USERSPACE, &sb, NULL);
+ if (error != 0)
+ return (error);
+ error = freebsd11_cvtstat(&sb, &osb);
+ if (error == 0)
+ error = copyout(&osb, uap->ub, sizeof(osb));
+ return (error);
+}
+
+int
+freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap)
+{
+ struct fhandle fh;
+ struct stat sb;
+ struct freebsd11_stat osb;
+ int error;
+
+ error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
+ if (error != 0)
+ return (error);
+ error = kern_fhstat(td, fh, &sb);
+ if (error != 0)
+ return (error);
+ error = freebsd11_cvtstat(&sb, &osb);
+ if (error == 0)
+ error = copyout(&osb, uap->sb, sizeof(osb));
+ return (error);
+}
+
+int
+freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap)
+{
+ struct stat sb;
+ struct freebsd11_stat osb;
+ int error;
+
+ error = kern_statat(td, uap->flag, uap->fd, uap->path,
+ UIO_USERSPACE, &sb, NULL);
+ if (error != 0)
+ return (error);
+ error = freebsd11_cvtstat(&sb, &osb);
+ if (error == 0)
+ error = copyout(&osb, uap->buf, sizeof(osb));
+ return (error);
+}
+#endif /* COMPAT_FREEBSD11 */
+
+/*
+ * Get file status
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstatat_args {
+ int fd;
+ char *path;
+ struct stat *buf;
+ int flag;
+};
+#endif
+int
+sys_fstatat(struct thread *td, struct fstatat_args *uap)
+{
+ struct stat sb;
+ int error;
+
+ error = kern_statat(td, uap->flag, uap->fd, uap->path,
+ UIO_USERSPACE, &sb, NULL);
+ if (error == 0)
+ error = copyout(&sb, uap->buf, sizeof (sb));
+ return (error);
+}
+
+int
+kern_statat(struct thread *td, int flag, int fd, char *path,
+ enum uio_seg pathseg, struct stat *sbp,
+ void (*hook)(struct vnode *vp, struct stat *sbp))
+{
+ struct nameidata nd;
+ struct stat sb;
+ int error;
+
+ if (flag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
+ FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
+ &cap_fstat_rights, td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
+ if (error == 0) {
+ SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode);
+ if (S_ISREG(sb.st_mode))
+ SDT_PROBE2(vfs, , stat, reg, path, pathseg);
+ if (__predict_false(hook != NULL))
+ hook(nd.ni_vp, &sb);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_vp);
+ if (error != 0)
+ return (error);
+#ifdef __STAT_TIME_T_EXT
+ sb.st_atim_ext = 0;
+ sb.st_mtim_ext = 0;
+ sb.st_ctim_ext = 0;
+ sb.st_btim_ext = 0;
+#endif
+ *sbp = sb;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrstat(&sb);
+#endif
+ return (0);
+}
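
[Editor's note] kern_statat() accepts only AT_SYMLINK_NOFOLLOW and turns it
into a NOFOLLOW lookup. A minimal userspace sketch; the helper name is
illustrative:

/* Illustrative sketch only. */
#include <sys/stat.h>
#include <fcntl.h>

/* Non-zero when the name itself is a symbolic link (NOFOLLOW lookup). */
int
is_symlink(const char *path)
{
	struct stat sb;

	if (fstatat(AT_FDCWD, path, &sb, AT_SYMLINK_NOFOLLOW) != 0)
		return (0);
	return (S_ISLNK(sb.st_mode));
}
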
+
+#if defined(COMPAT_FREEBSD11)
+/*
+ * Implementation of the NetBSD [l]stat() functions.
+ */
+void
+freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb)
+{
+
+ bzero(nsb, sizeof(*nsb));
+ nsb->st_dev = sb->st_dev;
+ nsb->st_ino = sb->st_ino;
+ nsb->st_mode = sb->st_mode;
+ nsb->st_nlink = sb->st_nlink;
+ nsb->st_uid = sb->st_uid;
+ nsb->st_gid = sb->st_gid;
+ nsb->st_rdev = sb->st_rdev;
+ nsb->st_atim = sb->st_atim;
+ nsb->st_mtim = sb->st_mtim;
+ nsb->st_ctim = sb->st_ctim;
+ nsb->st_size = sb->st_size;
+ nsb->st_blocks = sb->st_blocks;
+ nsb->st_blksize = sb->st_blksize;
+ nsb->st_flags = sb->st_flags;
+ nsb->st_gen = sb->st_gen;
+ nsb->st_birthtim = sb->st_birthtim;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd11_nstat_args {
+ char *path;
+ struct nstat *ub;
+};
+#endif
+int
+freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap)
+{
+ struct stat sb;
+ struct nstat nsb;
+ int error;
+
+ error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
+ &sb, NULL);
+ if (error != 0)
+ return (error);
+ freebsd11_cvtnstat(&sb, &nsb);
+ return (copyout(&nsb, uap->ub, sizeof (nsb)));
+}
+
+/*
+ * NetBSD lstat. Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd11_nlstat_args {
+ char *path;
+ struct nstat *ub;
+};
+#endif
+int
+freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap)
+{
+ struct stat sb;
+ struct nstat nsb;
+ int error;
+
+ error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
+ UIO_USERSPACE, &sb, NULL);
+ if (error != 0)
+ return (error);
+ freebsd11_cvtnstat(&sb, &nsb);
+ return (copyout(&nsb, uap->ub, sizeof (nsb)));
+}
+#endif /* COMPAT_FREEBSD11 */
+
+/*
+ * Get configurable pathname variables.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pathconf_args {
+ char *path;
+ int name;
+};
+#endif
+int
+sys_pathconf(struct thread *td, struct pathconf_args *uap)
+{
+ long value;
+ int error;
+
+ error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW,
+ &value);
+ if (error == 0)
+ td->td_retval[0] = value;
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct lpathconf_args {
+ char *path;
+ int name;
+};
+#endif
+int
+sys_lpathconf(struct thread *td, struct lpathconf_args *uap)
+{
+ long value;
+ int error;
+
+ error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
+ NOFOLLOW, &value);
+ if (error == 0)
+ td->td_retval[0] = value;
+ return (error);
+}
+
+int
+kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
+ u_long flags, long *valuep)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
+ pathseg, path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = VOP_PATHCONF(nd.ni_vp, name, valuep);
+ vput(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Return target name of a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readlink_args {
+ char *path;
+ char *buf;
+ size_t count;
+};
+#endif
+int
+sys_readlink(struct thread *td, struct readlink_args *uap)
+{
+
+ return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->buf, UIO_USERSPACE, uap->count));
+}
+#ifndef _SYS_SYSPROTO_H_
+struct readlinkat_args {
+ int fd;
+ char *path;
+ char *buf;
+ size_t bufsize;
+};
+#endif
+int
+sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
+{
+
+ return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
+ uap->buf, UIO_USERSPACE, uap->bufsize));
+}
+
+int
+kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ char *buf, enum uio_seg bufseg, size_t count)
+{
+ struct vnode *vp;
+ struct nameidata nd;
+ int error;
+
+ if (count > IOSIZE_MAX)
+ return (EINVAL);
+
+ NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, fd, td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+
+ error = kern_readlink_vp(vp, buf, bufseg, count, td);
+ vput(vp);
+
+ return (error);
+}
+
+/*
+ * Helper function to readlink from a vnode
+ */
+static int
+kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count,
+ struct thread *td)
+{
+ struct iovec aiov;
+ struct uio auio;
+ int error;
+
+ ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked");
+#ifdef MAC
+ error = mac_vnode_check_readlink(td->td_ucred, vp);
+ if (error != 0)
+ return (error);
+#endif
+ if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0)
+ return (EINVAL);
+
+ aiov.iov_base = buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = bufseg;
+ auio.uio_td = td;
+ auio.uio_resid = count;
+ error = VOP_READLINK(vp, &auio, td->td_ucred);
+ td->td_retval[0] = count - auio.uio_resid;
+ return (error);
+}
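
[Editor's note] kern_readlink_vp() reports the number of bytes copied via
td_retval (count - uio_resid) and never NUL-terminates the buffer, so callers
must terminate it themselves. A hedged userspace sketch; the helper name is
illustrative:

/* Illustrative sketch only. */
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>

/* Read a link target into buf and NUL-terminate it; bufsz must be > 0. */
ssize_t
read_link_string(int dirfd, const char *name, char *buf, size_t bufsz)
{
	ssize_t n;

	if (bufsz == 0)
		return (-1);
	n = readlinkat(dirfd, name, buf, bufsz - 1);
	if (n >= 0)
		buf[n] = '\0';
	return (n);
}
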
+
+/*
+ * Common implementation code for chflags() and fchflags().
+ */
+static int
+setfflags(struct thread *td, struct vnode *vp, u_long flags)
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+
+ /* We can't support the value matching VNOVAL. */
+ if (flags == VNOVAL)
+ return (EOPNOTSUPP);
+
+ /*
+ * Prevent non-root users from setting flags on devices. When
+ * a device is reused, users can retain ownership of the device
+ * if they are allowed to set flags and programs assume that
+ * chown can't fail when done as root.
+ */
+ if (vp->v_type == VCHR || vp->v_type == VBLK) {
+ error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
+ if (error != 0)
+ return (error);
+ }
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VATTR_NULL(&vattr);
+ vattr.va_flags = flags;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef MAC
+ error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
+ if (error == 0)
+#endif
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Change flags of a file given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chflags_args {
+ const char *path;
+ u_long flags;
+};
+#endif
+int
+sys_chflags(struct thread *td, struct chflags_args *uap)
+{
+
+ return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->flags, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct chflagsat_args {
+ int fd;
+ const char *path;
+ u_long flags;
+ int atflag;
+};
+#endif
+int
+sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
+{
+ int fd = uap->fd;
+ const char *path = uap->path;
+ u_long flags = uap->flags;
+ int atflag = uap->atflag;
+
+ if (atflag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
+}
+
+/*
+ * Same as chflags() but doesn't follow symlinks.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchflags_args {
+ const char *path;
+ u_long flags;
+};
+#endif
+int
+sys_lchflags(struct thread *td, struct lchflags_args *uap)
+{
+
+ return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->flags, AT_SYMLINK_NOFOLLOW));
+}
+
+static int
+kern_chflagsat(struct thread *td, int fd, const char *path,
+ enum uio_seg pathseg, u_long flags, int atflag)
+{
+ struct nameidata nd;
+ int error, follow;
+
+ AUDIT_ARG_FFLAGS(flags);
+ follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+ &cap_fchflags_rights, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfflags(td, nd.ni_vp, flags);
+ vrele(nd.ni_vp);
+ return (error);
+}
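
[Editor's note] setfflags() replaces the whole flag word, so a caller that
wants to add one flag has to merge it with the current st_flags first. A
hedged sketch using the documented chflagsat(2) interface; the helper name is
illustrative:

/* Illustrative sketch only. */
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/* Mark a file "do not dump" while preserving its other flags. */
int
mark_nodump(const char *path)
{
	struct stat sb;

	if (stat(path, &sb) != 0)
		return (-1);
	return (chflagsat(AT_FDCWD, path, sb.st_flags | UF_NODUMP, 0));
}
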
+
+/*
+ * Change flags of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchflags_args {
+ int fd;
+ u_long flags;
+};
+#endif
+int
+sys_fchflags(struct thread *td, struct fchflags_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_FFLAGS(uap->flags);
+ error = getvnode(td, uap->fd, &cap_fchflags_rights,
+ &fp);
+ if (error != 0)
+ return (error);
+#ifdef AUDIT
+ vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(fp->f_vnode);
+ VOP_UNLOCK(fp->f_vnode, 0);
+#endif
+ error = setfflags(td, fp->f_vnode, uap->flags);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation code for chmod(), lchmod() and fchmod().
+ */
+int
+setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode)
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VATTR_NULL(&vattr);
+ vattr.va_mode = mode & ALLPERMS;
+#ifdef MAC
+ error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
+ if (error == 0)
+#endif
+ error = VOP_SETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chmod_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+sys_chmod(struct thread *td, struct chmod_args *uap)
+{
+
+ return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->mode, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fchmodat_args {
+	int	fd;
+ char *path;
+ mode_t mode;
+ int flag;
+};
+#endif
+int
+sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
+{
+ int flag = uap->flag;
+ int fd = uap->fd;
+ char *path = uap->path;
+ mode_t mode = uap->mode;
+
+ if (flag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
+}
+
+/*
+ * Change mode of a file given path name (don't follow links.)
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchmod_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+sys_lchmod(struct thread *td, struct lchmod_args *uap)
+{
+
+ return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->mode, AT_SYMLINK_NOFOLLOW));
+}
+
+int
+kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ mode_t mode, int flag)
+{
+ struct nameidata nd;
+ int error, follow;
+
+ AUDIT_ARG_MODE(mode);
+ follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+ &cap_fchmod_rights, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchmod_args {
+ int fd;
+ int mode;
+};
+#endif
+int
+sys_fchmod(struct thread *td, struct fchmod_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_MODE(uap->mode);
+
+ error = fget(td, uap->fd, &cap_fchmod_rights, &fp);
+ if (error != 0)
+ return (error);
+ error = fo_chmod(fp, uap->mode, td->td_ucred, td);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation for chown(), lchown(), and fchown()
+ */
+int
+setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
+ gid_t gid)
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VATTR_NULL(&vattr);
+ vattr.va_uid = uid;
+ vattr.va_gid = gid;
+#ifdef MAC
+ error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
+ vattr.va_gid);
+ if (error == 0)
+#endif
+ error = VOP_SETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Set ownership given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+int
+sys_chown(struct thread *td, struct chown_args *uap)
+{
+
+ return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid,
+ uap->gid, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fchownat_args {
+ int fd;
+ const char * path;
+ uid_t uid;
+ gid_t gid;
+ int flag;
+};
+#endif
+int
+sys_fchownat(struct thread *td, struct fchownat_args *uap)
+{
+ int flag;
+
+ flag = uap->flag;
+ if (flag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
+ uap->gid, uap->flag));
+}
+
+int
+kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int uid, int gid, int flag)
+{
+ struct nameidata nd;
+ int error, follow;
+
+ AUDIT_ARG_OWNER(uid, gid);
+ follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+ &cap_fchown_rights, td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set ownership given a path name, do not cross symlinks.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+int
+sys_lchown(struct thread *td, struct lchown_args *uap)
+{
+
+ return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW));
+}
+
+/*
+ * Set ownership given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchown_args {
+ int fd;
+ int uid;
+ int gid;
+};
+#endif
+int
+sys_fchown(struct thread *td, struct fchown_args *uap)
+{
+ struct file *fp;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_OWNER(uap->uid, uap->gid);
+ error = fget(td, uap->fd, &cap_fchown_rights, &fp);
+ if (error != 0)
+ return (error);
+ error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation code for utimes(), lutimes(), and futimes().
+ */
+static int
+getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg,
+ struct timespec *tsp)
+{
+ struct timeval tv[2];
+ const struct timeval *tvp;
+ int error;
+
+ if (usrtvp == NULL) {
+ vfs_timestamp(&tsp[0]);
+ tsp[1] = tsp[0];
+ } else {
+ if (tvpseg == UIO_SYSSPACE) {
+ tvp = usrtvp;
+ } else {
+ if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
+ return (error);
+ tvp = tv;
+ }
+
+ if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
+ tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
+ return (EINVAL);
+ TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
+ TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
+ }
+ return (0);
+}
+
+/*
+ * Common implementation code for futimens(), utimensat().
+ */
+#define UTIMENS_NULL 0x1
+#define UTIMENS_EXIT 0x2
+static int
+getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
+ struct timespec *tsp, int *retflags)
+{
+ struct timespec tsnow;
+ int error;
+
+ vfs_timestamp(&tsnow);
+ *retflags = 0;
+ if (usrtsp == NULL) {
+ tsp[0] = tsnow;
+ tsp[1] = tsnow;
+ *retflags |= UTIMENS_NULL;
+ return (0);
+ }
+ if (tspseg == UIO_SYSSPACE) {
+ tsp[0] = usrtsp[0];
+ tsp[1] = usrtsp[1];
+ } else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
+ return (error);
+ if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
+ *retflags |= UTIMENS_EXIT;
+ if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
+ *retflags |= UTIMENS_NULL;
+ if (tsp[0].tv_nsec == UTIME_OMIT)
+ tsp[0].tv_sec = VNOVAL;
+ else if (tsp[0].tv_nsec == UTIME_NOW)
+ tsp[0] = tsnow;
+ else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
+ return (EINVAL);
+ if (tsp[1].tv_nsec == UTIME_OMIT)
+ tsp[1].tv_sec = VNOVAL;
+ else if (tsp[1].tv_nsec == UTIME_NOW)
+ tsp[1] = tsnow;
+ else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
+ return (EINVAL);
+
+ return (0);
+}
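
[Editor's note] getutimens() turns UTIME_OMIT into VNOVAL and UTIME_NOW into
the current timestamp, and raises UTIMENS_EXIT/UTIMENS_NULL so callers can skip
the update or relax the permission check. A userspace sketch that updates only
the modification time; the helper name is illustrative:

/* Illustrative sketch only. */
#include <sys/stat.h>
#include <fcntl.h>

/* "Touch" only the modification time, leaving the access time alone. */
int
touch_mtime_only(const char *path)
{
	struct timespec ts[2];

	ts[0].tv_sec = 0;
	ts[0].tv_nsec = UTIME_OMIT;	/* atime: leave unchanged */
	ts[1].tv_sec = 0;
	ts[1].tv_nsec = UTIME_NOW;	/* mtime: set to "now" */
	return (utimensat(AT_FDCWD, path, ts, 0));
}
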
+
+/*
+ * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
+ * and utimensat().
+ */
+static int
+setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts,
+ int numtimes, int nullflag)
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error, setbirthtime;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ setbirthtime = 0;
+ if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
+ timespeccmp(&ts[1], &vattr.va_birthtime, < ))
+ setbirthtime = 1;
+ VATTR_NULL(&vattr);
+ vattr.va_atime = ts[0];
+ vattr.va_mtime = ts[1];
+ if (setbirthtime)
+ vattr.va_birthtime = ts[1];
+ if (numtimes > 2)
+ vattr.va_birthtime = ts[2];
+ if (nullflag)
+ vattr.va_vaflags |= VA_UTIMES_NULL;
+#ifdef MAC
+ error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
+ vattr.va_mtime);
+#endif
+ if (error == 0)
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct utimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+int
+sys_utimes(struct thread *td, struct utimes_args *uap)
+{
+
+ return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->tptr, UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct futimesat_args {
+ int fd;
+ const char * path;
+ const struct timeval * times;
+};
+#endif
+int
+sys_futimesat(struct thread *td, struct futimesat_args *uap)
+{
+
+ return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
+ uap->times, UIO_USERSPACE));
+}
+
+int
+kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ struct timeval *tptr, enum uio_seg tptrseg)
+{
+ struct nameidata nd;
+ struct timespec ts[2];
+ int error;
+
+ if ((error = getutimes(tptr, tptrseg, ts)) != 0)
+ return (error);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
+ &cap_futimes_rights, td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lutimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+int
+sys_lutimes(struct thread *td, struct lutimes_args *uap)
+{
+
+ return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
+ UIO_USERSPACE));
+}
+
+int
+kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
+ struct timeval *tptr, enum uio_seg tptrseg)
+{
+ struct timespec ts[2];
+ struct nameidata nd;
+ int error;
+
+ if ((error = getutimes(tptr, tptrseg, ts)) != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct futimes_args {
+ int fd;
+ struct timeval *tptr;
+};
+#endif
+int
+sys_futimes(struct thread *td, struct futimes_args *uap)
+{
+
+ return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
+}
+
+int
+kern_futimes(struct thread *td, int fd, struct timeval *tptr,
+ enum uio_seg tptrseg)
+{
+ struct timespec ts[2];
+ struct file *fp;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ error = getutimes(tptr, tptrseg, ts);
+ if (error != 0)
+ return (error);
+ error = getvnode(td, fd, &cap_futimes_rights, &fp);
+ if (error != 0)
+ return (error);
+#ifdef AUDIT
+ vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(fp->f_vnode);
+ VOP_UNLOCK(fp->f_vnode, 0);
+#endif
+ error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_futimens(struct thread *td, struct futimens_args *uap)
+{
+
+ return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
+}
+
+int
+kern_futimens(struct thread *td, int fd, struct timespec *tptr,
+ enum uio_seg tptrseg)
+{
+ struct timespec ts[2];
+ struct file *fp;
+ int error, flags;
+
+ AUDIT_ARG_FD(fd);
+ error = getutimens(tptr, tptrseg, ts, &flags);
+ if (error != 0)
+ return (error);
+ if (flags & UTIMENS_EXIT)
+ return (0);
+ error = getvnode(td, fd, &cap_futimes_rights, &fp);
+ if (error != 0)
+ return (error);
+#ifdef AUDIT
+ vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(fp->f_vnode);
+ VOP_UNLOCK(fp->f_vnode, 0);
+#endif
+ error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_utimensat(struct thread *td, struct utimensat_args *uap)
+{
+
+ return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
+ uap->times, UIO_USERSPACE, uap->flag));
+}
+
+int
+kern_utimensat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ struct timespec *tptr, enum uio_seg tptrseg, int flag)
+{
+ struct nameidata nd;
+ struct timespec ts[2];
+ int error, flags;
+
+ if (flag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
+ return (error);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
+ FOLLOW) | AUDITVNODE1, pathseg, path, fd,
+ &cap_futimes_rights, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ /*
+ * We are allowed to call namei() regardless of 2xUTIME_OMIT.
+ * POSIX states:
+	 * "If both tv_nsec fields are UTIME_OMIT... EACCES may be detected."
+ * "Search permission is denied by a component of the path prefix."
+ */
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if ((flags & UTIMENS_EXIT) == 0)
+ error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct truncate_args {
+ char *path;
+ int pad;
+ off_t length;
+};
+#endif
+int
+sys_truncate(struct thread *td, struct truncate_args *uap)
+{
+
+ return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
+}
+
+int
+kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ void *rl_cookie;
+ struct vattr vattr;
+ struct nameidata nd;
+ int error;
+
+ if (length < 0)
+ return(EINVAL);
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ vn_rangelock_unlock(vp, rl_cookie);
+ vrele(vp);
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+#ifdef MAC
+ else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
+ }
+#endif
+ else if ((error = vn_writechk(vp)) == 0 &&
+ (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = length;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred);
+ }
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ vn_rangelock_unlock(vp, rl_cookie);
+ vrele(vp);
+ return (error);
+}
+
+#if defined(COMPAT_43)
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct otruncate_args {
+ char *path;
+ long length;
+};
+#endif
+int
+otruncate(struct thread *td, struct otruncate_args *uap)
+{
+
+ return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
+}
+#endif /* COMPAT_43 */
+
+#if defined(COMPAT_FREEBSD6)
+/* Versions with the pad argument */
+int
+freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
+{
+
+ return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
+}
+
+int
+freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
+{
+
+ return (kern_ftruncate(td, uap->fd, uap->length));
+}
+#endif
+
+int
+kern_fsync(struct thread *td, int fd, bool fullsync)
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct file *fp;
+ int error, lock_flags;
+
+ AUDIT_ARG_FD(fd);
+ error = getvnode(td, fd, &cap_fsync_rights, &fp);
+ if (error != 0)
+ return (error);
+ vp = fp->f_vnode;
+#if 0
+ if (!fullsync)
+		/* XXXKIB: complete outstanding aio writes */;
+#endif
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ goto drop;
+ if (MNT_SHARED_WRITES(mp) ||
+ ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
+ lock_flags = LK_SHARED;
+ } else {
+ lock_flags = LK_EXCLUSIVE;
+ }
+ vn_lock(vp, lock_flags | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ if (vp->v_object != NULL) {
+ VM_OBJECT_WLOCK(vp->v_object);
+ vm_object_page_clean(vp->v_object, 0, 0, 0);
+ VM_OBJECT_WUNLOCK(vp->v_object);
+ }
+ error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+drop:
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Sync an open file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fsync_args {
+ int fd;
+};
+#endif
+int
+sys_fsync(struct thread *td, struct fsync_args *uap)
+{
+
+ return (kern_fsync(td, uap->fd, true));
+}
+
+int
+sys_fdatasync(struct thread *td, struct fdatasync_args *uap)
+{
+
+ return (kern_fsync(td, uap->fd, false));
+}
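
A minimal userspace sketch of the two entry points above (the file name is a
placeholder): fsync(2) maps to kern_fsync(td, fd, true) and performs a full
VOP_FSYNC(), while fdatasync(2) maps to kern_fsync(td, fd, false) and uses
VOP_FDATASYNC().

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("journal.log", O_CREAT | O_WRONLY | O_APPEND, 0644);

	if (fd < 0) {
		perror("open");
		return (1);
	}
	if (write(fd, "record\n", 7) != 7)
		perror("write");
	/* Full sync: data and metadata. */
	if (fsync(fd) != 0)
		perror("fsync");
	/* Data-only sync. */
	if (fdatasync(fd) != 0)
		perror("fdatasync");
	close(fd);
	return (0);
}
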
+
+/*
+ * Rename files. Source and destination must either both be directories, or
+ * both not be directories. If target is a directory, it must be empty.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rename_args {
+ char *from;
+ char *to;
+};
+#endif
+int
+sys_rename(struct thread *td, struct rename_args *uap)
+{
+
+ return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD,
+ uap->to, UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct renameat_args {
+ int oldfd;
+ char *old;
+ int newfd;
+ char *new;
+};
+#endif
+int
+sys_renameat(struct thread *td, struct renameat_args *uap)
+{
+
+ return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
+ UIO_USERSPACE));
+}
+
+int
+kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
+ enum uio_seg pathseg)
+{
+ struct mount *mp = NULL;
+ struct vnode *tvp, *fvp, *tdvp;
+ struct nameidata fromnd, tond;
+ int error;
+
+again:
+ bwillwrite();
+#ifdef MAC
+ NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
+ AUDITVNODE1, pathseg, old, oldfd,
+ &cap_renameat_source_rights, td);
+#else
+ NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
+ pathseg, old, oldfd,
+ &cap_renameat_source_rights, td);
+#endif
+
+ if ((error = namei(&fromnd)) != 0)
+ return (error);
+#ifdef MAC
+ error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
+ fromnd.ni_vp, &fromnd.ni_cnd);
+ VOP_UNLOCK(fromnd.ni_dvp, 0);
+ if (fromnd.ni_dvp != fromnd.ni_vp)
+ VOP_UNLOCK(fromnd.ni_vp, 0);
+#endif
+ fvp = fromnd.ni_vp;
+ NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
+ SAVESTART | AUDITVNODE2, pathseg, new, newfd,
+ &cap_renameat_target_rights, td);
+ if (fromnd.ni_vp->v_type == VDIR)
+ tond.ni_cnd.cn_flags |= WILLBEDIR;
+ if ((error = namei(&tond)) != 0) {
+ /* Translate error code for rename("dir1", "dir2/."). */
+ if (error == EISDIR && fvp->v_type == VDIR)
+ error = EINVAL;
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ goto out1;
+ }
+ tdvp = tond.ni_dvp;
+ tvp = tond.ni_vp;
+ error = vn_start_write(fvp, &mp, V_NOWAIT);
+ if (error != 0) {
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ NDFREE(&tond, NDF_ONLY_PNBUF);
+ if (tvp != NULL)
+ vput(tvp);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ vrele(tond.ni_startdir);
+ if (fromnd.ni_startdir != NULL)
+ vrele(fromnd.ni_startdir);
+ error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
+ if (error != 0)
+ return (error);
+ goto again;
+ }
+ if (tvp != NULL) {
+ if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+#ifdef CAPABILITIES
+ if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) {
+ /*
+ * If the target already exists we require CAP_UNLINKAT
+ * from 'newfd', when newfd was used for the lookup.
+ */
+ error = cap_check(&tond.ni_filecaps.fc_rights,
+ &cap_unlinkat_rights);
+ if (error != 0)
+ goto out;
+ }
+#endif
+ }
+ if (fvp == tdvp) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * If the source is the same as the destination (that is, if they
+ * are links to the same vnode), then there is nothing to do.
+ */
+ if (fvp == tvp)
+ error = -1;
+#ifdef MAC
+ else
+ error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
+ tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
+#endif
+out:
+ if (error == 0) {
+ error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
+ tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ NDFREE(&tond, NDF_ONLY_PNBUF);
+ } else {
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ NDFREE(&tond, NDF_ONLY_PNBUF);
+ if (tvp != NULL)
+ vput(tvp);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ }
+ vrele(tond.ni_startdir);
+ vn_finished_write(mp);
+out1:
+ if (fromnd.ni_startdir)
+ vrele(fromnd.ni_startdir);
+ if (error == -1)
+ return (0);
+ return (error);
+}
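
A small userspace sketch, using placeholder names, showing that rename(2) is
just renameat(2) with AT_FDCWD on both sides, exactly as sys_rename() above
forwards to kern_renameat().

#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	/* Equivalent to renameat(AT_FDCWD, "old.txt", AT_FDCWD, "new.txt"). */
	if (rename("old.txt", "new.txt") != 0)
		perror("rename");
	/* Rename it back through the *at variant. */
	if (renameat(AT_FDCWD, "new.txt", AT_FDCWD, "old.txt") != 0)
		perror("renameat");
	return (0);
}
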
+
+/*
+ * Make a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkdir_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+sys_mkdir(struct thread *td, struct mkdir_args *uap)
+{
+
+ return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->mode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct mkdirat_args {
+ int fd;
+ char *path;
+ mode_t mode;
+};
+#endif
+int
+sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
+{
+
+ return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
+}
+
+int
+kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
+ int mode)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct vattr vattr;
+ struct nameidata nd;
+ int error;
+
+ AUDIT_ARG_MODE(mode);
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
+ NOCACHE, segflg, path, fd, &cap_mkdirat_rights,
+ td);
+ nd.ni_cnd.cn_flags |= WILLBEDIR;
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ /*
+ * XXX namei called with LOCKPARENT but not LOCKLEAF has
+ * the strange behaviour of leaving the vnode unlocked
+ * if the target is the same vnode as the parent.
+ */
+ if (vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(vp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VDIR;
+ vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
+#ifdef MAC
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+ &vattr);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+#ifdef MAC
+out:
+#endif
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (error == 0)
+ vput(nd.ni_vp);
+ vn_finished_write(mp);
+ return (error);
+}
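
A short userspace sketch (the directory name is a placeholder) of the
mode/umask interaction applied in kern_mkdirat() above: the requested mode is
masked with ACCESSPERMS and the process file-creation mask, so 0777 with a
umask of 022 yields a directory created with mode 0755.

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int
main(void)
{
	umask(022);
	/* Effective mode: (0777 & ACCESSPERMS) & ~022 == 0755. */
	if (mkdirat(AT_FDCWD, "newdir", 0777) != 0)
		perror("mkdirat");
	return (0);
}
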
+
+/*
+ * Remove a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rmdir_args {
+ char *path;
+};
+#endif
+int
+sys_rmdir(struct thread *td, struct rmdir_args *uap)
+{
+
+ return (kern_rmdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE));
+}
+
+int
+kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct nameidata nd;
+ int error;
+
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, fd, &cap_unlinkat_rights, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ /*
+ * No rmdir "." please.
+ */
+ if (nd.ni_dvp == vp) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ */
+ if (vp->v_vflag & VV_ROOT) {
+ error = EBUSY;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
+ &nd.ni_cnd);
+ if (error != 0)
+ goto out;
+#endif
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
+ error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
+ vn_finished_write(mp);
+out:
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ return (error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_FREEBSD11)
+int
+freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count,
+ long *basep, void (*func)(struct freebsd11_dirent *))
+{
+ struct freebsd11_dirent dstdp;
+ struct dirent *dp, *edp;
+ char *dirbuf;
+ off_t base;
+ ssize_t resid, ucount;
+ int error;
+
+ /* XXX arbitrary sanity limit on `count'. */
+ count = min(count, 64 * 1024);
+
+ dirbuf = malloc(count, M_TEMP, M_WAITOK);
+
+ error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid,
+ UIO_SYSSPACE);
+ if (error != 0)
+ goto done;
+ if (basep != NULL)
+ *basep = base;
+
+ ucount = 0;
+ for (dp = (struct dirent *)dirbuf,
+ edp = (struct dirent *)&dirbuf[count - resid];
+ ucount < count && dp < edp; ) {
+ if (dp->d_reclen == 0)
+ break;
+ MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0));
+ if (dp->d_namlen >= sizeof(dstdp.d_name))
+ continue;
+ dstdp.d_type = dp->d_type;
+ dstdp.d_namlen = dp->d_namlen;
+ dstdp.d_fileno = dp->d_fileno; /* truncate */
+ if (dstdp.d_fileno != dp->d_fileno) {
+ switch (ino64_trunc_error) {
+ default:
+ case 0:
+ break;
+ case 1:
+ error = EOVERFLOW;
+ goto done;
+ case 2:
+ dstdp.d_fileno = UINT32_MAX;
+ break;
+ }
+ }
+ dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) +
+ ((dp->d_namlen + 1 + 3) &~ 3);
+ bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen);
+ bzero(dstdp.d_name + dstdp.d_namlen,
+ dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) -
+ dstdp.d_namlen);
+ MPASS(dstdp.d_reclen <= dp->d_reclen);
+ MPASS(ucount + dstdp.d_reclen <= count);
+ if (func != NULL)
+ func(&dstdp);
+ error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen);
+ if (error != 0)
+ break;
+ dp = (struct dirent *)((char *)dp + dp->d_reclen);
+ ucount += dstdp.d_reclen;
+ }
+
+done:
+ free(dirbuf, M_TEMP);
+ if (error == 0)
+ td->td_retval[0] = ucount;
+ return (error);
+}
+#endif /* COMPAT */
+
+#ifdef COMPAT_43
+static void
+ogetdirentries_cvt(struct freebsd11_dirent *dp)
+{
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+ /*
+ * The expected low byte of dp->d_namlen is our dp->d_type.
+ * The high MBZ byte of dp->d_namlen is our dp->d_namlen.
+ */
+ dp->d_type = dp->d_namlen;
+ dp->d_namlen = 0;
+#else
+ /*
+ * The dp->d_type is the high byte of the expected dp->d_namlen,
+ * so must be zero'ed.
+ */
+ dp->d_type = 0;
+#endif
+}
+
+/*
+ * Read a block of directory entries in a filesystem independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ogetdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
+{
+ long loff;
+ int error;
+
+ error = kern_ogetdirentries(td, uap, &loff);
+ if (error == 0)
+ error = copyout(&loff, uap->basep, sizeof(long));
+ return (error);
+}
+
+int
+kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
+ long *ploff)
+{
+ long base;
+ int error;
+
+ /* XXX arbitrary sanity limit on `count'. */
+ if (uap->count > 64 * 1024)
+ return (EINVAL);
+
+ error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
+ &base, ogetdirentries_cvt);
+
+ if (error == 0 && uap->basep != NULL)
+ error = copyout(&base, uap->basep, sizeof(long));
+
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+#if defined(COMPAT_FREEBSD11)
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd11_getdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+freebsd11_getdirentries(struct thread *td,
+ struct freebsd11_getdirentries_args *uap)
+{
+ long base;
+ int error;
+
+ error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
+ &base, NULL);
+
+ if (error == 0 && uap->basep != NULL)
+ error = copyout(&base, uap->basep, sizeof(long));
+ return (error);
+}
+
+int
+freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap)
+{
+ struct freebsd11_getdirentries_args ap;
+
+ ap.fd = uap->fd;
+ ap.buf = uap->buf;
+ ap.count = uap->count;
+ ap.basep = NULL;
+ return (freebsd11_getdirentries(td, &ap));
+}
+#endif /* COMPAT_FREEBSD11 */
+
+/*
+ * Read a block of directory entries in a filesystem independent format.
+ */
+int
+sys_getdirentries(struct thread *td, struct getdirentries_args *uap)
+{
+ off_t base;
+ int error;
+
+ error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
+ NULL, UIO_USERSPACE);
+ if (error != 0)
+ return (error);
+ if (uap->basep != NULL)
+ error = copyout(&base, uap->basep, sizeof(off_t));
+ return (error);
+}
+
+int
+kern_getdirentries(struct thread *td, int fd, char *buf, size_t count,
+ off_t *basep, ssize_t *residp, enum uio_seg bufseg)
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ off_t loff;
+ int error, eofflag;
+ off_t foffset;
+
+ AUDIT_ARG_FD(fd);
+ if (count > IOSIZE_MAX)
+ return (EINVAL);
+ auio.uio_resid = count;
+ error = getvnode(td, fd, &cap_read_rights, &fp);
+ if (error != 0)
+ return (error);
+ if ((fp->f_flag & FREAD) == 0) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+ vp = fp->f_vnode;
+ foffset = foffset_lock(fp, 0);
+unionread:
+ if (vp->v_type != VDIR) {
+ error = EINVAL;
+ goto fail;
+ }
+ aiov.iov_base = buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = bufseg;
+ auio.uio_td = td;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ loff = auio.uio_offset = foffset;
+#ifdef MAC
+ error = mac_vnode_check_readdir(td->td_ucred, vp);
+ if (error == 0)
+#endif
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
+ NULL);
+ foffset = auio.uio_offset;
+ if (error != 0) {
+ VOP_UNLOCK(vp, 0);
+ goto fail;
+ }
+ if (count == auio.uio_resid &&
+ (vp->v_vflag & VV_ROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_vnode = vp;
+ fp->f_data = vp;
+ foffset = 0;
+ vput(tvp);
+ goto unionread;
+ }
+ VOP_UNLOCK(vp, 0);
+ *basep = loff;
+ if (residp != NULL)
+ *residp = auio.uio_resid;
+ td->td_retval[0] = count - auio.uio_resid;
+fail:
+ foffset_unlock(fp, foffset, 0);
+ fdrop(fp, td);
+ return (error);
+}
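
For reference, a minimal userspace directory reader. On FreeBSD, libc's
readdir(3) is generally assumed to fill its buffer through getdirentries(2),
which is the kern_getdirentries() path above; treat that libc plumbing as an
assumption rather than something shown in this file.

#include <dirent.h>
#include <stdio.h>

int
main(void)
{
	DIR *dirp = opendir(".");
	struct dirent *dp;

	if (dirp == NULL) {
		perror("opendir");
		return (1);
	}
	while ((dp = readdir(dirp)) != NULL)
		printf("%s\n", dp->d_name);
	closedir(dirp);
	return (0);
}
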
+
+/*
+ * Set the mode mask for creation of filesystem nodes.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct umask_args {
+ int newmask;
+};
+#endif
+int
+sys_umask(struct thread *td, struct umask_args *uap)
+{
+ struct filedesc *fdp;
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+ td->td_retval[0] = fdp->fd_cmask;
+ fdp->fd_cmask = uap->newmask & ALLPERMS;
+ FILEDESC_XUNLOCK(fdp);
+ return (0);
+}
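
A tiny userspace sketch of the semantics above: umask(2) installs the new
mask and returns the previous one, which sys_umask() places in td_retval[0].

#include <stdio.h>
#include <sys/stat.h>

int
main(void)
{
	mode_t old = umask(027);

	printf("previous umask was %04o\n", (unsigned int)old);
	umask(old);	/* restore the original mask */
	return (0);
}
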
+
+/*
+ * Void all references to file by ripping underlying filesystem away from
+ * vnode.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct revoke_args {
+ char *path;
+};
+#endif
+int
+sys_revoke(struct thread *td, struct revoke_args *uap)
+{
+ struct vnode *vp;
+ struct vattr vattr;
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (vp->v_type != VCHR || vp->v_rdev == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_vnode_check_revoke(td->td_ucred, vp);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_GETATTR(vp, &vattr, td->td_ucred);
+ if (error != 0)
+ goto out;
+ if (td->td_ucred->cr_uid != vattr.va_uid) {
+ error = priv_check(td, PRIV_VFS_ADMIN);
+ if (error != 0)
+ goto out;
+ }
+ if (vcount(vp) > 1)
+ VOP_REVOKE(vp, REVOKEALL);
+out:
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Convert a user file descriptor to a kernel file entry and check that, if it
+ * is a capability, the correct rights are present. A reference on the file
+ * entry is held upon returning.
+ */
+int
+getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+ struct file *fp;
+ int error;
+
+ error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL);
+ if (error != 0)
+ return (error);
+
+ /*
+	 * The file might not be of the vnode type, or it may not yet be
+	 * fully initialized, in which case the f_vnode pointer may be
+	 * set, but f_ops is still badfileops.  E.g., devfs_open()
+	 * transiently creates such a situation to facilitate csw
+	 * d_fdopen().
+	 *
+	 * Dupfdopen() handling in kern_openat() installs the
+	 * half-baked file into the process descriptor table, allowing
+	 * another thread to dereference it.  Guard against the race by
+	 * checking f_ops.
+ */
+ if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ *fpp = fp;
+ return (0);
+}
+
+
+/*
+ * Get an (NFS) file handle.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lgetfh_args {
+ char *fname;
+ fhandle_t *fhp;
+};
+#endif
+int
+sys_lgetfh(struct thread *td, struct lgetfh_args *uap)
+{
+
+ return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname,
+ UIO_USERSPACE, uap->fhp));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getfh_args {
+ char *fname;
+ fhandle_t *fhp;
+};
+#endif
+int
+sys_getfh(struct thread *td, struct getfh_args *uap)
+{
+
+ return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE,
+ uap->fhp));
+}
+
+/*
+ * syscall for the rpc.lockd to use to translate an open descriptor into
+ * an NFS file handle.
+ *
+ * warning: do not remove the priv_check() call or this becomes one giant
+ * security hole.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getfhat_args {
+ int fd;
+ char *path;
+ fhandle_t *fhp;
+ int flags;
+};
+#endif
+int
+sys_getfhat(struct thread *td, struct getfhat_args *uap)
+{
+
+ if ((uap->flags & ~(AT_SYMLINK_NOFOLLOW)) != 0)
+ return (EINVAL);
+ return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE,
+ uap->fhp));
+}
+
+static int
+kern_getfhat(struct thread *td, int flags, int fd, const char *path,
+ enum uio_seg pathseg, fhandle_t *fhp)
+{
+ struct nameidata nd;
+ fhandle_t fh;
+ struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_GETFH);
+ if (error != 0)
+ return (error);
+ NDINIT_AT(&nd, LOOKUP, ((flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW :
+ FOLLOW) | /*((flags & AT_BENEATH) != 0 ? BENEATH : 0) |*/ LOCKLEAF |
+ AUDITVNODE1, pathseg, path, fd, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ bzero(&fh, sizeof(fh));
+ fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
+ error = VOP_VPTOFH(vp, &fh.fh_fid);
+ vput(vp);
+ if (error == 0)
+ error = copyout(&fh, fhp, sizeof (fh));
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fhlink_args {
+ fhandle_t *fhp;
+ const char *to;
+};
+#endif
+int
+sys_fhlink(struct thread *td, struct fhlink_args *uap)
+{
+
+ return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fhlinkat_args {
+ fhandle_t *fhp;
+ int tofd;
+ const char *to;
+};
+#endif
+int
+sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap)
+{
+
+ return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp));
+}
+
+static int
+kern_fhlinkat(struct thread *td, int fd, const char *path,
+ enum uio_seg pathseg, fhandle_t *fhp)
+{
+ fhandle_t fh;
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_GETFH);
+ if (error != 0)
+ return (error);
+ error = copyin(fhp, &fh, sizeof(fh));
+ if (error != 0)
+ return (error);
+ do {
+ bwillwrite();
+ if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
+ return (ESTALE);
+ error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp);
+ vfs_unbusy(mp);
+ if (error != 0)
+ return (error);
+ VOP_UNLOCK(vp, 0);
+ } while ((error = kern_linkat_vp(td, vp, fd, path, pathseg)) == EAGAIN);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fhreadlink_args {
+ fhandle_t *fhp;
+ char *buf;
+ size_t bufsize;
+};
+#endif
+int
+sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap)
+{
+ fhandle_t fh;
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_GETFH);
+ if (error != 0)
+ return (error);
+ if (uap->bufsize > IOSIZE_MAX)
+ return (EINVAL);
+ error = copyin(uap->fhp, &fh, sizeof(fh));
+ if (error != 0)
+ return (error);
+ if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
+ return (ESTALE);
+ error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp);
+ vfs_unbusy(mp);
+ if (error != 0)
+ return (error);
+ error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td);
+ vput(vp);
+ return (error);
+}
+
+/*
+ * syscall for the rpc.lockd to use to translate an NFS file handle into an
+ * open descriptor.
+ *
+ * warning: do not remove the priv_check() call or this becomes one giant
+ * security hole.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhopen_args {
+ const struct fhandle *u_fhp;
+ int flags;
+};
+#endif
+int
+sys_fhopen(struct thread *td, struct fhopen_args *uap)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct fhandle fhp;
+ struct file *fp;
+ int fmode, error;
+ int indx;
+
+ error = priv_check(td, PRIV_VFS_FHOPEN);
+ if (error != 0)
+ return (error);
+ indx = -1;
+ fmode = FFLAGS(uap->flags);
+ /* why not allow a non-read/write open for our lockd? */
+ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
+ return (EINVAL);
+ error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
+ if (error != 0)
+ return(error);
+ /* find the mount point */
+ mp = vfs_busyfs(&fhp.fh_fsid);
+ if (mp == NULL)
+ return (ESTALE);
+ /* now give me my vnode, it gets returned to me locked */
+ error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
+ vfs_unbusy(mp);
+ if (error != 0)
+ return (error);
+
+ error = falloc_noinstall(td, &fp);
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ /*
+ * An extra reference on `fp' has been held for us by
+ * falloc_noinstall().
+ */
+
+#ifdef INVARIANTS
+ td->td_dupfd = -1;
+#endif
+ error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
+ if (error != 0) {
+ KASSERT(fp->f_ops == &badfileops,
+ ("VOP_OPEN in fhopen() set f_ops"));
+ KASSERT(td->td_dupfd < 0,
+ ("fhopen() encountered fdopen()"));
+
+ vput(vp);
+ goto bad;
+ }
+#ifdef INVARIANTS
+ td->td_dupfd = 0;
+#endif
+ fp->f_vnode = vp;
+ fp->f_seqcount = 1;
+ finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
+ &vnops);
+ VOP_UNLOCK(vp, 0);
+ if ((fmode & O_TRUNC) != 0) {
+ error = fo_truncate(fp, 0, td->td_ucred, td);
+ if (error != 0)
+ goto bad;
+ }
+
+ error = finstall(td, fp, &indx, fmode, NULL);
+bad:
+ fdrop(fp, td);
+ td->td_retval[0] = indx;
+ return (error);
+}
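
A hedged userspace sketch of the file-handle pair above (the path is a
placeholder, and both calls require privilege, per the priv_check() calls in
kern_getfhat() and sys_fhopen()): getfh(2) produces an fhandle_t, and
fhopen(2) turns it back into an open descriptor.

#include <sys/param.h>
#include <sys/mount.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	fhandle_t fh;
	int fd;

	if (getfh("/etc/passwd", &fh) != 0) {
		perror("getfh");
		return (1);
	}
	/* O_RDONLY satisfies the (FREAD | FWRITE) check in sys_fhopen(). */
	fd = fhopen(&fh, O_RDONLY);
	if (fd < 0) {
		perror("fhopen");
		return (1);
	}
	close(fd);
	return (0);
}
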
+
+/*
+ * Stat an (NFS) file handle.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhstat_args {
+ struct fhandle *u_fhp;
+ struct stat *sb;
+};
+#endif
+int
+sys_fhstat(struct thread *td, struct fhstat_args *uap)
+{
+ struct stat sb;
+ struct fhandle fh;
+ int error;
+
+ error = copyin(uap->u_fhp, &fh, sizeof(fh));
+ if (error != 0)
+ return (error);
+ error = kern_fhstat(td, fh, &sb);
+ if (error == 0)
+ error = copyout(&sb, uap->sb, sizeof(sb));
+ return (error);
+}
+
+int
+kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_FHSTAT);
+ if (error != 0)
+ return (error);
+ if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
+ return (ESTALE);
+ error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
+ vfs_unbusy(mp);
+ if (error != 0)
+ return (error);
+ error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Implement fstatfs() for (NFS) file handles.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhstatfs_args {
+ struct fhandle *u_fhp;
+ struct statfs *buf;
+};
+#endif
+int
+sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap)
+{
+ struct statfs *sfp;
+ fhandle_t fh;
+ int error;
+
+ error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
+ if (error != 0)
+ return (error);
+ sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
+ error = kern_fhstatfs(td, fh, sfp);
+ if (error == 0)
+ error = copyout(sfp, uap->buf, sizeof(*sfp));
+ free(sfp, M_STATFS);
+ return (error);
+}
+
+int
+kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
+{
+ struct statfs *sp;
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_FHSTATFS);
+ if (error != 0)
+ return (error);
+ if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
+ return (ESTALE);
+ error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
+ if (error != 0) {
+ vfs_unbusy(mp);
+ return (error);
+ }
+ vput(vp);
+ error = prison_canseemount(td->td_ucred, mp);
+ if (error != 0)
+ goto out;
+#ifdef MAC
+ error = mac_mount_check_stat(td->td_ucred, mp);
+ if (error != 0)
+ goto out;
+#endif
+ /*
+ * Set these in case the underlying filesystem fails to do so.
+ */
+ sp = &mp->mnt_stat;
+ sp->f_version = STATFS_VERSION;
+ sp->f_namemax = NAME_MAX;
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = VFS_STATFS(mp, sp);
+ if (error == 0)
+ *buf = *sp;
+out:
+ vfs_unbusy(mp);
+ return (error);
+}
+
+int
+kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
+{
+ struct file *fp;
+ struct mount *mp;
+ struct vnode *vp;
+ off_t olen, ooffset;
+ int error;
+#ifdef AUDIT
+ int audited_vnode1 = 0;
+#endif
+
+ AUDIT_ARG_FD(fd);
+ if (offset < 0 || len <= 0)
+ return (EINVAL);
+ /* Check for wrap. */
+ if (offset > OFF_MAX - len)
+ return (EFBIG);
+ AUDIT_ARG_FD(fd);
+ error = fget(td, fd, &cap_pwrite_rights, &fp);
+ if (error != 0)
+ return (error);
+ AUDIT_ARG_FILE(td->td_proc, fp);
+ if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
+ error = ESPIPE;
+ goto out;
+ }
+ if ((fp->f_flag & FWRITE) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ if (fp->f_type != DTYPE_VNODE) {
+ error = ENODEV;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type != VREG) {
+ error = ENODEV;
+ goto out;
+ }
+
+ /* Allocating blocks may take a long time, so iterate. */
+ for (;;) {
+ olen = len;
+ ooffset = offset;
+
+ bwillwrite();
+ mp = NULL;
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ break;
+ error = vn_lock(vp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vn_finished_write(mp);
+ break;
+ }
+#ifdef AUDIT
+ if (!audited_vnode1) {
+ AUDIT_ARG_VNODE1(vp);
+ audited_vnode1 = 1;
+ }
+#endif
+#ifdef MAC
+ error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
+ if (error == 0)
+#endif
+ error = VOP_ALLOCATE(vp, &offset, &len);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+
+ if (olen + ooffset != offset + len) {
+ panic("offset + len changed from %jx/%jx to %jx/%jx",
+ ooffset, olen, offset, len);
+ }
+ if (error != 0 || len == 0)
+ break;
+ KASSERT(olen > len, ("Iteration did not make progress?"));
+ maybe_yield();
+ }
+ out:
+ fdrop(fp, td);
+ return (error);
+}
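
A minimal userspace sketch (the file name is a placeholder):
posix_fallocate() returns the error number directly, via the
kern_posix_error() wrapper in sys_posix_fallocate() below, rather than
setting errno.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("prealloc.dat", O_CREAT | O_RDWR, 0644);
	int error;

	if (fd < 0) {
		perror("open");
		return (1);
	}
	/* Preallocate 1 MiB starting at offset 0. */
	error = posix_fallocate(fd, 0, 1 << 20);
	if (error != 0)
		fprintf(stderr, "posix_fallocate: %s\n", strerror(error));
	close(fd);
	return (0);
}
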
+
+int
+sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
+{
+ int error;
+
+ error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len);
+ return (kern_posix_error(td, error));
+}
+
+/*
+ * Unlike madvise(2), we do not make a best effort to remember every
+ * possible caching hint. Instead, we remember the last setting with
+ * the exception that we will allow POSIX_FADV_NORMAL to adjust the
+ * region of any current setting.
+ */
+int
+kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
+ int advice)
+{
+ struct fadvise_info *fa, *new;
+ struct file *fp;
+ struct vnode *vp;
+ off_t end;
+ int error;
+
+ if (offset < 0 || len < 0 || offset > OFF_MAX - len)
+ return (EINVAL);
+ AUDIT_ARG_VALUE(advice);
+ switch (advice) {
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_NOREUSE:
+ new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
+ break;
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_DONTNEED:
+ new = NULL;
+ break;
+ default:
+ return (EINVAL);
+ }
+ /* XXX: CAP_POSIX_FADVISE? */
+ AUDIT_ARG_FD(fd);
+ error = fget(td, fd, &cap_no_rights, &fp);
+ if (error != 0)
+ goto out;
+ AUDIT_ARG_FILE(td->td_proc, fp);
+ if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
+ error = ESPIPE;
+ goto out;
+ }
+ if (fp->f_type != DTYPE_VNODE) {
+ error = ENODEV;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type != VREG) {
+ error = ENODEV;
+ goto out;
+ }
+ if (len == 0)
+ end = OFF_MAX;
+ else
+ end = offset + len - 1;
+ switch (advice) {
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_NOREUSE:
+ /*
+ * Try to merge any existing non-standard region with
+ * this new region if possible, otherwise create a new
+ * non-standard region for this request.
+ */
+ mtx_pool_lock(mtxpool_sleep, fp);
+ fa = fp->f_advice;
+ if (fa != NULL && fa->fa_advice == advice &&
+ ((fa->fa_start <= end && fa->fa_end >= offset) ||
+ (end != OFF_MAX && fa->fa_start == end + 1) ||
+ (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
+ if (offset < fa->fa_start)
+ fa->fa_start = offset;
+ if (end > fa->fa_end)
+ fa->fa_end = end;
+ } else {
+ new->fa_advice = advice;
+ new->fa_start = offset;
+ new->fa_end = end;
+ fp->f_advice = new;
+ new = fa;
+ }
+ mtx_pool_unlock(mtxpool_sleep, fp);
+ break;
+ case POSIX_FADV_NORMAL:
+ /*
+		 * If the "normal" region overlaps with an existing
+ * non-standard region, trim or remove the
+ * non-standard region.
+ */
+ mtx_pool_lock(mtxpool_sleep, fp);
+ fa = fp->f_advice;
+ if (fa != NULL) {
+ if (offset <= fa->fa_start && end >= fa->fa_end) {
+ new = fa;
+ fp->f_advice = NULL;
+ } else if (offset <= fa->fa_start &&
+ end >= fa->fa_start)
+ fa->fa_start = end + 1;
+ else if (offset <= fa->fa_end && end >= fa->fa_end)
+ fa->fa_end = offset - 1;
+ else if (offset >= fa->fa_start && end <= fa->fa_end) {
+ /*
+ * If the "normal" region is a middle
+ * portion of the existing
+ * non-standard region, just remove
+ * the whole thing rather than picking
+ * one side or the other to
+ * preserve.
+ */
+ new = fa;
+ fp->f_advice = NULL;
+ }
+ }
+ mtx_pool_unlock(mtxpool_sleep, fp);
+ break;
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_DONTNEED:
+ error = VOP_ADVISE(vp, offset, end, advice);
+ break;
+ }
+out:
+ if (fp != NULL)
+ fdrop(fp, td);
+ free(new, M_FADVISE);
+ return (error);
+}
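
A short userspace sketch, with a placeholder file name, of the region
bookkeeping described above: a zero length means "to the end of the file"
(end = OFF_MAX), and POSIX_FADV_NORMAL trims or drops the single remembered
non-standard region. Like posix_fallocate(), the call returns the error
number directly.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("bigfile.dat", O_RDONLY);
	int error;

	if (fd < 0) {
		perror("open");
		return (1);
	}
	/* Whole file marked sequential; stored as one remembered region. */
	error = posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
	if (error != 0)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(error));
	/* NORMAL over the same range removes the remembered region. */
	error = posix_fadvise(fd, 0, 0, POSIX_FADV_NORMAL);
	if (error != 0)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(error));
	close(fd);
	return (0);
}
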
+
+int
+sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
+{
+ int error;
+
+ error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
+ uap->advice);
+ return (kern_posix_error(td, error));
+}
diff --git a/freebsd/sys/kern/vfs_vnops.c b/freebsd/sys/kern/vfs_vnops.c
new file mode 100644
index 00000000..bdd6692d
--- /dev/null
+++ b/freebsd/sys/kern/vfs_vnops.c
@@ -0,0 +1,2607 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
+ * Copyright (c) 2013, 2014 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_hwpmc_hooks.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/disk.h>
+#include <sys/fail.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/kdb.h>
+#include <sys/stat.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/filio.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/ttycom.h>
+#include <sys/conf.h>
+#include <sys/syslog.h>
+#include <sys/unistd.h>
+#include <sys/user.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+static fo_rdwr_t vn_read;
+static fo_rdwr_t vn_write;
+static fo_rdwr_t vn_io_fault;
+static fo_truncate_t vn_truncate;
+static fo_ioctl_t vn_ioctl;
+static fo_poll_t vn_poll;
+static fo_kqfilter_t vn_kqfilter;
+static fo_stat_t vn_statfile;
+static fo_close_t vn_closefile;
+static fo_mmap_t vn_mmap;
+
+struct fileops vnops = {
+ .fo_read = vn_io_fault,
+ .fo_write = vn_io_fault,
+ .fo_truncate = vn_truncate,
+ .fo_ioctl = vn_ioctl,
+ .fo_poll = vn_poll,
+ .fo_kqfilter = vn_kqfilter,
+ .fo_stat = vn_statfile,
+ .fo_close = vn_closefile,
+ .fo_chmod = vn_chmod,
+ .fo_chown = vn_chown,
+ .fo_sendfile = vn_sendfile,
+ .fo_seek = vn_seek,
+ .fo_fill_kinfo = vn_fill_kinfo,
+ .fo_mmap = vn_mmap,
+ .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
+};
+
+static const int io_hold_cnt = 16;
+static int vn_io_fault_enable = 1;
+SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
+ &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
+static int vn_io_fault_prefault = 0;
+SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW,
+ &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
+static u_long vn_io_faults_cnt;
+SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
+ &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
+
+/*
+ * Returns true if vn_io_fault mode of handling the i/o request should
+ * be used.
+ */
+static bool
+do_vn_io_fault(struct vnode *vp, struct uio *uio)
+{
+ struct mount *mp;
+
+ return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
+ (mp = vp->v_mount) != NULL &&
+ (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
+}
+
+/*
+ * Structure used to pass arguments to vn_io_fault1(), to do either
+ * file- or vnode-based I/O calls.
+ */
+struct vn_io_fault_args {
+ enum {
+ VN_IO_FAULT_FOP,
+ VN_IO_FAULT_VOP
+ } kind;
+ struct ucred *cred;
+ int flags;
+ union {
+ struct fop_args_tag {
+ struct file *fp;
+ fo_rdwr_t *doio;
+ } fop_args;
+ struct vop_args_tag {
+ struct vnode *vp;
+ } vop_args;
+ } args;
+};
+
+static int vn_io_fault1(struct vnode *vp, struct uio *uio,
+ struct vn_io_fault_args *args, struct thread *td);
+
+int
+vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
+{
+ struct thread *td = ndp->ni_cnd.cn_thread;
+
+ return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
+}
+
+/*
+ * Common code for vnode open operations via a name lookup.
+ * Lookup the vnode and invoke VOP_CREATE if needed.
+ * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
+ *
+ * Note that this does NOT free nameidata for the successful case,
+ * due to the NDINIT being done elsewhere.
+ */
+int
+vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
+ struct ucred *cred, struct file *fp)
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct thread *td = ndp->ni_cnd.cn_thread;
+ struct vattr vat;
+ struct vattr *vap = &vat;
+ int fmode, error;
+
+restart:
+ fmode = *flagp;
+ if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
+ O_EXCL | O_DIRECTORY))
+ return (EINVAL);
+ else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
+ ndp->ni_cnd.cn_nameiop = CREATE;
+ /*
+ * Set NOCACHE to avoid flushing the cache when
+ * rolling in many files at once.
+ */
+ ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE;
+ if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
+ ndp->ni_cnd.cn_flags |= FOLLOW;
+ if (!(vn_open_flags & VN_OPEN_NOAUDIT))
+ ndp->ni_cnd.cn_flags |= AUDITVNODE1;
+ if (vn_open_flags & VN_OPEN_NOCAPCHECK)
+ ndp->ni_cnd.cn_flags |= NOCAPCHECK;
+ if ((vn_open_flags & VN_OPEN_INVFS) == 0)
+ bwillwrite();
+ if ((error = namei(ndp)) != 0)
+ return (error);
+ if (ndp->ni_vp == NULL) {
+ VATTR_NULL(vap);
+ vap->va_type = VREG;
+ vap->va_mode = cmode;
+ if (fmode & O_EXCL)
+ vap->va_vaflags |= VA_EXCLUSIVE;
+ if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ vput(ndp->ni_dvp);
+ if ((error = vn_start_write(NULL, &mp,
+ V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
+ ndp->ni_cnd.cn_flags |= MAKEENTRY;
+#ifdef MAC
+ error = mac_vnode_check_create(cred, ndp->ni_dvp,
+ &ndp->ni_cnd, vap);
+ if (error == 0)
+#endif
+ error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
+ &ndp->ni_cnd, vap);
+ vput(ndp->ni_dvp);
+ vn_finished_write(mp);
+ if (error) {
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ return (error);
+ }
+ fmode &= ~O_TRUNC;
+ vp = ndp->ni_vp;
+ } else {
+ if (ndp->ni_dvp == ndp->ni_vp)
+ vrele(ndp->ni_dvp);
+ else
+ vput(ndp->ni_dvp);
+ ndp->ni_dvp = NULL;
+ vp = ndp->ni_vp;
+ if (fmode & O_EXCL) {
+ error = EEXIST;
+ goto bad;
+ }
+ if (vp->v_type == VDIR) {
+ error = EISDIR;
+ goto bad;
+ }
+ fmode &= ~O_CREAT;
+ }
+ } else {
+ ndp->ni_cnd.cn_nameiop = LOOKUP;
+ ndp->ni_cnd.cn_flags = ISOPEN |
+ ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
+ if (!(fmode & FWRITE))
+ ndp->ni_cnd.cn_flags |= LOCKSHARED;
+ if (!(vn_open_flags & VN_OPEN_NOAUDIT))
+ ndp->ni_cnd.cn_flags |= AUDITVNODE1;
+ if (vn_open_flags & VN_OPEN_NOCAPCHECK)
+ ndp->ni_cnd.cn_flags |= NOCAPCHECK;
+ if ((error = namei(ndp)) != 0)
+ return (error);
+ vp = ndp->ni_vp;
+ }
+ error = vn_open_vnode(vp, fmode, cred, td, fp);
+ if (error)
+ goto bad;
+ *flagp = fmode;
+ return (0);
+bad:
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ vput(vp);
+ *flagp = fmode;
+ ndp->ni_vp = NULL;
+ return (error);
+}
+
+static int
+vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
+{
+ struct flock lf;
+ int error, lock_flags, type;
+
+ ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
+ if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
+ return (0);
+ KASSERT(fp != NULL, ("open with flock requires fp"));
+ if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
+ return (EOPNOTSUPP);
+
+ lock_flags = VOP_ISLOCKED(vp);
+ VOP_UNLOCK(vp, 0);
+
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
+ type = F_FLOCK;
+ if ((fmode & FNONBLOCK) == 0)
+ type |= F_WAIT;
+ error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
+ if (error == 0)
+ fp->f_flag |= FHASLOCK;
+
+ vn_lock(vp, lock_flags | LK_RETRY);
+ if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0)
+ error = ENOENT;
+ return (error);
+}
+
+/*
+ * Common code for vnode open operations once a vnode is located.
+ * Check permissions, and call the VOP_OPEN routine.
+ */
+int
+vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
+ struct thread *td, struct file *fp)
+{
+ accmode_t accmode;
+ int error;
+
+ if (vp->v_type == VLNK)
+ return (EMLINK);
+ if (vp->v_type == VSOCK)
+ return (EOPNOTSUPP);
+ if (vp->v_type != VDIR && fmode & O_DIRECTORY)
+ return (ENOTDIR);
+ accmode = 0;
+ if (fmode & (FWRITE | O_TRUNC)) {
+ if (vp->v_type == VDIR)
+ return (EISDIR);
+ accmode |= VWRITE;
+ }
+ if (fmode & FREAD)
+ accmode |= VREAD;
+ if (fmode & FEXEC)
+ accmode |= VEXEC;
+ if ((fmode & O_APPEND) && (fmode & FWRITE))
+ accmode |= VAPPEND;
+#ifdef MAC
+ if (fmode & O_CREAT)
+ accmode |= VCREAT;
+ if (fmode & O_VERIFY)
+ accmode |= VVERIFY;
+ error = mac_vnode_check_open(cred, vp, accmode);
+ if (error)
+ return (error);
+
+ accmode &= ~(VCREAT | VVERIFY);
+#endif
+ if ((fmode & O_CREAT) == 0 && accmode != 0) {
+ error = VOP_ACCESS(vp, accmode, cred, td);
+ if (error != 0)
+ return (error);
+ }
+ if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
+ vn_lock(vp, LK_UPGRADE | LK_RETRY);
+ error = VOP_OPEN(vp, fmode, cred, td, fp);
+ if (error != 0)
+ return (error);
+
+ error = vn_open_vnode_advlock(vp, fmode, fp);
+ if (error == 0 && (fmode & FWRITE) != 0) {
+ error = VOP_ADD_WRITECOUNT(vp, 1);
+ if (error == 0) {
+ CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
+ __func__, vp, vp->v_writecount);
+ }
+ }
+
+ /*
+	 * An error from advlock or VOP_ADD_WRITECOUNT() still requires
+	 * calling VOP_CLOSE() to pair with the earlier VOP_OPEN().
+	 * Arrange for that by having fdrop() use vn_closefile().
+ */
+ if (error != 0) {
+ fp->f_flag |= FOPENFAILED;
+ fp->f_vnode = vp;
+ if (fp->f_ops == &badfileops) {
+ fp->f_type = DTYPE_VNODE;
+ fp->f_ops = &vnops;
+ }
+ vref(vp);
+ }
+
+ ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
+	return (error);
+}
+
+/*
+ * Check for write permissions on the specified vnode.
+ * Prototype text segments cannot be written.
+ * It is racy.
+ */
+int
+vn_writechk(struct vnode *vp)
+{
+
+ ASSERT_VOP_LOCKED(vp, "vn_writechk");
+ /*
+ * If there's shared text associated with
+ * the vnode, try to free it up once. If
+ * we fail, we can't allow writing.
+ */
+ if (VOP_IS_TEXT(vp))
+ return (ETXTBSY);
+
+ return (0);
+}
+
+/*
+ * Vnode close call
+ */
+static int
+vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
+ struct thread *td, bool keep_ref)
+{
+ struct mount *mp;
+ int error, lock_flags;
+
+ if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
+ MNT_EXTENDED_SHARED(vp->v_mount))
+ lock_flags = LK_SHARED;
+ else
+ lock_flags = LK_EXCLUSIVE;
+
+ vn_start_write(vp, &mp, V_WAIT);
+ vn_lock(vp, lock_flags | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
+ VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
+ CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
+ __func__, vp, vp->v_writecount);
+ }
+ error = VOP_CLOSE(vp, flags, file_cred, td);
+ if (keep_ref)
+ VOP_UNLOCK(vp, 0);
+ else
+ vput(vp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+int
+vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
+ struct thread *td)
+{
+
+ return (vn_close1(vp, flags, file_cred, td, false));
+}
+
+/*
+ * Heuristic to detect sequential operation.
+ */
+static int
+sequential_heuristic(struct uio *uio, struct file *fp)
+{
+
+ ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
+ if (fp->f_flag & FRDAHEAD)
+ return (fp->f_seqcount << IO_SEQSHIFT);
+
+ /*
+ * Offset 0 is handled specially. open() sets f_seqcount to 1 so
+ * that the first I/O is normally considered to be slightly
+ * sequential. Seeking to offset 0 doesn't change sequentiality
+ * unless previous seeks have reduced f_seqcount to 0, in which
+ * case offset 0 is not special.
+ */
+ if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
+ uio->uio_offset == fp->f_nextoff) {
+ /*
+ * f_seqcount is in units of fixed-size blocks so that it
+ * depends mainly on the amount of sequential I/O and not
+ * much on the number of sequential I/O's. The fixed size
+ * of 16384 is hard-coded here since it is (not quite) just
+ * a magic size that works well here. This size is more
+ * closely related to the best I/O size for real disks than
+ * to any block size used by software.
+ */
+ if (uio->uio_resid >= IO_SEQMAX * 16384)
+ fp->f_seqcount = IO_SEQMAX;
+ else {
+ fp->f_seqcount += howmany(uio->uio_resid, 16384);
+ if (fp->f_seqcount > IO_SEQMAX)
+ fp->f_seqcount = IO_SEQMAX;
+ }
+ return (fp->f_seqcount << IO_SEQSHIFT);
+ }
+
+ /* Not sequential. Quickly draw-down sequentiality. */
+ if (fp->f_seqcount > 1)
+ fp->f_seqcount = 1;
+ else
+ fp->f_seqcount = 0;
+ return (0);
+}
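
A standalone model of the arithmetic above, useful for seeing the numbers it
produces. The IO_SEQMAX and IO_SEQSHIFT values (0x7F and 16) are assumptions
about <sys/vnode.h>, not something defined in this file.

#include <stdio.h>

/* Assumed values; the real ones come from <sys/vnode.h>. */
#define MODEL_IO_SEQMAX		0x7F
#define MODEL_IO_SEQSHIFT	16
#define MODEL_BLOCK		16384

/*
 * Model of the sequential branch: given the current f_seqcount and a
 * sequential request of 'resid' bytes, return the new f_seqcount.
 */
static int
model_seqcount(int seqcount, long resid)
{
	if (resid >= (long)MODEL_IO_SEQMAX * MODEL_BLOCK)
		return (MODEL_IO_SEQMAX);
	seqcount += (resid + MODEL_BLOCK - 1) / MODEL_BLOCK;	/* howmany() */
	return (seqcount > MODEL_IO_SEQMAX ? MODEL_IO_SEQMAX : seqcount);
}

int
main(void)
{
	int sc = 1;	/* open() starts f_seqcount at 1 */

	sc = model_seqcount(sc, 65536);		/* four 16 KiB blocks */
	printf("seqcount %d -> ioflag readahead hint %#x\n",
	    sc, sc << MODEL_IO_SEQSHIFT);
	return (0);
}
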
+
+/*
+ * Package up an I/O request on a vnode into a uio and do it.
+ */
+int
+vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
+ enum uio_seg segflg, int ioflg, struct ucred *active_cred,
+ struct ucred *file_cred, ssize_t *aresid, struct thread *td)
+{
+ struct uio auio;
+ struct iovec aiov;
+ struct mount *mp;
+ struct ucred *cred;
+ void *rl_cookie;
+ struct vn_io_fault_args args;
+ int error, lock_flags;
+
+ if (offset < 0 && vp->v_type != VCHR)
+ return (EINVAL);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ aiov.iov_base = base;
+ aiov.iov_len = len;
+ auio.uio_resid = len;
+ auio.uio_offset = offset;
+ auio.uio_segflg = segflg;
+ auio.uio_rw = rw;
+ auio.uio_td = td;
+ error = 0;
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ if ((ioflg & IO_RANGELOCKED) == 0) {
+ if (rw == UIO_READ) {
+ rl_cookie = vn_rangelock_rlock(vp, offset,
+ offset + len);
+ } else {
+ rl_cookie = vn_rangelock_wlock(vp, offset,
+ offset + len);
+ }
+ } else
+ rl_cookie = NULL;
+ mp = NULL;
+ if (rw == UIO_WRITE) {
+ if (vp->v_type != VCHR &&
+ (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
+ != 0)
+ goto out;
+ if (MNT_SHARED_WRITES(mp) ||
+ ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
+ lock_flags = LK_SHARED;
+ else
+ lock_flags = LK_EXCLUSIVE;
+ } else
+ lock_flags = LK_SHARED;
+ vn_lock(vp, lock_flags | LK_RETRY);
+ } else
+ rl_cookie = NULL;
+
+ ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
+#ifdef MAC
+ if ((ioflg & IO_NOMACCHECK) == 0) {
+ if (rw == UIO_READ)
+ error = mac_vnode_check_read(active_cred, file_cred,
+ vp);
+ else
+ error = mac_vnode_check_write(active_cred, file_cred,
+ vp);
+ }
+#endif
+ if (error == 0) {
+ if (file_cred != NULL)
+ cred = file_cred;
+ else
+ cred = active_cred;
+ if (do_vn_io_fault(vp, &auio)) {
+ args.kind = VN_IO_FAULT_VOP;
+ args.cred = cred;
+ args.flags = ioflg;
+ args.args.vop_args.vp = vp;
+ error = vn_io_fault1(vp, &auio, &args, td);
+ } else if (rw == UIO_READ) {
+ error = VOP_READ(vp, &auio, ioflg, cred);
+ } else /* if (rw == UIO_WRITE) */ {
+ error = VOP_WRITE(vp, &auio, ioflg, cred);
+ }
+ }
+ if (aresid)
+ *aresid = auio.uio_resid;
+ else
+ if (auio.uio_resid && error == 0)
+ error = EIO;
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ VOP_UNLOCK(vp, 0);
+ if (mp != NULL)
+ vn_finished_write(mp);
+ }
+ out:
+ if (rl_cookie != NULL)
+ vn_rangelock_unlock(vp, rl_cookie);
+ return (error);
+}
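
A hedged sketch of a hypothetical in-kernel caller, built only from the
vn_rdwr() signature above: 'vp' is assumed to be a referenced, unlocked
vnode and 'td' the current thread. Passing ioflg 0 lets vn_rdwr() take the
range lock and vnode lock itself.

static int
example_read_header(struct vnode *vp, struct thread *td)
{
	char buf[64];
	ssize_t resid;
	int error;

	/* Read the first 64 bytes into a kernel buffer. */
	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0, UIO_SYSSPACE,
	    0, td->td_ucred, NOCRED, &resid, td);
	if (error == 0 && resid == (ssize_t)sizeof(buf))
		error = ENOENT;		/* nothing was read */
	return (error);
}
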
+
+/*
+ * Package up an I/O request on a vnode into a uio and do it. The I/O
+ * request is split up into smaller chunks and we try to avoid saturating
+ * the buffer cache while potentially holding a vnode locked, so we
+ * check bwillwrite() before calling vn_rdwr(). We also call kern_yield()
+ * to give other processes a chance to lock the vnode (either other processes
+ * core'ing the same binary, or unrelated processes scanning the directory).
+ */
+int
+vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
+ off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
+ struct ucred *file_cred, size_t *aresid, struct thread *td)
+{
+ int error = 0;
+ ssize_t iaresid;
+
+ do {
+ int chunk;
+
+ /*
+ * Force `offset' to a multiple of MAXBSIZE except possibly
+ * for the first chunk, so that filesystems only need to
+ * write full blocks except possibly for the first and last
+ * chunks.
+ */
+ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
+
+ if (chunk > len)
+ chunk = len;
+ if (rw != UIO_READ && vp->v_type == VREG)
+ bwillwrite();
+ iaresid = 0;
+ error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
+ ioflg, active_cred, file_cred, &iaresid, td);
+ len -= chunk; /* aresid calc already includes length */
+ if (error)
+ break;
+ offset += chunk;
+ base = (char *)base + chunk;
+ kern_yield(PRI_USER);
+ } while (len);
+ if (aresid)
+ *aresid = len + iaresid;
+ return (error);
+}
+
+off_t
+foffset_lock(struct file *fp, int flags)
+{
+ struct mtx *mtxp;
+ off_t res;
+
+ KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
+
+#if OFF_MAX <= LONG_MAX
+ /*
+	 * Caller only wants the current f_offset value.  Assume that
+	 * reads of the long and shorter integer types are atomic.
+ */
+ if ((flags & FOF_NOLOCK) != 0)
+ return (fp->f_offset);
+#endif
+
+ /*
+ * According to McKusick the vn lock was protecting f_offset here.
+ * It is now protected by the FOFFSET_LOCKED flag.
+ */
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if ((flags & FOF_NOLOCK) == 0) {
+ while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+ fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+ msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+ "vofflock", 0);
+ }
+ fp->f_vnread_flags |= FOFFSET_LOCKED;
+ }
+ res = fp->f_offset;
+ mtx_unlock(mtxp);
+ return (res);
+}
+
+void
+foffset_unlock(struct file *fp, off_t val, int flags)
+{
+ struct mtx *mtxp;
+
+ KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
+
+#if OFF_MAX <= LONG_MAX
+ if ((flags & FOF_NOLOCK) != 0) {
+ if ((flags & FOF_NOUPDATE) == 0)
+ fp->f_offset = val;
+ if ((flags & FOF_NEXTOFF) != 0)
+ fp->f_nextoff = val;
+ return;
+ }
+#endif
+
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if ((flags & FOF_NOUPDATE) == 0)
+ fp->f_offset = val;
+ if ((flags & FOF_NEXTOFF) != 0)
+ fp->f_nextoff = val;
+ if ((flags & FOF_NOLOCK) == 0) {
+ KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
+ ("Lost FOFFSET_LOCKED"));
+ if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+ wakeup(&fp->f_vnread_flags);
+ fp->f_vnread_flags = 0;
+ }
+ mtx_unlock(mtxp);
+}
+
+void
+foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
+{
+
+ if ((flags & FOF_OFFSET) == 0)
+ uio->uio_offset = foffset_lock(fp, flags);
+}
+
+void
+foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
+{
+
+ if ((flags & FOF_OFFSET) == 0)
+ foffset_unlock(fp, uio->uio_offset, flags);
+}
+
+static int
+get_advice(struct file *fp, struct uio *uio)
+{
+ struct mtx *mtxp;
+ int ret;
+
+ ret = POSIX_FADV_NORMAL;
+ if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
+ return (ret);
+
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if (fp->f_advice != NULL &&
+ uio->uio_offset >= fp->f_advice->fa_start &&
+ uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+ ret = fp->f_advice->fa_advice;
+ mtx_unlock(mtxp);
+ return (ret);
+}
+
+/*
+ * File table vnode read routine.
+ */
+static int
+vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
+ struct thread *td)
+{
+ struct vnode *vp;
+ off_t orig_offset;
+ int error, ioflag;
+ int advice;
+
+ KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
+ uio->uio_td, td));
+ KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
+ vp = fp->f_vnode;
+ ioflag = 0;
+ if (fp->f_flag & FNONBLOCK)
+ ioflag |= IO_NDELAY;
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_DIRECT;
+ advice = get_advice(fp, uio);
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_NOREUSE:
+ ioflag |= sequential_heuristic(uio, fp);
+ break;
+ case POSIX_FADV_RANDOM:
+ /* Disable read-ahead for random I/O. */
+ break;
+ }
+ orig_offset = uio->uio_offset;
+
+#ifdef MAC
+ error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
+ if (error == 0)
+#endif
+ error = VOP_READ(vp, uio, ioflag, fp->f_cred);
+ fp->f_nextoff = uio->uio_offset;
+ VOP_UNLOCK(vp, 0);
+ if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+ orig_offset != uio->uio_offset)
+ /*
+ * Use POSIX_FADV_DONTNEED to flush pages and buffers
+ * for the backing file after a POSIX_FADV_NOREUSE
+ * read(2).
+ */
+ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
+ POSIX_FADV_DONTNEED);
+ return (error);
+}
+
+/*
+ * File table vnode write routine.
+ */
+static int
+vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
+ struct thread *td)
+{
+ struct vnode *vp;
+ struct mount *mp;
+ off_t orig_offset;
+ int error, ioflag, lock_flags;
+ int advice;
+
+ KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
+ uio->uio_td, td));
+ KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
+ vp = fp->f_vnode;
+ if (vp->v_type == VREG)
+ bwillwrite();
+ ioflag = IO_UNIT;
+ if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
+ ioflag |= IO_APPEND;
+ if (fp->f_flag & FNONBLOCK)
+ ioflag |= IO_NDELAY;
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_DIRECT;
+ if ((fp->f_flag & O_FSYNC) ||
+ (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
+ ioflag |= IO_SYNC;
+ mp = NULL;
+ if (vp->v_type != VCHR &&
+ (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto unlock;
+
+ advice = get_advice(fp, uio);
+
+ if (MNT_SHARED_WRITES(mp) ||
+ (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
+ lock_flags = LK_SHARED;
+ } else {
+ lock_flags = LK_EXCLUSIVE;
+ }
+
+ vn_lock(vp, lock_flags | LK_RETRY);
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_NOREUSE:
+ ioflag |= sequential_heuristic(uio, fp);
+ break;
+ case POSIX_FADV_RANDOM:
+ /* XXX: Is this correct? */
+ break;
+ }
+ orig_offset = uio->uio_offset;
+
+#ifdef MAC
+ error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
+ if (error == 0)
+#endif
+ error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
+ fp->f_nextoff = uio->uio_offset;
+ VOP_UNLOCK(vp, 0);
+ if (vp->v_type != VCHR)
+ vn_finished_write(mp);
+ if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+ orig_offset != uio->uio_offset)
+ /*
+ * Use POSIX_FADV_DONTNEED to flush pages and buffers
+ * for the backing file after a POSIX_FADV_NOREUSE
+ * write(2).
+ */
+ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
+ POSIX_FADV_DONTNEED);
+unlock:
+ return (error);
+}
+
+/*
+ * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
+ * prevent the following deadlock:
+ *
+ * Assume that the thread A reads from the vnode vp1 into userspace
+ * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is
+ * currently not resident, then the system ends up with the call chain
+ * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
+ * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
+ * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
+ * If, at the same time, thread B reads from vnode vp2 into buffer buf2
+ * backed by the pages of vnode vp1, and some page in buf2 is not
+ * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
+ *
+ * To prevent the lock order reversal and deadlock, vn_io_fault() does
+ * not allow page faults to happen during VOP_READ() or VOP_WRITE().
+ * Instead, it first tries to do the whole range i/o with pagefaults
+ * disabled. If all pages in the i/o buffer are resident and mapped,
+ * VOP will succeed (ignoring the genuine filesystem errors).
+ * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
+ * i/o in chunks, with all pages in the chunk prefaulted and held
+ * using vm_fault_quick_hold_pages().
+ *
+ * Filesystems using this deadlock avoidance scheme should use the
+ * array of the held pages from uio, saved in the curthread->td_ma,
+ * instead of doing uiomove(). A helper function
+ * vn_io_fault_uiomove() converts uiomove request into
+ * uiomove_fromphys() over td_ma array.
+ *
+ * Since vnode locks do not cover the whole i/o anymore, rangelocks
+ * make the current i/o request atomic with respect to other i/os and
+ * truncations.
+ */
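
The userspace pattern behind that scenario can be sketched as two threads
that read(2) one file into a buffer mmap(2)ed from the other file, so each
read may fault on pages backed by the other vnode. The file names and the
1 MiB size below are placeholders; the files are assumed to already exist
and be large enough.

#include <sys/mman.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define MAP_LEN	(1024 * 1024)

struct xfer {
	int src_fd;	/* file consumed with read(2) */
	void *dst;	/* buffer mmap(2)ed from the other file */
};

static void *
reader(void *arg)
{
	struct xfer *x = arg;

	/* Faults on x->dst pages occur while the source vnode I/O runs. */
	(void)read(x->src_fd, x->dst, MAP_LEN);
	return (NULL);
}

int
main(void)
{
	pthread_t t1, t2;
	int fd1 = open("file1.dat", O_RDWR);
	int fd2 = open("file2.dat", O_RDWR);

	if (fd1 < 0 || fd2 < 0) {
		perror("open");
		return (1);
	}
	void *m1 = mmap(NULL, MAP_LEN, PROT_READ | PROT_WRITE, MAP_SHARED,
	    fd1, 0);
	void *m2 = mmap(NULL, MAP_LEN, PROT_READ | PROT_WRITE, MAP_SHARED,
	    fd2, 0);
	if (m1 == MAP_FAILED || m2 == MAP_FAILED) {
		perror("mmap");
		return (1);
	}
	struct xfer a = { fd1, m2 };	/* read file1 into file2's pages */
	struct xfer b = { fd2, m1 };	/* read file2 into file1's pages */

	pthread_create(&t1, NULL, reader, &a);
	pthread_create(&t2, NULL, reader, &b);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return (0);
}
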
+
+/*
+ * Decode vn_io_fault_args and perform the corresponding i/o.
+ */
+static int
+vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
+ struct thread *td)
+{
+ int error, save;
+
+ error = 0;
+ save = vm_fault_disable_pagefaults();
+ switch (args->kind) {
+ case VN_IO_FAULT_FOP:
+ error = (args->args.fop_args.doio)(args->args.fop_args.fp,
+ uio, args->cred, args->flags, td);
+ break;
+ case VN_IO_FAULT_VOP:
+ if (uio->uio_rw == UIO_READ) {
+ error = VOP_READ(args->args.vop_args.vp, uio,
+ args->flags, args->cred);
+ } else if (uio->uio_rw == UIO_WRITE) {
+ error = VOP_WRITE(args->args.vop_args.vp, uio,
+ args->flags, args->cred);
+ }
+ break;
+ default:
+ panic("vn_io_fault_doio: unknown kind of io %d %d",
+ args->kind, uio->uio_rw);
+ }
+ vm_fault_enable_pagefaults(save);
+ return (error);
+}
+
+static int
+vn_io_fault_touch(char *base, const struct uio *uio)
+{
+ int r;
+
+ r = fubyte(base);
+ if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
+ return (EFAULT);
+ return (0);
+}
+
+static int
+vn_io_fault_prefault_user(const struct uio *uio)
+{
+ char *base;
+ const struct iovec *iov;
+ size_t len;
+ ssize_t resid;
+ int error, i;
+
+ KASSERT(uio->uio_segflg == UIO_USERSPACE,
+ ("vn_io_fault_prefault userspace"));
+
+ error = i = 0;
+ iov = uio->uio_iov;
+ resid = uio->uio_resid;
+ base = iov->iov_base;
+ len = iov->iov_len;
+ while (resid > 0) {
+ error = vn_io_fault_touch(base, uio);
+ if (error != 0)
+ break;
+ if (len < PAGE_SIZE) {
+ if (len != 0) {
+ error = vn_io_fault_touch(base + len - 1, uio);
+ if (error != 0)
+ break;
+ resid -= len;
+ }
+ if (++i >= uio->uio_iovcnt)
+ break;
+ iov = uio->uio_iov + i;
+ base = iov->iov_base;
+ len = iov->iov_len;
+ } else {
+ len -= PAGE_SIZE;
+ base += PAGE_SIZE;
+ resid -= PAGE_SIZE;
+ }
+ }
+ return (error);
+}
+
+/*
+ * Common code for vn_io_fault(), agnostic to the kind of i/o request.
+ * Uses vn_io_fault_doio() to make the call to an actual i/o function.
+ * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
+ * into args and call vn_io_fault1() to handle faults during the user
+ * mode buffer accesses.
+ */
+static int
+vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
+ struct thread *td)
+{
+ vm_page_t ma[io_hold_cnt + 2];
+ struct uio *uio_clone, short_uio;
+ struct iovec short_iovec[1];
+ vm_page_t *prev_td_ma;
+ vm_prot_t prot;
+ vm_offset_t addr, end;
+ size_t len, resid;
+ ssize_t adv;
+ int error, cnt, saveheld, prev_td_ma_cnt;
+
+ if (vn_io_fault_prefault) {
+ error = vn_io_fault_prefault_user(uio);
+ if (error != 0)
+ return (error); /* Or ignore ? */
+ }
+
+ prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
+
+ /*
+	 * UFS follows the IO_UNIT directive and rolls back both
+	 * uio_offset and uio_resid if an error is encountered during the
+	 * operation. But, since the iovec may already be advanced,
+	 * the uio is still in an inconsistent state.
+ *
+ * Cache a copy of the original uio, which is advanced to the redo
+ * point using UIO_NOCOPY below.
+ */
+ uio_clone = cloneuio(uio);
+ resid = uio->uio_resid;
+
+ short_uio.uio_segflg = UIO_USERSPACE;
+ short_uio.uio_rw = uio->uio_rw;
+ short_uio.uio_td = uio->uio_td;
+
+ error = vn_io_fault_doio(args, uio, td);
+ if (error != EFAULT)
+ goto out;
+
+ atomic_add_long(&vn_io_faults_cnt, 1);
+ uio_clone->uio_segflg = UIO_NOCOPY;
+ uiomove(NULL, resid - uio->uio_resid, uio_clone);
+ uio_clone->uio_segflg = uio->uio_segflg;
+
+ saveheld = curthread_pflags_set(TDP_UIOHELD);
+ prev_td_ma = td->td_ma;
+ prev_td_ma_cnt = td->td_ma_cnt;
+
+ while (uio_clone->uio_resid != 0) {
+ len = uio_clone->uio_iov->iov_len;
+ if (len == 0) {
+ KASSERT(uio_clone->uio_iovcnt >= 1,
+ ("iovcnt underflow"));
+ uio_clone->uio_iov++;
+ uio_clone->uio_iovcnt--;
+ continue;
+ }
+ if (len > io_hold_cnt * PAGE_SIZE)
+ len = io_hold_cnt * PAGE_SIZE;
+ addr = (uintptr_t)uio_clone->uio_iov->iov_base;
+ end = round_page(addr + len);
+ if (end < addr) {
+ error = EFAULT;
+ break;
+ }
+ cnt = atop(end - trunc_page(addr));
+ /*
+ * A perfectly misaligned address and length could cause
+	 * both the start and the end of the chunk to use a partial
+	 * page. The +2 accounts for such a situation.
+ */
+ cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
+ addr, len, prot, ma, io_hold_cnt + 2);
+ if (cnt == -1) {
+ error = EFAULT;
+ break;
+ }
+ short_uio.uio_iov = &short_iovec[0];
+ short_iovec[0].iov_base = (void *)addr;
+ short_uio.uio_iovcnt = 1;
+ short_uio.uio_resid = short_iovec[0].iov_len = len;
+ short_uio.uio_offset = uio_clone->uio_offset;
+ td->td_ma = ma;
+ td->td_ma_cnt = cnt;
+
+ error = vn_io_fault_doio(args, &short_uio, td);
+ vm_page_unhold_pages(ma, cnt);
+ adv = len - short_uio.uio_resid;
+
+ uio_clone->uio_iov->iov_base =
+ (char *)uio_clone->uio_iov->iov_base + adv;
+ uio_clone->uio_iov->iov_len -= adv;
+ uio_clone->uio_resid -= adv;
+ uio_clone->uio_offset += adv;
+
+ uio->uio_resid -= adv;
+ uio->uio_offset += adv;
+
+ if (error != 0 || adv == 0)
+ break;
+ }
+ td->td_ma = prev_td_ma;
+ td->td_ma_cnt = prev_td_ma_cnt;
+ curthread_pflags_restore(saveheld);
+out:
+ free(uio_clone, M_IOV);
+ return (error);
+}
+
+static int
+vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ fo_rdwr_t *doio;
+ struct vnode *vp;
+ void *rl_cookie;
+ struct vn_io_fault_args args;
+ int error;
+
+ doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
+ vp = fp->f_vnode;
+ foffset_lock_uio(fp, uio, flags);
+ if (do_vn_io_fault(vp, uio)) {
+ args.kind = VN_IO_FAULT_FOP;
+ args.args.fop_args.fp = fp;
+ args.args.fop_args.doio = doio;
+ args.cred = active_cred;
+ args.flags = flags | FOF_OFFSET;
+ if (uio->uio_rw == UIO_READ) {
+ rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
+ uio->uio_offset + uio->uio_resid);
+ } else if ((fp->f_flag & O_APPEND) != 0 ||
+ (flags & FOF_OFFSET) == 0) {
+ /* For appenders, punt and lock the whole range. */
+ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+ } else {
+ rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
+ uio->uio_offset + uio->uio_resid);
+ }
+ error = vn_io_fault1(vp, uio, &args, td);
+ vn_rangelock_unlock(vp, rl_cookie);
+ } else {
+ error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+ }
+ foffset_unlock_uio(fp, uio, flags);
+ return (error);
+}
+
+/*
+ * Helper function to perform the requested uiomove operation using
+ * the held pages for the uio->uio_iov[0].iov_base buffer instead of
+ * copyin/copyout. Access to the pages with uiomove_fromphys()
+ * instead of iov_base prevents page faults that could occur due to
+ * pmap_collect() invalidating the mapping created by
+ * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
+ * object cleanup revoking the write access from page mappings.
+ *
+ * Filesystems that specify MNTK_NO_IOPF shall use vn_io_fault_uiomove()
+ * instead of plain uiomove().
+ */
+int
+vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
+{
+ struct uio transp_uio;
+ struct iovec transp_iov[1];
+ struct thread *td;
+ size_t adv;
+ int error, pgadv;
+
+ td = curthread;
+ if ((td->td_pflags & TDP_UIOHELD) == 0 ||
+ uio->uio_segflg != UIO_USERSPACE)
+ return (uiomove(data, xfersize, uio));
+
+ KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
+ transp_iov[0].iov_base = data;
+ transp_uio.uio_iov = &transp_iov[0];
+ transp_uio.uio_iovcnt = 1;
+ if (xfersize > uio->uio_resid)
+ xfersize = uio->uio_resid;
+ transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
+ transp_uio.uio_offset = 0;
+ transp_uio.uio_segflg = UIO_SYSSPACE;
+ /*
+ * Since transp_iov points to data, and td_ma page array
+ * corresponds to original uio->uio_iov, we need to invert the
+ * direction of the i/o operation as passed to
+ * uiomove_fromphys().
+ */
+ switch (uio->uio_rw) {
+ case UIO_WRITE:
+ transp_uio.uio_rw = UIO_READ;
+ break;
+ case UIO_READ:
+ transp_uio.uio_rw = UIO_WRITE;
+ break;
+ }
+ transp_uio.uio_td = uio->uio_td;
+ error = uiomove_fromphys(td->td_ma,
+ ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
+ xfersize, &transp_uio);
+ adv = xfersize - transp_uio.uio_resid;
+ pgadv =
+ (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
+ (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
+ td->td_ma += pgadv;
+ KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
+ pgadv));
+ td->td_ma_cnt -= pgadv;
+ uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
+ uio->uio_iov->iov_len -= adv;
+ uio->uio_resid -= adv;
+ uio->uio_offset += adv;
+ return (error);
+}
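As a point of reference, this is roughly the call-site shape inside a filesystem read loop that participates in the scheme; bp, blkoffset and xfersize are illustrative local variables, not names defined in this file:

	/*
	 * Copy one cached block out to the caller.  When the request was
	 * dispatched through vn_io_fault(), curthread->td_ma holds the
	 * wired user pages and the copy goes through uiomove_fromphys();
	 * otherwise this degenerates to a plain uiomove().
	 */
	error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset,
	    (int)xfersize, uio);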
+
+int
+vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
+ struct uio *uio)
+{
+ struct thread *td;
+ vm_offset_t iov_base;
+ int cnt, pgadv;
+
+ td = curthread;
+ if ((td->td_pflags & TDP_UIOHELD) == 0 ||
+ uio->uio_segflg != UIO_USERSPACE)
+ return (uiomove_fromphys(ma, offset, xfersize, uio));
+
+ KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
+ cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
+ iov_base = (vm_offset_t)uio->uio_iov->iov_base;
+ switch (uio->uio_rw) {
+ case UIO_WRITE:
+ pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
+ offset, cnt);
+ break;
+ case UIO_READ:
+ pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
+ cnt);
+ break;
+ }
+ pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
+ td->td_ma += pgadv;
+ KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
+ pgadv));
+ td->td_ma_cnt -= pgadv;
+ uio->uio_iov->iov_base = (char *)(iov_base + cnt);
+ uio->uio_iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ return (0);
+}
+
+
+/*
+ * File table truncate routine.
+ */
+static int
+vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct vattr vattr;
+ struct mount *mp;
+ struct vnode *vp;
+ void *rl_cookie;
+ int error;
+
+ vp = fp->f_vnode;
+
+ /*
+ * Lock the whole range for truncation. Otherwise split i/o
+ * might happen partly before and partly after the truncation.
+ */
+ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ goto out1;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ if (vp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
+ if (error)
+ goto out;
+#endif
+ error = VOP_ADD_WRITECOUNT(vp, 1);
+ if (error == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = length;
+ if ((fp->f_flag & O_FSYNC) != 0)
+ vattr.va_vaflags |= VA_SYNC;
+ error = VOP_SETATTR(vp, &vattr, fp->f_cred);
+ VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
+ }
+out:
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+out1:
+ vn_rangelock_unlock(vp, rl_cookie);
+ return (error);
+}
+
+/*
+ * File table vnode stat routine.
+ */
+static int
+vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct vnode *vp = fp->f_vnode;
+ int error;
+
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
+ VOP_UNLOCK(vp, 0);
+
+ return (error);
+}
+
+/*
+ * Stat a vnode; implementation for the stat syscall
+ */
+int
+vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred,
+ struct ucred *file_cred, struct thread *td)
+{
+ struct vattr vattr;
+ struct vattr *vap;
+ int error;
+ u_short mode;
+
+ AUDIT_ARG_VNODE1(vp);
+#ifdef MAC
+ error = mac_vnode_check_stat(active_cred, file_cred, vp);
+ if (error)
+ return (error);
+#endif
+
+ vap = &vattr;
+
+ /*
+ * Initialize defaults for new and unusual fields, so that file
+ * systems which don't support these fields don't need to know
+ * about them.
+ */
+ vap->va_birthtime.tv_sec = -1;
+ vap->va_birthtime.tv_nsec = 0;
+ vap->va_fsid = VNOVAL;
+ vap->va_rdev = NODEV;
+
+ error = VOP_GETATTR(vp, vap, active_cred);
+ if (error)
+ return (error);
+
+ /*
+ * Zero the spare stat fields
+ */
+ bzero(sb, sizeof *sb);
+
+ /*
+ * Copy from vattr table
+ */
+ if (vap->va_fsid != VNOVAL)
+ sb->st_dev = vap->va_fsid;
+ else
+ sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
+ sb->st_ino = vap->va_fileid;
+ mode = vap->va_mode;
+ switch (vap->va_type) {
+ case VREG:
+ mode |= S_IFREG;
+ break;
+ case VDIR:
+ mode |= S_IFDIR;
+ break;
+ case VBLK:
+ mode |= S_IFBLK;
+ break;
+ case VCHR:
+ mode |= S_IFCHR;
+ break;
+ case VLNK:
+ mode |= S_IFLNK;
+ break;
+ case VSOCK:
+ mode |= S_IFSOCK;
+ break;
+ case VFIFO:
+ mode |= S_IFIFO;
+ break;
+ default:
+ return (EBADF);
+ }
+ sb->st_mode = mode;
+ sb->st_nlink = vap->va_nlink;
+ sb->st_uid = vap->va_uid;
+ sb->st_gid = vap->va_gid;
+ sb->st_rdev = vap->va_rdev;
+ if (vap->va_size > OFF_MAX)
+ return (EOVERFLOW);
+ sb->st_size = vap->va_size;
+ sb->st_atim = vap->va_atime;
+ sb->st_mtim = vap->va_mtime;
+ sb->st_ctim = vap->va_ctime;
+ sb->st_birthtim = vap->va_birthtime;
+
+ /*
+ * According to www.opengroup.org, the meaning of st_blksize is
+ * "a filesystem-specific preferred I/O block size for this
+ * object. In some filesystem types, this may vary from file
+ * to file"
+	 * Use minimum/default of PAGE_SIZE (e.g. for VCHR).
+ */
+
+ sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
+
+ sb->st_flags = vap->va_flags;
+ if (priv_check(td, PRIV_VFS_GENERATION))
+ sb->st_gen = 0;
+ else
+ sb->st_gen = vap->va_gen;
+
+ sb->st_blocks = vap->va_bytes / S_BLKSIZE;
+ return (0);
+}
+
+/*
+ * File table vnode ioctl routine.
+ */
+static int
+vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct vattr vattr;
+ struct vnode *vp;
+ struct fiobmap2_arg *bmarg;
+ int error;
+
+ vp = fp->f_vnode;
+ switch (vp->v_type) {
+ case VDIR:
+ case VREG:
+ switch (com) {
+ case FIONREAD:
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, active_cred);
+ VOP_UNLOCK(vp, 0);
+ if (error == 0)
+ *(int *)data = vattr.va_size - fp->f_offset;
+ return (error);
+ case FIOBMAP2:
+ bmarg = (struct fiobmap2_arg *)data;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+#ifdef MAC
+ error = mac_vnode_check_read(active_cred, fp->f_cred,
+ vp);
+ if (error == 0)
+#endif
+ error = VOP_BMAP(vp, bmarg->bn, NULL,
+ &bmarg->bn, &bmarg->runp, &bmarg->runb);
+ VOP_UNLOCK(vp, 0);
+ return (error);
+ case FIONBIO:
+ case FIOASYNC:
+ return (0);
+ default:
+ return (VOP_IOCTL(vp, com, data, fp->f_flag,
+ active_cred, td));
+ }
+ break;
+ case VCHR:
+ return (VOP_IOCTL(vp, com, data, fp->f_flag,
+ active_cred, td));
+ default:
+ return (ENOTTY);
+ }
+}
+
+/*
+ * File table vnode poll routine.
+ */
+static int
+vn_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct vnode *vp;
+ int error;
+
+ vp = fp->f_vnode;
+#ifdef MAC
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
+ VOP_UNLOCK(vp, 0);
+ if (!error)
+#endif
+
+ error = VOP_POLL(vp, events, fp->f_cred, td);
+ return (error);
+}
+
+/*
+ * Acquire the requested lock and then check for validity. LK_RETRY
+ * permits vn_lock to return doomed vnodes.
+ */
+int
+_vn_lock(struct vnode *vp, int flags, char *file, int line)
+{
+ int error;
+
+ VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
+ ("vn_lock: no locktype"));
+ VNASSERT(vp->v_holdcnt != 0, vp, ("vn_lock: zero hold count"));
+retry:
+ error = VOP_LOCK1(vp, flags, file, line);
+ flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */
+ KASSERT((flags & LK_RETRY) == 0 || error == 0,
+ ("vn_lock: error %d incompatible with flags %#x", error, flags));
+
+ if ((flags & LK_RETRY) == 0) {
+ if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) {
+ VOP_UNLOCK(vp, 0);
+ error = ENOENT;
+ }
+ } else if (error != 0)
+ goto retry;
+ return (error);
+}
+
+/*
+ * File table vnode close routine.
+ */
+static int
+vn_closefile(struct file *fp, struct thread *td)
+{
+ struct vnode *vp;
+ struct flock lf;
+ int error;
+ bool ref;
+
+ vp = fp->f_vnode;
+ fp->f_ops = &badfileops;
+	ref = (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE;
+
+ error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
+
+ if (__predict_false(ref)) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
+ vrele(vp);
+ }
+ return (error);
+}
+
+static bool
+vn_suspendable(struct mount *mp)
+{
+
+ return (mp->mnt_op->vfs_susp_clean != NULL);
+}
+
+/*
+ * Preparing to start a filesystem write operation. If the operation is
+ * permitted, then we bump the count of operations in progress and
+ * proceed. If a suspend request is in progress, we wait until the
+ * suspension is over, and then proceed.
+ */
+static int
+vn_start_write_locked(struct mount *mp, int flags)
+{
+ int error, mflags;
+
+ mtx_assert(MNT_MTX(mp), MA_OWNED);
+ error = 0;
+
+ /*
+ * Check on status of suspension.
+ */
+ if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
+ mp->mnt_susp_owner != curthread) {
+ mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
+ (flags & PCATCH) : 0) | (PUSER - 1);
+ while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
+ if (flags & V_NOWAIT) {
+ error = EWOULDBLOCK;
+ goto unlock;
+ }
+ error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
+ "suspfs", 0);
+ if (error)
+ goto unlock;
+ }
+ }
+ if (flags & V_XSLEEP)
+ goto unlock;
+ mp->mnt_writeopcount++;
+unlock:
+ if (error != 0 || (flags & V_XSLEEP) != 0)
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ return (error);
+}
+
+int
+vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
+{
+ struct mount *mp;
+ int error;
+
+ KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
+ ("V_MNTREF requires mp"));
+
+ error = 0;
+ /*
+	 * If a vnode is provided, get and return the mount point to
+	 * which it will write.
+ */
+ if (vp != NULL) {
+ if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
+ *mpp = NULL;
+ if (error != EOPNOTSUPP)
+ return (error);
+ return (0);
+ }
+ }
+ if ((mp = *mpp) == NULL)
+ return (0);
+
+ if (!vn_suspendable(mp)) {
+ if (vp != NULL || (flags & V_MNTREF) != 0)
+ vfs_rel(mp);
+ return (0);
+ }
+
+ /*
+ * VOP_GETWRITEMOUNT() returns with the mp refcount held through
+ * a vfs_ref().
+	 * If a vnode was not provided, we need to acquire a
+	 * refcount on the provided mountpoint ourselves in order to
+	 * emulate a vfs_ref().
+ */
+ MNT_ILOCK(mp);
+ if (vp == NULL && (flags & V_MNTREF) == 0)
+ MNT_REF(mp);
+
+ return (vn_start_write_locked(mp, flags));
+}
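The bracket that callers are expected to build around a modifying vnode operation is the same one vn_truncate() uses earlier in this file; condensed here to its skeleton as a usage sketch:

	struct mount *mp;
	int error;

	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* ... the modifying operation, e.g. VOP_SETATTR() ... */
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);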
+
+/*
+ * Secondary suspension. Used by operations such as vop_inactive
+ * routines that are needed by the higher level functions. These
+ * are allowed to proceed until all the higher level functions have
+ * completed (indicated by mnt_writeopcount dropping to zero). At that
+ * time, these operations are halted until the suspension is over.
+ */
+int
+vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
+{
+ struct mount *mp;
+ int error;
+
+ KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
+ ("V_MNTREF requires mp"));
+
+ retry:
+ if (vp != NULL) {
+ if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
+ *mpp = NULL;
+ if (error != EOPNOTSUPP)
+ return (error);
+ return (0);
+ }
+ }
+ /*
+ * If we are not suspended or have not yet reached suspended
+ * mode, then let the operation proceed.
+ */
+ if ((mp = *mpp) == NULL)
+ return (0);
+
+ if (!vn_suspendable(mp)) {
+ if (vp != NULL || (flags & V_MNTREF) != 0)
+ vfs_rel(mp);
+ return (0);
+ }
+
+ /*
+ * VOP_GETWRITEMOUNT() returns with the mp refcount held through
+ * a vfs_ref().
+	 * If a vnode was not provided, we need to acquire a
+	 * refcount on the provided mountpoint ourselves in order to
+	 * emulate a vfs_ref().
+ */
+ MNT_ILOCK(mp);
+ if (vp == NULL && (flags & V_MNTREF) == 0)
+ MNT_REF(mp);
+ if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
+ mp->mnt_secondary_writes++;
+ mp->mnt_secondary_accwrites++;
+ MNT_IUNLOCK(mp);
+ return (0);
+ }
+ if (flags & V_NOWAIT) {
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ return (EWOULDBLOCK);
+ }
+ /*
+ * Wait for the suspension to finish.
+ */
+ error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
+ ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
+ "suspfs", 0);
+ vfs_rel(mp);
+ if (error == 0)
+ goto retry;
+ return (error);
+}
+
+/*
+ * Filesystem write operation has completed. If we are suspending and this
+ * operation is the last one, notify the suspender that the suspension is
+ * now in effect.
+ */
+void
+vn_finished_write(struct mount *mp)
+{
+ if (mp == NULL || !vn_suspendable(mp))
+ return;
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ mp->mnt_writeopcount--;
+ if (mp->mnt_writeopcount < 0)
+ panic("vn_finished_write: neg cnt");
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
+ mp->mnt_writeopcount <= 0)
+ wakeup(&mp->mnt_writeopcount);
+ MNT_IUNLOCK(mp);
+}
+
+
+/*
+ * Filesystem secondary write operation has completed. If we are
+ * suspending and this operation is the last one, notify the suspender
+ * that the suspension is now in effect.
+ */
+void
+vn_finished_secondary_write(struct mount *mp)
+{
+ if (mp == NULL || !vn_suspendable(mp))
+ return;
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ mp->mnt_secondary_writes--;
+ if (mp->mnt_secondary_writes < 0)
+ panic("vn_finished_secondary_write: neg cnt");
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
+ mp->mnt_secondary_writes <= 0)
+ wakeup(&mp->mnt_secondary_writes);
+ MNT_IUNLOCK(mp);
+}
+
+
+
+/*
+ * Request a filesystem to suspend write operations.
+ */
+int
+vfs_write_suspend(struct mount *mp, int flags)
+{
+ int error;
+
+ MPASS(vn_suspendable(mp));
+
+ MNT_ILOCK(mp);
+ if (mp->mnt_susp_owner == curthread) {
+ MNT_IUNLOCK(mp);
+ return (EALREADY);
+ }
+ while (mp->mnt_kern_flag & MNTK_SUSPEND)
+ msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
+
+ /*
+	 * Unmount holds a write reference on the mount point. If we
+	 * own a busy reference and drain for writers, we deadlock with
+	 * the reference draining in the unmount path. Callers of
+	 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if a
+	 * vfs_busy() reference is owned and the caller is not in the
+	 * unmount context.
+ */
+ if ((flags & VS_SKIP_UNMOUNT) != 0 &&
+ (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
+ MNT_IUNLOCK(mp);
+ return (EBUSY);
+ }
+
+ mp->mnt_kern_flag |= MNTK_SUSPEND;
+ mp->mnt_susp_owner = curthread;
+ if (mp->mnt_writeopcount > 0)
+ (void) msleep(&mp->mnt_writeopcount,
+ MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
+ else
+ MNT_IUNLOCK(mp);
+ if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
+ vfs_write_resume(mp, 0);
+ return (error);
+}
+
+/*
+ * Request a filesystem to resume write operations.
+ */
+void
+vfs_write_resume(struct mount *mp, int flags)
+{
+
+ MPASS(vn_suspendable(mp));
+
+ MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
+ KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
+ mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
+ MNTK_SUSPENDED);
+ mp->mnt_susp_owner = NULL;
+ wakeup(&mp->mnt_writeopcount);
+ wakeup(&mp->mnt_flag);
+ curthread->td_pflags &= ~TDP_IGNSUSP;
+ if ((flags & VR_START_WRITE) != 0) {
+ MNT_REF(mp);
+ mp->mnt_writeopcount++;
+ }
+ MNT_IUNLOCK(mp);
+ if ((flags & VR_NO_SUSPCLR) == 0)
+ VFS_SUSP_CLEAN(mp);
+ } else if ((flags & VR_START_WRITE) != 0) {
+ MNT_REF(mp);
+ vn_start_write_locked(mp, 0);
+ } else {
+ MNT_IUNLOCK(mp);
+ }
+}
+
+/*
+ * Helper loop around vfs_write_suspend() for filesystem unmount VFS
+ * methods.
+ */
+int
+vfs_write_suspend_umnt(struct mount *mp)
+{
+ int error;
+
+ MPASS(vn_suspendable(mp));
+ KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
+ ("vfs_write_suspend_umnt: recursed"));
+
+ /* dounmount() already called vn_start_write(). */
+ for (;;) {
+ vn_finished_write(mp);
+ error = vfs_write_suspend(mp, 0);
+ if (error != 0) {
+ vn_start_write(NULL, &mp, V_WAIT);
+ return (error);
+ }
+ MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
+ break;
+ MNT_IUNLOCK(mp);
+ vn_start_write(NULL, &mp, V_WAIT);
+ }
+ mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
+ wakeup(&mp->mnt_flag);
+ MNT_IUNLOCK(mp);
+ curthread->td_pflags |= TDP_IGNSUSP;
+ return (0);
+}
+
+/*
+ * Implement kqueues for files by translating them into vnode operations.
+ */
+static int
+vn_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (VOP_KQFILTER(fp->f_vnode, kn));
+}
+
+/*
+ * Simplified in-kernel wrapper calls for extended attribute access.
+ * These calls pass in a NULL credential, authorizing as "kernel" access.
+ * Set IO_NODELOCKED in ioflg if the vnode is already locked.
+ */
+int
+vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
+ const char *attrname, int *buflen, char *buf, struct thread *td)
+{
+ struct uio auio;
+ struct iovec iov;
+ int error;
+
+ iov.iov_len = *buflen;
+ iov.iov_base = buf;
+
+ auio.uio_iov = &iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_offset = 0;
+ auio.uio_resid = *buflen;
+
+ if ((ioflg & IO_NODELOCKED) == 0)
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+
+ ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
+
+ /* authorize attribute retrieval as kernel */
+ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
+ td);
+
+ if ((ioflg & IO_NODELOCKED) == 0)
+ VOP_UNLOCK(vp, 0);
+
+ if (error == 0) {
+ *buflen = *buflen - auio.uio_resid;
+ }
+
+ return (error);
+}
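A short usage sketch of the wrapper; the attribute name is an arbitrary example, and because IO_NODELOCKED is passed the vnode is assumed to be locked already by the caller:

	char buf[64];
	int buflen, error;

	buflen = sizeof(buf);
	error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
	    "example.attr", &buflen, buf, td);
	if (error == 0) {
		/* buflen now holds the number of bytes actually returned. */
	}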
+
+/*
+ * XXX failure mode if partially written?
+ */
+int
+vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
+ const char *attrname, int buflen, char *buf, struct thread *td)
+{
+ struct uio auio;
+ struct iovec iov;
+ struct mount *mp;
+ int error;
+
+ iov.iov_len = buflen;
+ iov.iov_base = buf;
+
+ auio.uio_iov = &iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_offset = 0;
+ auio.uio_resid = buflen;
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ }
+
+ ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
+
+ /* authorize attribute setting as kernel */
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ vn_finished_write(mp);
+ VOP_UNLOCK(vp, 0);
+ }
+
+ return (error);
+}
+
+int
+vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
+ const char *attrname, struct thread *td)
+{
+ struct mount *mp;
+ int error;
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ }
+
+ ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
+
+ /* authorize attribute removal as kernel */
+ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
+ if (error == EOPNOTSUPP)
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
+ NULL, td);
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ vn_finished_write(mp);
+ VOP_UNLOCK(vp, 0);
+ }
+
+ return (error);
+}
+
+static int
+vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
+ struct vnode **rvp)
+{
+
+ return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
+}
+
+int
+vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
+{
+
+ return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
+ lkflags, rvp));
+}
+
+int
+vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
+ int lkflags, struct vnode **rvp)
+{
+ struct mount *mp;
+ int ltype, error;
+
+ ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
+ mp = vp->v_mount;
+ ltype = VOP_ISLOCKED(vp);
+ KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
+ ("vn_vget_ino: vp not locked"));
+ error = vfs_busy(mp, MBF_NOWAIT);
+ if (error != 0) {
+ vfs_ref(mp);
+ VOP_UNLOCK(vp, 0);
+ error = vfs_busy(mp, 0);
+ vn_lock(vp, ltype | LK_RETRY);
+ vfs_rel(mp);
+ if (error != 0)
+ return (ENOENT);
+ if (vp->v_iflag & VI_DOOMED) {
+ vfs_unbusy(mp);
+ return (ENOENT);
+ }
+ }
+ VOP_UNLOCK(vp, 0);
+ error = alloc(mp, alloc_arg, lkflags, rvp);
+ vfs_unbusy(mp);
+ if (error != 0 || *rvp != vp)
+ vn_lock(vp, ltype | LK_RETRY);
+ if (vp->v_iflag & VI_DOOMED) {
+ if (error == 0) {
+ if (*rvp == vp)
+ vunref(vp);
+ else
+ vput(*rvp);
+ }
+ error = ENOENT;
+ }
+ return (error);
+}
+
+int
+vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
+ struct thread *td)
+{
+
+ if (vp->v_type != VREG || td == NULL)
+ return (0);
+ if ((uoff_t)uio->uio_offset + uio->uio_resid >
+ lim_cur(td, RLIMIT_FSIZE)) {
+ PROC_LOCK(td->td_proc);
+ kern_psignal(td->td_proc, SIGXFSZ);
+ PROC_UNLOCK(td->td_proc);
+ return (EFBIG);
+ }
+ return (0);
+}
+
+int
+vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct vnode *vp;
+
+ vp = fp->f_vnode;
+#ifdef AUDIT
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ VOP_UNLOCK(vp, 0);
+#endif
+ return (setfmode(td, active_cred, vp, mode));
+}
+
+int
+vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct vnode *vp;
+
+ vp = fp->f_vnode;
+#ifdef AUDIT
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ VOP_UNLOCK(vp, 0);
+#endif
+ return (setfown(td, active_cred, vp, uid, gid));
+}
+
+void
+vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
+{
+ vm_object_t object;
+
+ if ((object = vp->v_object) == NULL)
+ return;
+ VM_OBJECT_WLOCK(object);
+ vm_object_page_remove(object, start, end, 0);
+ VM_OBJECT_WUNLOCK(object);
+}
+
+int
+vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
+{
+ struct vattr va;
+ daddr_t bn, bnp;
+ uint64_t bsize;
+ off_t noff;
+ int error;
+
+ KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
+ ("Wrong command %lu", cmd));
+
+ if (vn_lock(vp, LK_SHARED) != 0)
+ return (EBADF);
+ if (vp->v_type != VREG) {
+ error = ENOTTY;
+ goto unlock;
+ }
+ error = VOP_GETATTR(vp, &va, cred);
+ if (error != 0)
+ goto unlock;
+ noff = *off;
+ if (noff >= va.va_size) {
+ error = ENXIO;
+ goto unlock;
+ }
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+ for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize -
+ noff % bsize) {
+ error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
+ if (error == EOPNOTSUPP) {
+ error = ENOTTY;
+ goto unlock;
+ }
+ if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
+ (bnp != -1 && cmd == FIOSEEKDATA)) {
+ noff = bn * bsize;
+ if (noff < *off)
+ noff = *off;
+ goto unlock;
+ }
+ }
+ if (noff > va.va_size)
+ noff = va.va_size;
+ /* noff == va.va_size. There is an implicit hole at the end of file. */
+ if (cmd == FIOSEEKDATA)
+ error = ENXIO;
+unlock:
+ VOP_UNLOCK(vp, 0);
+ if (error == 0)
+ *off = noff;
+ return (error);
+}
+
+int
+vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
+{
+ struct ucred *cred;
+ struct vnode *vp;
+ struct vattr vattr;
+ off_t foffset, size;
+ int error, noneg;
+
+ cred = td->td_ucred;
+ vp = fp->f_vnode;
+ foffset = foffset_lock(fp, 0);
+ noneg = (vp->v_type != VCHR);
+ error = 0;
+ switch (whence) {
+ case L_INCR:
+ if (noneg &&
+ (foffset < 0 ||
+ (offset > 0 && foffset > OFF_MAX - offset))) {
+ error = EOVERFLOW;
+ break;
+ }
+ offset += foffset;
+ break;
+ case L_XTND:
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ if (error)
+ break;
+
+ /*
+ * If the file references a disk device, then fetch
+ * the media size and use that to determine the ending
+ * offset.
+ */
+ if (vattr.va_size == 0 && vp->v_type == VCHR &&
+ fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
+ vattr.va_size = size;
+ if (noneg &&
+ (vattr.va_size > OFF_MAX ||
+ (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
+ error = EOVERFLOW;
+ break;
+ }
+ offset += vattr.va_size;
+ break;
+ case L_SET:
+ break;
+ case SEEK_DATA:
+ error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
+ break;
+ case SEEK_HOLE:
+ error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
+ break;
+ default:
+ error = EINVAL;
+ }
+ if (error == 0 && noneg && offset < 0)
+ error = EINVAL;
+ if (error != 0)
+ goto drop;
+ VFS_KNOTE_UNLOCKED(vp, 0);
+ td->td_uretoff.tdu_off = offset;
+drop:
+ foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
+ return (error);
+}
+
+int
+vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
+ struct thread *td)
+{
+ int error;
+
+ /*
+ * Grant permission if the caller is the owner of the file, or
+	 * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
+	 * the file. If the time pointer is null, then write
+ * permission on the file is also sufficient.
+ *
+ * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
+ * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
+ * will be allowed to set the times [..] to the current
+ * server time.
+ */
+ error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
+ if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
+ error = VOP_ACCESS(vp, VWRITE, cred, td);
+ return (error);
+}
+
+int
+vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
+{
+ struct vnode *vp;
+ int error;
+
+ if (fp->f_type == DTYPE_FIFO)
+ kif->kf_type = KF_TYPE_FIFO;
+ else
+ kif->kf_type = KF_TYPE_VNODE;
+ vp = fp->f_vnode;
+ vref(vp);
+ FILEDESC_SUNLOCK(fdp);
+ error = vn_fill_kinfo_vnode(vp, kif);
+ vrele(vp);
+ FILEDESC_SLOCK(fdp);
+ return (error);
+}
+
+static inline void
+vn_fill_junk(struct kinfo_file *kif)
+{
+ size_t len, olen;
+
+ /*
+ * Simulate vn_fullpath returning changing values for a given
+ * vp during e.g. coredump.
+ */
+ len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
+ olen = strlen(kif->kf_path);
+ if (len < olen)
+ strcpy(&kif->kf_path[len - 1], "$");
+ else
+ for (; olen < len; olen++)
+ strcpy(&kif->kf_path[olen], "A");
+}
+
+int
+vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
+{
+ struct vattr va;
+ char *fullpath, *freepath;
+ int error;
+
+ kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type);
+ freepath = NULL;
+ fullpath = "-";
+ error = vn_fullpath(curthread, vp, &fullpath, &freepath);
+ if (error == 0) {
+ strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
+ }
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+
+ KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
+ vn_fill_junk(kif);
+ );
+
+ /*
+ * Retrieve vnode attributes.
+ */
+ va.va_fsid = VNOVAL;
+ va.va_rdev = NODEV;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &va, curthread->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ if (error != 0)
+ return (error);
+ if (va.va_fsid != VNOVAL)
+ kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
+ else
+ kif->kf_un.kf_file.kf_file_fsid =
+ vp->v_mount->mnt_stat.f_fsid.val[0];
+ kif->kf_un.kf_file.kf_file_fsid_freebsd11 =
+ kif->kf_un.kf_file.kf_file_fsid; /* truncate */
+ kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
+ kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
+ kif->kf_un.kf_file.kf_file_size = va.va_size;
+ kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
+ kif->kf_un.kf_file.kf_file_rdev_freebsd11 =
+ kif->kf_un.kf_file.kf_file_rdev; /* truncate */
+ return (0);
+}
+
+int
+vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
+ vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
+ struct thread *td)
+{
+#ifdef HWPMC_HOOKS
+ struct pmckern_map_in pkm;
+#endif
+ struct mount *mp;
+ struct vnode *vp;
+ vm_object_t object;
+ vm_prot_t maxprot;
+ boolean_t writecounted;
+ int error;
+
+#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
+ defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
+ /*
+ * POSIX shared-memory objects are defined to have
+ * kernel persistence, and are not defined to support
+ * read(2)/write(2) -- or even open(2). Thus, we can
+ * use MAP_ASYNC to trade on-disk coherence for speed.
+ * The shm_open(3) library routine turns on the FPOSIXSHM
+ * flag to request this behavior.
+ */
+ if ((fp->f_flag & FPOSIXSHM) != 0)
+ flags |= MAP_NOSYNC;
+#endif
+ vp = fp->f_vnode;
+
+ /*
+ * Ensure that file and memory protections are
+ * compatible. Note that we only worry about
+ * writability if mapping is shared; in this case,
+ * current and max prot are dictated by the open file.
+ * XXX use the vnode instead? Problem is: what
+ * credentials do we use for determination? What if
+ * proc does a setuid?
+ */
+ mp = vp->v_mount;
+ if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
+ maxprot = VM_PROT_NONE;
+ if ((prot & VM_PROT_EXECUTE) != 0)
+ return (EACCES);
+ } else
+ maxprot = VM_PROT_EXECUTE;
+ if ((fp->f_flag & FREAD) != 0)
+ maxprot |= VM_PROT_READ;
+ else if ((prot & VM_PROT_READ) != 0)
+ return (EACCES);
+
+ /*
+ * If we are sharing potential changes via MAP_SHARED and we
+ * are trying to get write permission although we opened it
+ * without asking for it, bail out.
+ */
+ if ((flags & MAP_SHARED) != 0) {
+ if ((fp->f_flag & FWRITE) != 0)
+ maxprot |= VM_PROT_WRITE;
+ else if ((prot & VM_PROT_WRITE) != 0)
+ return (EACCES);
+ } else {
+ maxprot |= VM_PROT_WRITE;
+ cap_maxprot |= VM_PROT_WRITE;
+ }
+ maxprot &= cap_maxprot;
+
+ /*
+ * For regular files and shared memory, POSIX requires that
+ * the value of foff be a legitimate offset within the data
+ * object. In particular, negative offsets are invalid.
+ * Blocking negative offsets and overflows here avoids
+ * possible wraparound or user-level access into reserved
+ * ranges of the data object later. In contrast, POSIX does
+ * not dictate how offsets are used by device drivers, so in
+ * the case of a device mapping a negative offset is passed
+ * on.
+ */
+ if (
+#ifdef _LP64
+ size > OFF_MAX ||
+#endif
+ foff < 0 || foff > OFF_MAX - size)
+ return (EINVAL);
+
+ writecounted = FALSE;
+ error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
+ &foff, &object, &writecounted);
+ if (error != 0)
+ return (error);
+ error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
+ foff, writecounted, td);
+ if (error != 0) {
+ /*
+ * If this mapping was accounted for in the vnode's
+ * writecount, then undo that now.
+ */
+ if (writecounted)
+ vm_pager_release_writecount(object, 0, size);
+ vm_object_deallocate(object);
+ }
+#ifdef HWPMC_HOOKS
+ /* Inform hwpmc(4) if an executable is being mapped. */
+ if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
+ if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
+ pkm.pm_file = vp;
+ pkm.pm_address = (uintptr_t) *addr;
+ PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm);
+ }
+ }
+#endif
+ return (error);
+}
+
+void
+vn_fsid(struct vnode *vp, struct vattr *va)
+{
+ fsid_t *f;
+
+ f = &vp->v_mount->mnt_stat.f_fsid;
+ va->va_fsid = (uint32_t)f->val[1];
+ va->va_fsid <<= sizeof(f->val[1]) * NBBY;
+ va->va_fsid += (uint32_t)f->val[0];
+}
+
+int
+vn_fsync_buf(struct vnode *vp, int waitfor)
+{
+ struct buf *bp, *nbp;
+ struct bufobj *bo;
+ struct mount *mp;
+ int error, maxretry;
+
+ error = 0;
+ maxretry = 10000; /* large, arbitrarily chosen */
+ mp = NULL;
+ if (vp->v_type == VCHR) {
+ VI_LOCK(vp);
+ mp = vp->v_rdev->si_mountpt;
+ VI_UNLOCK(vp);
+ }
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+loop1:
+ /*
+ * MARK/SCAN initialization to avoid infinite loops.
+ */
+ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
+ bp->b_vflags &= ~BV_SCANNED;
+ bp->b_error = 0;
+ }
+
+ /*
+ * Flush all dirty buffers associated with a vnode.
+ */
+loop2:
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ if ((bp->b_vflags & BV_SCANNED) != 0)
+ continue;
+ bp->b_vflags |= BV_SCANNED;
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
+ if (waitfor != MNT_WAIT)
+ continue;
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
+ BO_LOCKPTR(bo)) != 0) {
+ BO_LOCK(bo);
+ goto loop1;
+ }
+ BO_LOCK(bo);
+ }
+ BO_UNLOCK(bo);
+ KASSERT(bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p",
+ bp, bp->b_bufobj, bo));
+ if ((bp->b_flags & B_DELWRI) == 0)
+ panic("fsync: not dirty");
+ if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
+ vfs_bio_awrite(bp);
+ } else {
+ bremfree(bp);
+ bawrite(bp);
+ }
+ if (maxretry < 1000)
+ pause("dirty", hz < 1000 ? 1 : hz / 1000);
+ BO_LOCK(bo);
+ goto loop2;
+ }
+
+ /*
+ * If synchronous the caller expects us to completely resolve all
+ * dirty buffers in the system. Wait for in-progress I/O to
+ * complete (which could include background bitmap writes), then
+ * retry if dirty blocks still exist.
+ */
+ if (waitfor == MNT_WAIT) {
+ bufobj_wwait(bo, 0, 0);
+ if (bo->bo_dirty.bv_cnt > 0) {
+ /*
+ * If we are unable to write any of these buffers
+ * then we fail now rather than trying endlessly
+ * to write them out.
+ */
+ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
+ if ((error = bp->b_error) != 0)
+ break;
+ if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
+ (error == 0 && --maxretry >= 0))
+ goto loop1;
+ if (error == 0)
+ error = EAGAIN;
+ }
+ }
+ BO_UNLOCK(bo);
+ if (error != 0)
+ vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);
+
+ return (error);
+}
diff --git a/freebsd/sys/sys/bio.h b/freebsd/sys/sys/bio.h
new file mode 100644
index 00000000..1dab6155
--- /dev/null
+++ b/freebsd/sys/sys/bio.h
@@ -0,0 +1,184 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)buf.h 8.9 (Berkeley) 3/30/95
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_BIO_H_
+#define _SYS_BIO_H_
+
+#include <sys/queue.h>
+#include <sys/disk_zone.h>
+
+/* bio_cmd */
+#define BIO_READ 0x01 /* Read I/O data */
+#define BIO_WRITE 0x02 /* Write I/O data */
+#define BIO_DELETE 0x03 /* TRIM or free blocks, i.e. mark as unused */
+#define BIO_GETATTR 0x04 /* Get GEOM attributes of object */
+#define BIO_FLUSH 0x05 /* Commit outstanding I/O now */
+#define BIO_CMD0 0x06 /* Available for local hacks */
+#define BIO_CMD1 0x07 /* Available for local hacks */
+#define BIO_CMD2 0x08 /* Available for local hacks */
+#define BIO_ZONE 0x09 /* Zone command */
+
+/* bio_flags */
+#define BIO_ERROR 0x01 /* An error occurred processing this bio. */
+#define BIO_DONE 0x02 /* This bio is finished. */
+#define BIO_ONQUEUE 0x04 /* This bio is in a queue & not yet taken. */
+/*
+ * This bio must be executed after all previous bios in the queue have been
+ * executed, and before any successive bios can be executed.
+ */
+#define BIO_ORDERED 0x08
+#define BIO_UNMAPPED 0x10
+#define BIO_TRANSIENT_MAPPING 0x20
+#define BIO_VLIST 0x40
+
+#ifdef _KERNEL
+struct disk;
+struct bio;
+struct vm_map;
+
+/* Empty classifier tag, to prevent further classification. */
+#define BIO_NOTCLASSIFIED (void *)(~0UL)
+
+typedef void bio_task_t(void *);
+
+/*
+ * The bio structure describes an I/O operation in the kernel.
+ */
+struct bio {
+ uint16_t bio_cmd; /* I/O operation. */
+ uint16_t bio_flags; /* General flags. */
+ uint16_t bio_cflags; /* Private use by the consumer. */
+ uint16_t bio_pflags; /* Private use by the provider. */
+ struct cdev *bio_dev; /* Device to do I/O on. */
+ struct disk *bio_disk; /* Valid below geom_disk.c only */
+ off_t bio_offset; /* Offset into file. */
+ long bio_bcount; /* Valid bytes in buffer. */
+ caddr_t bio_data; /* Memory, superblocks, indirect etc. */
+ struct vm_page **bio_ma; /* Or unmapped. */
+ int bio_ma_offset; /* Offset in the first page of bio_ma. */
+ int bio_ma_n; /* Number of pages in bio_ma. */
+ int bio_error; /* Errno for BIO_ERROR. */
+ long bio_resid; /* Remaining I/O in bytes. */
+ void (*bio_done)(struct bio *);
+ void *bio_driver1; /* Private use by the provider. */
+ void *bio_driver2; /* Private use by the provider. */
+ void *bio_caller1; /* Private use by the consumer. */
+ void *bio_caller2; /* Private use by the consumer. */
+ TAILQ_ENTRY(bio) bio_queue; /* Disksort queue. */
+ const char *bio_attribute; /* Attribute for BIO_[GS]ETATTR */
+ struct disk_zone_args bio_zone;/* Used for BIO_ZONE */
+ struct g_consumer *bio_from; /* GEOM linkage */
+ struct g_provider *bio_to; /* GEOM linkage */
+ off_t bio_length; /* Like bio_bcount */
+ off_t bio_completed; /* Inverse of bio_resid */
+ u_int bio_children; /* Number of spawned bios */
+ u_int bio_inbed; /* Children safely home by now */
+ struct bio *bio_parent; /* Pointer to parent */
+ struct bintime bio_t0; /* Time request started */
+
+ bio_task_t *bio_task; /* Task_queue handler */
+ void *bio_task_arg; /* Argument to above */
+
+ void *bio_classifier1; /* Classifier tag. */
+ void *bio_classifier2; /* Classifier tag. */
+
+#ifdef DIAGNOSTIC
+ void *_bio_caller1;
+ void *_bio_caller2;
+ uint8_t _bio_cflags;
+#endif
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+ struct buf *bio_track_bp; /* Parent buf for tracking */
+#endif
+
+ /* XXX: these go away when bio chaining is introduced */
+ daddr_t bio_pblkno; /* physical block number */
+};
+
+struct uio;
+struct devstat;
+
+struct bio_queue_head {
+ TAILQ_HEAD(bio_queue, bio) queue;
+ off_t last_offset;
+ struct bio *insert_point;
+ int total;
+ int batched;
+};
+
+extern struct vm_map *bio_transient_map;
+extern int bio_transient_maxcnt;
+
+void biodone(struct bio *bp);
+void biofinish(struct bio *bp, struct devstat *stat, int error);
+int biowait(struct bio *bp, const char *wchan);
+
+#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING)
+void biotrack_buf(struct bio *bp, const char *location);
+
+static __inline void
+biotrack(struct bio *bp, const char *location)
+{
+
+ if (bp->bio_track_bp != NULL)
+ biotrack_buf(bp, location);
+}
+#else
+static __inline void
+biotrack(struct bio *bp __unused, const char *location __unused)
+{
+}
+#endif
+
+void bioq_disksort(struct bio_queue_head *ap, struct bio *bp);
+struct bio *bioq_first(struct bio_queue_head *head);
+struct bio *bioq_takefirst(struct bio_queue_head *head);
+void bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error);
+void bioq_init(struct bio_queue_head *head);
+void bioq_insert_head(struct bio_queue_head *head, struct bio *bp);
+void bioq_insert_tail(struct bio_queue_head *head, struct bio *bp);
+void bioq_remove(struct bio_queue_head *head, struct bio *bp);
+
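A hedged sketch of how a disk driver's strategy path and worker thread typically drive this queue API; the softc fields and the queue mutex are illustrative assumptions, not an interface defined in this header:

	/* Strategy side: sort the request into the per-device queue. */
	mtx_lock(&sc->queue_mtx);
	bioq_disksort(&sc->bio_queue, bp);
	mtx_unlock(&sc->queue_mtx);
	wakeup(sc);

	/* Worker side: pull the next request, service it, complete it. */
	bp = bioq_takefirst(&sc->bio_queue);
	if (bp != NULL) {
		/* ... perform the transfer described by bp ... */
		biodone(bp);
	}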
+int physio(struct cdev *dev, struct uio *uio, int ioflag);
+#define physread physio
+#define physwrite physio
+
+#endif /* _KERNEL */
+
+#endif /* !_SYS_BIO_H_ */
diff --git a/freebsd/sys/sys/namei.h b/freebsd/sys/sys/namei.h
new file mode 100644
index 00000000..53814117
--- /dev/null
+++ b/freebsd/sys/sys/namei.h
@@ -0,0 +1,226 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1985, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)namei.h 8.5 (Berkeley) 1/9/95
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_NAMEI_H_
+#define _SYS_NAMEI_H_
+
+#include <sys/caprights.h>
+#include <sys/filedesc.h>
+#include <sys/queue.h>
+#include <sys/_uio.h>
+
+struct componentname {
+ /*
+ * Arguments to lookup.
+ */
+ u_long cn_nameiop; /* namei operation */
+ u_int64_t cn_flags; /* flags to namei */
+ struct thread *cn_thread;/* thread requesting lookup */
+ struct ucred *cn_cred; /* credentials */
+ int cn_lkflags; /* Lock flags LK_EXCLUSIVE or LK_SHARED */
+ /*
+ * Shared between lookup and commit routines.
+ */
+ char *cn_pnbuf; /* pathname buffer */
+ char *cn_nameptr; /* pointer to looked up name */
+ long cn_namelen; /* length of looked up component */
+};
+
+struct nameicap_tracker;
+TAILQ_HEAD(nameicap_tracker_head, nameicap_tracker);
+
+/*
+ * Encapsulation of namei parameters.
+ */
+struct nameidata {
+ /*
+ * Arguments to namei/lookup.
+ */
+ const char *ni_dirp; /* pathname pointer */
+ enum uio_seg ni_segflg; /* location of pathname */
+ cap_rights_t ni_rightsneeded; /* rights required to look up vnode */
+ /*
+ * Arguments to lookup.
+ */
+ struct vnode *ni_startdir; /* starting directory */
+ struct vnode *ni_rootdir; /* logical root directory */
+ struct vnode *ni_topdir; /* logical top directory */
+ int ni_dirfd; /* starting directory for *at functions */
+ int ni_lcf; /* local call flags */
+ /*
+ * Results: returned from namei
+ */
+ struct filecaps ni_filecaps; /* rights the *at base has */
+ /*
+ * Results: returned from/manipulated by lookup
+ */
+ struct vnode *ni_vp; /* vnode of result */
+ struct vnode *ni_dvp; /* vnode of intermediate directory */
+ /*
+ * Results: flags returned from namei
+ */
+ u_int ni_resflags;
+ /*
+ * Shared between namei and lookup/commit routines.
+ */
+ size_t ni_pathlen; /* remaining chars in path */
+ char *ni_next; /* next location in pathname */
+ u_int ni_loopcnt; /* count of symlinks encountered */
+ /*
+ * Lookup parameters: this structure describes the subset of
+ * information from the nameidata structure that is passed
+ * through the VOP interface.
+ */
+ struct componentname ni_cnd;
+ struct nameicap_tracker_head ni_cap_tracker;
+};
+
+#ifdef _KERNEL
+/*
+ * namei operations
+ */
+#define LOOKUP 0 /* perform name lookup only */
+#define CREATE 1 /* setup for file creation */
+#define DELETE 2 /* setup for file deletion */
+#define RENAME 3 /* setup for file renaming */
+#define OPMASK 3 /* mask for operation */
+/*
+ * namei operational modifier flags, stored in ni_cnd.flags
+ */
+#define LOCKLEAF 0x0004 /* lock vnode on return */
+#define LOCKPARENT 0x0008 /* want parent vnode returned locked */
+#define WANTPARENT 0x0010 /* want parent vnode returned unlocked */
+#define NOCACHE 0x0020 /* name must not be left in cache */
+#define FOLLOW 0x0040 /* follow symbolic links */
+#define LOCKSHARED 0x0100 /* Shared lock leaf */
+#define NOFOLLOW 0x0000 /* do not follow symbolic links (pseudo) */
+#define MODMASK 0x01fc /* mask of operational modifiers */
+/*
+ * Namei parameter descriptors.
+ *
+ * SAVENAME may be set by either the callers of namei or by VOP_LOOKUP.
+ * If the caller of namei sets the flag (for example execve wants to
+ * know the name of the program that is being executed), then it must
+ * free the buffer. If VOP_LOOKUP sets the flag, then the buffer must
+ * be freed by either the commit routine or the VOP_ABORT routine.
+ * SAVESTART is set only by the callers of namei. It implies SAVENAME
+ * plus the addition of saving the parent directory that contains the
+ * name in ni_startdir. It allows repeated calls to lookup for the
+ * name being sought. The caller is responsible for releasing the
+ * buffer and for vrele'ing ni_startdir.
+ */
+#define RDONLY 0x00000200 /* lookup with read-only semantics */
+#define HASBUF 0x00000400 /* has allocated pathname buffer */
+#define SAVENAME 0x00000800 /* save pathname buffer */
+#define SAVESTART 0x00001000 /* save starting directory */
+#define ISDOTDOT 0x00002000 /* current component name is .. */
+#define MAKEENTRY 0x00004000 /* entry is to be added to name cache */
+#define ISLASTCN 0x00008000 /* this is last component of pathname */
+#define ISSYMLINK 0x00010000 /* symlink needs interpretation */
+#define ISWHITEOUT 0x00020000 /* found whiteout */
+#define DOWHITEOUT 0x00040000 /* do whiteouts */
+#define WILLBEDIR 0x00080000 /* new files will be dirs; allow trailing / */
+#define ISUNICODE 0x00100000 /* current component name is unicode*/
+#define ISOPEN 0x00200000 /* caller is opening; return a real vnode. */
+#define NOCROSSMOUNT 0x00400000 /* do not cross mount points */
+#define NOMACCHECK 0x00800000 /* do not perform MAC checks */
+#define AUDITVNODE1 0x04000000 /* audit the looked up vnode information */
+#define AUDITVNODE2 0x08000000 /* audit the looked up vnode information */
+#define TRAILINGSLASH 0x10000000 /* path ended in a slash */
+#define NOCAPCHECK 0x20000000 /* do not perform capability checks */
+#define NOEXECCHECK 0x40000000 /* do not perform exec check on dir */
+#define PARAMASK 0x7ffffe00 /* mask of parameter descriptors */
+
+/*
+ * Namei results flags
+ */
+#define NIRES_ABS 0x00000001 /* Path was absolute */
+
+/*
+ * Flags in ni_lcf, valid for the duration of the namei call.
+ */
+#define NI_LCF_STRICTRELATIVE 0x0001 /* relative lookup only */
+#define NI_LCF_CAP_DOTDOT 0x0002 /* ".." in strictrelative case */
+
+/*
+ * Initialization of a nameidata structure.
+ */
+#define NDINIT(ndp, op, flags, segflg, namep, td) \
+ NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, NULL, 0, td)
+#define NDINIT_AT(ndp, op, flags, segflg, namep, dirfd, td) \
+ NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, 0, td)
+#define NDINIT_ATRIGHTS(ndp, op, flags, segflg, namep, dirfd, rightsp, td) \
+ NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, rightsp, td)
+#define NDINIT_ATVP(ndp, op, flags, segflg, namep, vp, td) \
+ NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, vp, 0, td)
+
+void NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags,
+ enum uio_seg segflg, const char *namep, int dirfd, struct vnode *startdir,
+ cap_rights_t *rightsp, struct thread *td);
+
+#define NDF_NO_DVP_RELE 0x00000001
+#define NDF_NO_DVP_UNLOCK 0x00000002
+#define NDF_NO_DVP_PUT 0x00000003
+#define NDF_NO_VP_RELE 0x00000004
+#define NDF_NO_VP_UNLOCK 0x00000008
+#define NDF_NO_VP_PUT 0x0000000c
+#define NDF_NO_STARTDIR_RELE 0x00000010
+#define NDF_NO_FREE_PNBUF 0x00000020
+#define NDF_ONLY_PNBUF (~NDF_NO_FREE_PNBUF)
+
+void NDFREE(struct nameidata *, const u_int);
+
+int namei(struct nameidata *ndp);
+int lookup(struct nameidata *ndp);
+int relookup(struct vnode *dvp, struct vnode **vpp,
+ struct componentname *cnp);
+#endif
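A condensed sketch of the usual consumer pattern for this interface; the path string is an arbitrary example and td is the calling thread:

	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/etc/motd", td);
	error = namei(&nd);
	if (error != 0)
		return (error);
	/* Drop the pathname buffer; keep the locked, referenced vnode. */
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/* ... operate on nd.ni_vp ... */
	vput(nd.ni_vp);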
+
+/*
+ * Stats on usefulness of namei caches.
+ */
+struct nchstats {
+ long ncs_goodhits; /* hits that we can really use */
+ long ncs_neghits; /* negative hits that we can use */
+ long ncs_badhits; /* hits we must drop */
+ long ncs_falsehits; /* hits with id mismatch */
+ long ncs_miss; /* misses */
+ long ncs_long; /* long names that ignore cache */
+ long ncs_pass2; /* names found with passes == 2 */
+ long ncs_2passes; /* number of times we attempt it */
+};
+
+extern struct nchstats nchstats;
+
+#endif /* !_SYS_NAMEI_H_ */
diff --git a/freebsd/sys/sys/pctrie.h b/freebsd/sys/sys/pctrie.h
new file mode 100644
index 00000000..88d5d258
--- /dev/null
+++ b/freebsd/sys/sys/pctrie.h
@@ -0,0 +1,152 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 EMC Corp.
+ * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_PCTRIE_H_
+#define _SYS_PCTRIE_H_
+
+#include <sys/_pctrie.h>
+
+#ifdef _KERNEL
+
+#define PCTRIE_DEFINE(name, type, field, allocfn, freefn) \
+ \
+CTASSERT(sizeof(((struct type *)0)->field) == sizeof(uint64_t)); \
+/* \
+ * XXX This assert protects flag bits; it does not enforce natural \
+ * alignment. 32-bit architectures do not naturally align 64-bit fields. \
+ */ \
+CTASSERT((__offsetof(struct type, field) & (sizeof(uint32_t) - 1)) == 0); \
+ \
+static __inline struct type * \
+name##_PCTRIE_VAL2PTR(uint64_t *val) \
+{ \
+ \
+ if (val == NULL) \
+ return (NULL); \
+ return (struct type *) \
+ ((uintptr_t)val - __offsetof(struct type, field)); \
+} \
+ \
+static __inline uint64_t * \
+name##_PCTRIE_PTR2VAL(struct type *ptr) \
+{ \
+ \
+ return &ptr->field; \
+} \
+ \
+static __inline int \
+name##_PCTRIE_INSERT(struct pctrie *ptree, struct type *ptr) \
+{ \
+ \
+ return pctrie_insert(ptree, name##_PCTRIE_PTR2VAL(ptr), \
+ allocfn); \
+} \
+ \
+static __inline struct type * \
+name##_PCTRIE_LOOKUP(struct pctrie *ptree, uint64_t key) \
+{ \
+ \
+ return name##_PCTRIE_VAL2PTR(pctrie_lookup(ptree, key)); \
+} \
+ \
+static __inline __unused struct type * \
+name##_PCTRIE_LOOKUP_LE(struct pctrie *ptree, uint64_t key) \
+{ \
+ \
+ return name##_PCTRIE_VAL2PTR(pctrie_lookup_le(ptree, key)); \
+} \
+ \
+static __inline __unused struct type * \
+name##_PCTRIE_LOOKUP_GE(struct pctrie *ptree, uint64_t key) \
+{ \
+ \
+ return name##_PCTRIE_VAL2PTR(pctrie_lookup_ge(ptree, key)); \
+} \
+ \
+static __inline __unused void \
+name##_PCTRIE_RECLAIM(struct pctrie *ptree) \
+{ \
+ \
+ pctrie_reclaim_allnodes(ptree, freefn); \
+} \
+ \
+static __inline void \
+name##_PCTRIE_REMOVE(struct pctrie *ptree, uint64_t key) \
+{ \
+ \
+ pctrie_remove(ptree, key, freefn); \
+}
+
+typedef void *(*pctrie_alloc_t)(struct pctrie *ptree);
+typedef void (*pctrie_free_t)(struct pctrie *ptree, void *node);
+
+int pctrie_insert(struct pctrie *ptree, uint64_t *val,
+ pctrie_alloc_t allocfn);
+uint64_t *pctrie_lookup(struct pctrie *ptree, uint64_t key);
+uint64_t *pctrie_lookup_ge(struct pctrie *ptree, uint64_t key);
+uint64_t *pctrie_lookup_le(struct pctrie *ptree, uint64_t key);
+void pctrie_reclaim_allnodes(struct pctrie *ptree,
+ pctrie_free_t freefn);
+void pctrie_remove(struct pctrie *ptree, uint64_t key,
+ pctrie_free_t freefn);
+size_t pctrie_node_size(void);
+int pctrie_zone_init(void *mem, int size, int flags);
+
+static __inline void
+pctrie_init(struct pctrie *ptree)
+{
+
+ ptree->pt_root = 0;
+}
+
+static __inline boolean_t
+pctrie_is_empty(struct pctrie *ptree)
+{
+
+ return (ptree->pt_root == 0);
+}
+
+/*
+ * These widths should allow the pointers to a node's children to fit within
+ * a single cache line. The extra levels from a narrow width should not be
+ * a problem thanks to path compression.
+ */
+#ifdef __LP64__
+#define PCTRIE_WIDTH 4
+#else
+#define PCTRIE_WIDTH 3
+#endif
+
+#define PCTRIE_COUNT (1 << PCTRIE_WIDTH)
+
+#endif /* _KERNEL */
+#endif /* !_SYS_PCTRIE_H_ */
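PCTRIE_DEFINE() above stamps out typed wrappers over the generic pctrie routines, keyed on a uint64_t field embedded in the caller's record. A minimal sketch of an instantiation; the record type, callbacks, and zone name are illustrative rather than taken from the import:

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/pctrie.h>
#include <vm/uma.h>

struct foo {
	uint64_t	f_key;		/* key field must be 64 bits wide */
	int		f_payload;
};

static uma_zone_t foo_node_zone;	/* backs the trie's internal nodes */

static void *
foo_trie_alloc(struct pctrie *ptree)
{

	return (uma_zalloc(foo_node_zone, M_NOWAIT));
}

static void
foo_trie_free(struct pctrie *ptree, void *node)
{

	uma_zfree(foo_node_zone, node);
}

/* Generates FOO_PCTRIE_INSERT(), FOO_PCTRIE_LOOKUP(), FOO_PCTRIE_REMOVE(), ... */
PCTRIE_DEFINE(FOO, foo, f_key, foo_trie_alloc, foo_trie_free);

The node zone would typically be created with uma_zcreate() using pctrie_node_size() and pctrie_zone_init(), and the trie head set up with pctrie_init(), mirroring how the buffer cache arranges its logical-block trie.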
diff --git a/freebsd/sys/sys/syscallsubr.h b/freebsd/sys/sys/syscallsubr.h
new file mode 100644
index 00000000..677afdd6
--- /dev/null
+++ b/freebsd/sys/sys/syscallsubr.h
@@ -0,0 +1,317 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2002 Ian Dowse. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_SYSCALLSUBR_H_
+#define _SYS_SYSCALLSUBR_H_
+
+#include <sys/signal.h>
+#include <sys/socket.h>
+#include <sys/mac.h>
+#include <sys/mount.h>
+#include <sys/_cpuset.h>
+#include <sys/_domainset.h>
+#include <sys/_uio.h>
+
+struct __wrusage;
+struct file;
+struct filecaps;
+enum idtype;
+struct itimerval;
+struct image_args;
+struct jail;
+struct kevent;
+struct kevent_copyops;
+struct kld_file_stat;
+struct ksiginfo;
+struct mbuf;
+struct msghdr;
+struct msqid_ds;
+struct pollfd;
+struct ogetdirentries_args;
+struct rlimit;
+struct rusage;
+struct sched_param;
+union semun;
+struct sockaddr;
+struct stat;
+struct thr_param;
+struct uio;
+
+typedef int (*mmap_check_fp_fn)(struct file *, int, int, int);
+
+int kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg,
+ size_t buflen, size_t path_max);
+int kern_accept(struct thread *td, int s, struct sockaddr **name,
+ socklen_t *namelen, struct file **fp);
+int kern_accept4(struct thread *td, int s, struct sockaddr **name,
+ socklen_t *namelen, int flags, struct file **fp);
+int kern_accessat(struct thread *td, int fd, char *path,
+ enum uio_seg pathseg, int flags, int mode);
+int kern_adjtime(struct thread *td, struct timeval *delta,
+ struct timeval *olddelta);
+int kern_alternate_path(struct thread *td, const char *prefix, const char *path,
+ enum uio_seg pathseg, char **pathbuf, int create, int dirfd);
+int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa);
+int kern_break(struct thread *td, uintptr_t *addr);
+int kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds,
+ size_t ncmds);
+int kern_cap_rights_limit(struct thread *td, int fd, cap_rights_t *rights);
+int kern_chdir(struct thread *td, char *path, enum uio_seg pathseg);
+int kern_clock_getcpuclockid2(struct thread *td, id_t id, int which,
+ clockid_t *clk_id);
+int kern_clock_getres(struct thread *td, clockid_t clock_id,
+ struct timespec *ts);
+int kern_clock_gettime(struct thread *td, clockid_t clock_id,
+ struct timespec *ats);
+int kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags,
+ const struct timespec *rqtp, struct timespec *rmtp);
+int kern_clock_settime(struct thread *td, clockid_t clock_id,
+ struct timespec *ats);
+int kern_close(struct thread *td, int fd);
+int kern_connectat(struct thread *td, int dirfd, int fd,
+ struct sockaddr *sa);
+int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level,
+ cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp);
+int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level,
+ cpuwhich_t which, id_t id, size_t cpusetsize,
+ const cpuset_t *maskp);
+int kern_cpuset_getdomain(struct thread *td, cpulevel_t level,
+ cpuwhich_t which, id_t id, size_t domainsetsize,
+ domainset_t *maskp, int *policyp);
+int kern_cpuset_setdomain(struct thread *td, cpulevel_t level,
+ cpuwhich_t which, id_t id, size_t domainsetsize,
+ const domainset_t *maskp, int policy);
+int kern_cpuset_getid(struct thread *td, cpulevel_t level,
+ cpuwhich_t which, id_t id, cpusetid_t *setid);
+int kern_cpuset_setid(struct thread *td, cpuwhich_t which,
+ id_t id, cpusetid_t setid);
+int kern_dup(struct thread *td, u_int mode, int flags, int old, int new);
+int kern_execve(struct thread *td, struct image_args *args,
+ struct mac *mac_p);
+int kern_fchmodat(struct thread *td, int fd, char *path,
+ enum uio_seg pathseg, mode_t mode, int flag);
+int kern_fchownat(struct thread *td, int fd, char *path,
+ enum uio_seg pathseg, int uid, int gid, int flag);
+int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg);
+int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg);
+int kern_fhstat(struct thread *td, fhandle_t fh, struct stat *buf);
+int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf);
+int kern_fpathconf(struct thread *td, int fd, int name, long *valuep);
+int kern_fstat(struct thread *td, int fd, struct stat *sbp);
+int kern_fstatfs(struct thread *td, int fd, struct statfs *buf);
+int kern_fsync(struct thread *td, int fd, bool fullsync);
+int kern_ftruncate(struct thread *td, int fd, off_t length);
+int kern_futimes(struct thread *td, int fd, struct timeval *tptr,
+ enum uio_seg tptrseg);
+int kern_futimens(struct thread *td, int fd, struct timespec *tptr,
+ enum uio_seg tptrseg);
+int kern_getdirentries(struct thread *td, int fd, char *buf, size_t count,
+ off_t *basep, ssize_t *residp, enum uio_seg bufseg);
+int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
+ size_t *countp, enum uio_seg bufseg, int mode);
+int kern_getitimer(struct thread *, u_int, struct itimerval *);
+int kern_getppid(struct thread *);
+int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
+ socklen_t *alen);
+int kern_getrusage(struct thread *td, int who, struct rusage *rup);
+int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
+ socklen_t *alen);
+int kern_getsockopt(struct thread *td, int s, int level, int name,
+ void *optval, enum uio_seg valseg, socklen_t *valsize);
+int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data);
+int kern_jail(struct thread *td, struct jail *j);
+int kern_jail_get(struct thread *td, struct uio *options, int flags);
+int kern_jail_set(struct thread *td, struct uio *options, int flags);
+int kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
+ struct kevent_copyops *k_ops, const struct timespec *timeout);
+int kern_kevent_anonymous(struct thread *td, int nevents,
+ struct kevent_copyops *k_ops);
+int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges,
+ int nevents, struct kevent_copyops *k_ops,
+ const struct timespec *timeout);
+int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps);
+int kern_kldload(struct thread *td, const char *file, int *fileid);
+int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat);
+int kern_kldunload(struct thread *td, int fileid, int flags);
+int kern_linkat(struct thread *td, int fd1, int fd2, char *path1,
+ char *path2, enum uio_seg segflg, int follow);
+int kern_listen(struct thread *td, int s, int backlog);
+int kern_lseek(struct thread *td, int fd, off_t offset, int whence);
+int kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
+ struct timeval *tptr, enum uio_seg tptrseg);
+int kern_madvise(struct thread *td, uintptr_t addr, size_t len, int behav);
+int kern_mincore(struct thread *td, uintptr_t addr, size_t len, char *vec);
+int kern_mkdirat(struct thread *td, int fd, char *path,
+ enum uio_seg segflg, int mode);
+int kern_mkfifoat(struct thread *td, int fd, char *path,
+ enum uio_seg pathseg, int mode);
+int kern_mknodat(struct thread *td, int fd, char *path,
+ enum uio_seg pathseg, int mode, dev_t dev);
+int kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr,
+ size_t len);
+int kern_mmap(struct thread *td, uintptr_t addr, size_t size, int prot,
+ int flags, int fd, off_t pos);
+int kern_mmap_fpcheck(struct thread *td, uintptr_t addr, size_t len,
+ int prot, int flags, int fd, off_t pos,
+ mmap_check_fp_fn check_fp_fn);
+int kern_mprotect(struct thread *td, uintptr_t addr, size_t size, int prot);
+int kern_msgctl(struct thread *, int, int, struct msqid_ds *);
+int kern_msgrcv(struct thread *, int, void *, size_t, long, int, long *);
+int kern_msgsnd(struct thread *, int, const void *, size_t, int, long);
+int kern_msync(struct thread *td, uintptr_t addr, size_t size, int flags);
+int kern_munlock(struct thread *td, uintptr_t addr, size_t size);
+int kern_munmap(struct thread *td, uintptr_t addr, size_t size);
+int kern_nanosleep(struct thread *td, struct timespec *rqt,
+ struct timespec *rmt);
+int kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
+ long *ploff);
+int kern_openat(struct thread *td, int fd, char *path,
+ enum uio_seg pathseg, int flags, int mode);
+int kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg,
+ int name, u_long flags, long *valuep);
+int kern_pipe(struct thread *td, int fildes[2], int flags,
+ struct filecaps *fcaps1, struct filecaps *fcaps2);
+int kern_poll(struct thread *td, struct pollfd *fds, u_int nfds,
+ struct timespec *tsp, sigset_t *uset);
+int kern_posix_error(struct thread *td, int error);
+int kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
+ int advice);
+int kern_posix_fallocate(struct thread *td, int fd, off_t offset,
+ off_t len);
+int kern_procctl(struct thread *td, enum idtype idtype, id_t id, int com,
+ void *data);
+int kern_pread(struct thread *td, int fd, void *buf, size_t nbyte,
+ off_t offset);
+int kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset);
+int kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou,
+ fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits);
+int kern_ptrace(struct thread *td, int req, pid_t pid, void *addr,
+ int data);
+int kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte,
+ off_t offset);
+int kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset);
+int kern_readlinkat(struct thread *td, int fd, char *path,
+ enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count);
+int kern_readv(struct thread *td, int fd, struct uio *auio);
+int kern_recvit(struct thread *td, int s, struct msghdr *mp,
+ enum uio_seg fromseg, struct mbuf **controlp);
+int kern_renameat(struct thread *td, int oldfd, char *old, int newfd,
+ char *new, enum uio_seg pathseg);
+int kern_rmdirat(struct thread *td, int fd, char *path,
+ enum uio_seg pathseg);
+int kern_sched_getparam(struct thread *td, struct thread *targettd,
+ struct sched_param *param);
+int kern_sched_getscheduler(struct thread *td, struct thread *targettd,
+ int *policy);
+int kern_sched_setparam(struct thread *td, struct thread *targettd,
+ struct sched_param *param);
+int kern_sched_setscheduler(struct thread *td, struct thread *targettd,
+ int policy, struct sched_param *param);
+int kern_sched_rr_get_interval(struct thread *td, pid_t pid,
+ struct timespec *ts);
+int kern_sched_rr_get_interval_td(struct thread *td, struct thread *targettd,
+ struct timespec *ts);
+int kern_semctl(struct thread *td, int semid, int semnum, int cmd,
+ union semun *arg, register_t *rval);
+int kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
+ fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits);
+int kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags,
+ struct mbuf *control, enum uio_seg segflg);
+int kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups);
+int kern_setitimer(struct thread *, u_int, struct itimerval *,
+ struct itimerval *);
+int kern_setrlimit(struct thread *, u_int, struct rlimit *);
+int kern_setsockopt(struct thread *td, int s, int level, int name,
+ void *optval, enum uio_seg valseg, socklen_t valsize);
+int kern_settimeofday(struct thread *td, struct timeval *tv,
+ struct timezone *tzp);
+int kern_shm_open(struct thread *td, const char *userpath, int flags,
+ mode_t mode, struct filecaps *fcaps);
+int kern_shmat(struct thread *td, int shmid, const void *shmaddr,
+ int shmflg);
+int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf,
+ size_t *bufsz);
+int kern_shutdown(struct thread *td, int s, int how);
+int kern_sigaction(struct thread *td, int sig, const struct sigaction *act,
+ struct sigaction *oact, int flags);
+int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss);
+int kern_sigprocmask(struct thread *td, int how,
+ sigset_t *set, sigset_t *oset, int flags);
+int kern_sigsuspend(struct thread *td, sigset_t mask);
+int kern_sigtimedwait(struct thread *td, sigset_t waitset,
+ struct ksiginfo *ksi, struct timespec *timeout);
+int kern_sigqueue(struct thread *td, pid_t pid, int signum,
+ union sigval *value);
+int kern_socket(struct thread *td, int domain, int type, int protocol);
+int kern_statat(struct thread *td, int flag, int fd, char *path,
+ enum uio_seg pathseg, struct stat *sbp,
+ void (*hook)(struct vnode *vp, struct stat *sbp));
+int kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
+ struct statfs *buf);
+int kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
+ enum uio_seg segflg);
+int kern_ktimer_create(struct thread *td, clockid_t clock_id,
+ struct sigevent *evp, int *timerid, int preset_id);
+int kern_ktimer_delete(struct thread *, int);
+int kern_ktimer_settime(struct thread *td, int timer_id, int flags,
+ struct itimerspec *val, struct itimerspec *oval);
+int kern_ktimer_gettime(struct thread *td, int timer_id,
+ struct itimerspec *val);
+int kern_ktimer_getoverrun(struct thread *td, int timer_id);
+int kern_thr_alloc(struct proc *, int pages, struct thread **);
+int kern_thr_exit(struct thread *td);
+int kern_thr_new(struct thread *td, struct thr_param *param);
+int kern_thr_suspend(struct thread *td, struct timespec *tsp);
+int kern_truncate(struct thread *td, char *path, enum uio_seg pathseg,
+ off_t length);
+int kern_unlinkat(struct thread *td, int fd, char *path,
+ enum uio_seg pathseg, ino_t oldinum);
+int kern_utimesat(struct thread *td, int fd, char *path,
+ enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg);
+int kern_utimensat(struct thread *td, int fd, char *path,
+ enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg,
+ int follow);
+int kern_wait(struct thread *td, pid_t pid, int *status, int options,
+ struct rusage *rup);
+int kern_wait6(struct thread *td, enum idtype idtype, id_t id, int *status,
+ int options, struct __wrusage *wrup, siginfo_t *sip);
+int kern_writev(struct thread *td, int fd, struct uio *auio);
+int kern_socketpair(struct thread *td, int domain, int type, int protocol,
+ int *rsv);
+
+/* flags for kern_sigaction */
+#define KSA_OSIGSET 0x0001 /* uses osigact_t */
+#define KSA_FREEBSD4 0x0002 /* uses ucontext4 */
+
+struct freebsd11_dirent;
+
+int freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int
+ count, long *basep, void (*func)(struct freebsd11_dirent *));
+
+#endif /* !_SYS_SYSCALLSUBR_H_ */
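The kern_*() routines above are the in-kernel bodies of the corresponding system calls; compatibility and emulation layers call them directly instead of re-marshalling userland arguments. A minimal sketch (the wrapper name is hypothetical) that opens, stats, and closes a kernel-space path:

#include <sys/param.h>
#include <sys/fcntl.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>

static int
example_kern_stat(struct thread *td, char *path, struct stat *sbp)
{
	int error, fd;

	error = kern_openat(td, AT_FDCWD, path, UIO_SYSSPACE, O_RDONLY, 0);
	if (error != 0)
		return (error);
	fd = td->td_retval[0];		/* openat returns the fd via td_retval */
	error = kern_fstat(td, fd, sbp);
	(void)kern_close(td, fd);
	return (error);
}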
diff --git a/freebsd/sys/sys/sysent.h b/freebsd/sys/sys/sysent.h
new file mode 100644
index 00000000..d1d9e99b
--- /dev/null
+++ b/freebsd/sys/sys/sysent.h
@@ -0,0 +1,327 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1982, 1988, 1991 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_SYSENT_H_
+#define _SYS_SYSENT_H_
+
+#include <bsm/audit.h>
+
+struct rlimit;
+struct sysent;
+struct thread;
+struct ksiginfo;
+struct syscall_args;
+
+enum systrace_probe_t {
+ SYSTRACE_ENTRY,
+ SYSTRACE_RETURN,
+};
+
+typedef int sy_call_t(struct thread *, void *);
+
+typedef void (*systrace_probe_func_t)(struct syscall_args *,
+ enum systrace_probe_t, int);
+typedef void (*systrace_args_func_t)(int, void *, uint64_t *, int *);
+
+#ifdef _KERNEL
+extern bool systrace_enabled;
+#endif
+extern systrace_probe_func_t systrace_probe_func;
+
+struct sysent { /* system call table */
+ int sy_narg; /* number of arguments */
+ sy_call_t *sy_call; /* implementing function */
+ au_event_t sy_auevent; /* audit event associated with syscall */
+ systrace_args_func_t sy_systrace_args_func;
+ /* optional argument conversion function. */
+ u_int32_t sy_entry; /* DTrace entry ID for systrace. */
+ u_int32_t sy_return; /* DTrace return ID for systrace. */
+ u_int32_t sy_flags; /* General flags for system calls. */
+ u_int32_t sy_thrcnt;
+};
+
+/*
+ * A system call is permitted in capability mode.
+ */
+#define SYF_CAPENABLED 0x00000001
+
+#define SY_THR_FLAGMASK 0x7
+#define SY_THR_STATIC 0x1
+#define SY_THR_DRAINING 0x2
+#define SY_THR_ABSENT 0x4
+#define SY_THR_INCR 0x8
+
+#ifdef KLD_MODULE
+#define SY_THR_STATIC_KLD 0
+#else
+#define SY_THR_STATIC_KLD SY_THR_STATIC
+#endif
+
+struct image_params;
+struct __sigset;
+struct trapframe;
+struct vnode;
+
+struct sysentvec {
+ int sv_size; /* number of entries */
+ struct sysent *sv_table; /* pointer to sysent */
+ u_int sv_mask; /* optional mask to index */
+ int sv_errsize; /* size of errno translation table */
+ const int *sv_errtbl; /* errno translation table */
+ int (*sv_transtrap)(int, int);
+ /* translate trap-to-signal mapping */
+ int (*sv_fixup)(register_t **, struct image_params *);
+ /* stack fixup function */
+ void (*sv_sendsig)(void (*)(int), struct ksiginfo *, struct __sigset *);
+ /* send signal */
+ char *sv_sigcode; /* start of sigtramp code */
+ int *sv_szsigcode; /* size of sigtramp code */
+ char *sv_name; /* name of binary type */
+ int (*sv_coredump)(struct thread *, struct vnode *, off_t, int);
+ /* function to dump core, or NULL */
+ int (*sv_imgact_try)(struct image_params *);
+ void (*sv_stackgap)(struct image_params *, u_long *);
+ int sv_minsigstksz; /* minimum signal stack size */
+ int sv_pagesize; /* spare / no longer used */
+ vm_offset_t sv_minuser; /* VM_MIN_ADDRESS */
+ vm_offset_t sv_maxuser; /* VM_MAXUSER_ADDRESS */
+ vm_offset_t sv_usrstack; /* USRSTACK */
+ vm_offset_t sv_psstrings; /* PS_STRINGS */
+ int sv_stackprot; /* vm protection for stack */
+ register_t *(*sv_copyout_strings)(struct image_params *);
+ void (*sv_setregs)(struct thread *, struct image_params *,
+ u_long);
+ void (*sv_fixlimit)(struct rlimit *, int);
+ u_long *sv_maxssiz;
+ u_int sv_flags;
+ void (*sv_set_syscall_retval)(struct thread *, int);
+ int (*sv_fetch_syscall_args)(struct thread *);
+ const char **sv_syscallnames;
+ vm_offset_t sv_timekeep_base;
+ vm_offset_t sv_shared_page_base;
+ vm_offset_t sv_shared_page_len;
+ vm_offset_t sv_sigcode_base;
+ void *sv_shared_page_obj;
+ void (*sv_schedtail)(struct thread *);
+ void (*sv_thread_detach)(struct thread *);
+ int (*sv_trap)(struct thread *);
+ u_long *sv_hwcap; /* Value passed in AT_HWCAP. */
+ u_long *sv_hwcap2; /* Value passed in AT_HWCAP2. */
+};
+
+#define SV_ILP32 0x000100 /* 32-bit executable. */
+#define SV_LP64 0x000200 /* 64-bit executable. */
+#define SV_IA32 0x004000 /* Intel 32-bit executable. */
+#define SV_AOUT 0x008000 /* a.out executable. */
+#define SV_SHP 0x010000 /* Shared page. */
+#define SV_CAPSICUM 0x020000 /* Force cap_enter() on startup. */
+#define SV_TIMEKEEP 0x040000 /* Shared page timehands. */
+#define SV_ASLR 0x080000 /* ASLR allowed. */
+
+#define SV_ABI_MASK 0xff
+#define SV_ABI_ERRNO(p, e) ((p)->p_sysent->sv_errsize <= 0 ? e : \
+ ((e) >= (p)->p_sysent->sv_errsize ? -1 : (p)->p_sysent->sv_errtbl[e]))
+#define SV_PROC_FLAG(p, x) ((p)->p_sysent->sv_flags & (x))
+#define SV_PROC_ABI(p) ((p)->p_sysent->sv_flags & SV_ABI_MASK)
+#define SV_CURPROC_FLAG(x) SV_PROC_FLAG(curproc, x)
+#define SV_CURPROC_ABI() SV_PROC_ABI(curproc)
+/* same as ELFOSABI_XXX, to prevent header pollution */
+#define SV_ABI_LINUX 3
+#define SV_ABI_FREEBSD 9
+#define SV_ABI_CLOUDABI 17
+#define SV_ABI_UNDEF 255
+
+#ifdef _KERNEL
+extern struct sysentvec aout_sysvec;
+extern struct sysent sysent[];
+extern const char *syscallnames[];
+
+#if defined(__amd64__)
+extern int i386_read_exec;
+#endif
+
+#define NO_SYSCALL (-1)
+
+struct module;
+
+struct syscall_module_data {
+ int (*chainevh)(struct module *, int, void *); /* next handler */
+ void *chainarg; /* arg for next event handler */
+ int *offset; /* offset into sysent */
+ struct sysent *new_sysent; /* new sysent */
+ struct sysent old_sysent; /* old sysent */
+ int flags; /* flags for syscall_register */
+};
+
+/* separate initialization vector so it can be used in a substructure */
+#define SYSENT_INIT_VALS(_syscallname) { \
+ .sy_narg = (sizeof(struct _syscallname ## _args ) \
+ / sizeof(register_t)), \
+ .sy_call = (sy_call_t *)&sys_##_syscallname, \
+ .sy_auevent = SYS_AUE_##_syscallname, \
+ .sy_systrace_args_func = NULL, \
+ .sy_entry = 0, \
+ .sy_return = 0, \
+ .sy_flags = 0, \
+ .sy_thrcnt = 0 \
+}
+
+#define MAKE_SYSENT(syscallname) \
+static struct sysent syscallname##_sysent = SYSENT_INIT_VALS(syscallname);
+
+#define MAKE_SYSENT_COMPAT(syscallname) \
+static struct sysent syscallname##_sysent = { \
+ (sizeof(struct syscallname ## _args ) \
+ / sizeof(register_t)), \
+ (sy_call_t *)& syscallname, \
+ SYS_AUE_##syscallname \
+}
+
+#define SYSCALL_MODULE(name, offset, new_sysent, evh, arg) \
+static struct syscall_module_data name##_syscall_mod = { \
+ evh, arg, offset, new_sysent, { 0, NULL, AUE_NULL } \
+}; \
+ \
+static moduledata_t name##_mod = { \
+ "sys/" #name, \
+ syscall_module_handler, \
+ &name##_syscall_mod \
+}; \
+DECLARE_MODULE(name, name##_mod, SI_SUB_SYSCALLS, SI_ORDER_MIDDLE)
+
+#define SYSCALL_MODULE_HELPER(syscallname) \
+static int syscallname##_syscall = SYS_##syscallname; \
+MAKE_SYSENT(syscallname); \
+SYSCALL_MODULE(syscallname, \
+ & syscallname##_syscall, & syscallname##_sysent, \
+ NULL, NULL)
+
+#define SYSCALL_MODULE_PRESENT(syscallname) \
+ (sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmnosys && \
+ sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmressys)
+
+/*
+ * Syscall registration helpers with resource allocation handling.
+ */
+struct syscall_helper_data {
+ struct sysent new_sysent;
+ struct sysent old_sysent;
+ int syscall_no;
+ int registered;
+};
+#define SYSCALL_INIT_HELPER_F(syscallname, flags) { \
+ .new_sysent = { \
+ .sy_narg = (sizeof(struct syscallname ## _args ) \
+ / sizeof(register_t)), \
+ .sy_call = (sy_call_t *)& sys_ ## syscallname, \
+ .sy_auevent = SYS_AUE_##syscallname, \
+ .sy_flags = (flags) \
+ }, \
+ .syscall_no = SYS_##syscallname \
+}
+#define SYSCALL_INIT_HELPER_COMPAT_F(syscallname, flags) { \
+ .new_sysent = { \
+ .sy_narg = (sizeof(struct syscallname ## _args ) \
+ / sizeof(register_t)), \
+ .sy_call = (sy_call_t *)& syscallname, \
+ .sy_auevent = SYS_AUE_##syscallname, \
+ .sy_flags = (flags) \
+ }, \
+ .syscall_no = SYS_##syscallname \
+}
+#define SYSCALL_INIT_HELPER(syscallname) \
+ SYSCALL_INIT_HELPER_F(syscallname, 0)
+#define SYSCALL_INIT_HELPER_COMPAT(syscallname) \
+ SYSCALL_INIT_HELPER_COMPAT_F(syscallname, 0)
+#define SYSCALL_INIT_LAST { \
+ .syscall_no = NO_SYSCALL \
+}
+
+int syscall_module_handler(struct module *mod, int what, void *arg);
+int syscall_helper_register(struct syscall_helper_data *sd, int flags);
+int syscall_helper_unregister(struct syscall_helper_data *sd);
+/* Implementation, exposed for COMPAT code */
+int kern_syscall_register(struct sysent *sysents, int *offset,
+ struct sysent *new_sysent, struct sysent *old_sysent, int flags);
+int kern_syscall_deregister(struct sysent *sysents, int offset,
+ const struct sysent *old_sysent);
+int kern_syscall_module_handler(struct sysent *sysents,
+ struct module *mod, int what, void *arg);
+int kern_syscall_helper_register(struct sysent *sysents,
+ struct syscall_helper_data *sd, int flags);
+int kern_syscall_helper_unregister(struct sysent *sysents,
+ struct syscall_helper_data *sd);
+
+struct proc;
+const char *syscallname(struct proc *p, u_int code);
+
+/* Special purpose system call functions. */
+struct nosys_args;
+
+int lkmnosys(struct thread *, struct nosys_args *);
+int lkmressys(struct thread *, struct nosys_args *);
+
+int _syscall_thread_enter(struct thread *td, struct sysent *se);
+void _syscall_thread_exit(struct thread *td, struct sysent *se);
+
+static inline int
+syscall_thread_enter(struct thread *td, struct sysent *se)
+{
+
+ if (__predict_true((se->sy_thrcnt & SY_THR_STATIC) != 0))
+ return (0);
+ return (_syscall_thread_enter(td, se));
+}
+
+static inline void
+syscall_thread_exit(struct thread *td, struct sysent *se)
+{
+
+ if (__predict_true((se->sy_thrcnt & SY_THR_STATIC) != 0))
+ return;
+ _syscall_thread_exit(td, se);
+}
+
+int shared_page_alloc(int size, int align);
+int shared_page_fill(int size, int align, const void *data);
+void shared_page_write(int base, int size, const void *data);
+void exec_sysvec_init(void *param);
+void exec_inittk(void);
+
+#define INIT_SYSENTVEC(name, sv) \
+ SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, \
+ (sysinit_cfunc_t)exec_sysvec_init, sv);
+
+#endif /* _KERNEL */
+
+#endif /* !_SYS_SYSENT_H_ */
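The sysent/SYSCALL_MODULE machinery above is what loadable modules use to splice a system call into the table at load time. A minimal sketch in the spirit of the stock FreeBSD example syscall module; the handler, module name, and offset variable are all illustrative:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/sysent.h>
#include <bsm/audit_kevents.h>	/* AUE_NULL, referenced by SYSCALL_MODULE() */

/* The syscall body: sy_call_t takes the thread and an argument block. */
static int
example_syscall(struct thread *td, void *arg)
{

	printf("example syscall from pid %d\n", td->td_proc->p_pid);
	return (0);
}

/* NO_SYSCALL asks the registration code to pick the first free slot. */
static int example_offset = NO_SYSCALL;

static struct sysent example_sysent = {
	.sy_narg = 0,
	.sy_call = example_syscall,
};

SYSCALL_MODULE(example, &example_offset, &example_sysent, NULL, NULL);

After the module loads, the assigned slot can be read back from example_offset and invoked with syscall(2).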
diff --git a/freebsd/sys/sys/vmem.h b/freebsd/sys/sys/vmem.h
new file mode 100644
index 00000000..e74d1e3f
--- /dev/null
+++ b/freebsd/sys/sys/vmem.h
@@ -0,0 +1,145 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c)2006 YAMAMOTO Takashi,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/* From $NetBSD: vmem.h,v 1.20 2013/01/29 21:26:24 para Exp $ */
+
+/* $FreeBSD$ */
+
+#ifndef _SYS_VMEM_H_
+#define _SYS_VMEM_H_
+
+#include <sys/types.h>
+
+#ifdef _KERNEL
+
+typedef struct vmem vmem_t;
+
+typedef uintptr_t vmem_addr_t;
+typedef size_t vmem_size_t;
+
+#define VMEM_ADDR_MIN 0
+#define VMEM_ADDR_QCACHE_MIN 1
+#define VMEM_ADDR_MAX (~(vmem_addr_t)0)
+
+typedef int (vmem_import_t)(void *, vmem_size_t, int, vmem_addr_t *);
+typedef void (vmem_release_t)(void *, vmem_addr_t, vmem_size_t);
+typedef void (vmem_reclaim_t)(vmem_t *, int);
+
+/*
+ * Create a vmem:
+ * name - Name of the region
+ * base - Initial span start (optional)
+ * size - Initial span size
+ * quantum - Natural unit of allocation (ie PAGE_SIZE, 1, etc)
+ * qcache_max - Maximum size to quantum cache. This creates a UMA
+ * cache for each multiple of quantum up to qcache_max.
+ * flags - M_* flags
+ */
+vmem_t *vmem_create(const char *name, vmem_addr_t base,
+ vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags);
+vmem_t *vmem_init(vmem_t *vm, const char *name, vmem_addr_t base,
+ vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags);
+void vmem_destroy(vmem_t *);
+
+/*
+ * Set callbacks for bringing in dynamic regions:
+ * importfn - Backing store import routine.
+ * releasefn - Backing store release routine.
+ * arg - Backing store argument
+ * import_quantum - Size to import from backing store
+ */
+
+void vmem_set_import(vmem_t *vm, vmem_import_t *importfn,
+ vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum);
+
+/*
+ * Set a limit on the total size of a vmem.
+ */
+
+void vmem_set_limit(vmem_t *vm, vmem_size_t limit);
+
+/*
+ * Set a callback for reclaiming memory when space is exhausted:
+ */
+void vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn);
+
+/*
+ * Allocate and free linear regions from a vmem. Must specify
+ * BESTFIT or FIRSTFIT. Free is non-blocking. These routines
+ * respect the quantum caches.
+ */
+int vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp);
+void vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size);
+
+/*
+ * Constrained allocate and free routines. These bypass the quantum cache.
+ * size - Size in units of 1, not quantum.
+ * align - Required alignment of the start of region
+ * phase - Offset from alignment
+ * nocross - Illegal boundary
+ * minaddr - Minimum allowed address for last byte
+ * maxaddr - Maximum allowed address for first byte
+ * flags - M_* flags
+ * addrp - result
+ */
+int vmem_xalloc(vmem_t *vm, vmem_size_t size, vmem_size_t align,
+ vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr,
+ vmem_addr_t maxaddr, int flags, vmem_addr_t *addrp);
+void vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size);
+
+/*
+ * Add a static region to a vmem after create. This won't be freed
+ * until the vmem is destroyed.
+ */
+int vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags);
+
+/*
+ * Round the given size up to the vmem's native quantum size.
+ */
+vmem_size_t vmem_roundup_size(vmem_t *vm, vmem_size_t size);
+
+/*
+ * Report vmem utilization according to the requested type.
+ */
+vmem_size_t vmem_size(vmem_t *vm, int typemask);
+
+void vmem_whatis(vmem_addr_t addr, int (*fn)(const char *, ...)
+ __printflike(1, 2));
+void vmem_print(vmem_addr_t addr, const char *, int (*fn)(const char *, ...)
+ __printflike(1, 2));
+void vmem_printall(const char *, int (*fn)(const char *, ...)
+ __printflike(1, 2));
+void vmem_startup(void);
+
+/* vmem_size typemask */
+#define VMEM_ALLOC 0x01
+#define VMEM_FREE 0x02
+#define VMEM_MAXFREE 0x10
+
+#endif /* _KERNEL */
+
+#endif /* !_SYS_VMEM_H_ */
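The comments above describe the arena API parameter by parameter; a minimal sketch of creating an arena over an arbitrary address window and carving an allocation out of it (the window, sizes, and names are illustrative, not part of the import):

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/vmem.h>

static void
example_vmem_usage(void)
{
	vmem_t *arena;
	vmem_addr_t addr;
	int error;

	/* One initial 1 MiB span at 0x1000, byte-granular, no quantum cache. */
	arena = vmem_create("example", 0x1000, 1024 * 1024, 1, 0, M_WAITOK);

	/* Carve out 4 KiB; a fit policy (here M_BESTFIT) must be specified. */
	error = vmem_alloc(arena, 4096, M_BESTFIT | M_WAITOK, &addr);
	if (error == 0)
		vmem_free(arena, addr, 4096);

	vmem_destroy(arena);
}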
diff --git a/freebsd/sys/vm/vm_meter.c b/freebsd/sys/vm/vm_meter.c
new file mode 100644
index 00000000..dfd50081
--- /dev/null
+++ b/freebsd/sys/vm/vm_meter.c
@@ -0,0 +1,561 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resource.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+#include <sys/vmmeter.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_pagequeue.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <sys/sysctl.h>
+
+struct vmmeter __read_mostly vm_cnt = {
+ .v_swtch = EARLY_COUNTER,
+ .v_trap = EARLY_COUNTER,
+ .v_syscall = EARLY_COUNTER,
+ .v_intr = EARLY_COUNTER,
+ .v_soft = EARLY_COUNTER,
+ .v_vm_faults = EARLY_COUNTER,
+ .v_io_faults = EARLY_COUNTER,
+ .v_cow_faults = EARLY_COUNTER,
+ .v_cow_optim = EARLY_COUNTER,
+ .v_zfod = EARLY_COUNTER,
+ .v_ozfod = EARLY_COUNTER,
+ .v_swapin = EARLY_COUNTER,
+ .v_swapout = EARLY_COUNTER,
+ .v_swappgsin = EARLY_COUNTER,
+ .v_swappgsout = EARLY_COUNTER,
+ .v_vnodein = EARLY_COUNTER,
+ .v_vnodeout = EARLY_COUNTER,
+ .v_vnodepgsin = EARLY_COUNTER,
+ .v_vnodepgsout = EARLY_COUNTER,
+ .v_intrans = EARLY_COUNTER,
+ .v_reactivated = EARLY_COUNTER,
+ .v_pdwakeups = EARLY_COUNTER,
+ .v_pdpages = EARLY_COUNTER,
+ .v_pdshortfalls = EARLY_COUNTER,
+ .v_dfree = EARLY_COUNTER,
+ .v_pfree = EARLY_COUNTER,
+ .v_tfree = EARLY_COUNTER,
+ .v_forks = EARLY_COUNTER,
+ .v_vforks = EARLY_COUNTER,
+ .v_rforks = EARLY_COUNTER,
+ .v_kthreads = EARLY_COUNTER,
+ .v_forkpages = EARLY_COUNTER,
+ .v_vforkpages = EARLY_COUNTER,
+ .v_rforkpages = EARLY_COUNTER,
+ .v_kthreadpages = EARLY_COUNTER,
+ .v_wire_count = EARLY_COUNTER,
+};
+
+static void
+vmcounter_startup(void)
+{
+ counter_u64_t *cnt = (counter_u64_t *)&vm_cnt;
+
+ COUNTER_ARRAY_ALLOC(cnt, VM_METER_NCOUNTERS, M_WAITOK);
+}
+SYSINIT(counter, SI_SUB_KMEM, SI_ORDER_FIRST, vmcounter_startup, NULL);
+
+SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min,
+ CTLFLAG_RW, &vm_cnt.v_free_min, 0, "Minimum low-free-pages threshold");
+SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target,
+ CTLFLAG_RW, &vm_cnt.v_free_target, 0, "Desired free pages");
+SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved,
+ CTLFLAG_RW, &vm_cnt.v_free_reserved, 0, "Pages reserved for deadlock");
+SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target,
+ CTLFLAG_RW, &vm_cnt.v_inactive_target, 0, "Pages desired inactive");
+SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min,
+ CTLFLAG_RW, &vm_cnt.v_pageout_free_min, 0, "Min pages reserved for kernel");
+SYSCTL_UINT(_vm, OID_AUTO, v_free_severe,
+ CTLFLAG_RW, &vm_cnt.v_free_severe, 0, "Severe page depletion point");
+
+static int
+sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS)
+{
+
+#ifdef SCTL_MASK32
+ u_int32_t la[4];
+
+ if (req->flags & SCTL_MASK32) {
+ la[0] = averunnable.ldavg[0];
+ la[1] = averunnable.ldavg[1];
+ la[2] = averunnable.ldavg[2];
+ la[3] = averunnable.fscale;
+ return SYSCTL_OUT(req, la, sizeof(la));
+ } else
+#endif
+ return SYSCTL_OUT(req, &averunnable, sizeof(averunnable));
+}
+SYSCTL_PROC(_vm, VM_LOADAVG, loadavg, CTLTYPE_STRUCT | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_loadavg, "S,loadavg",
+ "Machine loadaverage history");
+
+/*
+ * This function aims to determine if the object is mapped,
+ * specifically, if it is referenced by a vm_map_entry. Because
+ * objects occasionally acquire transient references that do not
+ * represent a mapping, the method used here is inexact. However, it
+ * has very low overhead and is good enough for the advisory
+ * vm.vmtotal sysctl.
+ */
+static bool
+is_object_active(vm_object_t obj)
+{
+
+ return (obj->ref_count > obj->shadow_count);
+}
+
+#if defined(COMPAT_FREEBSD11)
+struct vmtotal11 {
+ int16_t t_rq;
+ int16_t t_dw;
+ int16_t t_pw;
+ int16_t t_sl;
+ int16_t t_sw;
+ int32_t t_vm;
+ int32_t t_avm;
+ int32_t t_rm;
+ int32_t t_arm;
+ int32_t t_vmshr;
+ int32_t t_avmshr;
+ int32_t t_rmshr;
+ int32_t t_armshr;
+ int32_t t_free;
+};
+#endif
+
+static int
+vmtotal(SYSCTL_HANDLER_ARGS)
+{
+ struct vmtotal total;
+#if defined(COMPAT_FREEBSD11)
+ struct vmtotal11 total11;
+#endif
+ vm_object_t object;
+ struct proc *p;
+ struct thread *td;
+
+ if (req->oldptr == NULL) {
+#if defined(COMPAT_FREEBSD11)
+ if (curproc->p_osrel < P_OSREL_VMTOTAL64)
+ return (SYSCTL_OUT(req, NULL, sizeof(total11)));
+#endif
+ return (SYSCTL_OUT(req, NULL, sizeof(total)));
+ }
+ bzero(&total, sizeof(total));
+
+ /*
+ * Calculate process statistics.
+ */
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if ((p->p_flag & P_SYSTEM) != 0)
+ continue;
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NEW) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ switch (td->td_state) {
+ case TDS_INHIBITED:
+ if (TD_IS_SWAPPED(td))
+ total.t_sw++;
+ else if (TD_IS_SLEEPING(td)) {
+ if (td->td_priority <= PZERO)
+ total.t_dw++;
+ else
+ total.t_sl++;
+ }
+ break;
+ case TDS_CAN_RUN:
+ total.t_sw++;
+ break;
+ case TDS_RUNQ:
+ case TDS_RUNNING:
+ total.t_rq++;
+ break;
+ default:
+ break;
+ }
+ thread_unlock(td);
+ }
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ /*
+ * Calculate object memory usage statistics.
+ */
+ mtx_lock(&vm_object_list_mtx);
+ TAILQ_FOREACH(object, &vm_object_list, object_list) {
+ /*
+ * Perform unsynchronized reads on the object. In
+ * this case, the lack of synchronization should not
+ * impair the accuracy of the reported statistics.
+ */
+ if ((object->flags & OBJ_FICTITIOUS) != 0) {
+ /*
+ * Devices, like /dev/mem, will badly skew our totals.
+ */
+ continue;
+ }
+ if (object->ref_count == 0) {
+ /*
+ * Also skip unreferenced objects, including
+ * vnodes representing mounted file systems.
+ */
+ continue;
+ }
+ if (object->ref_count == 1 &&
+ (object->flags & OBJ_NOSPLIT) != 0) {
+ /*
+ * Also skip otherwise unreferenced swap
+ * objects backing tmpfs vnodes, and POSIX or
+ * SysV shared memory.
+ */
+ continue;
+ }
+ total.t_vm += object->size;
+ total.t_rm += object->resident_page_count;
+ if (is_object_active(object)) {
+ total.t_avm += object->size;
+ total.t_arm += object->resident_page_count;
+ }
+ if (object->shadow_count > 1) {
+ /* shared object */
+ total.t_vmshr += object->size;
+ total.t_rmshr += object->resident_page_count;
+ if (is_object_active(object)) {
+ total.t_avmshr += object->size;
+ total.t_armshr += object->resident_page_count;
+ }
+ }
+ }
+ mtx_unlock(&vm_object_list_mtx);
+ total.t_pw = vm_wait_count();
+ total.t_free = vm_free_count();
+#if defined(COMPAT_FREEBSD11)
+ /* sysctl(8) allocates twice as much memory as reported by sysctl(3) */
+ if (curproc->p_osrel < P_OSREL_VMTOTAL64 && (req->oldlen ==
+ sizeof(total11) || req->oldlen == 2 * sizeof(total11))) {
+ bzero(&total11, sizeof(total11));
+ total11.t_rq = total.t_rq;
+ total11.t_dw = total.t_dw;
+ total11.t_pw = total.t_pw;
+ total11.t_sl = total.t_sl;
+ total11.t_sw = total.t_sw;
+ total11.t_vm = total.t_vm; /* truncate */
+ total11.t_avm = total.t_avm; /* truncate */
+ total11.t_rm = total.t_rm; /* truncate */
+ total11.t_arm = total.t_arm; /* truncate */
+ total11.t_vmshr = total.t_vmshr; /* truncate */
+ total11.t_avmshr = total.t_avmshr; /* truncate */
+ total11.t_rmshr = total.t_rmshr; /* truncate */
+ total11.t_armshr = total.t_armshr; /* truncate */
+ total11.t_free = total.t_free; /* truncate */
+ return (SYSCTL_OUT(req, &total11, sizeof(total11)));
+ }
+#endif
+ return (SYSCTL_OUT(req, &total, sizeof(total)));
+}
+
+SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, NULL, 0, vmtotal, "S,vmtotal",
+ "System virtual memory statistics");
+SYSCTL_NODE(_vm, OID_AUTO, stats, CTLFLAG_RW, 0, "VM meter stats");
+static SYSCTL_NODE(_vm_stats, OID_AUTO, sys, CTLFLAG_RW, 0,
+ "VM meter sys stats");
+static SYSCTL_NODE(_vm_stats, OID_AUTO, vm, CTLFLAG_RW, 0,
+ "VM meter vm stats");
+SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats");
+
+static int
+sysctl_handle_vmstat(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+#ifdef COMPAT_FREEBSD11
+ uint32_t val32;
+#endif
+
+ val = counter_u64_fetch(*(counter_u64_t *)arg1);
+#ifdef COMPAT_FREEBSD11
+ if (req->oldlen == sizeof(val32)) {
+ val32 = val; /* truncate */
+ return (SYSCTL_OUT(req, &val32, sizeof(val32)));
+ }
+#endif
+ return (SYSCTL_OUT(req, &val, sizeof(val)));
+}
+
+#define VM_STATS(parent, var, descr) \
+ SYSCTL_OID(parent, OID_AUTO, var, CTLTYPE_U64 | CTLFLAG_MPSAFE | \
+ CTLFLAG_RD, &vm_cnt.var, 0, sysctl_handle_vmstat, "QU", descr)
+#define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr)
+#define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr)
+
+VM_STATS_SYS(v_swtch, "Context switches");
+VM_STATS_SYS(v_trap, "Traps");
+VM_STATS_SYS(v_syscall, "System calls");
+VM_STATS_SYS(v_intr, "Device interrupts");
+VM_STATS_SYS(v_soft, "Software interrupts");
+VM_STATS_VM(v_vm_faults, "Address memory faults");
+VM_STATS_VM(v_io_faults, "Page faults requiring I/O");
+VM_STATS_VM(v_cow_faults, "Copy-on-write faults");
+VM_STATS_VM(v_cow_optim, "Optimized COW faults");
+VM_STATS_VM(v_zfod, "Pages zero-filled on demand");
+VM_STATS_VM(v_ozfod, "Optimized zero fill pages");
+VM_STATS_VM(v_swapin, "Swap pager pageins");
+VM_STATS_VM(v_swapout, "Swap pager pageouts");
+VM_STATS_VM(v_swappgsin, "Swap pages swapped in");
+VM_STATS_VM(v_swappgsout, "Swap pages swapped out");
+VM_STATS_VM(v_vnodein, "Vnode pager pageins");
+VM_STATS_VM(v_vnodeout, "Vnode pager pageouts");
+VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in");
+VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out");
+VM_STATS_VM(v_intrans, "In transit page faults");
+VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon");
+VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups");
+VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls");
+VM_STATS_VM(v_dfree, "Pages freed by pagedaemon");
+VM_STATS_VM(v_pfree, "Pages freed by exiting processes");
+VM_STATS_VM(v_tfree, "Total pages freed");
+VM_STATS_VM(v_forks, "Number of fork() calls");
+VM_STATS_VM(v_vforks, "Number of vfork() calls");
+VM_STATS_VM(v_rforks, "Number of rfork() calls");
+VM_STATS_VM(v_kthreads, "Number of fork() calls by kernel");
+VM_STATS_VM(v_forkpages, "VM pages affected by fork()");
+VM_STATS_VM(v_vforkpages, "VM pages affected by vfork()");
+VM_STATS_VM(v_rforkpages, "VM pages affected by rfork()");
+VM_STATS_VM(v_kthreadpages, "VM pages affected by fork() by kernel");
+
+static int
+sysctl_handle_vmstat_proc(SYSCTL_HANDLER_ARGS)
+{
+ u_int (*fn)(void);
+ uint32_t val;
+
+ fn = arg1;
+ val = fn();
+ return (SYSCTL_OUT(req, &val, sizeof(val)));
+}
+
+#define VM_STATS_PROC(var, descr, fn) \
+ SYSCTL_OID(_vm_stats_vm, OID_AUTO, var, CTLTYPE_U32 | CTLFLAG_MPSAFE | \
+ CTLFLAG_RD, fn, 0, sysctl_handle_vmstat_proc, "IU", descr)
+
+#define VM_STATS_UINT(var, descr) \
+ SYSCTL_UINT(_vm_stats_vm, OID_AUTO, var, CTLFLAG_RD, &vm_cnt.var, 0, descr)
+
+VM_STATS_UINT(v_page_size, "Page size in bytes");
+VM_STATS_UINT(v_page_count, "Total number of pages in system");
+VM_STATS_UINT(v_free_reserved, "Pages reserved for deadlock");
+VM_STATS_UINT(v_free_target, "Pages desired free");
+VM_STATS_UINT(v_free_min, "Minimum low-free-pages threshold");
+VM_STATS_PROC(v_free_count, "Free pages", vm_free_count);
+VM_STATS_PROC(v_wire_count, "Wired pages", vm_wire_count);
+VM_STATS_PROC(v_active_count, "Active pages", vm_active_count);
+VM_STATS_UINT(v_inactive_target, "Desired inactive pages");
+VM_STATS_PROC(v_inactive_count, "Inactive pages", vm_inactive_count);
+VM_STATS_PROC(v_laundry_count, "Pages eligible for laundering",
+ vm_laundry_count);
+VM_STATS_UINT(v_pageout_free_min, "Min pages reserved for kernel");
+VM_STATS_UINT(v_interrupt_free_min, "Reserved pages for interrupt code");
+VM_STATS_UINT(v_free_severe, "Severe page depletion point");
+
+#ifdef COMPAT_FREEBSD11
+/*
+ * Provide compatibility sysctls for the benefit of old utilities which exit
+ * with an error if they cannot be found.
+ */
+SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cache_count, CTLFLAG_RD,
+ SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility");
+SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_tcached, CTLFLAG_RD,
+ SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility");
+#endif
+
+u_int
+vm_free_count(void)
+{
+ u_int v;
+ int i;
+
+ v = 0;
+ for (i = 0; i < vm_ndomains; i++)
+ v += vm_dom[i].vmd_free_count;
+
+ return (v);
+}
+
+static u_int
+vm_pagequeue_count(int pq)
+{
+ u_int v;
+ int i;
+
+ v = 0;
+ for (i = 0; i < vm_ndomains; i++)
+ v += vm_dom[i].vmd_pagequeues[pq].pq_cnt;
+
+ return (v);
+}
+
+u_int
+vm_active_count(void)
+{
+
+ return (vm_pagequeue_count(PQ_ACTIVE));
+}
+
+u_int
+vm_inactive_count(void)
+{
+
+ return (vm_pagequeue_count(PQ_INACTIVE));
+}
+
+u_int
+vm_laundry_count(void)
+{
+
+ return (vm_pagequeue_count(PQ_LAUNDRY));
+}
+
+static int
+sysctl_vm_pdpages(SYSCTL_HANDLER_ARGS)
+{
+ struct vm_pagequeue *pq;
+ uint64_t ret;
+ int dom, i;
+
+ ret = counter_u64_fetch(vm_cnt.v_pdpages);
+ for (dom = 0; dom < vm_ndomains; dom++)
+ for (i = 0; i < PQ_COUNT; i++) {
+ pq = &VM_DOMAIN(dom)->vmd_pagequeues[i];
+ ret += pq->pq_pdpages;
+ }
+ return (SYSCTL_OUT(req, &ret, sizeof(ret)));
+}
+SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdpages,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_vm_pdpages, "QU",
+ "Pages analyzed by pagedaemon");
+
+static void
+vm_domain_stats_init(struct vm_domain *vmd, struct sysctl_oid *parent)
+{
+ struct sysctl_oid *oid;
+
+ vmd->vmd_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
+ vmd->vmd_name, CTLFLAG_RD, NULL, "");
+ oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
+ "stats", CTLFLAG_RD, NULL, "");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "free_count", CTLFLAG_RD, &vmd->vmd_free_count, 0,
+ "Free pages");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "active", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_cnt, 0,
+ "Active pages");
+ SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "actpdpgs", CTLFLAG_RD,
+ &vmd->vmd_pagequeues[PQ_ACTIVE].pq_pdpages, 0,
+ "Active pages scanned by the page daemon");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "inactive", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt, 0,
+ "Inactive pages");
+ SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "inactpdpgs", CTLFLAG_RD,
+ &vmd->vmd_pagequeues[PQ_INACTIVE].pq_pdpages, 0,
+ "Inactive pages scanned by the page daemon");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "laundry", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt, 0,
+ "laundry pages");
+ SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "laundpdpgs", CTLFLAG_RD,
+ &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_pdpages, 0,
+ "Laundry pages scanned by the page daemon");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "unswappable",
+ CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt, 0,
+ "Unswappable pages");
+ SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "unswppdpgs", CTLFLAG_RD,
+ &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_pdpages, 0,
+ "Unswappable pages scanned by the page daemon");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "inactive_target", CTLFLAG_RD, &vmd->vmd_inactive_target, 0,
+ "Target inactive pages");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "free_target", CTLFLAG_RD, &vmd->vmd_free_target, 0,
+ "Target free pages");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "free_reserved", CTLFLAG_RD, &vmd->vmd_free_reserved, 0,
+ "Reserved free pages");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "free_min", CTLFLAG_RD, &vmd->vmd_free_min, 0,
+ "Minimum free pages");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+ "free_severe", CTLFLAG_RD, &vmd->vmd_free_severe, 0,
+ "Severe free pages");
+
+}
+
+static void
+vm_stats_init(void *arg __unused)
+{
+ struct sysctl_oid *oid;
+ int i;
+
+ oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm), OID_AUTO,
+ "domain", CTLFLAG_RD, NULL, "");
+ for (i = 0; i < vm_ndomains; i++)
+ vm_domain_stats_init(VM_DOMAIN(i), oid);
+}
+
+SYSINIT(vmstats_init, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_stats_init, NULL);
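Everything registered through VM_STATS_VM()/VM_STATS_SYS() above surfaces under the vm.stats sysctl tree as 64-bit counters. A minimal userland sketch (not part of the import) of reading one of them with sysctlbyname(3); the chosen OID is just an example:

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	uint64_t faults;
	size_t len = sizeof(faults);

	/* Exported by VM_STATS_VM(v_vm_faults, ...) in vm_meter.c. */
	if (sysctlbyname("vm.stats.vm.v_vm_faults", &faults, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (EXIT_FAILURE);
	}
	printf("vm faults since boot: %ju\n", (uintmax_t)faults);
	return (EXIT_SUCCESS);
}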