From 1739d74f7dc53232fe20ed3ea9d8b4b0730b4025 Mon Sep 17 00:00:00 2001 From: Chris Johns Date: Thu, 22 Jul 2021 11:50:13 +1000 Subject: freebsd/sys: Import VFS support Update #4475 --- freebsd/sys/fs/deadfs/dead_vnops.c | 159 + freebsd/sys/fs/pseudofs/pseudofs.c | 491 +++ freebsd/sys/fs/pseudofs/pseudofs.h | 312 ++ freebsd/sys/fs/pseudofs/pseudofs_fileno.c | 159 + freebsd/sys/fs/pseudofs/pseudofs_internal.h | 213 + freebsd/sys/fs/pseudofs/pseudofs_vncache.c | 333 ++ freebsd/sys/fs/pseudofs/pseudofs_vnops.c | 1060 +++++ freebsd/sys/kern/kern_descrip.c | 4283 ++++++++++++++++++++ freebsd/sys/kern/kern_lock.c | 1719 ++++++++ freebsd/sys/kern/subr_pctrie.c | 695 ++++ freebsd/sys/kern/vfs_acl.c | 600 +++ freebsd/sys/kern/vfs_aio.c | 2987 ++++++++++++++ freebsd/sys/kern/vfs_bio.c | 5474 +++++++++++++++++++++++++ freebsd/sys/kern/vfs_cache.c | 2604 ++++++++++++ freebsd/sys/kern/vfs_cluster.c | 1086 +++++ freebsd/sys/kern/vfs_default.c | 1286 ++++++ freebsd/sys/kern/vfs_export.c | 528 +++ freebsd/sys/kern/vfs_extattr.c | 757 ++++ freebsd/sys/kern/vfs_hash.c | 234 ++ freebsd/sys/kern/vfs_init.c | 376 ++ freebsd/sys/kern/vfs_lookup.c | 1450 +++++++ freebsd/sys/kern/vfs_mount.c | 2052 ++++++++++ freebsd/sys/kern/vfs_subr.c | 5719 +++++++++++++++++++++++++++ freebsd/sys/kern/vfs_syscalls.c | 4748 ++++++++++++++++++++++ freebsd/sys/kern/vfs_vnops.c | 2607 ++++++++++++ freebsd/sys/sys/bio.h | 184 + freebsd/sys/sys/namei.h | 226 ++ freebsd/sys/sys/pctrie.h | 152 + freebsd/sys/sys/syscallsubr.h | 317 ++ freebsd/sys/sys/sysent.h | 327 ++ freebsd/sys/sys/vmem.h | 145 + freebsd/sys/vm/vm_meter.c | 561 +++ 32 files changed, 43844 insertions(+) create mode 100644 freebsd/sys/fs/deadfs/dead_vnops.c create mode 100644 freebsd/sys/fs/pseudofs/pseudofs.c create mode 100644 freebsd/sys/fs/pseudofs/pseudofs.h create mode 100644 freebsd/sys/fs/pseudofs/pseudofs_fileno.c create mode 100644 freebsd/sys/fs/pseudofs/pseudofs_internal.h create mode 100644 freebsd/sys/fs/pseudofs/pseudofs_vncache.c create mode 100644 freebsd/sys/fs/pseudofs/pseudofs_vnops.c create mode 100644 freebsd/sys/kern/kern_descrip.c create mode 100644 freebsd/sys/kern/kern_lock.c create mode 100644 freebsd/sys/kern/subr_pctrie.c create mode 100644 freebsd/sys/kern/vfs_acl.c create mode 100644 freebsd/sys/kern/vfs_aio.c create mode 100644 freebsd/sys/kern/vfs_bio.c create mode 100644 freebsd/sys/kern/vfs_cache.c create mode 100644 freebsd/sys/kern/vfs_cluster.c create mode 100644 freebsd/sys/kern/vfs_default.c create mode 100644 freebsd/sys/kern/vfs_export.c create mode 100644 freebsd/sys/kern/vfs_extattr.c create mode 100644 freebsd/sys/kern/vfs_hash.c create mode 100644 freebsd/sys/kern/vfs_init.c create mode 100644 freebsd/sys/kern/vfs_lookup.c create mode 100644 freebsd/sys/kern/vfs_mount.c create mode 100644 freebsd/sys/kern/vfs_subr.c create mode 100644 freebsd/sys/kern/vfs_syscalls.c create mode 100644 freebsd/sys/kern/vfs_vnops.c create mode 100644 freebsd/sys/sys/bio.h create mode 100644 freebsd/sys/sys/namei.h create mode 100644 freebsd/sys/sys/pctrie.h create mode 100644 freebsd/sys/sys/syscallsubr.h create mode 100644 freebsd/sys/sys/sysent.h create mode 100644 freebsd/sys/sys/vmem.h create mode 100644 freebsd/sys/vm/vm_meter.c diff --git a/freebsd/sys/fs/deadfs/dead_vnops.c b/freebsd/sys/fs/deadfs/dead_vnops.c new file mode 100644 index 00000000..a3153aed --- /dev/null +++ b/freebsd/sys/fs/deadfs/dead_vnops.c @@ -0,0 +1,159 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993 + * The Regents of the 
University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dead_vnops.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Prototypes for dead operations on vnodes. + */ +static vop_lookup_t dead_lookup; +static vop_open_t dead_open; +static vop_getwritemount_t dead_getwritemount; +static vop_rename_t dead_rename; +static vop_unset_text_t dead_unset_text; + +struct vop_vector dead_vnodeops = { + .vop_default = &default_vnodeops, + + .vop_access = VOP_EBADF, + .vop_advlock = VOP_EBADF, + .vop_bmap = VOP_EBADF, + .vop_create = VOP_PANIC, + .vop_getattr = VOP_EBADF, + .vop_getwritemount = dead_getwritemount, + .vop_inactive = VOP_NULL, + .vop_ioctl = VOP_EBADF, + .vop_link = VOP_PANIC, + .vop_lookup = dead_lookup, + .vop_mkdir = VOP_PANIC, + .vop_mknod = VOP_PANIC, + .vop_open = dead_open, + .vop_pathconf = VOP_EBADF, /* per pathconf(2) */ + .vop_poll = dead_poll, + .vop_read = dead_read, + .vop_readdir = VOP_EBADF, + .vop_readlink = VOP_EBADF, + .vop_reclaim = VOP_NULL, + .vop_remove = VOP_PANIC, + .vop_rename = dead_rename, + .vop_rmdir = VOP_PANIC, + .vop_setattr = VOP_EBADF, + .vop_symlink = VOP_PANIC, + .vop_vptocnp = VOP_EBADF, + .vop_unset_text = dead_unset_text, + .vop_write = dead_write, +}; + +static int +dead_getwritemount(struct vop_getwritemount_args *ap) +{ + + *(ap->a_mpp) = NULL; + return (0); +} + +/* + * Trivial lookup routine that always fails. + */ +static int +dead_lookup(struct vop_lookup_args *ap) +{ + + *ap->a_vpp = NULL; + return (ENOTDIR); +} + +/* + * Open always fails as if device did not exist. 
+ */ +static int +dead_open(struct vop_open_args *ap) +{ + + return (ENXIO); +} + +int +dead_read(struct vop_read_args *ap) +{ + + /* + * Return EOF for tty devices, EIO for others + */ + if ((ap->a_vp->v_vflag & VV_ISTTY) == 0) + return (EIO); + return (0); +} + +int +dead_write(struct vop_write_args *ap) +{ + + return (EIO); +} + +int +dead_poll(struct vop_poll_args *ap) +{ + + if (ap->a_events & ~POLLSTANDARD) + return (POLLNVAL); + + /* + * Let the user find out that the descriptor is gone. + */ + return (POLLHUP | ((POLLIN | POLLRDNORM) & ap->a_events)); + +} + +static int +dead_rename(struct vop_rename_args *ap) +{ + + vop_rename_fail(ap); + return (EXDEV); +} + +static int +dead_unset_text(struct vop_unset_text_args *ap) +{ + + return (0); +} diff --git a/freebsd/sys/fs/pseudofs/pseudofs.c b/freebsd/sys/fs/pseudofs/pseudofs.c new file mode 100644 index 00000000..73d3c7cb --- /dev/null +++ b/freebsd/sys/fs/pseudofs/pseudofs.c @@ -0,0 +1,491 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
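For context on the deadfs operations imported above: deadfs is never mounted. The VFS installs dead_vnodeops on a vnode when it is revoked or reclaimed, so that file descriptors still referencing it fail with EBADF/EIO/ENXIO instead of dereferencing freed per-filesystem state. A rough sketch of that hand-off, loosely modelled on vgonel() in vfs_subr.c (imported later in this patch); the helper name is illustrative only and not part of the import:

/*
 * Illustrative only: the real transition is performed by vgonel(),
 * which also tears down buffers, the VM object and locks first.
 */
static void
example_mark_vnode_dead(struct vnode *vp)
{

	/* From here on, every VOP_*() on vp resolves through deadfs. */
	vp->v_op = &dead_vnodeops;
	vp->v_type = VBAD;
}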
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_pseudofs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static MALLOC_DEFINE(M_PFSNODES, "pfs_nodes", "pseudofs nodes"); + +SYSCTL_NODE(_vfs, OID_AUTO, pfs, CTLFLAG_RW, 0, + "pseudofs"); + +#ifdef PSEUDOFS_TRACE +int pfs_trace; +SYSCTL_INT(_vfs_pfs, OID_AUTO, trace, CTLFLAG_RW, &pfs_trace, 0, + "enable tracing of pseudofs vnode operations"); +#endif + +#if PFS_FSNAMELEN != MFSNAMELEN +#error "PFS_FSNAMELEN is not equal to MFSNAMELEN" +#endif + +/* + * Allocate and initialize a node + */ +static struct pfs_node * +pfs_alloc_node_flags(struct pfs_info *pi, const char *name, pfs_type_t type, int flags) +{ + struct pfs_node *pn; + int malloc_flags; + + KASSERT(strlen(name) < PFS_NAMELEN, + ("%s(): node name is too long", __func__)); + if (flags & PFS_NOWAIT) + malloc_flags = M_NOWAIT | M_ZERO; + else + malloc_flags = M_WAITOK | M_ZERO; + pn = malloc(sizeof *pn, M_PFSNODES, malloc_flags); + if (pn == NULL) + return (NULL); + mtx_init(&pn->pn_mutex, "pfs_node", NULL, MTX_DEF | MTX_DUPOK); + strlcpy(pn->pn_name, name, sizeof pn->pn_name); + pn->pn_type = type; + pn->pn_info = pi; + return (pn); +} + +static struct pfs_node * +pfs_alloc_node(struct pfs_info *pi, const char *name, pfs_type_t type) +{ + return (pfs_alloc_node_flags(pi, name, type, 0)); +} + +/* + * Add a node to a directory + */ +static void +pfs_add_node(struct pfs_node *parent, struct pfs_node *pn) +{ +#ifdef INVARIANTS + struct pfs_node *iter; +#endif + + KASSERT(parent != NULL, + ("%s(): parent is NULL", __func__)); + KASSERT(pn->pn_parent == NULL, + ("%s(): node already has a parent", __func__)); + KASSERT(parent->pn_info != NULL, + ("%s(): parent has no pn_info", __func__)); + KASSERT(parent->pn_type == pfstype_dir || + parent->pn_type == pfstype_procdir || + parent->pn_type == pfstype_root, + ("%s(): parent is not a directory", __func__)); + +#ifdef INVARIANTS + /* XXX no locking! */ + if (pn->pn_type == pfstype_procdir) + for (iter = parent; iter != NULL; iter = iter->pn_parent) + KASSERT(iter->pn_type != pfstype_procdir, + ("%s(): nested process directories", __func__)); + for (iter = parent->pn_nodes; iter != NULL; iter = iter->pn_next) { + KASSERT(strcmp(pn->pn_name, iter->pn_name) != 0, + ("%s(): homonymous siblings", __func__)); + if (pn->pn_type == pfstype_procdir) + KASSERT(iter->pn_type != pfstype_procdir, + ("%s(): sibling process directories", __func__)); + } +#endif + + pn->pn_parent = parent; + pfs_fileno_alloc(pn); + + pfs_lock(parent); + pn->pn_next = parent->pn_nodes; + if ((parent->pn_flags & PFS_PROCDEP) != 0) + pn->pn_flags |= PFS_PROCDEP; + parent->pn_nodes = pn; + pfs_unlock(parent); +} + +/* + * Detach a node from its aprent + */ +static void +pfs_detach_node(struct pfs_node *pn) +{ + struct pfs_node *parent = pn->pn_parent; + struct pfs_node **iter; + + KASSERT(parent != NULL, ("%s(): node has no parent", __func__)); + KASSERT(parent->pn_info == pn->pn_info, + ("%s(): parent has different pn_info", __func__)); + + pfs_lock(parent); + iter = &parent->pn_nodes; + while (*iter != NULL) { + if (*iter == pn) { + *iter = pn->pn_next; + break; + } + iter = &(*iter)->pn_next; + } + pn->pn_parent = NULL; + pfs_unlock(parent); +} + +/* + * Add . and .. 
to a directory + */ +static int +pfs_fixup_dir_flags(struct pfs_node *parent, int flags) +{ + struct pfs_node *dot, *dotdot; + + dot = pfs_alloc_node_flags(parent->pn_info, ".", pfstype_this, flags); + if (dot == NULL) + return (ENOMEM); + dotdot = pfs_alloc_node_flags(parent->pn_info, "..", pfstype_parent, flags); + if (dotdot == NULL) { + pfs_destroy(dot); + return (ENOMEM); + } + pfs_add_node(parent, dot); + pfs_add_node(parent, dotdot); + return (0); +} + +static void +pfs_fixup_dir(struct pfs_node *parent) +{ + + pfs_fixup_dir_flags(parent, 0); +} + +/* + * Create a directory + */ +struct pfs_node * +pfs_create_dir(struct pfs_node *parent, const char *name, + pfs_attr_t attr, pfs_vis_t vis, pfs_destroy_t destroy, + int flags) +{ + struct pfs_node *pn; + int rc; + + pn = pfs_alloc_node_flags(parent->pn_info, name, + (flags & PFS_PROCDEP) ? pfstype_procdir : pfstype_dir, flags); + if (pn == NULL) + return (NULL); + pn->pn_attr = attr; + pn->pn_vis = vis; + pn->pn_destroy = destroy; + pn->pn_flags = flags; + pfs_add_node(parent, pn); + rc = pfs_fixup_dir_flags(pn, flags); + if (rc) { + pfs_destroy(pn); + return (NULL); + } + return (pn); +} + +/* + * Create a file + */ +struct pfs_node * +pfs_create_file(struct pfs_node *parent, const char *name, pfs_fill_t fill, + pfs_attr_t attr, pfs_vis_t vis, pfs_destroy_t destroy, + int flags) +{ + struct pfs_node *pn; + + pn = pfs_alloc_node_flags(parent->pn_info, name, pfstype_file, flags); + if (pn == NULL) + return (NULL); + pn->pn_fill = fill; + pn->pn_attr = attr; + pn->pn_vis = vis; + pn->pn_destroy = destroy; + pn->pn_flags = flags; + pfs_add_node(parent, pn); + + return (pn); +} + +/* + * Create a symlink + */ +struct pfs_node * +pfs_create_link(struct pfs_node *parent, const char *name, pfs_fill_t fill, + pfs_attr_t attr, pfs_vis_t vis, pfs_destroy_t destroy, + int flags) +{ + struct pfs_node *pn; + + pn = pfs_alloc_node_flags(parent->pn_info, name, pfstype_symlink, flags); + if (pn == NULL) + return (NULL); + pn->pn_fill = fill; + pn->pn_attr = attr; + pn->pn_vis = vis; + pn->pn_destroy = destroy; + pn->pn_flags = flags; + pfs_add_node(parent, pn); + + return (pn); +} + +/* + * Locate a node by name + */ +struct pfs_node * +pfs_find_node(struct pfs_node *parent, const char *name) +{ + struct pfs_node *pn; + + pfs_lock(parent); + for (pn = parent->pn_nodes; pn != NULL; pn = pn->pn_next) + if (strcmp(pn->pn_name, name) == 0) + break; + pfs_unlock(parent); + return (pn); +} + +/* + * Destroy a node and all its descendants. If the node to be destroyed + * has a parent, the parent's mutex must be held. 
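To make the constructors above (pfs_create_dir(), pfs_create_file(), pfs_create_link()) concrete, here is a minimal consumer sketch. The names examplefs_init and examplefs_version_fill are hypothetical and not part of this import; the fill callback itself is sketched further down where the fill interface is discussed. The init callback receives the instance's root node in pi->pi_root and simply hangs new nodes off it:

static int examplefs_version_fill(PFS_FILL_ARGS);

static int
examplefs_init(PFS_INIT_ARGS)
{
	struct pfs_node *dir;

	/* create a "status" directory and a read-only "version" file in it */
	dir = pfs_create_dir(pi->pi_root, "status", NULL, NULL, NULL, 0);
	pfs_create_file(dir, "version", examplefs_version_fill,
	    NULL, NULL, NULL, PFS_RD);
	return (0);
}

The NULL arguments are the optional attr, vis and destroy callbacks; flags besides PFS_RD (e.g. PFS_PROCDEP or PFS_NOWAIT) are defined in pseudofs.h below.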
+ */ +int +pfs_destroy(struct pfs_node *pn) +{ + struct pfs_node *iter; + + KASSERT(pn != NULL, + ("%s(): node is NULL", __func__)); + KASSERT(pn->pn_info != NULL, + ("%s(): node has no pn_info", __func__)); + + if (pn->pn_parent) + pfs_detach_node(pn); + + /* destroy children */ + if (pn->pn_type == pfstype_dir || + pn->pn_type == pfstype_procdir || + pn->pn_type == pfstype_root) { + pfs_lock(pn); + while (pn->pn_nodes != NULL) { + iter = pn->pn_nodes; + pn->pn_nodes = iter->pn_next; + iter->pn_parent = NULL; + pfs_unlock(pn); + pfs_destroy(iter); + pfs_lock(pn); + } + pfs_unlock(pn); + } + + /* revoke vnodes and fileno */ + pfs_purge(pn); + + /* callback to free any private resources */ + if (pn->pn_destroy != NULL) + pn_destroy(pn); + + /* destroy the node */ + pfs_fileno_free(pn); + mtx_destroy(&pn->pn_mutex); + free(pn, M_PFSNODES); + + return (0); +} + +/* + * Mount a pseudofs instance + */ +int +pfs_mount(struct pfs_info *pi, struct mount *mp) +{ + struct statfs *sbp; + + if (mp->mnt_flag & MNT_UPDATE) + return (EOPNOTSUPP); + + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_LOCAL; + MNT_IUNLOCK(mp); + mp->mnt_data = pi; + vfs_getnewfsid(mp); + + sbp = &mp->mnt_stat; + vfs_mountedfrom(mp, pi->pi_name); + sbp->f_bsize = PAGE_SIZE; + sbp->f_iosize = PAGE_SIZE; + sbp->f_blocks = 1; + sbp->f_bfree = 0; + sbp->f_bavail = 0; + sbp->f_files = 1; + sbp->f_ffree = 0; + + return (0); +} + +/* + * Compatibility shim for old mount(2) system call + */ +int +pfs_cmount(struct mntarg *ma, void *data, uint64_t flags) +{ + int error; + + error = kernel_mount(ma, flags); + return (error); +} + +/* + * Unmount a pseudofs instance + */ +int +pfs_unmount(struct mount *mp, int mntflags) +{ + int error; + + error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, + curthread); + return (error); +} + +/* + * Return a root vnode + */ +int +pfs_root(struct mount *mp, int flags, struct vnode **vpp) +{ + struct pfs_info *pi; + + pi = (struct pfs_info *)mp->mnt_data; + return (pfs_vncache_alloc(mp, vpp, pi->pi_root, NO_PID)); +} + +/* + * Return filesystem stats + */ +int +pfs_statfs(struct mount *mp, struct statfs *sbp) +{ + /* no-op: always called with mp->mnt_stat */ + return (0); +} + +/* + * Initialize a pseudofs instance + */ +int +pfs_init(struct pfs_info *pi, struct vfsconf *vfc) +{ + struct pfs_node *root; + int error; + + pfs_fileno_init(pi); + + /* set up the root directory */ + root = pfs_alloc_node(pi, "/", pfstype_root); + pi->pi_root = root; + pfs_fileno_alloc(root); + pfs_fixup_dir(root); + + /* construct file hierarchy */ + error = (pi->pi_init)(pi, vfc); + if (error) { + pfs_destroy(root); + pi->pi_root = NULL; + return (error); + } + + if (bootverbose) + printf("%s registered\n", pi->pi_name); + return (0); +} + +/* + * Destroy a pseudofs instance + */ +int +pfs_uninit(struct pfs_info *pi, struct vfsconf *vfc) +{ + int error; + + pfs_destroy(pi->pi_root); + pi->pi_root = NULL; + pfs_fileno_uninit(pi); + if (bootverbose) + printf("%s unregistered\n", pi->pi_name); + error = (pi->pi_uninit)(pi, vfc); + return (error); +} + +/* + * Handle load / unload events + */ +static int +pfs_modevent(module_t mod, int evt, void *arg) +{ + switch (evt) { + case MOD_LOAD: + pfs_vncache_load(); + break; + case MOD_UNLOAD: + case MOD_SHUTDOWN: + pfs_vncache_unload(); + break; + default: + return EOPNOTSUPP; + break; + } + return 0; +} + +/* + * Module declaration + */ +static moduledata_t pseudofs_data = { + "pseudofs", + pfs_modevent, + NULL +}; +DECLARE_MODULE(pseudofs, pseudofs_data, SI_SUB_EXEC, 
SI_ORDER_FIRST); +MODULE_VERSION(pseudofs, 1); diff --git a/freebsd/sys/fs/pseudofs/pseudofs.h b/freebsd/sys/fs/pseudofs/pseudofs.h new file mode 100644 index 00000000..602e1fbf --- /dev/null +++ b/freebsd/sys/fs/pseudofs/pseudofs.h @@ -0,0 +1,312 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _PSEUDOFS_H_INCLUDED +#define _PSEUDOFS_H_INCLUDED + +#include + +/* + * Opaque structures + */ +struct mntarg; +struct mount; +struct nameidata; +struct proc; +struct sbuf; +struct statfs; +struct thread; +struct uio; +struct vfsconf; +struct vnode; + +/* + * Limits and constants + */ +#define PFS_NAMELEN 128 +#define PFS_FSNAMELEN 16 /* equal to MFSNAMELEN */ +#define PFS_DELEN (offsetof(struct dirent, d_name) + PFS_NAMELEN) + +typedef enum { + pfstype_none = 0, + pfstype_root, + pfstype_dir, + pfstype_this, + pfstype_parent, + pfstype_file, + pfstype_symlink, + pfstype_procdir +} pfs_type_t; + +/* + * Flags + */ +#define PFS_RD 0x0001 /* readable */ +#define PFS_WR 0x0002 /* writeable */ +#define PFS_RDWR (PFS_RD|PFS_WR) +#define PFS_RAWRD 0x0004 /* raw reader */ +#define PFS_RAWWR 0x0008 /* raw writer */ +#define PFS_RAW (PFS_RAWRD|PFS_RAWWR) +#define PFS_PROCDEP 0x0010 /* process-dependent */ +#define PFS_NOWAIT 0x0020 /* allow malloc to fail */ + +/* + * Data structures + */ +struct pfs_info; +struct pfs_node; + +/* + * Init / uninit callback + */ +#define PFS_INIT_ARGS \ + struct pfs_info *pi, struct vfsconf *vfc +#define PFS_INIT_ARGNAMES \ + pi, vfc +#define PFS_INIT_PROTO(name) \ + int name(PFS_INIT_ARGS); +typedef int (*pfs_init_t)(PFS_INIT_ARGS); + +/* + * Filler callback + * Called with proc held but unlocked + */ +#define PFS_FILL_ARGS \ + struct thread *td, struct proc *p, struct pfs_node *pn, \ + struct sbuf *sb, struct uio *uio +#define PFS_FILL_ARGNAMES \ + td, p, pn, sb, uio +#define PFS_FILL_PROTO(name) \ + int name(PFS_FILL_ARGS); +typedef int (*pfs_fill_t)(PFS_FILL_ARGS); + +/* + * Attribute callback + * Called with proc locked + */ +struct vattr; +#define PFS_ATTR_ARGS \ + struct thread *td, struct proc *p, struct pfs_node *pn, \ + struct vattr *vap +#define PFS_ATTR_ARGNAMES \ + td, p, pn, vap +#define PFS_ATTR_PROTO(name) \ + int name(PFS_ATTR_ARGS); +typedef int (*pfs_attr_t)(PFS_ATTR_ARGS); + +/* + * Visibility callback + * Called with proc locked + */ +#define PFS_VIS_ARGS \ + struct thread *td, struct proc *p, struct pfs_node *pn +#define PFS_VIS_ARGNAMES \ + td, p, pn +#define PFS_VIS_PROTO(name) \ + int name(PFS_VIS_ARGS); +typedef int (*pfs_vis_t)(PFS_VIS_ARGS); + +/* + * Ioctl callback + * Called with proc locked + */ +#define PFS_IOCTL_ARGS \ + struct thread *td, struct proc *p, struct pfs_node *pn, \ + unsigned long cmd, void *data +#define PFS_IOCTL_ARGNAMES \ + td, p, pn, cmd, data +#define PFS_IOCTL_PROTO(name) \ + int name(PFS_IOCTL_ARGS); +typedef int (*pfs_ioctl_t)(PFS_IOCTL_ARGS); + +/* + * Getextattr callback + * Called with proc locked + */ +#define PFS_GETEXTATTR_ARGS \ + struct thread *td, struct proc *p, struct pfs_node *pn, \ + int attrnamespace, const char *name, struct uio *uio, \ + size_t *size, struct ucred *cred +#define PFS_GETEXTATTR_ARGNAMES \ + td, p, pn, attrnamespace, name, uio, size, cred +#define PFS_GETEXTATTR_PROTO(name) \ + int name(PFS_GETEXTATTR_ARGS); +struct ucred; +typedef int (*pfs_getextattr_t)(PFS_GETEXTATTR_ARGS); + +/* + * Last-close callback + * Called with proc locked + */ +#define PFS_CLOSE_ARGS \ + struct thread *td, struct proc *p, struct pfs_node *pn +#define PFS_CLOSE_ARGNAMES \ + td, p, pn +#define PFS_CLOSE_PROTO(name) \ + int name(PFS_CLOSE_ARGS); +typedef int (*pfs_close_t)(PFS_CLOSE_ARGS); + +/* + * Destroy callback + */ +#define PFS_DESTROY_ARGS \ + struct pfs_node *pn +#define PFS_DESTROY_ARGNAMES \ + pn +#define PFS_DESTROY_PROTO(name) \ + int name(PFS_DESTROY_ARGS); 
+typedef int (*pfs_destroy_t)(PFS_DESTROY_ARGS); + +/* + * pfs_info: describes a pseudofs instance + * + * The pi_mutex is only used to avoid using the global subr_unit lock + * for unrhdr. The rest of struct pfs_info is only modified during + * vfs_init() and vfs_uninit() of the consumer filesystem. + */ +struct pfs_info { + char pi_name[PFS_FSNAMELEN]; + pfs_init_t pi_init; + pfs_init_t pi_uninit; + + /* members below this line are initialized at run time */ + struct pfs_node *pi_root; + struct mtx pi_mutex; + struct unrhdr *pi_unrhdr; +}; + +/* + * pfs_node: describes a node (file or directory) within a pseudofs + * + * - Fields marked (o) are protected by the node's own mutex. + * - Fields marked (p) are protected by the node's parent's mutex. + * - Remaining fields are not protected by any lock and are assumed to be + * immutable once the node has been created. + * + * To prevent deadlocks, if a node's mutex is to be held at the same time + * as its parent's (e.g. when adding or removing nodes to a directory), + * the parent's mutex must always be acquired first. Unfortunately, this + * is not enforcable by WITNESS. + */ +struct pfs_node { + char pn_name[PFS_NAMELEN]; + pfs_type_t pn_type; + int pn_flags; + struct mtx pn_mutex; + void *pn_data; /* (o) */ + + pfs_fill_t pn_fill; + pfs_ioctl_t pn_ioctl; + pfs_close_t pn_close; + pfs_attr_t pn_attr; + pfs_vis_t pn_vis; + pfs_getextattr_t pn_getextattr; + pfs_destroy_t pn_destroy; + + struct pfs_info *pn_info; + u_int32_t pn_fileno; /* (o) */ + + struct pfs_node *pn_parent; /* (o) */ + struct pfs_node *pn_nodes; /* (o) */ + struct pfs_node *pn_next; /* (p) */ +}; + +/* + * VFS interface + */ +int pfs_mount (struct pfs_info *pi, struct mount *mp); +int pfs_cmount (struct mntarg *ma, void *data, uint64_t flags); +int pfs_unmount (struct mount *mp, int mntflags); +int pfs_root (struct mount *mp, int flags, + struct vnode **vpp); +int pfs_statfs (struct mount *mp, struct statfs *sbp); +int pfs_init (struct pfs_info *pi, struct vfsconf *vfc); +int pfs_uninit (struct pfs_info *pi, struct vfsconf *vfc); + +/* + * Directory structure construction and manipulation + */ +struct pfs_node *pfs_create_dir (struct pfs_node *parent, const char *name, + pfs_attr_t attr, pfs_vis_t vis, + pfs_destroy_t destroy, int flags); +struct pfs_node *pfs_create_file(struct pfs_node *parent, const char *name, + pfs_fill_t fill, pfs_attr_t attr, + pfs_vis_t vis, pfs_destroy_t destroy, + int flags); +struct pfs_node *pfs_create_link(struct pfs_node *parent, const char *name, + pfs_fill_t fill, pfs_attr_t attr, + pfs_vis_t vis, pfs_destroy_t destroy, + int flags); +struct pfs_node *pfs_find_node (struct pfs_node *parent, const char *name); +void pfs_purge (struct pfs_node *pn); +int pfs_destroy (struct pfs_node *pn); + +/* + * Now for some initialization magic... 
+ */ +#define PSEUDOFS(name, version, flags) \ + \ +static struct pfs_info name##_info = { \ + #name, \ + name##_init, \ + name##_uninit, \ +}; \ + \ +static int \ +_##name##_mount(struct mount *mp) { \ + return (pfs_mount(&name##_info, mp)); \ +} \ + \ +static int \ +_##name##_init(struct vfsconf *vfc) { \ + return (pfs_init(&name##_info, vfc)); \ +} \ + \ +static int \ +_##name##_uninit(struct vfsconf *vfc) { \ + return (pfs_uninit(&name##_info, vfc)); \ +} \ + \ +static struct vfsops name##_vfsops = { \ + .vfs_cmount = pfs_cmount, \ + .vfs_init = _##name##_init, \ + .vfs_mount = _##name##_mount, \ + .vfs_root = pfs_root, \ + .vfs_statfs = pfs_statfs, \ + .vfs_uninit = _##name##_uninit, \ + .vfs_unmount = pfs_unmount, \ +}; \ +VFS_SET(name##_vfsops, name, VFCF_SYNTHETIC | flags); \ +MODULE_VERSION(name, version); \ +MODULE_DEPEND(name, pseudofs, 1, 1, 1); + +#endif diff --git a/freebsd/sys/fs/pseudofs/pseudofs_fileno.c b/freebsd/sys/fs/pseudofs/pseudofs_fileno.c new file mode 100644 index 00000000..2c6b2d1f --- /dev/null +++ b/freebsd/sys/fs/pseudofs/pseudofs_fileno.c @@ -0,0 +1,159 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
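The PSEUDOFS() macro above is the piece that turns an init/uninit pair into a registered synthetic filesystem: it generates the vfsops glue (pfs_mount, pfs_root, pfs_statfs, ...) plus the VFS_SET() and MODULE_VERSION() boilerplate. Continuing the hypothetical examplefs sketch started earlier:

static int
examplefs_uninit(PFS_INIT_ARGS)
{

	/* nothing was allocated outside the node tree, so nothing to do */
	return (0);
}

PSEUDOFS(examplefs, 1, 0);

The third macro argument is OR'd with VFCF_SYNTHETIC by VFS_SET(), so 0 is a sensible default unless the consumer needs additional VFCF_* flags.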
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_pseudofs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Initialize fileno bitmap + */ +void +pfs_fileno_init(struct pfs_info *pi) +{ + + mtx_init(&pi->pi_mutex, "pfs_fileno", NULL, MTX_DEF); + pi->pi_unrhdr = new_unrhdr(3, INT_MAX / NO_PID, &pi->pi_mutex); +} + +/* + * Tear down fileno bitmap + */ +void +pfs_fileno_uninit(struct pfs_info *pi) +{ + + delete_unrhdr(pi->pi_unrhdr); + pi->pi_unrhdr = NULL; + mtx_destroy(&pi->pi_mutex); +} + +/* + * Allocate a file number + */ +void +pfs_fileno_alloc(struct pfs_node *pn) +{ + + if (pn->pn_parent) + PFS_TRACE(("%s/%s", pn->pn_parent->pn_name, pn->pn_name)); + else + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + switch (pn->pn_type) { + case pfstype_root: + /* root must always be 2 */ + pn->pn_fileno = 2; + break; + case pfstype_dir: + case pfstype_file: + case pfstype_symlink: + case pfstype_procdir: + pn->pn_fileno = alloc_unr(pn->pn_info->pi_unrhdr); + break; + case pfstype_this: + KASSERT(pn->pn_parent != NULL, + ("%s(): pfstype_this node has no parent", __func__)); + pn->pn_fileno = pn->pn_parent->pn_fileno; + break; + case pfstype_parent: + KASSERT(pn->pn_parent != NULL, + ("%s(): pfstype_parent node has no parent", __func__)); + if (pn->pn_parent->pn_type == pfstype_root) { + pn->pn_fileno = pn->pn_parent->pn_fileno; + break; + } + KASSERT(pn->pn_parent->pn_parent != NULL, + ("%s(): pfstype_parent node has no grandparent", __func__)); + pn->pn_fileno = pn->pn_parent->pn_parent->pn_fileno; + break; + case pfstype_none: + KASSERT(0, + ("%s(): pfstype_none node", __func__)); + break; + } + +#if 0 + printf("%s(): %s: ", __func__, pn->pn_info->pi_name); + if (pn->pn_parent) { + if (pn->pn_parent->pn_parent) { + printf("%s/", pn->pn_parent->pn_parent->pn_name); + } + printf("%s/", pn->pn_parent->pn_name); + } + printf("%s -> %d\n", pn->pn_name, pn->pn_fileno); +#endif +} + +/* + * Release a file number + */ +void +pfs_fileno_free(struct pfs_node *pn) +{ + + pfs_assert_not_owned(pn); + + switch (pn->pn_type) { + case pfstype_root: + /* not allocated from unrhdr */ + return; + case pfstype_dir: + case pfstype_file: + case pfstype_symlink: + case pfstype_procdir: + free_unr(pn->pn_info->pi_unrhdr, pn->pn_fileno); + break; + case pfstype_this: + case pfstype_parent: + /* ignore these, as they don't "own" their file number */ + break; + case pfstype_none: + KASSERT(0, + ("pfs_fileno_free() called for pfstype_none node")); + break; + } +} diff --git a/freebsd/sys/fs/pseudofs/pseudofs_internal.h b/freebsd/sys/fs/pseudofs/pseudofs_internal.h new file mode 100644 index 00000000..3ec49e71 --- /dev/null +++ b/freebsd/sys/fs/pseudofs/pseudofs_internal.h @@ -0,0 +1,213 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PSEUDOFS_INTERNAL_H_INCLUDED +#define _PSEUDOFS_INTERNAL_H_INCLUDED + +/* + * Sysctl subtree + */ +SYSCTL_DECL(_vfs_pfs); + +/* + * Vnode data + */ +struct pfs_vdata { + struct pfs_node *pvd_pn; + pid_t pvd_pid; + struct vnode *pvd_vnode; + struct pfs_vdata*pvd_prev, *pvd_next; + int pvd_dead:1; +}; + +/* + * Vnode cache + */ +void pfs_vncache_load (void); +void pfs_vncache_unload (void); +int pfs_vncache_alloc (struct mount *, struct vnode **, + struct pfs_node *, pid_t pid); +int pfs_vncache_free (struct vnode *); + +/* + * File number bitmap + */ +void pfs_fileno_init (struct pfs_info *); +void pfs_fileno_uninit (struct pfs_info *); +void pfs_fileno_alloc (struct pfs_node *); +void pfs_fileno_free (struct pfs_node *); + +/* + * Debugging + */ +#ifdef PSEUDOFS_TRACE +extern int pfs_trace; + +#define PFS_TRACE(foo) \ + do { \ + if (pfs_trace) { \ + printf("%s(): line %d: ", __func__, __LINE__); \ + printf foo ; \ + printf("\n"); \ + } \ + } while (0) +#define PFS_RETURN(err) \ + do { \ + if (pfs_trace) { \ + printf("%s(): line %d: returning %d\n", \ + __func__, __LINE__, err); \ + } \ + return (err); \ + } while (0) +#else +#define PFS_TRACE(foo) \ + do { /* nothing */ } while (0) +#define PFS_RETURN(err) \ + return (err) +#endif + +/* + * Inline helpers for locking + */ +static inline void +pfs_lock(struct pfs_node *pn) +{ + + mtx_lock(&pn->pn_mutex); +} + +static inline void +pfs_unlock(struct pfs_node *pn) +{ + + mtx_unlock(&pn->pn_mutex); +} + +static inline void +pfs_assert_owned(struct pfs_node *pn) +{ + + mtx_assert(&pn->pn_mutex, MA_OWNED); +} + +static inline void +pfs_assert_not_owned(struct pfs_node *pn) +{ + + mtx_assert(&pn->pn_mutex, MA_NOTOWNED); +} + +static inline int +pn_fill(PFS_FILL_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_fill != NULL, ("%s(): no callback", __func__)); + if (p != NULL) { + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + PROC_ASSERT_HELD(p); + } + pfs_assert_not_owned(pn); + return ((pn->pn_fill)(PFS_FILL_ARGNAMES)); +} + +static inline int +pn_attr(PFS_ATTR_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_attr != NULL, ("%s(): no callback", __func__)); + if (p != NULL) + PROC_LOCK_ASSERT(p, MA_OWNED); + pfs_assert_not_owned(pn); + return ((pn->pn_attr)(PFS_ATTR_ARGNAMES)); +} + +static inline int +pn_vis(PFS_VIS_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_vis != NULL, ("%s(): no callback", __func__)); + KASSERT(p != NULL, ("%s(): no process", __func__)); + PROC_LOCK_ASSERT(p, MA_OWNED); + pfs_assert_not_owned(pn); + return ((pn->pn_vis)(PFS_VIS_ARGNAMES)); +} + +static inline int 
+pn_ioctl(PFS_IOCTL_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_ioctl != NULL, ("%s(): no callback", __func__)); + if (p != NULL) + PROC_LOCK_ASSERT(p, MA_OWNED); + pfs_assert_not_owned(pn); + return ((pn->pn_ioctl)(PFS_IOCTL_ARGNAMES)); +} + +static inline int +pn_getextattr(PFS_GETEXTATTR_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_getextattr != NULL, ("%s(): no callback", __func__)); + if (p != NULL) + PROC_LOCK_ASSERT(p, MA_OWNED); + pfs_assert_not_owned(pn); + return ((pn->pn_getextattr)(PFS_GETEXTATTR_ARGNAMES)); +} + +static inline int +pn_close(PFS_CLOSE_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_close != NULL, ("%s(): no callback", __func__)); + if (p != NULL) + PROC_LOCK_ASSERT(p, MA_OWNED); + pfs_assert_not_owned(pn); + return ((pn->pn_close)(PFS_CLOSE_ARGNAMES)); +} + +static inline int +pn_destroy(PFS_DESTROY_ARGS) +{ + + PFS_TRACE(("%s", pn->pn_name)); + KASSERT(pn->pn_destroy != NULL, ("%s(): no callback", __func__)); + pfs_assert_not_owned(pn); + return ((pn->pn_destroy)(PFS_DESTROY_ARGNAMES)); +} + +#endif diff --git a/freebsd/sys/fs/pseudofs/pseudofs_vncache.c b/freebsd/sys/fs/pseudofs/pseudofs_vncache.c new file mode 100644 index 00000000..05dd6569 --- /dev/null +++ b/freebsd/sys/fs/pseudofs/pseudofs_vncache.c @@ -0,0 +1,333 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
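The pn_fill() wrapper above is where a node's fill callback is ultimately invoked. For an ordinary (non-raw) node the callback only appends text to the supplied sbuf; the uio handling is done by pfs_read()/pfs_write() in pseudofs_vnops.c. A minimal sketch of the hypothetical examplefs_version_fill() referenced earlier:

static int
examplefs_version_fill(PFS_FILL_ARGS)
{

	/* p may be NULL unless the node is process-dependent */
	sbuf_printf(sb, "examplefs 1.0 (pid %d)\n",
	    p != NULL ? (int)p->p_pid : -1);
	return (0);
}

Nodes created with PFS_RAWRD or PFS_RAWWR are instead called with sb == NULL and are expected to operate on the uio directly, as can be seen in the PFS_RAWRD branch of pfs_read() further down.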
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_pseudofs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static MALLOC_DEFINE(M_PFSVNCACHE, "pfs_vncache", "pseudofs vnode cache"); + +static struct mtx pfs_vncache_mutex; +static struct pfs_vdata *pfs_vncache; +static eventhandler_tag pfs_exit_tag; +static void pfs_exit(void *arg, struct proc *p); +static void pfs_purge_locked(struct pfs_node *pn, bool force); + +static SYSCTL_NODE(_vfs_pfs, OID_AUTO, vncache, CTLFLAG_RW, 0, + "pseudofs vnode cache"); + +static int pfs_vncache_entries; +SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, entries, CTLFLAG_RD, + &pfs_vncache_entries, 0, + "number of entries in the vnode cache"); + +static int pfs_vncache_maxentries; +SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, maxentries, CTLFLAG_RD, + &pfs_vncache_maxentries, 0, + "highest number of entries in the vnode cache"); + +static int pfs_vncache_hits; +SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, hits, CTLFLAG_RD, + &pfs_vncache_hits, 0, + "number of cache hits since initialization"); + +static int pfs_vncache_misses; +SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, misses, CTLFLAG_RD, + &pfs_vncache_misses, 0, + "number of cache misses since initialization"); + +extern struct vop_vector pfs_vnodeops; /* XXX -> .h file */ + +/* + * Initialize vnode cache + */ +void +pfs_vncache_load(void) +{ + + mtx_init(&pfs_vncache_mutex, "pfs_vncache", NULL, MTX_DEF); + pfs_exit_tag = EVENTHANDLER_REGISTER(process_exit, pfs_exit, NULL, + EVENTHANDLER_PRI_ANY); +} + +/* + * Tear down vnode cache + */ +void +pfs_vncache_unload(void) +{ + + EVENTHANDLER_DEREGISTER(process_exit, pfs_exit_tag); + mtx_lock(&pfs_vncache_mutex); + pfs_purge_locked(NULL, true); + mtx_unlock(&pfs_vncache_mutex); + KASSERT(pfs_vncache_entries == 0, + ("%d vncache entries remaining", pfs_vncache_entries)); + mtx_destroy(&pfs_vncache_mutex); +} + +/* + * Allocate a vnode + */ +int +pfs_vncache_alloc(struct mount *mp, struct vnode **vpp, + struct pfs_node *pn, pid_t pid) +{ + struct pfs_vdata *pvd, *pvd2; + struct vnode *vp; + int error; + + /* + * See if the vnode is in the cache. + * XXX linear search is not very efficient. + */ +retry: + mtx_lock(&pfs_vncache_mutex); + for (pvd = pfs_vncache; pvd; pvd = pvd->pvd_next) { + if (pvd->pvd_pn == pn && pvd->pvd_pid == pid && + pvd->pvd_vnode->v_mount == mp) { + vp = pvd->pvd_vnode; + VI_LOCK(vp); + mtx_unlock(&pfs_vncache_mutex); + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) { + ++pfs_vncache_hits; + *vpp = vp; + /* + * Some callers cache_enter(vp) later, so + * we have to make sure it's not in the + * VFS cache so it doesn't get entered + * twice. A better solution would be to + * make pfs_vncache_alloc() responsible + * for entering the vnode in the VFS + * cache. 
+ */ + cache_purge(vp); + return (0); + } + goto retry; + } + } + mtx_unlock(&pfs_vncache_mutex); + + /* nope, get a new one */ + pvd = malloc(sizeof *pvd, M_PFSVNCACHE, M_WAITOK); + pvd->pvd_next = pvd->pvd_prev = NULL; + error = getnewvnode("pseudofs", mp, &pfs_vnodeops, vpp); + if (error) { + free(pvd, M_PFSVNCACHE); + return (error); + } + pvd->pvd_pn = pn; + pvd->pvd_pid = pid; + (*vpp)->v_data = pvd; + switch (pn->pn_type) { + case pfstype_root: + (*vpp)->v_vflag = VV_ROOT; +#if 0 + printf("root vnode allocated\n"); +#endif + /* fall through */ + case pfstype_dir: + case pfstype_this: + case pfstype_parent: + case pfstype_procdir: + (*vpp)->v_type = VDIR; + break; + case pfstype_file: + (*vpp)->v_type = VREG; + break; + case pfstype_symlink: + (*vpp)->v_type = VLNK; + break; + case pfstype_none: + KASSERT(0, ("pfs_vncache_alloc called for null node\n")); + default: + panic("%s has unexpected type: %d", pn->pn_name, pn->pn_type); + } + /* + * Propagate flag through to vnode so users know it can change + * if the process changes (i.e. execve) + */ + if ((pn->pn_flags & PFS_PROCDEP) != 0) + (*vpp)->v_vflag |= VV_PROCDEP; + pvd->pvd_vnode = *vpp; + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); + VN_LOCK_AREC(*vpp); + error = insmntque(*vpp, mp); + if (error != 0) { + free(pvd, M_PFSVNCACHE); + *vpp = NULLVP; + return (error); + } +retry2: + mtx_lock(&pfs_vncache_mutex); + /* + * Other thread may race with us, creating the entry we are + * going to insert into the cache. Recheck after + * pfs_vncache_mutex is reacquired. + */ + for (pvd2 = pfs_vncache; pvd2; pvd2 = pvd2->pvd_next) { + if (pvd2->pvd_pn == pn && pvd2->pvd_pid == pid && + pvd2->pvd_vnode->v_mount == mp) { + vp = pvd2->pvd_vnode; + VI_LOCK(vp); + mtx_unlock(&pfs_vncache_mutex); + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) { + ++pfs_vncache_hits; + vgone(*vpp); + vput(*vpp); + *vpp = vp; + cache_purge(vp); + return (0); + } + goto retry2; + } + } + ++pfs_vncache_misses; + if (++pfs_vncache_entries > pfs_vncache_maxentries) + pfs_vncache_maxentries = pfs_vncache_entries; + pvd->pvd_prev = NULL; + pvd->pvd_next = pfs_vncache; + if (pvd->pvd_next) + pvd->pvd_next->pvd_prev = pvd; + pfs_vncache = pvd; + mtx_unlock(&pfs_vncache_mutex); + return (0); +} + +/* + * Free a vnode + */ +int +pfs_vncache_free(struct vnode *vp) +{ + struct pfs_vdata *pvd; + + mtx_lock(&pfs_vncache_mutex); + pvd = (struct pfs_vdata *)vp->v_data; + KASSERT(pvd != NULL, ("pfs_vncache_free(): no vnode data\n")); + if (pvd->pvd_next) + pvd->pvd_next->pvd_prev = pvd->pvd_prev; + if (pvd->pvd_prev) { + pvd->pvd_prev->pvd_next = pvd->pvd_next; + --pfs_vncache_entries; + } else if (pfs_vncache == pvd) { + pfs_vncache = pvd->pvd_next; + --pfs_vncache_entries; + } + mtx_unlock(&pfs_vncache_mutex); + + free(pvd, M_PFSVNCACHE); + vp->v_data = NULL; + return (0); +} + +/* + * Purge the cache of dead entries + * + * This is extremely inefficient due to the fact that vgone() not only + * indirectly modifies the vnode cache, but may also sleep. We can + * neither hold pfs_vncache_mutex across a vgone() call, nor make any + * assumptions about the state of the cache after vgone() returns. In + * consequence, we must start over after every vgone() call, and keep + * trying until we manage to traverse the entire cache. + * + * The only way to improve this situation is to change the data structure + * used to implement the cache. 
+ */ +static void +pfs_purge_locked(struct pfs_node *pn, bool force) +{ + struct pfs_vdata *pvd; + struct vnode *vnp; + + mtx_assert(&pfs_vncache_mutex, MA_OWNED); + pvd = pfs_vncache; + while (pvd != NULL) { + if (force || pvd->pvd_dead || + (pn != NULL && pvd->pvd_pn == pn)) { + vnp = pvd->pvd_vnode; + vhold(vnp); + mtx_unlock(&pfs_vncache_mutex); + VOP_LOCK(vnp, LK_EXCLUSIVE); + vgone(vnp); + VOP_UNLOCK(vnp, 0); + mtx_lock(&pfs_vncache_mutex); + vdrop(vnp); + pvd = pfs_vncache; + } else { + pvd = pvd->pvd_next; + } + } +} + +void +pfs_purge(struct pfs_node *pn) +{ + + mtx_lock(&pfs_vncache_mutex); + pfs_purge_locked(pn, false); + mtx_unlock(&pfs_vncache_mutex); +} + +/* + * Free all vnodes associated with a defunct process + */ +static void +pfs_exit(void *arg, struct proc *p) +{ + struct pfs_vdata *pvd; + int dead; + + if (pfs_vncache == NULL) + return; + mtx_lock(&pfs_vncache_mutex); + for (pvd = pfs_vncache, dead = 0; pvd != NULL; pvd = pvd->pvd_next) + if (pvd->pvd_pid == p->p_pid) + dead = pvd->pvd_dead = 1; + if (dead) + pfs_purge_locked(NULL, false); + mtx_unlock(&pfs_vncache_mutex); +} diff --git a/freebsd/sys/fs/pseudofs/pseudofs_vnops.c b/freebsd/sys/fs/pseudofs/pseudofs_vnops.c new file mode 100644 index 00000000..da35f062 --- /dev/null +++ b/freebsd/sys/fs/pseudofs/pseudofs_vnops.c @@ -0,0 +1,1060 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
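pfs_purge() above is what makes runtime teardown of nodes safe: pfs_destroy() (in pseudofs.c) detaches the node, recursively destroys its children, revokes any cached vnodes via the purge path, and only then frees it. A consumer removing a node it created earlier therefore needs no more than the following (hypothetical helper, continuing the examplefs sketch):

static void
examplefs_remove_status(struct pfs_node *parent)
{
	struct pfs_node *pn;

	pn = pfs_find_node(parent, "status");
	if (pn != NULL)
		pfs_destroy(pn);	/* also takes out ".", ".." and any files */
}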
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_pseudofs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define KASSERT_PN_IS_DIR(pn) \ + KASSERT((pn)->pn_type == pfstype_root || \ + (pn)->pn_type == pfstype_dir || \ + (pn)->pn_type == pfstype_procdir, \ + ("%s(): VDIR vnode refers to non-directory pfs_node", __func__)) + +#define KASSERT_PN_IS_FILE(pn) \ + KASSERT((pn)->pn_type == pfstype_file, \ + ("%s(): VREG vnode refers to non-file pfs_node", __func__)) + +#define KASSERT_PN_IS_LINK(pn) \ + KASSERT((pn)->pn_type == pfstype_symlink, \ + ("%s(): VLNK vnode refers to non-link pfs_node", __func__)) + +/* + * Returns the fileno, adjusted for target pid + */ +static uint32_t +pn_fileno(struct pfs_node *pn, pid_t pid) +{ + + KASSERT(pn->pn_fileno > 0, + ("%s(): no fileno allocated", __func__)); + if (pid != NO_PID) + return (pn->pn_fileno * NO_PID + pid); + return (pn->pn_fileno); +} + +/* + * Returns non-zero if given file is visible to given thread. + */ +static int +pfs_visible_proc(struct thread *td, struct pfs_node *pn, struct proc *proc) +{ + int visible; + + if (proc == NULL) + return (0); + + PROC_LOCK_ASSERT(proc, MA_OWNED); + + visible = ((proc->p_flag & P_WEXIT) == 0); + if (visible) + visible = (p_cansee(td, proc) == 0); + if (visible && pn->pn_vis != NULL) + visible = pn_vis(td, proc, pn); + if (!visible) + return (0); + return (1); +} + +static int +pfs_visible(struct thread *td, struct pfs_node *pn, pid_t pid, + bool allproc_locked, struct proc **p) +{ + struct proc *proc; + + PFS_TRACE(("%s (pid: %d, req: %d)", + pn->pn_name, pid, td->td_proc->p_pid)); + + if (p) + *p = NULL; + if (pid == NO_PID) + PFS_RETURN (1); + proc = allproc_locked ? pfind_locked(pid) : pfind(pid); + if (proc == NULL) + PFS_RETURN (0); + if (pfs_visible_proc(td, pn, proc)) { + if (p) + *p = proc; + else + PROC_UNLOCK(proc); + PFS_RETURN (1); + } + PROC_UNLOCK(proc); + PFS_RETURN (0); +} + +/* + * Verify permissions + */ +static int +pfs_access(struct vop_access_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct vattr vattr; + int error; + + PFS_TRACE(("%s", pvd->pvd_pn->pn_name)); + (void)pvd; + + error = VOP_GETATTR(vn, &vattr, va->a_cred); + if (error) + PFS_RETURN (error); + error = vaccess(vn->v_type, vattr.va_mode, vattr.va_uid, + vattr.va_gid, va->a_accmode, va->a_cred, NULL); + PFS_RETURN (error); +} + +/* + * Close a file or directory + */ +static int +pfs_close(struct vop_close_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + struct proc *proc; + int error; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + /* + * Do nothing unless this is the last close and the node has a + * last-close handler. 
+ */ + if (vrefcnt(vn) > 1 || pn->pn_close == NULL) + PFS_RETURN (0); + + if (pvd->pvd_pid != NO_PID) { + proc = pfind(pvd->pvd_pid); + } else { + proc = NULL; + } + + error = pn_close(va->a_td, proc, pn); + + if (proc != NULL) + PROC_UNLOCK(proc); + + PFS_RETURN (error); +} + +/* + * Get file attributes + */ +static int +pfs_getattr(struct vop_getattr_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + struct vattr *vap = va->a_vap; + struct proc *proc; + int error = 0; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc)) + PFS_RETURN (ENOENT); + + vap->va_type = vn->v_type; + vap->va_fileid = pn_fileno(pn, pvd->pvd_pid); + vap->va_flags = 0; + vap->va_blocksize = PAGE_SIZE; + vap->va_bytes = vap->va_size = 0; + vap->va_filerev = 0; + vap->va_fsid = vn->v_mount->mnt_stat.f_fsid.val[0]; + vap->va_nlink = 1; + nanotime(&vap->va_ctime); + vap->va_atime = vap->va_mtime = vap->va_ctime; + + switch (pn->pn_type) { + case pfstype_procdir: + case pfstype_root: + case pfstype_dir: +#if 0 + pfs_lock(pn); + /* compute link count */ + pfs_unlock(pn); +#endif + vap->va_mode = 0555; + break; + case pfstype_file: + case pfstype_symlink: + vap->va_mode = 0444; + break; + default: + printf("shouldn't be here!\n"); + vap->va_mode = 0; + break; + } + + if (proc != NULL) { + vap->va_uid = proc->p_ucred->cr_ruid; + vap->va_gid = proc->p_ucred->cr_rgid; + } else { + vap->va_uid = 0; + vap->va_gid = 0; + } + + if (pn->pn_attr != NULL) + error = pn_attr(curthread, proc, pn, vap); + + if(proc != NULL) + PROC_UNLOCK(proc); + + PFS_RETURN (error); +} + +/* + * Perform an ioctl + */ +static int +pfs_ioctl(struct vop_ioctl_args *va) +{ + struct vnode *vn; + struct pfs_vdata *pvd; + struct pfs_node *pn; + struct proc *proc; + int error; + + vn = va->a_vp; + vn_lock(vn, LK_SHARED | LK_RETRY); + if (vn->v_iflag & VI_DOOMED) { + VOP_UNLOCK(vn, 0); + return (EBADF); + } + pvd = vn->v_data; + pn = pvd->pvd_pn; + + PFS_TRACE(("%s: %lx", pn->pn_name, va->a_command)); + pfs_assert_not_owned(pn); + + if (vn->v_type != VREG) { + VOP_UNLOCK(vn, 0); + PFS_RETURN (EINVAL); + } + KASSERT_PN_IS_FILE(pn); + + if (pn->pn_ioctl == NULL) { + VOP_UNLOCK(vn, 0); + PFS_RETURN (ENOTTY); + } + + /* + * This is necessary because process' privileges may + * have changed since the open() call. + */ + if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc)) { + VOP_UNLOCK(vn, 0); + PFS_RETURN (EIO); + } + + error = pn_ioctl(curthread, proc, pn, va->a_command, va->a_data); + + if (proc != NULL) + PROC_UNLOCK(proc); + + VOP_UNLOCK(vn, 0); + PFS_RETURN (error); +} + +/* + * Perform getextattr + */ +static int +pfs_getextattr(struct vop_getextattr_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + struct proc *proc; + int error; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + /* + * This is necessary because either process' privileges may + * have changed since the open() call. 
+ */ + if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc)) + PFS_RETURN (EIO); + + if (pn->pn_getextattr == NULL) + error = EOPNOTSUPP; + else + error = pn_getextattr(curthread, proc, pn, + va->a_attrnamespace, va->a_name, va->a_uio, + va->a_size, va->a_cred); + + if (proc != NULL) + PROC_UNLOCK(proc); + + PFS_RETURN (error); +} + +/* + * Convert a vnode to its component name + */ +static int +pfs_vptocnp(struct vop_vptocnp_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct vnode **dvp = ap->a_vpp; + struct pfs_vdata *pvd = vp->v_data; + struct pfs_node *pd = pvd->pvd_pn; + struct pfs_node *pn; + struct mount *mp; + char *buf = ap->a_buf; + int *buflen = ap->a_buflen; + char pidbuf[PFS_NAMELEN]; + pid_t pid = pvd->pvd_pid; + int len, i, error, locked; + + i = *buflen; + error = 0; + + pfs_lock(pd); + + if (vp->v_type == VDIR && pd->pn_type == pfstype_root) { + *dvp = vp; + vhold(*dvp); + pfs_unlock(pd); + PFS_RETURN (0); + } else if (vp->v_type == VDIR && pd->pn_type == pfstype_procdir) { + len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid); + i -= len; + if (i < 0) { + error = ENOMEM; + goto failed; + } + bcopy(pidbuf, buf + i, len); + } else { + len = strlen(pd->pn_name); + i -= len; + if (i < 0) { + error = ENOMEM; + goto failed; + } + bcopy(pd->pn_name, buf + i, len); + } + + pn = pd->pn_parent; + pfs_unlock(pd); + + mp = vp->v_mount; + error = vfs_busy(mp, 0); + if (error) + return (error); + + /* + * vp is held by caller. + */ + locked = VOP_ISLOCKED(vp); + VOP_UNLOCK(vp, 0); + + error = pfs_vncache_alloc(mp, dvp, pn, pid); + if (error) { + vn_lock(vp, locked | LK_RETRY); + vfs_unbusy(mp); + PFS_RETURN(error); + } + + *buflen = i; + VOP_UNLOCK(*dvp, 0); + vn_lock(vp, locked | LK_RETRY); + vfs_unbusy(mp); + + PFS_RETURN (0); +failed: + pfs_unlock(pd); + PFS_RETURN(error); +} + +/* + * Look up a file or directory + */ +static int +pfs_lookup(struct vop_cachedlookup_args *va) +{ + struct vnode *vn = va->a_dvp; + struct vnode **vpp = va->a_vpp; + struct componentname *cnp = va->a_cnp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pd = pvd->pvd_pn; + struct pfs_node *pn, *pdn = NULL; + struct mount *mp; + pid_t pid = pvd->pvd_pid; + char *pname; + int error, i, namelen, visible; + + PFS_TRACE(("%.*s", (int)cnp->cn_namelen, cnp->cn_nameptr)); + pfs_assert_not_owned(pd); + + if (vn->v_type != VDIR) + PFS_RETURN (ENOTDIR); + KASSERT_PN_IS_DIR(pd); + + /* + * Don't support DELETE or RENAME. CREATE is supported so + * that O_CREAT will work, but the lookup will still fail if + * the file does not exist. + */ + if ((cnp->cn_flags & ISLASTCN) && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) + PFS_RETURN (EOPNOTSUPP); + + /* shortcut: check if the name is too long */ + if (cnp->cn_namelen >= PFS_NAMELEN) + PFS_RETURN (ENOENT); + + /* check that parent directory is visible... 
*/ + if (!pfs_visible(curthread, pd, pvd->pvd_pid, false, NULL)) + PFS_RETURN (ENOENT); + + /* self */ + namelen = cnp->cn_namelen; + pname = cnp->cn_nameptr; + if (namelen == 1 && pname[0] == '.') { + pn = pd; + *vpp = vn; + VREF(vn); + PFS_RETURN (0); + } + + mp = vn->v_mount; + + /* parent */ + if (cnp->cn_flags & ISDOTDOT) { + if (pd->pn_type == pfstype_root) + PFS_RETURN (EIO); + error = vfs_busy(mp, MBF_NOWAIT); + if (error != 0) { + vfs_ref(mp); + VOP_UNLOCK(vn, 0); + error = vfs_busy(mp, 0); + vn_lock(vn, LK_EXCLUSIVE | LK_RETRY); + vfs_rel(mp); + if (error != 0) + PFS_RETURN(ENOENT); + if (vn->v_iflag & VI_DOOMED) { + vfs_unbusy(mp); + PFS_RETURN(ENOENT); + } + } + VOP_UNLOCK(vn, 0); + KASSERT(pd->pn_parent != NULL, + ("%s(): non-root directory has no parent", __func__)); + /* + * This one is tricky. Descendents of procdir nodes + * inherit their parent's process affinity, but + * there's no easy reverse mapping. For simplicity, + * we assume that if this node is a procdir, its + * parent isn't (which is correct as long as + * descendents of procdir nodes are never procdir + * nodes themselves) + */ + if (pd->pn_type == pfstype_procdir) + pid = NO_PID; + pfs_lock(pd); + pn = pd->pn_parent; + pfs_unlock(pd); + goto got_pnode; + } + + pfs_lock(pd); + + /* named node */ + for (pn = pd->pn_nodes; pn != NULL; pn = pn->pn_next) + if (pn->pn_type == pfstype_procdir) + pdn = pn; + else if (pn->pn_name[namelen] == '\0' && + bcmp(pname, pn->pn_name, namelen) == 0) { + pfs_unlock(pd); + goto got_pnode; + } + + /* process dependent node */ + if ((pn = pdn) != NULL) { + pid = 0; + for (pid = 0, i = 0; i < namelen && isdigit(pname[i]); ++i) + if ((pid = pid * 10 + pname[i] - '0') > PID_MAX) + break; + if (i == cnp->cn_namelen) { + pfs_unlock(pd); + goto got_pnode; + } + } + + pfs_unlock(pd); + + PFS_RETURN (ENOENT); + + got_pnode: + pfs_assert_not_owned(pd); + pfs_assert_not_owned(pn); + visible = pfs_visible(curthread, pn, pid, false, NULL); + if (!visible) { + error = ENOENT; + goto failed; + } + + error = pfs_vncache_alloc(mp, vpp, pn, pid); + if (error) + goto failed; + + if (cnp->cn_flags & ISDOTDOT) { + vfs_unbusy(mp); + vn_lock(vn, LK_EXCLUSIVE | LK_RETRY); + if (vn->v_iflag & VI_DOOMED) { + vput(*vpp); + *vpp = NULL; + PFS_RETURN(ENOENT); + } + } + if (cnp->cn_flags & MAKEENTRY && !(vn->v_iflag & VI_DOOMED)) + cache_enter(vn, *vpp, cnp); + PFS_RETURN (0); + failed: + if (cnp->cn_flags & ISDOTDOT) { + vfs_unbusy(mp); + vn_lock(vn, LK_EXCLUSIVE | LK_RETRY); + *vpp = NULL; + } + PFS_RETURN(error); +} + +/* + * Open a file or directory. 
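+ *
+ * The PFS_RD and PFS_WR bits tested below are per-node flags supplied
+ * when the node was registered, not open(2) flags.  As a hedged sketch
+ * (the fill callback name is made up for illustration), a read-only
+ * node is typically created with
+ *
+ *	pfs_create_file(parent, "status", example_fill,
+ *	    NULL, NULL, NULL, PFS_RD);
+ *
+ * so an open for writing on such a node fails with EPERM here.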
+ */
+static int
+pfs_open(struct vop_open_args *va)
+{
+	struct vnode *vn = va->a_vp;
+	struct pfs_vdata *pvd = vn->v_data;
+	struct pfs_node *pn = pvd->pvd_pn;
+	int mode = va->a_mode;
+
+	PFS_TRACE(("%s (mode 0x%x)", pn->pn_name, mode));
+	pfs_assert_not_owned(pn);
+
+	/* check if the requested mode is permitted */
+	if (((mode & FREAD) && !(pn->pn_flags & PFS_RD)) ||
+	    ((mode & FWRITE) && !(pn->pn_flags & PFS_WR)))
+		PFS_RETURN (EPERM);
+
+	/* we don't support locking */
+	if ((mode & O_SHLOCK) || (mode & O_EXLOCK))
+		PFS_RETURN (EOPNOTSUPP);
+
+	PFS_RETURN (0);
+}
+
+/*
+ * Read from a file
+ */
+static int
+pfs_read(struct vop_read_args *va)
+{
+	struct vnode *vn = va->a_vp;
+	struct pfs_vdata *pvd = vn->v_data;
+	struct pfs_node *pn = pvd->pvd_pn;
+	struct uio *uio = va->a_uio;
+	struct proc *proc;
+	struct sbuf *sb = NULL;
+	int error, locked;
+	off_t buflen;
+
+	PFS_TRACE(("%s", pn->pn_name));
+	pfs_assert_not_owned(pn);
+
+	if (vn->v_type != VREG)
+		PFS_RETURN (EINVAL);
+	KASSERT_PN_IS_FILE(pn);
+
+	if (!(pn->pn_flags & PFS_RD))
+		PFS_RETURN (EBADF);
+
+	if (pn->pn_fill == NULL)
+		PFS_RETURN (EIO);
+
+	/*
+	 * This is necessary because either process' privileges may
+	 * have changed since the open() call.
+	 */
+	if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc))
+		PFS_RETURN (EIO);
+	if (proc != NULL) {
+		_PHOLD(proc);
+		PROC_UNLOCK(proc);
+	}
+
+	vhold(vn);
+	locked = VOP_ISLOCKED(vn);
+	VOP_UNLOCK(vn, 0);
+
+	if (pn->pn_flags & PFS_RAWRD) {
+		PFS_TRACE(("%zd resid", uio->uio_resid));
+		error = pn_fill(curthread, proc, pn, NULL, uio);
+		PFS_TRACE(("%zd resid", uio->uio_resid));
+		goto ret;
+	}
+
+	if (uio->uio_resid < 0 || uio->uio_offset < 0 ||
+	    uio->uio_resid > OFF_MAX - uio->uio_offset) {
+		error = EINVAL;
+		goto ret;
+	}
+	buflen = uio->uio_offset + uio->uio_resid;
+	if (buflen > MAXPHYS)
+		buflen = MAXPHYS;
+
+	sb = sbuf_new(sb, NULL, buflen + 1, 0);
+	if (sb == NULL) {
+		error = EIO;
+		goto ret;
+	}
+
+	error = pn_fill(curthread, proc, pn, sb, uio);
+
+	if (error) {
+		sbuf_delete(sb);
+		goto ret;
+	}
+
+	/*
+	 * XXX: If the buffer overflowed, sbuf_len() will not return
+	 * the data length. Then just use the full length because an
+	 * overflowed sbuf must be full. 
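+	 * (sbuf_new() above, called with a NULL buffer and flags 0, creates
+	 * a fixed-size sbuf of buflen + 1 bytes, so a fill routine that
+	 * produces more data marks the sbuf as overflowed instead of
+	 * growing it; sbuf_finish() then returns non-zero and buflen keeps
+	 * the clamped value computed earlier.)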
+ */ + if (sbuf_finish(sb) == 0) + buflen = sbuf_len(sb); + error = uiomove_frombuf(sbuf_data(sb), buflen, uio); + sbuf_delete(sb); +ret: + vn_lock(vn, locked | LK_RETRY); + vdrop(vn); + if (proc != NULL) + PRELE(proc); + PFS_RETURN (error); +} + +/* + * Iterate through directory entries + */ +static int +pfs_iterate(struct thread *td, struct proc *proc, struct pfs_node *pd, + struct pfs_node **pn, struct proc **p) +{ + int visible; + + sx_assert(&allproc_lock, SX_SLOCKED); + pfs_assert_owned(pd); + again: + if (*pn == NULL) { + /* first node */ + *pn = pd->pn_nodes; + } else if ((*pn)->pn_type != pfstype_procdir) { + /* next node */ + *pn = (*pn)->pn_next; + } + if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) { + /* next process */ + if (*p == NULL) + *p = LIST_FIRST(&allproc); + else + *p = LIST_NEXT(*p, p_list); + /* out of processes: next node */ + if (*p == NULL) + *pn = (*pn)->pn_next; + else + PROC_LOCK(*p); + } + + if ((*pn) == NULL) + return (-1); + + if (*p != NULL) { + visible = pfs_visible_proc(td, *pn, *p); + PROC_UNLOCK(*p); + } else if (proc != NULL) { + visible = pfs_visible_proc(td, *pn, proc); + } else { + visible = 1; + } + if (!visible) + goto again; + + return (0); +} + +/* Directory entry list */ +struct pfsentry { + STAILQ_ENTRY(pfsentry) link; + struct dirent entry; +}; +STAILQ_HEAD(pfsdirentlist, pfsentry); + +/* + * Return directory entries. + */ +static int +pfs_readdir(struct vop_readdir_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pd = pvd->pvd_pn; + pid_t pid = pvd->pvd_pid; + struct proc *p, *proc; + struct pfs_node *pn; + struct uio *uio; + struct pfsentry *pfsent, *pfsent2; + struct pfsdirentlist lst; + off_t offset; + int error, i, resid; + + STAILQ_INIT(&lst); + error = 0; + KASSERT(pd->pn_info == vn->v_mount->mnt_data, + ("%s(): pn_info does not match mountpoint", __func__)); + PFS_TRACE(("%s pid %lu", pd->pn_name, (unsigned long)pid)); + pfs_assert_not_owned(pd); + + if (vn->v_type != VDIR) + PFS_RETURN (ENOTDIR); + KASSERT_PN_IS_DIR(pd); + uio = va->a_uio; + + /* only allow reading entire entries */ + offset = uio->uio_offset; + resid = uio->uio_resid; + if (offset < 0 || offset % PFS_DELEN != 0 || + (resid && resid < PFS_DELEN)) + PFS_RETURN (EINVAL); + if (resid == 0) + PFS_RETURN (0); + + sx_slock(&allproc_lock); + pfs_lock(pd); + + /* check if the directory is visible to the caller */ + if (!pfs_visible(curthread, pd, pid, true, &proc)) { + sx_sunlock(&allproc_lock); + pfs_unlock(pd); + PFS_RETURN (ENOENT); + } + KASSERT(pid == NO_PID || proc != NULL, + ("%s(): no process for pid %lu", __func__, (unsigned long)pid)); + + /* skip unwanted entries */ + for (pn = NULL, p = NULL; offset > 0; offset -= PFS_DELEN) { + if (pfs_iterate(curthread, proc, pd, &pn, &p) == -1) { + /* nothing left... */ + if (proc != NULL) + PROC_UNLOCK(proc); + pfs_unlock(pd); + sx_sunlock(&allproc_lock); + PFS_RETURN (0); + } + } + + /* fill in entries */ + while (pfs_iterate(curthread, proc, pd, &pn, &p) != -1 && + resid >= PFS_DELEN) { + if ((pfsent = malloc(sizeof(struct pfsentry), M_IOV, + M_NOWAIT | M_ZERO)) == NULL) { + error = ENOMEM; + break; + } + pfsent->entry.d_reclen = PFS_DELEN; + pfsent->entry.d_fileno = pn_fileno(pn, pid); + /* PFS_DELEN was picked to fit PFS_NAMLEN */ + for (i = 0; i < PFS_NAMELEN - 1 && pn->pn_name[i] != '\0'; ++i) + pfsent->entry.d_name[i] = pn->pn_name[i]; + pfsent->entry.d_namlen = i; + /* NOTE: d_off is the offset of the *next* entry. 
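+		 * Every pseudofs entry has the same fixed size, so the next
+		 * entry always starts PFS_DELEN bytes further on; this is
+		 * also why the offset check at the top of this function only
+		 * accepts multiples of PFS_DELEN.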
*/ + pfsent->entry.d_off = offset + PFS_DELEN; + switch (pn->pn_type) { + case pfstype_procdir: + KASSERT(p != NULL, + ("reached procdir node with p == NULL")); + pfsent->entry.d_namlen = snprintf(pfsent->entry.d_name, + PFS_NAMELEN, "%d", p->p_pid); + /* fall through */ + case pfstype_root: + case pfstype_dir: + case pfstype_this: + case pfstype_parent: + pfsent->entry.d_type = DT_DIR; + break; + case pfstype_file: + pfsent->entry.d_type = DT_REG; + break; + case pfstype_symlink: + pfsent->entry.d_type = DT_LNK; + break; + default: + panic("%s has unexpected node type: %d", pn->pn_name, pn->pn_type); + } + PFS_TRACE(("%s", pfsent->entry.d_name)); + dirent_terminate(&pfsent->entry); + STAILQ_INSERT_TAIL(&lst, pfsent, link); + offset += PFS_DELEN; + resid -= PFS_DELEN; + } + if (proc != NULL) + PROC_UNLOCK(proc); + pfs_unlock(pd); + sx_sunlock(&allproc_lock); + i = 0; + STAILQ_FOREACH_SAFE(pfsent, &lst, link, pfsent2) { + if (error == 0) + error = uiomove(&pfsent->entry, PFS_DELEN, uio); + free(pfsent, M_IOV); + i++; + } + PFS_TRACE(("%ju bytes", (uintmax_t)(i * PFS_DELEN))); + PFS_RETURN (error); +} + +/* + * Read a symbolic link + */ +static int +pfs_readlink(struct vop_readlink_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + struct uio *uio = va->a_uio; + struct proc *proc = NULL; + char buf[PATH_MAX]; + struct sbuf sb; + int error, locked; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + if (vn->v_type != VLNK) + PFS_RETURN (EINVAL); + KASSERT_PN_IS_LINK(pn); + + if (pn->pn_fill == NULL) + PFS_RETURN (EIO); + + if (pvd->pvd_pid != NO_PID) { + if ((proc = pfind(pvd->pvd_pid)) == NULL) + PFS_RETURN (EIO); + if (proc->p_flag & P_WEXIT) { + PROC_UNLOCK(proc); + PFS_RETURN (EIO); + } + _PHOLD(proc); + PROC_UNLOCK(proc); + } + vhold(vn); + locked = VOP_ISLOCKED(vn); + VOP_UNLOCK(vn, 0); + + /* sbuf_new() can't fail with a static buffer */ + sbuf_new(&sb, buf, sizeof buf, 0); + + error = pn_fill(curthread, proc, pn, &sb, NULL); + + if (proc != NULL) + PRELE(proc); + vn_lock(vn, locked | LK_RETRY); + vdrop(vn); + + if (error) { + sbuf_delete(&sb); + PFS_RETURN (error); + } + + if (sbuf_finish(&sb) != 0) { + sbuf_delete(&sb); + PFS_RETURN (ENAMETOOLONG); + } + + error = uiomove_frombuf(sbuf_data(&sb), sbuf_len(&sb), uio); + sbuf_delete(&sb); + PFS_RETURN (error); +} + +/* + * Reclaim a vnode + */ +static int +pfs_reclaim(struct vop_reclaim_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + return (pfs_vncache_free(va->a_vp)); +} + +/* + * Set attributes + */ +static int +pfs_setattr(struct vop_setattr_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + /* Silently ignore unchangeable attributes. 
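+	 * Pseudofs attributes are synthesized in pfs_getattr(), so there is
+	 * nothing persistent to update here; returning success avoids
+	 * spurious failures from tools that call chmod(2) or utimes(2) on
+	 * these files.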
*/ + PFS_RETURN (0); +} + +/* + * Write to a file + */ +static int +pfs_write(struct vop_write_args *va) +{ + struct vnode *vn = va->a_vp; + struct pfs_vdata *pvd = vn->v_data; + struct pfs_node *pn = pvd->pvd_pn; + struct uio *uio = va->a_uio; + struct proc *proc; + struct sbuf sb; + int error; + + PFS_TRACE(("%s", pn->pn_name)); + pfs_assert_not_owned(pn); + + if (vn->v_type != VREG) + PFS_RETURN (EINVAL); + KASSERT_PN_IS_FILE(pn); + + if (!(pn->pn_flags & PFS_WR)) + PFS_RETURN (EBADF); + + if (pn->pn_fill == NULL) + PFS_RETURN (EIO); + + /* + * This is necessary because either process' privileges may + * have changed since the open() call. + */ + if (!pfs_visible(curthread, pn, pvd->pvd_pid, false, &proc)) + PFS_RETURN (EIO); + if (proc != NULL) { + _PHOLD(proc); + PROC_UNLOCK(proc); + } + + if (pn->pn_flags & PFS_RAWWR) { + error = pn_fill(curthread, proc, pn, NULL, uio); + if (proc != NULL) + PRELE(proc); + PFS_RETURN (error); + } + + sbuf_uionew(&sb, uio, &error); + if (error) { + if (proc != NULL) + PRELE(proc); + PFS_RETURN (error); + } + + error = pn_fill(curthread, proc, pn, &sb, uio); + + sbuf_delete(&sb); + if (proc != NULL) + PRELE(proc); + PFS_RETURN (error); +} + +/* + * Vnode operations + */ +struct vop_vector pfs_vnodeops = { + .vop_default = &default_vnodeops, + + .vop_access = pfs_access, + .vop_cachedlookup = pfs_lookup, + .vop_close = pfs_close, + .vop_create = VOP_EOPNOTSUPP, + .vop_getattr = pfs_getattr, + .vop_getextattr = pfs_getextattr, + .vop_ioctl = pfs_ioctl, + .vop_link = VOP_EOPNOTSUPP, + .vop_lookup = vfs_cache_lookup, + .vop_mkdir = VOP_EOPNOTSUPP, + .vop_mknod = VOP_EOPNOTSUPP, + .vop_open = pfs_open, + .vop_read = pfs_read, + .vop_readdir = pfs_readdir, + .vop_readlink = pfs_readlink, + .vop_reclaim = pfs_reclaim, + .vop_remove = VOP_EOPNOTSUPP, + .vop_rename = VOP_EOPNOTSUPP, + .vop_rmdir = VOP_EOPNOTSUPP, + .vop_setattr = pfs_setattr, + .vop_symlink = VOP_EOPNOTSUPP, + .vop_vptocnp = pfs_vptocnp, + .vop_write = pfs_write, + /* XXX I've probably forgotten a few that need VOP_EOPNOTSUPP */ +}; diff --git a/freebsd/sys/kern/kern_descrip.c b/freebsd/sys/kern/kern_descrip.c new file mode 100644 index 00000000..423968b2 --- /dev/null +++ b/freebsd/sys/kern/kern_descrip.c @@ -0,0 +1,4283 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_capsicum.h" +#include "opt_ddb.h" +#include "opt_ktrace.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#endif + +#include + +#include + +#include +#include + +#include + +static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); +static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", + "file desc to leader structures"); +static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); +MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); + +MALLOC_DECLARE(M_FADVISE); + +static __read_mostly uma_zone_t file_zone; +static __read_mostly uma_zone_t filedesc0_zone; + +static int closefp(struct filedesc *fdp, int fd, struct file *fp, + struct thread *td, int holdleaders); +static int fd_first_free(struct filedesc *fdp, int low, int size); +static int fd_last_used(struct filedesc *fdp, int size); +static void fdgrowtable(struct filedesc *fdp, int nfd); +static void fdgrowtable_exp(struct filedesc *fdp, int nfd); +static void fdunused(struct filedesc *fdp, int fd); +static void fdused(struct filedesc *fdp, int fd); +static int getmaxfd(struct thread *td); +static u_long *filecaps_copy_prep(const struct filecaps *src); +static void filecaps_copy_finish(const struct filecaps *src, + struct filecaps *dst, u_long *ioctls); +static u_long *filecaps_free_prep(struct filecaps *fcaps); +static void filecaps_free_finish(u_long *ioctls); + +/* + * Each process has: + * + * - An array of open file descriptors (fd_ofiles) + * - An array of file flags (fd_ofileflags) + * - A bitmap recording which descriptors are in use (fd_map) + * + * A process starts out with NDFILE descriptors. The value of NDFILE has + * been selected based the historical limit of 20 open files, and an + * assumption that the majority of processes, especially short-lived + * processes like shells, will never need more. + * + * If this initial allocation is exhausted, a larger descriptor table and + * map are allocated dynamically, and the pointers in the process's struct + * filedesc are updated to point to those. This is repeated every time + * the process runs out of file descriptors (provided it hasn't hit its + * resource limit). + * + * Since threads may hold references to individual descriptor table + * entries, the tables are never freed. 
Instead, they are placed on a + * linked list and freed only when the struct filedesc is released. + */ +#define NDFILE 20 +#define NDSLOTSIZE sizeof(NDSLOTTYPE) +#define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) +#define NDSLOT(x) ((x) / NDENTRIES) +#define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) +#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) + +/* + * SLIST entry used to keep track of ofiles which must be reclaimed when + * the process exits. + */ +struct freetable { + struct fdescenttbl *ft_table; + SLIST_ENTRY(freetable) ft_next; +}; + +/* + * Initial allocation: a filedesc structure + the head of SLIST used to + * keep track of old ofiles + enough space for NDFILE descriptors. + */ + +struct fdescenttbl0 { + int fdt_nfiles; + struct filedescent fdt_ofiles[NDFILE]; +}; + +struct filedesc0 { + struct filedesc fd_fd; + SLIST_HEAD(, freetable) fd_free; + struct fdescenttbl0 fd_dfiles; + NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; +}; + +/* + * Descriptor management. + */ +volatile int __exclusive_cache_line openfiles; /* actual number of open files */ +struct mtx sigio_lock; /* mtx to protect pointers to sigio */ +void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp); + +/* + * If low >= size, just return low. Otherwise find the first zero bit in the + * given bitmap, starting at low and not exceeding size - 1. Return size if + * not found. + */ +static int +fd_first_free(struct filedesc *fdp, int low, int size) +{ + NDSLOTTYPE *map = fdp->fd_map; + NDSLOTTYPE mask; + int off, maxoff; + + if (low >= size) + return (low); + + off = NDSLOT(low); + if (low % NDENTRIES) { + mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); + if ((mask &= ~map[off]) != 0UL) + return (off * NDENTRIES + ffsl(mask) - 1); + ++off; + } + for (maxoff = NDSLOTS(size); off < maxoff; ++off) + if (map[off] != ~0UL) + return (off * NDENTRIES + ffsl(~map[off]) - 1); + return (size); +} + +/* + * Find the highest non-zero bit in the given bitmap, starting at 0 and + * not exceeding size - 1. Return -1 if not found. + */ +static int +fd_last_used(struct filedesc *fdp, int size) +{ + NDSLOTTYPE *map = fdp->fd_map; + NDSLOTTYPE mask; + int off, minoff; + + off = NDSLOT(size); + if (size % NDENTRIES) { + mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); + if ((mask &= map[off]) != 0) + return (off * NDENTRIES + flsl(mask) - 1); + --off; + } + for (minoff = NDSLOT(0); off >= minoff; --off) + if (map[off] != 0) + return (off * NDENTRIES + flsl(map[off]) - 1); + return (-1); +} + +static int +fdisused(struct filedesc *fdp, int fd) +{ + + KASSERT(fd >= 0 && fd < fdp->fd_nfiles, + ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); + + return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); +} + +/* + * Mark a file descriptor as used. + */ +static void +fdused_init(struct filedesc *fdp, int fd) +{ + + KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); + + fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); +} + +static void +fdused(struct filedesc *fdp, int fd) +{ + + FILEDESC_XLOCK_ASSERT(fdp); + + fdused_init(fdp, fd); + if (fd > fdp->fd_lastfile) + fdp->fd_lastfile = fd; + if (fd == fdp->fd_freefile) + fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); +} + +/* + * Mark a file descriptor as unused. 
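+ *
+ * As an illustration of the bitmap arithmetic (assuming a 64-bit
+ * NDSLOTTYPE, so NDENTRIES is 64): descriptor 70 lives in map word
+ * NDSLOT(70) == 1 as bit NDBIT(70) == 1 << 6, which fdused() sets and
+ * this function clears again.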
+ */ +static void +fdunused(struct filedesc *fdp, int fd) +{ + + FILEDESC_XLOCK_ASSERT(fdp); + + KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); + KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, + ("fd=%d is still in use", fd)); + + fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); + if (fd < fdp->fd_freefile) + fdp->fd_freefile = fd; + if (fd == fdp->fd_lastfile) + fdp->fd_lastfile = fd_last_used(fdp, fd); +} + +/* + * Free a file descriptor. + * + * Avoid some work if fdp is about to be destroyed. + */ +static inline void +fdefree_last(struct filedescent *fde) +{ + + filecaps_free(&fde->fde_caps); +} + +static inline void +fdfree(struct filedesc *fdp, int fd) +{ + struct filedescent *fde; + + fde = &fdp->fd_ofiles[fd]; +#ifdef CAPABILITIES + seq_write_begin(&fde->fde_seq); +#endif + fde->fde_file = NULL; +#ifdef CAPABILITIES + seq_write_end(&fde->fde_seq); +#endif + fdefree_last(fde); + fdunused(fdp, fd); +} + +void +pwd_ensure_dirs(void) +{ + struct filedesc *fdp; + + fdp = curproc->p_fd; + FILEDESC_XLOCK(fdp); + if (fdp->fd_cdir == NULL) { + fdp->fd_cdir = rootvnode; + vrefact(rootvnode); + } + if (fdp->fd_rdir == NULL) { + fdp->fd_rdir = rootvnode; + vrefact(rootvnode); + } + FILEDESC_XUNLOCK(fdp); +} + +/* + * System calls on descriptors. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getdtablesize_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) +{ +#ifdef RACCT + uint64_t lim; +#endif + + td->td_retval[0] = + min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc); +#ifdef RACCT + PROC_LOCK(td->td_proc); + lim = racct_get_limit(td->td_proc, RACCT_NOFILE); + PROC_UNLOCK(td->td_proc); + if (lim < td->td_retval[0]) + td->td_retval[0] = lim; +#endif + return (0); +} + +/* + * Duplicate a file descriptor to a particular value. + * + * Note: keep in mind that a potential race condition exists when closing + * descriptors from a shared descriptor table (via rfork). + */ +#ifndef _SYS_SYSPROTO_H_ +struct dup2_args { + u_int from; + u_int to; +}; +#endif +/* ARGSUSED */ +int +sys_dup2(struct thread *td, struct dup2_args *uap) +{ + + return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to)); +} + +/* + * Duplicate a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct dup_args { + u_int fd; +}; +#endif +/* ARGSUSED */ +int +sys_dup(struct thread *td, struct dup_args *uap) +{ + + return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0)); +} + +/* + * The file control system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fcntl_args { + int fd; + int cmd; + long arg; +}; +#endif +/* ARGSUSED */ +int +sys_fcntl(struct thread *td, struct fcntl_args *uap) +{ + + return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); +} + +int +kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) +{ + struct flock fl; + struct __oflock ofl; + intptr_t arg1; + int error, newcmd; + + error = 0; + newcmd = cmd; + switch (cmd) { + case F_OGETLK: + case F_OSETLK: + case F_OSETLKW: + /* + * Convert old flock structure to new. 
+ */ + error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); + fl.l_start = ofl.l_start; + fl.l_len = ofl.l_len; + fl.l_pid = ofl.l_pid; + fl.l_type = ofl.l_type; + fl.l_whence = ofl.l_whence; + fl.l_sysid = 0; + + switch (cmd) { + case F_OGETLK: + newcmd = F_GETLK; + break; + case F_OSETLK: + newcmd = F_SETLK; + break; + case F_OSETLKW: + newcmd = F_SETLKW; + break; + } + arg1 = (intptr_t)&fl; + break; + case F_GETLK: + case F_SETLK: + case F_SETLKW: + case F_SETLK_REMOTE: + error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); + arg1 = (intptr_t)&fl; + break; + default: + arg1 = arg; + break; + } + if (error) + return (error); + error = kern_fcntl(td, fd, newcmd, arg1); + if (error) + return (error); + if (cmd == F_OGETLK) { + ofl.l_start = fl.l_start; + ofl.l_len = fl.l_len; + ofl.l_pid = fl.l_pid; + ofl.l_type = fl.l_type; + ofl.l_whence = fl.l_whence; + error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); + } else if (cmd == F_GETLK) { + error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); + } + return (error); +} + +int +kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) +{ + struct filedesc *fdp; + struct flock *flp; + struct file *fp, *fp2; + struct filedescent *fde; + struct proc *p; + struct vnode *vp; + int error, flg, tmp; + uint64_t bsize; + off_t foffset; + + error = 0; + flg = F_POSIX; + p = td->td_proc; + fdp = p->p_fd; + + AUDIT_ARG_FD(cmd); + AUDIT_ARG_CMD(cmd); + switch (cmd) { + case F_DUPFD: + tmp = arg; + error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp); + break; + + case F_DUPFD_CLOEXEC: + tmp = arg; + error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); + break; + + case F_DUP2FD: + tmp = arg; + error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); + break; + + case F_DUP2FD_CLOEXEC: + tmp = arg; + error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp); + break; + + case F_GETFD: + error = EBADF; + FILEDESC_SLOCK(fdp); + fde = fdeget_locked(fdp, fd); + if (fde != NULL) { + td->td_retval[0] = + (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; + error = 0; + } + FILEDESC_SUNLOCK(fdp); + break; + + case F_SETFD: + error = EBADF; + FILEDESC_XLOCK(fdp); + fde = fdeget_locked(fdp, fd); + if (fde != NULL) { + fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | + (arg & FD_CLOEXEC ? 
UF_EXCLOSE : 0); + error = 0; + } + FILEDESC_XUNLOCK(fdp); + break; + + case F_GETFL: + error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp); + if (error != 0) + break; + td->td_retval[0] = OFLAGS(fp->f_flag); + fdrop(fp, td); + break; + + case F_SETFL: + error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp); + if (error != 0) + break; + do { + tmp = flg = fp->f_flag; + tmp &= ~FCNTLFLAGS; + tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; + } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); + tmp = fp->f_flag & FNONBLOCK; + error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); + if (error != 0) { + fdrop(fp, td); + break; + } + tmp = fp->f_flag & FASYNC; + error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); + if (error == 0) { + fdrop(fp, td); + break; + } + atomic_clear_int(&fp->f_flag, FNONBLOCK); + tmp = 0; + (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); + fdrop(fp, td); + break; + + case F_GETOWN: + error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp); + if (error != 0) + break; + error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); + if (error == 0) + td->td_retval[0] = tmp; + fdrop(fp, td); + break; + + case F_SETOWN: + error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp); + if (error != 0) + break; + tmp = arg; + error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); + fdrop(fp, td); + break; + + case F_SETLK_REMOTE: + error = priv_check(td, PRIV_NFS_LOCKD); + if (error != 0) + return (error); + flg = F_REMOTE; + goto do_setlk; + + case F_SETLKW: + flg |= F_WAIT; + /* FALLTHROUGH F_SETLK */ + + case F_SETLK: + do_setlk: + flp = (struct flock *)arg; + if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) { + error = EINVAL; + break; + } + + error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp, NULL); + if (error != 0) + break; + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + fdrop(fp, td); + break; + } + + if (flp->l_whence == SEEK_CUR) { + foffset = foffset_get(fp); + if (foffset < 0 || + (flp->l_start > 0 && + foffset > OFF_MAX - flp->l_start)) { + error = EOVERFLOW; + fdrop(fp, td); + break; + } + flp->l_start += foffset; + } + + vp = fp->f_vnode; + switch (flp->l_type) { + case F_RDLCK: + if ((fp->f_flag & FREAD) == 0) { + error = EBADF; + break; + } + if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { + PROC_LOCK(p->p_leader); + p->p_leader->p_flag |= P_ADVLOCK; + PROC_UNLOCK(p->p_leader); + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, + flp, flg); + break; + case F_WRLCK: + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + break; + } + if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { + PROC_LOCK(p->p_leader); + p->p_leader->p_flag |= P_ADVLOCK; + PROC_UNLOCK(p->p_leader); + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, + flp, flg); + break; + case F_UNLCK: + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, + flp, flg); + break; + case F_UNLCKSYS: + if (flg != F_REMOTE) { + error = EINVAL; + break; + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_UNLCKSYS, flp, flg); + break; + default: + error = EINVAL; + break; + } + if (error != 0 || flp->l_type == F_UNLCK || + flp->l_type == F_UNLCKSYS) { + fdrop(fp, td); + break; + } + + /* + * Check for a race with close. + * + * The vnode is now advisory locked (or unlocked, but this case + * is not really important) as the caller requested. 
+ * We had to drop the filedesc lock, so we need to recheck if + * the descriptor is still valid, because if it was closed + * in the meantime we need to remove advisory lock from the + * vnode - close on any descriptor leading to an advisory + * locked vnode, removes that lock. + * We will return 0 on purpose in that case, as the result of + * successful advisory lock might have been externally visible + * already. This is fine - effectively we pretend to the caller + * that the closing thread was a bit slower and that the + * advisory lock succeeded before the close. + */ + error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2, NULL); + if (error != 0) { + fdrop(fp, td); + break; + } + if (fp != fp2) { + flp->l_whence = SEEK_SET; + flp->l_start = 0; + flp->l_len = 0; + flp->l_type = F_UNLCK; + (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_UNLCK, flp, F_POSIX); + } + fdrop(fp, td); + fdrop(fp2, td); + break; + + case F_GETLK: + error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp, NULL); + if (error != 0) + break; + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + fdrop(fp, td); + break; + } + flp = (struct flock *)arg; + if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && + flp->l_type != F_UNLCK) { + error = EINVAL; + fdrop(fp, td); + break; + } + if (flp->l_whence == SEEK_CUR) { + foffset = foffset_get(fp); + if ((flp->l_start > 0 && + foffset > OFF_MAX - flp->l_start) || + (flp->l_start < 0 && + foffset < OFF_MIN - flp->l_start)) { + error = EOVERFLOW; + fdrop(fp, td); + break; + } + flp->l_start += foffset; + } + vp = fp->f_vnode; + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, + F_POSIX); + fdrop(fp, td); + break; + + case F_RDAHEAD: + arg = arg ? 128 * 1024: 0; + /* FALLTHROUGH */ + case F_READAHEAD: + error = fget_unlocked(fdp, fd, &cap_no_rights, &fp, NULL); + if (error != 0) + break; + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); + error = EBADF; + break; + } + vp = fp->f_vnode; + if (vp->v_type != VREG) { + fdrop(fp, td); + error = ENOTTY; + break; + } + + /* + * Exclusive lock synchronizes against f_seqcount reads and + * writes in sequential_heuristic(). + */ + error = vn_lock(vp, LK_EXCLUSIVE); + if (error != 0) { + fdrop(fp, td); + break; + } + if (arg >= 0) { + bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; + arg = MIN(arg, INT_MAX - bsize + 1); + fp->f_seqcount = MIN(IO_SEQMAX, + (arg + bsize - 1) / bsize); + atomic_set_int(&fp->f_flag, FRDAHEAD); + } else { + atomic_clear_int(&fp->f_flag, FRDAHEAD); + } + VOP_UNLOCK(vp, 0); + fdrop(fp, td); + break; + + default: + error = EINVAL; + break; + } + return (error); +} + +static int +getmaxfd(struct thread *td) +{ + + return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc)); +} + +/* + * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). + */ +int +kern_dup(struct thread *td, u_int mode, int flags, int old, int new) +{ + struct filedesc *fdp; + struct filedescent *oldfde, *newfde; + struct proc *p; + struct file *delfp; + u_long *oioctls, *nioctls; + int error, maxfd; + + p = td->td_proc; + fdp = p->p_fd; + + MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); + MPASS(mode < FDDUP_LASTMODE); + + AUDIT_ARG_FD(old); + /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */ + + /* + * Verify we have a valid descriptor to dup from and possibly to + * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should + * return EINVAL when the new descriptor is out of bounds. + */ + if (old < 0) + return (EBADF); + if (new < 0) + return (mode == FDDUP_FCNTL ? 
EINVAL : EBADF); + maxfd = getmaxfd(td); + if (new >= maxfd) + return (mode == FDDUP_FCNTL ? EINVAL : EBADF); + + error = EBADF; + FILEDESC_XLOCK(fdp); + if (fget_locked(fdp, old) == NULL) + goto unlock; + if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) { + td->td_retval[0] = new; + if (flags & FDDUP_FLAG_CLOEXEC) + fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; + error = 0; + goto unlock; + } + + oldfde = &fdp->fd_ofiles[old]; + if (!fhold(oldfde->fde_file)) + goto unlock; + + /* + * If the caller specified a file descriptor, make sure the file + * table is large enough to hold it, and grab it. Otherwise, just + * allocate a new descriptor the usual way. + */ + switch (mode) { + case FDDUP_NORMAL: + case FDDUP_FCNTL: + if ((error = fdalloc(td, new, &new)) != 0) { + fdrop(oldfde->fde_file, td); + goto unlock; + } + break; + case FDDUP_MUSTREPLACE: + /* Target file descriptor must exist. */ + if (fget_locked(fdp, new) == NULL) { + fdrop(oldfde->fde_file, td); + goto unlock; + } + break; + case FDDUP_FIXED: + if (new >= fdp->fd_nfiles) { + /* + * The resource limits are here instead of e.g. + * fdalloc(), because the file descriptor table may be + * shared between processes, so we can't really use + * racct_add()/racct_sub(). Instead of counting the + * number of actually allocated descriptors, just put + * the limit on the size of the file descriptor table. + */ +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(p); + error = racct_set(p, RACCT_NOFILE, new + 1); + PROC_UNLOCK(p); + if (error != 0) { + error = EMFILE; + fdrop(oldfde->fde_file, td); + goto unlock; + } + } +#endif + fdgrowtable_exp(fdp, new + 1); + } + if (!fdisused(fdp, new)) + fdused(fdp, new); + break; + default: + KASSERT(0, ("%s unsupported mode %d", __func__, mode)); + } + + KASSERT(old != new, ("new fd is same as old")); + + newfde = &fdp->fd_ofiles[new]; + delfp = newfde->fde_file; + + oioctls = filecaps_free_prep(&newfde->fde_caps); + nioctls = filecaps_copy_prep(&oldfde->fde_caps); + + /* + * Duplicate the source descriptor. + */ +#ifdef CAPABILITIES + seq_write_begin(&newfde->fde_seq); +#endif + memcpy(newfde, oldfde, fde_change_size); + filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, + nioctls); + if ((flags & FDDUP_FLAG_CLOEXEC) != 0) + newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; + else + newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; +#ifdef CAPABILITIES + seq_write_end(&newfde->fde_seq); +#endif + filecaps_free_finish(oioctls); + td->td_retval[0] = new; + + error = 0; + + if (delfp != NULL) { + (void) closefp(fdp, new, delfp, td, 1); + FILEDESC_UNLOCK_ASSERT(fdp); + } else { +unlock: + FILEDESC_XUNLOCK(fdp); + } + + return (error); +} + +/* + * If sigio is on the list associated with a process or process group, + * disable signalling from the device, remove sigio from the list and + * free sigio. + */ +void +funsetown(struct sigio **sigiop) +{ + struct sigio *sigio; + + if (*sigiop == NULL) + return; + SIGIO_LOCK(); + sigio = *sigiop; + if (sigio == NULL) { + SIGIO_UNLOCK(); + return; + } + *(sigio->sio_myref) = NULL; + if ((sigio)->sio_pgid < 0) { + struct pgrp *pg = (sigio)->sio_pgrp; + PGRP_LOCK(pg); + SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, + sigio, sio_pgsigio); + PGRP_UNLOCK(pg); + } else { + struct proc *p = (sigio)->sio_proc; + PROC_LOCK(p); + SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, + sigio, sio_pgsigio); + PROC_UNLOCK(p); + } + SIGIO_UNLOCK(); + crfree(sigio->sio_ucred); + free(sigio, M_SIGIO); +} + +/* + * Free a list of sigio structures. 
+ * We only need to lock the SIGIO_LOCK because we have made ourselves + * inaccessible to callers of fsetown and therefore do not need to lock + * the proc or pgrp struct for the list manipulation. + */ +void +funsetownlst(struct sigiolst *sigiolst) +{ + struct proc *p; + struct pgrp *pg; + struct sigio *sigio; + + sigio = SLIST_FIRST(sigiolst); + if (sigio == NULL) + return; + p = NULL; + pg = NULL; + + /* + * Every entry of the list should belong + * to a single proc or pgrp. + */ + if (sigio->sio_pgid < 0) { + pg = sigio->sio_pgrp; + PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); + } else /* if (sigio->sio_pgid > 0) */ { + p = sigio->sio_proc; + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + } + + SIGIO_LOCK(); + while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { + *(sigio->sio_myref) = NULL; + if (pg != NULL) { + KASSERT(sigio->sio_pgid < 0, + ("Proc sigio in pgrp sigio list")); + KASSERT(sigio->sio_pgrp == pg, + ("Bogus pgrp in sigio list")); + PGRP_LOCK(pg); + SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, + sio_pgsigio); + PGRP_UNLOCK(pg); + } else /* if (p != NULL) */ { + KASSERT(sigio->sio_pgid > 0, + ("Pgrp sigio in proc sigio list")); + KASSERT(sigio->sio_proc == p, + ("Bogus proc in sigio list")); + PROC_LOCK(p); + SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, + sio_pgsigio); + PROC_UNLOCK(p); + } + SIGIO_UNLOCK(); + crfree(sigio->sio_ucred); + free(sigio, M_SIGIO); + SIGIO_LOCK(); + } + SIGIO_UNLOCK(); +} + +/* + * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). + * + * After permission checking, add a sigio structure to the sigio list for + * the process or process group. + */ +int +fsetown(pid_t pgid, struct sigio **sigiop) +{ + struct proc *proc; + struct pgrp *pgrp; + struct sigio *sigio; + int ret; + + if (pgid == 0) { + funsetown(sigiop); + return (0); + } + + ret = 0; + + /* Allocate and fill in the new sigio out of locks. */ + sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); + sigio->sio_pgid = pgid; + sigio->sio_ucred = crhold(curthread->td_ucred); + sigio->sio_myref = sigiop; + + sx_slock(&proctree_lock); + if (pgid > 0) { + proc = pfind(pgid); + if (proc == NULL) { + ret = ESRCH; + goto fail; + } + + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + PROC_UNLOCK(proc); + if (proc->p_session != curthread->td_proc->p_session) { + ret = EPERM; + goto fail; + } + + pgrp = NULL; + } else /* if (pgid < 0) */ { + pgrp = pgfind(-pgid); + if (pgrp == NULL) { + ret = ESRCH; + goto fail; + } + PGRP_UNLOCK(pgrp); + + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + if (pgrp->pg_session != curthread->td_proc->p_session) { + ret = EPERM; + goto fail; + } + + proc = NULL; + } + funsetown(sigiop); + if (pgid > 0) { + PROC_LOCK(proc); + /* + * Since funsetownlst() is called without the proctree + * locked, we need to check for P_WEXIT. + * XXX: is ESRCH correct? 
+ */ + if ((proc->p_flag & P_WEXIT) != 0) { + PROC_UNLOCK(proc); + ret = ESRCH; + goto fail; + } + SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); + sigio->sio_proc = proc; + PROC_UNLOCK(proc); + } else { + PGRP_LOCK(pgrp); + SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); + sigio->sio_pgrp = pgrp; + PGRP_UNLOCK(pgrp); + } + sx_sunlock(&proctree_lock); + SIGIO_LOCK(); + *sigiop = sigio; + SIGIO_UNLOCK(); + return (0); + +fail: + sx_sunlock(&proctree_lock); + crfree(sigio->sio_ucred); + free(sigio, M_SIGIO); + return (ret); +} + +/* + * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). + */ +pid_t +fgetown(struct sigio **sigiop) +{ + pid_t pgid; + + SIGIO_LOCK(); + pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; + SIGIO_UNLOCK(); + return (pgid); +} + +/* + * Function drops the filedesc lock on return. + */ +static int +closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, + int holdleaders) +{ + int error; + + FILEDESC_XLOCK_ASSERT(fdp); + + if (holdleaders) { + if (td->td_proc->p_fdtol != NULL) { + /* + * Ask fdfree() to sleep to ensure that all relevant + * process leaders can be traversed in closef(). + */ + fdp->fd_holdleaderscount++; + } else { + holdleaders = 0; + } + } + + /* + * We now hold the fp reference that used to be owned by the + * descriptor array. We have to unlock the FILEDESC *AFTER* + * knote_fdclose to prevent a race of the fd getting opened, a knote + * added, and deleteing a knote for the new fd. + */ + knote_fdclose(td, fd); + + /* + * We need to notify mqueue if the object is of type mqueue. + */ + if (fp->f_type == DTYPE_MQUEUE) + mq_fdclose(td, fd, fp); + FILEDESC_XUNLOCK(fdp); + + error = closef(fp, td); + if (holdleaders) { + FILEDESC_XLOCK(fdp); + fdp->fd_holdleaderscount--; + if (fdp->fd_holdleaderscount == 0 && + fdp->fd_holdleaderswakeup != 0) { + fdp->fd_holdleaderswakeup = 0; + wakeup(&fdp->fd_holdleaderscount); + } + FILEDESC_XUNLOCK(fdp); + } + return (error); +} + +/* + * Close a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct close_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +sys_close(struct thread *td, struct close_args *uap) +{ + + return (kern_close(td, uap->fd)); +} + +int +kern_close(struct thread *td, int fd) +{ + struct filedesc *fdp; + struct file *fp; + + fdp = td->td_proc->p_fd; + + AUDIT_SYSCLOSE(td, fd); + + FILEDESC_XLOCK(fdp); + if ((fp = fget_locked(fdp, fd)) == NULL) { + FILEDESC_XUNLOCK(fdp); + return (EBADF); + } + fdfree(fdp, fd); + + /* closefp() drops the FILEDESC lock for us. */ + return (closefp(fdp, fd, fp, td, 1)); +} + +/* + * Close open file descriptors. + */ +#ifndef _SYS_SYSPROTO_H_ +struct closefrom_args { + int lowfd; +}; +#endif +/* ARGSUSED */ +int +sys_closefrom(struct thread *td, struct closefrom_args *uap) +{ + struct filedesc *fdp; + int fd; + + fdp = td->td_proc->p_fd; + AUDIT_ARG_FD(uap->lowfd); + + /* + * Treat negative starting file descriptor values identical to + * closefrom(0) which closes all files. + */ + if (uap->lowfd < 0) + uap->lowfd = 0; + FILEDESC_SLOCK(fdp); + for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) { + if (fdp->fd_ofiles[fd].fde_file != NULL) { + FILEDESC_SUNLOCK(fdp); + (void)kern_close(td, fd); + FILEDESC_SLOCK(fdp); + } + } + FILEDESC_SUNLOCK(fdp); + return (0); +} + +#if defined(COMPAT_43) +/* + * Return status information about a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct ofstat_args { + int fd; + struct ostat *sb; +}; +#endif +/* ARGSUSED */ +int +ofstat(struct thread *td, struct ofstat_args *uap) +{ + struct ostat oub; + struct stat ub; + int error; + + error = kern_fstat(td, uap->fd, &ub); + if (error == 0) { + cvtstat(&ub, &oub); + error = copyout(&oub, uap->sb, sizeof(oub)); + } + return (error); +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_FREEBSD11) +int +freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap) +{ + struct stat sb; + struct freebsd11_stat osb; + int error; + + error = kern_fstat(td, uap->fd, &sb); + if (error != 0) + return (error); + error = freebsd11_cvtstat(&sb, &osb); + if (error == 0) + error = copyout(&osb, uap->sb, sizeof(osb)); + return (error); +} +#endif /* COMPAT_FREEBSD11 */ + +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstat_args { + int fd; + struct stat *sb; +}; +#endif +/* ARGSUSED */ +int +sys_fstat(struct thread *td, struct fstat_args *uap) +{ + struct stat ub; + int error; + + error = kern_fstat(td, uap->fd, &ub); + if (error == 0) + error = copyout(&ub, uap->sb, sizeof(ub)); + return (error); +} + +int +kern_fstat(struct thread *td, int fd, struct stat *sbp) +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(fd); + + error = fget(td, fd, &cap_fstat_rights, &fp); + if (error != 0) + return (error); + + AUDIT_ARG_FILE(td->td_proc, fp); + + error = fo_stat(fp, sbp, td->td_ucred, td); + fdrop(fp, td); +#ifdef __STAT_TIME_T_EXT + if (error == 0) { + sbp->st_atim_ext = 0; + sbp->st_mtim_ext = 0; + sbp->st_ctim_ext = 0; + sbp->st_btim_ext = 0; + } +#endif +#ifdef KTRACE + if (error == 0 && KTRPOINT(td, KTR_STRUCT)) + ktrstat(sbp); +#endif + return (error); +} + +#if defined(COMPAT_FREEBSD11) +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd11_nfstat_args { + int fd; + struct nstat *sb; +}; +#endif +/* ARGSUSED */ +int +freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap) +{ + struct nstat nub; + struct stat ub; + int error; + + error = kern_fstat(td, uap->fd, &ub); + if (error == 0) { + freebsd11_cvtnstat(&ub, &nub); + error = copyout(&nub, uap->sb, sizeof(nub)); + } + return (error); +} +#endif /* COMPAT_FREEBSD11 */ + +/* + * Return pathconf information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fpathconf_args { + int fd; + int name; +}; +#endif +/* ARGSUSED */ +int +sys_fpathconf(struct thread *td, struct fpathconf_args *uap) +{ + long value; + int error; + + error = kern_fpathconf(td, uap->fd, uap->name, &value); + if (error == 0) + td->td_retval[0] = value; + return (error); +} + +int +kern_fpathconf(struct thread *td, int fd, int name, long *valuep) +{ + struct file *fp; + struct vnode *vp; + int error; + + error = fget(td, fd, &cap_fpathconf_rights, &fp); + if (error != 0) + return (error); + + if (name == _PC_ASYNC_IO) { + *valuep = _POSIX_ASYNCHRONOUS_IO; + goto out; + } + vp = fp->f_vnode; + if (vp != NULL) { + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_PATHCONF(vp, name, valuep); + VOP_UNLOCK(vp, 0); + } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { + if (name != _PC_PIPE_BUF) { + error = EINVAL; + } else { + *valuep = PIPE_BUF; + error = 0; + } + } else { + error = EOPNOTSUPP; + } +out: + fdrop(fp, td); + return (error); +} + +/* + * Initialize filecaps structure. 
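+ *
+ * Note the sentinel used throughout this file: fc_nioctls == -1, as set
+ * here, means the descriptor carries no ioctl command restriction; a
+ * positive count goes together with a non-NULL fc_ioctls list of allowed
+ * commands, and 0 means no ioctl command is permitted at all.
+ * filecaps_validate() asserts exactly these combinations.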
+ */ +void +filecaps_init(struct filecaps *fcaps) +{ + + bzero(fcaps, sizeof(*fcaps)); + fcaps->fc_nioctls = -1; +} + +/* + * Copy filecaps structure allocating memory for ioctls array if needed. + * + * The last parameter indicates whether the fdtable is locked. If it is not and + * ioctls are encountered, copying fails and the caller must lock the table. + * + * Note that if the table was not locked, the caller has to check the relevant + * sequence counter to determine whether the operation was successful. + */ +bool +filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked) +{ + size_t size; + + if (src->fc_ioctls != NULL && !locked) + return (false); + memcpy(dst, src, sizeof(*src)); + if (src->fc_ioctls == NULL) + return (true); + + KASSERT(src->fc_nioctls > 0, + ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); + + size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; + dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); + memcpy(dst->fc_ioctls, src->fc_ioctls, size); + return (true); +} + +static u_long * +filecaps_copy_prep(const struct filecaps *src) +{ + u_long *ioctls; + size_t size; + + if (src->fc_ioctls == NULL) + return (NULL); + + KASSERT(src->fc_nioctls > 0, + ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); + + size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; + ioctls = malloc(size, M_FILECAPS, M_WAITOK); + return (ioctls); +} + +static void +filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, + u_long *ioctls) +{ + size_t size; + + *dst = *src; + if (src->fc_ioctls == NULL) { + MPASS(ioctls == NULL); + return; + } + + size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; + dst->fc_ioctls = ioctls; + bcopy(src->fc_ioctls, dst->fc_ioctls, size); +} + +/* + * Move filecaps structure to the new place and clear the old place. + */ +void +filecaps_move(struct filecaps *src, struct filecaps *dst) +{ + + *dst = *src; + bzero(src, sizeof(*src)); +} + +/* + * Fill the given filecaps structure with full rights. + */ +static void +filecaps_fill(struct filecaps *fcaps) +{ + + CAP_ALL(&fcaps->fc_rights); + fcaps->fc_ioctls = NULL; + fcaps->fc_nioctls = -1; + fcaps->fc_fcntls = CAP_FCNTL_ALL; +} + +/* + * Free memory allocated within filecaps structure. + */ +void +filecaps_free(struct filecaps *fcaps) +{ + + free(fcaps->fc_ioctls, M_FILECAPS); + bzero(fcaps, sizeof(*fcaps)); +} + +static u_long * +filecaps_free_prep(struct filecaps *fcaps) +{ + u_long *ioctls; + + ioctls = fcaps->fc_ioctls; + bzero(fcaps, sizeof(*fcaps)); + return (ioctls); +} + +static void +filecaps_free_finish(u_long *ioctls) +{ + + free(ioctls, M_FILECAPS); +} + +/* + * Validate the given filecaps structure. + */ +static void +filecaps_validate(const struct filecaps *fcaps, const char *func) +{ + + KASSERT(cap_rights_is_valid(&fcaps->fc_rights), + ("%s: invalid rights", func)); + KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, + ("%s: invalid fcntls", func)); + KASSERT(fcaps->fc_fcntls == 0 || + cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), + ("%s: fcntls without CAP_FCNTL", func)); + KASSERT(fcaps->fc_ioctls != NULL ? 
fcaps->fc_nioctls > 0 : + (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), + ("%s: invalid ioctls", func)); + KASSERT(fcaps->fc_nioctls == 0 || + cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), + ("%s: ioctls without CAP_IOCTL", func)); +} + +static void +fdgrowtable_exp(struct filedesc *fdp, int nfd) +{ + int nfd1; + + FILEDESC_XLOCK_ASSERT(fdp); + + nfd1 = fdp->fd_nfiles * 2; + if (nfd1 < nfd) + nfd1 = nfd; + fdgrowtable(fdp, nfd1); +} + +/* + * Grow the file table to accommodate (at least) nfd descriptors. + */ +static void +fdgrowtable(struct filedesc *fdp, int nfd) +{ + struct filedesc0 *fdp0; + struct freetable *ft; + struct fdescenttbl *ntable; + struct fdescenttbl *otable; + int nnfiles, onfiles; + NDSLOTTYPE *nmap, *omap; + + /* + * If lastfile is -1 this struct filedesc was just allocated and we are + * growing it to accommodate for the one we are going to copy from. There + * is no need to have a lock on this one as it's not visible to anyone. + */ + if (fdp->fd_lastfile != -1) + FILEDESC_XLOCK_ASSERT(fdp); + + KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); + + /* save old values */ + onfiles = fdp->fd_nfiles; + otable = fdp->fd_files; + omap = fdp->fd_map; + + /* compute the size of the new table */ + nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ + if (nnfiles <= onfiles) + /* the table is already large enough */ + return; + + /* + * Allocate a new table. We need enough space for the number of + * entries, file entries themselves and the struct freetable we will use + * when we decommission the table and place it on the freelist. + * We place the struct freetable in the middle so we don't have + * to worry about padding. + */ + ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + + nnfiles * sizeof(ntable->fdt_ofiles[0]) + + sizeof(struct freetable), + M_FILEDESC, M_ZERO | M_WAITOK); + /* copy the old data */ + ntable->fdt_nfiles = nnfiles; + memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, + onfiles * sizeof(ntable->fdt_ofiles[0])); + + /* + * Allocate a new map only if the old is not large enough. It will + * grow at a slower rate than the table as it can map more + * entries than the table can hold. + */ + if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { + nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, + M_ZERO | M_WAITOK); + /* copy over the old data and update the pointer */ + memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); + fdp->fd_map = nmap; + } + + /* + * Make sure that ntable is correctly initialized before we replace + * fd_files poiner. Otherwise fget_unlocked() may see inconsistent + * data. + */ + atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); + + /* + * Do not free the old file table, as some threads may still + * reference entries within it. Instead, place it on a freelist + * which will be processed when the struct filedesc is released. + * + * Note that if onfiles == NDFILE, we're dealing with the original + * static allocation contained within (struct filedesc0 *)fdp, + * which must not be freed. + */ + if (onfiles > NDFILE) { + ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; + fdp0 = (struct filedesc0 *)fdp; + ft->ft_table = otable; + SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); + } + /* + * The map does not have the same possibility of threads still + * holding references to it. So always free it as long as it + * does not reference the original static allocation. 
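+	 * As a worked example (assuming a 64-bit NDSLOTTYPE): the initial
+	 * NDFILE == 20 table needs NDSLOTS(20) == 1 map word, and a later
+	 * growth to 64 descriptors still needs only NDSLOTS(64) == 1, so in
+	 * that case no new map was allocated above and there is nothing to
+	 * free here.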
+ */ + if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) + free(omap, M_FILEDESC); +} + +/* + * Allocate a file descriptor for the process. + */ +int +fdalloc(struct thread *td, int minfd, int *result) +{ + struct proc *p = td->td_proc; + struct filedesc *fdp = p->p_fd; + int fd, maxfd, allocfd; +#ifdef RACCT + int error; +#endif + + FILEDESC_XLOCK_ASSERT(fdp); + + if (fdp->fd_freefile > minfd) + minfd = fdp->fd_freefile; + + maxfd = getmaxfd(td); + + /* + * Search the bitmap for a free descriptor starting at minfd. + * If none is found, grow the file table. + */ + fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); + if (fd >= maxfd) + return (EMFILE); + if (fd >= fdp->fd_nfiles) { + allocfd = min(fd * 2, maxfd); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(p); + error = racct_set(p, RACCT_NOFILE, allocfd); + PROC_UNLOCK(p); + if (error != 0) + return (EMFILE); + } +#endif + /* + * fd is already equal to first free descriptor >= minfd, so + * we only need to grow the table and we are done. + */ + fdgrowtable_exp(fdp, allocfd); + } + + /* + * Perform some sanity checks, then mark the file descriptor as + * used and return it to the caller. + */ + KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), + ("invalid descriptor %d", fd)); + KASSERT(!fdisused(fdp, fd), + ("fd_first_free() returned non-free descriptor")); + KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, + ("file descriptor isn't free")); + fdused(fdp, fd); + *result = fd; + return (0); +} + +/* + * Allocate n file descriptors for the process. + */ +int +fdallocn(struct thread *td, int minfd, int *fds, int n) +{ + struct proc *p = td->td_proc; + struct filedesc *fdp = p->p_fd; + int i; + + FILEDESC_XLOCK_ASSERT(fdp); + + for (i = 0; i < n; i++) + if (fdalloc(td, 0, &fds[i]) != 0) + break; + + if (i < n) { + for (i--; i >= 0; i--) + fdunused(fdp, fds[i]); + return (EMFILE); + } + + return (0); +} + +/* + * Create a new open file structure and allocate a file descriptor for the + * process that refers to it. We add one reference to the file for the + * descriptor table and one reference for resultfp. This is to prevent us + * being preempted and the entry in the descriptor table closed after we + * release the FILEDESC lock. + */ +int +falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, + struct filecaps *fcaps) +{ + struct file *fp; + int error, fd; + + error = falloc_noinstall(td, &fp); + if (error) + return (error); /* no reference held on error */ + + error = finstall(td, fp, &fd, flags, fcaps); + if (error) { + fdrop(fp, td); /* one reference (fp only) */ + return (error); + } + + if (resultfp != NULL) + *resultfp = fp; /* copy out result */ + else + fdrop(fp, td); /* release local reference */ + + if (resultfd != NULL) + *resultfd = fd; + + return (0); +} + +/* + * Create a new open file structure without allocating a file descriptor. 
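+ *
+ * The usual pattern pairs this with finit() and finstall().  A hedged
+ * sketch (DTYPE_FOO, foo_data and foo_ops are placeholders made up for
+ * illustration):
+ *
+ *	error = falloc_noinstall(td, &fp);
+ *	if (error != 0)
+ *		return (error);
+ *	finit(fp, FREAD | FWRITE, DTYPE_FOO, foo_data, &foo_ops);
+ *	error = finstall(td, fp, &fd, 0, NULL);
+ *	fdrop(fp, td);	/* drop the local ref; on success the table has one */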
+ */ +int +falloc_noinstall(struct thread *td, struct file **resultfp) +{ + struct file *fp; + int maxuserfiles = maxfiles - (maxfiles / 20); + int openfiles_new; + static struct timeval lastfail; + static int curfail; + + KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); + + openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1; + if ((openfiles_new >= maxuserfiles && + priv_check(td, PRIV_MAXFILES) != 0) || + openfiles_new >= maxfiles) { + atomic_subtract_int(&openfiles, 1); + if (ppsratecheck(&lastfail, &curfail, 1)) { + printf("kern.maxfiles limit exceeded by uid %i, (%s) " + "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm); + } + return (ENFILE); + } + fp = uma_zalloc(file_zone, M_WAITOK); + bzero(fp, sizeof(*fp)); + refcount_init(&fp->f_count, 1); + fp->f_cred = crhold(td->td_ucred); + fp->f_ops = &badfileops; + *resultfp = fp; + return (0); +} + +/* + * Install a file in a file descriptor table. + */ +void +_finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, + struct filecaps *fcaps) +{ + struct filedescent *fde; + + MPASS(fp != NULL); + if (fcaps != NULL) + filecaps_validate(fcaps, __func__); + FILEDESC_XLOCK_ASSERT(fdp); + + fde = &fdp->fd_ofiles[fd]; +#ifdef CAPABILITIES + seq_write_begin(&fde->fde_seq); +#endif + fde->fde_file = fp; + fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; + if (fcaps != NULL) + filecaps_move(fcaps, &fde->fde_caps); + else + filecaps_fill(&fde->fde_caps); +#ifdef CAPABILITIES + seq_write_end(&fde->fde_seq); +#endif +} + +int +finstall(struct thread *td, struct file *fp, int *fd, int flags, + struct filecaps *fcaps) +{ + struct filedesc *fdp = td->td_proc->p_fd; + int error; + + MPASS(fd != NULL); + + if (!fhold(fp)) + return (EBADF); + FILEDESC_XLOCK(fdp); + if ((error = fdalloc(td, 0, fd))) { + FILEDESC_XUNLOCK(fdp); + fdrop(fp, td); + return (error); + } + _finstall(fdp, fp, *fd, flags, fcaps); + FILEDESC_XUNLOCK(fdp); + return (0); +} + +/* + * Build a new filedesc structure from another. + * Copy the current, root, and jail root vnode references. + * + * If fdp is not NULL, return with it shared locked. + */ +struct filedesc * +fdinit(struct filedesc *fdp, bool prepfiles) +{ + struct filedesc0 *newfdp0; + struct filedesc *newfdp; + + newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); + newfdp = &newfdp0->fd_fd; + + /* Create the file descriptor table. 
*/ + FILEDESC_LOCK_INIT(newfdp); + refcount_init(&newfdp->fd_refcnt, 1); + refcount_init(&newfdp->fd_holdcnt, 1); + newfdp->fd_cmask = CMASK; + newfdp->fd_map = newfdp0->fd_dmap; + newfdp->fd_lastfile = -1; + newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; + newfdp->fd_files->fdt_nfiles = NDFILE; + + if (fdp == NULL) + return (newfdp); + + if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles) + fdgrowtable(newfdp, fdp->fd_lastfile + 1); + + FILEDESC_SLOCK(fdp); + newfdp->fd_cdir = fdp->fd_cdir; + if (newfdp->fd_cdir) + vrefact(newfdp->fd_cdir); + newfdp->fd_rdir = fdp->fd_rdir; + if (newfdp->fd_rdir) + vrefact(newfdp->fd_rdir); + newfdp->fd_jdir = fdp->fd_jdir; + if (newfdp->fd_jdir) + vrefact(newfdp->fd_jdir); + + if (!prepfiles) { + FILEDESC_SUNLOCK(fdp); + } else { + while (fdp->fd_lastfile >= newfdp->fd_nfiles) { + FILEDESC_SUNLOCK(fdp); + fdgrowtable(newfdp, fdp->fd_lastfile + 1); + FILEDESC_SLOCK(fdp); + } + } + + return (newfdp); +} + +static struct filedesc * +fdhold(struct proc *p) +{ + struct filedesc *fdp; + + PROC_LOCK_ASSERT(p, MA_OWNED); + fdp = p->p_fd; + if (fdp != NULL) + refcount_acquire(&fdp->fd_holdcnt); + return (fdp); +} + +static void +fddrop(struct filedesc *fdp) +{ + + if (fdp->fd_holdcnt > 1) { + if (refcount_release(&fdp->fd_holdcnt) == 0) + return; + } + + FILEDESC_LOCK_DESTROY(fdp); + uma_zfree(filedesc0_zone, fdp); +} + +/* + * Share a filedesc structure. + */ +struct filedesc * +fdshare(struct filedesc *fdp) +{ + + refcount_acquire(&fdp->fd_refcnt); + return (fdp); +} + +/* + * Unshare a filedesc structure, if necessary by making a copy + */ +void +fdunshare(struct thread *td) +{ + struct filedesc *tmp; + struct proc *p = td->td_proc; + + if (p->p_fd->fd_refcnt == 1) + return; + + tmp = fdcopy(p->p_fd); + fdescfree(td); + p->p_fd = tmp; +} + +void +fdinstall_remapped(struct thread *td, struct filedesc *fdp) +{ + + fdescfree(td); + td->td_proc->p_fd = fdp; +} + +/* + * Copy a filedesc structure. A NULL pointer in returns a NULL reference, + * this is to ease callers, not catch errors. + */ +struct filedesc * +fdcopy(struct filedesc *fdp) +{ + struct filedesc *newfdp; + struct filedescent *nfde, *ofde; + int i; + + MPASS(fdp != NULL); + + newfdp = fdinit(fdp, true); + /* copy all passable descriptors (i.e. not kqueue) */ + newfdp->fd_freefile = -1; + for (i = 0; i <= fdp->fd_lastfile; ++i) { + ofde = &fdp->fd_ofiles[i]; + if (ofde->fde_file == NULL || + (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || + !fhold(ofde->fde_file)) { + if (newfdp->fd_freefile == -1) + newfdp->fd_freefile = i; + continue; + } + nfde = &newfdp->fd_ofiles[i]; + *nfde = *ofde; + filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); + fdused_init(newfdp, i); + newfdp->fd_lastfile = i; + } + if (newfdp->fd_freefile == -1) + newfdp->fd_freefile = i; + newfdp->fd_cmask = fdp->fd_cmask; + FILEDESC_SUNLOCK(fdp); + return (newfdp); +} + +/* + * Copies a filedesc structure, while remapping all file descriptors + * stored inside using a translation table. + * + * File descriptors are copied over to the new file descriptor table, + * regardless of whether the close-on-exec flag is set. + */ +int +fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds, + struct filedesc **ret) +{ + struct filedesc *newfdp; + struct filedescent *nfde, *ofde; + int error, i; + + MPASS(fdp != NULL); + + newfdp = fdinit(fdp, true); + if (nfds > fdp->fd_lastfile + 1) { + /* New table cannot be larger than the old one. 
*/ + error = E2BIG; + goto bad; + } + /* Copy all passable descriptors (i.e. not kqueue). */ + newfdp->fd_freefile = nfds; + for (i = 0; i < nfds; ++i) { + if (fds[i] < 0 || fds[i] > fdp->fd_lastfile) { + /* File descriptor out of bounds. */ + error = EBADF; + goto bad; + } + ofde = &fdp->fd_ofiles[fds[i]]; + if (ofde->fde_file == NULL) { + /* Unused file descriptor. */ + error = EBADF; + goto bad; + } + if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) { + /* File descriptor cannot be passed. */ + error = EINVAL; + goto bad; + } + if (!fhold(nfde->fde_file)) { + error = EBADF; + goto bad; + } + nfde = &newfdp->fd_ofiles[i]; + *nfde = *ofde; + filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); + fdused_init(newfdp, i); + newfdp->fd_lastfile = i; + } + newfdp->fd_cmask = fdp->fd_cmask; + FILEDESC_SUNLOCK(fdp); + *ret = newfdp; + return (0); +bad: + FILEDESC_SUNLOCK(fdp); + fdescfree_remapped(newfdp); + return (error); +} + +/* + * Clear POSIX style locks. This is only used when fdp looses a reference (i.e. + * one of processes using it exits) and the table used to be shared. + */ +static void +fdclearlocks(struct thread *td) +{ + struct filedesc *fdp; + struct filedesc_to_leader *fdtol; + struct flock lf; + struct file *fp; + struct proc *p; + struct vnode *vp; + int i; + + p = td->td_proc; + fdp = p->p_fd; + fdtol = p->p_fdtol; + MPASS(fdtol != NULL); + + FILEDESC_XLOCK(fdp); + KASSERT(fdtol->fdl_refcount > 0, + ("filedesc_to_refcount botch: fdl_refcount=%d", + fdtol->fdl_refcount)); + if (fdtol->fdl_refcount == 1 && + (p->p_leader->p_flag & P_ADVLOCK) != 0) { + for (i = 0; i <= fdp->fd_lastfile; i++) { + fp = fdp->fd_ofiles[i].fde_file; + if (fp == NULL || fp->f_type != DTYPE_VNODE || + !fhold(fp)) + continue; + FILEDESC_XUNLOCK(fdp); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = fp->f_vnode; + (void) VOP_ADVLOCK(vp, + (caddr_t)p->p_leader, F_UNLCK, + &lf, F_POSIX); + FILEDESC_XLOCK(fdp); + fdrop(fp, td); + } + } +retry: + if (fdtol->fdl_refcount == 1) { + if (fdp->fd_holdleaderscount > 0 && + (p->p_leader->p_flag & P_ADVLOCK) != 0) { + /* + * close() or kern_dup() has cleared a reference + * in a shared file descriptor table. + */ + fdp->fd_holdleaderswakeup = 1; + sx_sleep(&fdp->fd_holdleaderscount, + FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); + goto retry; + } + if (fdtol->fdl_holdcount > 0) { + /* + * Ensure that fdtol->fdl_leader remains + * valid in closef(). + */ + fdtol->fdl_wakeup = 1; + sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, + "fdlhold", 0); + goto retry; + } + } + fdtol->fdl_refcount--; + if (fdtol->fdl_refcount == 0 && + fdtol->fdl_holdcount == 0) { + fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; + fdtol->fdl_prev->fdl_next = fdtol->fdl_next; + } else + fdtol = NULL; + p->p_fdtol = NULL; + FILEDESC_XUNLOCK(fdp); + if (fdtol != NULL) + free(fdtol, M_FILEDESC_TO_LEADER); +} + +/* + * Release a filedesc structure. 
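+ *
+ * If 'needclose' is set the files are genuinely closed via closef()
+ * because the table belonged to a live process; otherwise only the
+ * references held by the table are dropped, as is done for a remapped
+ * table that was never installed into a process.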
+ */ +static void +fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose) +{ + struct filedesc0 *fdp0; + struct freetable *ft, *tft; + struct filedescent *fde; + struct file *fp; + int i; + + for (i = 0; i <= fdp->fd_lastfile; i++) { + fde = &fdp->fd_ofiles[i]; + fp = fde->fde_file; + if (fp != NULL) { + fdefree_last(fde); + if (needclose) + (void) closef(fp, td); + else + fdrop(fp, td); + } + } + + if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) + free(fdp->fd_map, M_FILEDESC); + if (fdp->fd_nfiles > NDFILE) + free(fdp->fd_files, M_FILEDESC); + + fdp0 = (struct filedesc0 *)fdp; + SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) + free(ft->ft_table, M_FILEDESC); + + fddrop(fdp); +} + +void +fdescfree(struct thread *td) +{ + struct proc *p; + struct filedesc *fdp; + struct vnode *cdir, *jdir, *rdir; + + p = td->td_proc; + fdp = p->p_fd; + MPASS(fdp != NULL); + +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(p); + racct_set(p, RACCT_NOFILE, 0); + PROC_UNLOCK(p); + } +#endif + + if (p->p_fdtol != NULL) + fdclearlocks(td); + + PROC_LOCK(p); + p->p_fd = NULL; + PROC_UNLOCK(p); + + if (refcount_release(&fdp->fd_refcnt) == 0) + return; + + FILEDESC_XLOCK(fdp); + cdir = fdp->fd_cdir; + fdp->fd_cdir = NULL; + rdir = fdp->fd_rdir; + fdp->fd_rdir = NULL; + jdir = fdp->fd_jdir; + fdp->fd_jdir = NULL; + FILEDESC_XUNLOCK(fdp); + + if (cdir != NULL) + vrele(cdir); + if (rdir != NULL) + vrele(rdir); + if (jdir != NULL) + vrele(jdir); + + fdescfree_fds(td, fdp, 1); +} + +void +fdescfree_remapped(struct filedesc *fdp) +{ + + if (fdp->fd_cdir != NULL) + vrele(fdp->fd_cdir); + if (fdp->fd_rdir != NULL) + vrele(fdp->fd_rdir); + if (fdp->fd_jdir != NULL) + vrele(fdp->fd_jdir); + + fdescfree_fds(curthread, fdp, 0); +} + +/* + * For setugid programs, we don't want to people to use that setugidness + * to generate error messages which write to a file which otherwise would + * otherwise be off-limits to the process. We check for filesystems where + * the vnode can change out from under us after execve (like [lin]procfs). + * + * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is + * sufficient. We also don't check for setugidness since we know we are. + */ +static bool +is_unsafe(struct file *fp) +{ + struct vnode *vp; + + if (fp->f_type != DTYPE_VNODE) + return (false); + + vp = fp->f_vnode; + return ((vp->v_vflag & VV_PROCDEP) != 0); +} + +/* + * Make this setguid thing safe, if at all possible. + */ +void +fdsetugidsafety(struct thread *td) +{ + struct filedesc *fdp; + struct file *fp; + int i; + + fdp = td->td_proc->p_fd; + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); + MPASS(fdp->fd_nfiles >= 3); + for (i = 0; i <= 2; i++) { + fp = fdp->fd_ofiles[i].fde_file; + if (fp != NULL && is_unsafe(fp)) { + FILEDESC_XLOCK(fdp); + knote_fdclose(td, i); + /* + * NULL-out descriptor prior to close to avoid + * a race while close blocks. + */ + fdfree(fdp, i); + FILEDESC_XUNLOCK(fdp); + (void) closef(fp, td); + } + } +} + +/* + * If a specific file object occupies a specific file descriptor, close the + * file descriptor entry and drop a reference on the file object. This is a + * convenience function to handle a subsequent error in a function that calls + * falloc() that handles the race that another thread might have closed the + * file descriptor out from under the thread creating the file object. 
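+ *
+ * A hedged sketch of the unwind pattern this enables (illustrative only;
+ * my_setup() is a placeholder and the falloc() convenience wrapper is
+ * assumed to take (td, &fp, &fd, flags)):
+ *
+ *	error = falloc(td, &fp, &fd, 0);
+ *	if (error != 0)
+ *		return (error);
+ *	error = my_setup(fp);
+ *	if (error != 0) {
+ *		fdclose(td, fp, fd);
+ *		fdrop(fp, td);
+ *		return (error);
+ *	}
+ *
+ * fdclose() only tears down the table slot if it still refers to fp;
+ * the final fdrop() releases the caller's own reference.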
+ */ +void +fdclose(struct thread *td, struct file *fp, int idx) +{ + struct filedesc *fdp = td->td_proc->p_fd; + + FILEDESC_XLOCK(fdp); + if (fdp->fd_ofiles[idx].fde_file == fp) { + fdfree(fdp, idx); + FILEDESC_XUNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_XUNLOCK(fdp); +} + +/* + * Close any files on exec? + */ +void +fdcloseexec(struct thread *td) +{ + struct filedesc *fdp; + struct filedescent *fde; + struct file *fp; + int i; + + fdp = td->td_proc->p_fd; + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); + for (i = 0; i <= fdp->fd_lastfile; i++) { + fde = &fdp->fd_ofiles[i]; + fp = fde->fde_file; + if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || + (fde->fde_flags & UF_EXCLOSE))) { + FILEDESC_XLOCK(fdp); + fdfree(fdp, i); + (void) closefp(fdp, i, fp, td, 0); + FILEDESC_UNLOCK_ASSERT(fdp); + } + } +} + +/* + * It is unsafe for set[ug]id processes to be started with file + * descriptors 0..2 closed, as these descriptors are given implicit + * significance in the Standard C library. fdcheckstd() will create a + * descriptor referencing /dev/null for each of stdin, stdout, and + * stderr that is not already open. + */ +int +fdcheckstd(struct thread *td) +{ + struct filedesc *fdp; + register_t save; + int i, error, devnull; + + fdp = td->td_proc->p_fd; + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); + MPASS(fdp->fd_nfiles >= 3); + devnull = -1; + for (i = 0; i <= 2; i++) { + if (fdp->fd_ofiles[i].fde_file != NULL) + continue; + + save = td->td_retval[0]; + if (devnull != -1) { + error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); + } else { + error = kern_openat(td, AT_FDCWD, "/dev/null", + UIO_SYSSPACE, O_RDWR, 0); + if (error == 0) { + devnull = td->td_retval[0]; + KASSERT(devnull == i, ("we didn't get our fd")); + } + } + td->td_retval[0] = save; + if (error != 0) + return (error); + } + return (0); +} + +/* + * Internal form of close. Decrement reference count on file structure. + * Note: td may be NULL when closing a file that was being passed in a + * message. + */ +int +closef(struct file *fp, struct thread *td) +{ + struct vnode *vp; + struct flock lf; + struct filedesc_to_leader *fdtol; + struct filedesc *fdp; + + /* + * POSIX record locking dictates that any close releases ALL + * locks owned by this process. This is handled by setting + * a flag in the unlock to free ONLY locks obeying POSIX + * semantics, and not to free BSD-style file locks. + * If the descriptor was in a message, POSIX-style locks + * aren't passed with the descriptor, and the thread pointer + * will be NULL. Callers should be careful only to pass a + * NULL thread pointer when there really is no owning + * context that might have locks, or the locks will be + * leaked. + */ + if (fp->f_type == DTYPE_VNODE && td != NULL) { + vp = fp->f_vnode; + if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, + F_UNLCK, &lf, F_POSIX); + } + fdtol = td->td_proc->p_fdtol; + if (fdtol != NULL) { + /* + * Handle special case where file descriptor table is + * shared between multiple process leaders. 
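+			 * Every other leader on the list that has P_ADVLOCK
+			 * set gets its POSIX locks on this vnode released as
+			 * well; fdl_holdcount pins the fdtol entry while the
+			 * table lock is dropped around VOP_ADVLOCK().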
+ */ + fdp = td->td_proc->p_fd; + FILEDESC_XLOCK(fdp); + for (fdtol = fdtol->fdl_next; + fdtol != td->td_proc->p_fdtol; + fdtol = fdtol->fdl_next) { + if ((fdtol->fdl_leader->p_flag & + P_ADVLOCK) == 0) + continue; + fdtol->fdl_holdcount++; + FILEDESC_XUNLOCK(fdp); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = fp->f_vnode; + (void) VOP_ADVLOCK(vp, + (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, + F_POSIX); + FILEDESC_XLOCK(fdp); + fdtol->fdl_holdcount--; + if (fdtol->fdl_holdcount == 0 && + fdtol->fdl_wakeup != 0) { + fdtol->fdl_wakeup = 0; + wakeup(fdtol); + } + } + FILEDESC_XUNLOCK(fdp); + } + } + return (fdrop(fp, td)); +} + +/* + * Initialize the file pointer with the specified properties. + * + * The ops are set with release semantics to be certain that the flags, type, + * and data are visible when ops is. This is to prevent ops methods from being + * called with bad data. + */ +void +finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) +{ + fp->f_data = data; + fp->f_flag = flag; + fp->f_type = type; + atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); +} + +int +fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, + struct file **fpp, struct filecaps *havecapsp) +{ + struct filedescent *fde; + int error; + + FILEDESC_LOCK_ASSERT(fdp); + + fde = fdeget_locked(fdp, fd); + if (fde == NULL) { + error = EBADF; + goto out; + } + +#ifdef CAPABILITIES + error = cap_check(cap_rights_fde_inline(fde), needrightsp); + if (error != 0) + goto out; +#endif + + if (havecapsp != NULL) + filecaps_copy(&fde->fde_caps, havecapsp, true); + + *fpp = fde->fde_file; + + error = 0; +out: + return (error); +} + +int +fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, + struct file **fpp, struct filecaps *havecapsp) +{ + struct filedesc *fdp = td->td_proc->p_fd; + int error; +#ifndef CAPABILITIES + error = fget_unlocked(fdp, fd, needrightsp, fpp, NULL); + if (error == 0 && havecapsp != NULL) + filecaps_fill(havecapsp); +#else + struct file *fp; + seq_t seq; + + for (;;) { + error = fget_unlocked(fdp, fd, needrightsp, &fp, &seq); + if (error != 0) + return (error); + + if (havecapsp != NULL) { + if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, + havecapsp, false)) { + fdrop(fp, td); + goto get_locked; + } + } + + if (!fd_modified(fdp, fd, seq)) + break; + fdrop(fp, td); + } + + *fpp = fp; + return (0); + +get_locked: + FILEDESC_SLOCK(fdp); + error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp); + if (error == 0 && !fhold(*fpp)) + error = EBADF; + FILEDESC_SUNLOCK(fdp); +#endif + return (error); +} + +int +fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, + struct file **fpp, seq_t *seqp) +{ +#ifdef CAPABILITIES + const struct filedescent *fde; +#endif + const struct fdescenttbl *fdt; + struct file *fp; + u_int count; +#ifdef CAPABILITIES + seq_t seq; + cap_rights_t haverights; + int error; +#endif + + fdt = fdp->fd_files; + if ((u_int)fd >= fdt->fdt_nfiles) + return (EBADF); + /* + * Fetch the descriptor locklessly. We avoid fdrop() races by + * never raising a refcount above 0. To accomplish this we have + * to use a cmpset loop rather than an atomic_add. The descriptor + * must be re-verified once we acquire a reference to be certain + * that the identity is still correct and we did not lose a race + * due to preemption. 
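+	 * In particular, an unconditional atomic_add() could observe
+	 * f_count == 0 and "resurrect" a file that a concurrent fdrop()
+	 * is already tearing down; the loop below instead bails out and
+	 * re-reads the table whenever it sees a zero count.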
+ */ + for (;;) { +#ifdef CAPABILITIES + seq = seq_load(fd_seq(fdt, fd)); + fde = &fdt->fdt_ofiles[fd]; + haverights = *cap_rights_fde_inline(fde); + fp = fde->fde_file; + if (!seq_consistent(fd_seq(fdt, fd), seq)) + continue; +#else + fp = fdt->fdt_ofiles[fd].fde_file; +#endif + if (fp == NULL) + return (EBADF); +#ifdef CAPABILITIES + error = cap_check(&haverights, needrightsp); + if (error != 0) + return (error); +#endif + count = fp->f_count; + retry: + if (count == 0) { + /* + * Force a reload. Other thread could reallocate the + * table before this fd was closed, so it possible that + * there is a stale fp pointer in cached version. + */ + fdt = *(const struct fdescenttbl * const volatile *) + &(fdp->fd_files); + continue; + } + if (__predict_false(count + 1 < count)) + return (EBADF); + + /* + * Use an acquire barrier to force re-reading of fdt so it is + * refreshed for verification. + */ + if (__predict_false(atomic_fcmpset_acq_int(&fp->f_count, + &count, count + 1) == 0)) + goto retry; + fdt = fdp->fd_files; +#ifdef CAPABILITIES + if (seq_consistent_nomb(fd_seq(fdt, fd), seq)) +#else + if (fp == fdt->fdt_ofiles[fd].fde_file) +#endif + break; + fdrop(fp, curthread); + } + *fpp = fp; + if (seqp != NULL) { +#ifdef CAPABILITIES + *seqp = seq; +#endif + } + return (0); +} + +/* + * Extract the file pointer associated with the specified descriptor for the + * current user process. + * + * If the descriptor doesn't exist or doesn't match 'flags', EBADF is + * returned. + * + * File's rights will be checked against the capability rights mask. + * + * If an error occurred the non-zero error is returned and *fpp is set to + * NULL. Otherwise *fpp is held and set and zero is returned. Caller is + * responsible for fdrop(). + */ +static __inline int +_fget(struct thread *td, int fd, struct file **fpp, int flags, + cap_rights_t *needrightsp, seq_t *seqp) +{ + struct filedesc *fdp; + struct file *fp; + int error; + + *fpp = NULL; + fdp = td->td_proc->p_fd; + error = fget_unlocked(fdp, fd, needrightsp, &fp, seqp); + if (error != 0) + return (error); + if (fp->f_ops == &badfileops) { + fdrop(fp, td); + return (EBADF); + } + + /* + * FREAD and FWRITE failure return EBADF as per POSIX. + */ + error = 0; + switch (flags) { + case FREAD: + case FWRITE: + if ((fp->f_flag & flags) == 0) + error = EBADF; + break; + case FEXEC: + if ((fp->f_flag & (FREAD | FEXEC)) == 0 || + ((fp->f_flag & FWRITE) != 0)) + error = EBADF; + break; + case 0: + break; + default: + KASSERT(0, ("wrong flags")); + } + + if (error != 0) { + fdrop(fp, td); + return (error); + } + + *fpp = fp; + return (0); +} + +int +fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) +{ + + return (_fget(td, fd, fpp, 0, rightsp, NULL)); +} + +int +fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp, + struct file **fpp) +{ + int error; +#ifndef CAPABILITIES + error = _fget(td, fd, fpp, 0, rightsp, NULL); + if (maxprotp != NULL) + *maxprotp = VM_PROT_ALL; +#else + cap_rights_t fdrights; + struct filedesc *fdp = td->td_proc->p_fd; + seq_t seq; + + MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); + for (;;) { + error = _fget(td, fd, fpp, 0, rightsp, &seq); + if (error != 0) + return (error); + if (maxprotp != NULL) + fdrights = *cap_rights(fdp, fd); + if (!fd_modified(fdp, fd, seq)) + break; + fdrop(*fpp, td); + } + + /* + * If requested, convert capability rights to access flags. 
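+	 * (By assumption, cap_rights_to_vmprot() maps CAP_MMAP_R/W/X
+	 * onto VM_PROT_READ/WRITE/EXECUTE.)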
+ */ + if (maxprotp != NULL) + *maxprotp = cap_rights_to_vmprot(&fdrights); +#endif + return (error); +} + +int +fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) +{ + + return (_fget(td, fd, fpp, FREAD, rightsp, NULL)); +} + +int +fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) +{ + + return (_fget(td, fd, fpp, FWRITE, rightsp, NULL)); +} + +int +fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, + struct file **fpp) +{ + struct filedesc *fdp = td->td_proc->p_fd; +#ifndef CAPABILITIES + return (fget_unlocked(fdp, fd, rightsp, fpp, NULL)); +#else + int error; + seq_t seq; + + MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); + for (;;) { + error = fget_unlocked(fdp, fd, rightsp, fpp, &seq); + if (error != 0) + return (error); + error = cap_fcntl_check(fdp, fd, needfcntl); + if (!fd_modified(fdp, fd, seq)) + break; + fdrop(*fpp, td); + } + if (error != 0) { + fdrop(*fpp, td); + *fpp = NULL; + } + return (error); +#endif +} + +/* + * Like fget() but loads the underlying vnode, or returns an error if the + * descriptor does not represent a vnode. Note that pipes use vnodes but + * never have VM objects. The returned vnode will be vref()'d. + * + * XXX: what about the unused flags ? + */ +static __inline int +_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, + struct vnode **vpp) +{ + struct file *fp; + int error; + + *vpp = NULL; + error = _fget(td, fd, &fp, flags, needrightsp, NULL); + if (error != 0) + return (error); + if (fp->f_vnode == NULL) { + error = EINVAL; + } else { + *vpp = fp->f_vnode; + vrefact(*vpp); + } + fdrop(fp, td); + + return (error); +} + +int +fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) +{ + + return (_fgetvp(td, fd, 0, rightsp, vpp)); +} + +int +fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, + struct filecaps *havecaps, struct vnode **vpp) +{ + struct filedesc *fdp; + struct filecaps caps; + struct file *fp; + int error; + + fdp = td->td_proc->p_fd; + error = fget_cap_locked(fdp, fd, needrightsp, &fp, &caps); + if (error != 0) + return (error); + if (fp->f_ops == &badfileops) { + error = EBADF; + goto out; + } + if (fp->f_vnode == NULL) { + error = EINVAL; + goto out; + } + + *havecaps = caps; + *vpp = fp->f_vnode; + vrefact(*vpp); + + return (0); +out: + filecaps_free(&caps); + return (error); +} + +int +fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) +{ + + return (_fgetvp(td, fd, FREAD, rightsp, vpp)); +} + +int +fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) +{ + + return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); +} + +#ifdef notyet +int +fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, + struct vnode **vpp) +{ + + return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); +} +#endif + +/* + * Handle the last reference to a file being closed. + * + * Without the noinline attribute clang keeps inlining the func thorough this + * file when fdrop is used. + */ +int __noinline +_fdrop(struct file *fp, struct thread *td) +{ + int error; + + if (fp->f_count != 0) + panic("fdrop: count %d", fp->f_count); + error = fo_close(fp, td); + atomic_subtract_int(&openfiles, 1); + crfree(fp->f_cred); + free(fp->f_advice, M_FADVISE); + uma_zfree(file_zone, fp); + + return (error); +} + +/* + * Apply an advisory lock on a file descriptor. 
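+ * This backs flock(2); from userland the call looks roughly like
+ *
+ *	if (flock(fd, LOCK_EX | LOCK_NB) == -1 && errno == EWOULDBLOCK)
+ *		handle the contended case;
+ *
+ * (illustrative only).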
+ * + * Just attempt to get a record lock of the requested type on the entire file + * (l_whence = SEEK_SET, l_start = 0, l_len = 0). + */ +#ifndef _SYS_SYSPROTO_H_ +struct flock_args { + int fd; + int how; +}; +#endif +/* ARGSUSED */ +int +sys_flock(struct thread *td, struct flock_args *uap) +{ + struct file *fp; + struct vnode *vp; + struct flock lf; + int error; + + error = fget(td, uap->fd, &cap_flock_rights, &fp); + if (error != 0) + return (error); + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); + return (EOPNOTSUPP); + } + + vp = fp->f_vnode; + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (uap->how & LOCK_UN) { + lf.l_type = F_UNLCK; + atomic_clear_int(&fp->f_flag, FHASLOCK); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); + goto done2; + } + if (uap->how & LOCK_EX) + lf.l_type = F_WRLCK; + else if (uap->how & LOCK_SH) + lf.l_type = F_RDLCK; + else { + error = EBADF; + goto done2; + } + atomic_set_int(&fp->f_flag, FHASLOCK); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); +done2: + fdrop(fp, td); + return (error); +} +/* + * Duplicate the specified descriptor to a free descriptor. + */ +int +dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, + int openerror, int *indxp) +{ + struct filedescent *newfde, *oldfde; + struct file *fp; + u_long *ioctls; + int error, indx; + + KASSERT(openerror == ENODEV || openerror == ENXIO, + ("unexpected error %d in %s", openerror, __func__)); + + /* + * If the to-be-dup'd fd number is greater than the allowed number + * of file descriptors, or the fd to be dup'd has already been + * closed, then reject. + */ + FILEDESC_XLOCK(fdp); + if ((fp = fget_locked(fdp, dfd)) == NULL) { + FILEDESC_XUNLOCK(fdp); + return (EBADF); + } + + error = fdalloc(td, 0, &indx); + if (error != 0) { + FILEDESC_XUNLOCK(fdp); + return (error); + } + + /* + * There are two cases of interest here. + * + * For ENODEV simply dup (dfd) to file descriptor (indx) and return. + * + * For ENXIO steal away the file structure from (dfd) and store it in + * (indx). (dfd) is effectively closed by this operation. + */ + switch (openerror) { + case ENODEV: + /* + * Check that the mode the file is being opened for is a + * subset of the mode of the existing descriptor. + */ + if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { + fdunused(fdp, indx); + FILEDESC_XUNLOCK(fdp); + return (EACCES); + } + if (!fhold(fp)) { + fdunused(fdp, indx); + FILEDESC_XUNLOCK(fdp); + return (EBADF); + } + newfde = &fdp->fd_ofiles[indx]; + oldfde = &fdp->fd_ofiles[dfd]; + ioctls = filecaps_copy_prep(&oldfde->fde_caps); +#ifdef CAPABILITIES + seq_write_begin(&newfde->fde_seq); +#endif + memcpy(newfde, oldfde, fde_change_size); + filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, + ioctls); +#ifdef CAPABILITIES + seq_write_end(&newfde->fde_seq); +#endif + break; + case ENXIO: + /* + * Steal away the file pointer from dfd and stuff it into indx. + */ + newfde = &fdp->fd_ofiles[indx]; + oldfde = &fdp->fd_ofiles[dfd]; +#ifdef CAPABILITIES + seq_write_begin(&newfde->fde_seq); +#endif + memcpy(newfde, oldfde, fde_change_size); + oldfde->fde_file = NULL; + fdunused(fdp, dfd); +#ifdef CAPABILITIES + seq_write_end(&newfde->fde_seq); +#endif + break; + } + FILEDESC_XUNLOCK(fdp); + *indxp = indx; + return (0); +} + +/* + * This sysctl determines if we will allow a process to chroot(2) if it + * has a directory open: + * 0: disallowed for all processes. 
+ * 1: allowed for processes that were not already chroot(2)'ed. + * 2: allowed for all processes. + */ + +static int chroot_allow_open_directories = 1; + +SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, + &chroot_allow_open_directories, 0, + "Allow a process to chroot(2) if it has a directory open"); + +/* + * Helper function for raised chroot(2) security function: Refuse if + * any filedescriptors are open directories. + */ +static int +chroot_refuse_vdir_fds(struct filedesc *fdp) +{ + struct vnode *vp; + struct file *fp; + int fd; + + FILEDESC_LOCK_ASSERT(fdp); + + for (fd = 0; fd <= fdp->fd_lastfile; fd++) { + fp = fget_locked(fdp, fd); + if (fp == NULL) + continue; + if (fp->f_type == DTYPE_VNODE) { + vp = fp->f_vnode; + if (vp->v_type == VDIR) + return (EPERM); + } + } + return (0); +} + +/* + * Common routine for kern_chroot() and jail_attach(). The caller is + * responsible for invoking priv_check() and mac_vnode_check_chroot() to + * authorize this operation. + */ +int +pwd_chroot(struct thread *td, struct vnode *vp) +{ + struct filedesc *fdp; + struct vnode *oldvp; + int error; + + fdp = td->td_proc->p_fd; + FILEDESC_XLOCK(fdp); + if (chroot_allow_open_directories == 0 || + (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { + error = chroot_refuse_vdir_fds(fdp); + if (error != 0) { + FILEDESC_XUNLOCK(fdp); + return (error); + } + } + oldvp = fdp->fd_rdir; + vrefact(vp); + fdp->fd_rdir = vp; + if (fdp->fd_jdir == NULL) { + vrefact(vp); + fdp->fd_jdir = vp; + } + FILEDESC_XUNLOCK(fdp); + vrele(oldvp); + return (0); +} + +void +pwd_chdir(struct thread *td, struct vnode *vp) +{ + struct filedesc *fdp; + struct vnode *oldvp; + + fdp = td->td_proc->p_fd; + FILEDESC_XLOCK(fdp); + VNASSERT(vp->v_usecount > 0, vp, + ("chdir to a vnode with zero usecount")); + oldvp = fdp->fd_cdir; + fdp->fd_cdir = vp; + FILEDESC_XUNLOCK(fdp); + vrele(oldvp); +} + +/* + * Scan all active processes and prisons to see if any of them have a current + * or root directory of `olddp'. If so, replace them with the new mount point. 
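+ * Each replaced reference takes a fresh hold on 'newdp'; the matching
+ * vrele() calls on 'olddp' are batched and performed once all tables,
+ * prisons and the global rootvnode have been updated.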
+ */ +void +mountcheckdirs(struct vnode *olddp, struct vnode *newdp) +{ + struct filedesc *fdp; + struct prison *pr; + struct proc *p; + int nrele; + + if (vrefcnt(olddp) == 1) + return; + nrele = 0; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + continue; + FILEDESC_XLOCK(fdp); + if (fdp->fd_cdir == olddp) { + vrefact(newdp); + fdp->fd_cdir = newdp; + nrele++; + } + if (fdp->fd_rdir == olddp) { + vrefact(newdp); + fdp->fd_rdir = newdp; + nrele++; + } + if (fdp->fd_jdir == olddp) { + vrefact(newdp); + fdp->fd_jdir = newdp; + nrele++; + } + FILEDESC_XUNLOCK(fdp); + fddrop(fdp); + } + sx_sunlock(&allproc_lock); + if (rootvnode == olddp) { + vrefact(newdp); + rootvnode = newdp; + nrele++; + } + mtx_lock(&prison0.pr_mtx); + if (prison0.pr_root == olddp) { + vrefact(newdp); + prison0.pr_root = newdp; + nrele++; + } + mtx_unlock(&prison0.pr_mtx); + sx_slock(&allprison_lock); + TAILQ_FOREACH(pr, &allprison, pr_list) { + mtx_lock(&pr->pr_mtx); + if (pr->pr_root == olddp) { + vrefact(newdp); + pr->pr_root = newdp; + nrele++; + } + mtx_unlock(&pr->pr_mtx); + } + sx_sunlock(&allprison_lock); + while (nrele--) + vrele(olddp); +} + +struct filedesc_to_leader * +filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) +{ + struct filedesc_to_leader *fdtol; + + fdtol = malloc(sizeof(struct filedesc_to_leader), + M_FILEDESC_TO_LEADER, M_WAITOK); + fdtol->fdl_refcount = 1; + fdtol->fdl_holdcount = 0; + fdtol->fdl_wakeup = 0; + fdtol->fdl_leader = leader; + if (old != NULL) { + FILEDESC_XLOCK(fdp); + fdtol->fdl_next = old->fdl_next; + fdtol->fdl_prev = old; + old->fdl_next = fdtol; + fdtol->fdl_next->fdl_prev = fdtol; + FILEDESC_XUNLOCK(fdp); + } else { + fdtol->fdl_next = fdtol; + fdtol->fdl_prev = fdtol; + } + return (fdtol); +} + +static int +sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) +{ + struct filedesc *fdp; + int i, count, slots; + + if (*(int *)arg1 != 0) + return (EINVAL); + + fdp = curproc->p_fd; + count = 0; + FILEDESC_SLOCK(fdp); + slots = NDSLOTS(fdp->fd_lastfile + 1); + for (i = 0; i < slots; i++) + count += bitcountl(fdp->fd_map[i]); + FILEDESC_SUNLOCK(fdp); + + return (SYSCTL_OUT(req, &count, sizeof(count))); +} + +static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, + CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, + "Number of open file descriptors"); + +/* + * Get file structures globally. + */ +static int +sysctl_kern_file(SYSCTL_HANDLER_ARGS) +{ + struct xfile xf; + struct filedesc *fdp; + struct file *fp; + struct proc *p; + int error, n; + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + if (req->oldptr == NULL) { + n = 0; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_state == PRS_NEW) { + PROC_UNLOCK(p); + continue; + } + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + continue; + /* overestimates sparse tables. 
*/ + if (fdp->fd_lastfile > 0) + n += fdp->fd_lastfile; + fddrop(fdp); + } + sx_sunlock(&allproc_lock); + return (SYSCTL_OUT(req, 0, n * sizeof(xf))); + } + error = 0; + bzero(&xf, sizeof(xf)); + xf.xf_size = sizeof(xf); + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_state == PRS_NEW) { + PROC_UNLOCK(p); + continue; + } + if (p_cansee(req->td, p) != 0) { + PROC_UNLOCK(p); + continue; + } + xf.xf_pid = p->p_pid; + xf.xf_uid = p->p_ucred->cr_uid; + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + continue; + FILEDESC_SLOCK(fdp); + for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) { + if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) + continue; + xf.xf_fd = n; + xf.xf_file = (uintptr_t)fp; + xf.xf_data = (uintptr_t)fp->f_data; + xf.xf_vnode = (uintptr_t)fp->f_vnode; + xf.xf_type = (uintptr_t)fp->f_type; + xf.xf_count = fp->f_count; + xf.xf_msgcount = 0; + xf.xf_offset = foffset_get(fp); + xf.xf_flag = fp->f_flag; + error = SYSCTL_OUT(req, &xf, sizeof(xf)); + if (error) + break; + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); + if (error) + break; + } + sx_sunlock(&allproc_lock); + return (error); +} + +SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, + 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); + +#ifdef KINFO_FILE_SIZE +CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); +#endif + +static int +xlate_fflags(int fflags) +{ + static const struct { + int fflag; + int kf_fflag; + } fflags_table[] = { + { FAPPEND, KF_FLAG_APPEND }, + { FASYNC, KF_FLAG_ASYNC }, + { FFSYNC, KF_FLAG_FSYNC }, + { FHASLOCK, KF_FLAG_HASLOCK }, + { FNONBLOCK, KF_FLAG_NONBLOCK }, + { FREAD, KF_FLAG_READ }, + { FWRITE, KF_FLAG_WRITE }, + { O_CREAT, KF_FLAG_CREAT }, + { O_DIRECT, KF_FLAG_DIRECT }, + { O_EXCL, KF_FLAG_EXCL }, + { O_EXEC, KF_FLAG_EXEC }, + { O_EXLOCK, KF_FLAG_EXLOCK }, + { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, + { O_SHLOCK, KF_FLAG_SHLOCK }, + { O_TRUNC, KF_FLAG_TRUNC } + }; + unsigned int i; + int kflags; + + kflags = 0; + for (i = 0; i < nitems(fflags_table); i++) + if (fflags & fflags_table[i].fflag) + kflags |= fflags_table[i].kf_fflag; + return (kflags); +} + +/* Trim unused data from kf_path by truncating the structure size. */ +void +pack_kinfo(struct kinfo_file *kif) +{ + + kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + + strlen(kif->kf_path) + 1; + kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); +} + +static void +export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, + struct kinfo_file *kif, struct filedesc *fdp, int flags) +{ + int error; + + bzero(kif, sizeof(*kif)); + + /* Set a default type to allow for empty fill_kinfo() methods. */ + kif->kf_type = KF_TYPE_UNKNOWN; + kif->kf_flags = xlate_fflags(fp->f_flag); + if (rightsp != NULL) + kif->kf_cap_rights = *rightsp; + else + cap_rights_init(&kif->kf_cap_rights); + kif->kf_fd = fd; + kif->kf_ref_count = fp->f_count; + kif->kf_offset = foffset_get(fp); + + /* + * This may drop the filedesc lock, so the 'fp' cannot be + * accessed after this call. 
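+	 * (fo_fill_kinfo() methods may sleep, e.g. while resolving a
+	 * vnode path, which is why they are permitted to drop it.)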
+ */ + error = fo_fill_kinfo(fp, kif, fdp); + if (error == 0) + kif->kf_status |= KF_ATTR_VALID; + if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) + pack_kinfo(kif); + else + kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); +} + +static void +export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, + struct kinfo_file *kif, int flags) +{ + int error; + + bzero(kif, sizeof(*kif)); + + kif->kf_type = KF_TYPE_VNODE; + error = vn_fill_kinfo_vnode(vp, kif); + if (error == 0) + kif->kf_status |= KF_ATTR_VALID; + kif->kf_flags = xlate_fflags(fflags); + cap_rights_init(&kif->kf_cap_rights); + kif->kf_fd = fd; + kif->kf_ref_count = -1; + kif->kf_offset = -1; + if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) + pack_kinfo(kif); + else + kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); + vrele(vp); +} + +struct export_fd_buf { + struct filedesc *fdp; + struct sbuf *sb; + ssize_t remainder; + struct kinfo_file kif; + int flags; +}; + +static int +export_kinfo_to_sb(struct export_fd_buf *efbuf) +{ + struct kinfo_file *kif; + + kif = &efbuf->kif; + if (efbuf->remainder != -1) { + if (efbuf->remainder < kif->kf_structsize) { + /* Terminate export. */ + efbuf->remainder = 0; + return (0); + } + efbuf->remainder -= kif->kf_structsize; + } + return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM); +} + +static int +export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, + struct export_fd_buf *efbuf) +{ + int error; + + if (efbuf->remainder == 0) + return (0); + export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, + efbuf->flags); + FILEDESC_SUNLOCK(efbuf->fdp); + error = export_kinfo_to_sb(efbuf); + FILEDESC_SLOCK(efbuf->fdp); + return (error); +} + +static int +export_vnode_to_sb(struct vnode *vp, int fd, int fflags, + struct export_fd_buf *efbuf) +{ + int error; + + if (efbuf->remainder == 0) + return (0); + if (efbuf->fdp != NULL) + FILEDESC_SUNLOCK(efbuf->fdp); + export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); + error = export_kinfo_to_sb(efbuf); + if (efbuf->fdp != NULL) + FILEDESC_SLOCK(efbuf->fdp); + return (error); +} + +/* + * Store a process file descriptor information to sbuf. + * + * Takes a locked proc as argument, and returns with the proc unlocked. + */ +int +kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, + int flags) +{ + struct file *fp; + struct filedesc *fdp; + struct export_fd_buf *efbuf; + struct vnode *cttyvp, *textvp, *tracevp; + int error, i; + cap_rights_t rights; + + PROC_LOCK_ASSERT(p, MA_OWNED); + + /* ktrace vnode */ + tracevp = p->p_tracevp; + if (tracevp != NULL) + vrefact(tracevp); + /* text vnode */ + textvp = p->p_textvp; + if (textvp != NULL) + vrefact(textvp); + /* Controlling tty. 
*/ + cttyvp = NULL; + if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { + cttyvp = p->p_pgrp->pg_session->s_ttyvp; + if (cttyvp != NULL) + vrefact(cttyvp); + } + fdp = fdhold(p); + PROC_UNLOCK(p); + efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); + efbuf->fdp = NULL; + efbuf->sb = sb; + efbuf->remainder = maxlen; + efbuf->flags = flags; + if (tracevp != NULL) + export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE, + efbuf); + if (textvp != NULL) + export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf); + if (cttyvp != NULL) + export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE, + efbuf); + error = 0; + if (fdp == NULL) + goto fail; + efbuf->fdp = fdp; + FILEDESC_SLOCK(fdp); + /* working directory */ + if (fdp->fd_cdir != NULL) { + vrefact(fdp->fd_cdir); + export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); + } + /* root directory */ + if (fdp->fd_rdir != NULL) { + vrefact(fdp->fd_rdir); + export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf); + } + /* jail directory */ + if (fdp->fd_jdir != NULL) { + vrefact(fdp->fd_jdir); + export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf); + } + for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { + if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) + continue; +#ifdef CAPABILITIES + rights = *cap_rights(fdp, i); +#else /* !CAPABILITIES */ + rights = cap_no_rights; +#endif + /* + * Create sysctl entry. It is OK to drop the filedesc + * lock inside of export_file_to_sb() as we will + * re-validate and re-evaluate its properties when the + * loop continues. + */ + error = export_file_to_sb(fp, i, &rights, efbuf); + if (error != 0 || efbuf->remainder == 0) + break; + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); +fail: + free(efbuf, M_TEMP); + return (error); +} + +#define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) + +/* + * Get per-process file descriptors for use by procstat(1), et al. + */ +static int +sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) +{ + struct sbuf sb; + struct proc *p; + ssize_t maxlen; + int error, error2, *name; + + name = (int *)arg1; + + sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); + sbuf_clear_flags(&sb, SBUF_INCLUDENUL); + error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); + if (error != 0) { + sbuf_delete(&sb); + return (error); + } + maxlen = req->oldptr != NULL ? req->oldlen : -1; + error = kern_proc_filedesc_out(p, &sb, maxlen, + KERN_FILEDESC_PACK_KINFO); + error2 = sbuf_finish(&sb); + sbuf_delete(&sb); + return (error != 0 ? 
error : error2); +} + +#ifdef COMPAT_FREEBSD7 +#ifdef KINFO_OFILE_SIZE +CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); +#endif + +static void +kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) +{ + + okif->kf_structsize = sizeof(*okif); + okif->kf_type = kif->kf_type; + okif->kf_fd = kif->kf_fd; + okif->kf_ref_count = kif->kf_ref_count; + okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | + KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | + KF_FLAG_DIRECT | KF_FLAG_HASLOCK); + okif->kf_offset = kif->kf_offset; + if (kif->kf_type == KF_TYPE_VNODE) + okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type; + else + okif->kf_vnode_type = KF_VTYPE_VNON; + strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); + if (kif->kf_type == KF_TYPE_SOCKET) { + okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0; + okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0; + okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0; + okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local; + okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer; + } else { + okif->kf_sa_local.ss_family = AF_UNSPEC; + okif->kf_sa_peer.ss_family = AF_UNSPEC; + } +} + +static int +export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, + struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req) +{ + int error; + + vrefact(vp); + FILEDESC_SUNLOCK(fdp); + export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO); + kinfo_to_okinfo(kif, okif); + error = SYSCTL_OUT(req, okif, sizeof(*okif)); + FILEDESC_SLOCK(fdp); + return (error); +} + +/* + * Get per-process file descriptors for use by procstat(1), et al. + */ +static int +sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) +{ + struct kinfo_ofile *okif; + struct kinfo_file *kif; + struct filedesc *fdp; + int error, i, *name; + struct file *fp; + struct proc *p; + + name = (int *)arg1; + error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); + if (error != 0) + return (error); + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + return (ENOENT); + kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); + okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); + FILEDESC_SLOCK(fdp); + if (fdp->fd_cdir != NULL) + export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, + okif, fdp, req); + if (fdp->fd_rdir != NULL) + export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, + okif, fdp, req); + if (fdp->fd_jdir != NULL) + export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, + okif, fdp, req); + for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { + if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) + continue; + export_file_to_kinfo(fp, i, NULL, kif, fdp, + KERN_FILEDESC_PACK_KINFO); + FILEDESC_SUNLOCK(fdp); + kinfo_to_okinfo(kif, okif); + error = SYSCTL_OUT(req, okif, sizeof(*okif)); + FILEDESC_SLOCK(fdp); + if (error) + break; + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); + free(kif, M_TEMP); + free(okif, M_TEMP); + return (0); +} + +static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, + CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, + "Process ofiledesc entries"); +#endif /* COMPAT_FREEBSD7 */ + +int +vntype_to_kinfo(int vtype) +{ + struct { + int vtype; + int kf_vtype; + } vtypes_table[] = { + { VBAD, KF_VTYPE_VBAD }, + { VBLK, KF_VTYPE_VBLK }, + { VCHR, KF_VTYPE_VCHR }, + { VDIR, KF_VTYPE_VDIR }, + { VFIFO, KF_VTYPE_VFIFO }, + { VLNK, KF_VTYPE_VLNK }, + { VNON, KF_VTYPE_VNON }, + { VREG, KF_VTYPE_VREG }, + { VSOCK, KF_VTYPE_VSOCK } + }; + 
unsigned int i; + + /* + * Perform vtype translation. + */ + for (i = 0; i < nitems(vtypes_table); i++) + if (vtypes_table[i].vtype == vtype) + return (vtypes_table[i].kf_vtype); + + return (KF_VTYPE_UNKNOWN); +} + +static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, + CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, + "Process filedesc entries"); + +/* + * Store a process current working directory information to sbuf. + * + * Takes a locked proc as argument, and returns with the proc unlocked. + */ +int +kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) +{ + struct filedesc *fdp; + struct export_fd_buf *efbuf; + int error; + + PROC_LOCK_ASSERT(p, MA_OWNED); + + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + return (EINVAL); + + efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); + efbuf->fdp = fdp; + efbuf->sb = sb; + efbuf->remainder = maxlen; + + FILEDESC_SLOCK(fdp); + if (fdp->fd_cdir == NULL) + error = EINVAL; + else { + vrefact(fdp->fd_cdir); + error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, + FREAD, efbuf); + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); + free(efbuf, M_TEMP); + return (error); +} + +/* + * Get per-process current working directory. + */ +static int +sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) +{ + struct sbuf sb; + struct proc *p; + ssize_t maxlen; + int error, error2, *name; + + name = (int *)arg1; + + sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); + sbuf_clear_flags(&sb, SBUF_INCLUDENUL); + error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); + if (error != 0) { + sbuf_delete(&sb); + return (error); + } + maxlen = req->oldptr != NULL ? req->oldlen : -1; + error = kern_proc_cwd_out(p, &sb, maxlen); + error2 = sbuf_finish(&sb); + sbuf_delete(&sb); + return (error != 0 ? error : error2); +} + +static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, + sysctl_kern_proc_cwd, "Process current working directory"); + +#ifdef DDB +/* + * For the purposes of debugging, generate a human-readable string for the + * file type. + */ +static const char * +file_type_to_name(short type) +{ + + switch (type) { + case 0: + return ("zero"); + case DTYPE_VNODE: + return ("vnode"); + case DTYPE_SOCKET: + return ("socket"); + case DTYPE_PIPE: + return ("pipe"); + case DTYPE_FIFO: + return ("fifo"); + case DTYPE_KQUEUE: + return ("kqueue"); + case DTYPE_CRYPTO: + return ("crypto"); + case DTYPE_MQUEUE: + return ("mqueue"); + case DTYPE_SHM: + return ("shm"); + case DTYPE_SEM: + return ("ksem"); + case DTYPE_PTS: + return ("pts"); + case DTYPE_DEV: + return ("dev"); + case DTYPE_PROCDESC: + return ("proc"); + case DTYPE_LINUXEFD: + return ("levent"); + case DTYPE_LINUXTFD: + return ("ltimer"); + default: + return ("unkn"); + } +} + +/* + * For the purposes of debugging, identify a process (if any, perhaps one of + * many) that references the passed file in its file descriptor array. Return + * NULL if none. 
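+ * This is only reached from the DDB commands below, so it deliberately
+ * walks the process list without taking any locks.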
+ */ +static struct proc * +file_to_first_proc(struct file *fp) +{ + struct filedesc *fdp; + struct proc *p; + int n; + + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + fdp = p->p_fd; + if (fdp == NULL) + continue; + for (n = 0; n <= fdp->fd_lastfile; n++) { + if (fp == fdp->fd_ofiles[n].fde_file) + return (p); + } + } + return (NULL); +} + +static void +db_print_file(struct file *fp, int header) +{ +#define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4)) + struct proc *p; + + if (header) + db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n", + XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag", + "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID", + "FCmd"); + p = file_to_first_proc(fp); + db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH, + fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data, + fp->f_flag, 0, fp->f_count, 0, XPTRWIDTH, fp->f_vnode, + p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); + +#undef XPTRWIDTH +} + +DB_SHOW_COMMAND(file, db_show_file) +{ + struct file *fp; + + if (!have_addr) { + db_printf("usage: show file \n"); + return; + } + fp = (struct file *)addr; + db_print_file(fp, 1); +} + +DB_SHOW_COMMAND(files, db_show_files) +{ + struct filedesc *fdp; + struct file *fp; + struct proc *p; + int header; + int n; + + header = 1; + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + if ((fdp = p->p_fd) == NULL) + continue; + for (n = 0; n <= fdp->fd_lastfile; ++n) { + if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) + continue; + db_print_file(fp, header); + header = 0; + } + } +} +#endif + +SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, + &maxfilesperproc, 0, "Maximum files allowed open per process"); + +SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, + &maxfiles, 0, "Maximum number of files"); + +SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, + __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); + +/* ARGSUSED*/ +static void +filelistinit(void *dummy) +{ + + file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); +} +SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); + +/*-------------------------------------------------------------------*/ + +static int +badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) +{ + + return (EINVAL); +} + +static int +badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, + struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + + return (0); +} + +static int +badfo_kqfilter(struct file *fp, struct knote *kn) +{ + + return (EBADF); +} + +static int +badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, + struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_close(struct file *fp, struct thread *td) +{ + + return (0); +} + +static int +badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, + struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred 
*active_cred, + struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, + struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, + struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) +{ + + return (0); +} + +struct fileops badfileops = { + .fo_read = badfo_readwrite, + .fo_write = badfo_readwrite, + .fo_truncate = badfo_truncate, + .fo_ioctl = badfo_ioctl, + .fo_poll = badfo_poll, + .fo_kqfilter = badfo_kqfilter, + .fo_stat = badfo_stat, + .fo_close = badfo_close, + .fo_chmod = badfo_chmod, + .fo_chown = badfo_chown, + .fo_sendfile = badfo_sendfile, + .fo_fill_kinfo = badfo_fill_kinfo, +}; + +int +invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + return (EOPNOTSUPP); +} + +int +invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) +{ + + return (EINVAL); +} + +int +invfo_ioctl(struct file *fp, u_long com, void *data, + struct ucred *active_cred, struct thread *td) +{ + + return (ENOTTY); +} + +int +invfo_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + + return (poll_no_poll(events)); +} + +int +invfo_kqfilter(struct file *fp, struct knote *kn) +{ + + return (EINVAL); +} + +int +invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, + struct thread *td) +{ + + return (EINVAL); +} + +int +invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, + struct thread *td) +{ + + return (EINVAL); +} + +int +invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, + struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, + struct thread *td) +{ + + return (EINVAL); +} + +/*-------------------------------------------------------------------*/ + +/* + * File Descriptor pseudo-device driver (/dev/fd/). + * + * Opening minor device N dup()s the file (if any) connected to file + * descriptor N belonging to the calling process. Note that this driver + * consists of only the ``open()'' routine, because all subsequent + * references to this file will be direct to the other driver. + * + * XXX: we could give this one a cloning event handler if necessary. + */ + +/* ARGSUSED */ +static int +fdopen(struct cdev *dev, int mode, int type, struct thread *td) +{ + + /* + * XXX Kludge: set curthread->td_dupfd to contain the value of the + * the file descriptor being sought for duplication. The error + * return ensures that the vnode for this device will be released + * by vn_open. Open will detect this special error and take the + * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN + * will simply report the error. 
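+	 *
+	 * Seen from userland the net effect is that, for example,
+	 *
+	 *	fd = open("/dev/fd/0", O_RDONLY);
+	 *
+	 * behaves like dup(0), provided the requested access mode is a
+	 * subset of how descriptor 0 is already open (see the ENODEV
+	 * case in dupfdopen() above).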
+ */ + td->td_dupfd = dev2unit(dev); + return (ENODEV); +} + +static struct cdevsw fildesc_cdevsw = { + .d_version = D_VERSION, + .d_open = fdopen, + .d_name = "FD", +}; + +static void +fildesc_drvinit(void *unused) +{ + struct cdev *dev; + + dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, + UID_ROOT, GID_WHEEL, 0666, "fd/0"); + make_dev_alias(dev, "stdin"); + dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, + UID_ROOT, GID_WHEEL, 0666, "fd/1"); + make_dev_alias(dev, "stdout"); + dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, + UID_ROOT, GID_WHEEL, 0666, "fd/2"); + make_dev_alias(dev, "stderr"); +} + +SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); diff --git a/freebsd/sys/kern/kern_lock.c b/freebsd/sys/kern/kern_lock.c new file mode 100644 index 00000000..d769a185 --- /dev/null +++ b/freebsd/sys/kern/kern_lock.c @@ -0,0 +1,1719 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2008 Attilio Rao + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice(s), this list of conditions and the following disclaimer as + * the first lines of this file unmodified other than the possible + * addition of one or more copyright notices. + * 2. Redistributions in binary form must reproduce the above copyright + * notice(s), this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ + +#include "opt_ddb.h" +#include "opt_hwpmc_hooks.h" + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef DEBUG_LOCKS +#include +#endif +#include +#include + +#include + +#ifdef DDB +#include +#endif + +#ifdef HWPMC_HOOKS +#include +PMC_SOFT_DECLARE( , , lock, failed); +#endif + +CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) == + (LK_ADAPTIVE | LK_NOSHARE)); +CTASSERT(LK_UNLOCKED == (LK_UNLOCKED & + ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS))); + +#define SQ_EXCLUSIVE_QUEUE 0 +#define SQ_SHARED_QUEUE 1 + +#ifndef INVARIANTS +#define _lockmgr_assert(lk, what, file, line) +#endif + +#define TD_SLOCKS_INC(td) ((td)->td_lk_slocks++) +#define TD_SLOCKS_DEC(td) ((td)->td_lk_slocks--) + +#ifndef DEBUG_LOCKS +#define STACK_PRINT(lk) +#define STACK_SAVE(lk) +#define STACK_ZERO(lk) +#else +#define STACK_PRINT(lk) stack_print_ddb(&(lk)->lk_stack) +#define STACK_SAVE(lk) stack_save(&(lk)->lk_stack) +#define STACK_ZERO(lk) stack_zero(&(lk)->lk_stack) +#endif + +#define LOCK_LOG2(lk, string, arg1, arg2) \ + if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ + CTR2(KTR_LOCK, (string), (arg1), (arg2)) +#define LOCK_LOG3(lk, string, arg1, arg2, arg3) \ + if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ + CTR3(KTR_LOCK, (string), (arg1), (arg2), (arg3)) + +#define GIANT_DECLARE \ + int _i = 0; \ + WITNESS_SAVE_DECL(Giant) +#define GIANT_RESTORE() do { \ + if (_i > 0) { \ + while (_i--) \ + mtx_lock(&Giant); \ + WITNESS_RESTORE(&Giant.lock_object, Giant); \ + } \ +} while (0) +#define GIANT_SAVE() do { \ + if (mtx_owned(&Giant)) { \ + WITNESS_SAVE(&Giant.lock_object, Giant); \ + while (mtx_owned(&Giant)) { \ + _i++; \ + mtx_unlock(&Giant); \ + } \ + } \ +} while (0) + +static bool __always_inline +LK_CAN_SHARE(uintptr_t x, int flags, bool fp) +{ + + if ((x & (LK_SHARE | LK_EXCLUSIVE_WAITERS | LK_EXCLUSIVE_SPINNERS)) == + LK_SHARE) + return (true); + if (fp || (!(x & LK_SHARE))) + return (false); + if ((curthread->td_lk_slocks != 0 && !(flags & LK_NODDLKTREAT)) || + (curthread->td_pflags & TDP_DEADLKTREAT)) + return (true); + return (false); +} + +#define LK_TRYOP(x) \ + ((x) & LK_NOWAIT) + +#define LK_CAN_WITNESS(x) \ + (((x) & LK_NOWITNESS) == 0 && !LK_TRYOP(x)) +#define LK_TRYWIT(x) \ + (LK_TRYOP(x) ? 
LOP_TRYLOCK : 0) + +#define LK_CAN_ADAPT(lk, f) \ + (((lk)->lock_object.lo_flags & LK_ADAPTIVE) != 0 && \ + ((f) & LK_SLEEPFAIL) == 0) + +#define lockmgr_disowned(lk) \ + (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == LK_KERNPROC) + +#define lockmgr_xlocked_v(v) \ + (((v) & ~(LK_FLAGMASK & ~LK_SHARE)) == (uintptr_t)curthread) + +#define lockmgr_xlocked(lk) lockmgr_xlocked_v((lk)->lk_lock) + +static void assert_lockmgr(const struct lock_object *lock, int how); +#ifdef DDB +static void db_show_lockmgr(const struct lock_object *lock); +#endif +static void lock_lockmgr(struct lock_object *lock, uintptr_t how); +#ifdef KDTRACE_HOOKS +static int owner_lockmgr(const struct lock_object *lock, + struct thread **owner); +#endif +static uintptr_t unlock_lockmgr(struct lock_object *lock); + +struct lock_class lock_class_lockmgr = { + .lc_name = "lockmgr", + .lc_flags = LC_RECURSABLE | LC_SLEEPABLE | LC_SLEEPLOCK | LC_UPGRADABLE, + .lc_assert = assert_lockmgr, +#ifdef DDB + .lc_ddb_show = db_show_lockmgr, +#endif + .lc_lock = lock_lockmgr, + .lc_unlock = unlock_lockmgr, +#ifdef KDTRACE_HOOKS + .lc_owner = owner_lockmgr, +#endif +}; + +struct lockmgr_wait { + const char *iwmesg; + int ipri; + int itimo; +}; + +static bool __always_inline lockmgr_slock_try(struct lock *lk, uintptr_t *xp, + int flags, bool fp); +static bool __always_inline lockmgr_sunlock_try(struct lock *lk, uintptr_t *xp); + +static void +lockmgr_exit(u_int flags, struct lock_object *ilk, int wakeup_swapper) +{ + struct lock_class *class; + + if (flags & LK_INTERLOCK) { + class = LOCK_CLASS(ilk); + class->lc_unlock(ilk); + } + + if (__predict_false(wakeup_swapper)) + kick_proc0(); +} + +static void +lockmgr_note_shared_acquire(struct lock *lk, int contested, + uint64_t waittime, const char *file, int line, int flags) +{ + + LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(lockmgr__acquire, lk, contested, + waittime, file, line, LOCKSTAT_READER); + LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, file, line); + WITNESS_LOCK(&lk->lock_object, LK_TRYWIT(flags), file, line); + TD_LOCKS_INC(curthread); + TD_SLOCKS_INC(curthread); + STACK_SAVE(lk); +} + +static void +lockmgr_note_shared_release(struct lock *lk, const char *file, int line) +{ + + LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk, LOCKSTAT_READER); + WITNESS_UNLOCK(&lk->lock_object, 0, file, line); + LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, file, line); + TD_LOCKS_DEC(curthread); + TD_SLOCKS_DEC(curthread); +} + +static void +lockmgr_note_exclusive_acquire(struct lock *lk, int contested, + uint64_t waittime, const char *file, int line, int flags) +{ + + LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(lockmgr__acquire, lk, contested, + waittime, file, line, LOCKSTAT_WRITER); + LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); + WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, + line); + TD_LOCKS_INC(curthread); + STACK_SAVE(lk); +} + +static void +lockmgr_note_exclusive_release(struct lock *lk, const char *file, int line) +{ + + LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk, LOCKSTAT_WRITER); + LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file, + line); + WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); + TD_LOCKS_DEC(curthread); +} + +static __inline struct thread * +lockmgr_xholder(const struct lock *lk) +{ + uintptr_t x; + + x = lk->lk_lock; + return ((x & LK_SHARE) ? NULL : (struct thread *)LK_HOLDER(x)); +} + +/* + * It assumes sleepq_lock held and returns with this one unheld. 
+ * It also assumes the generic interlock is sane and previously checked. + * If LK_INTERLOCK is specified the interlock is not reacquired after the + * sleep. + */ +static __inline int +sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk, + const char *wmesg, int pri, int timo, int queue) +{ + GIANT_DECLARE; + struct lock_class *class; + int catch, error; + + class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL; + catch = pri & PCATCH; + pri &= PRIMASK; + error = 0; + + LOCK_LOG3(lk, "%s: %p blocking on the %s sleepqueue", __func__, lk, + (queue == SQ_EXCLUSIVE_QUEUE) ? "exclusive" : "shared"); + + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0) + lk->lk_exslpfail++; + GIANT_SAVE(); + sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ? + SLEEPQ_INTERRUPTIBLE : 0), queue); + if ((flags & LK_TIMELOCK) && timo) + sleepq_set_timeout(&lk->lock_object, timo); + + /* + * Decisional switch for real sleeping. + */ + if ((flags & LK_TIMELOCK) && timo && catch) + error = sleepq_timedwait_sig(&lk->lock_object, pri); + else if ((flags & LK_TIMELOCK) && timo) + error = sleepq_timedwait(&lk->lock_object, pri); + else if (catch) + error = sleepq_wait_sig(&lk->lock_object, pri); + else + sleepq_wait(&lk->lock_object, pri); + GIANT_RESTORE(); + if ((flags & LK_SLEEPFAIL) && error == 0) + error = ENOLCK; + + return (error); +} + +static __inline int +wakeupshlk(struct lock *lk, const char *file, int line) +{ + uintptr_t v, x, orig_x; + u_int realexslp; + int queue, wakeup_swapper; + + wakeup_swapper = 0; + for (;;) { + x = lk->lk_lock; + if (lockmgr_sunlock_try(lk, &x)) + break; + + /* + * We should have a sharer with waiters, so enter the hard + * path in order to handle wakeups correctly. + */ + sleepq_lock(&lk->lock_object); + orig_x = lk->lk_lock; +retry_sleepq: + x = orig_x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); + v = LK_UNLOCKED; + + /* + * If the lock has exclusive waiters, give them preference in + * order to avoid deadlock with shared runners up. + * If interruptible sleeps left the exclusive queue empty + * avoid a starvation for the threads sleeping on the shared + * queue by giving them precedence and cleaning up the + * exclusive waiters bit anyway. + * Please note that lk_exslpfail count may be lying about + * the real number of waiters with the LK_SLEEPFAIL flag on + * because they may be used in conjunction with interruptible + * sleeps so lk_exslpfail might be considered an 'upper limit' + * bound, including the edge cases. + */ + realexslp = sleepq_sleepcnt(&lk->lock_object, + SQ_EXCLUSIVE_QUEUE); + if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { + if (lk->lk_exslpfail < realexslp) { + lk->lk_exslpfail = 0; + queue = SQ_EXCLUSIVE_QUEUE; + v |= (x & LK_SHARED_WAITERS); + } else { + lk->lk_exslpfail = 0; + LOCK_LOG2(lk, + "%s: %p has only LK_SLEEPFAIL sleepers", + __func__, lk); + LOCK_LOG2(lk, + "%s: %p waking up threads on the exclusive queue", + __func__, lk); + wakeup_swapper = + sleepq_broadcast(&lk->lock_object, + SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); + queue = SQ_SHARED_QUEUE; + } + + } else { + + /* + * Exclusive waiters sleeping with LK_SLEEPFAIL on + * and using interruptible sleeps/timeout may have + * left spourious lk_exslpfail counts on, so clean + * it up anyway. 
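+ * (A LK_SLEEPFAIL sleeper that is interrupted or times out does not
+ * clear its own lk_exslpfail increment, which is why the counter can
+ * be stale; with the exclusive queue known to be empty it is safe to
+ * simply zero it here.)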
+ */ + lk->lk_exslpfail = 0; + queue = SQ_SHARED_QUEUE; + } + + if (lockmgr_sunlock_try(lk, &orig_x)) { + sleepq_release(&lk->lock_object); + break; + } + + x |= LK_SHARERS_LOCK(1); + if (!atomic_fcmpset_rel_ptr(&lk->lk_lock, &x, v)) { + orig_x = x; + goto retry_sleepq; + } + LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", + __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : + "exclusive"); + wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, + 0, queue); + sleepq_release(&lk->lock_object); + break; + } + + lockmgr_note_shared_release(lk, file, line); + return (wakeup_swapper); +} + +static void +assert_lockmgr(const struct lock_object *lock, int what) +{ + + panic("lockmgr locks do not support assertions"); +} + +static void +lock_lockmgr(struct lock_object *lock, uintptr_t how) +{ + + panic("lockmgr locks do not support sleep interlocking"); +} + +static uintptr_t +unlock_lockmgr(struct lock_object *lock) +{ + + panic("lockmgr locks do not support sleep interlocking"); +} + +#ifdef KDTRACE_HOOKS +static int +owner_lockmgr(const struct lock_object *lock, struct thread **owner) +{ + + panic("lockmgr locks do not support owner inquiring"); +} +#endif + +void +lockinit(struct lock *lk, int pri, const char *wmesg, int timo, int flags) +{ + int iflags; + + MPASS((flags & ~LK_INIT_MASK) == 0); + ASSERT_ATOMIC_LOAD_PTR(lk->lk_lock, + ("%s: lockmgr not aligned for %s: %p", __func__, wmesg, + &lk->lk_lock)); + + iflags = LO_SLEEPABLE | LO_UPGRADABLE; + if (flags & LK_CANRECURSE) + iflags |= LO_RECURSABLE; + if ((flags & LK_NODUP) == 0) + iflags |= LO_DUPOK; + if (flags & LK_NOPROFILE) + iflags |= LO_NOPROFILE; + if ((flags & LK_NOWITNESS) == 0) + iflags |= LO_WITNESS; + if (flags & LK_QUIET) + iflags |= LO_QUIET; + if (flags & LK_IS_VNODE) + iflags |= LO_IS_VNODE; + iflags |= flags & (LK_ADAPTIVE | LK_NOSHARE); + + lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL, iflags); + lk->lk_lock = LK_UNLOCKED; + lk->lk_recurse = 0; + lk->lk_exslpfail = 0; + lk->lk_timo = timo; + lk->lk_pri = pri; + STACK_ZERO(lk); +} + +/* + * XXX: Gross hacks to manipulate external lock flags after + * initialization. Used for certain vnode and buf locks. + */ +void +lockallowshare(struct lock *lk) +{ + + lockmgr_assert(lk, KA_XLOCKED); + lk->lock_object.lo_flags &= ~LK_NOSHARE; +} + +void +lockdisableshare(struct lock *lk) +{ + + lockmgr_assert(lk, KA_XLOCKED); + lk->lock_object.lo_flags |= LK_NOSHARE; +} + +void +lockallowrecurse(struct lock *lk) +{ + + lockmgr_assert(lk, KA_XLOCKED); + lk->lock_object.lo_flags |= LO_RECURSABLE; +} + +void +lockdisablerecurse(struct lock *lk) +{ + + lockmgr_assert(lk, KA_XLOCKED); + lk->lock_object.lo_flags &= ~LO_RECURSABLE; +} + +void +lockdestroy(struct lock *lk) +{ + + KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held")); + KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed")); + KASSERT(lk->lk_exslpfail == 0, ("lockmgr still exclusive waiters")); + lock_destroy(&lk->lock_object); +} + +static bool __always_inline +lockmgr_slock_try(struct lock *lk, uintptr_t *xp, int flags, bool fp) +{ + + /* + * If no other thread has an exclusive lock, or + * no exclusive waiter is present, bump the count of + * sharers. Since we have to preserve the state of + * waiters, if we fail to acquire the shared lock + * loop back and retry. 
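+ * Note that atomic_fcmpset_acq_ptr() reloads *xp with the current
+ * lock word when it fails, so every iteration re-evaluates
+ * LK_CAN_SHARE() against fresh state.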
+ */ + *xp = lk->lk_lock; + while (LK_CAN_SHARE(*xp, flags, fp)) { + if (atomic_fcmpset_acq_ptr(&lk->lk_lock, xp, + *xp + LK_ONE_SHARER)) { + return (true); + } + } + return (false); +} + +static bool __always_inline +lockmgr_sunlock_try(struct lock *lk, uintptr_t *xp) +{ + + for (;;) { + if (LK_SHARERS(*xp) > 1 || !(*xp & LK_ALL_WAITERS)) { + if (atomic_fcmpset_rel_ptr(&lk->lk_lock, xp, + *xp - LK_ONE_SHARER)) + return (true); + continue; + } + break; + } + return (false); +} + +static __noinline int +lockmgr_slock_hard(struct lock *lk, u_int flags, struct lock_object *ilk, + const char *file, int line, struct lockmgr_wait *lwa) +{ + uintptr_t tid, x; + int error = 0; + const char *iwmesg; + int ipri, itimo; + +#ifdef KDTRACE_HOOKS + uint64_t sleep_time = 0; +#endif +#ifdef LOCK_PROFILING + uint64_t waittime = 0; + int contested = 0; +#endif + + if (__predict_false(panicstr != NULL)) + goto out; + + tid = (uintptr_t)curthread; + + if (LK_CAN_WITNESS(flags)) + WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, + file, line, flags & LK_INTERLOCK ? ilk : NULL); + for (;;) { + if (lockmgr_slock_try(lk, &x, flags, false)) + break; +#ifdef HWPMC_HOOKS + PMC_SOFT_CALL( , , lock, failed); +#endif + lock_profile_obtain_lock_failed(&lk->lock_object, + &contested, &waittime); + + /* + * If the lock is already held by curthread in + * exclusive way avoid a deadlock. + */ + if (LK_HOLDER(x) == tid) { + LOCK_LOG2(lk, + "%s: %p already held in exclusive mode", + __func__, lk); + error = EDEADLK; + break; + } + + /* + * If the lock is expected to not sleep just give up + * and return. + */ + if (LK_TRYOP(flags)) { + LOCK_LOG2(lk, "%s: %p fails the try operation", + __func__, lk); + error = EBUSY; + break; + } + + /* + * Acquire the sleepqueue chain lock because we + * probabilly will need to manipulate waiters flags. + */ + sleepq_lock(&lk->lock_object); + x = lk->lk_lock; +retry_sleepq: + + /* + * if the lock can be acquired in shared mode, try + * again. + */ + if (LK_CAN_SHARE(x, flags, false)) { + sleepq_release(&lk->lock_object); + continue; + } + + /* + * Try to set the LK_SHARED_WAITERS flag. If we fail, + * loop back and retry. + */ + if ((x & LK_SHARED_WAITERS) == 0) { + if (!atomic_fcmpset_acq_ptr(&lk->lk_lock, &x, + x | LK_SHARED_WAITERS)) { + goto retry_sleepq; + } + LOCK_LOG2(lk, "%s: %p set shared waiters flag", + __func__, lk); + } + + if (lwa == NULL) { + iwmesg = lk->lock_object.lo_name; + ipri = lk->lk_pri; + itimo = lk->lk_timo; + } else { + iwmesg = lwa->iwmesg; + ipri = lwa->ipri; + itimo = lwa->itimo; + } + + /* + * As far as we have been unable to acquire the + * shared lock and the shared waiters flag is set, + * we will sleep. + */ +#ifdef KDTRACE_HOOKS + sleep_time -= lockstat_nsecs(&lk->lock_object); +#endif + error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, + SQ_SHARED_QUEUE); +#ifdef KDTRACE_HOOKS + sleep_time += lockstat_nsecs(&lk->lock_object); +#endif + flags &= ~LK_INTERLOCK; + if (error) { + LOCK_LOG3(lk, + "%s: interrupted sleep for %p with %d", + __func__, lk, error); + break; + } + LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", + __func__, lk); + } + if (error == 0) { +#ifdef KDTRACE_HOOKS + if (sleep_time != 0) + LOCKSTAT_RECORD4(lockmgr__block, lk, sleep_time, + LOCKSTAT_READER, (x & LK_SHARE) == 0, + (x & LK_SHARE) == 0 ? 
0 : LK_SHARERS(x)); +#endif +#ifdef LOCK_PROFILING + lockmgr_note_shared_acquire(lk, contested, waittime, + file, line, flags); +#else + lockmgr_note_shared_acquire(lk, 0, 0, file, line, + flags); +#endif + } + +out: + lockmgr_exit(flags, ilk, 0); + return (error); +} + +static __noinline int +lockmgr_xlock_hard(struct lock *lk, u_int flags, struct lock_object *ilk, + const char *file, int line, struct lockmgr_wait *lwa) +{ + struct lock_class *class; + uintptr_t tid, x, v; + int error = 0; + const char *iwmesg; + int ipri, itimo; + +#ifdef KDTRACE_HOOKS + uint64_t sleep_time = 0; +#endif +#ifdef LOCK_PROFILING + uint64_t waittime = 0; + int contested = 0; +#endif + + if (__predict_false(panicstr != NULL)) + goto out; + + tid = (uintptr_t)curthread; + + if (LK_CAN_WITNESS(flags)) + WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | + LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? + ilk : NULL); + + /* + * If curthread already holds the lock and this one is + * allowed to recurse, simply recurse on it. + */ + if (lockmgr_xlocked(lk)) { + if ((flags & LK_CANRECURSE) == 0 && + (lk->lock_object.lo_flags & LO_RECURSABLE) == 0) { + /* + * If the lock is expected to not panic just + * give up and return. + */ + if (LK_TRYOP(flags)) { + LOCK_LOG2(lk, + "%s: %p fails the try operation", + __func__, lk); + error = EBUSY; + goto out; + } + if (flags & LK_INTERLOCK) { + class = LOCK_CLASS(ilk); + class->lc_unlock(ilk); + } + panic("%s: recursing on non recursive lockmgr %p " + "@ %s:%d\n", __func__, lk, file, line); + } + lk->lk_recurse++; + LOCK_LOG2(lk, "%s: %p recursing", __func__, lk); + LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, + lk->lk_recurse, file, line); + WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | + LK_TRYWIT(flags), file, line); + TD_LOCKS_INC(curthread); + goto out; + } + + for (;;) { + if (lk->lk_lock == LK_UNLOCKED && + atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) + break; +#ifdef HWPMC_HOOKS + PMC_SOFT_CALL( , , lock, failed); +#endif + lock_profile_obtain_lock_failed(&lk->lock_object, + &contested, &waittime); + + /* + * If the lock is expected to not sleep just give up + * and return. + */ + if (LK_TRYOP(flags)) { + LOCK_LOG2(lk, "%s: %p fails the try operation", + __func__, lk); + error = EBUSY; + break; + } + + /* + * Acquire the sleepqueue chain lock because we + * probabilly will need to manipulate waiters flags. + */ + sleepq_lock(&lk->lock_object); + x = lk->lk_lock; +retry_sleepq: + + /* + * if the lock has been released while we spun on + * the sleepqueue chain lock just try again. + */ + if (x == LK_UNLOCKED) { + sleepq_release(&lk->lock_object); + continue; + } + + /* + * The lock can be in the state where there is a + * pending queue of waiters, but still no owner. + * This happens when the lock is contested and an + * owner is going to claim the lock. + * If curthread is the one successfully acquiring it + * claim lock ownership and return, preserving waiters + * flags. + */ + v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); + if ((x & ~v) == LK_UNLOCKED) { + v &= ~LK_EXCLUSIVE_SPINNERS; + if (atomic_fcmpset_acq_ptr(&lk->lk_lock, &x, + tid | v)) { + sleepq_release(&lk->lock_object); + LOCK_LOG2(lk, + "%s: %p claimed by a new writer", + __func__, lk); + break; + } + goto retry_sleepq; + } + + /* + * Try to set the LK_EXCLUSIVE_WAITERS flag. If we + * fail, loop back and retry. 
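+ * Setting the flag atomically while the sleepqueue chain lock is
+ * held ensures that a concurrent release either observes the flag or
+ * causes the fcmpset to fail, so the wakeup cannot be lost.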
+ */ + if ((x & LK_EXCLUSIVE_WAITERS) == 0) { + if (!atomic_fcmpset_ptr(&lk->lk_lock, &x, + x | LK_EXCLUSIVE_WAITERS)) { + goto retry_sleepq; + } + LOCK_LOG2(lk, "%s: %p set excl waiters flag", + __func__, lk); + } + + if (lwa == NULL) { + iwmesg = lk->lock_object.lo_name; + ipri = lk->lk_pri; + itimo = lk->lk_timo; + } else { + iwmesg = lwa->iwmesg; + ipri = lwa->ipri; + itimo = lwa->itimo; + } + + /* + * As far as we have been unable to acquire the + * exclusive lock and the exclusive waiters flag + * is set, we will sleep. + */ +#ifdef KDTRACE_HOOKS + sleep_time -= lockstat_nsecs(&lk->lock_object); +#endif + error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, + SQ_EXCLUSIVE_QUEUE); +#ifdef KDTRACE_HOOKS + sleep_time += lockstat_nsecs(&lk->lock_object); +#endif + flags &= ~LK_INTERLOCK; + if (error) { + LOCK_LOG3(lk, + "%s: interrupted sleep for %p with %d", + __func__, lk, error); + break; + } + LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", + __func__, lk); + } + if (error == 0) { +#ifdef KDTRACE_HOOKS + if (sleep_time != 0) + LOCKSTAT_RECORD4(lockmgr__block, lk, sleep_time, + LOCKSTAT_WRITER, (x & LK_SHARE) == 0, + (x & LK_SHARE) == 0 ? 0 : LK_SHARERS(x)); +#endif +#ifdef LOCK_PROFILING + lockmgr_note_exclusive_acquire(lk, contested, waittime, + file, line, flags); +#else + lockmgr_note_exclusive_acquire(lk, 0, 0, file, line, + flags); +#endif + } + +out: + lockmgr_exit(flags, ilk, 0); + return (error); +} + +static __noinline int +lockmgr_upgrade(struct lock *lk, u_int flags, struct lock_object *ilk, + const char *file, int line, struct lockmgr_wait *lwa) +{ + uintptr_t tid, x, v; + int error = 0; + int wakeup_swapper = 0; + int op; + + if (__predict_false(panicstr != NULL)) + goto out; + + tid = (uintptr_t)curthread; + + _lockmgr_assert(lk, KA_SLOCKED, file, line); + v = lk->lk_lock; + x = v & LK_ALL_WAITERS; + v &= LK_EXCLUSIVE_SPINNERS; + + /* + * Try to switch from one shared lock to an exclusive one. + * We need to preserve waiters flags during the operation. + */ + if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v, + tid | x)) { + LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file, + line); + WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE | + LK_TRYWIT(flags), file, line); + LOCKSTAT_RECORD0(lockmgr__upgrade, lk); + TD_SLOCKS_DEC(curthread); + goto out; + } + + op = flags & LK_TYPE_MASK; + + /* + * In LK_TRYUPGRADE mode, do not drop the lock, + * returning EBUSY instead. + */ + if (op == LK_TRYUPGRADE) { + LOCK_LOG2(lk, "%s: %p failed the nowait upgrade", + __func__, lk); + error = EBUSY; + goto out; + } + + /* + * We have been unable to succeed in upgrading, so just + * give up the shared lock. + */ + wakeup_swapper |= wakeupshlk(lk, file, line); + error = lockmgr_xlock_hard(lk, flags, ilk, file, line, lwa); + flags &= ~LK_INTERLOCK; +out: + lockmgr_exit(flags, ilk, wakeup_swapper); + return (error); +} + +int +lockmgr_lock_fast_path(struct lock *lk, u_int flags, struct lock_object *ilk, + const char *file, int line) +{ + struct lock_class *class; + uintptr_t x, tid; + u_int op; + bool locked; + + if (__predict_false(panicstr != NULL)) + return (0); + + op = flags & LK_TYPE_MASK; + locked = false; + switch (op) { + case LK_SHARED: + if (LK_CAN_WITNESS(flags)) + WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, + file, line, flags & LK_INTERLOCK ? 
ilk : NULL); + if (__predict_false(lk->lock_object.lo_flags & LK_NOSHARE)) + break; + if (lockmgr_slock_try(lk, &x, flags, true)) { + lockmgr_note_shared_acquire(lk, 0, 0, + file, line, flags); + locked = true; + } else { + return (lockmgr_slock_hard(lk, flags, ilk, file, line, + NULL)); + } + break; + case LK_EXCLUSIVE: + if (LK_CAN_WITNESS(flags)) + WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | + LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? + ilk : NULL); + tid = (uintptr_t)curthread; + if (lk->lk_lock == LK_UNLOCKED && + atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) { + lockmgr_note_exclusive_acquire(lk, 0, 0, file, line, + flags); + locked = true; + } else { + return (lockmgr_xlock_hard(lk, flags, ilk, file, line, + NULL)); + } + break; + case LK_UPGRADE: + case LK_TRYUPGRADE: + return (lockmgr_upgrade(lk, flags, ilk, file, line, NULL)); + default: + break; + } + if (__predict_true(locked)) { + if (__predict_false(flags & LK_INTERLOCK)) { + class = LOCK_CLASS(ilk); + class->lc_unlock(ilk); + } + return (0); + } else { + return (__lockmgr_args(lk, flags, ilk, LK_WMESG_DEFAULT, + LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, file, line)); + } +} + +static __noinline int +lockmgr_sunlock_hard(struct lock *lk, uintptr_t x, u_int flags, struct lock_object *ilk, + const char *file, int line) + +{ + int wakeup_swapper = 0; + + if (__predict_false(panicstr != NULL)) + goto out; + + wakeup_swapper = wakeupshlk(lk, file, line); + +out: + lockmgr_exit(flags, ilk, wakeup_swapper); + return (0); +} + +static __noinline int +lockmgr_xunlock_hard(struct lock *lk, uintptr_t x, u_int flags, struct lock_object *ilk, + const char *file, int line) +{ + uintptr_t tid, v; + int wakeup_swapper = 0; + u_int realexslp; + int queue; + + if (__predict_false(panicstr != NULL)) + goto out; + + tid = (uintptr_t)curthread; + + /* + * As first option, treact the lock as if it has not + * any waiter. + * Fix-up the tid var if the lock has been disowned. + */ + if (LK_HOLDER(x) == LK_KERNPROC) + tid = LK_KERNPROC; + else { + WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); + TD_LOCKS_DEC(curthread); + } + LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); + + /* + * The lock is held in exclusive mode. + * If the lock is recursed also, then unrecurse it. + */ + if (lockmgr_xlocked_v(x) && lockmgr_recursed(lk)) { + LOCK_LOG2(lk, "%s: %p unrecursing", __func__, lk); + lk->lk_recurse--; + goto out; + } + if (tid != LK_KERNPROC) + LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk, + LOCKSTAT_WRITER); + + if (x == tid && atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) + goto out; + + sleepq_lock(&lk->lock_object); + x = lk->lk_lock; + v = LK_UNLOCKED; + + /* + * If the lock has exclusive waiters, give them + * preference in order to avoid deadlock with + * shared runners up. + * If interruptible sleeps left the exclusive queue + * empty avoid a starvation for the threads sleeping + * on the shared queue by giving them precedence + * and cleaning up the exclusive waiters bit anyway. + * Please note that lk_exslpfail count may be lying + * about the real number of waiters with the + * LK_SLEEPFAIL flag on because they may be used in + * conjunction with interruptible sleeps so + * lk_exslpfail might be considered an 'upper limit' + * bound, including the edge cases. 
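+ * In short: wake the exclusive queue only when it holds at least one
+ * sleeper that is not bound to fail with ENOLCK; otherwise flush the
+ * LK_SLEEPFAIL sleepers and hand the lock over to the shared queue.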
+ */ + MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); + realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); + if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { + if (lk->lk_exslpfail < realexslp) { + lk->lk_exslpfail = 0; + queue = SQ_EXCLUSIVE_QUEUE; + v |= (x & LK_SHARED_WAITERS); + } else { + lk->lk_exslpfail = 0; + LOCK_LOG2(lk, + "%s: %p has only LK_SLEEPFAIL sleepers", + __func__, lk); + LOCK_LOG2(lk, + "%s: %p waking up threads on the exclusive queue", + __func__, lk); + wakeup_swapper = sleepq_broadcast(&lk->lock_object, + SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); + queue = SQ_SHARED_QUEUE; + } + } else { + + /* + * Exclusive waiters sleeping with LK_SLEEPFAIL + * on and using interruptible sleeps/timeout + * may have left spourious lk_exslpfail counts + * on, so clean it up anyway. + */ + lk->lk_exslpfail = 0; + queue = SQ_SHARED_QUEUE; + } + + LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", + __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : + "exclusive"); + atomic_store_rel_ptr(&lk->lk_lock, v); + wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); + sleepq_release(&lk->lock_object); + +out: + lockmgr_exit(flags, ilk, wakeup_swapper); + return (0); +} + +int +lockmgr_unlock_fast_path(struct lock *lk, u_int flags, struct lock_object *ilk) +{ + struct lock_class *class; + uintptr_t x, tid; + const char *file; + int line; + + if (__predict_false(panicstr != NULL)) + return (0); + + file = __FILE__; + line = __LINE__; + + _lockmgr_assert(lk, KA_LOCKED, file, line); + x = lk->lk_lock; + if (__predict_true(x & LK_SHARE) != 0) { + if (lockmgr_sunlock_try(lk, &x)) { + lockmgr_note_shared_release(lk, file, line); + } else { + return (lockmgr_sunlock_hard(lk, x, flags, ilk, file, line)); + } + } else { + tid = (uintptr_t)curthread; + if (!lockmgr_recursed(lk) && + atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) { + lockmgr_note_exclusive_release(lk, file, line); + } else { + return (lockmgr_xunlock_hard(lk, x, flags, ilk, file, line)); + } + } + if (__predict_false(flags & LK_INTERLOCK)) { + class = LOCK_CLASS(ilk); + class->lc_unlock(ilk); + } + return (0); +} + +int +__lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk, + const char *wmesg, int pri, int timo, const char *file, int line) +{ + GIANT_DECLARE; + struct lockmgr_wait lwa; + struct lock_class *class; + const char *iwmesg; + uintptr_t tid, v, x; + u_int op, realexslp; + int error, ipri, itimo, queue, wakeup_swapper; +#ifdef LOCK_PROFILING + uint64_t waittime = 0; + int contested = 0; +#endif + + if (panicstr != NULL) + return (0); + + error = 0; + tid = (uintptr_t)curthread; + op = (flags & LK_TYPE_MASK); + iwmesg = (wmesg == LK_WMESG_DEFAULT) ? lk->lock_object.lo_name : wmesg; + ipri = (pri == LK_PRIO_DEFAULT) ? lk->lk_pri : pri; + itimo = (timo == LK_TIMO_DEFAULT) ? 
lk->lk_timo : timo; + + lwa.iwmesg = iwmesg; + lwa.ipri = ipri; + lwa.itimo = itimo; + + MPASS((flags & ~LK_TOTAL_MASK) == 0); + KASSERT((op & (op - 1)) == 0, + ("%s: Invalid requested operation @ %s:%d", __func__, file, line)); + KASSERT((flags & (LK_NOWAIT | LK_SLEEPFAIL)) == 0 || + (op != LK_DOWNGRADE && op != LK_RELEASE), + ("%s: Invalid flags in regard of the operation desired @ %s:%d", + __func__, file, line)); + KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL, + ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d", + __func__, file, line)); + KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), + ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread, + lk->lock_object.lo_name, file, line)); + + class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL; + + if (lk->lock_object.lo_flags & LK_NOSHARE) { + switch (op) { + case LK_SHARED: + op = LK_EXCLUSIVE; + break; + case LK_UPGRADE: + case LK_TRYUPGRADE: + case LK_DOWNGRADE: + _lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED, + file, line); + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + return (0); + } + } + + wakeup_swapper = 0; + switch (op) { + case LK_SHARED: + return (lockmgr_slock_hard(lk, flags, ilk, file, line, &lwa)); + break; + case LK_UPGRADE: + case LK_TRYUPGRADE: + return (lockmgr_upgrade(lk, flags, ilk, file, line, &lwa)); + break; + case LK_EXCLUSIVE: + return (lockmgr_xlock_hard(lk, flags, ilk, file, line, &lwa)); + break; + case LK_DOWNGRADE: + _lockmgr_assert(lk, KA_XLOCKED, file, line); + WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line); + + /* + * Panic if the lock is recursed. + */ + if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) { + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n", + __func__, iwmesg, file, line); + } + TD_SLOCKS_INC(curthread); + + /* + * In order to preserve waiters flags, just spin. + */ + for (;;) { + x = lk->lk_lock; + MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); + x &= LK_ALL_WAITERS; + if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, + LK_SHARERS_LOCK(1) | x)) + break; + cpu_spinwait(); + } + LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line); + LOCKSTAT_RECORD0(lockmgr__downgrade, lk); + break; + case LK_RELEASE: + _lockmgr_assert(lk, KA_LOCKED, file, line); + x = lk->lk_lock; + + if (__predict_true(x & LK_SHARE) != 0) { + return (lockmgr_sunlock_hard(lk, x, flags, ilk, file, line)); + } else { + return (lockmgr_xunlock_hard(lk, x, flags, ilk, file, line)); + } + break; + case LK_DRAIN: + if (LK_CAN_WITNESS(flags)) + WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | + LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? + ilk : NULL); + + /* + * Trying to drain a lock we already own will result in a + * deadlock. + */ + if (lockmgr_xlocked(lk)) { + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + panic("%s: draining %s with the lock held @ %s:%d\n", + __func__, iwmesg, file, line); + } + + for (;;) { + if (lk->lk_lock == LK_UNLOCKED && + atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) + break; + +#ifdef HWPMC_HOOKS + PMC_SOFT_CALL( , , lock, failed); +#endif + lock_profile_obtain_lock_failed(&lk->lock_object, + &contested, &waittime); + + /* + * If the lock is expected to not sleep just give up + * and return. + */ + if (LK_TRYOP(flags)) { + LOCK_LOG2(lk, "%s: %p fails the try operation", + __func__, lk); + error = EBUSY; + break; + } + + /* + * Acquire the sleepqueue chain lock because we + * probabilly will need to manipulate waiters flags. 
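+ * Holding the chain lock while re-reading lk_lock below also closes
+ * the race with a concurrent release, which takes this same lock
+ * before broadcasting any wakeup.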
+ */ + sleepq_lock(&lk->lock_object); + x = lk->lk_lock; + + /* + * if the lock has been released while we spun on + * the sleepqueue chain lock just try again. + */ + if (x == LK_UNLOCKED) { + sleepq_release(&lk->lock_object); + continue; + } + + v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); + if ((x & ~v) == LK_UNLOCKED) { + v = (x & ~LK_EXCLUSIVE_SPINNERS); + + /* + * If interruptible sleeps left the exclusive + * queue empty avoid a starvation for the + * threads sleeping on the shared queue by + * giving them precedence and cleaning up the + * exclusive waiters bit anyway. + * Please note that lk_exslpfail count may be + * lying about the real number of waiters with + * the LK_SLEEPFAIL flag on because they may + * be used in conjunction with interruptible + * sleeps so lk_exslpfail might be considered + * an 'upper limit' bound, including the edge + * cases. + */ + if (v & LK_EXCLUSIVE_WAITERS) { + queue = SQ_EXCLUSIVE_QUEUE; + v &= ~LK_EXCLUSIVE_WAITERS; + } else { + + /* + * Exclusive waiters sleeping with + * LK_SLEEPFAIL on and using + * interruptible sleeps/timeout may + * have left spourious lk_exslpfail + * counts on, so clean it up anyway. + */ + MPASS(v & LK_SHARED_WAITERS); + lk->lk_exslpfail = 0; + queue = SQ_SHARED_QUEUE; + v &= ~LK_SHARED_WAITERS; + } + if (queue == SQ_EXCLUSIVE_QUEUE) { + realexslp = + sleepq_sleepcnt(&lk->lock_object, + SQ_EXCLUSIVE_QUEUE); + if (lk->lk_exslpfail >= realexslp) { + lk->lk_exslpfail = 0; + queue = SQ_SHARED_QUEUE; + v &= ~LK_SHARED_WAITERS; + if (realexslp != 0) { + LOCK_LOG2(lk, + "%s: %p has only LK_SLEEPFAIL sleepers", + __func__, lk); + LOCK_LOG2(lk, + "%s: %p waking up threads on the exclusive queue", + __func__, lk); + wakeup_swapper = + sleepq_broadcast( + &lk->lock_object, + SLEEPQ_LK, 0, + SQ_EXCLUSIVE_QUEUE); + } + } else + lk->lk_exslpfail = 0; + } + if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) { + sleepq_release(&lk->lock_object); + continue; + } + LOCK_LOG3(lk, + "%s: %p waking up all threads on the %s queue", + __func__, lk, queue == SQ_SHARED_QUEUE ? + "shared" : "exclusive"); + wakeup_swapper |= sleepq_broadcast( + &lk->lock_object, SLEEPQ_LK, 0, queue); + + /* + * If shared waiters have been woken up we need + * to wait for one of them to acquire the lock + * before to set the exclusive waiters in + * order to avoid a deadlock. + */ + if (queue == SQ_SHARED_QUEUE) { + for (v = lk->lk_lock; + (v & LK_SHARE) && !LK_SHARERS(v); + v = lk->lk_lock) + cpu_spinwait(); + } + } + + /* + * Try to set the LK_EXCLUSIVE_WAITERS flag. If we + * fail, loop back and retry. + */ + if ((x & LK_EXCLUSIVE_WAITERS) == 0) { + if (!atomic_cmpset_ptr(&lk->lk_lock, x, + x | LK_EXCLUSIVE_WAITERS)) { + sleepq_release(&lk->lock_object); + continue; + } + LOCK_LOG2(lk, "%s: %p set drain waiters flag", + __func__, lk); + } + + /* + * As far as we have been unable to acquire the + * exclusive lock and the exclusive waiters flag + * is set, we will sleep. 
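+ * Unlike sleeplk(), a draining thread always sleeps uninterruptibly
+ * and without a timeout; only the priority bits of ipri are used.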
+ */ + if (flags & LK_INTERLOCK) { + class->lc_unlock(ilk); + flags &= ~LK_INTERLOCK; + } + GIANT_SAVE(); + sleepq_add(&lk->lock_object, NULL, iwmesg, SLEEPQ_LK, + SQ_EXCLUSIVE_QUEUE); + sleepq_wait(&lk->lock_object, ipri & PRIMASK); + GIANT_RESTORE(); + LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", + __func__, lk); + } + + if (error == 0) { + lock_profile_obtain_lock_success(&lk->lock_object, + contested, waittime, file, line); + LOCK_LOG_LOCK("DRAIN", &lk->lock_object, 0, + lk->lk_recurse, file, line); + WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | + LK_TRYWIT(flags), file, line); + TD_LOCKS_INC(curthread); + STACK_SAVE(lk); + } + break; + default: + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + panic("%s: unknown lockmgr request 0x%x\n", __func__, op); + } + + if (flags & LK_INTERLOCK) + class->lc_unlock(ilk); + if (wakeup_swapper) + kick_proc0(); + + return (error); +} + +void +_lockmgr_disown(struct lock *lk, const char *file, int line) +{ + uintptr_t tid, x; + + if (SCHEDULER_STOPPED()) + return; + + tid = (uintptr_t)curthread; + _lockmgr_assert(lk, KA_XLOCKED, file, line); + + /* + * Panic if the lock is recursed. + */ + if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) + panic("%s: disown a recursed lockmgr @ %s:%d\n", + __func__, file, line); + + /* + * If the owner is already LK_KERNPROC just skip the whole operation. + */ + if (LK_HOLDER(lk->lk_lock) != tid) + return; + lock_profile_release_lock(&lk->lock_object); + LOCKSTAT_RECORD1(lockmgr__disown, lk, LOCKSTAT_WRITER); + LOCK_LOG_LOCK("XDISOWN", &lk->lock_object, 0, 0, file, line); + WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); + TD_LOCKS_DEC(curthread); + STACK_SAVE(lk); + + /* + * In order to preserve waiters flags, just spin. + */ + for (;;) { + x = lk->lk_lock; + MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); + x &= LK_ALL_WAITERS; + if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, + LK_KERNPROC | x)) + return; + cpu_spinwait(); + } +} + +void +lockmgr_printinfo(const struct lock *lk) +{ + struct thread *td; + uintptr_t x; + + if (lk->lk_lock == LK_UNLOCKED) + printf("lock type %s: UNLOCKED\n", lk->lock_object.lo_name); + else if (lk->lk_lock & LK_SHARE) + printf("lock type %s: SHARED (count %ju)\n", + lk->lock_object.lo_name, + (uintmax_t)LK_SHARERS(lk->lk_lock)); + else { + td = lockmgr_xholder(lk); + if (td == (struct thread *)LK_KERNPROC) + printf("lock type %s: EXCL by KERNPROC\n", + lk->lock_object.lo_name); + else + printf("lock type %s: EXCL by thread %p " + "(pid %d, %s, tid %d)\n", lk->lock_object.lo_name, + td, td->td_proc->p_pid, td->td_proc->p_comm, + td->td_tid); + } + + x = lk->lk_lock; + if (x & LK_EXCLUSIVE_WAITERS) + printf(" with exclusive waiters pending\n"); + if (x & LK_SHARED_WAITERS) + printf(" with shared waiters pending\n"); + if (x & LK_EXCLUSIVE_SPINNERS) + printf(" with exclusive spinners pending\n"); + + STACK_PRINT(lk); +} + +int +lockstatus(const struct lock *lk) +{ + uintptr_t v, x; + int ret; + + ret = LK_SHARED; + x = lk->lk_lock; + v = LK_HOLDER(x); + + if ((x & LK_SHARE) == 0) { + if (v == (uintptr_t)curthread || v == LK_KERNPROC) + ret = LK_EXCLUSIVE; + else + ret = LK_EXCLOTHER; + } else if (x == LK_UNLOCKED) + ret = 0; + + return (ret); +} + +#ifdef INVARIANT_SUPPORT + +FEATURE(invariant_support, + "Support for modules compiled with INVARIANTS option"); + +#ifndef INVARIANTS +#undef _lockmgr_assert +#endif + +void +_lockmgr_assert(const struct lock *lk, int what, const char *file, int line) +{ + int slocked = 0; + + if (panicstr != NULL) + return; + switch 
(what) { + case KA_SLOCKED: + case KA_SLOCKED | KA_NOTRECURSED: + case KA_SLOCKED | KA_RECURSED: + slocked = 1; + case KA_LOCKED: + case KA_LOCKED | KA_NOTRECURSED: + case KA_LOCKED | KA_RECURSED: +#ifdef WITNESS + + /* + * We cannot trust WITNESS if the lock is held in exclusive + * mode and a call to lockmgr_disown() happened. + * Workaround this skipping the check if the lock is held in + * exclusive mode even for the KA_LOCKED case. + */ + if (slocked || (lk->lk_lock & LK_SHARE)) { + witness_assert(&lk->lock_object, what, file, line); + break; + } +#endif + if (lk->lk_lock == LK_UNLOCKED || + ((lk->lk_lock & LK_SHARE) == 0 && (slocked || + (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk))))) + panic("Lock %s not %slocked @ %s:%d\n", + lk->lock_object.lo_name, slocked ? "share" : "", + file, line); + + if ((lk->lk_lock & LK_SHARE) == 0) { + if (lockmgr_recursed(lk)) { + if (what & KA_NOTRECURSED) + panic("Lock %s recursed @ %s:%d\n", + lk->lock_object.lo_name, file, + line); + } else if (what & KA_RECURSED) + panic("Lock %s not recursed @ %s:%d\n", + lk->lock_object.lo_name, file, line); + } + break; + case KA_XLOCKED: + case KA_XLOCKED | KA_NOTRECURSED: + case KA_XLOCKED | KA_RECURSED: + if (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk)) + panic("Lock %s not exclusively locked @ %s:%d\n", + lk->lock_object.lo_name, file, line); + if (lockmgr_recursed(lk)) { + if (what & KA_NOTRECURSED) + panic("Lock %s recursed @ %s:%d\n", + lk->lock_object.lo_name, file, line); + } else if (what & KA_RECURSED) + panic("Lock %s not recursed @ %s:%d\n", + lk->lock_object.lo_name, file, line); + break; + case KA_UNLOCKED: + if (lockmgr_xlocked(lk) || lockmgr_disowned(lk)) + panic("Lock %s exclusively locked @ %s:%d\n", + lk->lock_object.lo_name, file, line); + break; + default: + panic("Unknown lockmgr assertion: %d @ %s:%d\n", what, file, + line); + } +} +#endif + +#ifdef DDB +int +lockmgr_chain(struct thread *td, struct thread **ownerp) +{ + struct lock *lk; + + lk = td->td_wchan; + + if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr) + return (0); + db_printf("blocked on lockmgr %s", lk->lock_object.lo_name); + if (lk->lk_lock & LK_SHARE) + db_printf("SHARED (count %ju)\n", + (uintmax_t)LK_SHARERS(lk->lk_lock)); + else + db_printf("EXCL\n"); + *ownerp = lockmgr_xholder(lk); + + return (1); +} + +static void +db_show_lockmgr(const struct lock_object *lock) +{ + struct thread *td; + const struct lock *lk; + + lk = (const struct lock *)lock; + + db_printf(" state: "); + if (lk->lk_lock == LK_UNLOCKED) + db_printf("UNLOCKED\n"); + else if (lk->lk_lock & LK_SHARE) + db_printf("SLOCK: %ju\n", (uintmax_t)LK_SHARERS(lk->lk_lock)); + else { + td = lockmgr_xholder(lk); + if (td == (struct thread *)LK_KERNPROC) + db_printf("XLOCK: LK_KERNPROC\n"); + else + db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, + td->td_tid, td->td_proc->p_pid, + td->td_proc->p_comm); + if (lockmgr_recursed(lk)) + db_printf(" recursed: %d\n", lk->lk_recurse); + } + db_printf(" waiters: "); + switch (lk->lk_lock & LK_ALL_WAITERS) { + case LK_SHARED_WAITERS: + db_printf("shared\n"); + break; + case LK_EXCLUSIVE_WAITERS: + db_printf("exclusive\n"); + break; + case LK_ALL_WAITERS: + db_printf("shared and exclusive\n"); + break; + default: + db_printf("none\n"); + } + db_printf(" spinners: "); + if (lk->lk_lock & LK_EXCLUSIVE_SPINNERS) + db_printf("exclusive\n"); + else + db_printf("none\n"); +} +#endif diff --git a/freebsd/sys/kern/subr_pctrie.c b/freebsd/sys/kern/subr_pctrie.c new file mode 100644 index 00000000..c5f2c06e --- 
/dev/null +++ b/freebsd/sys/kern/subr_pctrie.c @@ -0,0 +1,695 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 EMC Corp. + * Copyright (c) 2011 Jeffrey Roberson + * Copyright (c) 2008 Mayur Shardul + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* + * Path-compressed radix trie implementation. + * + * The implementation takes into account the following rationale: + * - Size of the nodes should be as small as possible but still big enough + * to avoid a large maximum depth for the trie. This is a balance + * between the necessity to not wire too much physical memory for the nodes + * and the necessity to avoid too much cache pollution during the trie + * operations. + * - There is not a huge bias toward the number of lookup operations over + * the number of insert and remove operations. This basically implies + * that optimizations supposedly helping one operation but hurting the + * other might be carefully evaluated. + * - On average not many nodes are expected to be fully populated, hence + * level compression may just complicate things. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_ddb.h" + +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +#define PCTRIE_MASK (PCTRIE_COUNT - 1) +#define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1) + +/* Flag bits stored in node pointers. */ +#define PCTRIE_ISLEAF 0x1 +#define PCTRIE_FLAGS 0x1 +#define PCTRIE_PAD PCTRIE_FLAGS + +/* Returns one unit associated with specified level. */ +#define PCTRIE_UNITLEVEL(lev) \ + ((uint64_t)1 << ((lev) * PCTRIE_WIDTH)) + +struct pctrie_node { + uint64_t pn_owner; /* Owner of record. */ + uint16_t pn_count; /* Valid children. */ + uint16_t pn_clev; /* Current level. */ + void *pn_child[PCTRIE_COUNT]; /* Child nodes. */ +}; + +/* + * Allocate a node. Pre-allocation should ensure that the request + * will always be satisfied. 
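+ * The allocfn callback is supplied by the consumer of the trie,
+ * typically backed by a UMA zone set up with pctrie_zone_init(); a
+ * NULL return is propagated to the caller of pctrie_insert() as
+ * ENOMEM.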
+ */ +static __inline struct pctrie_node * +pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner, + uint16_t count, uint16_t clevel) +{ + struct pctrie_node *node; + + node = allocfn(ptree); + if (node == NULL) + return (NULL); + node->pn_owner = owner; + node->pn_count = count; + node->pn_clev = clevel; + + return (node); +} + +/* + * Free radix node. + */ +static __inline void +pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node, + pctrie_free_t freefn) +{ +#ifdef INVARIANTS + int slot; + + KASSERT(node->pn_count == 0, + ("pctrie_node_put: node %p has %d children", node, + node->pn_count)); + for (slot = 0; slot < PCTRIE_COUNT; slot++) + KASSERT(node->pn_child[slot] == NULL, + ("pctrie_node_put: node %p has a child", node)); +#endif + freefn(ptree, node); +} + +/* + * Return the position in the array for a given level. + */ +static __inline int +pctrie_slot(uint64_t index, uint16_t level) +{ + + return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK); +} + +/* Trims the key after the specified level. */ +static __inline uint64_t +pctrie_trimkey(uint64_t index, uint16_t level) +{ + uint64_t ret; + + ret = index; + if (level > 0) { + ret >>= level * PCTRIE_WIDTH; + ret <<= level * PCTRIE_WIDTH; + } + return (ret); +} + +/* + * Get the root node for a tree. + */ +static __inline struct pctrie_node * +pctrie_getroot(struct pctrie *ptree) +{ + + return ((struct pctrie_node *)ptree->pt_root); +} + +/* + * Set the root node for a tree. + */ +static __inline void +pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node) +{ + + ptree->pt_root = (uintptr_t)node; +} + +/* + * Returns TRUE if the specified node is a leaf and FALSE otherwise. + */ +static __inline boolean_t +pctrie_isleaf(struct pctrie_node *node) +{ + + return (((uintptr_t)node & PCTRIE_ISLEAF) != 0); +} + +/* + * Returns the associated val extracted from node. + */ +static __inline uint64_t * +pctrie_toval(struct pctrie_node *node) +{ + + return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS)); +} + +/* + * Adds the val as a child of the provided node. + */ +static __inline void +pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev, + uint64_t *val) +{ + int slot; + + slot = pctrie_slot(index, clev); + node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF); +} + +/* + * Returns the slot where two keys differ. + * It cannot accept 2 equal keys. + */ +static __inline uint16_t +pctrie_keydiff(uint64_t index1, uint64_t index2) +{ + uint16_t clev; + + KASSERT(index1 != index2, ("%s: passing the same key value %jx", + __func__, (uintmax_t)index1)); + + index1 ^= index2; + for (clev = PCTRIE_LIMIT;; clev--) + if (pctrie_slot(index1, clev) != 0) + return (clev); +} + +/* + * Returns TRUE if it can be determined that key does not belong to the + * specified node. Otherwise, returns FALSE. + */ +static __inline boolean_t +pctrie_keybarr(struct pctrie_node *node, uint64_t idx) +{ + + if (node->pn_clev < PCTRIE_LIMIT) { + idx = pctrie_trimkey(idx, node->pn_clev + 1); + return (idx != node->pn_owner); + } + return (FALSE); +} + +/* + * Internal helper for pctrie_reclaim_allnodes(). + * This function is recursive. 
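+ * Recursion is safe here because the depth of the trie is capped at
+ * PCTRIE_LIMIT levels, keeping kernel stack usage small.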
+ */ +static void +pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node, + pctrie_free_t freefn) +{ + int slot; + + KASSERT(node->pn_count <= PCTRIE_COUNT, + ("pctrie_reclaim_allnodes_int: bad count in node %p", node)); + for (slot = 0; node->pn_count != 0; slot++) { + if (node->pn_child[slot] == NULL) + continue; + if (!pctrie_isleaf(node->pn_child[slot])) + pctrie_reclaim_allnodes_int(ptree, + node->pn_child[slot], freefn); + node->pn_child[slot] = NULL; + node->pn_count--; + } + pctrie_node_put(ptree, node, freefn); +} + +/* + * pctrie node zone initializer. + */ +int +pctrie_zone_init(void *mem, int size __unused, int flags __unused) +{ + struct pctrie_node *node; + + node = mem; + memset(node->pn_child, 0, sizeof(node->pn_child)); + return (0); +} + +size_t +pctrie_node_size(void) +{ + + return (sizeof(struct pctrie_node)); +} + +/* + * Inserts the key-value pair into the trie. + * Panics if the key already exists. + */ +int +pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn) +{ + uint64_t index, newind; + void **parentp; + struct pctrie_node *node, *tmp; + uint64_t *m; + int slot; + uint16_t clev; + + index = *val; + + /* + * The owner of record for root is not really important because it + * will never be used. + */ + node = pctrie_getroot(ptree); + if (node == NULL) { + ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF; + return (0); + } + parentp = (void **)&ptree->pt_root; + for (;;) { + if (pctrie_isleaf(node)) { + m = pctrie_toval(node); + if (*m == index) + panic("%s: key %jx is already present", + __func__, (uintmax_t)index); + clev = pctrie_keydiff(*m, index); + tmp = pctrie_node_get(ptree, allocfn, + pctrie_trimkey(index, clev + 1), 2, clev); + if (tmp == NULL) + return (ENOMEM); + *parentp = tmp; + pctrie_addval(tmp, index, clev, val); + pctrie_addval(tmp, *m, clev, m); + return (0); + } else if (pctrie_keybarr(node, index)) + break; + slot = pctrie_slot(index, node->pn_clev); + if (node->pn_child[slot] == NULL) { + node->pn_count++; + pctrie_addval(node, index, node->pn_clev, val); + return (0); + } + parentp = &node->pn_child[slot]; + node = node->pn_child[slot]; + } + + /* + * A new node is needed because the right insertion level is reached. + * Setup the new intermediate node and add the 2 children: the + * new object and the older edge. + */ + newind = node->pn_owner; + clev = pctrie_keydiff(newind, index); + tmp = pctrie_node_get(ptree, allocfn, + pctrie_trimkey(index, clev + 1), 2, clev); + if (tmp == NULL) + return (ENOMEM); + *parentp = tmp; + pctrie_addval(tmp, index, clev, val); + slot = pctrie_slot(newind, clev); + tmp->pn_child[slot] = node; + + return (0); +} + +/* + * Returns the value stored at the index. If the index is not present, + * NULL is returned. + */ +uint64_t * +pctrie_lookup(struct pctrie *ptree, uint64_t index) +{ + struct pctrie_node *node; + uint64_t *m; + int slot; + + node = pctrie_getroot(ptree); + while (node != NULL) { + if (pctrie_isleaf(node)) { + m = pctrie_toval(node); + if (*m == index) + return (m); + else + break; + } else if (pctrie_keybarr(node, index)) + break; + slot = pctrie_slot(index, node->pn_clev); + node = node->pn_child[slot]; + } + return (NULL); +} + +/* + * Look up the nearest entry at a position bigger than or equal to index. 
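+ * Returns a pointer to the stored key word, or NULL when no key
+ * greater than or equal to index is present.  The explicit stack of
+ * at most PCTRIE_LIMIT nodes takes the place of recursion here.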
+ */ +uint64_t * +pctrie_lookup_ge(struct pctrie *ptree, uint64_t index) +{ + struct pctrie_node *stack[PCTRIE_LIMIT]; + uint64_t inc; + uint64_t *m; + struct pctrie_node *child, *node; +#ifdef INVARIANTS + int loops = 0; +#endif + int slot, tos; + + node = pctrie_getroot(ptree); + if (node == NULL) + return (NULL); + else if (pctrie_isleaf(node)) { + m = pctrie_toval(node); + if (*m >= index) + return (m); + else + return (NULL); + } + tos = 0; + for (;;) { + /* + * If the keys differ before the current bisection node, + * then the search key might rollback to the earliest + * available bisection node or to the smallest key + * in the current node (if the owner is bigger than the + * search key). + */ + if (pctrie_keybarr(node, index)) { + if (index > node->pn_owner) { +ascend: + KASSERT(++loops < 1000, + ("pctrie_lookup_ge: too many loops")); + + /* + * Pop nodes from the stack until either the + * stack is empty or a node that could have a + * matching descendant is found. + */ + do { + if (tos == 0) + return (NULL); + node = stack[--tos]; + } while (pctrie_slot(index, + node->pn_clev) == (PCTRIE_COUNT - 1)); + + /* + * The following computation cannot overflow + * because index's slot at the current level + * is less than PCTRIE_COUNT - 1. + */ + index = pctrie_trimkey(index, + node->pn_clev); + index += PCTRIE_UNITLEVEL(node->pn_clev); + } else + index = node->pn_owner; + KASSERT(!pctrie_keybarr(node, index), + ("pctrie_lookup_ge: keybarr failed")); + } + slot = pctrie_slot(index, node->pn_clev); + child = node->pn_child[slot]; + if (pctrie_isleaf(child)) { + m = pctrie_toval(child); + if (*m >= index) + return (m); + } else if (child != NULL) + goto descend; + + /* + * Look for an available edge or val within the current + * bisection node. + */ + if (slot < (PCTRIE_COUNT - 1)) { + inc = PCTRIE_UNITLEVEL(node->pn_clev); + index = pctrie_trimkey(index, node->pn_clev); + do { + index += inc; + slot++; + child = node->pn_child[slot]; + if (pctrie_isleaf(child)) { + m = pctrie_toval(child); + if (*m >= index) + return (m); + } else if (child != NULL) + goto descend; + } while (slot < (PCTRIE_COUNT - 1)); + } + KASSERT(child == NULL || pctrie_isleaf(child), + ("pctrie_lookup_ge: child is radix node")); + + /* + * If a value or edge bigger than the search slot is not found + * in the current node, ascend to the next higher-level node. + */ + goto ascend; +descend: + KASSERT(node->pn_clev > 0, + ("pctrie_lookup_ge: pushing leaf's parent")); + KASSERT(tos < PCTRIE_LIMIT, + ("pctrie_lookup_ge: stack overflow")); + stack[tos++] = node; + node = child; + } +} + +/* + * Look up the nearest entry at a position less than or equal to index. + */ +uint64_t * +pctrie_lookup_le(struct pctrie *ptree, uint64_t index) +{ + struct pctrie_node *stack[PCTRIE_LIMIT]; + uint64_t inc; + uint64_t *m; + struct pctrie_node *child, *node; +#ifdef INVARIANTS + int loops = 0; +#endif + int slot, tos; + + node = pctrie_getroot(ptree); + if (node == NULL) + return (NULL); + else if (pctrie_isleaf(node)) { + m = pctrie_toval(node); + if (*m <= index) + return (m); + else + return (NULL); + } + tos = 0; + for (;;) { + /* + * If the keys differ before the current bisection node, + * then the search key might rollback to the earliest + * available bisection node or to the largest key + * in the current node (if the owner is smaller than the + * search key). 
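+ * (This is the mirror image of the logic in pctrie_lookup_ge() above,
+ * walking towards smaller keys instead of larger ones.)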
+ */ + if (pctrie_keybarr(node, index)) { + if (index > node->pn_owner) { + index = node->pn_owner + PCTRIE_COUNT * + PCTRIE_UNITLEVEL(node->pn_clev); + } else { +ascend: + KASSERT(++loops < 1000, + ("pctrie_lookup_le: too many loops")); + + /* + * Pop nodes from the stack until either the + * stack is empty or a node that could have a + * matching descendant is found. + */ + do { + if (tos == 0) + return (NULL); + node = stack[--tos]; + } while (pctrie_slot(index, + node->pn_clev) == 0); + + /* + * The following computation cannot overflow + * because index's slot at the current level + * is greater than 0. + */ + index = pctrie_trimkey(index, + node->pn_clev); + } + index--; + KASSERT(!pctrie_keybarr(node, index), + ("pctrie_lookup_le: keybarr failed")); + } + slot = pctrie_slot(index, node->pn_clev); + child = node->pn_child[slot]; + if (pctrie_isleaf(child)) { + m = pctrie_toval(child); + if (*m <= index) + return (m); + } else if (child != NULL) + goto descend; + + /* + * Look for an available edge or value within the current + * bisection node. + */ + if (slot > 0) { + inc = PCTRIE_UNITLEVEL(node->pn_clev); + index |= inc - 1; + do { + index -= inc; + slot--; + child = node->pn_child[slot]; + if (pctrie_isleaf(child)) { + m = pctrie_toval(child); + if (*m <= index) + return (m); + } else if (child != NULL) + goto descend; + } while (slot > 0); + } + KASSERT(child == NULL || pctrie_isleaf(child), + ("pctrie_lookup_le: child is radix node")); + + /* + * If a value or edge smaller than the search slot is not found + * in the current node, ascend to the next higher-level node. + */ + goto ascend; +descend: + KASSERT(node->pn_clev > 0, + ("pctrie_lookup_le: pushing leaf's parent")); + KASSERT(tos < PCTRIE_LIMIT, + ("pctrie_lookup_le: stack overflow")); + stack[tos++] = node; + node = child; + } +} + +/* + * Remove the specified index from the tree. + * Panics if the key is not present. + */ +void +pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn) +{ + struct pctrie_node *node, *parent; + uint64_t *m; + int i, slot; + + node = pctrie_getroot(ptree); + if (pctrie_isleaf(node)) { + m = pctrie_toval(node); + if (*m != index) + panic("%s: invalid key found", __func__); + pctrie_setroot(ptree, NULL); + return; + } + parent = NULL; + for (;;) { + if (node == NULL) + panic("pctrie_remove: impossible to locate the key"); + slot = pctrie_slot(index, node->pn_clev); + if (pctrie_isleaf(node->pn_child[slot])) { + m = pctrie_toval(node->pn_child[slot]); + if (*m != index) + panic("%s: invalid key found", __func__); + node->pn_child[slot] = NULL; + node->pn_count--; + if (node->pn_count > 1) + break; + for (i = 0; i < PCTRIE_COUNT; i++) + if (node->pn_child[i] != NULL) + break; + KASSERT(i != PCTRIE_COUNT, + ("%s: invalid node configuration", __func__)); + if (parent == NULL) + pctrie_setroot(ptree, node->pn_child[i]); + else { + slot = pctrie_slot(index, parent->pn_clev); + KASSERT(parent->pn_child[slot] == node, + ("%s: invalid child value", __func__)); + parent->pn_child[slot] = node->pn_child[i]; + } + node->pn_count--; + node->pn_child[i] = NULL; + pctrie_node_put(ptree, node, freefn); + break; + } + parent = node; + node = node->pn_child[slot]; + } +} + +/* + * Remove and free all the nodes from the tree. + * This function is recursive but there is a tight control on it as the + * maximum depth of the tree is fixed. 
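+ * Note that only the interior nodes are handed to freefn; the values
+ * referenced by the leaves are left untouched and remain the caller's
+ * responsibility.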
+ */ +void +pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn) +{ + struct pctrie_node *root; + + root = pctrie_getroot(ptree); + if (root == NULL) + return; + pctrie_setroot(ptree, NULL); + if (!pctrie_isleaf(root)) + pctrie_reclaim_allnodes_int(ptree, root, freefn); +} + +#ifdef DDB +/* + * Show details about the given node. + */ +DB_SHOW_COMMAND(pctrienode, db_show_pctrienode) +{ + struct pctrie_node *node; + int i; + + if (!have_addr) + return; + node = (struct pctrie_node *)addr; + db_printf("node %p, owner %jx, children count %u, level %u:\n", + (void *)node, (uintmax_t)node->pn_owner, node->pn_count, + node->pn_clev); + for (i = 0; i < PCTRIE_COUNT; i++) + if (node->pn_child[i] != NULL) + db_printf("slot: %d, val: %p, value: %p, clev: %d\n", + i, (void *)node->pn_child[i], + pctrie_isleaf(node->pn_child[i]) ? + pctrie_toval(node->pn_child[i]) : NULL, + node->pn_clev); +} +#endif /* DDB */ diff --git a/freebsd/sys/kern/vfs_acl.c b/freebsd/sys/kern/vfs_acl.c new file mode 100644 index 00000000..56192cfb --- /dev/null +++ b/freebsd/sys/kern/vfs_acl.c @@ -0,0 +1,600 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 1999-2006, 2016-2017 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Portions of this software were developed by BAE Systems, the University of + * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL + * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent + * Computing (TC) research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Developed by the TrustedBSD Project. + * + * ACL system calls and other functions common across different ACL types. + * Type-specific routines go into subr_acl_.c. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +CTASSERT(ACL_MAX_ENTRIES >= OLDACL_MAX_ENTRIES); + +MALLOC_DEFINE(M_ACL, "acl", "Access Control Lists"); + + +static int kern___acl_aclcheck_path(struct thread *td, const char *path, + acl_type_t type, struct acl *aclp, int follow); +static int kern___acl_delete_path(struct thread *td, const char *path, + acl_type_t type, int follow); +static int kern___acl_get_path(struct thread *td, const char *path, + acl_type_t type, struct acl *aclp, int follow); +static int kern___acl_set_path(struct thread *td, const char *path, + acl_type_t type, const struct acl *aclp, int follow); +static int vacl_set_acl(struct thread *td, struct vnode *vp, + acl_type_t type, const struct acl *aclp); +static int vacl_get_acl(struct thread *td, struct vnode *vp, + acl_type_t type, struct acl *aclp); +static int vacl_aclcheck(struct thread *td, struct vnode *vp, + acl_type_t type, const struct acl *aclp); + +int +acl_copy_oldacl_into_acl(const struct oldacl *source, struct acl *dest) +{ + int i; + + if (source->acl_cnt < 0 || source->acl_cnt > OLDACL_MAX_ENTRIES) + return (EINVAL); + + bzero(dest, sizeof(*dest)); + + dest->acl_cnt = source->acl_cnt; + dest->acl_maxcnt = ACL_MAX_ENTRIES; + + for (i = 0; i < dest->acl_cnt; i++) { + dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag; + dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id; + dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm; + } + + return (0); +} + +int +acl_copy_acl_into_oldacl(const struct acl *source, struct oldacl *dest) +{ + int i; + + if (source->acl_cnt > OLDACL_MAX_ENTRIES) + return (EINVAL); + + bzero(dest, sizeof(*dest)); + + dest->acl_cnt = source->acl_cnt; + + for (i = 0; i < dest->acl_cnt; i++) { + dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag; + dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id; + dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm; + } + + return (0); +} + +/* + * At one time, "struct ACL" was extended in order to add support for NFSv4 + * ACLs. Instead of creating compatibility versions of all the ACL-related + * syscalls, they were left intact. It's possible to find out what the code + * calling these syscalls (libc) expects basing on "type" argument - if it's + * either ACL_TYPE_ACCESS_OLD or ACL_TYPE_DEFAULT_OLD (which previously were + * known as ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT), then it's the "struct + * oldacl". If it's something else, then it's the new "struct acl". In the + * latter case, the routines below just copyin/copyout the contents. In the + * former case, they copyin the "struct oldacl" and convert it to the new + * format. 
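+ *
+ * For example, a pre-NFSv4 binary calling __acl_get_file(2) passes the
+ * old numeric ACL_TYPE_ACCESS value, which the kernel now sees as
+ * ACL_TYPE_ACCESS_OLD, so acl_copyout() below converts the in-kernel
+ * struct acl back into the smaller struct oldacl before copying it out.
+ * A new binary passes ACL_TYPE_ACCESS and gets the full struct acl
+ * verbatim, with acl_maxcnt checked against ACL_MAX_ENTRIES.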
+ */ +static int +acl_copyin(const void *user_acl, struct acl *kernel_acl, acl_type_t type) +{ + int error; + struct oldacl old; + + switch (type) { + case ACL_TYPE_ACCESS_OLD: + case ACL_TYPE_DEFAULT_OLD: + error = copyin(user_acl, &old, sizeof(old)); + if (error != 0) + break; + acl_copy_oldacl_into_acl(&old, kernel_acl); + break; + + default: + error = copyin(user_acl, kernel_acl, sizeof(*kernel_acl)); + if (kernel_acl->acl_maxcnt != ACL_MAX_ENTRIES) + return (EINVAL); + } + + return (error); +} + +static int +acl_copyout(const struct acl *kernel_acl, void *user_acl, acl_type_t type) +{ + uint32_t am; + int error; + struct oldacl old; + + switch (type) { + case ACL_TYPE_ACCESS_OLD: + case ACL_TYPE_DEFAULT_OLD: + error = acl_copy_acl_into_oldacl(kernel_acl, &old); + if (error != 0) + break; + + error = copyout(&old, user_acl, sizeof(old)); + break; + + default: + error = fueword32((char *)user_acl + + offsetof(struct acl, acl_maxcnt), &am); + if (error == -1) + return (EFAULT); + if (am != ACL_MAX_ENTRIES) + return (EINVAL); + + error = copyout(kernel_acl, user_acl, sizeof(*kernel_acl)); + } + + return (error); +} + +/* + * Convert "old" type - ACL_TYPE_{ACCESS,DEFAULT}_OLD - into its "new" + * counterpart. It's required for old (pre-NFSv4 ACLs) libc to work + * with new kernel. Fixing 'type' for old binaries with new libc + * is being done in lib/libc/posix1e/acl_support.c:_acl_type_unold(). + */ +static int +acl_type_unold(int type) +{ + switch (type) { + case ACL_TYPE_ACCESS_OLD: + return (ACL_TYPE_ACCESS); + + case ACL_TYPE_DEFAULT_OLD: + return (ACL_TYPE_DEFAULT); + + default: + return (type); + } +} + +/* + * These calls wrap the real vnode operations, and are called by the syscall + * code once the syscall has converted the path or file descriptor to a vnode + * (unlocked). The aclp pointer is assumed still to point to userland, so + * this should not be consumed within the kernel except by syscall code. + * Other code should directly invoke VOP_{SET,GET}ACL. + */ + +/* + * Given a vnode, set its ACL. + */ +static int +vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, + const struct acl *aclp) +{ + struct acl *inkernelacl; + struct mount *mp; + int error; + + AUDIT_ARG_VALUE(type); + inkernelacl = acl_alloc(M_WAITOK); + error = acl_copyin(aclp, inkernelacl, type); + if (error != 0) + goto out; + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + goto out; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + AUDIT_ARG_VNODE1(vp); +#ifdef MAC + error = mac_vnode_check_setacl(td->td_ucred, vp, type, inkernelacl); + if (error != 0) + goto out_unlock; +#endif + error = VOP_SETACL(vp, acl_type_unold(type), inkernelacl, + td->td_ucred, td); +#ifdef MAC +out_unlock: +#endif + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); +out: + acl_free(inkernelacl); + return (error); +} + +/* + * Given a vnode, get its ACL. 
+ */ +static int +vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, + struct acl *aclp) +{ + struct acl *inkernelacl; + int error; + + AUDIT_ARG_VALUE(type); + inkernelacl = acl_alloc(M_WAITOK | M_ZERO); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + AUDIT_ARG_VNODE1(vp); +#ifdef MAC + error = mac_vnode_check_getacl(td->td_ucred, vp, type); + if (error != 0) + goto out; +#endif + error = VOP_GETACL(vp, acl_type_unold(type), inkernelacl, + td->td_ucred, td); + +#ifdef MAC +out: +#endif + VOP_UNLOCK(vp, 0); + if (error == 0) + error = acl_copyout(inkernelacl, aclp, type); + acl_free(inkernelacl); + return (error); +} + +/* + * Given a vnode, delete its ACL. + */ +static int +vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) +{ + struct mount *mp; + int error; + + AUDIT_ARG_VALUE(type); + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + AUDIT_ARG_VNODE1(vp); +#ifdef MAC + error = mac_vnode_check_deleteacl(td->td_ucred, vp, type); + if (error != 0) + goto out; +#endif + error = VOP_SETACL(vp, acl_type_unold(type), 0, td->td_ucred, td); +#ifdef MAC +out: +#endif + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +/* + * Given a vnode, check whether an ACL is appropriate for it + * + * XXXRW: No vnode lock held so can't audit vnode state...? + */ +static int +vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, + const struct acl *aclp) +{ + struct acl *inkernelacl; + int error; + + inkernelacl = acl_alloc(M_WAITOK); + error = acl_copyin(aclp, inkernelacl, type); + if (error != 0) + goto out; + error = VOP_ACLCHECK(vp, acl_type_unold(type), inkernelacl, + td->td_ucred, td); +out: + acl_free(inkernelacl); + return (error); +} + +/* + * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't + * need to lock, as the vacl_ code will get/release any locks required. + */ + +/* + * Given a file path, get an ACL for it + */ +int +sys___acl_get_file(struct thread *td, struct __acl_get_file_args *uap) +{ + + return (kern___acl_get_path(td, uap->path, uap->type, uap->aclp, + FOLLOW)); +} + +/* + * Given a file path, get an ACL for it; don't follow links. + */ +int +sys___acl_get_link(struct thread *td, struct __acl_get_link_args *uap) +{ + + return(kern___acl_get_path(td, uap->path, uap->type, uap->aclp, + NOFOLLOW)); +} + +static int +kern___acl_get_path(struct thread *td, const char *path, acl_type_t type, + struct acl *aclp, int follow) +{ + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error == 0) { + error = vacl_get_acl(td, nd.ni_vp, type, aclp); + NDFREE(&nd, 0); + } + return (error); +} + +/* + * Given a file path, set an ACL for it. + */ +int +sys___acl_set_file(struct thread *td, struct __acl_set_file_args *uap) +{ + + return(kern___acl_set_path(td, uap->path, uap->type, uap->aclp, + FOLLOW)); +} + +/* + * Given a file path, set an ACL for it; don't follow links. 
+ */ +int +sys___acl_set_link(struct thread *td, struct __acl_set_link_args *uap) +{ + + return(kern___acl_set_path(td, uap->path, uap->type, uap->aclp, + NOFOLLOW)); +} + +static int +kern___acl_set_path(struct thread *td, const char *path, + acl_type_t type, const struct acl *aclp, int follow) +{ + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error == 0) { + error = vacl_set_acl(td, nd.ni_vp, type, aclp); + NDFREE(&nd, 0); + } + return (error); +} + +/* + * Given a file descriptor, get an ACL for it. + */ +int +sys___acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) +{ + struct file *fp; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->filedes); + error = getvnode(td, uap->filedes, + cap_rights_init(&rights, CAP_ACL_GET), &fp); + if (error == 0) { + error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp); + fdrop(fp, td); + } + return (error); +} + +/* + * Given a file descriptor, set an ACL for it. + */ +int +sys___acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) +{ + struct file *fp; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->filedes); + error = getvnode(td, uap->filedes, + cap_rights_init(&rights, CAP_ACL_SET), &fp); + if (error == 0) { + error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp); + fdrop(fp, td); + } + return (error); +} + +/* + * Given a file path, delete an ACL from it. + */ +int +sys___acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) +{ + + return (kern___acl_delete_path(td, uap->path, uap->type, FOLLOW)); +} + +/* + * Given a file path, delete an ACL from it; don't follow links. + */ +int +sys___acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap) +{ + + return (kern___acl_delete_path(td, uap->path, uap->type, NOFOLLOW)); +} + +static int +kern___acl_delete_path(struct thread *td, const char *path, + acl_type_t type, int follow) +{ + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, follow, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error == 0) { + error = vacl_delete(td, nd.ni_vp, type); + NDFREE(&nd, 0); + } + return (error); +} + +/* + * Given a file path, delete an ACL from it. + */ +int +sys___acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) +{ + struct file *fp; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->filedes); + error = getvnode(td, uap->filedes, + cap_rights_init(&rights, CAP_ACL_DELETE), &fp); + if (error == 0) { + error = vacl_delete(td, fp->f_vnode, uap->type); + fdrop(fp, td); + } + return (error); +} + +/* + * Given a file path, check an ACL for it. + */ +int +sys___acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) +{ + + return (kern___acl_aclcheck_path(td, uap->path, uap->type, uap->aclp, + FOLLOW)); +} + +/* + * Given a file path, check an ACL for it; don't follow links. + */ +int +sys___acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap) +{ + return (kern___acl_aclcheck_path(td, uap->path, uap->type, uap->aclp, + NOFOLLOW)); +} + +static int +kern___acl_aclcheck_path(struct thread *td, const char *path, acl_type_t type, + struct acl *aclp, int follow) +{ + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, follow, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error == 0) { + error = vacl_aclcheck(td, nd.ni_vp, type, aclp); + NDFREE(&nd, 0); + } + return (error); +} + +/* + * Given a file descriptor, check an ACL for it. 
+ */ +int +sys___acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) +{ + struct file *fp; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->filedes); + error = getvnode(td, uap->filedes, + cap_rights_init(&rights, CAP_ACL_CHECK), &fp); + if (error == 0) { + error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp); + fdrop(fp, td); + } + return (error); +} + +struct acl * +acl_alloc(int flags) +{ + struct acl *aclp; + + aclp = malloc(sizeof(*aclp), M_ACL, flags); + if (aclp == NULL) + return (NULL); + + aclp->acl_maxcnt = ACL_MAX_ENTRIES; + + return (aclp); +} + +void +acl_free(struct acl *aclp) +{ + + free(aclp, M_ACL); +} diff --git a/freebsd/sys/kern/vfs_aio.c b/freebsd/sys/kern/vfs_aio.c new file mode 100644 index 00000000..350c51a0 --- /dev/null +++ b/freebsd/sys/kern/vfs_aio.c @@ -0,0 +1,2987 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 1997 John S. Dyson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. John S. Dyson's name may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * DISCLAIMER: This code isn't warranted to do anything useful. Anything + * bad that happens because of using this software isn't the responsibility + * of the author. This software is distributed AS-IS. + */ + +/* + * This file contains support for the POSIX 1003.1B AIO/LIO facility. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Counter for allocating reference ids to new jobs. Wrapped to 1 on + * overflow. (XXX will be removed soon.) + */ +static u_long jobrefid; + +/* + * Counter for aio_fsync. 
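+ * Every queued job is stamped with the next value (job->seqno) so that
+ * an aio_fsync() request only waits for jobs queued before it; see the
+ * seqno comparison in aio_queue_file().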
+ */ +static uint64_t jobseqno; + +#ifndef MAX_AIO_PER_PROC +#define MAX_AIO_PER_PROC 32 +#endif + +#ifndef MAX_AIO_QUEUE_PER_PROC +#define MAX_AIO_QUEUE_PER_PROC 256 +#endif + +#ifndef MAX_AIO_QUEUE +#define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */ +#endif + +#ifndef MAX_BUF_AIO +#define MAX_BUF_AIO 16 +#endif + +FEATURE(aio, "Asynchronous I/O"); +SYSCTL_DECL(_p1003_1b); + +static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); +static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list"); + +static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, + "Async IO management"); + +static int enable_aio_unsafe = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0, + "Permit asynchronous IO on all file types, not just known-safe types"); + +static unsigned int unsafe_warningcnt = 1; +SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW, + &unsafe_warningcnt, 0, + "Warnings that will be triggered upon failed IO requests on unsafe files"); + +static int max_aio_procs = MAX_AIO_PROCS; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, + "Maximum number of kernel processes to use for handling async IO "); + +static int num_aio_procs = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, + "Number of presently active kernel processes for async IO"); + +/* + * The code will adjust the actual number of AIO processes towards this + * number when it gets a chance. + */ +static int target_aio_procs = TARGET_AIO_PROCS; +SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, + 0, + "Preferred number of ready kernel processes for async IO"); + +static int max_queue_count = MAX_AIO_QUEUE; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, + "Maximum number of aio requests to queue, globally"); + +static int num_queue_count = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, + "Number of queued aio requests"); + +static int num_buf_aio = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, + "Number of aio requests presently handled by the buf subsystem"); + +static int num_unmapped_aio = 0; +SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio, + 0, + "Number of aio requests presently handled by unmapped I/O buffers"); + +/* Number of async I/O processes in the process of being started */ +/* XXX This should be local to aio_aqueue() */ +static int num_aio_resv_start = 0; + +static int aiod_lifetime; +SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, + "Maximum lifetime for idle aiod"); + +static int max_aio_per_proc = MAX_AIO_PER_PROC; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, + 0, + "Maximum active aio requests per process"); + +static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, + &max_aio_queue_per_proc, 0, + "Maximum queued aio requests per process"); + +static int max_buf_aio = MAX_BUF_AIO; +SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, + "Maximum buf aio requests per process"); + +/* + * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires + * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with + * vfs.aio.aio_listio_max. 
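+ * A userland program is expected to read this limit roughly as
+ *
+ *	long lmax = sysconf(_SC_AIO_LISTIO_MAX);
+ *
+ * which libc answers from the p1003_1b sysctl tree declared below.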
+ */ +SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max, + CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc, + 0, "Maximum aio requests for a single lio_listio call"); + +#ifdef COMPAT_FREEBSD6 +typedef struct oaiocb { + int aio_fildes; /* File descriptor */ + off_t aio_offset; /* File offset for I/O */ + volatile void *aio_buf; /* I/O buffer in process space */ + size_t aio_nbytes; /* Number of bytes for I/O */ + struct osigevent aio_sigevent; /* Signal to deliver */ + int aio_lio_opcode; /* LIO opcode */ + int aio_reqprio; /* Request priority -- ignored */ + struct __aiocb_private _aiocb_private; +} oaiocb_t; +#endif + +/* + * Below is a key of locks used to protect each member of struct kaiocb + * aioliojob and kaioinfo and any backends. + * + * * - need not protected + * a - locked by kaioinfo lock + * b - locked by backend lock, the backend lock can be null in some cases, + * for example, BIO belongs to this type, in this case, proc lock is + * reused. + * c - locked by aio_job_mtx, the lock for the generic file I/O backend. + */ + +/* + * If the routine that services an AIO request blocks while running in an + * AIO kernel process it can starve other I/O requests. BIO requests + * queued via aio_qbio() complete asynchronously and do not use AIO kernel + * processes at all. Socket I/O requests use a separate pool of + * kprocs and also force non-blocking I/O. Other file I/O requests + * use the generic fo_read/fo_write operations which can block. The + * fsync and mlock operations can also block while executing. Ideally + * none of these requests would block while executing. + * + * Note that the service routines cannot toggle O_NONBLOCK in the file + * structure directly while handling a request due to races with + * userland threads. 
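+ *
+ * In short: BIO-capable character devices are driven through aio_qbio(),
+ * file types with their own fo_aio_queue() method (such as sockets) use
+ * that backend, and everything else falls back to the generic kernel
+ * processes run by aio_daemon() below.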
+ */ + +/* jobflags */ +#define KAIOCB_QUEUEING 0x01 +#define KAIOCB_CANCELLED 0x02 +#define KAIOCB_CANCELLING 0x04 +#define KAIOCB_CHECKSYNC 0x08 +#define KAIOCB_CLEARED 0x10 +#define KAIOCB_FINISHED 0x20 + +/* + * AIO process info + */ +#define AIOP_FREE 0x1 /* proc on free queue */ + +struct aioproc { + int aioprocflags; /* (c) AIO proc flags */ + TAILQ_ENTRY(aioproc) list; /* (c) list of processes */ + struct proc *aioproc; /* (*) the AIO proc */ +}; + +/* + * data-structure for lio signal management + */ +struct aioliojob { + int lioj_flags; /* (a) listio flags */ + int lioj_count; /* (a) listio flags */ + int lioj_finished_count; /* (a) listio flags */ + struct sigevent lioj_signal; /* (a) signal on all I/O done */ + TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ + struct knlist klist; /* (a) list of knotes */ + ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ +}; + +#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ +#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ +#define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ + +/* + * per process aio data structure + */ +struct kaioinfo { + struct mtx kaio_mtx; /* the lock to protect this struct */ + int kaio_flags; /* (a) per process kaio flags */ + int kaio_active_count; /* (c) number of currently used AIOs */ + int kaio_count; /* (a) size of AIO queue */ + int kaio_buffer_count; /* (a) number of bio buffers */ + TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */ + TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */ + TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ + TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */ + TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */ + TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */ + struct task kaio_task; /* (*) task to kick aio processes */ + struct task kaio_sync_task; /* (*) task to schedule fsync jobs */ +}; + +#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) +#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) +#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) +#define AIO_MTX(ki) (&(ki)->kaio_mtx) + +#define KAIO_RUNDOWN 0x1 /* process is being run down */ +#define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */ + +/* + * Operations used to interact with userland aio control blocks. + * Different ABIs provide their own operations. 
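+ * The native ABI uses aiocb_ops below; COMPAT_FREEBSD6 supplies
+ * aiocb_ops_osigevent for the old sigevent layout, and other compat
+ * ABIs are expected to provide their own copyin/fetch/store helpers so
+ * the core request path never touches user memory directly.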
+ */ +struct aiocb_ops { + int (*copyin)(struct aiocb *ujob, struct aiocb *kjob); + long (*fetch_status)(struct aiocb *ujob); + long (*fetch_error)(struct aiocb *ujob); + int (*store_status)(struct aiocb *ujob, long status); + int (*store_error)(struct aiocb *ujob, long error); + int (*store_kernelinfo)(struct aiocb *ujob, long jobref); + int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); +}; + +static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */ +static struct sema aio_newproc_sem; +static struct mtx aio_job_mtx; +static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ +static struct unrhdr *aiod_unr; + +void aio_init_aioinfo(struct proc *p); +static int aio_onceonly(void); +static int aio_free_entry(struct kaiocb *job); +static void aio_process_rw(struct kaiocb *job); +static void aio_process_sync(struct kaiocb *job); +static void aio_process_mlock(struct kaiocb *job); +static void aio_schedule_fsync(void *context, int pending); +static int aio_newproc(int *); +int aio_aqueue(struct thread *td, struct aiocb *ujob, + struct aioliojob *lio, int type, struct aiocb_ops *ops); +static int aio_queue_file(struct file *fp, struct kaiocb *job); +static void aio_biowakeup(struct bio *bp); +static void aio_proc_rundown(void *arg, struct proc *p); +static void aio_proc_rundown_exec(void *arg, struct proc *p, + struct image_params *imgp); +static int aio_qbio(struct proc *p, struct kaiocb *job); +static void aio_daemon(void *param); +static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job); +static bool aio_clear_cancel_function_locked(struct kaiocb *job); +static int aio_kick(struct proc *userp); +static void aio_kick_nowait(struct proc *userp); +static void aio_kick_helper(void *context, int pending); +static int filt_aioattach(struct knote *kn); +static void filt_aiodetach(struct knote *kn); +static int filt_aio(struct knote *kn, long hint); +static int filt_lioattach(struct knote *kn); +static void filt_liodetach(struct knote *kn); +static int filt_lio(struct knote *kn, long hint); + +/* + * Zones for: + * kaio Per process async io info + * aiop async io process data + * aiocb async io jobs + * aiolio list io jobs + */ +static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone; + +/* kqueue filters for aio */ +static struct filterops aio_filtops = { + .f_isfd = 0, + .f_attach = filt_aioattach, + .f_detach = filt_aiodetach, + .f_event = filt_aio, +}; +static struct filterops lio_filtops = { + .f_isfd = 0, + .f_attach = filt_lioattach, + .f_detach = filt_liodetach, + .f_event = filt_lio +}; + +static eventhandler_tag exit_tag, exec_tag; + +TASKQUEUE_DEFINE_THREAD(aiod_kick); + +/* + * Main operations function for use as a kernel module. 
+ */ +static int +aio_modload(struct module *module, int cmd, void *arg) +{ + int error = 0; + + switch (cmd) { + case MOD_LOAD: + aio_onceonly(); + break; + case MOD_SHUTDOWN: + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +static moduledata_t aio_mod = { + "aio", + &aio_modload, + NULL +}; + +DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); +MODULE_VERSION(aio, 1); + +/* + * Startup initialization + */ +static int +aio_onceonly(void) +{ + + exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, + EVENTHANDLER_PRI_ANY); + exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, + NULL, EVENTHANDLER_PRI_ANY); + kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); + kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); + TAILQ_INIT(&aio_freeproc); + sema_init(&aio_newproc_sem, 0, "aio_new_proc"); + mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); + TAILQ_INIT(&aio_jobs); + aiod_unr = new_unrhdr(1, INT_MAX, NULL); + kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, + NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + aiod_lifetime = AIOD_LIFETIME_DEFAULT; + jobrefid = 1; + p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO); + p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); + p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); + + return (0); +} + +/* + * Init the per-process aioinfo structure. The aioinfo limits are set + * per-process for user limit (resource) management. + */ +void +aio_init_aioinfo(struct proc *p) +{ + struct kaioinfo *ki; + + ki = uma_zalloc(kaio_zone, M_WAITOK); + mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); + ki->kaio_flags = 0; + ki->kaio_active_count = 0; + ki->kaio_count = 0; + ki->kaio_buffer_count = 0; + TAILQ_INIT(&ki->kaio_all); + TAILQ_INIT(&ki->kaio_done); + TAILQ_INIT(&ki->kaio_jobqueue); + TAILQ_INIT(&ki->kaio_liojoblist); + TAILQ_INIT(&ki->kaio_syncqueue); + TAILQ_INIT(&ki->kaio_syncready); + TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); + TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki); + PROC_LOCK(p); + if (p->p_aioinfo == NULL) { + p->p_aioinfo = ki; + PROC_UNLOCK(p); + } else { + PROC_UNLOCK(p); + mtx_destroy(&ki->kaio_mtx); + uma_zfree(kaio_zone, ki); + } + + while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) + aio_newproc(NULL); +} + +static int +aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) +{ + struct thread *td; + int error; + + error = sigev_findtd(p, sigev, &td); + if (error) + return (error); + if (!KSI_ONQ(ksi)) { + ksiginfo_set_sigev(ksi, sigev); + ksi->ksi_code = SI_ASYNCIO; + ksi->ksi_flags |= KSI_EXT | KSI_INS; + tdsendsignal(p, td, ksi->ksi_signo, ksi); + } + PROC_UNLOCK(p); + return (error); +} + +/* + * Free a job entry. Wait for completion if it is currently active, but don't + * delay forever. If we delay, we return a flag that says that we have to + * restart the queue scan. 
+ */ +static int +aio_free_entry(struct kaiocb *job) +{ + struct kaioinfo *ki; + struct aioliojob *lj; + struct proc *p; + + p = job->userproc; + MPASS(curproc == p); + ki = p->p_aioinfo; + MPASS(ki != NULL); + + AIO_LOCK_ASSERT(ki, MA_OWNED); + MPASS(job->jobflags & KAIOCB_FINISHED); + + atomic_subtract_int(&num_queue_count, 1); + + ki->kaio_count--; + MPASS(ki->kaio_count >= 0); + + TAILQ_REMOVE(&ki->kaio_done, job, plist); + TAILQ_REMOVE(&ki->kaio_all, job, allist); + + lj = job->lio; + if (lj) { + lj->lioj_count--; + lj->lioj_finished_count--; + + if (lj->lioj_count == 0) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + /* lio is going away, we need to destroy any knotes */ + knlist_delete(&lj->klist, curthread, 1); + PROC_LOCK(p); + sigqueue_take(&lj->lioj_ksi); + PROC_UNLOCK(p); + uma_zfree(aiolio_zone, lj); + } + } + + /* job is going away, we need to destroy any knotes */ + knlist_delete(&job->klist, curthread, 1); + PROC_LOCK(p); + sigqueue_take(&job->ksi); + PROC_UNLOCK(p); + + AIO_UNLOCK(ki); + + /* + * The thread argument here is used to find the owning process + * and is also passed to fo_close() which may pass it to various + * places such as devsw close() routines. Because of that, we + * need a thread pointer from the process owning the job that is + * persistent and won't disappear out from under us or move to + * another process. + * + * Currently, all the callers of this function call it to remove + * a kaiocb from the current process' job list either via a + * syscall or due to the current process calling exit() or + * execve(). Thus, we know that p == curproc. We also know that + * curthread can't exit since we are curthread. + * + * Therefore, we use curthread as the thread to pass to + * knlist_delete(). This does mean that it is possible for the + * thread pointer at close time to differ from the thread pointer + * at open time, but this is already true of file descriptors in + * a multithreaded process. + */ + if (job->fd_file) + fdrop(job->fd_file, curthread); + crfree(job->cred); + uma_zfree(aiocb_zone, job); + AIO_LOCK(ki); + + return (0); +} + +static void +aio_proc_rundown_exec(void *arg, struct proc *p, + struct image_params *imgp __unused) +{ + aio_proc_rundown(arg, p); +} + +static int +aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job) +{ + aio_cancel_fn_t *func; + int cancelled; + + AIO_LOCK_ASSERT(ki, MA_OWNED); + if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED)) + return (0); + MPASS((job->jobflags & KAIOCB_CANCELLING) == 0); + job->jobflags |= KAIOCB_CANCELLED; + + func = job->cancel_fn; + + /* + * If there is no cancel routine, just leave the job marked as + * cancelled. The job should be in active use by a caller who + * should complete it normally or when it fails to install a + * cancel routine. + */ + if (func == NULL) + return (0); + + /* + * Set the CANCELLING flag so that aio_complete() will defer + * completions of this job. This prevents the job from being + * freed out from under the cancel callback. After the + * callback any deferred completion (whether from the callback + * or any other source) will be completed. 
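+ * The flag is cleared again once the callback returns; if the job
+ * finished in the meantime, the deferred completion is delivered below
+ * via aio_bio_done_notify().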
+ */ + job->jobflags |= KAIOCB_CANCELLING; + AIO_UNLOCK(ki); + func(job); + AIO_LOCK(ki); + job->jobflags &= ~KAIOCB_CANCELLING; + if (job->jobflags & KAIOCB_FINISHED) { + cancelled = job->uaiocb._aiocb_private.error == ECANCELED; + TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); + aio_bio_done_notify(p, job); + } else { + /* + * The cancel callback might have scheduled an + * operation to cancel this request, but it is + * only counted as cancelled if the request is + * cancelled when the callback returns. + */ + cancelled = 0; + } + return (cancelled); +} + +/* + * Rundown the jobs for a given process. + */ +static void +aio_proc_rundown(void *arg, struct proc *p) +{ + struct kaioinfo *ki; + struct aioliojob *lj; + struct kaiocb *job, *jobn; + + KASSERT(curthread->td_proc == p, + ("%s: called on non-curproc", __func__)); + ki = p->p_aioinfo; + if (ki == NULL) + return; + + AIO_LOCK(ki); + ki->kaio_flags |= KAIO_RUNDOWN; + +restart: + + /* + * Try to cancel all pending requests. This code simulates + * aio_cancel on all pending I/O requests. + */ + TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { + aio_cancel_job(p, ki, job); + } + + /* Wait for all running I/O to be finished */ + if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) { + ki->kaio_flags |= KAIO_WAKEUP; + msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); + goto restart; + } + + /* Free all completed I/O requests. */ + while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL) + aio_free_entry(job); + + while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { + if (lj->lioj_count == 0) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + knlist_delete(&lj->klist, curthread, 1); + PROC_LOCK(p); + sigqueue_take(&lj->lioj_ksi); + PROC_UNLOCK(p); + uma_zfree(aiolio_zone, lj); + } else { + panic("LIO job not cleaned up: C:%d, FC:%d\n", + lj->lioj_count, lj->lioj_finished_count); + } + } + AIO_UNLOCK(ki); + taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task); + taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task); + mtx_destroy(&ki->kaio_mtx); + uma_zfree(kaio_zone, ki); + p->p_aioinfo = NULL; +} + +/* + * Select a job to run (called by an AIO daemon). + */ +static struct kaiocb * +aio_selectjob(struct aioproc *aiop) +{ + struct kaiocb *job; + struct kaioinfo *ki; + struct proc *userp; + + mtx_assert(&aio_job_mtx, MA_OWNED); +restart: + TAILQ_FOREACH(job, &aio_jobs, list) { + userp = job->userproc; + ki = userp->p_aioinfo; + + if (ki->kaio_active_count < max_aio_per_proc) { + TAILQ_REMOVE(&aio_jobs, job, list); + if (!aio_clear_cancel_function(job)) + goto restart; + + /* Account for currently active jobs. */ + ki->kaio_active_count++; + break; + } + } + return (job); +} + +/* + * Move all data to a permanent storage device. This code + * simulates the fsync syscall. + */ +static int +aio_fsync_vnode(struct thread *td, struct vnode *vp) +{ + struct mount *mp; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto drop; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (vp->v_object != NULL) { + VM_OBJECT_WLOCK(vp->v_object); + vm_object_page_clean(vp->v_object, 0, 0, 0); + VM_OBJECT_WUNLOCK(vp->v_object); + } + error = VOP_FSYNC(vp, MNT_WAIT, td); + + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); +drop: + return (error); +} + +/* + * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that + * does the I/O request for the non-bio version of the operations. 
The normal + * vn operations are used, and this code should work in all instances for every + * type of file, including pipes, sockets, fifos, and regular files. + * + * XXX I don't think it works well for socket, pipe, and fifo. + */ +static void +aio_process_rw(struct kaiocb *job) +{ + struct ucred *td_savedcred; + struct thread *td; + struct aiocb *cb; + struct file *fp; + struct uio auio; + struct iovec aiov; + ssize_t cnt; + long msgsnd_st, msgsnd_end; + long msgrcv_st, msgrcv_end; + long oublock_st, oublock_end; + long inblock_st, inblock_end; + int error; + + KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ || + job->uaiocb.aio_lio_opcode == LIO_WRITE, + ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); + + aio_switch_vmspace(job); + td = curthread; + td_savedcred = td->td_ucred; + td->td_ucred = job->cred; + cb = &job->uaiocb; + fp = job->fd_file; + + aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; + aiov.iov_len = cb->aio_nbytes; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = cb->aio_offset; + auio.uio_resid = cb->aio_nbytes; + cnt = cb->aio_nbytes; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + + msgrcv_st = td->td_ru.ru_msgrcv; + msgsnd_st = td->td_ru.ru_msgsnd; + inblock_st = td->td_ru.ru_inblock; + oublock_st = td->td_ru.ru_oublock; + + /* + * aio_aqueue() acquires a reference to the file that is + * released in aio_free_entry(). + */ + if (cb->aio_lio_opcode == LIO_READ) { + auio.uio_rw = UIO_READ; + if (auio.uio_resid == 0) + error = 0; + else + error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); + } else { + if (fp->f_type == DTYPE_VNODE) + bwillwrite(); + auio.uio_rw = UIO_WRITE; + error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); + } + msgrcv_end = td->td_ru.ru_msgrcv; + msgsnd_end = td->td_ru.ru_msgsnd; + inblock_end = td->td_ru.ru_inblock; + oublock_end = td->td_ru.ru_oublock; + + job->msgrcv = msgrcv_end - msgrcv_st; + job->msgsnd = msgsnd_end - msgsnd_st; + job->inblock = inblock_end - inblock_st; + job->outblock = oublock_end - oublock_st; + + if ((error) && (auio.uio_resid != cnt)) { + if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) + error = 0; + if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { + PROC_LOCK(job->userproc); + kern_psignal(job->userproc, SIGPIPE); + PROC_UNLOCK(job->userproc); + } + } + + cnt -= auio.uio_resid; + td->td_ucred = td_savedcred; + if (error) + aio_complete(job, -1, error); + else + aio_complete(job, cnt, 0); +} + +static void +aio_process_sync(struct kaiocb *job) +{ + struct thread *td = curthread; + struct ucred *td_savedcred = td->td_ucred; + struct file *fp = job->fd_file; + int error = 0; + + KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC, + ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); + + td->td_ucred = job->cred; + if (fp->f_vnode != NULL) + error = aio_fsync_vnode(td, fp->f_vnode); + td->td_ucred = td_savedcred; + if (error) + aio_complete(job, -1, error); + else + aio_complete(job, 0, 0); +} + +static void +aio_process_mlock(struct kaiocb *job) +{ + struct aiocb *cb = &job->uaiocb; + int error; + + KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK, + ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); + + aio_switch_vmspace(job); + error = kern_mlock(job->userproc, job->cred, + __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes); + aio_complete(job, error != 0 ? 
-1 : 0, error); +} + +static void +aio_bio_done_notify(struct proc *userp, struct kaiocb *job) +{ + struct aioliojob *lj; + struct kaioinfo *ki; + struct kaiocb *sjob, *sjobn; + int lj_done; + bool schedule_fsync; + + ki = userp->p_aioinfo; + AIO_LOCK_ASSERT(ki, MA_OWNED); + lj = job->lio; + lj_done = 0; + if (lj) { + lj->lioj_finished_count++; + if (lj->lioj_count == lj->lioj_finished_count) + lj_done = 1; + } + TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist); + MPASS(job->jobflags & KAIOCB_FINISHED); + + if (ki->kaio_flags & KAIO_RUNDOWN) + goto notification_done; + + if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || + job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) + aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi); + + KNOTE_LOCKED(&job->klist, 1); + + if (lj_done) { + if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { + lj->lioj_flags |= LIOJ_KEVENT_POSTED; + KNOTE_LOCKED(&lj->klist, 1); + } + if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) + == LIOJ_SIGNAL + && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || + lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { + aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi); + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + } + } + +notification_done: + if (job->jobflags & KAIOCB_CHECKSYNC) { + schedule_fsync = false; + TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) { + if (job->fd_file != sjob->fd_file || + job->seqno >= sjob->seqno) + continue; + if (--sjob->pending > 0) + continue; + TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list); + if (!aio_clear_cancel_function_locked(sjob)) + continue; + TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list); + schedule_fsync = true; + } + if (schedule_fsync) + taskqueue_enqueue(taskqueue_aiod_kick, + &ki->kaio_sync_task); + } + if (ki->kaio_flags & KAIO_WAKEUP) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(&userp->p_aioinfo); + } +} + +static void +aio_schedule_fsync(void *context, int pending) +{ + struct kaioinfo *ki; + struct kaiocb *job; + + ki = context; + AIO_LOCK(ki); + while (!TAILQ_EMPTY(&ki->kaio_syncready)) { + job = TAILQ_FIRST(&ki->kaio_syncready); + TAILQ_REMOVE(&ki->kaio_syncready, job, list); + AIO_UNLOCK(ki); + aio_schedule(job, aio_process_sync); + AIO_LOCK(ki); + } + AIO_UNLOCK(ki); +} + +bool +aio_cancel_cleared(struct kaiocb *job) +{ + + /* + * The caller should hold the same queue lock held when + * aio_clear_cancel_function() was called and set this flag + * ensuring this check sees an up-to-date value. However, + * there is no way to assert that. 
+ */ + return ((job->jobflags & KAIOCB_CLEARED) != 0); +} + +static bool +aio_clear_cancel_function_locked(struct kaiocb *job) +{ + + AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); + MPASS(job->cancel_fn != NULL); + if (job->jobflags & KAIOCB_CANCELLING) { + job->jobflags |= KAIOCB_CLEARED; + return (false); + } + job->cancel_fn = NULL; + return (true); +} + +bool +aio_clear_cancel_function(struct kaiocb *job) +{ + struct kaioinfo *ki; + bool ret; + + ki = job->userproc->p_aioinfo; + AIO_LOCK(ki); + ret = aio_clear_cancel_function_locked(job); + AIO_UNLOCK(ki); + return (ret); +} + +static bool +aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func) +{ + + AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); + if (job->jobflags & KAIOCB_CANCELLED) + return (false); + job->cancel_fn = func; + return (true); +} + +bool +aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func) +{ + struct kaioinfo *ki; + bool ret; + + ki = job->userproc->p_aioinfo; + AIO_LOCK(ki); + ret = aio_set_cancel_function_locked(job, func); + AIO_UNLOCK(ki); + return (ret); +} + +void +aio_complete(struct kaiocb *job, long status, int error) +{ + struct kaioinfo *ki; + struct proc *userp; + + job->uaiocb._aiocb_private.error = error; + job->uaiocb._aiocb_private.status = status; + + userp = job->userproc; + ki = userp->p_aioinfo; + + AIO_LOCK(ki); + KASSERT(!(job->jobflags & KAIOCB_FINISHED), + ("duplicate aio_complete")); + job->jobflags |= KAIOCB_FINISHED; + if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) { + TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); + aio_bio_done_notify(userp, job); + } + AIO_UNLOCK(ki); +} + +void +aio_cancel(struct kaiocb *job) +{ + + aio_complete(job, -1, ECANCELED); +} + +void +aio_switch_vmspace(struct kaiocb *job) +{ + + vmspace_switch_aio(job->userproc->p_vmspace); +} + +/* + * The AIO daemon, most of the actual work is done in aio_process_*, + * but the setup (and address space mgmt) is done in this routine. + */ +static void +aio_daemon(void *_id) +{ + struct kaiocb *job; + struct aioproc *aiop; + struct kaioinfo *ki; + struct proc *p; + struct vmspace *myvm; + struct thread *td = curthread; + int id = (intptr_t)_id; + + /* + * Grab an extra reference on the daemon's vmspace so that it + * doesn't get freed by jobs that switch to a different + * vmspace. + */ + p = td->td_proc; + myvm = vmspace_acquire_ref(p); + + KASSERT(p->p_textvp == NULL, ("kthread has a textvp")); + + /* + * Allocate and ready the aio control info. There is one aiop structure + * per daemon. + */ + aiop = uma_zalloc(aiop_zone, M_WAITOK); + aiop->aioproc = p; + aiop->aioprocflags = 0; + + /* + * Wakeup parent process. (Parent sleeps to keep from blasting away + * and creating too many daemons.) + */ + sema_post(&aio_newproc_sem); + + mtx_lock(&aio_job_mtx); + for (;;) { + /* + * Take daemon off of free queue + */ + if (aiop->aioprocflags & AIOP_FREE) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + aiop->aioprocflags &= ~AIOP_FREE; + } + + /* + * Check for jobs. + */ + while ((job = aio_selectjob(aiop)) != NULL) { + mtx_unlock(&aio_job_mtx); + + ki = job->userproc->p_aioinfo; + job->handle_fn(job); + + mtx_lock(&aio_job_mtx); + /* Decrement the active job count. */ + ki->kaio_active_count--; + } + + /* + * Disconnect from user address space. + */ + if (p->p_vmspace != myvm) { + mtx_unlock(&aio_job_mtx); + vmspace_switch_aio(myvm); + mtx_lock(&aio_job_mtx); + /* + * We have to restart to avoid race, we only sleep if + * no job can be selected. 
+ */ + continue; + } + + mtx_assert(&aio_job_mtx, MA_OWNED); + + TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); + aiop->aioprocflags |= AIOP_FREE; + + /* + * If daemon is inactive for a long time, allow it to exit, + * thereby freeing resources. + */ + if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy", + aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) && + (aiop->aioprocflags & AIOP_FREE) && + num_aio_procs > target_aio_procs) + break; + } + TAILQ_REMOVE(&aio_freeproc, aiop, list); + num_aio_procs--; + mtx_unlock(&aio_job_mtx); + uma_zfree(aiop_zone, aiop); + free_unr(aiod_unr, id); + vmspace_free(myvm); + + KASSERT(p->p_vmspace == myvm, + ("AIOD: bad vmspace for exiting daemon")); + KASSERT(myvm->vm_refcnt > 1, + ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt)); + kproc_exit(0); +} + +/* + * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The + * AIO daemon modifies its environment itself. + */ +static int +aio_newproc(int *start) +{ + int error; + struct proc *p; + int id; + + id = alloc_unr(aiod_unr); + error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, + RFNOWAIT, 0, "aiod%d", id); + if (error == 0) { + /* + * Wait until daemon is started. + */ + sema_wait(&aio_newproc_sem); + mtx_lock(&aio_job_mtx); + num_aio_procs++; + if (start != NULL) + (*start)--; + mtx_unlock(&aio_job_mtx); + } else { + free_unr(aiod_unr, id); + } + return (error); +} + +/* + * Try the high-performance, low-overhead bio method for eligible + * VCHR devices. This method doesn't use an aio helper thread, and + * thus has very low overhead. + * + * Assumes that the caller, aio_aqueue(), has incremented the file + * structure's reference count, preventing its deallocation for the + * duration of this call. + */ +static int +aio_qbio(struct proc *p, struct kaiocb *job) +{ + struct aiocb *cb; + struct file *fp; + struct bio *bp; + struct buf *pbuf; + struct vnode *vp; + struct cdevsw *csw; + struct cdev *dev; + struct kaioinfo *ki; + int error, ref, poff; + vm_prot_t prot; + + cb = &job->uaiocb; + fp = job->fd_file; + + if (!(cb->aio_lio_opcode == LIO_WRITE || + cb->aio_lio_opcode == LIO_READ)) + return (-1); + if (fp == NULL || fp->f_type != DTYPE_VNODE) + return (-1); + + vp = fp->f_vnode; + if (vp->v_type != VCHR) + return (-1); + if (vp->v_bufobj.bo_bsize == 0) + return (-1); + if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) + return (-1); + + ref = 0; + csw = devvn_refthread(vp, &dev, &ref); + if (csw == NULL) + return (ENXIO); + + if ((csw->d_flags & D_DISK) == 0) { + error = -1; + goto unref; + } + if (cb->aio_nbytes > dev->si_iosize_max) { + error = -1; + goto unref; + } + + ki = p->p_aioinfo; + poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; + if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { + if (cb->aio_nbytes > MAXPHYS) { + error = -1; + goto unref; + } + + pbuf = NULL; + } else { + if (cb->aio_nbytes > MAXPHYS - poff) { + error = -1; + goto unref; + } + if (ki->kaio_buffer_count >= max_buf_aio) { + error = EAGAIN; + goto unref; + } + + job->pbuf = pbuf = (struct buf *)getpbuf(NULL); + BUF_KERNPROC(pbuf); + AIO_LOCK(ki); + ki->kaio_buffer_count++; + AIO_UNLOCK(ki); + } + job->bp = bp = g_alloc_bio(); + + bp->bio_length = cb->aio_nbytes; + bp->bio_bcount = cb->aio_nbytes; + bp->bio_done = aio_biowakeup; + bp->bio_data = (void *)(uintptr_t)cb->aio_buf; + bp->bio_offset = cb->aio_offset; + bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? 
BIO_WRITE : BIO_READ; + bp->bio_dev = dev; + bp->bio_caller1 = (void *)job; + + prot = VM_PROT_READ; + if (cb->aio_lio_opcode == LIO_READ) + prot |= VM_PROT_WRITE; /* Less backwards than it looks */ + job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, + (vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages, + nitems(job->pages)); + if (job->npages < 0) { + error = EFAULT; + goto doerror; + } + if (pbuf != NULL) { + pmap_qenter((vm_offset_t)pbuf->b_data, + job->pages, job->npages); + bp->bio_data = pbuf->b_data + poff; + atomic_add_int(&num_buf_aio, 1); + } else { + bp->bio_ma = job->pages; + bp->bio_ma_n = job->npages; + bp->bio_ma_offset = poff; + bp->bio_data = unmapped_buf; + bp->bio_flags |= BIO_UNMAPPED; + atomic_add_int(&num_unmapped_aio, 1); + } + + /* Perform transfer. */ + csw->d_strategy(bp); + dev_relthread(dev, ref); + return (0); + +doerror: + if (pbuf != NULL) { + AIO_LOCK(ki); + ki->kaio_buffer_count--; + AIO_UNLOCK(ki); + relpbuf(pbuf, NULL); + job->pbuf = NULL; + } + g_destroy_bio(bp); + job->bp = NULL; +unref: + dev_relthread(dev, ref); + return (error); +} + +#ifdef COMPAT_FREEBSD6 +static int +convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) +{ + + /* + * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are + * supported by AIO with the old sigevent structure. + */ + nsig->sigev_notify = osig->sigev_notify; + switch (nsig->sigev_notify) { + case SIGEV_NONE: + break; + case SIGEV_SIGNAL: + nsig->sigev_signo = osig->__sigev_u.__sigev_signo; + break; + case SIGEV_KEVENT: + nsig->sigev_notify_kqueue = + osig->__sigev_u.__sigev_notify_kqueue; + nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; + break; + default: + return (EINVAL); + } + return (0); +} + +static int +aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) +{ + struct oaiocb *ojob; + int error; + + bzero(kjob, sizeof(struct aiocb)); + error = copyin(ujob, kjob, sizeof(struct oaiocb)); + if (error) + return (error); + ojob = (struct oaiocb *)kjob; + return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent)); +} +#endif + +static int +aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob) +{ + + return (copyin(ujob, kjob, sizeof(struct aiocb))); +} + +static long +aiocb_fetch_status(struct aiocb *ujob) +{ + + return (fuword(&ujob->_aiocb_private.status)); +} + +static long +aiocb_fetch_error(struct aiocb *ujob) +{ + + return (fuword(&ujob->_aiocb_private.error)); +} + +static int +aiocb_store_status(struct aiocb *ujob, long status) +{ + + return (suword(&ujob->_aiocb_private.status, status)); +} + +static int +aiocb_store_error(struct aiocb *ujob, long error) +{ + + return (suword(&ujob->_aiocb_private.error, error)); +} + +static int +aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) +{ + + return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); +} + +static int +aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) +{ + + return (suword(ujobp, (long)ujob)); +} + +static struct aiocb_ops aiocb_ops = { + .copyin = aiocb_copyin, + .fetch_status = aiocb_fetch_status, + .fetch_error = aiocb_fetch_error, + .store_status = aiocb_store_status, + .store_error = aiocb_store_error, + .store_kernelinfo = aiocb_store_kernelinfo, + .store_aiocb = aiocb_store_aiocb, +}; + +#ifdef COMPAT_FREEBSD6 +static struct aiocb_ops aiocb_ops_osigevent = { + .copyin = aiocb_copyin_old_sigevent, + .fetch_status = aiocb_fetch_status, + .fetch_error = aiocb_fetch_error, + .store_status = aiocb_store_status, + .store_error = aiocb_store_error, + 
.store_kernelinfo = aiocb_store_kernelinfo, + .store_aiocb = aiocb_store_aiocb, +}; +#endif + +/* + * Queue a new AIO request. Choosing either the threaded or direct bio VCHR + * technique is done in this code. + */ +int +aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, + int type, struct aiocb_ops *ops) +{ + struct proc *p = td->td_proc; + struct file *fp; + struct kaiocb *job; + struct kaioinfo *ki; + struct kevent kev; + int opcode; + int error; + int fd, kqfd; + int jid; + u_short evflags; + + if (p->p_aioinfo == NULL) + aio_init_aioinfo(p); + + ki = p->p_aioinfo; + + ops->store_status(ujob, -1); + ops->store_error(ujob, 0); + ops->store_kernelinfo(ujob, -1); + + if (num_queue_count >= max_queue_count || + ki->kaio_count >= max_aio_queue_per_proc) { + ops->store_error(ujob, EAGAIN); + return (EAGAIN); + } + + job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); + knlist_init_mtx(&job->klist, AIO_MTX(ki)); + + error = ops->copyin(ujob, &job->uaiocb); + if (error) { + ops->store_error(ujob, error); + uma_zfree(aiocb_zone, job); + return (error); + } + + if (job->uaiocb.aio_nbytes > IOSIZE_MAX) { + uma_zfree(aiocb_zone, job); + return (EINVAL); + } + + if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && + job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && + job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && + job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { + ops->store_error(ujob, EINVAL); + uma_zfree(aiocb_zone, job); + return (EINVAL); + } + + if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || + job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && + !_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) { + uma_zfree(aiocb_zone, job); + return (EINVAL); + } + + ksiginfo_init(&job->ksi); + + /* Save userspace address of the job info. */ + job->ujob = ujob; + + /* Get the opcode. */ + if (type != LIO_NOP) + job->uaiocb.aio_lio_opcode = type; + opcode = job->uaiocb.aio_lio_opcode; + + /* + * Validate the opcode and fetch the file object for the specified + * file descriptor. + * + * XXXRW: Moved the opcode validation up here so that we don't + * retrieve a file descriptor without knowing what the capabiltity + * should be. 
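+ * Each opcode therefore asks only for the capability rights it needs:
+ * write rights for LIO_WRITE, read rights for LIO_READ and fsync rights
+ * for LIO_SYNC, while LIO_MLOCK does not take a file descriptor at all.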
+ */ + fd = job->uaiocb.aio_fildes; + switch (opcode) { + case LIO_WRITE: + error = fget_write(td, fd, &cap_pwrite_rights, &fp); + break; + case LIO_READ: + error = fget_read(td, fd, &cap_pread_rights, &fp); + break; + case LIO_SYNC: + error = fget(td, fd, &cap_fsync_rights, &fp); + break; + case LIO_MLOCK: + fp = NULL; + break; + case LIO_NOP: + error = fget(td, fd, &cap_no_rights, &fp); + break; + default: + error = EINVAL; + } + if (error) { + uma_zfree(aiocb_zone, job); + ops->store_error(ujob, error); + return (error); + } + + if (opcode == LIO_SYNC && fp->f_vnode == NULL) { + error = EINVAL; + goto aqueue_fail; + } + + if ((opcode == LIO_READ || opcode == LIO_WRITE) && + job->uaiocb.aio_offset < 0 && + (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) { + error = EINVAL; + goto aqueue_fail; + } + + job->fd_file = fp; + + mtx_lock(&aio_job_mtx); + jid = jobrefid++; + job->seqno = jobseqno++; + mtx_unlock(&aio_job_mtx); + error = ops->store_kernelinfo(ujob, jid); + if (error) { + error = EINVAL; + goto aqueue_fail; + } + job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; + + if (opcode == LIO_NOP) { + fdrop(fp, td); + uma_zfree(aiocb_zone, job); + return (0); + } + + if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) + goto no_kqueue; + evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags; + if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) { + error = EINVAL; + goto aqueue_fail; + } + kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue; + memset(&kev, 0, sizeof(kev)); + kev.ident = (uintptr_t)job->ujob; + kev.filter = EVFILT_AIO; + kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags; + kev.data = (intptr_t)job; + kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr; + error = kqfd_register(kqfd, &kev, td, M_WAITOK); + if (error) + goto aqueue_fail; + +no_kqueue: + + ops->store_error(ujob, EINPROGRESS); + job->uaiocb._aiocb_private.error = EINPROGRESS; + job->userproc = p; + job->cred = crhold(td->td_ucred); + job->jobflags = KAIOCB_QUEUEING; + job->lio = lj; + + if (opcode == LIO_MLOCK) { + aio_schedule(job, aio_process_mlock); + error = 0; + } else if (fp->f_ops->fo_aio_queue == NULL) + error = aio_queue_file(fp, job); + else + error = fo_aio_queue(fp, job); + if (error) + goto aqueue_fail; + + AIO_LOCK(ki); + job->jobflags &= ~KAIOCB_QUEUEING; + TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist); + ki->kaio_count++; + if (lj) + lj->lioj_count++; + atomic_add_int(&num_queue_count, 1); + if (job->jobflags & KAIOCB_FINISHED) { + /* + * The queue callback completed the request synchronously. + * The bulk of the completion is deferred in that case + * until this point. 
+ */ + aio_bio_done_notify(p, job); + } else + TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist); + AIO_UNLOCK(ki); + return (0); + +aqueue_fail: + knlist_delete(&job->klist, curthread, 0); + if (fp) + fdrop(fp, td); + uma_zfree(aiocb_zone, job); + ops->store_error(ujob, error); + return (error); +} + +static void +aio_cancel_daemon_job(struct kaiocb *job) +{ + + mtx_lock(&aio_job_mtx); + if (!aio_cancel_cleared(job)) + TAILQ_REMOVE(&aio_jobs, job, list); + mtx_unlock(&aio_job_mtx); + aio_cancel(job); +} + +void +aio_schedule(struct kaiocb *job, aio_handle_fn_t *func) +{ + + mtx_lock(&aio_job_mtx); + if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) { + mtx_unlock(&aio_job_mtx); + aio_cancel(job); + return; + } + job->handle_fn = func; + TAILQ_INSERT_TAIL(&aio_jobs, job, list); + aio_kick_nowait(job->userproc); + mtx_unlock(&aio_job_mtx); +} + +static void +aio_cancel_sync(struct kaiocb *job) +{ + struct kaioinfo *ki; + + ki = job->userproc->p_aioinfo; + AIO_LOCK(ki); + if (!aio_cancel_cleared(job)) + TAILQ_REMOVE(&ki->kaio_syncqueue, job, list); + AIO_UNLOCK(ki); + aio_cancel(job); +} + +int +aio_queue_file(struct file *fp, struct kaiocb *job) +{ + struct kaioinfo *ki; + struct kaiocb *job2; + struct vnode *vp; + struct mount *mp; + int error; + bool safe; + + ki = job->userproc->p_aioinfo; + error = aio_qbio(job->userproc, job); + if (error >= 0) + return (error); + safe = false; + if (fp->f_type == DTYPE_VNODE) { + vp = fp->f_vnode; + if (vp->v_type == VREG || vp->v_type == VDIR) { + mp = fp->f_vnode->v_mount; + if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0) + safe = true; + } + } + if (!(safe || enable_aio_unsafe)) { + counted_warning(&unsafe_warningcnt, + "is attempting to use unsafe AIO requests"); + return (EOPNOTSUPP); + } + + switch (job->uaiocb.aio_lio_opcode) { + case LIO_READ: + case LIO_WRITE: + aio_schedule(job, aio_process_rw); + error = 0; + break; + case LIO_SYNC: + AIO_LOCK(ki); + TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) { + if (job2->fd_file == job->fd_file && + job2->uaiocb.aio_lio_opcode != LIO_SYNC && + job2->seqno < job->seqno) { + job2->jobflags |= KAIOCB_CHECKSYNC; + job->pending++; + } + } + if (job->pending != 0) { + if (!aio_set_cancel_function_locked(job, + aio_cancel_sync)) { + AIO_UNLOCK(ki); + aio_cancel(job); + return (0); + } + TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list); + AIO_UNLOCK(ki); + return (0); + } + AIO_UNLOCK(ki); + aio_schedule(job, aio_process_sync); + error = 0; + break; + default: + error = EINVAL; + } + return (error); +} + +static void +aio_kick_nowait(struct proc *userp) +{ + struct kaioinfo *ki = userp->p_aioinfo; + struct aioproc *aiop; + + mtx_assert(&aio_job_mtx, MA_OWNED); + if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + aiop->aioprocflags &= ~AIOP_FREE; + wakeup(aiop->aioproc); + } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && + ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { + taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task); + } +} + +static int +aio_kick(struct proc *userp) +{ + struct kaioinfo *ki = userp->p_aioinfo; + struct aioproc *aiop; + int error, ret = 0; + + mtx_assert(&aio_job_mtx, MA_OWNED); +retryproc: + if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + aiop->aioprocflags &= ~AIOP_FREE; + wakeup(aiop->aioproc); + } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && + ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { + 
num_aio_resv_start++; + mtx_unlock(&aio_job_mtx); + error = aio_newproc(&num_aio_resv_start); + mtx_lock(&aio_job_mtx); + if (error) { + num_aio_resv_start--; + goto retryproc; + } + } else { + ret = -1; + } + return (ret); +} + +static void +aio_kick_helper(void *context, int pending) +{ + struct proc *userp = context; + + mtx_lock(&aio_job_mtx); + while (--pending >= 0) { + if (aio_kick(userp)) + break; + } + mtx_unlock(&aio_job_mtx); +} + +/* + * Support the aio_return system call, as a side-effect, kernel resources are + * released. + */ +static int +kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) +{ + struct proc *p = td->td_proc; + struct kaiocb *job; + struct kaioinfo *ki; + long status, error; + + ki = p->p_aioinfo; + if (ki == NULL) + return (EINVAL); + AIO_LOCK(ki); + TAILQ_FOREACH(job, &ki->kaio_done, plist) { + if (job->ujob == ujob) + break; + } + if (job != NULL) { + MPASS(job->jobflags & KAIOCB_FINISHED); + status = job->uaiocb._aiocb_private.status; + error = job->uaiocb._aiocb_private.error; + td->td_retval[0] = status; + td->td_ru.ru_oublock += job->outblock; + td->td_ru.ru_inblock += job->inblock; + td->td_ru.ru_msgsnd += job->msgsnd; + td->td_ru.ru_msgrcv += job->msgrcv; + aio_free_entry(job); + AIO_UNLOCK(ki); + ops->store_error(ujob, error); + ops->store_status(ujob, status); + } else { + error = EINVAL; + AIO_UNLOCK(ki); + } + return (error); +} + +int +sys_aio_return(struct thread *td, struct aio_return_args *uap) +{ + + return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); +} + +/* + * Allow a process to wakeup when any of the I/O requests are completed. + */ +static int +kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, + struct timespec *ts) +{ + struct proc *p = td->td_proc; + struct timeval atv; + struct kaioinfo *ki; + struct kaiocb *firstjob, *job; + int error, i, timo; + + timo = 0; + if (ts) { + if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) + return (EINVAL); + + TIMESPEC_TO_TIMEVAL(&atv, ts); + if (itimerfix(&atv)) + return (EINVAL); + timo = tvtohz(&atv); + } + + ki = p->p_aioinfo; + if (ki == NULL) + return (EAGAIN); + + if (njoblist == 0) + return (0); + + AIO_LOCK(ki); + for (;;) { + firstjob = NULL; + error = 0; + TAILQ_FOREACH(job, &ki->kaio_all, allist) { + for (i = 0; i < njoblist; i++) { + if (job->ujob == ujoblist[i]) { + if (firstjob == NULL) + firstjob = job; + if (job->jobflags & KAIOCB_FINISHED) + goto RETURN; + } + } + } + /* All tasks were finished. */ + if (firstjob == NULL) + break; + + ki->kaio_flags |= KAIO_WAKEUP; + error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, + "aiospn", timo); + if (error == ERESTART) + error = EINTR; + if (error) + break; + } +RETURN: + AIO_UNLOCK(ki); + return (error); +} + +int +sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap) +{ + struct timespec ts, *tsp; + struct aiocb **ujoblist; + int error; + + if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) + return (EINVAL); + + if (uap->timeout) { + /* Get timespec struct. */ + if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) + return (error); + tsp = &ts; + } else + tsp = NULL; + + ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); + error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); + if (error == 0) + error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); + free(ujoblist, M_AIOS); + return (error); +} + +/* + * aio_cancel cancels any non-bio aio operations not currently in progress. 
+ */ +int +sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap) +{ + struct proc *p = td->td_proc; + struct kaioinfo *ki; + struct kaiocb *job, *jobn; + struct file *fp; + int error; + int cancelled = 0; + int notcancelled = 0; + struct vnode *vp; + + /* Lookup file object. */ + error = fget(td, uap->fd, &cap_no_rights, &fp); + if (error) + return (error); + + ki = p->p_aioinfo; + if (ki == NULL) + goto done; + + if (fp->f_type == DTYPE_VNODE) { + vp = fp->f_vnode; + if (vn_isdisk(vp, &error)) { + fdrop(fp, td); + td->td_retval[0] = AIO_NOTCANCELED; + return (0); + } + } + + AIO_LOCK(ki); + TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { + if ((uap->fd == job->uaiocb.aio_fildes) && + ((uap->aiocbp == NULL) || + (uap->aiocbp == job->ujob))) { + if (aio_cancel_job(p, ki, job)) { + cancelled++; + } else { + notcancelled++; + } + if (uap->aiocbp != NULL) + break; + } + } + AIO_UNLOCK(ki); + +done: + fdrop(fp, td); + + if (uap->aiocbp != NULL) { + if (cancelled) { + td->td_retval[0] = AIO_CANCELED; + return (0); + } + } + + if (notcancelled) { + td->td_retval[0] = AIO_NOTCANCELED; + return (0); + } + + if (cancelled) { + td->td_retval[0] = AIO_CANCELED; + return (0); + } + + td->td_retval[0] = AIO_ALLDONE; + + return (0); +} + +/* + * aio_error is implemented in the kernel level for compatibility purposes + * only. For a user mode async implementation, it would be best to do it in + * a userland subroutine. + */ +static int +kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) +{ + struct proc *p = td->td_proc; + struct kaiocb *job; + struct kaioinfo *ki; + int status; + + ki = p->p_aioinfo; + if (ki == NULL) { + td->td_retval[0] = EINVAL; + return (0); + } + + AIO_LOCK(ki); + TAILQ_FOREACH(job, &ki->kaio_all, allist) { + if (job->ujob == ujob) { + if (job->jobflags & KAIOCB_FINISHED) + td->td_retval[0] = + job->uaiocb._aiocb_private.error; + else + td->td_retval[0] = EINPROGRESS; + AIO_UNLOCK(ki); + return (0); + } + } + AIO_UNLOCK(ki); + + /* + * Hack for failure of aio_aqueue. 
+ */ + status = ops->fetch_status(ujob); + if (status == -1) { + td->td_retval[0] = ops->fetch_error(ujob); + return (0); + } + + td->td_retval[0] = EINVAL; + return (0); +} + +int +sys_aio_error(struct thread *td, struct aio_error_args *uap) +{ + + return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); +} + +/* syscall - asynchronous read from a file (REALTIME) */ +#ifdef COMPAT_FREEBSD6 +int +freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, + &aiocb_ops_osigevent)); +} +#endif + +int +sys_aio_read(struct thread *td, struct aio_read_args *uap) +{ + + return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); +} + +/* syscall - asynchronous write to a file (REALTIME) */ +#ifdef COMPAT_FREEBSD6 +int +freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, + &aiocb_ops_osigevent)); +} +#endif + +int +sys_aio_write(struct thread *td, struct aio_write_args *uap) +{ + + return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); +} + +int +sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) +{ + + return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); +} + +static int +kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, + struct aiocb **acb_list, int nent, struct sigevent *sig, + struct aiocb_ops *ops) +{ + struct proc *p = td->td_proc; + struct aiocb *job; + struct kaioinfo *ki; + struct aioliojob *lj; + struct kevent kev; + int error; + int nagain, nerror; + int i; + + if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) + return (EINVAL); + + if (nent < 0 || nent > max_aio_queue_per_proc) + return (EINVAL); + + if (p->p_aioinfo == NULL) + aio_init_aioinfo(p); + + ki = p->p_aioinfo; + + lj = uma_zalloc(aiolio_zone, M_WAITOK); + lj->lioj_flags = 0; + lj->lioj_count = 0; + lj->lioj_finished_count = 0; + knlist_init_mtx(&lj->klist, AIO_MTX(ki)); + ksiginfo_init(&lj->lioj_ksi); + + /* + * Setup signal. + */ + if (sig && (mode == LIO_NOWAIT)) { + bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); + if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { + /* Assume only new style KEVENT */ + memset(&kev, 0, sizeof(kev)); + kev.filter = EVFILT_LIO; + kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; + kev.ident = (uintptr_t)uacb_list; /* something unique */ + kev.data = (intptr_t)lj; + /* pass user defined sigval data */ + kev.udata = lj->lioj_signal.sigev_value.sival_ptr; + error = kqfd_register( + lj->lioj_signal.sigev_notify_kqueue, &kev, td, + M_WAITOK); + if (error) { + uma_zfree(aiolio_zone, lj); + return (error); + } + } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { + ; + } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || + lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { + if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { + uma_zfree(aiolio_zone, lj); + return EINVAL; + } + lj->lioj_flags |= LIOJ_SIGNAL; + } else { + uma_zfree(aiolio_zone, lj); + return EINVAL; + } + } + + AIO_LOCK(ki); + TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); + /* + * Add extra aiocb count to avoid the lio to be freed + * by other threads doing aio_waitcomplete or aio_return, + * and prevent event from being sent until we have queued + * all tasks. + */ + lj->lioj_count = 1; + AIO_UNLOCK(ki); + + /* + * Get pointers to the list of I/O requests. 
+ */ + nagain = 0; + nerror = 0; + for (i = 0; i < nent; i++) { + job = acb_list[i]; + if (job != NULL) { + error = aio_aqueue(td, job, lj, LIO_NOP, ops); + if (error == EAGAIN) + nagain++; + else if (error != 0) + nerror++; + } + } + + error = 0; + AIO_LOCK(ki); + if (mode == LIO_WAIT) { + while (lj->lioj_count - 1 != lj->lioj_finished_count) { + ki->kaio_flags |= KAIO_WAKEUP; + error = msleep(&p->p_aioinfo, AIO_MTX(ki), + PRIBIO | PCATCH, "aiospn", 0); + if (error == ERESTART) + error = EINTR; + if (error) + break; + } + } else { + if (lj->lioj_count - 1 == lj->lioj_finished_count) { + if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { + lj->lioj_flags |= LIOJ_KEVENT_POSTED; + KNOTE_LOCKED(&lj->klist, 1); + } + if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) + == LIOJ_SIGNAL + && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || + lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { + aio_sendsig(p, &lj->lioj_signal, + &lj->lioj_ksi); + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + } + } + } + lj->lioj_count--; + if (lj->lioj_count == 0) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + knlist_delete(&lj->klist, curthread, 1); + PROC_LOCK(p); + sigqueue_take(&lj->lioj_ksi); + PROC_UNLOCK(p); + AIO_UNLOCK(ki); + uma_zfree(aiolio_zone, lj); + } else + AIO_UNLOCK(ki); + + if (nerror) + return (EIO); + else if (nagain) + return (EAGAIN); + else + return (error); +} + +/* syscall - list directed I/O (REALTIME) */ +#ifdef COMPAT_FREEBSD6 +int +freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap) +{ + struct aiocb **acb_list; + struct sigevent *sigp, sig; + struct osigevent osig; + int error, nent; + + if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) + return (EINVAL); + + nent = uap->nent; + if (nent < 0 || nent > max_aio_queue_per_proc) + return (EINVAL); + + if (uap->sig && (uap->mode == LIO_NOWAIT)) { + error = copyin(uap->sig, &osig, sizeof(osig)); + if (error) + return (error); + error = convert_old_sigevent(&osig, &sig); + if (error) + return (error); + sigp = &sig; + } else + sigp = NULL; + + acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); + error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); + if (error == 0) + error = kern_lio_listio(td, uap->mode, + (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, + &aiocb_ops_osigevent); + free(acb_list, M_LIO); + return (error); +} +#endif + +/* syscall - list directed I/O (REALTIME) */ +int +sys_lio_listio(struct thread *td, struct lio_listio_args *uap) +{ + struct aiocb **acb_list; + struct sigevent *sigp, sig; + int error, nent; + + if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) + return (EINVAL); + + nent = uap->nent; + if (nent < 0 || nent > max_aio_queue_per_proc) + return (EINVAL); + + if (uap->sig && (uap->mode == LIO_NOWAIT)) { + error = copyin(uap->sig, &sig, sizeof(sig)); + if (error) + return (error); + sigp = &sig; + } else + sigp = NULL; + + acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); + error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); + if (error == 0) + error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, + nent, sigp, &aiocb_ops); + free(acb_list, M_LIO); + return (error); +} + +static void +aio_biowakeup(struct bio *bp) +{ + struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; + struct proc *userp; + struct kaioinfo *ki; + size_t nbytes; + int error, nblks; + + /* Release mapping into kernel space. 
*/ + userp = job->userproc; + ki = userp->p_aioinfo; + if (job->pbuf) { + pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages); + relpbuf(job->pbuf, NULL); + job->pbuf = NULL; + atomic_subtract_int(&num_buf_aio, 1); + AIO_LOCK(ki); + ki->kaio_buffer_count--; + AIO_UNLOCK(ki); + } else + atomic_subtract_int(&num_unmapped_aio, 1); + vm_page_unhold_pages(job->pages, job->npages); + + bp = job->bp; + job->bp = NULL; + nbytes = job->uaiocb.aio_nbytes - bp->bio_resid; + error = 0; + if (bp->bio_flags & BIO_ERROR) + error = bp->bio_error; + nblks = btodb(nbytes); + if (job->uaiocb.aio_lio_opcode == LIO_WRITE) + job->outblock += nblks; + else + job->inblock += nblks; + + if (error) + aio_complete(job, -1, error); + else + aio_complete(job, nbytes, 0); + + g_destroy_bio(bp); +} + +/* syscall - wait for the next completion of an aio request */ +static int +kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, + struct timespec *ts, struct aiocb_ops *ops) +{ + struct proc *p = td->td_proc; + struct timeval atv; + struct kaioinfo *ki; + struct kaiocb *job; + struct aiocb *ujob; + long error, status; + int timo; + + ops->store_aiocb(ujobp, NULL); + + if (ts == NULL) { + timo = 0; + } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) { + timo = -1; + } else { + if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) + return (EINVAL); + + TIMESPEC_TO_TIMEVAL(&atv, ts); + if (itimerfix(&atv)) + return (EINVAL); + timo = tvtohz(&atv); + } + + if (p->p_aioinfo == NULL) + aio_init_aioinfo(p); + ki = p->p_aioinfo; + + error = 0; + job = NULL; + AIO_LOCK(ki); + while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) { + if (timo == -1) { + error = EWOULDBLOCK; + break; + } + ki->kaio_flags |= KAIO_WAKEUP; + error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, + "aiowc", timo); + if (timo && error == ERESTART) + error = EINTR; + if (error) + break; + } + + if (job != NULL) { + MPASS(job->jobflags & KAIOCB_FINISHED); + ujob = job->ujob; + status = job->uaiocb._aiocb_private.status; + error = job->uaiocb._aiocb_private.error; + td->td_retval[0] = status; + td->td_ru.ru_oublock += job->outblock; + td->td_ru.ru_inblock += job->inblock; + td->td_ru.ru_msgsnd += job->msgsnd; + td->td_ru.ru_msgrcv += job->msgrcv; + aio_free_entry(job); + AIO_UNLOCK(ki); + ops->store_aiocb(ujobp, ujob); + ops->store_error(ujob, error); + ops->store_status(ujob, status); + } else + AIO_UNLOCK(ki); + + return (error); +} + +int +sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) +{ + struct timespec ts, *tsp; + int error; + + if (uap->timeout) { + /* Get timespec struct. */ + error = copyin(uap->timeout, &ts, sizeof(ts)); + if (error) + return (error); + tsp = &ts; + } else + tsp = NULL; + + return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); +} + +static int +kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob, + struct aiocb_ops *ops) +{ + + if (op != O_SYNC) /* XXX lack of O_DSYNC */ + return (EINVAL); + return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops)); +} + +int +sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) +{ + + return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); +} + +/* kqueue attach function */ +static int +filt_aioattach(struct knote *kn) +{ + struct kaiocb *job; + + job = (struct kaiocb *)(uintptr_t)kn->kn_sdata; + + /* + * The job pointer must be validated before using it, so + * registration is restricted to the kernel; the user cannot + * set EV_FLAG1. 
+ */ + if ((kn->kn_flags & EV_FLAG1) == 0) + return (EPERM); + kn->kn_ptr.p_aio = job; + kn->kn_flags &= ~EV_FLAG1; + + knlist_add(&job->klist, kn, 0); + + return (0); +} + +/* kqueue detach function */ +static void +filt_aiodetach(struct knote *kn) +{ + struct knlist *knl; + + knl = &kn->kn_ptr.p_aio->klist; + knl->kl_lock(knl->kl_lockarg); + if (!knlist_empty(knl)) + knlist_remove(knl, kn, 1); + knl->kl_unlock(knl->kl_lockarg); +} + +/* kqueue filter function */ +/*ARGSUSED*/ +static int +filt_aio(struct knote *kn, long hint) +{ + struct kaiocb *job = kn->kn_ptr.p_aio; + + kn->kn_data = job->uaiocb._aiocb_private.error; + if (!(job->jobflags & KAIOCB_FINISHED)) + return (0); + kn->kn_flags |= EV_EOF; + return (1); +} + +/* kqueue attach function */ +static int +filt_lioattach(struct knote *kn) +{ + struct aioliojob *lj; + + lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata; + + /* + * The aioliojob pointer must be validated before using it, so + * registration is restricted to the kernel; the user cannot + * set EV_FLAG1. + */ + if ((kn->kn_flags & EV_FLAG1) == 0) + return (EPERM); + kn->kn_ptr.p_lio = lj; + kn->kn_flags &= ~EV_FLAG1; + + knlist_add(&lj->klist, kn, 0); + + return (0); +} + +/* kqueue detach function */ +static void +filt_liodetach(struct knote *kn) +{ + struct knlist *knl; + + knl = &kn->kn_ptr.p_lio->klist; + knl->kl_lock(knl->kl_lockarg); + if (!knlist_empty(knl)) + knlist_remove(knl, kn, 1); + knl->kl_unlock(knl->kl_lockarg); +} + +/* kqueue filter function */ +/*ARGSUSED*/ +static int +filt_lio(struct knote *kn, long hint) +{ + struct aioliojob * lj = kn->kn_ptr.p_lio; + + return (lj->lioj_flags & LIOJ_KEVENT_POSTED); +} + +#ifdef COMPAT_FREEBSD32 +#include +#include +#include +#include +#include +#include +#include + +struct __aiocb_private32 { + int32_t status; + int32_t error; + uint32_t kernelinfo; +}; + +#ifdef COMPAT_FREEBSD6 +typedef struct oaiocb32 { + int aio_fildes; /* File descriptor */ + uint64_t aio_offset __packed; /* File offset for I/O */ + uint32_t aio_buf; /* I/O buffer in process space */ + uint32_t aio_nbytes; /* Number of bytes for I/O */ + struct osigevent32 aio_sigevent; /* Signal to deliver */ + int aio_lio_opcode; /* LIO opcode */ + int aio_reqprio; /* Request priority -- ignored */ + struct __aiocb_private32 _aiocb_private; +} oaiocb32_t; +#endif + +typedef struct aiocb32 { + int32_t aio_fildes; /* File descriptor */ + uint64_t aio_offset __packed; /* File offset for I/O */ + uint32_t aio_buf; /* I/O buffer in process space */ + uint32_t aio_nbytes; /* Number of bytes for I/O */ + int __spare__[2]; + uint32_t __spare2__; + int aio_lio_opcode; /* LIO opcode */ + int aio_reqprio; /* Request priority -- ignored */ + struct __aiocb_private32 _aiocb_private; + struct sigevent32 aio_sigevent; /* Signal to deliver */ +} aiocb32_t; + +#ifdef COMPAT_FREEBSD6 +static int +convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) +{ + + /* + * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are + * supported by AIO with the old sigevent structure. 
+ */ + CP(*osig, *nsig, sigev_notify); + switch (nsig->sigev_notify) { + case SIGEV_NONE: + break; + case SIGEV_SIGNAL: + nsig->sigev_signo = osig->__sigev_u.__sigev_signo; + break; + case SIGEV_KEVENT: + nsig->sigev_notify_kqueue = + osig->__sigev_u.__sigev_notify_kqueue; + PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); + break; + default: + return (EINVAL); + } + return (0); +} + +static int +aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) +{ + struct oaiocb32 job32; + int error; + + bzero(kjob, sizeof(struct aiocb)); + error = copyin(ujob, &job32, sizeof(job32)); + if (error) + return (error); + + CP(job32, *kjob, aio_fildes); + CP(job32, *kjob, aio_offset); + PTRIN_CP(job32, *kjob, aio_buf); + CP(job32, *kjob, aio_nbytes); + CP(job32, *kjob, aio_lio_opcode); + CP(job32, *kjob, aio_reqprio); + CP(job32, *kjob, _aiocb_private.status); + CP(job32, *kjob, _aiocb_private.error); + PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); + return (convert_old_sigevent32(&job32.aio_sigevent, + &kjob->aio_sigevent)); +} +#endif + +static int +aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob) +{ + struct aiocb32 job32; + int error; + + error = copyin(ujob, &job32, sizeof(job32)); + if (error) + return (error); + CP(job32, *kjob, aio_fildes); + CP(job32, *kjob, aio_offset); + PTRIN_CP(job32, *kjob, aio_buf); + CP(job32, *kjob, aio_nbytes); + CP(job32, *kjob, aio_lio_opcode); + CP(job32, *kjob, aio_reqprio); + CP(job32, *kjob, _aiocb_private.status); + CP(job32, *kjob, _aiocb_private.error); + PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); + return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); +} + +static long +aiocb32_fetch_status(struct aiocb *ujob) +{ + struct aiocb32 *ujob32; + + ujob32 = (struct aiocb32 *)ujob; + return (fuword32(&ujob32->_aiocb_private.status)); +} + +static long +aiocb32_fetch_error(struct aiocb *ujob) +{ + struct aiocb32 *ujob32; + + ujob32 = (struct aiocb32 *)ujob; + return (fuword32(&ujob32->_aiocb_private.error)); +} + +static int +aiocb32_store_status(struct aiocb *ujob, long status) +{ + struct aiocb32 *ujob32; + + ujob32 = (struct aiocb32 *)ujob; + return (suword32(&ujob32->_aiocb_private.status, status)); +} + +static int +aiocb32_store_error(struct aiocb *ujob, long error) +{ + struct aiocb32 *ujob32; + + ujob32 = (struct aiocb32 *)ujob; + return (suword32(&ujob32->_aiocb_private.error, error)); +} + +static int +aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) +{ + struct aiocb32 *ujob32; + + ujob32 = (struct aiocb32 *)ujob; + return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); +} + +static int +aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) +{ + + return (suword32(ujobp, (long)ujob)); +} + +static struct aiocb_ops aiocb32_ops = { + .copyin = aiocb32_copyin, + .fetch_status = aiocb32_fetch_status, + .fetch_error = aiocb32_fetch_error, + .store_status = aiocb32_store_status, + .store_error = aiocb32_store_error, + .store_kernelinfo = aiocb32_store_kernelinfo, + .store_aiocb = aiocb32_store_aiocb, +}; + +#ifdef COMPAT_FREEBSD6 +static struct aiocb_ops aiocb32_ops_osigevent = { + .copyin = aiocb32_copyin_old_sigevent, + .fetch_status = aiocb32_fetch_status, + .fetch_error = aiocb32_fetch_error, + .store_status = aiocb32_store_status, + .store_error = aiocb32_store_error, + .store_kernelinfo = aiocb32_store_kernelinfo, + .store_aiocb = aiocb32_store_aiocb, +}; +#endif + +int +freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) +{ + + return (kern_aio_return(td, 
(struct aiocb *)uap->aiocbp, &aiocb32_ops)); +} + +int +freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) +{ + struct timespec32 ts32; + struct timespec ts, *tsp; + struct aiocb **ujoblist; + uint32_t *ujoblist32; + int error, i; + + if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) + return (EINVAL); + + if (uap->timeout) { + /* Get timespec struct. */ + if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) + return (error); + CP(ts32, ts, tv_sec); + CP(ts32, ts, tv_nsec); + tsp = &ts; + } else + tsp = NULL; + + ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); + ujoblist32 = (uint32_t *)ujoblist; + error = copyin(uap->aiocbp, ujoblist32, uap->nent * + sizeof(ujoblist32[0])); + if (error == 0) { + for (i = uap->nent - 1; i >= 0; i--) + ujoblist[i] = PTRIN(ujoblist32[i]); + + error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); + } + free(ujoblist, M_AIOS); + return (error); +} + +int +freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) +{ + + return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); +} + +#ifdef COMPAT_FREEBSD6 +int +freebsd6_freebsd32_aio_read(struct thread *td, + struct freebsd6_freebsd32_aio_read_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, + &aiocb32_ops_osigevent)); +} +#endif + +int +freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, + &aiocb32_ops)); +} + +#ifdef COMPAT_FREEBSD6 +int +freebsd6_freebsd32_aio_write(struct thread *td, + struct freebsd6_freebsd32_aio_write_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, + &aiocb32_ops_osigevent)); +} +#endif + +int +freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, + &aiocb32_ops)); +} + +int +freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, + &aiocb32_ops)); +} + +int +freebsd32_aio_waitcomplete(struct thread *td, + struct freebsd32_aio_waitcomplete_args *uap) +{ + struct timespec32 ts32; + struct timespec ts, *tsp; + int error; + + if (uap->timeout) { + /* Get timespec struct. 
*/ + error = copyin(uap->timeout, &ts32, sizeof(ts32)); + if (error) + return (error); + CP(ts32, ts, tv_sec); + CP(ts32, ts, tv_nsec); + tsp = &ts; + } else + tsp = NULL; + + return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, + &aiocb32_ops)); +} + +int +freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) +{ + + return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, + &aiocb32_ops)); +} + +#ifdef COMPAT_FREEBSD6 +int +freebsd6_freebsd32_lio_listio(struct thread *td, + struct freebsd6_freebsd32_lio_listio_args *uap) +{ + struct aiocb **acb_list; + struct sigevent *sigp, sig; + struct osigevent32 osig; + uint32_t *acb_list32; + int error, i, nent; + + if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) + return (EINVAL); + + nent = uap->nent; + if (nent < 0 || nent > max_aio_queue_per_proc) + return (EINVAL); + + if (uap->sig && (uap->mode == LIO_NOWAIT)) { + error = copyin(uap->sig, &osig, sizeof(osig)); + if (error) + return (error); + error = convert_old_sigevent32(&osig, &sig); + if (error) + return (error); + sigp = &sig; + } else + sigp = NULL; + + acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); + error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); + if (error) { + free(acb_list32, M_LIO); + return (error); + } + acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); + for (i = 0; i < nent; i++) + acb_list[i] = PTRIN(acb_list32[i]); + free(acb_list32, M_LIO); + + error = kern_lio_listio(td, uap->mode, + (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, + &aiocb32_ops_osigevent); + free(acb_list, M_LIO); + return (error); +} +#endif + +int +freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) +{ + struct aiocb **acb_list; + struct sigevent *sigp, sig; + struct sigevent32 sig32; + uint32_t *acb_list32; + int error, i, nent; + + if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) + return (EINVAL); + + nent = uap->nent; + if (nent < 0 || nent > max_aio_queue_per_proc) + return (EINVAL); + + if (uap->sig && (uap->mode == LIO_NOWAIT)) { + error = copyin(uap->sig, &sig32, sizeof(sig32)); + if (error) + return (error); + error = convert_sigevent32(&sig32, &sig); + if (error) + return (error); + sigp = &sig; + } else + sigp = NULL; + + acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); + error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); + if (error) { + free(acb_list32, M_LIO); + return (error); + } + acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); + for (i = 0; i < nent; i++) + acb_list[i] = PTRIN(acb_list32[i]); + free(acb_list32, M_LIO); + + error = kern_lio_listio(td, uap->mode, + (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, + &aiocb32_ops); + free(acb_list, M_LIO); + return (error); +} + +#endif diff --git a/freebsd/sys/kern/vfs_bio.c b/freebsd/sys/kern/vfs_bio.c new file mode 100644 index 00000000..2277bf67 --- /dev/null +++ b/freebsd/sys/kern/vfs_bio.c @@ -0,0 +1,5474 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2004 Poul-Henning Kamp + * Copyright (c) 1994,1997 John S. Dyson + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * this file contains a new buffer I/O scheme implementing a coherent + * VM object and buffer cache scheme. Pains have been taken to make + * sure that the performance degradation associated with schemes such + * as this is not realized. + * + * Author: John S. Dyson + * Significant help during the development and debugging phases + * had been provided by David Greenman, also of the FreeBSD core team. + * + * see man buf(9) for more info. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "opt_swap.h" + +static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); + +struct bio_ops bioops; /* I/O operation notification */ + +struct buf_ops buf_ops_bio = { + .bop_name = "buf_ops_bio", + .bop_write = bufwrite, + .bop_strategy = bufstrategy, + .bop_sync = bufsync, + .bop_bdflush = bufbdflush, +}; + +struct bufqueue { + struct mtx_padalign bq_lock; + TAILQ_HEAD(, buf) bq_queue; + uint8_t bq_index; + uint16_t bq_subqueue; + int bq_len; +} __aligned(CACHE_LINE_SIZE); + +#define BQ_LOCKPTR(bq) (&(bq)->bq_lock) +#define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) +#define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) +#define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) + +struct bufdomain { + struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */ + struct bufqueue bd_dirtyq; + struct bufqueue *bd_cleanq; + struct mtx_padalign bd_run_lock; + /* Constants */ + long bd_maxbufspace; + long bd_hibufspace; + long bd_lobufspace; + long bd_bufspacethresh; + int bd_hifreebuffers; + int bd_lofreebuffers; + int bd_hidirtybuffers; + int bd_lodirtybuffers; + int bd_dirtybufthresh; + int bd_lim; + /* atomics */ + int bd_wanted; + int __aligned(CACHE_LINE_SIZE) bd_numdirtybuffers; + int __aligned(CACHE_LINE_SIZE) bd_running; + long __aligned(CACHE_LINE_SIZE) bd_bufspace; + int __aligned(CACHE_LINE_SIZE) bd_freebuffers; +} __aligned(CACHE_LINE_SIZE); + +#define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock) +#define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) +#define BD_UNLOCK(bd) 
mtx_unlock(BD_LOCKPTR((bd))) +#define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) +#define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock) +#define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd))) +#define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) +#define BD_DOMAIN(bd) (bd - bdomain) + +static struct buf *buf; /* buffer header pool */ +extern struct buf *swbuf; /* Swap buffer header pool. */ +caddr_t __read_mostly unmapped_buf; + +/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ +struct proc *bufdaemonproc; + +static int inmem(struct vnode *vp, daddr_t blkno); +static void vm_hold_free_pages(struct buf *bp, int newbsize); +static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, + vm_offset_t to); +static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); +static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, + vm_page_t m); +static void vfs_clean_pages_dirty_buf(struct buf *bp); +static void vfs_setdirty_locked_object(struct buf *bp); +static void vfs_vmio_invalidate(struct buf *bp); +static void vfs_vmio_truncate(struct buf *bp, int npages); +static void vfs_vmio_extend(struct buf *bp, int npages, int size); +static int vfs_bio_clcheck(struct vnode *vp, int size, + daddr_t lblkno, daddr_t blkno); +static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, + void (*)(struct buf *)); +static int buf_flush(struct vnode *vp, struct bufdomain *, int); +static int flushbufqueues(struct vnode *, struct bufdomain *, int, int); +static void buf_daemon(void); +static __inline void bd_wakeup(void); +static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); +static void bufkva_reclaim(vmem_t *, int); +static void bufkva_free(struct buf *); +static int buf_import(void *, void **, int, int, int); +static void buf_release(void *, void **, int); +static void maxbcachebuf_adjust(void); +static inline struct bufdomain *bufdomain(struct buf *); +static void bq_remove(struct bufqueue *bq, struct buf *bp); +static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); +static int buf_recycle(struct bufdomain *, bool kva); +static void bq_init(struct bufqueue *bq, int qindex, int cpu, + const char *lockname); +static void bd_init(struct bufdomain *bd); +static int bd_flushall(struct bufdomain *bd); +static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS); +static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS); + +static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); +int vmiodirenable = TRUE; +SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, + "Use the VM system for directory writes"); +long runningbufspace; +SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, + "Amount of presently outstanding async buffer io"); +SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, + NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers"); +static counter_u64_t bufkvaspace; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, + "Kernel virtual memory used for buffers"); +static long maxbufspace; +SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace, + __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L", + "Maximum allowed value of bufspace (including metadata)"); +static long bufmallocspace; +SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, + "Amount of malloced memory for buffers"); +static long maxbufmallocspace; 
+SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, + 0, "Maximum amount of malloced memory for buffers"); +static long lobufspace; +SYSCTL_PROC(_vfs, OID_AUTO, lobufspace, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace, + __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L", + "Minimum amount of buffers we want to have"); +long hibufspace; +SYSCTL_PROC(_vfs, OID_AUTO, hibufspace, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace, + __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L", + "Maximum allowed value of bufspace (excluding metadata)"); +long bufspacethresh; +SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh, + CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh, + __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L", + "Bufspace consumed before waking the daemon to free some"); +static counter_u64_t buffreekvacnt; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, + "Number of times we have freed the KVA space from some buffer"); +static counter_u64_t bufdefragcnt; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, + "Number of times we have had to repeat buffer allocation to defragment"); +static long lorunningspace; +SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | + CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", + "Minimum preferred space used for in-progress I/O"); +static long hirunningspace; +SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | + CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", + "Maximum amount of space to use for in-progress I/O"); +int dirtybufferflushes; +SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, + 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); +int bdwriteskip; +SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, + 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); +int altbufferflushes; +SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, + 0, "Number of fsync flushes to limit dirty buffers"); +static int recursiveflushes; +SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, + 0, "Number of flushes skipped due to being recursive"); +static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers, + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I", + "Number of buffers that are dirty (has unwritten changes) at the moment"); +static int lodirtybuffers; +SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers, + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers, + __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I", + "How many buffers we want to have free before bufdaemon can sleep"); +static int hidirtybuffers; +SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers, + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers, + __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I", + "When the number of dirty buffers is considered severe"); +int dirtybufthresh; +SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh, + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh, + __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I", + "Number of bdwrite to bawrite conversions to clear dirty buffers"); +static int numfreebuffers; +SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 
0, + "Number of free buffers"); +static int lofreebuffers; +SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers, + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers, + __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I", + "Target number of free buffers"); +static int hifreebuffers; +SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers, + CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers, + __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I", + "Threshold for clean buffer recycling"); +static counter_u64_t getnewbufcalls; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, + &getnewbufcalls, "Number of calls to getnewbuf"); +static counter_u64_t getnewbufrestarts; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, + &getnewbufrestarts, + "Number of times getnewbuf has had to restart a buffer acquisition"); +static counter_u64_t mappingrestarts; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD, + &mappingrestarts, + "Number of times getblk has had to restart a buffer mapping for " + "unmapped buffer"); +static counter_u64_t numbufallocfails; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, + &numbufallocfails, "Number of times buffer allocations failed"); +static int flushbufqtarget = 100; +SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, + "Amount of work to do in flushbufqueues when helping bufdaemon"); +static counter_u64_t notbufdflushes; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, + "Number of dirty buffer flushes done by the bufdaemon helpers"); +static long barrierwrites; +SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, + "Number of barrier writes"); +SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, + &unmapped_buf_allowed, 0, + "Permit the use of the unmapped i/o"); +int maxbcachebuf = MAXBCACHEBUF; +SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, + "Maximum size of a buffer cache block"); + +/* + * This lock synchronizes access to bd_request. + */ +static struct mtx_padalign __exclusive_cache_line bdlock; + +/* + * This lock protects the runningbufreq and synchronizes runningbufwakeup and + * waitrunningbufspace(). + */ +static struct mtx_padalign __exclusive_cache_line rbreqlock; + +/* + * Lock that protects bdirtywait. + */ +static struct mtx_padalign __exclusive_cache_line bdirtylock; + +/* + * Wakeup point for bufdaemon, as well as indicator of whether it is already + * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it + * is idling. + */ +static int bd_request; + +/* + * Request for the buf daemon to write more buffers than is indicated by + * lodirtybuf. This may be necessary to push out excess dependencies or + * defragment the address space where a simple count of the number of dirty + * buffers is insufficient to characterize the demand for flushing them. + */ +static int bd_speedupreq; + +/* + * Synchronization (sleep/wakeup) variable for active buffer space requests. + * Set when wait starts, cleared prior to wakeup(). + * Used in runningbufwakeup() and waitrunningbufspace(). + */ +static int runningbufreq; + +/* + * Synchronization for bwillwrite() waiters. + */ +static int bdirtywait; + +/* + * Definitions for the buffer free lists. 
+ */ +#define QUEUE_NONE 0 /* on no queue */ +#define QUEUE_EMPTY 1 /* empty buffer headers */ +#define QUEUE_DIRTY 2 /* B_DELWRI buffers */ +#define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ +#define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */ + +/* Maximum number of buffer domains. */ +#define BUF_DOMAINS 8 + +struct bufdomainset bdlodirty; /* Domains > lodirty */ +struct bufdomainset bdhidirty; /* Domains > hidirty */ + +/* Configured number of clean queues. */ +static int __read_mostly buf_domains; + +BITSET_DEFINE(bufdomainset, BUF_DOMAINS); +struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS]; +struct bufqueue __exclusive_cache_line bqempty; + +/* + * per-cpu empty buffer cache. + */ +uma_zone_t buf_zone; + +/* + * Single global constant for BUF_WMESG, to avoid getting multiple references. + * buf_wmesg is referred from macros. + */ +const char *buf_wmesg = BUF_WMESG; + +static int +sysctl_runningspace(SYSCTL_HANDLER_ARGS) +{ + long value; + int error; + + value = *(long *)arg1; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + mtx_lock(&rbreqlock); + if (arg1 == &hirunningspace) { + if (value < lorunningspace) + error = EINVAL; + else + hirunningspace = value; + } else { + KASSERT(arg1 == &lorunningspace, + ("%s: unknown arg1", __func__)); + if (value > hirunningspace) + error = EINVAL; + else + lorunningspace = value; + } + mtx_unlock(&rbreqlock); + return (error); +} + +static int +sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS) +{ + int error; + int value; + int i; + + value = *(int *)arg1; + error = sysctl_handle_int(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + *(int *)arg1 = value; + for (i = 0; i < buf_domains; i++) + *(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = + value / buf_domains; + + return (error); +} + +static int +sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS) +{ + long value; + int error; + int i; + + value = *(long *)arg1; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + *(long *)arg1 = value; + for (i = 0; i < buf_domains; i++) + *(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = + value / buf_domains; + + return (error); +} + +#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) +static int +sysctl_bufspace(SYSCTL_HANDLER_ARGS) +{ + long lvalue; + int ivalue; + int i; + + lvalue = 0; + for (i = 0; i < buf_domains; i++) + lvalue += bdomain[i].bd_bufspace; + if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) + return (sysctl_handle_long(oidp, &lvalue, 0, req)); + if (lvalue > INT_MAX) + /* On overflow, still write out a long to trigger ENOMEM. */ + return (sysctl_handle_long(oidp, &lvalue, 0, req)); + ivalue = lvalue; + return (sysctl_handle_int(oidp, &ivalue, 0, req)); +} +#else +static int +sysctl_bufspace(SYSCTL_HANDLER_ARGS) +{ + long lvalue; + int i; + + lvalue = 0; + for (i = 0; i < buf_domains; i++) + lvalue += bdomain[i].bd_bufspace; + return (sysctl_handle_long(oidp, &lvalue, 0, req)); +} +#endif + +static int +sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS) +{ + int value; + int i; + + value = 0; + for (i = 0; i < buf_domains; i++) + value += bdomain[i].bd_numdirtybuffers; + return (sysctl_handle_int(oidp, &value, 0, req)); +} + +/* + * bdirtywakeup: + * + * Wakeup any bwillwrite() waiters. 
+ */ +static void +bdirtywakeup(void) +{ + mtx_lock(&bdirtylock); + if (bdirtywait) { + bdirtywait = 0; + wakeup(&bdirtywait); + } + mtx_unlock(&bdirtylock); +} + +/* + * bd_clear: + * + * Clear a domain from the appropriate bitsets when dirtybuffers + * is decremented. + */ +static void +bd_clear(struct bufdomain *bd) +{ + + mtx_lock(&bdirtylock); + if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers) + BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); + if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers) + BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); + mtx_unlock(&bdirtylock); +} + +/* + * bd_set: + * + * Set a domain in the appropriate bitsets when dirtybuffers + * is incremented. + */ +static void +bd_set(struct bufdomain *bd) +{ + + mtx_lock(&bdirtylock); + if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers) + BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); + if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers) + BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); + mtx_unlock(&bdirtylock); +} + +/* + * bdirtysub: + * + * Decrement the numdirtybuffers count by one and wakeup any + * threads blocked in bwillwrite(). + */ +static void +bdirtysub(struct buf *bp) +{ + struct bufdomain *bd; + int num; + + bd = bufdomain(bp); + num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1); + if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) + bdirtywakeup(); + if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) + bd_clear(bd); +} + +/* + * bdirtyadd: + * + * Increment the numdirtybuffers count by one and wakeup the buf + * daemon if needed. + */ +static void +bdirtyadd(struct buf *bp) +{ + struct bufdomain *bd; + int num; + + /* + * Only do the wakeup once as we cross the boundary. The + * buf daemon will keep running until the condition clears. + */ + bd = bufdomain(bp); + num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1); + if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) + bd_wakeup(); + if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) + bd_set(bd); +} + +/* + * bufspace_daemon_wakeup: + * + * Wakeup the daemons responsible for freeing clean bufs. + */ +static void +bufspace_daemon_wakeup(struct bufdomain *bd) +{ + + /* + * avoid the lock if the daemon is running. + */ + if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) { + BD_RUN_LOCK(bd); + atomic_store_int(&bd->bd_running, 1); + wakeup(&bd->bd_running); + BD_RUN_UNLOCK(bd); + } +} + +/* + * bufspace_daemon_wait: + * + * Sleep until the domain falls below a limit or one second passes. + */ +static void +bufspace_daemon_wait(struct bufdomain *bd) +{ + /* + * Re-check our limits and sleep. bd_running must be + * cleared prior to checking the limits to avoid missed + * wakeups. The waker will adjust one of bufspace or + * freebuffers prior to checking bd_running. + */ + BD_RUN_LOCK(bd); + atomic_store_int(&bd->bd_running, 0); + if (bd->bd_bufspace < bd->bd_bufspacethresh && + bd->bd_freebuffers > bd->bd_lofreebuffers) { + msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP, + "-", hz); + } else { + /* Avoid spurious wakeups while running. */ + atomic_store_int(&bd->bd_running, 1); + BD_RUN_UNLOCK(bd); + } +} + +/* + * bufspace_adjust: + * + * Adjust the reported bufspace for a KVA managed buffer, possibly + * waking any waiters. 
+ */ +static void +bufspace_adjust(struct buf *bp, int bufsize) +{ + struct bufdomain *bd; + long space; + int diff; + + KASSERT((bp->b_flags & B_MALLOC) == 0, + ("bufspace_adjust: malloc buf %p", bp)); + bd = bufdomain(bp); + diff = bufsize - bp->b_bufsize; + if (diff < 0) { + atomic_subtract_long(&bd->bd_bufspace, -diff); + } else if (diff > 0) { + space = atomic_fetchadd_long(&bd->bd_bufspace, diff); + /* Wake up the daemon on the transition. */ + if (space < bd->bd_bufspacethresh && + space + diff >= bd->bd_bufspacethresh) + bufspace_daemon_wakeup(bd); + } + bp->b_bufsize = bufsize; +} + +/* + * bufspace_reserve: + * + * Reserve bufspace before calling allocbuf(). metadata has a + * different space limit than data. + */ +static int +bufspace_reserve(struct bufdomain *bd, int size, bool metadata) +{ + long limit, new; + long space; + + if (metadata) + limit = bd->bd_maxbufspace; + else + limit = bd->bd_hibufspace; + space = atomic_fetchadd_long(&bd->bd_bufspace, size); + new = space + size; + if (new > limit) { + atomic_subtract_long(&bd->bd_bufspace, size); + return (ENOSPC); + } + + /* Wake up the daemon on the transition. */ + if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh) + bufspace_daemon_wakeup(bd); + + return (0); +} + +/* + * bufspace_release: + * + * Release reserved bufspace after bufspace_adjust() has consumed it. + */ +static void +bufspace_release(struct bufdomain *bd, int size) +{ + + atomic_subtract_long(&bd->bd_bufspace, size); +} + +/* + * bufspace_wait: + * + * Wait for bufspace, acting as the buf daemon if a locked vnode is + * supplied. bd_wanted must be set prior to polling for space. The + * operation must be re-tried on return. + */ +static void +bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags, + int slpflag, int slptimeo) +{ + struct thread *td; + int error, fl, norunbuf; + + if ((gbflags & GB_NOWAIT_BD) != 0) + return; + + td = curthread; + BD_LOCK(bd); + while (bd->bd_wanted) { + if (vp != NULL && vp->v_type != VCHR && + (td->td_pflags & TDP_BUFNEED) == 0) { + BD_UNLOCK(bd); + /* + * getblk() is called with a vnode locked, and + * some majority of the dirty buffers may as + * well belong to the vnode. Flushing the + * buffers there would make a progress that + * cannot be achieved by the buf_daemon, that + * cannot lock the vnode. + */ + norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | + (td->td_pflags & TDP_NORUNNINGBUF); + + /* + * Play bufdaemon. The getnewbuf() function + * may be called while the thread owns lock + * for another dirty buffer for the same + * vnode, which makes it impossible to use + * VOP_FSYNC() there, due to the buffer lock + * recursion. + */ + td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; + fl = buf_flush(vp, bd, flushbufqtarget); + td->td_pflags &= norunbuf; + BD_LOCK(bd); + if (fl != 0) + continue; + if (bd->bd_wanted == 0) + break; + } + error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd), + (PRIBIO + 4) | slpflag, "newbuf", slptimeo); + if (error != 0) + break; + } + BD_UNLOCK(bd); +} + + +/* + * bufspace_daemon: + * + * buffer space management daemon. Tries to maintain some marginal + * amount of free buffer space so that requesting processes neither + * block nor work to reclaim buffers. + */ +static void +bufspace_daemon(void *arg) +{ + struct bufdomain *bd; + + EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread, + SHUTDOWN_PRI_LAST + 100); + + bd = arg; + for (;;) { + kthread_suspend_check(); + + /* + * Free buffers from the clean queue until we meet our + * targets. 
+ * + * Theory of operation: The buffer cache is most efficient + * when some free buffer headers and space are always + * available to getnewbuf(). This daemon attempts to prevent + * the excessive blocking and synchronization associated + * with shortfall. It goes through three phases according + * demand: + * + * 1) The daemon wakes up voluntarily once per-second + * during idle periods when the counters are below + * the wakeup thresholds (bufspacethresh, lofreebuffers). + * + * 2) The daemon wakes up as we cross the thresholds + * ahead of any potential blocking. This may bounce + * slightly according to the rate of consumption and + * release. + * + * 3) The daemon and consumers are starved for working + * clean buffers. This is the 'bufspace' sleep below + * which will inefficiently trade bufs with bqrelse + * until we return to condition 2. + */ + while (bd->bd_bufspace > bd->bd_lobufspace || + bd->bd_freebuffers < bd->bd_hifreebuffers) { + if (buf_recycle(bd, false) != 0) { + if (bd_flushall(bd)) + continue; + /* + * Speedup dirty if we've run out of clean + * buffers. This is possible in particular + * because softdep may held many bufs locked + * pending writes to other bufs which are + * marked for delayed write, exhausting + * clean space until they are written. + */ + bd_speedup(); + BD_LOCK(bd); + if (bd->bd_wanted) { + msleep(&bd->bd_wanted, BD_LOCKPTR(bd), + PRIBIO|PDROP, "bufspace", hz/10); + } else + BD_UNLOCK(bd); + } + maybe_yield(); + } + bufspace_daemon_wait(bd); + } +} + +/* + * bufmallocadjust: + * + * Adjust the reported bufspace for a malloc managed buffer, possibly + * waking any waiters. + */ +static void +bufmallocadjust(struct buf *bp, int bufsize) +{ + int diff; + + KASSERT((bp->b_flags & B_MALLOC) != 0, + ("bufmallocadjust: non-malloc buf %p", bp)); + diff = bufsize - bp->b_bufsize; + if (diff < 0) + atomic_subtract_long(&bufmallocspace, -diff); + else + atomic_add_long(&bufmallocspace, diff); + bp->b_bufsize = bufsize; +} + +/* + * runningwakeup: + * + * Wake up processes that are waiting on asynchronous writes to fall + * below lorunningspace. + */ +static void +runningwakeup(void) +{ + + mtx_lock(&rbreqlock); + if (runningbufreq) { + runningbufreq = 0; + wakeup(&runningbufreq); + } + mtx_unlock(&rbreqlock); +} + +/* + * runningbufwakeup: + * + * Decrement the outstanding write count according. + */ +void +runningbufwakeup(struct buf *bp) +{ + long space, bspace; + + bspace = bp->b_runningbufspace; + if (bspace == 0) + return; + space = atomic_fetchadd_long(&runningbufspace, -bspace); + KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", + space, bspace)); + bp->b_runningbufspace = 0; + /* + * Only acquire the lock and wakeup on the transition from exceeding + * the threshold to falling below it. + */ + if (space < lorunningspace) + return; + if (space - bspace > lorunningspace) + return; + runningwakeup(); +} + +/* + * waitrunningbufspace() + * + * runningbufspace is a measure of the amount of I/O currently + * running. This routine is used in async-write situations to + * prevent creating huge backups of pending writes to a device. + * Only asynchronous writes are governed by this function. + * + * This does NOT turn an async write into a sync write. It waits + * for earlier writes to complete and generally returns before the + * caller's write has reached the device. 
+ */ +void +waitrunningbufspace(void) +{ + + mtx_lock(&rbreqlock); + while (runningbufspace > hirunningspace) { + runningbufreq = 1; + msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); + } + mtx_unlock(&rbreqlock); +} + + +/* + * vfs_buf_test_cache: + * + * Called when a buffer is extended. This function clears the B_CACHE + * bit if the newly extended portion of the buffer does not contain + * valid data. + */ +static __inline void +vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, + vm_offset_t size, vm_page_t m) +{ + + VM_OBJECT_ASSERT_LOCKED(m->object); + if (bp->b_flags & B_CACHE) { + int base = (foff + off) & PAGE_MASK; + if (vm_page_is_valid(m, base, size) == 0) + bp->b_flags &= ~B_CACHE; + } +} + +/* Wake up the buffer daemon if necessary */ +static void +bd_wakeup(void) +{ + + mtx_lock(&bdlock); + if (bd_request == 0) { + bd_request = 1; + wakeup(&bd_request); + } + mtx_unlock(&bdlock); +} + +/* + * Adjust the maxbcachbuf tunable. + */ +static void +maxbcachebuf_adjust(void) +{ + int i; + + /* + * maxbcachebuf must be a power of 2 >= MAXBSIZE. + */ + i = 2; + while (i * 2 <= maxbcachebuf) + i *= 2; + maxbcachebuf = i; + if (maxbcachebuf < MAXBSIZE) + maxbcachebuf = MAXBSIZE; + if (maxbcachebuf > MAXPHYS) + maxbcachebuf = MAXPHYS; + if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF) + printf("maxbcachebuf=%d\n", maxbcachebuf); +} + +/* + * bd_speedup - speedup the buffer cache flushing code + */ +void +bd_speedup(void) +{ + int needwake; + + mtx_lock(&bdlock); + needwake = 0; + if (bd_speedupreq == 0 || bd_request == 0) + needwake = 1; + bd_speedupreq = 1; + bd_request = 1; + if (needwake) + wakeup(&bd_request); + mtx_unlock(&bdlock); +} + +#ifndef NSWBUF_MIN +#define NSWBUF_MIN 16 +#endif + +#ifdef __i386__ +#define TRANSIENT_DENOM 5 +#else +#define TRANSIENT_DENOM 10 +#endif + +/* + * Calculating buffer cache scaling values and reserve space for buffer + * headers. This is called during low level kernel initialization and + * may be called more then once. We CANNOT write to the memory area + * being reserved at this time. + */ +caddr_t +kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) +{ + int tuned_nbuf; + long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; + + /* + * physmem_est is in pages. Convert it to kilobytes (assumes + * PAGE_SIZE is >= 1K) + */ + physmem_est = physmem_est * (PAGE_SIZE / 1024); + + maxbcachebuf_adjust(); + /* + * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. + * For the first 64MB of ram nominally allocate sufficient buffers to + * cover 1/4 of our ram. Beyond the first 64MB allocate additional + * buffers to cover 1/10 of our ram over 64MB. When auto-sizing + * the buffer cache we limit the eventual kva reservation to + * maxbcache bytes. + * + * factor represents the 1/4 x ram conversion. + */ + if (nbuf == 0) { + int factor = 4 * BKVASIZE / 1024; + + nbuf = 50; + if (physmem_est > 4096) + nbuf += min((physmem_est - 4096) / factor, + 65536 / factor); + if (physmem_est > 65536) + nbuf += min((physmem_est - 65536) * 2 / (factor * 5), + 32 * 1024 * 1024 / (factor * 5)); + + if (maxbcache && nbuf > maxbcache / BKVASIZE) + nbuf = maxbcache / BKVASIZE; + tuned_nbuf = 1; + } else + tuned_nbuf = 0; + + /* XXX Avoid unsigned long overflows later on with maxbufspace. 
*/ + maxbuf = (LONG_MAX / 3) / BKVASIZE; + if (nbuf > maxbuf) { + if (!tuned_nbuf) + printf("Warning: nbufs lowered from %d to %ld\n", nbuf, + maxbuf); + nbuf = maxbuf; + } + + /* + * Ideal allocation size for the transient bio submap is 10% + * of the maximal space buffer map. This roughly corresponds + * to the amount of the buffer mapped for typical UFS load. + * + * Clip the buffer map to reserve space for the transient + * BIOs, if its extent is bigger than 90% (80% on i386) of the + * maximum buffer map extent on the platform. + * + * The fall-back to the maxbuf in case of maxbcache unset, + * allows to not trim the buffer KVA for the architectures + * with ample KVA space. + */ + if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { + maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; + buf_sz = (long)nbuf * BKVASIZE; + if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * + (TRANSIENT_DENOM - 1)) { + /* + * There is more KVA than memory. Do not + * adjust buffer map size, and assign the rest + * of maxbuf to transient map. + */ + biotmap_sz = maxbuf_sz - buf_sz; + } else { + /* + * Buffer map spans all KVA we could afford on + * this platform. Give 10% (20% on i386) of + * the buffer map to the transient bio map. + */ + biotmap_sz = buf_sz / TRANSIENT_DENOM; + buf_sz -= biotmap_sz; + } + if (biotmap_sz / INT_MAX > MAXPHYS) + bio_transient_maxcnt = INT_MAX; + else + bio_transient_maxcnt = biotmap_sz / MAXPHYS; + /* + * Artificially limit to 1024 simultaneous in-flight I/Os + * using the transient mapping. + */ + if (bio_transient_maxcnt > 1024) + bio_transient_maxcnt = 1024; + if (tuned_nbuf) + nbuf = buf_sz / BKVASIZE; + } + + /* + * swbufs are used as temporary holders for I/O, such as paging I/O. + * We have no less then 16 and no more then 256. + */ + nswbuf = min(nbuf / 4, 256); + TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf); + if (nswbuf < NSWBUF_MIN) + nswbuf = NSWBUF_MIN; + + /* + * Reserve space for the buffer cache buffers + */ + swbuf = (void *)v; + v = (caddr_t)(swbuf + nswbuf); + buf = (void *)v; + v = (caddr_t)(buf + nbuf); + + return(v); +} + +/* Initialize the buffer subsystem. Called before use of any buffers. */ +void +bufinit(void) +{ + struct buf *bp; + int i; + + KASSERT(maxbcachebuf >= MAXBSIZE, + ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, + MAXBSIZE)); + bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock"); + mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); + mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); + mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); + + unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); + + /* finally, initialize each buffer header and stick on empty q */ + for (i = 0; i < nbuf; i++) { + bp = &buf[i]; + bzero(bp, sizeof *bp); + bp->b_flags = B_INVAL; + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + bp->b_qindex = QUEUE_NONE; + bp->b_domain = -1; + bp->b_subqueue = mp_maxid + 1; + bp->b_xflags = 0; + bp->b_data = bp->b_kvabase = unmapped_buf; + LIST_INIT(&bp->b_dep); + BUF_LOCKINIT(bp); + bq_insert(&bqempty, bp, false); + } + + /* + * maxbufspace is the absolute maximum amount of buffer space we are + * allowed to reserve in KVM and in real terms. The absolute maximum + * is nominally used by metadata. hibufspace is the nominal maximum + * used by most other requests. The differential is required to + * ensure that metadata deadlocks don't occur. + * + * maxbufspace is based on BKVASIZE. 
Allocating buffers larger then + * this may result in KVM fragmentation which is not handled optimally + * by the system. XXX This is less true with vmem. We could use + * PAGE_SIZE. + */ + maxbufspace = (long)nbuf * BKVASIZE; + hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); + lobufspace = (hibufspace / 20) * 19; /* 95% */ + bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; + + /* + * Note: The 16 MiB upper limit for hirunningspace was chosen + * arbitrarily and may need further tuning. It corresponds to + * 128 outstanding write IO requests (if IO size is 128 KiB), + * which fits with many RAID controllers' tagged queuing limits. + * The lower 1 MiB limit is the historical upper limit for + * hirunningspace. + */ + hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), + 16 * 1024 * 1024), 1024 * 1024); + lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); + + /* + * Limit the amount of malloc memory since it is wired permanently into + * the kernel space. Even though this is accounted for in the buffer + * allocation, we don't want the malloced region to grow uncontrolled. + * The malloc scheme improves memory utilization significantly on + * average (small) directories. + */ + maxbufmallocspace = hibufspace / 20; + + /* + * Reduce the chance of a deadlock occurring by limiting the number + * of delayed-write dirty buffers we allow to stack up. + */ + hidirtybuffers = nbuf / 4 + 20; + dirtybufthresh = hidirtybuffers * 9 / 10; + /* + * To support extreme low-memory systems, make sure hidirtybuffers + * cannot eat up all available buffer space. This occurs when our + * minimum cannot be met. We try to size hidirtybuffers to 3/4 our + * buffer space assuming BKVASIZE'd buffers. + */ + while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { + hidirtybuffers >>= 1; + } + lodirtybuffers = hidirtybuffers / 2; + + /* + * lofreebuffers should be sufficient to avoid stalling waiting on + * buf headers under heavy utilization. The bufs in per-cpu caches + * are counted as free but will be unavailable to threads executing + * on other cpus. + * + * hifreebuffers is the free target for the bufspace daemon. This + * should be set appropriately to limit work per-iteration. + */ + lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); + hifreebuffers = (3 * lofreebuffers) / 2; + numfreebuffers = nbuf; + + /* Setup the kva and free list allocators. */ + vmem_set_reclaim(buffer_arena, bufkva_reclaim); + buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), + NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); + + /* + * Size the clean queue according to the amount of buffer space. + * One queue per-256mb up to the max. More queues gives better + * concurrency but less accurate LRU. 
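+	 * (Worked example, illustrative figures only: a maxbufspace of about
+	 * 1 GiB gives howmany(1G, 256M) == 4 domains, subject to the
+	 * BUF_DOMAINS cap applied just below.)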
+ */ + buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS); + for (i = 0 ; i < buf_domains; i++) { + struct bufdomain *bd; + + bd = &bdomain[i]; + bd_init(bd); + bd->bd_freebuffers = nbuf / buf_domains; + bd->bd_hifreebuffers = hifreebuffers / buf_domains; + bd->bd_lofreebuffers = lofreebuffers / buf_domains; + bd->bd_bufspace = 0; + bd->bd_maxbufspace = maxbufspace / buf_domains; + bd->bd_hibufspace = hibufspace / buf_domains; + bd->bd_lobufspace = lobufspace / buf_domains; + bd->bd_bufspacethresh = bufspacethresh / buf_domains; + bd->bd_numdirtybuffers = 0; + bd->bd_hidirtybuffers = hidirtybuffers / buf_domains; + bd->bd_lodirtybuffers = lodirtybuffers / buf_domains; + bd->bd_dirtybufthresh = dirtybufthresh / buf_domains; + /* Don't allow more than 2% of bufs in the per-cpu caches. */ + bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus; + } + getnewbufcalls = counter_u64_alloc(M_WAITOK); + getnewbufrestarts = counter_u64_alloc(M_WAITOK); + mappingrestarts = counter_u64_alloc(M_WAITOK); + numbufallocfails = counter_u64_alloc(M_WAITOK); + notbufdflushes = counter_u64_alloc(M_WAITOK); + buffreekvacnt = counter_u64_alloc(M_WAITOK); + bufdefragcnt = counter_u64_alloc(M_WAITOK); + bufkvaspace = counter_u64_alloc(M_WAITOK); +} + +#ifdef INVARIANTS +static inline void +vfs_buf_check_mapped(struct buf *bp) +{ + + KASSERT(bp->b_kvabase != unmapped_buf, + ("mapped buf: b_kvabase was not updated %p", bp)); + KASSERT(bp->b_data != unmapped_buf, + ("mapped buf: b_data was not updated %p", bp)); + KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + + MAXPHYS, ("b_data + b_offset unmapped %p", bp)); +} + +static inline void +vfs_buf_check_unmapped(struct buf *bp) +{ + + KASSERT(bp->b_data == unmapped_buf, + ("unmapped buf: corrupted b_data %p", bp)); +} + +#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) +#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) +#else +#define BUF_CHECK_MAPPED(bp) do {} while (0) +#define BUF_CHECK_UNMAPPED(bp) do {} while (0) +#endif + +static int +isbufbusy(struct buf *bp) +{ + if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || + ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) + return (1); + return (0); +} + +/* + * Shutdown the system cleanly to prepare for reboot, halt, or power off. + */ +void +bufshutdown(int show_busybufs) +{ + static int first_buf_printf = 1; + struct buf *bp; + int iter, nbusy, pbusy; +#ifndef PREEMPTION + int subiter; +#endif + + /* + * Sync filesystems for shutdown + */ + wdog_kern_pat(WD_LASTVAL); + sys_sync(curthread, NULL); + + /* + * With soft updates, some buffers that are + * written will be remarked as dirty until other + * buffers are written. + */ + for (iter = pbusy = 0; iter < 20; iter++) { + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) + if (isbufbusy(bp)) + nbusy++; + if (nbusy == 0) { + if (first_buf_printf) + printf("All buffers synced."); + break; + } + if (first_buf_printf) { + printf("Syncing disks, buffers remaining... "); + first_buf_printf = 0; + } + printf("%d ", nbusy); + if (nbusy < pbusy) + iter = 0; + pbusy = nbusy; + + wdog_kern_pat(WD_LASTVAL); + sys_sync(curthread, NULL); + +#ifdef PREEMPTION + /* + * Spin for a while to allow interrupt threads to run. + */ + DELAY(50000 * iter); +#else + /* + * Context switch several times to allow interrupt + * threads to run. 
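+	 * (With DELAY(1000) each pass takes roughly one millisecond, so
+	 * this spins for up to about 50 * iter ms per outer iteration;
+	 * illustrative timing only.)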
+ */ + for (subiter = 0; subiter < 50 * iter; subiter++) { + thread_lock(curthread); + mi_switch(SW_VOL, NULL); + thread_unlock(curthread); + DELAY(1000); + } +#endif + } + printf("\n"); + /* + * Count only busy local buffers to prevent forcing + * a fsck if we're just a client of a wedged NFS server + */ + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if (isbufbusy(bp)) { +#if 0 +/* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ + if (bp->b_dev == NULL) { + TAILQ_REMOVE(&mountlist, + bp->b_vp->v_mount, mnt_list); + continue; + } +#endif + nbusy++; + if (show_busybufs > 0) { + printf( + "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", + nbusy, bp, bp->b_vp, bp->b_flags, + (intmax_t)bp->b_blkno, + (intmax_t)bp->b_lblkno); + BUF_LOCKPRINTINFO(bp); + if (show_busybufs > 1) + vn_printf(bp->b_vp, + "vnode content: "); + } + } + } + if (nbusy) { + /* + * Failed to sync all blocks. Indicate this and don't + * unmount filesystems (thus forcing an fsck on reboot). + */ + printf("Giving up on %d buffers\n", nbusy); + DELAY(5000000); /* 5 seconds */ + } else { + if (!first_buf_printf) + printf("Final sync complete\n"); + /* + * Unmount filesystems + */ + if (panicstr == NULL) + vfs_unmountall(); + } + swapoff_all(); + DELAY(100000); /* wait for console output to finish */ +} + +static void +bpmap_qenter(struct buf *bp) +{ + + BUF_CHECK_MAPPED(bp); + + /* + * bp->b_data is relative to bp->b_offset, but + * bp->b_offset may be offset into the first page. + */ + bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); + pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); + bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | + (vm_offset_t)(bp->b_offset & PAGE_MASK)); +} + +static inline struct bufdomain * +bufdomain(struct buf *bp) +{ + + return (&bdomain[bp->b_domain]); +} + +static struct bufqueue * +bufqueue(struct buf *bp) +{ + + switch (bp->b_qindex) { + case QUEUE_NONE: + /* FALLTHROUGH */ + case QUEUE_SENTINEL: + return (NULL); + case QUEUE_EMPTY: + return (&bqempty); + case QUEUE_DIRTY: + return (&bufdomain(bp)->bd_dirtyq); + case QUEUE_CLEAN: + return (&bufdomain(bp)->bd_subq[bp->b_subqueue]); + default: + break; + } + panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex); +} + +/* + * Return the locked bufqueue that bp is a member of. + */ +static struct bufqueue * +bufqueue_acquire(struct buf *bp) +{ + struct bufqueue *bq, *nbq; + + /* + * bp can be pushed from a per-cpu queue to the + * cleanq while we're waiting on the lock. Retry + * if the queues don't match. + */ + bq = bufqueue(bp); + BQ_LOCK(bq); + for (;;) { + nbq = bufqueue(bp); + if (bq == nbq) + break; + BQ_UNLOCK(bq); + BQ_LOCK(nbq); + bq = nbq; + } + return (bq); +} + +/* + * binsfree: + * + * Insert the buffer into the appropriate free list. Requires a + * locked buffer on entry and buffer is unlocked before return. + */ +static void +binsfree(struct buf *bp, int qindex) +{ + struct bufdomain *bd; + struct bufqueue *bq; + + KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY, + ("binsfree: Invalid qindex %d", qindex)); + BUF_ASSERT_XLOCKED(bp); + + /* + * Handle delayed bremfree() processing. 
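+	 * (Sketch of the usual sequence seen in this file: a caller does
+	 * bremfree(bp), which merely sets B_REMFREE while the buffer stays
+	 * on its queue, and the deferred removal is completed here when
+	 * brelse() or bqrelse() return the buffer through binsfree().)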
+ */ + if (bp->b_flags & B_REMFREE) { + if (bp->b_qindex == qindex) { + bp->b_flags |= B_REUSE; + bp->b_flags &= ~B_REMFREE; + BUF_UNLOCK(bp); + return; + } + bq = bufqueue_acquire(bp); + bq_remove(bq, bp); + BQ_UNLOCK(bq); + } + bd = bufdomain(bp); + if (qindex == QUEUE_CLEAN) { + if (bd->bd_lim != 0) + bq = &bd->bd_subq[PCPU_GET(cpuid)]; + else + bq = bd->bd_cleanq; + } else + bq = &bd->bd_dirtyq; + bq_insert(bq, bp, true); +} + +/* + * buf_free: + * + * Free a buffer to the buf zone once it no longer has valid contents. + */ +static void +buf_free(struct buf *bp) +{ + + if (bp->b_flags & B_REMFREE) + bremfreef(bp); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("losing buffer 1"); + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + if (!LIST_EMPTY(&bp->b_dep)) + buf_deallocate(bp); + bufkva_free(bp); + atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1); + BUF_UNLOCK(bp); + uma_zfree(buf_zone, bp); +} + +/* + * buf_import: + * + * Import bufs into the uma cache from the buf list. The system still + * expects a static array of bufs and much of the synchronization + * around bufs assumes type stable storage. As a result, UMA is used + * only as a per-cpu cache of bufs still maintained on a global list. + */ +static int +buf_import(void *arg, void **store, int cnt, int domain, int flags) +{ + struct buf *bp; + int i; + + BQ_LOCK(&bqempty); + for (i = 0; i < cnt; i++) { + bp = TAILQ_FIRST(&bqempty.bq_queue); + if (bp == NULL) + break; + bq_remove(&bqempty, bp); + store[i] = bp; + } + BQ_UNLOCK(&bqempty); + + return (i); +} + +/* + * buf_release: + * + * Release bufs from the uma cache back to the buffer queues. + */ +static void +buf_release(void *arg, void **store, int cnt) +{ + struct bufqueue *bq; + struct buf *bp; + int i; + + bq = &bqempty; + BQ_LOCK(bq); + for (i = 0; i < cnt; i++) { + bp = store[i]; + /* Inline bq_insert() to batch locking. */ + TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); + bp->b_flags &= ~(B_AGE | B_REUSE); + bq->bq_len++; + bp->b_qindex = bq->bq_index; + } + BQ_UNLOCK(bq); +} + +/* + * buf_alloc: + * + * Allocate an empty buffer header. + */ +static struct buf * +buf_alloc(struct bufdomain *bd) +{ + struct buf *bp; + int freebufs; + + /* + * We can only run out of bufs in the buf zone if the average buf + * is less than BKVASIZE. In this case the actual wait/block will + * come from buf_reycle() failing to flush one of these small bufs. + */ + bp = NULL; + freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1); + if (freebufs > 0) + bp = uma_zalloc(buf_zone, M_NOWAIT); + if (bp == NULL) { + atomic_add_int(&bd->bd_freebuffers, 1); + bufspace_daemon_wakeup(bd); + counter_u64_add(numbufallocfails, 1); + return (NULL); + } + /* + * Wake-up the bufspace daemon on transition below threshold. + */ + if (freebufs == bd->bd_lofreebuffers) + bufspace_daemon_wakeup(bd); + + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) + panic("getnewbuf_empty: Locked buf %p on free queue.", bp); + + KASSERT(bp->b_vp == NULL, + ("bp: %p still has vnode %p.", bp, bp->b_vp)); + KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, + ("invalid buffer %p flags %#x", bp, bp->b_flags)); + KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, + ("bp: %p still on a buffer list. 
xflags %X", bp, bp->b_xflags)); + KASSERT(bp->b_npages == 0, + ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); + KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); + KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); + + bp->b_domain = BD_DOMAIN(bd); + bp->b_flags = 0; + bp->b_ioflags = 0; + bp->b_xflags = 0; + bp->b_vflags = 0; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_offset = NOOFFSET; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_bufobj = NULL; + bp->b_data = bp->b_kvabase = unmapped_buf; + bp->b_fsprivate1 = NULL; + bp->b_fsprivate2 = NULL; + bp->b_fsprivate3 = NULL; + LIST_INIT(&bp->b_dep); + + return (bp); +} + +/* + * buf_recycle: + * + * Free a buffer from the given bufqueue. kva controls whether the + * freed buf must own some kva resources. This is used for + * defragmenting. + */ +static int +buf_recycle(struct bufdomain *bd, bool kva) +{ + struct bufqueue *bq; + struct buf *bp, *nbp; + + if (kva) + counter_u64_add(bufdefragcnt, 1); + nbp = NULL; + bq = bd->bd_cleanq; + BQ_LOCK(bq); + KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd), + ("buf_recycle: Locks don't match")); + nbp = TAILQ_FIRST(&bq->bq_queue); + + /* + * Run scan, possibly freeing data and/or kva mappings on the fly + * depending. + */ + while ((bp = nbp) != NULL) { + /* + * Calculate next bp (we can only use it if we do not + * release the bqlock). + */ + nbp = TAILQ_NEXT(bp, b_freelist); + + /* + * If we are defragging then we need a buffer with + * some kva to reclaim. + */ + if (kva && bp->b_kvasize == 0) + continue; + + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) + continue; + + /* + * Implement a second chance algorithm for frequently + * accessed buffers. + */ + if ((bp->b_flags & B_REUSE) != 0) { + TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); + TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); + bp->b_flags &= ~B_REUSE; + BUF_UNLOCK(bp); + continue; + } + + /* + * Skip buffers with background writes in progress. + */ + if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { + BUF_UNLOCK(bp); + continue; + } + + KASSERT(bp->b_qindex == QUEUE_CLEAN, + ("buf_recycle: inconsistent queue %d bp %p", + bp->b_qindex, bp)); + KASSERT(bp->b_domain == BD_DOMAIN(bd), + ("getnewbuf: queue domain %d doesn't match request %d", + bp->b_domain, (int)BD_DOMAIN(bd))); + /* + * NOTE: nbp is now entirely invalid. We can only restart + * the scan from this point on. + */ + bq_remove(bq, bp); + BQ_UNLOCK(bq); + + /* + * Requeue the background write buffer with error and + * restart the scan. + */ + if ((bp->b_vflags & BV_BKGRDERR) != 0) { + bqrelse(bp); + BQ_LOCK(bq); + nbp = TAILQ_FIRST(&bq->bq_queue); + continue; + } + bp->b_flags |= B_INVAL; + brelse(bp); + return (0); + } + bd->bd_wanted = 1; + BQ_UNLOCK(bq); + + return (ENOBUFS); +} + +/* + * bremfree: + * + * Mark the buffer for removal from the appropriate free list. + * + */ +void +bremfree(struct buf *bp) +{ + + CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + KASSERT((bp->b_flags & B_REMFREE) == 0, + ("bremfree: buffer %p already marked for delayed removal.", bp)); + KASSERT(bp->b_qindex != QUEUE_NONE, + ("bremfree: buffer %p not on a queue.", bp)); + BUF_ASSERT_XLOCKED(bp); + + bp->b_flags |= B_REMFREE; +} + +/* + * bremfreef: + * + * Force an immediate removal from a free list. Used only in nfs when + * it abuses the b_freelist pointer. 
+ */ +void +bremfreef(struct buf *bp) +{ + struct bufqueue *bq; + + bq = bufqueue_acquire(bp); + bq_remove(bq, bp); + BQ_UNLOCK(bq); +} + +static void +bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname) +{ + + mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF); + TAILQ_INIT(&bq->bq_queue); + bq->bq_len = 0; + bq->bq_index = qindex; + bq->bq_subqueue = subqueue; +} + +static void +bd_init(struct bufdomain *bd) +{ + int i; + + bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1]; + bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock"); + bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock"); + for (i = 0; i <= mp_maxid; i++) + bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i, + "bufq clean subqueue lock"); + mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF); +} + +/* + * bq_remove: + * + * Removes a buffer from the free list, must be called with the + * correct qlock held. + */ +static void +bq_remove(struct bufqueue *bq, struct buf *bp) +{ + + CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X", + bp, bp->b_vp, bp->b_flags); + KASSERT(bp->b_qindex != QUEUE_NONE, + ("bq_remove: buffer %p not on a queue.", bp)); + KASSERT(bufqueue(bp) == bq, + ("bq_remove: Remove buffer %p from wrong queue.", bp)); + + BQ_ASSERT_LOCKED(bq); + if (bp->b_qindex != QUEUE_EMPTY) { + BUF_ASSERT_XLOCKED(bp); + } + KASSERT(bq->bq_len >= 1, + ("queue %d underflow", bp->b_qindex)); + TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); + bq->bq_len--; + bp->b_qindex = QUEUE_NONE; + bp->b_flags &= ~(B_REMFREE | B_REUSE); +} + +static void +bd_flush(struct bufdomain *bd, struct bufqueue *bq) +{ + struct buf *bp; + + BQ_ASSERT_LOCKED(bq); + if (bq != bd->bd_cleanq) { + BD_LOCK(bd); + while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) { + TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); + TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp, + b_freelist); + bp->b_subqueue = bd->bd_cleanq->bq_subqueue; + } + bd->bd_cleanq->bq_len += bq->bq_len; + bq->bq_len = 0; + } + if (bd->bd_wanted) { + bd->bd_wanted = 0; + wakeup(&bd->bd_wanted); + } + if (bq != bd->bd_cleanq) + BD_UNLOCK(bd); +} + +static int +bd_flushall(struct bufdomain *bd) +{ + struct bufqueue *bq; + int flushed; + int i; + + if (bd->bd_lim == 0) + return (0); + flushed = 0; + for (i = 0; i <= mp_maxid; i++) { + bq = &bd->bd_subq[i]; + if (bq->bq_len == 0) + continue; + BQ_LOCK(bq); + bd_flush(bd, bq); + BQ_UNLOCK(bq); + flushed++; + } + + return (flushed); +} + +static void +bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock) +{ + struct bufdomain *bd; + + if (bp->b_qindex != QUEUE_NONE) + panic("bq_insert: free buffer %p onto another queue?", bp); + + bd = bufdomain(bp); + if (bp->b_flags & B_AGE) { + /* Place this buf directly on the real queue. */ + if (bq->bq_index == QUEUE_CLEAN) + bq = bd->bd_cleanq; + BQ_LOCK(bq); + TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist); + } else { + BQ_LOCK(bq); + TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); + } + bp->b_flags &= ~(B_AGE | B_REUSE); + bq->bq_len++; + bp->b_qindex = bq->bq_index; + bp->b_subqueue = bq->bq_subqueue; + + /* + * Unlock before we notify so that we don't wakeup a waiter that + * fails a trylock on the buf and sleeps again. + */ + if (unlock) + BUF_UNLOCK(bp); + + if (bp->b_qindex == QUEUE_CLEAN) { + /* + * Flush the per-cpu queue and notify any waiters. + */ + if (bd->bd_wanted || (bq != bd->bd_cleanq && + bq->bq_len >= bd->bd_lim)) + bd_flush(bd, bq); + } + BQ_UNLOCK(bq); +} + +/* + * bufkva_free: + * + * Free the kva allocation for a buffer. 
+ * + */ +static void +bufkva_free(struct buf *bp) +{ + +#ifdef INVARIANTS + if (bp->b_kvasize == 0) { + KASSERT(bp->b_kvabase == unmapped_buf && + bp->b_data == unmapped_buf, + ("Leaked KVA space on %p", bp)); + } else if (buf_mapped(bp)) + BUF_CHECK_MAPPED(bp); + else + BUF_CHECK_UNMAPPED(bp); +#endif + if (bp->b_kvasize == 0) + return; + + vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); + counter_u64_add(bufkvaspace, -bp->b_kvasize); + counter_u64_add(buffreekvacnt, 1); + bp->b_data = bp->b_kvabase = unmapped_buf; + bp->b_kvasize = 0; +} + +/* + * bufkva_alloc: + * + * Allocate the buffer KVA and set b_kvasize and b_kvabase. + */ +static int +bufkva_alloc(struct buf *bp, int maxsize, int gbflags) +{ + vm_offset_t addr; + int error; + + KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, + ("Invalid gbflags 0x%x in %s", gbflags, __func__)); + + bufkva_free(bp); + + addr = 0; + error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); + if (error != 0) { + /* + * Buffer map is too fragmented. Request the caller + * to defragment the map. + */ + return (error); + } + bp->b_kvabase = (caddr_t)addr; + bp->b_kvasize = maxsize; + counter_u64_add(bufkvaspace, bp->b_kvasize); + if ((gbflags & GB_UNMAPPED) != 0) { + bp->b_data = unmapped_buf; + BUF_CHECK_UNMAPPED(bp); + } else { + bp->b_data = bp->b_kvabase; + BUF_CHECK_MAPPED(bp); + } + return (0); +} + +/* + * bufkva_reclaim: + * + * Reclaim buffer kva by freeing buffers holding kva. This is a vmem + * callback that fires to avoid returning failure. + */ +static void +bufkva_reclaim(vmem_t *vmem, int flags) +{ + bool done; + int q; + int i; + + done = false; + for (i = 0; i < 5; i++) { + for (q = 0; q < buf_domains; q++) + if (buf_recycle(&bdomain[q], true) != 0) + done = true; + if (done) + break; + } + return; +} + +/* + * Attempt to initiate asynchronous I/O on read-ahead blocks. We must + * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, + * the buffer is valid and we do not have to do anything. + */ +static void +breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, + struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *)) +{ + struct buf *rabp; + struct thread *td; + int i; + + td = curthread; + + for (i = 0; i < cnt; i++, rablkno++, rabsize++) { + if (inmem(vp, *rablkno)) + continue; + rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); + if ((rabp->b_flags & B_CACHE) != 0) { + brelse(rabp); + continue; + } +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, rabp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ + td->td_ru.ru_inblock++; + rabp->b_flags |= B_ASYNC; + rabp->b_flags &= ~B_INVAL; + if ((flags & GB_CKHASH) != 0) { + rabp->b_flags |= B_CKHASH; + rabp->b_ckhashcalc = ckhashfunc; + } + rabp->b_ioflags &= ~BIO_ERROR; + rabp->b_iocmd = BIO_READ; + if (rabp->b_rcred == NOCRED && cred != NOCRED) + rabp->b_rcred = crhold(cred); + vfs_busy_pages(rabp, 0); + BUF_KERNPROC(rabp); + rabp->b_iooffset = dbtob(rabp->b_blkno); + bstrategy(rabp); + } +} + +/* + * Entry point for bread() and breadn() via #defines in sys/buf.h. + * + * Get a buffer with the specified data. Look in the cache first. We + * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE + * is set, the buffer is valid and we do not have to do anything, see + * getblk(). Also starts asynchronous I/O on read-ahead blocks. + * + * Always return a NULL buffer pointer (in bpp) when returning an error. 
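+ *
+ * A minimal caller sketch, assuming the bread() wrapper from sys/buf.h
+ * (illustrative only):
+ *
+ *	struct buf *bp;
+ *	int error;
+ *
+ *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
+ *	if (error != 0)
+ *		return (error);		(bp comes back NULL on error)
+ *	... inspect bp->b_data ...
+ *	brelse(bp);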
+ */ +int +breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, + int *rabsize, int cnt, struct ucred *cred, int flags, + void (*ckhashfunc)(struct buf *), struct buf **bpp) +{ + struct buf *bp; + struct thread *td; + int error, readwait, rv; + + CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); + td = curthread; + /* + * Can only return NULL if GB_LOCK_NOWAIT or GB_SPARSE flags + * are specified. + */ + error = getblkx(vp, blkno, size, 0, 0, flags, &bp); + if (error != 0) { + *bpp = NULL; + return (error); + } + flags &= ~GB_NOSPARSE; + *bpp = bp; + + /* + * If not found in cache, do some I/O + */ + readwait = 0; + if ((bp->b_flags & B_CACHE) == 0) { +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_add_buf(td->td_proc, bp, 0); + PROC_UNLOCK(td->td_proc); + } +#endif /* RACCT */ + td->td_ru.ru_inblock++; + bp->b_iocmd = BIO_READ; + bp->b_flags &= ~B_INVAL; + if ((flags & GB_CKHASH) != 0) { + bp->b_flags |= B_CKHASH; + bp->b_ckhashcalc = ckhashfunc; + } + bp->b_ioflags &= ~BIO_ERROR; + if (bp->b_rcred == NOCRED && cred != NOCRED) + bp->b_rcred = crhold(cred); + vfs_busy_pages(bp, 0); + bp->b_iooffset = dbtob(bp->b_blkno); + bstrategy(bp); + ++readwait; + } + + /* + * Attempt to initiate asynchronous I/O on read-ahead blocks. + */ + breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc); + + rv = 0; + if (readwait) { + rv = bufwait(bp); + if (rv != 0) { + brelse(bp); + *bpp = NULL; + } + } + return (rv); +} + +/* + * Write, release buffer on completion. (Done by iodone + * if async). Do not bother writing anything if the buffer + * is invalid. + * + * Note that we set B_CACHE here, indicating that buffer is + * fully valid and thus cacheable. This is true even of NFS + * now so we set it generally. This could be set either here + * or in biodone() since the I/O is synchronous. We put it + * here. + */ +int +bufwrite(struct buf *bp) +{ + int oldflags; + struct vnode *vp; + long space; + int vp_md; + + CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { + bp->b_flags |= B_INVAL | B_RELBUF; + bp->b_flags &= ~B_CACHE; + brelse(bp); + return (ENXIO); + } + if (bp->b_flags & B_INVAL) { + brelse(bp); + return (0); + } + + if (bp->b_flags & B_BARRIER) + atomic_add_long(&barrierwrites, 1); + + oldflags = bp->b_flags; + + BUF_ASSERT_HELD(bp); + + KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), + ("FFS background buffer should not get here %p", bp)); + + vp = bp->b_vp; + if (vp) + vp_md = vp->v_vflag & VV_MD; + else + vp_md = 0; + + /* + * Mark the buffer clean. Increment the bufobj write count + * before bundirty() call, to prevent other thread from seeing + * empty dirty list and zero counter for writes in progress, + * falsely indicating that the bufobj is clean. 
+ */ + bufobj_wref(bp->b_bufobj); + bundirty(bp); + + bp->b_flags &= ~B_DONE; + bp->b_ioflags &= ~BIO_ERROR; + bp->b_flags |= B_CACHE; + bp->b_iocmd = BIO_WRITE; + + vfs_busy_pages(bp, 1); + + /* + * Normal bwrites pipeline writes + */ + bp->b_runningbufspace = bp->b_bufsize; + space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); + +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 1); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ + curthread->td_ru.ru_oublock++; + if (oldflags & B_ASYNC) + BUF_KERNPROC(bp); + bp->b_iooffset = dbtob(bp->b_blkno); + buf_track(bp, __func__); + bstrategy(bp); + + if ((oldflags & B_ASYNC) == 0) { + int rtval = bufwait(bp); + brelse(bp); + return (rtval); + } else if (space > hirunningspace) { + /* + * don't allow the async write to saturate the I/O + * system. We will not deadlock here because + * we are blocking waiting for I/O that is already in-progress + * to complete. We do not block here if it is the update + * or syncer daemon trying to clean up as that can lead + * to deadlock. + */ + if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) + waitrunningbufspace(); + } + + return (0); +} + +void +bufbdflush(struct bufobj *bo, struct buf *bp) +{ + struct buf *nbp; + + if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { + (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); + altbufferflushes++; + } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { + BO_LOCK(bo); + /* + * Try to find a buffer to flush. + */ + TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { + if ((nbp->b_vflags & BV_BKGRDINPROG) || + BUF_LOCK(nbp, + LK_EXCLUSIVE | LK_NOWAIT, NULL)) + continue; + if (bp == nbp) + panic("bdwrite: found ourselves"); + BO_UNLOCK(bo); + /* Don't countdeps with the bo lock held. */ + if (buf_countdeps(nbp, 0)) { + BO_LOCK(bo); + BUF_UNLOCK(nbp); + continue; + } + if (nbp->b_flags & B_CLUSTEROK) { + vfs_bio_awrite(nbp); + } else { + bremfree(nbp); + bawrite(nbp); + } + dirtybufferflushes++; + break; + } + if (nbp == NULL) + BO_UNLOCK(bo); + } +} + +/* + * Delayed write. (Buffer is marked dirty). Do not bother writing + * anything if the buffer is marked invalid. + * + * Note that since the buffer must be completely valid, we can safely + * set B_CACHE. In fact, we have to set B_CACHE here rather then in + * biodone() in order to prevent getblk from writing the buffer + * out synchronously. + */ +void +bdwrite(struct buf *bp) +{ + struct thread *td = curthread; + struct vnode *vp; + struct bufobj *bo; + + CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); + KASSERT((bp->b_flags & B_BARRIER) == 0, + ("Barrier request in delayed write %p", bp)); + BUF_ASSERT_HELD(bp); + + if (bp->b_flags & B_INVAL) { + brelse(bp); + return; + } + + /* + * If we have too many dirty buffers, don't create any more. + * If we are wildly over our limit, then force a complete + * cleanup. Otherwise, just keep the situation from getting + * out of control. Note that we have to avoid a recursive + * disaster and not try to clean up after our own cleanup! + */ + vp = bp->b_vp; + bo = bp->b_bufobj; + if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { + td->td_pflags |= TDP_INBDFLUSH; + BO_BDFLUSH(bo, bp); + td->td_pflags &= ~TDP_INBDFLUSH; + } else + recursiveflushes++; + + bdirty(bp); + /* + * Set B_CACHE, indicating that the buffer is fully valid. This is + * true even of NFS now. 
+ */ + bp->b_flags |= B_CACHE; + + /* + * This bmap keeps the system from needing to do the bmap later, + * perhaps when the system is attempting to do a sync. Since it + * is likely that the indirect block -- or whatever other datastructure + * that the filesystem needs is still in memory now, it is a good + * thing to do this. Note also, that if the pageout daemon is + * requesting a sync -- there might not be enough memory to do + * the bmap then... So, this is important to do. + */ + if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { + VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); + } + + buf_track(bp, __func__); + + /* + * Set the *dirty* buffer range based upon the VM system dirty + * pages. + * + * Mark the buffer pages as clean. We need to do this here to + * satisfy the vnode_pager and the pageout daemon, so that it + * thinks that the pages have been "cleaned". Note that since + * the pages are in a delayed write buffer -- the VFS layer + * "will" see that the pages get written out on the next sync, + * or perhaps the cluster will be completed. + */ + vfs_clean_pages_dirty_buf(bp); + bqrelse(bp); + + /* + * note: we cannot initiate I/O from a bdwrite even if we wanted to, + * due to the softdep code. + */ +} + +/* + * bdirty: + * + * Turn buffer into delayed write request. We must clear BIO_READ and + * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to + * itself to properly update it in the dirty/clean lists. We mark it + * B_DONE to ensure that any asynchronization of the buffer properly + * clears B_DONE ( else a panic will occur later ). + * + * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which + * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() + * should only be called if the buffer is known-good. + * + * Since the buffer is not on a queue, we do not update the numfreebuffers + * count. + * + * The buffer must be on QUEUE_NONE. + */ +void +bdirty(struct buf *bp) +{ + + CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", + bp, bp->b_vp, bp->b_flags); + KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); + KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, + ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); + BUF_ASSERT_HELD(bp); + bp->b_flags &= ~(B_RELBUF); + bp->b_iocmd = BIO_WRITE; + + if ((bp->b_flags & B_DELWRI) == 0) { + bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; + reassignbuf(bp); + bdirtyadd(bp); + } +} + +/* + * bundirty: + * + * Clear B_DELWRI for buffer. + * + * Since the buffer is not on a queue, we do not update the numfreebuffers + * count. + * + * The buffer must be on QUEUE_NONE. + */ + +void +bundirty(struct buf *bp) +{ + + CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); + KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, + ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); + BUF_ASSERT_HELD(bp); + + if (bp->b_flags & B_DELWRI) { + bp->b_flags &= ~B_DELWRI; + reassignbuf(bp); + bdirtysub(bp); + } + /* + * Since it is now being written, we can clear its deferred write flag. + */ + bp->b_flags &= ~B_DEFERRED; +} + +/* + * bawrite: + * + * Asynchronous write. Start output on a buffer, but do not wait for + * it to complete. The buffer is released when the output completes. + * + * bwrite() ( or the VOP routine anyway ) is responsible for handling + * B_INVAL buffers. Not us. 
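+ *
+ *	(For contrast with the other write paths earlier in this file:
+ *	bwrite() waits for the I/O unless B_ASYNC is already set,
+ *	bdwrite() only marks the buffer dirty and requeues it without
+ *	starting I/O, and bawrite() below is simply bwrite() with
+ *	B_ASYNC set.)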
+ */ +void +bawrite(struct buf *bp) +{ + + bp->b_flags |= B_ASYNC; + (void) bwrite(bp); +} + +/* + * babarrierwrite: + * + * Asynchronous barrier write. Start output on a buffer, but do not + * wait for it to complete. Place a write barrier after this write so + * that this buffer and all buffers written before it are committed to + * the disk before any buffers written after this write are committed + * to the disk. The buffer is released when the output completes. + */ +void +babarrierwrite(struct buf *bp) +{ + + bp->b_flags |= B_ASYNC | B_BARRIER; + (void) bwrite(bp); +} + +/* + * bbarrierwrite: + * + * Synchronous barrier write. Start output on a buffer and wait for + * it to complete. Place a write barrier after this write so that + * this buffer and all buffers written before it are committed to + * the disk before any buffers written after this write are committed + * to the disk. The buffer is released when the output completes. + */ +int +bbarrierwrite(struct buf *bp) +{ + + bp->b_flags |= B_BARRIER; + return (bwrite(bp)); +} + +/* + * bwillwrite: + * + * Called prior to the locking of any vnodes when we are expecting to + * write. We do not want to starve the buffer cache with too many + * dirty buffers so we block here. By blocking prior to the locking + * of any vnodes we attempt to avoid the situation where a locked vnode + * prevents the various system daemons from flushing related buffers. + */ +void +bwillwrite(void) +{ + + if (buf_dirty_count_severe()) { + mtx_lock(&bdirtylock); + while (buf_dirty_count_severe()) { + bdirtywait = 1; + msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), + "flswai", 0); + } + mtx_unlock(&bdirtylock); + } +} + +/* + * Return true if we have too many dirty buffers. + */ +int +buf_dirty_count_severe(void) +{ + + return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty)); +} + +/* + * brelse: + * + * Release a busy buffer and, if requested, free its resources. The + * buffer will be stashed in the appropriate bufqueue[] allowing it + * to be accessed later as a cache entity or reused for other purposes. + */ +void +brelse(struct buf *bp) +{ + struct mount *v_mnt; + int qindex; + + /* + * Many functions erroneously call brelse with a NULL bp under rare + * error conditions. Simply return when called with a NULL bp. + */ + if (bp == NULL) + return; + CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", + bp, bp->b_vp, bp->b_flags); + KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), + ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, + ("brelse: non-VMIO buffer marked NOREUSE")); + + if (BUF_LOCKRECURSED(bp)) { + /* + * Do not process, in particular, do not handle the + * B_INVAL/B_RELBUF and do not release to free list. + */ + BUF_UNLOCK(bp); + return; + } + + if (bp->b_flags & B_MANAGED) { + bqrelse(bp); + return; + } + + if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { + BO_LOCK(bp->b_bufobj); + bp->b_vflags &= ~BV_BKGRDERR; + BO_UNLOCK(bp->b_bufobj); + bdirty(bp); + } + if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && + (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && + !(bp->b_flags & B_INVAL)) { + /* + * Failed write, redirty. All errors except ENXIO (which + * means the device is gone) are treated as being + * transient. + * + * XXX Treating EIO as transient is not correct; the + * contract with the local storage device drivers is that + * they will only return EIO once the I/O is no longer + * retriable. 
Network I/O also respects this through the + * guarantees of TCP and/or the internal retries of NFS. + * ENOMEM might be transient, but we also have no way of + * knowing when its ok to retry/reschedule. In general, + * this entire case should be made obsolete through better + * error handling/recovery and resource scheduling. + * + * Do this also for buffers that failed with ENXIO, but have + * non-empty dependencies - the soft updates code might need + * to access the buffer to untangle them. + * + * Must clear BIO_ERROR to prevent pages from being scrapped. + */ + bp->b_ioflags &= ~BIO_ERROR; + bdirty(bp); + } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || + (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { + /* + * Either a failed read I/O, or we were asked to free or not + * cache the buffer, or we failed to write to a device that's + * no longer present. + */ + bp->b_flags |= B_INVAL; + if (!LIST_EMPTY(&bp->b_dep)) + buf_deallocate(bp); + if (bp->b_flags & B_DELWRI) + bdirtysub(bp); + bp->b_flags &= ~(B_DELWRI | B_CACHE); + if ((bp->b_flags & B_VMIO) == 0) { + allocbuf(bp, 0); + if (bp->b_vp) + brelvp(bp); + } + } + + /* + * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() + * is called with B_DELWRI set, the underlying pages may wind up + * getting freed causing a previous write (bdwrite()) to get 'lost' + * because pages associated with a B_DELWRI bp are marked clean. + * + * We still allow the B_INVAL case to call vfs_vmio_truncate(), even + * if B_DELWRI is set. + */ + if (bp->b_flags & B_DELWRI) + bp->b_flags &= ~B_RELBUF; + + /* + * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer + * constituted, not even NFS buffers now. Two flags effect this. If + * B_INVAL, the struct buf is invalidated but the VM object is kept + * around ( i.e. so it is trivial to reconstitute the buffer later ). + * + * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be + * invalidated. BIO_ERROR cannot be set for a failed write unless the + * buffer is also B_INVAL because it hits the re-dirtying code above. + * + * Normally we can do this whether a buffer is B_DELWRI or not. If + * the buffer is an NFS buffer, it is tracking piecemeal writes or + * the commit state and we cannot afford to lose the buffer. If the + * buffer has a background write in progress, we need to keep it + * around to prevent it from being reconstituted and starting a second + * background write. + */ + + v_mnt = bp->b_vp != NULL ? bp->b_vp->v_mount : NULL; + + if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || + (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && + (v_mnt == NULL || (v_mnt->mnt_vfc->vfc_flags & VFCF_NETWORK) == 0 || + vn_isdisk(bp->b_vp, NULL) || (bp->b_flags & B_DELWRI) == 0)) { + vfs_vmio_invalidate(bp); + allocbuf(bp, 0); + } + + if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || + (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { + allocbuf(bp, 0); + bp->b_flags &= ~B_NOREUSE; + if (bp->b_vp != NULL) + brelvp(bp); + } + + /* + * If the buffer has junk contents signal it and eventually + * clean up B_DELWRI and diassociate the vnode so that gbincore() + * doesn't find it. 
+ */ + if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || + (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) + bp->b_flags |= B_INVAL; + if (bp->b_flags & B_INVAL) { + if (bp->b_flags & B_DELWRI) + bundirty(bp); + if (bp->b_vp) + brelvp(bp); + } + + buf_track(bp, __func__); + + /* buffers with no memory */ + if (bp->b_bufsize == 0) { + buf_free(bp); + return; + } + /* buffers with junk contents */ + if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || + (bp->b_ioflags & BIO_ERROR)) { + bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("losing buffer 2"); + qindex = QUEUE_CLEAN; + bp->b_flags |= B_AGE; + /* remaining buffers */ + } else if (bp->b_flags & B_DELWRI) + qindex = QUEUE_DIRTY; + else + qindex = QUEUE_CLEAN; + + if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) + panic("brelse: not dirty"); + + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT); + /* binsfree unlocks bp. */ + binsfree(bp, qindex); +} + +/* + * Release a buffer back to the appropriate queue but do not try to free + * it. The buffer is expected to be used again soon. + * + * bqrelse() is used by bdwrite() to requeue a delayed write, and used by + * biodone() to requeue an async I/O on completion. It is also used when + * known good buffers need to be requeued but we think we may need the data + * again soon. + * + * XXX we should be able to leave the B_RELBUF hint set on completion. + */ +void +bqrelse(struct buf *bp) +{ + int qindex; + + CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), + ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + + qindex = QUEUE_NONE; + if (BUF_LOCKRECURSED(bp)) { + /* do not release to free list */ + BUF_UNLOCK(bp); + return; + } + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + + if (bp->b_flags & B_MANAGED) { + if (bp->b_flags & B_REMFREE) + bremfreef(bp); + goto out; + } + + /* buffers with stale but valid contents */ + if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | + BV_BKGRDERR)) == BV_BKGRDERR) { + BO_LOCK(bp->b_bufobj); + bp->b_vflags &= ~BV_BKGRDERR; + BO_UNLOCK(bp->b_bufobj); + qindex = QUEUE_DIRTY; + } else { + if ((bp->b_flags & B_DELWRI) == 0 && + (bp->b_xflags & BX_VNDIRTY)) + panic("bqrelse: not dirty"); + if ((bp->b_flags & B_NOREUSE) != 0) { + brelse(bp); + return; + } + qindex = QUEUE_CLEAN; + } + buf_track(bp, __func__); + /* binsfree unlocks bp. */ + binsfree(bp, qindex); + return; + +out: + buf_track(bp, __func__); + /* unlock */ + BUF_UNLOCK(bp); +} + +/* + * Complete I/O to a VMIO backed page. Validate the pages as appropriate, + * restore bogus pages. 
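+ * (The bogus_page entries handled here are placeholders that
+ * vfs_busy_pages() may have installed over already-valid pages so that
+ * the read does not clobber their contents; they are swapped back via
+ * vm_page_lookup() and, for mapped buffers, re-entered by the closing
+ * pmap_qenter().)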
+ */ +static void +vfs_vmio_iodone(struct buf *bp) +{ + vm_ooffset_t foff; + vm_page_t m; + vm_object_t obj; + struct vnode *vp __unused; + int i, iosize, resid; + bool bogus; + + obj = bp->b_bufobj->bo_object; + KASSERT(obj->paging_in_progress >= bp->b_npages, + ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", + obj->paging_in_progress, bp->b_npages)); + + vp = bp->b_vp; + KASSERT(vp->v_holdcnt > 0, + ("vfs_vmio_iodone: vnode %p has zero hold count", vp)); + KASSERT(vp->v_object != NULL, + ("vfs_vmio_iodone: vnode %p has no vm_object", vp)); + + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); + + bogus = false; + iosize = bp->b_bcount - bp->b_resid; + VM_OBJECT_WLOCK(obj); + for (i = 0; i < bp->b_npages; i++) { + resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; + if (resid > iosize) + resid = iosize; + + /* + * cleanup bogus pages, restoring the originals + */ + m = bp->b_pages[i]; + if (m == bogus_page) { + bogus = true; + m = vm_page_lookup(obj, OFF_TO_IDX(foff)); + if (m == NULL) + panic("biodone: page disappeared!"); + bp->b_pages[i] = m; + } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { + /* + * In the write case, the valid and clean bits are + * already changed correctly ( see bdwrite() ), so we + * only need to do this here in the read case. + */ + KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, + resid)) == 0, ("vfs_vmio_iodone: page %p " + "has unexpected dirty bits", m)); + vfs_page_set_valid(bp, foff, m); + } + KASSERT(OFF_TO_IDX(foff) == m->pindex, + ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", + (intmax_t)foff, (uintmax_t)m->pindex)); + + vm_page_sunbusy(m); + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + iosize -= resid; + } + vm_object_pip_wakeupn(obj, bp->b_npages); + VM_OBJECT_WUNLOCK(obj); + if (bogus && buf_mapped(bp)) { + BUF_CHECK_MAPPED(bp); + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), + bp->b_pages, bp->b_npages); + } +} + +/* + * Perform page invalidation when a buffer is released. The fully invalid + * pages will be reclaimed later in vfs_vmio_truncate(). + */ +static void +vfs_vmio_invalidate(struct buf *bp) +{ + vm_object_t obj; + vm_page_t m; + int flags, i, resid, poffset, presid; + + if (buf_mapped(bp)) { + BUF_CHECK_MAPPED(bp); + pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); + } else + BUF_CHECK_UNMAPPED(bp); + /* + * Get the base offset and length of the buffer. Note that + * in the VMIO case if the buffer block size is not + * page-aligned then b_data pointer may not be page-aligned. + * But our b_pages[] array *IS* page aligned. + * + * block sizes less then DEV_BSIZE (usually 512) are not + * supported due to the page granularity bits (m->valid, + * m->dirty, etc...). + * + * See man buf(9) for more information + */ + flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; + obj = bp->b_bufobj->bo_object; + resid = bp->b_bufsize; + poffset = bp->b_offset & PAGE_MASK; + VM_OBJECT_WLOCK(obj); + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + if (m == bogus_page) + panic("vfs_vmio_invalidate: Unexpected bogus page."); + bp->b_pages[i] = NULL; + + presid = resid > (PAGE_SIZE - poffset) ? 
+ (PAGE_SIZE - poffset) : resid; + KASSERT(presid >= 0, ("brelse: extra page")); + while (vm_page_xbusied(m)) { + vm_page_lock(m); + VM_OBJECT_WUNLOCK(obj); + vm_page_busy_sleep(m, "mbncsh", true); + VM_OBJECT_WLOCK(obj); + } + if (pmap_page_wired_mappings(m) == 0) + vm_page_set_invalid(m, poffset, presid); + vm_page_release_locked(m, flags); + resid -= presid; + poffset = 0; + } + VM_OBJECT_WUNLOCK(obj); + bp->b_npages = 0; +} + +/* + * Page-granular truncation of an existing VMIO buffer. + */ +static void +vfs_vmio_truncate(struct buf *bp, int desiredpages) +{ + vm_object_t obj; + vm_page_t m; + int flags, i; + + if (bp->b_npages == desiredpages) + return; + + if (buf_mapped(bp)) { + BUF_CHECK_MAPPED(bp); + pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); + } else + BUF_CHECK_UNMAPPED(bp); + + /* + * The object lock is needed only if we will attempt to free pages. + */ + flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; + if ((bp->b_flags & B_DIRECT) != 0) { + flags |= VPR_TRYFREE; + obj = bp->b_bufobj->bo_object; + VM_OBJECT_WLOCK(obj); + } else { + obj = NULL; + } + for (i = desiredpages; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + KASSERT(m != bogus_page, ("allocbuf: bogus page found")); + bp->b_pages[i] = NULL; + if (obj != NULL) + vm_page_release_locked(m, flags); + else + vm_page_release(m, flags); + } + if (obj != NULL) + VM_OBJECT_WUNLOCK(obj); + bp->b_npages = desiredpages; +} + +/* + * Byte granular extension of VMIO buffers. + */ +static void +vfs_vmio_extend(struct buf *bp, int desiredpages, int size) +{ + /* + * We are growing the buffer, possibly in a + * byte-granular fashion. + */ + vm_object_t obj; + vm_offset_t toff; + vm_offset_t tinc; + vm_page_t m; + + /* + * Step 1, bring in the VM pages from the object, allocating + * them if necessary. We must clear B_CACHE if these pages + * are not valid for the range covered by the buffer. + */ + obj = bp->b_bufobj->bo_object; + VM_OBJECT_WLOCK(obj); + if (bp->b_npages < desiredpages) { + /* + * We must allocate system pages since blocking + * here could interfere with paging I/O, no + * matter which process we are. + * + * Only exclusive busy can be tested here. + * Blocking on shared busy might lead to + * deadlocks once allocbuf() is called after + * pages are vfs_busy_pages(). + */ + (void)vm_page_grab_pages(obj, + OFF_TO_IDX(bp->b_offset) + bp->b_npages, + VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY | + VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, + &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages); + bp->b_npages = desiredpages; + } + + /* + * Step 2. We've loaded the pages into the buffer, + * we have to figure out if we can still have B_CACHE + * set. Note that B_CACHE is set according to the + * byte-granular range ( bcount and size ), not the + * aligned range ( newbsize ). + * + * The VM test is against m->valid, which is DEV_BSIZE + * aligned. Needless to say, the validity of the data + * needs to also be DEV_BSIZE aligned. Note that this + * fails with NFS if the server or some other client + * extends the file's EOF. If our buffer is resized, + * B_CACHE may remain set! 
XXX + */ + toff = bp->b_bcount; + tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); + while ((bp->b_flags & B_CACHE) && toff < size) { + vm_pindex_t pi; + + if (tinc > (size - toff)) + tinc = size - toff; + pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; + m = bp->b_pages[pi]; + vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); + toff += tinc; + tinc = PAGE_SIZE; + } + VM_OBJECT_WUNLOCK(obj); + + /* + * Step 3, fixup the KVA pmap. + */ + if (buf_mapped(bp)) + bpmap_qenter(bp); + else + BUF_CHECK_UNMAPPED(bp); +} + +/* + * Check to see if a block at a particular lbn is available for a clustered + * write. + */ +static int +vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) +{ + struct buf *bpa; + int match; + + match = 0; + + /* If the buf isn't in core skip it */ + if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) + return (0); + + /* If the buf is busy we don't want to wait for it */ + if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) + return (0); + + /* Only cluster with valid clusterable delayed write buffers */ + if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != + (B_DELWRI | B_CLUSTEROK)) + goto done; + + if (bpa->b_bufsize != size) + goto done; + + /* + * Check to see if it is in the expected place on disk and that the + * block has been mapped. + */ + if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) + match = 1; +done: + BUF_UNLOCK(bpa); + return (match); +} + +/* + * vfs_bio_awrite: + * + * Implement clustered async writes for clearing out B_DELWRI buffers. + * This is much better then the old way of writing only one buffer at + * a time. Note that we may not be presented with the buffers in the + * correct order, so we search for the cluster in both directions. + */ +int +vfs_bio_awrite(struct buf *bp) +{ + struct bufobj *bo; + int i; + int j; + daddr_t lblkno = bp->b_lblkno; + struct vnode *vp = bp->b_vp; + int ncl; + int nwritten; + int size; + int maxcl; + int gbflags; + + bo = &vp->v_bufobj; + gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; + /* + * right now we support clustered writing only to regular files. If + * we find a clusterable block we could be in the middle of a cluster + * rather then at the beginning. + */ + if ((vp->v_type == VREG) && + (vp->v_mount != 0) && /* Only on nodes that have the size info */ + (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { + + size = vp->v_mount->mnt_stat.f_iosize; + maxcl = MAXPHYS / size; + + BO_RLOCK(bo); + for (i = 1; i < maxcl; i++) + if (vfs_bio_clcheck(vp, size, lblkno + i, + bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) + break; + + for (j = 1; i + j <= maxcl && j <= lblkno; j++) + if (vfs_bio_clcheck(vp, size, lblkno - j, + bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) + break; + BO_RUNLOCK(bo); + --j; + ncl = i + j; + /* + * this is a possible cluster write + */ + if (ncl != 1) { + BUF_UNLOCK(bp); + nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, + gbflags); + return (nwritten); + } + } + bremfree(bp); + bp->b_flags |= B_ASYNC; + /* + * default (old) behavior, writing out only one block + * + * XXX returns b_bufsize instead of b_bcount for nwritten? + */ + nwritten = bp->b_bufsize; + (void) bwrite(bp); + + return (nwritten); +} + +/* + * getnewbuf_kva: + * + * Allocate KVA for an empty buf header according to gbflags. 
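+ *	(From the flag test below: KVA is reserved unless the caller asked
+ *	for GB_UNMAPPED without GB_KVAALLOC, i.e. a buffer that will never
+ *	be mapped; getnewbuf() asserts against the reverse combination.)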
+ */ +static int +getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) +{ + + if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { + /* + * In order to keep fragmentation sane we only allocate kva + * in BKVASIZE chunks. XXX with vmem we can do page size. + */ + maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; + + if (maxsize != bp->b_kvasize && + bufkva_alloc(bp, maxsize, gbflags)) + return (ENOSPC); + } + return (0); +} + +/* + * getnewbuf: + * + * Find and initialize a new buffer header, freeing up existing buffers + * in the bufqueues as necessary. The new buffer is returned locked. + * + * We block if: + * We have insufficient buffer headers + * We have insufficient buffer space + * buffer_arena is too fragmented ( space reservation fails ) + * If we have to flush dirty buffers ( but we try to avoid this ) + * + * The caller is responsible for releasing the reserved bufspace after + * allocbuf() is called. + */ +static struct buf * +getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) +{ + struct bufdomain *bd; + struct buf *bp; + bool metadata, reserved; + + bp = NULL; + KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, + ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); + if (!unmapped_buf_allowed) + gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); + + if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || + vp->v_type == VCHR) + metadata = true; + else + metadata = false; + if (vp == NULL) + bd = &bdomain[0]; + else + bd = &bdomain[vp->v_bufobj.bo_domain]; + + counter_u64_add(getnewbufcalls, 1); + reserved = false; + do { + if (reserved == false && + bufspace_reserve(bd, maxsize, metadata) != 0) { + counter_u64_add(getnewbufrestarts, 1); + continue; + } + reserved = true; + if ((bp = buf_alloc(bd)) == NULL) { + counter_u64_add(getnewbufrestarts, 1); + continue; + } + if (getnewbuf_kva(bp, gbflags, maxsize) == 0) + return (bp); + break; + } while (buf_recycle(bd, false) == 0); + + if (reserved) + bufspace_release(bd, maxsize); + if (bp != NULL) { + bp->b_flags |= B_INVAL; + brelse(bp); + } + bufspace_wait(bd, vp, gbflags, slpflag, slptimeo); + + return (NULL); +} + +/* + * buf_daemon: + * + * buffer flushing daemon. Buffers are normally flushed by the + * update daemon but if it cannot keep up this process starts to + * take the load in an attempt to prevent getnewbuf() from blocking. + */ +static struct kproc_desc buf_kp = { + "bufdaemon", + buf_daemon, + &bufdaemonproc +}; +SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); + +static int +buf_flush(struct vnode *vp, struct bufdomain *bd, int target) +{ + int flushed; + + flushed = flushbufqueues(vp, bd, target, 0); + if (flushed == 0) { + /* + * Could not find any buffers without rollback + * dependencies, so just write the first one + * in the hopes of eventually making progress. + */ + if (vp != NULL && target > 2) + target /= 2; + flushbufqueues(vp, bd, target, 1); + } + return (flushed); +} + +static void +buf_daemon() +{ + struct bufdomain *bd; + int speedupreq; + int lodirty; + int i; + + /* + * This process needs to be suspended prior to shutdown sync. + */ + EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread, + SHUTDOWN_PRI_LAST + 100); + + /* + * Start the buf clean daemons as children threads. 
+ */ + for (i = 0 ; i < buf_domains; i++) { + int error; + + error = kthread_add((void (*)(void *))bufspace_daemon, + &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i); + if (error) + panic("error %d spawning bufspace daemon", error); + } + + /* + * This process is allowed to take the buffer cache to the limit + */ + curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; + mtx_lock(&bdlock); + for (;;) { + bd_request = 0; + mtx_unlock(&bdlock); + + kthread_suspend_check(); + + /* + * Save speedupreq for this pass and reset to capture new + * requests. + */ + speedupreq = bd_speedupreq; + bd_speedupreq = 0; + + /* + * Flush each domain sequentially according to its level and + * the speedup request. + */ + for (i = 0; i < buf_domains; i++) { + bd = &bdomain[i]; + if (speedupreq) + lodirty = bd->bd_numdirtybuffers / 2; + else + lodirty = bd->bd_lodirtybuffers; + while (bd->bd_numdirtybuffers > lodirty) { + if (buf_flush(NULL, bd, + bd->bd_numdirtybuffers - lodirty) == 0) + break; + kern_yield(PRI_USER); + } + } + + /* + * Only clear bd_request if we have reached our low water + * mark. The buf_daemon normally waits 1 second and + * then incrementally flushes any dirty buffers that have + * built up, within reason. + * + * If we were unable to hit our low water mark and couldn't + * find any flushable buffers, we sleep for a short period + * to avoid endless loops on unlockable buffers. + */ + mtx_lock(&bdlock); + if (!BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) { + /* + * We reached our low water mark, reset the + * request and sleep until we are needed again. + * The sleep is just so the suspend code works. + */ + bd_request = 0; + /* + * Do an extra wakeup in case dirty threshold + * changed via sysctl and the explicit transition + * out of shortfall was missed. + */ + bdirtywakeup(); + if (runningbufspace <= lorunningspace) + runningwakeup(); + msleep(&bd_request, &bdlock, PVM, "psleep", hz); + } else { + /* + * We couldn't find any flushable dirty buffers but + * still have too many dirty buffers, we + * have to sleep and try again. (rare) + */ + msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); + } + } +} + +/* + * flushbufqueues: + * + * Try to flush a buffer in the dirty queue. We must be careful to + * free up B_INVAL buffers instead of write them, which NFS is + * particularly sensitive to. + */ +static int flushwithdeps = 0; +SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, + 0, "Number of buffers flushed with dependecies that require rollbacks"); + +static int +flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target, + int flushdeps) +{ + struct bufqueue *bq; + struct buf *sentinel; + struct vnode *vp; + struct mount *mp; + struct buf *bp; + int hasdeps; + int flushed; + int error; + bool unlock; + + flushed = 0; + bq = &bd->bd_dirtyq; + bp = NULL; + sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); + sentinel->b_qindex = QUEUE_SENTINEL; + BQ_LOCK(bq); + TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist); + BQ_UNLOCK(bq); + while (flushed != target) { + maybe_yield(); + BQ_LOCK(bq); + bp = TAILQ_NEXT(sentinel, b_freelist); + if (bp != NULL) { + TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); + TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel, + b_freelist); + } else { + BQ_UNLOCK(bq); + break; + } + /* + * Skip sentinels inserted by other invocations of the + * flushbufqueues(), taking care to not reorder them. + * + * Only flush the buffers that belong to the + * vnode locked by the curthread. 
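The queue walk above is easier to see in isolation: flushbufqueues() parks a private sentinel node in the dirty queue so that it can drop the queue lock while flushing an element and still resume from the right position afterwards. A single-threaded userspace sketch of the same walk using the sys/queue.h TAILQ macros (the lock drop is only indicated by a comment):

#include <sys/queue.h>
#include <stdio.h>

struct node {
        TAILQ_ENTRY(node) link;
        int val;                        /* -1 marks a sentinel */
};
TAILQ_HEAD(nodeq, node);

/*
 * Walk the queue the way flushbufqueues() does: always process the element
 * right after the sentinel, moving the sentinel past that element first.
 * Between iterations the queue lock (not shown) can be dropped, and
 * concurrent insertions or removals do not invalidate the walker's position.
 */
static void
walk_with_sentinel(struct nodeq *q)
{
        struct node sentinel = { .val = -1 };
        struct node *n;

        TAILQ_INSERT_HEAD(q, &sentinel, link);
        for (;;) {
                n = TAILQ_NEXT(&sentinel, link);
                if (n == NULL)
                        break;
                TAILQ_REMOVE(q, &sentinel, link);
                TAILQ_INSERT_AFTER(q, n, &sentinel, link);
                if (n->val == -1)
                        continue;       /* another walker's sentinel: skip */
                /* ... drop the lock and "flush" the element here ... */
                printf("visiting %d\n", n->val);
        }
        TAILQ_REMOVE(q, &sentinel, link);
}

int
main(void)
{
        struct nodeq q = TAILQ_HEAD_INITIALIZER(q);
        struct node n[4];

        for (int i = 0; i < 4; i++) {
                n[i].val = i;
                TAILQ_INSERT_TAIL(&q, &n[i], link);
        }
        walk_with_sentinel(&q);
        return (0);
}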
+ */ + if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && + bp->b_vp != lvp)) { + BQ_UNLOCK(bq); + continue; + } + error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); + BQ_UNLOCK(bq); + if (error != 0) + continue; + + /* + * BKGRDINPROG can only be set with the buf and bufobj + * locks both held. We tolerate a race to clear it here. + */ + if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || + (bp->b_flags & B_DELWRI) == 0) { + BUF_UNLOCK(bp); + continue; + } + if (bp->b_flags & B_INVAL) { + bremfreef(bp); + brelse(bp); + flushed++; + continue; + } + + if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { + if (flushdeps == 0) { + BUF_UNLOCK(bp); + continue; + } + hasdeps = 1; + } else + hasdeps = 0; + /* + * We must hold the lock on a vnode before writing + * one of its buffers. Otherwise we may confuse, or + * in the case of a snapshot vnode, deadlock the + * system. + * + * The lock order here is the reverse of the normal + * of vnode followed by buf lock. This is ok because + * the NOWAIT will prevent deadlock. + */ + vp = bp->b_vp; + if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { + BUF_UNLOCK(bp); + continue; + } + if (lvp == NULL) { + unlock = true; + error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); + } else { + ASSERT_VOP_LOCKED(vp, "getbuf"); + unlock = false; + error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : + vn_lock(vp, LK_TRYUPGRADE); + } + if (error == 0) { + CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", + bp, bp->b_vp, bp->b_flags); + if (curproc == bufdaemonproc) { + vfs_bio_awrite(bp); + } else { + bremfree(bp); + bwrite(bp); + counter_u64_add(notbufdflushes, 1); + } + vn_finished_write(mp); + if (unlock) + VOP_UNLOCK(vp, 0); + flushwithdeps += hasdeps; + flushed++; + + /* + * Sleeping on runningbufspace while holding + * vnode lock leads to deadlock. + */ + if (curproc == bufdaemonproc && + runningbufspace > hirunningspace) + waitrunningbufspace(); + continue; + } + vn_finished_write(mp); + BUF_UNLOCK(bp); + } + BQ_LOCK(bq); + TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); + BQ_UNLOCK(bq); + free(sentinel, M_TEMP); + return (flushed); +} + +/* + * Check to see if a block is currently memory resident. + */ +struct buf * +incore(struct bufobj *bo, daddr_t blkno) +{ + struct buf *bp; + + BO_RLOCK(bo); + bp = gbincore(bo, blkno); + BO_RUNLOCK(bo); + return (bp); +} + +/* + * Returns true if no I/O is needed to access the + * associated VM object. This is like incore except + * it also hunts around in the VM system for the data. 
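inmem(), defined next, probes a file-system block page by page, clamping each probe to the remaining bytes in the current page. The arithmetic is easier to follow standalone; the sketch below just prints the per-page ranges that would be tested, assuming 4 KB pages and made-up offsets.

#include <stdio.h>

#define PAGE_SIZE       4096
#define PAGE_MASK       (PAGE_SIZE - 1)

/*
 * Walk a file-system block of `bsize' bytes starting at file offset `off'
 * and print, for each page it touches, the page index and the in-page
 * range that would be tested for validity.
 */
static void
show_page_ranges(long long off, int bsize)
{
        long long toff;
        int tinc;

        for (toff = 0; toff < bsize; toff += tinc) {
                tinc = bsize - toff;
                if (tinc > PAGE_SIZE - (int)((toff + off) & PAGE_MASK))
                        tinc = PAGE_SIZE - (int)((toff + off) & PAGE_MASK);
                printf("page %lld: offset %lld, length %d\n",
                    (off + toff) / PAGE_SIZE, (off + toff) & PAGE_MASK, tinc);
        }
}

int
main(void)
{
        /* a 16 KB block that starts 1 KB into page 7 (hypothetical numbers) */
        show_page_ranges(7 * 4096 + 1024, 16384);
        return (0);
}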
+ */ + +static int +inmem(struct vnode * vp, daddr_t blkno) +{ + vm_object_t obj; + vm_offset_t toff, tinc, size; + vm_page_t m; + vm_ooffset_t off; + + ASSERT_VOP_LOCKED(vp, "inmem"); + + if (incore(&vp->v_bufobj, blkno)) + return 1; + if (vp->v_mount == NULL) + return 0; + obj = vp->v_object; + if (obj == NULL) + return (0); + + size = PAGE_SIZE; + if (size > vp->v_mount->mnt_stat.f_iosize) + size = vp->v_mount->mnt_stat.f_iosize; + off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; + + VM_OBJECT_RLOCK(obj); + for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { + m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); + if (!m) + goto notinmem; + tinc = size; + if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) + tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); + if (vm_page_is_valid(m, + (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) + goto notinmem; + } + VM_OBJECT_RUNLOCK(obj); + return 1; + +notinmem: + VM_OBJECT_RUNLOCK(obj); + return (0); +} + +/* + * Set the dirty range for a buffer based on the status of the dirty + * bits in the pages comprising the buffer. The range is limited + * to the size of the buffer. + * + * Tell the VM system that the pages associated with this buffer + * are clean. This is used for delayed writes where the data is + * going to go to disk eventually without additional VM intevention. + * + * Note that while we only really need to clean through to b_bcount, we + * just go ahead and clean through to b_bufsize. + */ +static void +vfs_clean_pages_dirty_buf(struct buf *bp) +{ + vm_ooffset_t foff, noff, eoff; + vm_page_t m; + int i; + + if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) + return; + + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_clean_pages_dirty_buf: no buffer offset")); + + VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + vfs_drain_busy_pages(bp); + vfs_setdirty_locked_object(bp); + for (i = 0; i < bp->b_npages; i++) { + noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + eoff = noff; + if (eoff > bp->b_offset + bp->b_bufsize) + eoff = bp->b_offset + bp->b_bufsize; + m = bp->b_pages[i]; + vfs_page_set_validclean(bp, foff, m); + /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ + foff = noff; + } + VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); +} + +static void +vfs_setdirty_locked_object(struct buf *bp) +{ + vm_object_t object; + int i; + + object = bp->b_bufobj->bo_object; + VM_OBJECT_ASSERT_WLOCKED(object); + + /* + * We qualify the scan for modified pages on whether the + * object has been flushed yet. + */ + if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { + vm_offset_t boffset; + vm_offset_t eoffset; + + /* + * test the pages to see if they have been modified directly + * by users through the VM system. + */ + for (i = 0; i < bp->b_npages; i++) + vm_page_test_dirty(bp->b_pages[i]); + + /* + * Calculate the encompassing dirty range, boffset and eoffset, + * (eoffset - boffset) bytes. + */ + + for (i = 0; i < bp->b_npages; i++) { + if (bp->b_pages[i]->dirty) + break; + } + boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + + for (i = bp->b_npages - 1; i >= 0; --i) { + if (bp->b_pages[i]->dirty) { + break; + } + } + eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + + /* + * Fit it to the buffer. + */ + + if (eoffset > bp->b_bcount) + eoffset = bp->b_bcount; + + /* + * If we have a good dirty range, merge with the existing + * dirty range. 
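Outside the kernel context, the computation this function performs is: find the first and last dirty page, convert the page indices to byte offsets relative to the buffer, clip to the buffer size and merge with the previously recorded range. A self-contained sketch with a plain array of per-page dirty flags (4 KB pages assumed; pgoff plays the role of b_offset & PAGE_MASK):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1 << PAGE_SHIFT)

static void
merge_dirty_range(const bool *dirty, int npages, int pgoff, int bcount,
    int *dirtyoff, int *dirtyend)
{
        int i, boffset, eoffset;

        /* First dirty page gives the start of the range. */
        for (i = 0; i < npages; i++)
                if (dirty[i])
                        break;
        boffset = (i << PAGE_SHIFT) - pgoff;

        /* Last dirty page gives the end of the range. */
        for (i = npages - 1; i >= 0; i--)
                if (dirty[i])
                        break;
        eoffset = ((i + 1) << PAGE_SHIFT) - pgoff;

        /* Fit to the buffer and merge with the existing range. */
        if (eoffset > bcount)
                eoffset = bcount;
        if (boffset < eoffset) {
                if (*dirtyoff > boffset)
                        *dirtyoff = boffset;
                if (*dirtyend < eoffset)
                        *dirtyend = eoffset;
        }
}

int
main(void)
{
        /* 4-page buffer starting 512 bytes into its first page */
        bool dirty[4] = { false, true, true, false };
        int dirtyoff = 1 << 30, dirtyend = 0;   /* "empty" existing range */

        merge_dirty_range(dirty, 4, 512, 3 * PAGE_SIZE, &dirtyoff, &dirtyend);
        printf("dirty range [%d, %d)\n", dirtyoff, dirtyend);
        return (0);
}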
+ */ + + if (boffset < eoffset) { + if (bp->b_dirtyoff > boffset) + bp->b_dirtyoff = boffset; + if (bp->b_dirtyend < eoffset) + bp->b_dirtyend = eoffset; + } + } +} + +/* + * Allocate the KVA mapping for an existing buffer. + * If an unmapped buffer is provided but a mapped buffer is requested, take + * also care to properly setup mappings between pages and KVA. + */ +static void +bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) +{ + int bsize, maxsize, need_mapping, need_kva; + off_t offset; + + need_mapping = bp->b_data == unmapped_buf && + (gbflags & GB_UNMAPPED) == 0; + need_kva = bp->b_kvabase == unmapped_buf && + bp->b_data == unmapped_buf && + (gbflags & GB_KVAALLOC) != 0; + if (!need_mapping && !need_kva) + return; + + BUF_CHECK_UNMAPPED(bp); + + if (need_mapping && bp->b_kvabase != unmapped_buf) { + /* + * Buffer is not mapped, but the KVA was already + * reserved at the time of the instantiation. Use the + * allocated space. + */ + goto has_addr; + } + + /* + * Calculate the amount of the address space we would reserve + * if the buffer was mapped. + */ + bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; + KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); + offset = blkno * bsize; + maxsize = size + (offset & PAGE_MASK); + maxsize = imax(maxsize, bsize); + + while (bufkva_alloc(bp, maxsize, gbflags) != 0) { + if ((gbflags & GB_NOWAIT_BD) != 0) { + /* + * XXXKIB: defragmentation cannot + * succeed, not sure what else to do. + */ + panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); + } + counter_u64_add(mappingrestarts, 1); + bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0); + } +has_addr: + if (need_mapping) { + /* b_offset is handled by bpmap_qenter. */ + bp->b_data = bp->b_kvabase; + BUF_CHECK_MAPPED(bp); + bpmap_qenter(bp); + } +} + +struct buf * +getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, + int flags) +{ + struct buf *bp; + int error; + + error = getblkx(vp, blkno, size, slpflag, slptimeo, flags, &bp); + if (error != 0) + return (NULL); + return (bp); +} + +/* + * getblkx: + * + * Get a block given a specified block and offset into a file/device. + * The buffers B_DONE bit will be cleared on return, making it almost + * ready for an I/O initiation. B_INVAL may or may not be set on + * return. The caller should clear B_INVAL prior to initiating a + * READ. + * + * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for + * an existing buffer. + * + * For a VMIO buffer, B_CACHE is modified according to the backing VM. + * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set + * and then cleared based on the backing VM. If the previous buffer is + * non-0-sized but invalid, B_CACHE will be cleared. + * + * If getblk() must create a new buffer, the new buffer is returned with + * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which + * case it is returned with B_INVAL clear and B_CACHE set based on the + * backing VM. + * + * getblk() also forces a bwrite() for any B_DELWRI buffer whos + * B_CACHE bit is clear. + * + * What this means, basically, is that the caller should use B_CACHE to + * determine whether the buffer is fully valid or not and should clear + * B_INVAL prior to issuing a read. If the caller intends to validate + * the buffer by loading its data area with something, the caller needs + * to clear B_INVAL. 
If the caller does this without issuing an I/O, + * the caller should set B_CACHE ( as an optimization ), else the caller + * should issue the I/O and biodone() will set B_CACHE if the I/O was + * a write attempt or if it was a successful read. If the caller + * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR + * prior to issuing the READ. biodone() will *not* clear B_INVAL. + */ +int +getblkx(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, + int flags, struct buf **bpp) +{ + struct buf *bp; + struct bufobj *bo; + daddr_t d_blkno; + int bsize, error, maxsize, vmio; + off_t offset; + + CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); + KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, + ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); + ASSERT_VOP_LOCKED(vp, "getblk"); + if (size > maxbcachebuf) + panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, + maxbcachebuf); + if (!unmapped_buf_allowed) + flags &= ~(GB_UNMAPPED | GB_KVAALLOC); + + bo = &vp->v_bufobj; + d_blkno = blkno; +loop: + BO_RLOCK(bo); + bp = gbincore(bo, blkno); + if (bp != NULL) { + int lockflags; + /* + * Buffer is in-core. If the buffer is not busy nor managed, + * it must be on a queue. + */ + lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; + + if ((flags & GB_LOCK_NOWAIT) != 0) + lockflags |= LK_NOWAIT; + + error = BUF_TIMELOCK(bp, lockflags, + BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); + + /* + * If we slept and got the lock we have to restart in case + * the buffer changed identities. + */ + if (error == ENOLCK) + goto loop; + /* We timed out or were interrupted. */ + else if (error != 0) + return (error); + /* If recursed, assume caller knows the rules. */ + else if (BUF_LOCKRECURSED(bp)) + goto end; + + /* + * The buffer is locked. B_CACHE is cleared if the buffer is + * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set + * and for a VMIO buffer B_CACHE is adjusted according to the + * backing VM cache. + */ + if (bp->b_flags & B_INVAL) + bp->b_flags &= ~B_CACHE; + else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) + bp->b_flags |= B_CACHE; + if (bp->b_flags & B_MANAGED) + MPASS(bp->b_qindex == QUEUE_NONE); + else + bremfree(bp); + + /* + * check for size inconsistencies for non-VMIO case. + */ + if (bp->b_bcount != size) { + if ((bp->b_flags & B_VMIO) == 0 || + (size > bp->b_kvasize)) { + if (bp->b_flags & B_DELWRI) { + bp->b_flags |= B_NOCACHE; + bwrite(bp); + } else { + if (LIST_EMPTY(&bp->b_dep)) { + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + bp->b_flags |= B_NOCACHE; + bwrite(bp); + } + } + goto loop; + } + } + + /* + * Handle the case of unmapped buffer which should + * become mapped, or the buffer for which KVA + * reservation is requested. + */ + bp_unmapped_get_kva(bp, blkno, size, flags); + + /* + * If the size is inconsistent in the VMIO case, we can resize + * the buffer. This might lead to B_CACHE getting set or + * cleared. If the size has not changed, B_CACHE remains + * unchanged from its previous state. + */ + allocbuf(bp, size); + + KASSERT(bp->b_offset != NOOFFSET, + ("getblk: no buffer offset")); + + /* + * A buffer with B_DELWRI set and B_CACHE clear must + * be committed before we can return the buffer in + * order to prevent the caller from issuing a read + * ( due to B_CACHE not being set ) and overwriting + * it. 
+ * + * Most callers, including NFS and FFS, need this to + * operate properly either because they assume they + * can issue a read if B_CACHE is not set, or because + * ( for example ) an uncached B_DELWRI might loop due + * to softupdates re-dirtying the buffer. In the latter + * case, B_CACHE is set after the first write completes, + * preventing further loops. + * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE + * above while extending the buffer, we cannot allow the + * buffer to remain with B_CACHE set after the write + * completes or it will represent a corrupt state. To + * deal with this we set B_NOCACHE to scrap the buffer + * after the write. + * + * We might be able to do something fancy, like setting + * B_CACHE in bwrite() except if B_DELWRI is already set, + * so the below call doesn't set B_CACHE, but that gets real + * confusing. This is much easier. + */ + + if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { + bp->b_flags |= B_NOCACHE; + bwrite(bp); + goto loop; + } + bp->b_flags &= ~B_DONE; + } else { + /* + * Buffer is not in-core, create new buffer. The buffer + * returned by getnewbuf() is locked. Note that the returned + * buffer is also considered valid (not marked B_INVAL). + */ + BO_RUNLOCK(bo); + /* + * If the user does not want us to create the buffer, bail out + * here. + */ + if (flags & GB_NOCREAT) + return (EEXIST); + + bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; + KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); + offset = blkno * bsize; + vmio = vp->v_object != NULL; + if (vmio) { + maxsize = size + (offset & PAGE_MASK); + } else { + maxsize = size; + /* Do not allow non-VMIO notmapped buffers. */ + flags &= ~(GB_UNMAPPED | GB_KVAALLOC); + } + maxsize = imax(maxsize, bsize); + if ((flags & GB_NOSPARSE) != 0 && vmio && + !vn_isdisk(vp, NULL)) { + error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0); + KASSERT(error != EOPNOTSUPP, + ("GB_NOSPARSE from fs not supporting bmap, vp %p", + vp)); + if (error != 0) + return (error); + if (d_blkno == -1) + return (EJUSTRETURN); + } + + bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); + if (bp == NULL) { + if (slpflag || slptimeo) + return (ETIMEDOUT); + /* + * XXX This is here until the sleep path is diagnosed + * enough to work under very low memory conditions. + * + * There's an issue on low memory, 4BSD+non-preempt + * systems (eg MIPS routers with 32MB RAM) where buffer + * exhaustion occurs without sleeping for buffer + * reclaimation. This just sticks in a loop and + * constantly attempts to allocate a buffer, which + * hits exhaustion and tries to wakeup bufdaemon. + * This never happens because we never yield. + * + * The real solution is to identify and fix these cases + * so we aren't effectively busy-waiting in a loop + * until the reclaimation path has cycles to run. + */ + kern_yield(PRI_USER); + goto loop; + } + + /* + * This code is used to make sure that a buffer is not + * created while the getnewbuf routine is blocked. + * This can be a problem whether the vnode is locked or not. + * If the buffer is created out from under us, we have to + * throw away the one we just created. + * + * Note: this must occur before we associate the buffer + * with the vp especially considering limitations in + * the splay tree implementation when dealing with duplicate + * lblkno's. 
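The pattern described here, and implemented just below, is a common one: look up without holding the lock across a potentially sleeping allocation, then re-check under the lock and discard the new object if another thread created one in the meantime. A self-contained userspace sketch, with a toy table standing in for gbincore()/bgetvp():

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NSLOTS  64

/* A toy "is it in core?" table keyed by logical block number. */
static void *slot[NSLOTS];
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * The shape of getblk()'s creation path: look up, allocate without the
 * lock held (the allocation may sleep), then re-check under the lock and
 * throw the new object away if another thread created one meanwhile.
 */
static void *
get_or_create(int lbn)
{
        void *newp, *p;

        pthread_mutex_lock(&slot_lock);
        p = slot[lbn % NSLOTS];
        pthread_mutex_unlock(&slot_lock);
        if (p != NULL)
                return (p);

        newp = malloc(64);              /* may sleep; lock is not held */

        pthread_mutex_lock(&slot_lock);
        p = slot[lbn % NSLOTS];
        if (p != NULL) {
                /* Lost the race: discard ours, use the winner's. */
                pthread_mutex_unlock(&slot_lock);
                free(newp);
                return (p);
        }
        slot[lbn % NSLOTS] = newp;
        pthread_mutex_unlock(&slot_lock);
        return (newp);
}

int
main(void)
{
        printf("%p\n", get_or_create(7));
        printf("%p\n", get_or_create(7));       /* same object both times */
        return (0);
}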
+ */ + BO_LOCK(bo); + if (gbincore(bo, blkno)) { + BO_UNLOCK(bo); + bp->b_flags |= B_INVAL; + bufspace_release(bufdomain(bp), maxsize); + brelse(bp); + goto loop; + } + + /* + * Insert the buffer into the hash, so that it can + * be found by incore. + */ + bp->b_lblkno = blkno; + bp->b_blkno = d_blkno; + bp->b_offset = offset; + bgetvp(vp, bp); + BO_UNLOCK(bo); + + /* + * set B_VMIO bit. allocbuf() the buffer bigger. Since the + * buffer size starts out as 0, B_CACHE will be set by + * allocbuf() for the VMIO case prior to it testing the + * backing store for validity. + */ + + if (vmio) { + bp->b_flags |= B_VMIO; + KASSERT(vp->v_object == bp->b_bufobj->bo_object, + ("ARGH! different b_bufobj->bo_object %p %p %p\n", + bp, vp->v_object, bp->b_bufobj->bo_object)); + } else { + bp->b_flags &= ~B_VMIO; + KASSERT(bp->b_bufobj->bo_object == NULL, + ("ARGH! has b_bufobj->bo_object %p %p\n", + bp, bp->b_bufobj->bo_object)); + BUF_CHECK_MAPPED(bp); + } + + allocbuf(bp, size); + bufspace_release(bufdomain(bp), maxsize); + bp->b_flags &= ~B_DONE; + } + CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); + BUF_ASSERT_HELD(bp); +end: + buf_track(bp, __func__); + KASSERT(bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); + *bpp = bp; + return (0); +} + +/* + * Get an empty, disassociated buffer of given size. The buffer is initially + * set to B_INVAL. + */ +struct buf * +geteblk(int size, int flags) +{ + struct buf *bp; + int maxsize; + + maxsize = (size + BKVAMASK) & ~BKVAMASK; + while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { + if ((flags & GB_NOWAIT_BD) && + (curthread->td_pflags & TDP_BUFNEED) != 0) + return (NULL); + } + allocbuf(bp, size); + bufspace_release(bufdomain(bp), maxsize); + bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ + BUF_ASSERT_HELD(bp); + return (bp); +} + +/* + * Truncate the backing store for a non-vmio buffer. + */ +static void +vfs_nonvmio_truncate(struct buf *bp, int newbsize) +{ + + if (bp->b_flags & B_MALLOC) { + /* + * malloced buffers are not shrunk + */ + if (newbsize == 0) { + bufmallocadjust(bp, 0); + free(bp->b_data, M_BIOBUF); + bp->b_data = bp->b_kvabase; + bp->b_flags &= ~B_MALLOC; + } + return; + } + vm_hold_free_pages(bp, newbsize); + bufspace_adjust(bp, newbsize); +} + +/* + * Extend the backing for a non-VMIO buffer. + */ +static void +vfs_nonvmio_extend(struct buf *bp, int newbsize) +{ + caddr_t origbuf; + int origbufsize; + + /* + * We only use malloced memory on the first allocation. + * and revert to page-allocated memory when the buffer + * grows. + * + * There is a potential smp race here that could lead + * to bufmallocspace slightly passing the max. It + * is probably extremely rare and not worth worrying + * over. + */ + if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && + bufmallocspace < maxbufmallocspace) { + bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); + bp->b_flags |= B_MALLOC; + bufmallocadjust(bp, newbsize); + return; + } + + /* + * If the buffer is growing on its other-than-first + * allocation then we revert to the page-allocation + * scheme. 
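A userspace sketch of the same growth policy: the first, small allocation comes from malloc(), and growing past that switches to page-granular backing, copying the old contents over. (The kernel grows already page-backed buffers in place with vm_hold_load_pages(); the copy below is a toy shortcut, and the types and limits are hypothetical.)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE       4096
#define SMALL_MAX       (PAGE_SIZE / 2)

struct tbuf {
        char    *data;
        size_t   size;
        int      is_malloced;           /* analogue of B_MALLOC */
};

static int
tbuf_extend(struct tbuf *b, size_t newsize)
{
        void *p;

        /* First, small allocation: plain malloc(), like B_MALLOC buffers. */
        if (b->size == 0 && newsize <= SMALL_MAX) {
                if ((b->data = malloc(newsize)) == NULL)
                        return (-1);
                b->size = newsize;
                b->is_malloced = 1;
                return (0);
        }

        /* Growing: switch to page-granular backing, copy the old contents. */
        newsize = (newsize + PAGE_SIZE - 1) & ~(size_t)(PAGE_SIZE - 1);
        if (posix_memalign(&p, PAGE_SIZE, newsize) != 0)
                return (-1);
        if (b->size != 0) {
                memcpy(p, b->data, b->size);
                free(b->data);
        }
        b->data = p;
        b->size = newsize;
        b->is_malloced = 0;
        return (0);
}

int
main(void)
{
        struct tbuf b = { NULL, 0, 0 };

        tbuf_extend(&b, 512);           /* small: malloc-backed */
        printf("after 512: size %zu, malloced %d\n", b.size, b.is_malloced);
        tbuf_extend(&b, 3 * PAGE_SIZE); /* growth: switches to pages */
        printf("after grow: size %zu, malloced %d\n", b.size, b.is_malloced);
        free(b.data);
        return (0);
}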
+ */ + origbuf = NULL; + origbufsize = 0; + if (bp->b_flags & B_MALLOC) { + origbuf = bp->b_data; + origbufsize = bp->b_bufsize; + bp->b_data = bp->b_kvabase; + bufmallocadjust(bp, 0); + bp->b_flags &= ~B_MALLOC; + newbsize = round_page(newbsize); + } + vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, + (vm_offset_t) bp->b_data + newbsize); + if (origbuf != NULL) { + bcopy(origbuf, bp->b_data, origbufsize); + free(origbuf, M_BIOBUF); + } + bufspace_adjust(bp, newbsize); +} + +/* + * This code constitutes the buffer memory from either anonymous system + * memory (in the case of non-VMIO operations) or from an associated + * VM object (in the case of VMIO operations). This code is able to + * resize a buffer up or down. + * + * Note that this code is tricky, and has many complications to resolve + * deadlock or inconsistent data situations. Tread lightly!!! + * There are B_CACHE and B_DELWRI interactions that must be dealt with by + * the caller. Calling this code willy nilly can result in the loss of data. + * + * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with + * B_CACHE for the non-VMIO case. + */ +int +allocbuf(struct buf *bp, int size) +{ + int newbsize; + + BUF_ASSERT_HELD(bp); + + if (bp->b_bcount == size) + return (1); + + if (bp->b_kvasize != 0 && bp->b_kvasize < size) + panic("allocbuf: buffer too small"); + + newbsize = roundup2(size, DEV_BSIZE); + if ((bp->b_flags & B_VMIO) == 0) { + if ((bp->b_flags & B_MALLOC) == 0) + newbsize = round_page(newbsize); + /* + * Just get anonymous memory from the kernel. Don't + * mess with B_CACHE. + */ + if (newbsize < bp->b_bufsize) + vfs_nonvmio_truncate(bp, newbsize); + else if (newbsize > bp->b_bufsize) + vfs_nonvmio_extend(bp, newbsize); + } else { + int desiredpages; + + desiredpages = (size == 0) ? 0 : + num_pages((bp->b_offset & PAGE_MASK) + newbsize); + + if (bp->b_flags & B_MALLOC) + panic("allocbuf: VMIO buffer can't be malloced"); + /* + * Set B_CACHE initially if buffer is 0 length or will become + * 0-length. + */ + if (size == 0 || bp->b_bufsize == 0) + bp->b_flags |= B_CACHE; + + if (newbsize < bp->b_bufsize) + vfs_vmio_truncate(bp, desiredpages); + /* XXX This looks as if it should be newbsize > b_bufsize */ + else if (size > bp->b_bcount) + vfs_vmio_extend(bp, desiredpages, size); + bufspace_adjust(bp, newbsize); + } + bp->b_bcount = size; /* requested buffer size. */ + return (1); +} + +extern int inflight_transient_maps; + +static struct bio_queue nondump_bios; + +void +biodone(struct bio *bp) +{ + struct mtx *mtxp; + void (*done)(struct bio *); + vm_offset_t start, end; + + biotrack(bp, __func__); + + /* + * Avoid completing I/O when dumping after a panic since that may + * result in a deadlock in the filesystem or pager code. Note that + * this doesn't affect dumps that were started manually since we aim + * to keep the system usable after it has been resumed. 
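The completion handshake that biodone() and biowait() implement below, either run the submitter's callback or set a DONE flag and wake any sleeper, can be sketched in userspace with a mutex and condition variable; the kernel instead sleeps and wakes on the bio address using a pooled mutex, and the toy_* names are hypothetical.

#include <pthread.h>
#include <stdio.h>

struct toy_bio {
        int              done;
        void            (*cb)(struct toy_bio *);
        pthread_mutex_t  mtx;
        pthread_cond_t   cv;
};

static void
toy_biodone(struct toy_bio *b)
{
        if (b->cb != NULL) {
                b->cb(b);               /* async consumer: no wakeup needed */
                return;
        }
        pthread_mutex_lock(&b->mtx);
        b->done = 1;
        pthread_cond_broadcast(&b->cv);
        pthread_mutex_unlock(&b->mtx);
}

static void
toy_biowait(struct toy_bio *b)
{
        pthread_mutex_lock(&b->mtx);
        while (!b->done)
                pthread_cond_wait(&b->cv, &b->mtx);
        pthread_mutex_unlock(&b->mtx);
}

static void *
completer(void *arg)
{
        toy_biodone(arg);
        return (NULL);
}

static struct toy_bio req = {
        .mtx = PTHREAD_MUTEX_INITIALIZER,
        .cv = PTHREAD_COND_INITIALIZER,
};

int
main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, completer, &req);
        toy_biowait(&req);              /* blocks until toy_biodone() runs */
        pthread_join(t, NULL);
        printf("request completed\n");
        return (0);
}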
+ */ + if (__predict_false(dumping && SCHEDULER_STOPPED())) { + TAILQ_INSERT_HEAD(&nondump_bios, bp, bio_queue); + return; + } + if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { + bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; + bp->bio_flags |= BIO_UNMAPPED; + start = trunc_page((vm_offset_t)bp->bio_data); + end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); + bp->bio_data = unmapped_buf; + pmap_qremove(start, atop(end - start)); + vmem_free(transient_arena, start, end - start); + atomic_add_int(&inflight_transient_maps, -1); + } + done = bp->bio_done; + if (done == NULL) { + mtxp = mtx_pool_find(mtxpool_sleep, bp); + mtx_lock(mtxp); + bp->bio_flags |= BIO_DONE; + wakeup(bp); + mtx_unlock(mtxp); + } else + done(bp); +} + +/* + * Wait for a BIO to finish. + */ +int +biowait(struct bio *bp, const char *wchan) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, bp); + mtx_lock(mtxp); + while ((bp->bio_flags & BIO_DONE) == 0) + msleep(bp, mtxp, PRIBIO, wchan, 0); + mtx_unlock(mtxp); + if (bp->bio_error != 0) + return (bp->bio_error); + if (!(bp->bio_flags & BIO_ERROR)) + return (0); + return (EIO); +} + +void +biofinish(struct bio *bp, struct devstat *stat, int error) +{ + + if (error) { + bp->bio_error = error; + bp->bio_flags |= BIO_ERROR; + } + if (stat != NULL) + devstat_end_transaction_bio(stat, bp); + biodone(bp); +} + +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) +void +biotrack_buf(struct bio *bp, const char *location) +{ + + buf_track(bp->bio_track_bp, location); +} +#endif + +/* + * bufwait: + * + * Wait for buffer I/O completion, returning error status. The buffer + * is left locked and B_DONE on return. B_EINTR is converted into an EINTR + * error and cleared. + */ +int +bufwait(struct buf *bp) +{ + if (bp->b_iocmd == BIO_READ) + bwait(bp, PRIBIO, "biord"); + else + bwait(bp, PRIBIO, "biowr"); + if (bp->b_flags & B_EINTR) { + bp->b_flags &= ~B_EINTR; + return (EINTR); + } + if (bp->b_ioflags & BIO_ERROR) { + return (bp->b_error ? bp->b_error : EIO); + } else { + return (0); + } +} + +/* + * bufdone: + * + * Finish I/O on a buffer, optionally calling a completion function. + * This is usually called from an interrupt so process blocking is + * not allowed. + * + * biodone is also responsible for setting B_CACHE in a B_VMIO bp. + * In a non-VMIO bp, B_CACHE will be set on the next getblk() + * assuming B_INVAL is clear. + * + * For the VMIO case, we set B_CACHE if the op was a read and no + * read error occurred, or if the op was a write. B_CACHE is never + * set if the buffer is invalid or otherwise uncacheable. + * + * bufdone does not mess with B_INVAL, allowing the I/O routine or the + * initiator to leave B_INVAL set to brelse the buffer out of existence + * in the biodone routine. + */ +void +bufdone(struct buf *bp) +{ + struct bufobj *dropobj; + void (*biodone)(struct buf *); + + buf_track(bp, __func__); + CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + dropobj = NULL; + + KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); + BUF_ASSERT_HELD(bp); + + runningbufwakeup(bp); + if (bp->b_iocmd == BIO_WRITE) + dropobj = bp->b_bufobj; + /* call optional completion function if requested */ + if (bp->b_iodone != NULL) { + biodone = bp->b_iodone; + bp->b_iodone = NULL; + (*biodone) (bp); + if (dropobj) + bufobj_wdrop(dropobj); + return; + } + if (bp->b_flags & B_VMIO) { + /* + * Set B_CACHE if the op was a normal read and no error + * occurred. B_CACHE is set for writes in the b*write() + * routines. 
+ */ + if (bp->b_iocmd == BIO_READ && + !(bp->b_flags & (B_INVAL|B_NOCACHE)) && + !(bp->b_ioflags & BIO_ERROR)) + bp->b_flags |= B_CACHE; + vfs_vmio_iodone(bp); + } + if (!LIST_EMPTY(&bp->b_dep)) + buf_complete(bp); + if ((bp->b_flags & B_CKHASH) != 0) { + KASSERT(bp->b_iocmd == BIO_READ, + ("bufdone: b_iocmd %d not BIO_READ", bp->b_iocmd)); + KASSERT(buf_mapped(bp), ("bufdone: bp %p not mapped", bp)); + (*bp->b_ckhashcalc)(bp); + } + /* + * For asynchronous completions, release the buffer now. The brelse + * will do a wakeup there if necessary - so no need to do a wakeup + * here in the async case. The sync case always needs to do a wakeup. + */ + if (bp->b_flags & B_ASYNC) { + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || + (bp->b_ioflags & BIO_ERROR)) + brelse(bp); + else + bqrelse(bp); + } else + bdone(bp); + if (dropobj) + bufobj_wdrop(dropobj); +} + +/* + * This routine is called in lieu of iodone in the case of + * incomplete I/O. This keeps the busy status for pages + * consistent. + */ +void +vfs_unbusy_pages(struct buf *bp) +{ + int i; + vm_object_t obj; + vm_page_t m; + + runningbufwakeup(bp); + if (!(bp->b_flags & B_VMIO)) + return; + + obj = bp->b_bufobj->bo_object; + VM_OBJECT_WLOCK(obj); + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + if (m == bogus_page) { + m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); + if (!m) + panic("vfs_unbusy_pages: page missing\n"); + bp->b_pages[i] = m; + if (buf_mapped(bp)) { + BUF_CHECK_MAPPED(bp); + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), + bp->b_pages, bp->b_npages); + } else + BUF_CHECK_UNMAPPED(bp); + } + vm_page_sunbusy(m); + } + vm_object_pip_wakeupn(obj, bp->b_npages); + VM_OBJECT_WUNLOCK(obj); +} + +/* + * vfs_page_set_valid: + * + * Set the valid bits in a page based on the supplied offset. The + * range is restricted to the buffer's size. + * + * This routine is typically called after a read completes. + */ +static void +vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) +{ + vm_ooffset_t eoff; + + /* + * Compute the end offset, eoff, such that [off, eoff) does not span a + * page boundary and eoff is not greater than the end of the buffer. + * The end of the buffer, in this case, is our file EOF, not the + * allocation size of the buffer. + */ + eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; + if (eoff > bp->b_offset + bp->b_bcount) + eoff = bp->b_offset + bp->b_bcount; + + /* + * Set valid range. This is typically the entire buffer and thus the + * entire page. + */ + if (eoff > off) + vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); +} + +/* + * vfs_page_set_validclean: + * + * Set the valid bits and clear the dirty bits in a page based on the + * supplied offset. The range is restricted to the buffer's size. + */ +static void +vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) +{ + vm_ooffset_t soff, eoff; + + /* + * Start and end offsets in buffer. eoff - soff may not cross a + * page boundary or cross the end of the buffer. The end of the + * buffer, in this case, is our file EOF, not the allocation size + * of the buffer. + */ + soff = off; + eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; + if (eoff > bp->b_offset + bp->b_bcount) + eoff = bp->b_offset + bp->b_bcount; + + /* + * Set valid range. This is typically the entire buffer and thus the + * entire page. 
+ */ + if (eoff > soff) { + vm_page_set_validclean( + m, + (vm_offset_t) (soff & PAGE_MASK), + (vm_offset_t) (eoff - soff) + ); + } +} + +/* + * Ensure that all buffer pages are not exclusive busied. If any page is + * exclusive busy, drain it. + */ +void +vfs_drain_busy_pages(struct buf *bp) +{ + vm_page_t m; + int i, last_busied; + + VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); + last_busied = 0; + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + if (vm_page_xbusied(m)) { + for (; last_busied < i; last_busied++) + vm_page_sbusy(bp->b_pages[last_busied]); + while (vm_page_xbusied(m)) { + vm_page_lock(m); + VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); + vm_page_busy_sleep(m, "vbpage", true); + VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + } + } + } + for (i = 0; i < last_busied; i++) + vm_page_sunbusy(bp->b_pages[i]); +} + +/* + * This routine is called before a device strategy routine. + * It is used to tell the VM system that paging I/O is in + * progress, and treat the pages associated with the buffer + * almost as being exclusive busy. Also the object paging_in_progress + * flag is handled to make sure that the object doesn't become + * inconsistent. + * + * Since I/O has not been initiated yet, certain buffer flags + * such as BIO_ERROR or B_INVAL may be in an inconsistent state + * and should be ignored. + */ +void +vfs_busy_pages(struct buf *bp, int clear_modify) +{ + vm_object_t obj; + vm_ooffset_t foff; + vm_page_t m; + int i; + bool bogus; + + if (!(bp->b_flags & B_VMIO)) + return; + + obj = bp->b_bufobj->bo_object; + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_busy_pages: no buffer offset")); + VM_OBJECT_WLOCK(obj); + vfs_drain_busy_pages(bp); + if (bp->b_bufsize != 0) + vfs_setdirty_locked_object(bp); + bogus = false; + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + + if ((bp->b_flags & B_CLUSTER) == 0) { + vm_object_pip_add(obj, 1); + vm_page_sbusy(m); + } + /* + * When readying a buffer for a read ( i.e + * clear_modify == 0 ), it is important to do + * bogus_page replacement for valid pages in + * partially instantiated buffers. Partially + * instantiated buffers can, in turn, occur when + * reconstituting a buffer from its VM backing store + * base. We only have to do this if B_CACHE is + * clear ( which causes the I/O to occur in the + * first place ). The replacement prevents the read + * I/O from overwriting potentially dirty VM-backed + * pages. XXX bogus page replacement is, uh, bogus. + * It may not work properly with small-block devices. + * We need to find a better way. + */ + if (clear_modify) { + pmap_remove_write(m); + vfs_page_set_validclean(bp, foff, m); + } else if (m->valid == VM_PAGE_BITS_ALL && + (bp->b_flags & B_CACHE) == 0) { + bp->b_pages[i] = bogus_page; + bogus = true; + } + foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; + } + VM_OBJECT_WUNLOCK(obj); + if (bogus && buf_mapped(bp)) { + BUF_CHECK_MAPPED(bp); + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), + bp->b_pages, bp->b_npages); + } +} + +/* + * vfs_bio_set_valid: + * + * Set the range within the buffer to valid. The range is + * relative to the beginning of the buffer, b_offset. Note that + * b_offset itself may be offset from the beginning of the first + * page. + */ +void +vfs_bio_set_valid(struct buf *bp, int base, int size) +{ + int i, n; + vm_page_t m; + + if (!(bp->b_flags & B_VMIO)) + return; + + /* + * Fixup base to be relative to beginning of first page. 
+ * Set initial n to be the maximum number of bytes in the + * first page that can be validated. + */ + base += (bp->b_offset & PAGE_MASK); + n = PAGE_SIZE - (base & PAGE_MASK); + + VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { + m = bp->b_pages[i]; + if (n > size) + n = size; + vm_page_set_valid_range(m, base & PAGE_MASK, n); + base += n; + size -= n; + n = PAGE_SIZE; + } + VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); +} + +/* + * vfs_bio_clrbuf: + * + * If the specified buffer is a non-VMIO buffer, clear the entire + * buffer. If the specified buffer is a VMIO buffer, clear and + * validate only the previously invalid portions of the buffer. + * This routine essentially fakes an I/O, so we need to clear + * BIO_ERROR and B_INVAL. + * + * Note that while we only theoretically need to clear through b_bcount, + * we go ahead and clear through b_bufsize. + */ +void +vfs_bio_clrbuf(struct buf *bp) +{ + int i, j, mask, sa, ea, slide; + + if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { + clrbuf(bp); + return; + } + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && + (bp->b_offset & PAGE_MASK) == 0) { + if (bp->b_pages[0] == bogus_page) + goto unlock; + mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; + VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); + if ((bp->b_pages[0]->valid & mask) == mask) + goto unlock; + if ((bp->b_pages[0]->valid & mask) == 0) { + pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); + bp->b_pages[0]->valid |= mask; + goto unlock; + } + } + sa = bp->b_offset & PAGE_MASK; + slide = 0; + for (i = 0; i < bp->b_npages; i++, sa = 0) { + slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); + ea = slide & PAGE_MASK; + if (ea == 0) + ea = PAGE_SIZE; + if (bp->b_pages[i] == bogus_page) + continue; + j = sa / DEV_BSIZE; + mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; + VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); + if ((bp->b_pages[i]->valid & mask) == mask) + continue; + if ((bp->b_pages[i]->valid & mask) == 0) + pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); + else { + for (; sa < ea; sa += DEV_BSIZE, j++) { + if ((bp->b_pages[i]->valid & (1 << j)) == 0) { + pmap_zero_page_area(bp->b_pages[i], + sa, DEV_BSIZE); + } + } + } + bp->b_pages[i]->valid |= mask; + } +unlock: + VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); + bp->b_resid = 0; +} + +void +vfs_bio_bzero_buf(struct buf *bp, int base, int size) +{ + vm_page_t m; + int i, n; + + if (buf_mapped(bp)) { + BUF_CHECK_MAPPED(bp); + bzero(bp->b_data + base, size); + } else { + BUF_CHECK_UNMAPPED(bp); + n = PAGE_SIZE - (base & PAGE_MASK); + for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { + m = bp->b_pages[i]; + if (n > size) + n = size; + pmap_zero_page_area(m, base & PAGE_MASK, n); + base += n; + size -= n; + n = PAGE_SIZE; + } + } +} + +/* + * Update buffer flags based on I/O request parameters, optionally releasing the + * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, + * where they may be placed on a page queue (VMIO) or freed immediately (direct + * I/O). Otherwise the buffer is released to the cache. 
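The per-page valid mask built by vfs_bio_clrbuf() above is one bit per DEV_BSIZE chunk of the page, shifted to the chunk where the buffer's byte range begins within that page. The computation in isolation, assuming 4 KB pages and 512-byte device blocks:

#include <stdio.h>

#define DEV_BSIZE       512
#define PAGE_SIZE       4096

/*
 * Build the per-page valid mask the way vfs_bio_clrbuf() does: one bit per
 * DEV_BSIZE chunk, covering the byte range [sa, ea) that the buffer
 * occupies within the page.
 */
static int
valid_mask(int sa, int ea)
{
        int j = sa / DEV_BSIZE;

        return (((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j);
}

int
main(void)
{
        /* buffer starts 1 KB into the page and covers 2 KB of it */
        printf("mask 0x%02x\n", valid_mask(1024, 3072));
        /* buffer covers the whole page */
        printf("mask 0x%02x\n", valid_mask(0, PAGE_SIZE));
        return (0);
}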
+ */ +static void +b_io_dismiss(struct buf *bp, int ioflag, bool release) +{ + + KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, + ("buf %p non-VMIO noreuse", bp)); + + if ((ioflag & IO_DIRECT) != 0) + bp->b_flags |= B_DIRECT; + if ((ioflag & IO_EXT) != 0) + bp->b_xflags |= BX_ALTDATA; + if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { + bp->b_flags |= B_RELBUF; + if ((ioflag & IO_NOREUSE) != 0) + bp->b_flags |= B_NOREUSE; + if (release) + brelse(bp); + } else if (release) + bqrelse(bp); +} + +void +vfs_bio_brelse(struct buf *bp, int ioflag) +{ + + b_io_dismiss(bp, ioflag, true); +} + +void +vfs_bio_set_flags(struct buf *bp, int ioflag) +{ + + b_io_dismiss(bp, ioflag, false); +} + +/* + * vm_hold_load_pages and vm_hold_free_pages get pages into + * a buffers address space. The pages are anonymous and are + * not associated with a file object. + */ +static void +vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index; + + BUF_CHECK_MAPPED(bp); + + to = round_page(to); + from = round_page(from); + index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { + /* + * note: must allocate system pages since blocking here + * could interfere with paging I/O, no matter which + * process we are. + */ + p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) | + VM_ALLOC_WAITOK); + pmap_qenter(pg, &p, 1); + bp->b_pages[index] = p; + } + bp->b_npages = index; +} + +/* Return pages associated with this buf to the vm system */ +static void +vm_hold_free_pages(struct buf *bp, int newbsize) +{ + vm_offset_t from; + vm_page_t p; + int index, newnpages; + + BUF_CHECK_MAPPED(bp); + + from = round_page((vm_offset_t)bp->b_data + newbsize); + newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; + if (bp->b_npages > newnpages) + pmap_qremove(from, bp->b_npages - newnpages); + for (index = newnpages; index < bp->b_npages; index++) { + p = bp->b_pages[index]; + bp->b_pages[index] = NULL; + p->wire_count--; + vm_page_free(p); + } + vm_wire_sub(bp->b_npages - newnpages); + bp->b_npages = newnpages; +} + +/* + * Map an IO request into kernel virtual address space. + * + * All requests are (re)mapped into kernel VA space. + * Notice that we use b_bufsize for the size of the buffer + * to be mapped. b_bcount might be modified by the driver. + * + * Note that even if the caller determines that the address space should + * be valid, a race or a smaller-file mapped into a larger space may + * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST + * check the return value. + * + * This function only works with pager buffers. + */ +int +vmapbuf(struct buf *bp, int mapbuf) +{ + vm_prot_t prot; + int pidx; + + if (bp->b_bufsize < 0) + return (-1); + prot = VM_PROT_READ; + if (bp->b_iocmd == BIO_READ) + prot |= VM_PROT_WRITE; /* Less backwards than it looks */ + if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, + (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, + btoc(MAXPHYS))) < 0) + return (-1); + bp->b_npages = pidx; + bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; + if (mapbuf || !unmapped_buf_allowed) { + pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); + bp->b_data = bp->b_kvabase + bp->b_offset; + } else + bp->b_data = unmapped_buf; + return(0); +} + +/* + * Free the io map PTEs associated with this IO operation. 
+ * We also invalidate the TLB entries and restore the original b_addr. + * + * This function only works with pager buffers. + */ +void +vunmapbuf(struct buf *bp) +{ + int npages; + + npages = bp->b_npages; + if (buf_mapped(bp)) + pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); + vm_page_unhold_pages(bp->b_pages, npages); + + bp->b_data = unmapped_buf; +} + +void +bdone(struct buf *bp) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, bp); + mtx_lock(mtxp); + bp->b_flags |= B_DONE; + wakeup(bp); + mtx_unlock(mtxp); +} + +void +bwait(struct buf *bp, u_char pri, const char *wchan) +{ + struct mtx *mtxp; + + mtxp = mtx_pool_find(mtxpool_sleep, bp); + mtx_lock(mtxp); + while ((bp->b_flags & B_DONE) == 0) + msleep(bp, mtxp, pri, wchan, 0); + mtx_unlock(mtxp); +} + +int +bufsync(struct bufobj *bo, int waitfor) +{ + + return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); +} + +void +bufstrategy(struct bufobj *bo, struct buf *bp) +{ + int i __unused; + struct vnode *vp; + + vp = bp->b_vp; + KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); + KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, + ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); + i = VOP_STRATEGY(vp, bp); + KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); +} + +/* + * Initialize a struct bufobj before use. Memory is assumed zero filled. + */ +void +bufobj_init(struct bufobj *bo, void *private) +{ + static volatile int bufobj_cleanq; + + bo->bo_domain = + atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains; + rw_init(BO_LOCKPTR(bo), "bufobj interlock"); + bo->bo_private = private; + TAILQ_INIT(&bo->bo_clean.bv_hd); + TAILQ_INIT(&bo->bo_dirty.bv_hd); +} + +void +bufobj_wrefl(struct bufobj *bo) +{ + + KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); + ASSERT_BO_WLOCKED(bo); + bo->bo_numoutput++; +} + +void +bufobj_wref(struct bufobj *bo) +{ + + KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); + BO_LOCK(bo); + bo->bo_numoutput++; + BO_UNLOCK(bo); +} + +void +bufobj_wdrop(struct bufobj *bo) +{ + + KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); + BO_LOCK(bo); + KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); + if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { + bo->bo_flag &= ~BO_WWAIT; + wakeup(&bo->bo_numoutput); + } + BO_UNLOCK(bo); +} + +int +bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) +{ + int error; + + KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); + ASSERT_BO_WLOCKED(bo); + error = 0; + while (bo->bo_numoutput) { + bo->bo_flag |= BO_WWAIT; + error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), + slpflag | (PRIBIO + 1), "bo_wwait", timeo); + if (error) + break; + } + return (error); +} + +/* + * Set bio_data or bio_ma for struct bio from the struct buf. + */ +void +bdata2bio(struct buf *bp, struct bio *bip) +{ + + if (!buf_mapped(bp)) { + KASSERT(unmapped_buf_allowed, ("unmapped")); + bip->bio_ma = bp->b_pages; + bip->bio_ma_n = bp->b_npages; + bip->bio_data = unmapped_buf; + bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; + bip->bio_flags |= BIO_UNMAPPED; + KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / + PAGE_SIZE == bp->b_npages, + ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, + (long long)bip->bio_length, bip->bio_ma_n)); + } else { + bip->bio_data = bp->b_data; + bip->bio_ma = NULL; + } +} + +/* + * The MIPS pmap code currently doesn't handle aliased pages. + * The VIPT caches may not handle page aliasing themselves, leading + * to data corruption. 
+ * + * As such, this code makes a system extremely unhappy if said + * system doesn't support unaliasing the above situation in hardware. + * Some "recent" systems (eg some mips24k/mips74k cores) don't enable + * this feature at build time, so it has to be handled in software. + * + * Once the MIPS pmap/cache code grows to support this function on + * earlier chips, it should be flipped back off. + */ +#ifdef __mips__ +static int buf_pager_relbuf = 1; +#else +static int buf_pager_relbuf = 0; +#endif +SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, + &buf_pager_relbuf, 0, + "Make buffer pager release buffers after reading"); + +/* + * The buffer pager. It uses buffer reads to validate pages. + * + * In contrast to the generic local pager from vm/vnode_pager.c, this + * pager correctly and easily handles volumes where the underlying + * device block size is greater than the machine page size. The + * buffer cache transparently extends the requested page run to be + * aligned at the block boundary, and does the necessary bogus page + * replacements in the addends to avoid obliterating already valid + * pages. + * + * The only non-trivial issue is that the exclusive busy state for + * pages, which is assumed by the vm_pager_getpages() interface, is + * incompatible with the VMIO buffer cache's desire to share-busy the + * pages. This function performs a trivial downgrade of the pages' + * state before reading buffers, and a less trivial upgrade from the + * shared-busy to excl-busy state after the read. + */ +int +vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, + int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, + vbg_get_blksize_t get_blksize) +{ + vm_page_t m; + vm_object_t object; + struct buf *bp; + struct mount *mp; + daddr_t lbn, lbnp; + vm_ooffset_t la, lb, poff, poffe; + long bsize; + int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b; + bool redo, lpart; + + object = vp->v_object; + mp = vp->v_mount; + error = 0; + la = IDX_TO_OFF(ma[count - 1]->pindex); + if (la >= object->un_pager.vnp.vnp_size) + return (VM_PAGER_BAD); + + /* + * Change the meaning of la from where the last requested page starts + * to where it ends, because that's the end of the requested region + * and the start of the potential read-ahead region. + */ + la += PAGE_SIZE; + lpart = la > object->un_pager.vnp.vnp_size; + bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); + + /* + * Calculate read-ahead, behind and total pages. + */ + pgsin = count; + lb = IDX_TO_OFF(ma[0]->pindex); + pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); + pgsin += pgsin_b; + if (rbehind != NULL) + *rbehind = pgsin_b; + pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); + if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) + pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, + PAGE_SIZE) - la); + pgsin += pgsin_a; + if (rahead != NULL) + *rahead = pgsin_a; + VM_CNT_INC(v_vnodein); + VM_CNT_ADD(v_vnodepgsin, pgsin); + + br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) + != 0) ? GB_UNMAPPED : 0; + VM_OBJECT_WLOCK(object); +again: + for (i = 0; i < count; i++) + vm_page_busy_downgrade(ma[i]); + VM_OBJECT_WUNLOCK(object); + + lbnp = -1; + for (i = 0; i < count; i++) { + m = ma[i]; + + /* + * Pages are shared busy and the object lock is not + * owned, which together allow for the pages' + * invalidation. 
The racy test for validity avoids + * useless creation of the buffer for the most typical + * case when invalidation is not used in redo or for + * parallel read. The shared->excl upgrade loop at + * the end of the function catches the race in a + * reliable way (protected by the object lock). + */ + if (m->valid == VM_PAGE_BITS_ALL) + continue; + + poff = IDX_TO_OFF(m->pindex); + poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); + for (; poff < poffe; poff += bsize) { + lbn = get_lblkno(vp, poff); + if (lbn == lbnp) + goto next_page; + lbnp = lbn; + + bsize = get_blksize(vp, lbn); + error = bread_gb(vp, lbn, bsize, curthread->td_ucred, + br_flags, &bp); + if (error != 0) + goto end_pages; + if (LIST_EMPTY(&bp->b_dep)) { + /* + * Invalidation clears m->valid, but + * may leave B_CACHE flag if the + * buffer existed at the invalidation + * time. In this case, recycle the + * buffer to do real read on next + * bread() after redo. + * + * Otherwise B_RELBUF is not strictly + * necessary, enable to reduce buf + * cache pressure. + */ + if (buf_pager_relbuf || + m->valid != VM_PAGE_BITS_ALL) + bp->b_flags |= B_RELBUF; + + bp->b_flags &= ~B_NOCACHE; + brelse(bp); + } else { + bqrelse(bp); + } + } + KASSERT(1 /* racy, enable for debugging */ || + m->valid == VM_PAGE_BITS_ALL || i == count - 1, + ("buf %d %p invalid", i, m)); + if (i == count - 1 && lpart) { + VM_OBJECT_WLOCK(object); + if (m->valid != 0 && + m->valid != VM_PAGE_BITS_ALL) + vm_page_zero_invalid(m, TRUE); + VM_OBJECT_WUNLOCK(object); + } +next_page:; + } +end_pages: + + VM_OBJECT_WLOCK(object); + redo = false; + for (i = 0; i < count; i++) { + vm_page_sunbusy(ma[i]); + ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); + + /* + * Since the pages were only sbusy while neither the + * buffer nor the object lock was held by us, or + * reallocated while vm_page_grab() slept for busy + * relinguish, they could have been invalidated. + * Recheck the valid bits and re-read as needed. + * + * Note that the last page is made fully valid in the + * read loop, and partial validity for the page at + * index count - 1 could mean that the page was + * invalidated or removed, so we must restart for + * safety as well. + */ + if (ma[i]->valid != VM_PAGE_BITS_ALL) + redo = true; + } + if (redo && error == 0) + goto again; + VM_OBJECT_WUNLOCK(object); + return (error != 0 ? 
VM_PAGER_ERROR : VM_PAGER_OK); +} + +#include "opt_ddb.h" +#ifdef DDB +#include + +/* DDB command to show buffer data */ +DB_SHOW_COMMAND(buffer, db_show_buffer) +{ + /* get args */ + struct buf *bp = (struct buf *)addr; +#ifdef FULL_BUF_TRACKING + uint32_t i, j; +#endif + + if (!have_addr) { + db_printf("usage: show buffer \n"); + return; + } + + db_printf("buf at %p\n", bp); + db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", + (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, + PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); + db_printf( + "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" + "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " + "b_dep = %p\n", + bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, + bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, + (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); + db_printf("b_kvabase = %p, b_kvasize = %d\n", + bp->b_kvabase, bp->b_kvasize); + if (bp->b_npages) { + int i; + db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m; + m = bp->b_pages[i]; + if (m != NULL) + db_printf("(%p, 0x%lx, 0x%lx)", m->object, + (u_long)m->pindex, + (u_long)VM_PAGE_TO_PHYS(m)); + else + db_printf("( ??? )"); + if ((i + 1) < bp->b_npages) + db_printf(","); + } + db_printf("\n"); + } + BUF_LOCKPRINTINFO(bp); +#if defined(FULL_BUF_TRACKING) + db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); + + i = bp->b_io_tcnt % BUF_TRACKING_SIZE; + for (j = 1; j <= BUF_TRACKING_SIZE; j++) { + if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) + continue; + db_printf(" %2u: %s\n", j, + bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); + } +#elif defined(BUF_TRACKING) + db_printf("b_io_tracking: %s\n", bp->b_io_tracking); +#endif + db_printf(" "); +} + +DB_SHOW_COMMAND(bufqueues, bufqueues) +{ + struct bufdomain *bd; + struct buf *bp; + long total; + int i, j, cnt; + + db_printf("bqempty: %d\n", bqempty.bq_len); + + for (i = 0; i < buf_domains; i++) { + bd = &bdomain[i]; + db_printf("Buf domain %d\n", i); + db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers); + db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers); + db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers); + db_printf("\n"); + db_printf("\tbufspace\t%ld\n", bd->bd_bufspace); + db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace); + db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace); + db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace); + db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh); + db_printf("\n"); + db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers); + db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers); + db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers); + db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh); + db_printf("\n"); + total = 0; + TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist) + total += bp->b_bufsize; + db_printf("\tcleanq count\t%d (%ld)\n", + bd->bd_cleanq->bq_len, total); + total = 0; + TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist) + total += bp->b_bufsize; + db_printf("\tdirtyq count\t%d (%ld)\n", + bd->bd_dirtyq.bq_len, total); + db_printf("\twakeup\t\t%d\n", bd->bd_wanted); + db_printf("\tlim\t\t%d\n", bd->bd_lim); + db_printf("\tCPU "); + for (j = 0; j <= mp_maxid; j++) + db_printf("%d, ", bd->bd_subq[j].bq_len); + db_printf("\n"); + cnt = 0; + total = 0; + for (j = 0; j < nbuf; j++) + if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) { + cnt++; + total += buf[j].b_bufsize; + } + 
db_printf("\tLocked buffers: %d space %ld\n", cnt, total); + cnt = 0; + total = 0; + for (j = 0; j < nbuf; j++) + if (buf[j].b_domain == i) { + cnt++; + total += buf[j].b_bufsize; + } + db_printf("\tTotal buffers: %d space %ld\n", cnt, total); + } +} + +DB_SHOW_COMMAND(lockedbufs, lockedbufs) +{ + struct buf *bp; + int i; + + for (i = 0; i < nbuf; i++) { + bp = &buf[i]; + if (BUF_ISLOCKED(bp)) { + db_show_buffer((uintptr_t)bp, 1, 0, NULL); + db_printf("\n"); + if (db_pager_quit) + break; + } + } +} + +DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) +{ + struct vnode *vp; + struct buf *bp; + + if (!have_addr) { + db_printf("usage: show vnodebufs \n"); + return; + } + vp = (struct vnode *)addr; + db_printf("Clean buffers:\n"); + TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { + db_show_buffer((uintptr_t)bp, 1, 0, NULL); + db_printf("\n"); + } + db_printf("Dirty buffers:\n"); + TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { + db_show_buffer((uintptr_t)bp, 1, 0, NULL); + db_printf("\n"); + } +} + +DB_COMMAND(countfreebufs, db_coundfreebufs) +{ + struct buf *bp; + int i, used = 0, nfree = 0; + + if (have_addr) { + db_printf("usage: countfreebufs\n"); + return; + } + + for (i = 0; i < nbuf; i++) { + bp = &buf[i]; + if (bp->b_qindex == QUEUE_EMPTY) + nfree++; + else + used++; + } + + db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, + nfree + used); + db_printf("numfreebuffers is %d\n", numfreebuffers); +} +#endif /* DDB */ diff --git a/freebsd/sys/kern/vfs_cache.c b/freebsd/sys/kern/vfs_cache.c new file mode 100644 index 00000000..7c14b080 --- /dev/null +++ b/freebsd/sys/kern/vfs_cache.c @@ -0,0 +1,2604 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Poul-Henning Kamp of the FreeBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_ddb.h" +#include "opt_ktrace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#endif + +#ifdef DDB +#include +#endif + +#include + +SDT_PROVIDER_DECLARE(vfs); +SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", + "struct vnode *"); +SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", + "char *"); +SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); +SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", + "char *", "struct vnode *"); +SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); +SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", + "struct vnode *", "char *"); +SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", + "struct vnode *"); +SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, + "struct vnode *", "char *"); +SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", + "char *"); +SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); +SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); +SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); +SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", + "struct vnode *"); +SDT_PROBE_DEFINE3(vfs, namecache, zap_negative, done, "struct vnode *", + "char *", "int"); +SDT_PROBE_DEFINE3(vfs, namecache, shrink_negative, done, "struct vnode *", + "char *", "int"); + +/* + * This structure describes the elements in the cache of recent + * names looked up by namei. + */ + +struct namecache { + LIST_ENTRY(namecache) nc_hash; /* hash chain */ + LIST_ENTRY(namecache) nc_src; /* source vnode list */ + TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ + struct vnode *nc_dvp; /* vnode of parent of name */ + union { + struct vnode *nu_vp; /* vnode the name refers to */ + u_int nu_neghits; /* negative entry hits */ + } n_un; + u_char nc_flag; /* flag bits */ + u_char nc_nlen; /* length of name */ + char nc_name[0]; /* segment name + nul */ +}; + +/* + * struct namecache_ts repeats struct namecache layout up to the + * nc_nlen member. + * struct namecache_ts is used in place of struct namecache when time(s) need + * to be stored. The nc_dotdottime field is used when a cache entry is mapping + * both a non-dotdot directory name plus dotdot for the directory's + * parent. + */ +struct namecache_ts { + struct timespec nc_time; /* timespec provided by fs */ + struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ + int nc_ticks; /* ticks value when entry was added */ + struct namecache nc_nc; +}; + +#define nc_vp n_un.nu_vp +#define nc_neghits n_un.nu_neghits + +/* + * Flags in namecache.nc_flag + */ +#define NCF_WHITE 0x01 +#define NCF_ISDOTDOT 0x02 +#define NCF_TS 0x04 +#define NCF_DTS 0x08 +#define NCF_DVDROP 0x10 +#define NCF_NEGATIVE 0x20 +#define NCF_HOTNEGATIVE 0x40 + +/* + * Name caching works as follows: + * + * Names found by directory scans are retained in a cache + * for future reference. It is managed LRU, so frequently + * used names will hang around. Cache is indexed by hash value + * obtained from (dvp, name) where dvp refers to the directory + * containing name. + * + * If it is a "negative" entry, (i.e. 
for a name that is known NOT to + * exist) the vnode pointer will be NULL. + * + * Upon reaching the last segment of a path, if the reference + * is for DELETE, or NOCACHE is set (rewrite), and the + * name is located in the cache, it will be dropped. + * + * These locks are used (in the order in which they can be taken): + * NAME TYPE ROLE + * vnodelock mtx vnode lists and v_cache_dd field protection + * bucketlock rwlock for access to given set of hash buckets + * neglist mtx negative entry LRU management + * + * Additionally, ncneg_shrink_lock mtx is used to have at most one thread + * shrinking the LRU list. + * + * It is legal to take multiple vnodelock and bucketlock locks. The locking + * order is lower address first. Both are recursive. + * + * "." lookups are lockless. + * + * ".." and vnode -> name lookups require vnodelock. + * + * name -> vnode lookup requires the relevant bucketlock to be held for reading. + * + * Insertions and removals of entries require involved vnodes and bucketlocks + * to be write-locked to prevent other threads from seeing the entry. + * + * Some lookups result in removal of the found entry (e.g. getting rid of a + * negative entry with the intent to create a positive one), which poses a + * problem when multiple threads reach the state. Similarly, two different + * threads can purge two different vnodes and try to remove the same name. + * + * If the already held vnode lock is lower than the second required lock, we + * can just take the other lock. However, in the opposite case, this could + * deadlock. As such, this is resolved by trylocking and if that fails unlocking + * the first node, locking everything in order and revalidating the state. + */ + +/* + * Structures associated with name caching. + */ +#define NCHHASH(hash) \ + (&nchashtbl[(hash) & nchash]) +static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ +static u_long __read_mostly nchash; /* size of hash table */ +SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, + "Size of namecache hash table"); +static u_long __read_mostly ncnegfactor = 12; /* ratio of negative entries */ +SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, + "Ratio of negative namecache entries"); +static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ +SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, + "Number of negative entries in namecache"); +static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ +SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, + "Number of namecache entries"); +static u_long __exclusive_cache_line numcachehv;/* number of cache entries with vnodes held */ +SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, + "Number of namecache entries with vnodes held"); +u_int __read_mostly ncsizefactor = 2; +SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, + "Size factor for namecache"); +static u_int __read_mostly ncpurgeminvnodes; +SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, + "Number of vnodes below which purgevfs ignores the request"); +static u_int __read_mostly ncneghitsrequeue = 8; +SYSCTL_UINT(_vfs, OID_AUTO, ncneghitsrequeue, CTLFLAG_RW, &ncneghitsrequeue, 0, + "Number of hits to requeue a negative entry in the LRU list"); + +struct nchstats nchstats; /* cache effectiveness statistics */ + +static struct mtx ncneg_shrink_lock; +static int shrink_list_turn; + +struct 
neglist { + struct mtx nl_lock; + TAILQ_HEAD(, namecache) nl_list; +} __aligned(CACHE_LINE_SIZE); + +static struct neglist __read_mostly *neglists; +static struct neglist ncneg_hot; + +#define numneglists (ncneghash + 1) +static u_int __read_mostly ncneghash; +static inline struct neglist * +NCP2NEGLIST(struct namecache *ncp) +{ + + return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); +} + +#define numbucketlocks (ncbuckethash + 1) +static u_int __read_mostly ncbuckethash; +static struct rwlock_padalign __read_mostly *bucketlocks; +#define HASH2BUCKETLOCK(hash) \ + ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)])) + +#define numvnodelocks (ncvnodehash + 1) +static u_int __read_mostly ncvnodehash; +static struct mtx __read_mostly *vnodelocks; +static inline struct mtx * +VP2VNODELOCK(struct vnode *vp) +{ + + return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); +} + +/* + * UMA zones for the VFS cache. + * + * The small cache is used for entries with short names, which are the + * most common. The large cache is used for entries which are too big to + * fit in the small cache. + */ +static uma_zone_t __read_mostly cache_zone_small; +static uma_zone_t __read_mostly cache_zone_small_ts; +static uma_zone_t __read_mostly cache_zone_large; +static uma_zone_t __read_mostly cache_zone_large_ts; + +#define CACHE_PATH_CUTOFF 35 + +static struct namecache * +cache_alloc(int len, int ts) +{ + struct namecache_ts *ncp_ts; + struct namecache *ncp; + + if (__predict_false(ts)) { + if (len <= CACHE_PATH_CUTOFF) + ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK); + else + ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK); + ncp = &ncp_ts->nc_nc; + } else { + if (len <= CACHE_PATH_CUTOFF) + ncp = uma_zalloc(cache_zone_small, M_WAITOK); + else + ncp = uma_zalloc(cache_zone_large, M_WAITOK); + } + return (ncp); +} + +static void +cache_free(struct namecache *ncp) +{ + struct namecache_ts *ncp_ts; + + if (ncp == NULL) + return; + if ((ncp->nc_flag & NCF_DVDROP) != 0) + vdrop(ncp->nc_dvp); + if (__predict_false(ncp->nc_flag & NCF_TS)) { + ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); + if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) + uma_zfree(cache_zone_small_ts, ncp_ts); + else + uma_zfree(cache_zone_large_ts, ncp_ts); + } else { + if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) + uma_zfree(cache_zone_small, ncp); + else + uma_zfree(cache_zone_large, ncp); + } +} + +static void +cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) +{ + struct namecache_ts *ncp_ts; + + KASSERT((ncp->nc_flag & NCF_TS) != 0 || + (tsp == NULL && ticksp == NULL), + ("No NCF_TS")); + + if (tsp == NULL && ticksp == NULL) + return; + + ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); + if (tsp != NULL) + *tsp = ncp_ts->nc_time; + if (ticksp != NULL) + *ticksp = ncp_ts->nc_ticks; +} + +static int __read_mostly doingcache = 1; /* 1 => enable the cache */ +SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, + "VFS namecache enabled"); + +/* Export size information to userland */ +SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, + sizeof(struct namecache), "sizeof(struct namecache)"); + +/* + * The new name cache statistics + */ +static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, + "Name cache statistics"); +#define STATNODE_ULONG(name, descr) \ + SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); +#define STATNODE_COUNTER(name, descr) \ + static counter_u64_t __read_mostly name; \ + SYSCTL_COUNTER_U64(_vfs_cache, 
OID_AUTO, name, CTLFLAG_RD, &name, descr); +STATNODE_ULONG(numneg, "Number of negative cache entries"); +STATNODE_ULONG(numcache, "Number of cache entries"); +STATNODE_COUNTER(numcalls, "Number of cache lookups"); +STATNODE_COUNTER(dothits, "Number of '.' hits"); +STATNODE_COUNTER(dotdothits, "Number of '..' hits"); +STATNODE_COUNTER(numchecks, "Number of checks in lookup"); +STATNODE_COUNTER(nummiss, "Number of cache misses"); +STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); +STATNODE_COUNTER(numposzaps, + "Number of cache hits (positive) we do not want to cache"); +STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); +STATNODE_COUNTER(numnegzaps, + "Number of cache hits (negative) we do not want to cache"); +STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); +/* These count for kern___getcwd(), too. */ +STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); +STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); +STATNODE_COUNTER(numfullpathfail2, + "Number of fullpath search errors (VOP_VPTOCNP failures)"); +STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); +STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); +static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, + "Number of times zap_and_exit failed to lock"); +static long cache_lock_vnodes_cel_3_failures; +STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, + "Number of times 3-way vnode locking failed"); + +static void cache_zap_locked(struct namecache *ncp, bool neg_locked); +static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, + char *buf, char **retbuf, u_int buflen); + +static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); + +static int cache_yield; +SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, + "Number of times cache called yield"); + +static void +cache_maybe_yield(void) +{ + + if (should_yield()) { + cache_yield++; + kern_yield(PRI_USER); + } +} + +static inline void +cache_assert_vlp_locked(struct mtx *vlp) +{ + + if (vlp != NULL) + mtx_assert(vlp, MA_OWNED); +} + +static inline void +cache_assert_vnode_locked(struct vnode *vp) +{ + struct mtx *vlp; + + vlp = VP2VNODELOCK(vp); + cache_assert_vlp_locked(vlp); +} + +static uint32_t +cache_get_hash(char *name, u_char len, struct vnode *dvp) +{ + uint32_t hash; + + hash = fnv_32_buf(name, len, FNV1_32_INIT); + hash = fnv_32_buf(&dvp, sizeof(dvp), hash); + return (hash); +} + +static inline struct rwlock * +NCP2BUCKETLOCK(struct namecache *ncp) +{ + uint32_t hash; + + hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); + return (HASH2BUCKETLOCK(hash)); +} + +#ifdef INVARIANTS +static void +cache_assert_bucket_locked(struct namecache *ncp, int mode) +{ + struct rwlock *blp; + + blp = NCP2BUCKETLOCK(ncp); + rw_assert(blp, mode); +} +#else +#define cache_assert_bucket_locked(x, y) do { } while (0) +#endif + +#define cache_sort(x, y) _cache_sort((void **)(x), (void **)(y)) +static void +_cache_sort(void **p1, void **p2) +{ + void *tmp; + + if (*p1 > *p2) { + tmp = *p2; + *p2 = *p1; + *p1 = tmp; + } +} + +static void +cache_lock_all_buckets(void) +{ + u_int i; + + for (i = 0; i < numbucketlocks; i++) + rw_wlock(&bucketlocks[i]); +} + +static void +cache_unlock_all_buckets(void) +{ + u_int i; + + for (i = 0; i < numbucketlocks; i++) + rw_wunlock(&bucketlocks[i]); +} + +static void +cache_lock_all_vnodes(void) +{ + u_int i; + + for 
(i = 0; i < numvnodelocks; i++) + mtx_lock(&vnodelocks[i]); +} + +static void +cache_unlock_all_vnodes(void) +{ + u_int i; + + for (i = 0; i < numvnodelocks; i++) + mtx_unlock(&vnodelocks[i]); +} + +static int +cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) +{ + + cache_sort(&vlp1, &vlp2); + MPASS(vlp2 != NULL); + + if (vlp1 != NULL) { + if (!mtx_trylock(vlp1)) + return (EAGAIN); + } + if (!mtx_trylock(vlp2)) { + if (vlp1 != NULL) + mtx_unlock(vlp1); + return (EAGAIN); + } + + return (0); +} + +static void +cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) +{ + + MPASS(vlp1 != NULL || vlp2 != NULL); + + if (vlp1 != NULL) + mtx_unlock(vlp1); + if (vlp2 != NULL) + mtx_unlock(vlp2); +} + +static int +sysctl_nchstats(SYSCTL_HANDLER_ARGS) +{ + struct nchstats snap; + + if (req->oldptr == NULL) + return (SYSCTL_OUT(req, 0, sizeof(snap))); + + snap = nchstats; + snap.ncs_goodhits = counter_u64_fetch(numposhits); + snap.ncs_neghits = counter_u64_fetch(numneghits); + snap.ncs_badhits = counter_u64_fetch(numposzaps) + + counter_u64_fetch(numnegzaps); + snap.ncs_miss = counter_u64_fetch(nummisszap) + + counter_u64_fetch(nummiss); + + return (SYSCTL_OUT(req, &snap, sizeof(snap))); +} +SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | + CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", + "VFS cache effectiveness statistics"); + +#ifdef DIAGNOSTIC +/* + * Grab an atomic snapshot of the name cache hash chain lengths + */ +static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, + "hash table stats"); + +static int +sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) +{ + struct nchashhead *ncpp; + struct namecache *ncp; + int i, error, n_nchash, *cntbuf; + +retry: + n_nchash = nchash + 1; /* nchash is max index, not count */ + if (req->oldptr == NULL) + return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); + cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); + cache_lock_all_buckets(); + if (n_nchash != nchash + 1) { + cache_unlock_all_buckets(); + free(cntbuf, M_TEMP); + goto retry; + } + /* Scan hash tables counting entries */ + for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) + LIST_FOREACH(ncp, ncpp, nc_hash) + cntbuf[i]++; + cache_unlock_all_buckets(); + for (error = 0, i = 0; i < n_nchash; i++) + if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) + break; + free(cntbuf, M_TEMP); + return (error); +} +SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| + CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", + "nchash chain lengths"); + +static int +sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) +{ + int error; + struct nchashhead *ncpp; + struct namecache *ncp; + int n_nchash; + int count, maxlength, used, pct; + + if (!req->oldptr) + return SYSCTL_OUT(req, 0, 4 * sizeof(int)); + + cache_lock_all_buckets(); + n_nchash = nchash + 1; /* nchash is max index, not count */ + used = 0; + maxlength = 0; + + /* Scan hash tables for applicable entries */ + for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { + count = 0; + LIST_FOREACH(ncp, ncpp, nc_hash) { + count++; + } + if (count) + used++; + if (maxlength < count) + maxlength = count; + } + n_nchash = nchash + 1; + cache_unlock_all_buckets(); + pct = (used * 100) / (n_nchash / 100); + error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); + if (error) + return (error); + error = SYSCTL_OUT(req, &used, sizeof(used)); + if (error) + return (error); + error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); + if (error) + return (error); + 
error = SYSCTL_OUT(req, &pct, sizeof(pct)); + if (error) + return (error); + return (0); +} +SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| + CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", + "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); +#endif + +/* + * Negative entries management + * + * A variation of LRU scheme is used. New entries are hashed into one of + * numneglists cold lists. Entries get promoted to the hot list on first hit. + * Partial LRU for the hot list is maintained by requeueing them every + * ncneghitsrequeue hits. + * + * The shrinker will demote hot list head and evict from the cold list in a + * round-robin manner. + */ +static void +cache_negative_hit(struct namecache *ncp) +{ + struct neglist *neglist; + u_int hits; + + MPASS(ncp->nc_flag & NCF_NEGATIVE); + hits = atomic_fetchadd_int(&ncp->nc_neghits, 1); + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + if ((hits % ncneghitsrequeue) != 0) + return; + mtx_lock(&ncneg_hot.nl_lock); + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); + TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); + mtx_unlock(&ncneg_hot.nl_lock); + return; + } + /* + * The shrinker cleared the flag and removed the entry from + * the hot list. Put it back. + */ + } else { + mtx_lock(&ncneg_hot.nl_lock); + } + neglist = NCP2NEGLIST(ncp); + mtx_lock(&neglist->nl_lock); + if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { + TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); + TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); + ncp->nc_flag |= NCF_HOTNEGATIVE; + } + mtx_unlock(&neglist->nl_lock); + mtx_unlock(&ncneg_hot.nl_lock); +} + +static void +cache_negative_insert(struct namecache *ncp, bool neg_locked) +{ + struct neglist *neglist; + + MPASS(ncp->nc_flag & NCF_NEGATIVE); + cache_assert_bucket_locked(ncp, RA_WLOCKED); + neglist = NCP2NEGLIST(ncp); + if (!neg_locked) { + mtx_lock(&neglist->nl_lock); + } else { + mtx_assert(&neglist->nl_lock, MA_OWNED); + } + TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); + if (!neg_locked) + mtx_unlock(&neglist->nl_lock); + atomic_add_rel_long(&numneg, 1); +} + +static void +cache_negative_remove(struct namecache *ncp, bool neg_locked) +{ + struct neglist *neglist; + bool hot_locked = false; + bool list_locked = false; + + MPASS(ncp->nc_flag & NCF_NEGATIVE); + cache_assert_bucket_locked(ncp, RA_WLOCKED); + neglist = NCP2NEGLIST(ncp); + if (!neg_locked) { + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + hot_locked = true; + mtx_lock(&ncneg_hot.nl_lock); + if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) { + list_locked = true; + mtx_lock(&neglist->nl_lock); + } + } else { + list_locked = true; + mtx_lock(&neglist->nl_lock); + } + } + if (ncp->nc_flag & NCF_HOTNEGATIVE) { + mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); + TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); + } else { + mtx_assert(&neglist->nl_lock, MA_OWNED); + TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); + } + if (list_locked) + mtx_unlock(&neglist->nl_lock); + if (hot_locked) + mtx_unlock(&ncneg_hot.nl_lock); + atomic_subtract_rel_long(&numneg, 1); +} + +static void +cache_negative_shrink_select(int start, struct namecache **ncpp, + struct neglist **neglistpp) +{ + struct neglist *neglist; + struct namecache *ncp; + int i; + + *ncpp = ncp = NULL; + neglist = NULL; + + for (i = start; i < numneglists; i++) { + neglist = &neglists[i]; + if (TAILQ_FIRST(&neglist->nl_list) == NULL) + continue; + mtx_lock(&neglist->nl_lock); + ncp = TAILQ_FIRST(&neglist->nl_list); + if (ncp != 
NULL) + break; + mtx_unlock(&neglist->nl_lock); + } + + *neglistpp = neglist; + *ncpp = ncp; +} + +static void +cache_negative_zap_one(void) +{ + struct namecache *ncp, *ncp2; + struct neglist *neglist; + struct mtx *dvlp; + struct rwlock *blp; + + if (!mtx_trylock(&ncneg_shrink_lock)) + return; + + mtx_lock(&ncneg_hot.nl_lock); + ncp = TAILQ_FIRST(&ncneg_hot.nl_list); + if (ncp != NULL) { + neglist = NCP2NEGLIST(ncp); + mtx_lock(&neglist->nl_lock); + TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); + TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); + ncp->nc_flag &= ~NCF_HOTNEGATIVE; + mtx_unlock(&neglist->nl_lock); + } + + cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); + shrink_list_turn++; + if (shrink_list_turn == numneglists) + shrink_list_turn = 0; + if (ncp == NULL && shrink_list_turn == 0) + cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist); + if (ncp == NULL) { + mtx_unlock(&ncneg_hot.nl_lock); + goto out; + } + + MPASS(ncp->nc_flag & NCF_NEGATIVE); + dvlp = VP2VNODELOCK(ncp->nc_dvp); + blp = NCP2BUCKETLOCK(ncp); + mtx_unlock(&neglist->nl_lock); + mtx_unlock(&ncneg_hot.nl_lock); + mtx_lock(dvlp); + rw_wlock(blp); + mtx_lock(&neglist->nl_lock); + ncp2 = TAILQ_FIRST(&neglist->nl_list); + if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || + blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) { + ncp = NULL; + goto out_unlock_all; + } + SDT_PROBE3(vfs, namecache, shrink_negative, done, ncp->nc_dvp, + ncp->nc_name, ncp->nc_neghits); + + cache_zap_locked(ncp, true); +out_unlock_all: + mtx_unlock(&neglist->nl_lock); + rw_wunlock(blp); + mtx_unlock(dvlp); +out: + mtx_unlock(&ncneg_shrink_lock); + cache_free(ncp); +} + +/* + * cache_zap_locked(): + * + * Removes a namecache entry from cache, whether it contains an actual + * pointer to a vnode or if it is just a negative cache entry. + */ +static void +cache_zap_locked(struct namecache *ncp, bool neg_locked) +{ + + if (!(ncp->nc_flag & NCF_NEGATIVE)) + cache_assert_vnode_locked(ncp->nc_vp); + cache_assert_vnode_locked(ncp->nc_dvp); + cache_assert_bucket_locked(ncp, RA_WLOCKED); + + CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, + (ncp->nc_flag & NCF_NEGATIVE) ? 
NULL : ncp->nc_vp); + if (!(ncp->nc_flag & NCF_NEGATIVE)) { + SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, + ncp->nc_name, ncp->nc_vp); + } else { + SDT_PROBE3(vfs, namecache, zap_negative, done, ncp->nc_dvp, + ncp->nc_name, ncp->nc_neghits); + } + LIST_REMOVE(ncp, nc_hash); + if (!(ncp->nc_flag & NCF_NEGATIVE)) { + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); + if (ncp == ncp->nc_vp->v_cache_dd) + ncp->nc_vp->v_cache_dd = NULL; + } else { + cache_negative_remove(ncp, neg_locked); + } + if (ncp->nc_flag & NCF_ISDOTDOT) { + if (ncp == ncp->nc_dvp->v_cache_dd) + ncp->nc_dvp->v_cache_dd = NULL; + } else { + LIST_REMOVE(ncp, nc_src); + if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { + ncp->nc_flag |= NCF_DVDROP; + atomic_subtract_rel_long(&numcachehv, 1); + } + } + atomic_subtract_rel_long(&numcache, 1); +} + +static void +cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) +{ + struct rwlock *blp; + + MPASS(ncp->nc_dvp == vp); + MPASS(ncp->nc_flag & NCF_NEGATIVE); + cache_assert_vnode_locked(vp); + + blp = NCP2BUCKETLOCK(ncp); + rw_wlock(blp); + cache_zap_locked(ncp, false); + rw_wunlock(blp); +} + +static bool +cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, + struct mtx **vlpp) +{ + struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; + struct rwlock *blp; + + MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); + cache_assert_vnode_locked(vp); + + if (ncp->nc_flag & NCF_NEGATIVE) { + if (*vlpp != NULL) { + mtx_unlock(*vlpp); + *vlpp = NULL; + } + cache_zap_negative_locked_vnode_kl(ncp, vp); + return (true); + } + + pvlp = VP2VNODELOCK(vp); + blp = NCP2BUCKETLOCK(ncp); + vlp1 = VP2VNODELOCK(ncp->nc_dvp); + vlp2 = VP2VNODELOCK(ncp->nc_vp); + + if (*vlpp == vlp1 || *vlpp == vlp2) { + to_unlock = *vlpp; + *vlpp = NULL; + } else { + if (*vlpp != NULL) { + mtx_unlock(*vlpp); + *vlpp = NULL; + } + cache_sort(&vlp1, &vlp2); + if (vlp1 == pvlp) { + mtx_lock(vlp2); + to_unlock = vlp2; + } else { + if (!mtx_trylock(vlp1)) + goto out_relock; + to_unlock = vlp1; + } + } + rw_wlock(blp); + cache_zap_locked(ncp, false); + rw_wunlock(blp); + if (to_unlock != NULL) + mtx_unlock(to_unlock); + return (true); + +out_relock: + mtx_unlock(vlp2); + mtx_lock(vlp1); + mtx_lock(vlp2); + MPASS(*vlpp == NULL); + *vlpp = vlp1; + return (false); +} + +static int +cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) +{ + struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; + struct rwlock *blp; + int error = 0; + + MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); + cache_assert_vnode_locked(vp); + + pvlp = VP2VNODELOCK(vp); + if (ncp->nc_flag & NCF_NEGATIVE) { + cache_zap_negative_locked_vnode_kl(ncp, vp); + goto out; + } + + blp = NCP2BUCKETLOCK(ncp); + vlp1 = VP2VNODELOCK(ncp->nc_dvp); + vlp2 = VP2VNODELOCK(ncp->nc_vp); + cache_sort(&vlp1, &vlp2); + if (vlp1 == pvlp) { + mtx_lock(vlp2); + to_unlock = vlp2; + } else { + if (!mtx_trylock(vlp1)) { + error = EAGAIN; + goto out; + } + to_unlock = vlp1; + } + rw_wlock(blp); + cache_zap_locked(ncp, false); + rw_wunlock(blp); + mtx_unlock(to_unlock); +out: + mtx_unlock(pvlp); + return (error); +} + +static int +cache_zap_wlocked_bucket(struct namecache *ncp, struct rwlock *blp) +{ + struct mtx *dvlp, *vlp; + + cache_assert_bucket_locked(ncp, RA_WLOCKED); + + dvlp = VP2VNODELOCK(ncp->nc_dvp); + vlp = NULL; + if (!(ncp->nc_flag & NCF_NEGATIVE)) + vlp = VP2VNODELOCK(ncp->nc_vp); + if (cache_trylock_vnodes(dvlp, vlp) == 0) { + cache_zap_locked(ncp, false); + rw_wunlock(blp); + cache_unlock_vnodes(dvlp, vlp); + return (0); + } + + 
rw_wunlock(blp); + return (EAGAIN); +} + +static int +cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp) +{ + struct mtx *dvlp, *vlp; + + cache_assert_bucket_locked(ncp, RA_RLOCKED); + + dvlp = VP2VNODELOCK(ncp->nc_dvp); + vlp = NULL; + if (!(ncp->nc_flag & NCF_NEGATIVE)) + vlp = VP2VNODELOCK(ncp->nc_vp); + if (cache_trylock_vnodes(dvlp, vlp) == 0) { + rw_runlock(blp); + rw_wlock(blp); + cache_zap_locked(ncp, false); + rw_wunlock(blp); + cache_unlock_vnodes(dvlp, vlp); + return (0); + } + + rw_runlock(blp); + return (EAGAIN); +} + +static int +cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, + struct mtx **vlpp1, struct mtx **vlpp2) +{ + struct mtx *dvlp, *vlp; + + cache_assert_bucket_locked(ncp, RA_WLOCKED); + + dvlp = VP2VNODELOCK(ncp->nc_dvp); + vlp = NULL; + if (!(ncp->nc_flag & NCF_NEGATIVE)) + vlp = VP2VNODELOCK(ncp->nc_vp); + cache_sort(&dvlp, &vlp); + + if (*vlpp1 == dvlp && *vlpp2 == vlp) { + cache_zap_locked(ncp, false); + cache_unlock_vnodes(dvlp, vlp); + *vlpp1 = NULL; + *vlpp2 = NULL; + return (0); + } + + if (*vlpp1 != NULL) + mtx_unlock(*vlpp1); + if (*vlpp2 != NULL) + mtx_unlock(*vlpp2); + *vlpp1 = NULL; + *vlpp2 = NULL; + + if (cache_trylock_vnodes(dvlp, vlp) == 0) { + cache_zap_locked(ncp, false); + cache_unlock_vnodes(dvlp, vlp); + return (0); + } + + rw_wunlock(blp); + *vlpp1 = dvlp; + *vlpp2 = vlp; + if (*vlpp1 != NULL) + mtx_lock(*vlpp1); + mtx_lock(*vlpp2); + rw_wlock(blp); + return (EAGAIN); +} + +static void +cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) +{ + + if (blp != NULL) { + rw_runlock(blp); + } else { + mtx_unlock(vlp); + } +} + +static int __noinline +cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, + struct timespec *tsp, int *ticksp) +{ + int ltype; + + *vpp = dvp; + CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", + dvp, cnp->cn_nameptr); + counter_u64_add(dothits, 1); + SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); + if (tsp != NULL) + timespecclear(tsp); + if (ticksp != NULL) + *ticksp = ticks; + vrefact(*vpp); + /* + * When we lookup "." we still can be asked to lock it + * differently... + */ + ltype = cnp->cn_lkflags & LK_TYPE_MASK; + if (ltype != VOP_ISLOCKED(*vpp)) { + if (ltype == LK_EXCLUSIVE) { + vn_lock(*vpp, LK_UPGRADE | LK_RETRY); + if ((*vpp)->v_iflag & VI_DOOMED) { + /* forced unmount */ + vrele(*vpp); + *vpp = NULL; + return (ENOENT); + } + } else + vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); + } + return (-1); +} + +static __noinline int +cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp, struct timespec *tsp, int *ticksp) +{ + struct namecache *ncp; + struct rwlock *blp; + struct mtx *dvlp, *dvlp2; + uint32_t hash; + int error; + + if (cnp->cn_namelen == 2 && + cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') { + counter_u64_add(dotdothits, 1); + dvlp = VP2VNODELOCK(dvp); + dvlp2 = NULL; + mtx_lock(dvlp); +retry_dotdot: + ncp = dvp->v_cache_dd; + if (ncp == NULL) { + SDT_PROBE3(vfs, namecache, lookup, miss, dvp, + "..", NULL); + mtx_unlock(dvlp); + if (dvlp2 != NULL) + mtx_unlock(dvlp2); + return (0); + } + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { + if (ncp->nc_dvp != dvp) + panic("dvp %p v_cache_dd %p\n", dvp, ncp); + if (!cache_zap_locked_vnode_kl2(ncp, + dvp, &dvlp2)) + goto retry_dotdot; + MPASS(dvp->v_cache_dd == NULL); + mtx_unlock(dvlp); + if (dvlp2 != NULL) + mtx_unlock(dvlp2); + cache_free(ncp); + } else { + dvp->v_cache_dd = NULL; + mtx_unlock(dvlp); + if (dvlp2 != NULL) + mtx_unlock(dvlp2); + } + return (0); + } + + hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); + blp = HASH2BUCKETLOCK(hash); +retry: + if (LIST_EMPTY(NCHHASH(hash))) + goto out_no_entry; + + rw_wlock(blp); + + LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { + counter_u64_add(numchecks, 1); + if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && + !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) + break; + } + + /* We failed to find an entry */ + if (ncp == NULL) { + rw_wunlock(blp); + goto out_no_entry; + } + + counter_u64_add(numposzaps, 1); + + error = cache_zap_wlocked_bucket(ncp, blp); + if (error != 0) { + zap_and_exit_bucket_fail++; + cache_maybe_yield(); + goto retry; + } + cache_free(ncp); + return (0); +out_no_entry: + SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); + counter_u64_add(nummisszap, 1); + return (0); +} + +/** + * Lookup a name in the name cache + * + * # Arguments + * + * - dvp: Parent directory in which to search. + * - vpp: Return argument. Will contain desired vnode on cache hit. + * - cnp: Parameters of the name search. The most interesting bits of + * the cn_flags field have the following meanings: + * - MAKEENTRY: If clear, free an entry from the cache rather than look + * it up. + * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." + * - tsp: Return storage for cache timestamp. On a successful (positive + * or negative) lookup, tsp will be filled with any timespec that + * was stored when this cache entry was created. However, it will + * be clear for "." entries. + * - ticks: Return storage for alternate cache timestamp. On a successful + * (positive or negative) lookup, it will contain the ticks value + * that was current when the cache entry was created, unless cnp + * was ".". + * + * # Returns + * + * - -1: A positive cache hit. vpp will contain the desired vnode. + * - ENOENT: A negative cache hit, or dvp was recycled out from under us due + * to a forced unmount. vpp will not be modified. If the entry + * is a whiteout, then the ISWHITEOUT flag will be set in + * cnp->cn_flags. + * - 0: A cache miss. vpp will not be modified. + * + * # Locking + * + * On a cache hit, vpp will be returned locked and ref'd. If we're looking up + * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the + * lock is not recursively acquired. 
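+ *
+ * # Example
+ *
+ * An illustrative sketch only (not part of the imported FreeBSD sources); a
+ * filesystem lookup routine typically consults the cache first and falls
+ * back to a real directory scan on a miss, exactly as vfs_cache_lookup()
+ * below does:
+ *
+ *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
+ *	if (error == 0)		/* miss: do the real lookup */
+ *		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
+ *	if (error == -1)	/* positive hit: *vpp is locked and ref'd */
+ *		return (0);
+ *	return (error);		/* ENOENT (negative hit) or other error */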
+ */ +int +cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, + struct timespec *tsp, int *ticksp) +{ + struct namecache_ts *ncp_ts; + struct namecache *ncp; + struct rwlock *blp; + struct mtx *dvlp; + uint32_t hash; + int error, ltype; + + if (__predict_false(!doingcache)) { + cnp->cn_flags &= ~MAKEENTRY; + return (0); + } + + counter_u64_add(numcalls, 1); + + if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) + return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); + + if ((cnp->cn_flags & MAKEENTRY) == 0) + return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); + +retry: + blp = NULL; + dvlp = NULL; + error = 0; + if (cnp->cn_namelen == 2 && + cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { + counter_u64_add(dotdothits, 1); + dvlp = VP2VNODELOCK(dvp); + mtx_lock(dvlp); + ncp = dvp->v_cache_dd; + if (ncp == NULL) { + SDT_PROBE3(vfs, namecache, lookup, miss, dvp, + "..", NULL); + mtx_unlock(dvlp); + return (0); + } + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { + if (ncp->nc_flag & NCF_NEGATIVE) + *vpp = NULL; + else + *vpp = ncp->nc_vp; + } else + *vpp = ncp->nc_dvp; + /* Return failure if negative entry was found. */ + if (*vpp == NULL) + goto negative_success; + CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", + dvp, cnp->cn_nameptr, *vpp); + SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", + *vpp); + cache_out_ts(ncp, tsp, ticksp); + if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == + NCF_DTS && tsp != NULL) { + ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); + *tsp = ncp_ts->nc_dotdottime; + } + goto success; + } + + hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); + blp = HASH2BUCKETLOCK(hash); + rw_rlock(blp); + + LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { + counter_u64_add(numchecks, 1); + if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && + !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) + break; + } + + /* We failed to find an entry */ + if (ncp == NULL) { + rw_runlock(blp); + SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, + NULL); + counter_u64_add(nummiss, 1); + return (0); + } + + /* We found a "positive" match, return the vnode */ + if (!(ncp->nc_flag & NCF_NEGATIVE)) { + counter_u64_add(numposhits, 1); + *vpp = ncp->nc_vp; + CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", + dvp, cnp->cn_nameptr, *vpp, ncp); + SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, + *vpp); + cache_out_ts(ncp, tsp, ticksp); + goto success; + } + +negative_success: + /* We found a negative match, and want to create it, so purge */ + if (cnp->cn_nameiop == CREATE) { + counter_u64_add(numnegzaps, 1); + goto zap_and_exit; + } + + counter_u64_add(numneghits, 1); + cache_negative_hit(ncp); + if (ncp->nc_flag & NCF_WHITE) + cnp->cn_flags |= ISWHITEOUT; + SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, + ncp->nc_name); + cache_out_ts(ncp, tsp, ticksp); + cache_lookup_unlock(blp, dvlp); + return (ENOENT); + +success: + /* + * On success we return a locked and ref'd vnode as per the lookup + * protocol. 
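+	 *
+	 * For a ".." lookup dvp is the child and *vpp its parent, so dvp is
+	 * unlocked before vget(*vpp) and relocked afterwards to preserve the
+	 * parent-before-child lock order; the VI_DOOMED check below catches a
+	 * forced unmount that doomed dvp while it was unlocked.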
+ */ + MPASS(dvp != *vpp); + ltype = 0; /* silence gcc warning */ + if (cnp->cn_flags & ISDOTDOT) { + ltype = VOP_ISLOCKED(dvp); + VOP_UNLOCK(dvp, 0); + } + vhold(*vpp); + cache_lookup_unlock(blp, dvlp); + error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread); + if (cnp->cn_flags & ISDOTDOT) { + vn_lock(dvp, ltype | LK_RETRY); + if (dvp->v_iflag & VI_DOOMED) { + if (error == 0) + vput(*vpp); + *vpp = NULL; + return (ENOENT); + } + } + if (error) { + *vpp = NULL; + goto retry; + } + if ((cnp->cn_flags & ISLASTCN) && + (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { + ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); + } + return (-1); + +zap_and_exit: + if (blp != NULL) + error = cache_zap_rlocked_bucket(ncp, blp); + else + error = cache_zap_locked_vnode(ncp, dvp); + if (error != 0) { + zap_and_exit_bucket_fail++; + cache_maybe_yield(); + goto retry; + } + cache_free(ncp); + return (0); +} + +struct celockstate { + struct mtx *vlp[3]; + struct rwlock *blp[2]; +}; +CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); +CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); + +static inline void +cache_celockstate_init(struct celockstate *cel) +{ + + bzero(cel, sizeof(*cel)); +} + +static void +cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, + struct vnode *dvp) +{ + struct mtx *vlp1, *vlp2; + + MPASS(cel->vlp[0] == NULL); + MPASS(cel->vlp[1] == NULL); + MPASS(cel->vlp[2] == NULL); + + MPASS(vp != NULL || dvp != NULL); + + vlp1 = VP2VNODELOCK(vp); + vlp2 = VP2VNODELOCK(dvp); + cache_sort(&vlp1, &vlp2); + + if (vlp1 != NULL) { + mtx_lock(vlp1); + cel->vlp[0] = vlp1; + } + mtx_lock(vlp2); + cel->vlp[1] = vlp2; +} + +static void +cache_unlock_vnodes_cel(struct celockstate *cel) +{ + + MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); + + if (cel->vlp[0] != NULL) + mtx_unlock(cel->vlp[0]); + if (cel->vlp[1] != NULL) + mtx_unlock(cel->vlp[1]); + if (cel->vlp[2] != NULL) + mtx_unlock(cel->vlp[2]); +} + +static bool +cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) +{ + struct mtx *vlp; + bool ret; + + cache_assert_vlp_locked(cel->vlp[0]); + cache_assert_vlp_locked(cel->vlp[1]); + MPASS(cel->vlp[2] == NULL); + + MPASS(vp != NULL); + vlp = VP2VNODELOCK(vp); + + ret = true; + if (vlp >= cel->vlp[1]) { + mtx_lock(vlp); + } else { + if (mtx_trylock(vlp)) + goto out; + cache_lock_vnodes_cel_3_failures++; + cache_unlock_vnodes_cel(cel); + if (vlp < cel->vlp[0]) { + mtx_lock(vlp); + mtx_lock(cel->vlp[0]); + mtx_lock(cel->vlp[1]); + } else { + if (cel->vlp[0] != NULL) + mtx_lock(cel->vlp[0]); + mtx_lock(vlp); + mtx_lock(cel->vlp[1]); + } + ret = false; + } +out: + cel->vlp[2] = vlp; + return (ret); +} + +static void +cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, + struct rwlock *blp2) +{ + + MPASS(cel->blp[0] == NULL); + MPASS(cel->blp[1] == NULL); + + cache_sort(&blp1, &blp2); + + if (blp1 != NULL) { + rw_wlock(blp1); + cel->blp[0] = blp1; + } + rw_wlock(blp2); + cel->blp[1] = blp2; +} + +static void +cache_unlock_buckets_cel(struct celockstate *cel) +{ + + if (cel->blp[0] != NULL) + rw_wunlock(cel->blp[0]); + rw_wunlock(cel->blp[1]); +} + +/* + * Lock part of the cache affected by the insertion. + * + * This means vnodelocks for dvp, vp and the relevant bucketlock. + * However, insertion can result in removal of an old entry. In this + * case we have an additional vnode and bucketlock pair to lock. If the + * entry is negative, ncelock is locked instead of the vnode. 
+ * + * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while + * preserving the locking order (smaller address first). + */ +static void +cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, + uint32_t hash) +{ + struct namecache *ncp; + struct rwlock *blps[2]; + + blps[0] = HASH2BUCKETLOCK(hash); + for (;;) { + blps[1] = NULL; + cache_lock_vnodes_cel(cel, dvp, vp); + if (vp == NULL || vp->v_type != VDIR) + break; + ncp = vp->v_cache_dd; + if (ncp == NULL) + break; + if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) + break; + MPASS(ncp->nc_dvp == vp); + blps[1] = NCP2BUCKETLOCK(ncp); + if (ncp->nc_flag & NCF_NEGATIVE) + break; + if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) + break; + /* + * All vnodes got re-locked. Re-validate the state and if + * nothing changed we are done. Otherwise restart. + */ + if (ncp == vp->v_cache_dd && + (ncp->nc_flag & NCF_ISDOTDOT) != 0 && + blps[1] == NCP2BUCKETLOCK(ncp) && + VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) + break; + cache_unlock_vnodes_cel(cel); + cel->vlp[0] = NULL; + cel->vlp[1] = NULL; + cel->vlp[2] = NULL; + } + cache_lock_buckets_cel(cel, blps[0], blps[1]); +} + +static void +cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, + uint32_t hash) +{ + struct namecache *ncp; + struct rwlock *blps[2]; + + blps[0] = HASH2BUCKETLOCK(hash); + for (;;) { + blps[1] = NULL; + cache_lock_vnodes_cel(cel, dvp, vp); + ncp = dvp->v_cache_dd; + if (ncp == NULL) + break; + if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) + break; + MPASS(ncp->nc_dvp == dvp); + blps[1] = NCP2BUCKETLOCK(ncp); + if (ncp->nc_flag & NCF_NEGATIVE) + break; + if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) + break; + if (ncp == dvp->v_cache_dd && + (ncp->nc_flag & NCF_ISDOTDOT) != 0 && + blps[1] == NCP2BUCKETLOCK(ncp) && + VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) + break; + cache_unlock_vnodes_cel(cel); + cel->vlp[0] = NULL; + cel->vlp[1] = NULL; + cel->vlp[2] = NULL; + } + cache_lock_buckets_cel(cel, blps[0], blps[1]); +} + +static void +cache_enter_unlock(struct celockstate *cel) +{ + + cache_unlock_buckets_cel(cel); + cache_unlock_vnodes_cel(cel); +} + +/* + * Add an entry to the cache. + */ +void +cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, + struct timespec *tsp, struct timespec *dtsp) +{ + struct celockstate cel; + struct namecache *ncp, *n2, *ndd; + struct namecache_ts *ncp_ts, *n2_ts; + struct nchashhead *ncpp; + struct neglist *neglist; + uint32_t hash; + int flag; + int len; + bool neg_locked; + int lnumcache; + + CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); + VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, + ("cache_enter: Adding a doomed vnode")); + VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp, + ("cache_enter: Doomed vnode used as src")); + + if (__predict_false(!doingcache)) + return; + + /* + * Avoid blowout in namecache entries. + */ + if (__predict_false(numcache >= desiredvnodes * ncsizefactor)) + return; + + cache_celockstate_init(&cel); + ndd = NULL; + ncp_ts = NULL; + flag = 0; + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) + return; + if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + len = cnp->cn_namelen; + hash = cache_get_hash(cnp->cn_nameptr, len, dvp); + cache_enter_lock_dd(&cel, dvp, vp, hash); + /* + * If dotdot entry already exists, just retarget it + * to new parent vnode, otherwise continue with new + * namecache entry allocation. 
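+			 * Retargeting covers, for instance, a directory that
+			 * has been moved under a different parent: when ".."
+			 * is entered again the existing entry is re-pointed
+			 * at the new parent vnode rather than reallocated.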
+ */ + if ((ncp = dvp->v_cache_dd) != NULL && + ncp->nc_flag & NCF_ISDOTDOT) { + KASSERT(ncp->nc_dvp == dvp, + ("wrong isdotdot parent")); + neg_locked = false; + if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) { + neglist = NCP2NEGLIST(ncp); + mtx_lock(&ncneg_hot.nl_lock); + mtx_lock(&neglist->nl_lock); + neg_locked = true; + } + if (!(ncp->nc_flag & NCF_NEGATIVE)) { + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, + ncp, nc_dst); + } else { + cache_negative_remove(ncp, true); + } + if (vp != NULL) { + TAILQ_INSERT_HEAD(&vp->v_cache_dst, + ncp, nc_dst); + ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE); + } else { + ncp->nc_flag &= ~(NCF_HOTNEGATIVE); + ncp->nc_flag |= NCF_NEGATIVE; + cache_negative_insert(ncp, true); + } + if (neg_locked) { + mtx_unlock(&neglist->nl_lock); + mtx_unlock(&ncneg_hot.nl_lock); + } + ncp->nc_vp = vp; + cache_enter_unlock(&cel); + return; + } + dvp->v_cache_dd = NULL; + cache_enter_unlock(&cel); + cache_celockstate_init(&cel); + SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp); + flag = NCF_ISDOTDOT; + } + } + + /* + * Calculate the hash key and setup as much of the new + * namecache entry as possible before acquiring the lock. + */ + ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); + ncp->nc_flag = flag; + ncp->nc_vp = vp; + if (vp == NULL) + ncp->nc_flag |= NCF_NEGATIVE; + ncp->nc_dvp = dvp; + if (tsp != NULL) { + ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); + ncp_ts->nc_time = *tsp; + ncp_ts->nc_ticks = ticks; + ncp_ts->nc_nc.nc_flag |= NCF_TS; + if (dtsp != NULL) { + ncp_ts->nc_dotdottime = *dtsp; + ncp_ts->nc_nc.nc_flag |= NCF_DTS; + } + } + len = ncp->nc_nlen = cnp->cn_namelen; + hash = cache_get_hash(cnp->cn_nameptr, len, dvp); + strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); + cache_enter_lock(&cel, dvp, vp, hash); + + /* + * See if this vnode or negative entry is already in the cache + * with this name. This can happen with concurrent lookups of + * the same path name. + */ + ncpp = NCHHASH(hash); + LIST_FOREACH(n2, ncpp, nc_hash) { + if (n2->nc_dvp == dvp && + n2->nc_nlen == cnp->cn_namelen && + !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { + if (tsp != NULL) { + KASSERT((n2->nc_flag & NCF_TS) != 0, + ("no NCF_TS")); + n2_ts = __containerof(n2, struct namecache_ts, nc_nc); + n2_ts->nc_time = ncp_ts->nc_time; + n2_ts->nc_ticks = ncp_ts->nc_ticks; + if (dtsp != NULL) { + n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; + if (ncp->nc_flag & NCF_NEGATIVE) + mtx_lock(&ncneg_hot.nl_lock); + n2_ts->nc_nc.nc_flag |= NCF_DTS; + if (ncp->nc_flag & NCF_NEGATIVE) + mtx_unlock(&ncneg_hot.nl_lock); + } + } + goto out_unlock_free; + } + } + + if (flag == NCF_ISDOTDOT) { + /* + * See if we are trying to add .. entry, but some other lookup + * has populated v_cache_dd pointer already. + */ + if (dvp->v_cache_dd != NULL) + goto out_unlock_free; + KASSERT(vp == NULL || vp->v_type == VDIR, + ("wrong vnode type %p", vp)); + dvp->v_cache_dd = ncp; + } + + if (vp != NULL) { + if (vp->v_type == VDIR) { + if (flag != NCF_ISDOTDOT) { + /* + * For this case, the cache entry maps both the + * directory name in it and the name ".." for the + * directory's parent. 
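+				 * For example, the entry created by looking
+				 * up "b" in /a also serves a later ".." lookup
+				 * issued from inside /a/b, resolving it back
+				 * to /a through vp->v_cache_dd.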
+ */ + if ((ndd = vp->v_cache_dd) != NULL) { + if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) + cache_zap_locked(ndd, false); + else + ndd = NULL; + } + vp->v_cache_dd = ncp; + } + } else { + vp->v_cache_dd = NULL; + } + } + + if (flag != NCF_ISDOTDOT) { + if (LIST_EMPTY(&dvp->v_cache_src)) { + vhold(dvp); + atomic_add_rel_long(&numcachehv, 1); + } + LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); + } + + /* + * Insert the new namecache entry into the appropriate chain + * within the cache entries table. + */ + LIST_INSERT_HEAD(ncpp, ncp, nc_hash); + + /* + * If the entry is "negative", we place it into the + * "negative" cache queue, otherwise, we place it into the + * destination vnode's cache entries queue. + */ + if (vp != NULL) { + TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); + SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, + vp); + } else { + if (cnp->cn_flags & ISWHITEOUT) + ncp->nc_flag |= NCF_WHITE; + cache_negative_insert(ncp, false); + SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, + ncp->nc_name); + } + cache_enter_unlock(&cel); + lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; + if (numneg * ncnegfactor > lnumcache) + cache_negative_zap_one(); + cache_free(ndd); + return; +out_unlock_free: + cache_enter_unlock(&cel); + cache_free(ncp); + return; +} + +static u_int +cache_roundup_2(u_int val) +{ + u_int res; + + for (res = 1; res <= val; res <<= 1) + continue; + + return (res); +} + +/* + * Name cache initialization, from vfs_init() when we are booting + */ +static void +nchinit(void *dummy __unused) +{ + u_int i; + + cache_zone_small = uma_zcreate("S VFS Cache", + sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1, + NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), + UMA_ZONE_ZINIT); + cache_zone_small_ts = uma_zcreate("STS VFS Cache", + sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1, + NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), + UMA_ZONE_ZINIT); + cache_zone_large = uma_zcreate("L VFS Cache", + sizeof(struct namecache) + NAME_MAX + 1, + NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache), + UMA_ZONE_ZINIT); + cache_zone_large_ts = uma_zcreate("LTS VFS Cache", + sizeof(struct namecache_ts) + NAME_MAX + 1, + NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), + UMA_ZONE_ZINIT); + + nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); + ncbuckethash = cache_roundup_2(mp_ncpus * 64) - 1; + if (ncbuckethash > nchash) + ncbuckethash = nchash; + bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, + M_WAITOK | M_ZERO); + for (i = 0; i < numbucketlocks; i++) + rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); + ncvnodehash = cache_roundup_2(mp_ncpus * 64) - 1; + vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, + M_WAITOK | M_ZERO); + for (i = 0; i < numvnodelocks; i++) + mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); + ncpurgeminvnodes = numbucketlocks; + + ncneghash = 3; + neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, + M_WAITOK | M_ZERO); + for (i = 0; i < numneglists; i++) { + mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); + TAILQ_INIT(&neglists[i].nl_list); + } + mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); + TAILQ_INIT(&ncneg_hot.nl_list); + + mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); + + numcalls = counter_u64_alloc(M_WAITOK); + dothits = counter_u64_alloc(M_WAITOK); + dotdothits = counter_u64_alloc(M_WAITOK); + numchecks = counter_u64_alloc(M_WAITOK); + nummiss = counter_u64_alloc(M_WAITOK); + 
nummisszap = counter_u64_alloc(M_WAITOK); + numposzaps = counter_u64_alloc(M_WAITOK); + numposhits = counter_u64_alloc(M_WAITOK); + numnegzaps = counter_u64_alloc(M_WAITOK); + numneghits = counter_u64_alloc(M_WAITOK); + numfullpathcalls = counter_u64_alloc(M_WAITOK); + numfullpathfail1 = counter_u64_alloc(M_WAITOK); + numfullpathfail2 = counter_u64_alloc(M_WAITOK); + numfullpathfail4 = counter_u64_alloc(M_WAITOK); + numfullpathfound = counter_u64_alloc(M_WAITOK); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); + +void +cache_changesize(int newmaxvnodes) +{ + struct nchashhead *new_nchashtbl, *old_nchashtbl; + u_long new_nchash, old_nchash; + struct namecache *ncp; + uint32_t hash; + int i; + + newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); + if (newmaxvnodes < numbucketlocks) + newmaxvnodes = numbucketlocks; + + new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash); + /* If same hash table size, nothing to do */ + if (nchash == new_nchash) { + free(new_nchashtbl, M_VFSCACHE); + return; + } + /* + * Move everything from the old hash table to the new table. + * None of the namecache entries in the table can be removed + * because to do so, they have to be removed from the hash table. + */ + cache_lock_all_vnodes(); + cache_lock_all_buckets(); + old_nchashtbl = nchashtbl; + old_nchash = nchash; + nchashtbl = new_nchashtbl; + nchash = new_nchash; + for (i = 0; i <= old_nchash; i++) { + while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) { + hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, + ncp->nc_dvp); + LIST_REMOVE(ncp, nc_hash); + LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); + } + } + cache_unlock_all_buckets(); + cache_unlock_all_vnodes(); + free(old_nchashtbl, M_VFSCACHE); +} + +/* + * Invalidate all entries from and to a particular vnode. + */ +void +cache_purge(struct vnode *vp) +{ + TAILQ_HEAD(, namecache) ncps; + struct namecache *ncp, *nnp; + struct mtx *vlp, *vlp2; + + CTR1(KTR_VFS, "cache_purge(%p)", vp); + SDT_PROBE1(vfs, namecache, purge, done, vp); + if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && + vp->v_cache_dd == NULL) + return; + TAILQ_INIT(&ncps); + vlp = VP2VNODELOCK(vp); + vlp2 = NULL; + mtx_lock(vlp); +retry: + while (!LIST_EMPTY(&vp->v_cache_src)) { + ncp = LIST_FIRST(&vp->v_cache_src); + if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) + goto retry; + TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); + } + while (!TAILQ_EMPTY(&vp->v_cache_dst)) { + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) + goto retry; + TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); + } + ncp = vp->v_cache_dd; + if (ncp != NULL) { + KASSERT(ncp->nc_flag & NCF_ISDOTDOT, + ("lost dotdot link")); + if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) + goto retry; + TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); + } + KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); + mtx_unlock(vlp); + if (vlp2 != NULL) + mtx_unlock(vlp2); + TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { + cache_free(ncp); + } +} + +/* + * Invalidate all negative entries for a particular directory vnode. 
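+ *
+ * A typical caller is a directory-modifying operation (e.g. a create or a
+ * rename into vp); purging here ensures that previously cached ENOENT
+ * answers for names in vp cannot mask an entry that was just added.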
+ */ +void +cache_purge_negative(struct vnode *vp) +{ + TAILQ_HEAD(, namecache) ncps; + struct namecache *ncp, *nnp; + struct mtx *vlp; + + CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); + SDT_PROBE1(vfs, namecache, purge_negative, done, vp); + if (LIST_EMPTY(&vp->v_cache_src)) + return; + TAILQ_INIT(&ncps); + vlp = VP2VNODELOCK(vp); + mtx_lock(vlp); + LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { + if (!(ncp->nc_flag & NCF_NEGATIVE)) + continue; + cache_zap_negative_locked_vnode_kl(ncp, vp); + TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); + } + mtx_unlock(vlp); + TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { + cache_free(ncp); + } +} + +/* + * Flush all entries referencing a particular filesystem. + */ +void +cache_purgevfs(struct mount *mp, bool force) +{ + TAILQ_HEAD(, namecache) ncps; + struct mtx *vlp1, *vlp2; + struct rwlock *blp; + struct nchashhead *bucket; + struct namecache *ncp, *nnp; + u_long i, j, n_nchash; + int error; + + /* Scan hash tables for applicable entries */ + SDT_PROBE1(vfs, namecache, purgevfs, done, mp); + if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) + return; + TAILQ_INIT(&ncps); + n_nchash = nchash + 1; + vlp1 = vlp2 = NULL; + for (i = 0; i < numbucketlocks; i++) { + blp = (struct rwlock *)&bucketlocks[i]; + rw_wlock(blp); + for (j = i; j < n_nchash; j += numbucketlocks) { +retry: + bucket = &nchashtbl[j]; + LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { + cache_assert_bucket_locked(ncp, RA_WLOCKED); + if (ncp->nc_dvp->v_mount != mp) + continue; + error = cache_zap_wlocked_bucket_kl(ncp, blp, + &vlp1, &vlp2); + if (error != 0) + goto retry; + TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); + } + } + rw_wunlock(blp); + if (vlp1 == NULL && vlp2 == NULL) + cache_maybe_yield(); + } + if (vlp1 != NULL) + mtx_unlock(vlp1); + if (vlp2 != NULL) + mtx_unlock(vlp2); + + TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { + cache_free(ncp); + } +} + +/* + * Perform canonical checks and cache lookup and pass on to filesystem + * through the vop_cachedlookup only if needed. + */ + +int +vfs_cache_lookup(struct vop_lookup_args *ap) +{ + struct vnode *dvp; + int error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + int flags = cnp->cn_flags; + + *vpp = NULL; + dvp = ap->a_dvp; + + if (dvp->v_type != VDIR) + return (ENOTDIR); + + if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) + return (EROFS); + + error = vn_dir_check_exec(dvp, cnp); + if (error != 0) + return (error); + + error = cache_lookup(dvp, vpp, cnp, NULL, NULL); + if (error == 0) + return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); + if (error == -1) + return (0); + return (error); +} + +/* + * XXX All of these sysctls would probably be more productive dead. + */ +static int __read_mostly disablecwd; +SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, + "Disable the getcwd syscall"); + +/* Implementation of the getcwd syscall. 
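+ * The bulk of the work is done by kern___getcwd(), which resolves the
+ * calling process's current directory (fd_cdir) against its root (fd_rdir)
+ * through vn_fullpath1() and the name cache.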
*/ +int +sys___getcwd(struct thread *td, struct __getcwd_args *uap) +{ + + return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen, + MAXPATHLEN)); +} + +int +kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, size_t buflen, + size_t path_max) +{ + char *bp, *tmpbuf; + struct filedesc *fdp; + struct vnode *cdir, *rdir; + int error; + + if (__predict_false(disablecwd)) + return (ENODEV); + if (__predict_false(buflen < 2)) + return (EINVAL); + if (buflen > path_max) + buflen = path_max; + + tmpbuf = malloc(buflen, M_TEMP, M_WAITOK); + fdp = td->td_proc->p_fd; + FILEDESC_SLOCK(fdp); + cdir = fdp->fd_cdir; + vrefact(cdir); + rdir = fdp->fd_rdir; + vrefact(rdir); + FILEDESC_SUNLOCK(fdp); + error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen); + vrele(rdir); + vrele(cdir); + + if (!error) { + if (bufseg == UIO_SYSSPACE) + bcopy(bp, buf, strlen(bp) + 1); + else + error = copyout(bp, buf, strlen(bp) + 1); +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_NAMEI)) + ktrnamei(bp); +#endif + } + free(tmpbuf, M_TEMP); + return (error); +} + +/* + * Thus begins the fullpath magic. + */ + +static int __read_mostly disablefullpath; +SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, + "Disable the vn_fullpath function"); + +/* + * Retrieve the full filesystem path that correspond to a vnode from the name + * cache (if available) + */ +int +vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) +{ + char *buf; + struct filedesc *fdp; + struct vnode *rdir; + int error; + + if (__predict_false(disablefullpath)) + return (ENODEV); + if (__predict_false(vn == NULL)) + return (EINVAL); + + buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + fdp = td->td_proc->p_fd; + FILEDESC_SLOCK(fdp); + rdir = fdp->fd_rdir; + vrefact(rdir); + FILEDESC_SUNLOCK(fdp); + error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN); + vrele(rdir); + + if (!error) + *freebuf = buf; + else + free(buf, M_TEMP); + return (error); +} + +/* + * This function is similar to vn_fullpath, but it attempts to lookup the + * pathname relative to the global root mount point. This is required for the + * auditing sub-system, as audited pathnames must be absolute, relative to the + * global root mount point. 
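+ *
+ * Both vn_fullpath() and vn_fullpath_global() share the same calling
+ * convention; an illustrative sketch (not part of the imported sources):
+ *
+ *	char *fullpath, *freebuf;
+ *
+ *	error = vn_fullpath_global(curthread, vp, &fullpath, &freebuf);
+ *	if (error == 0) {
+ *		printf("%s\n", fullpath);
+ *		free(freebuf, M_TEMP);	/* fullpath points into freebuf */
+ *	}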
+ */ +int +vn_fullpath_global(struct thread *td, struct vnode *vn, + char **retbuf, char **freebuf) +{ + char *buf; + int error; + + if (__predict_false(disablefullpath)) + return (ENODEV); + if (__predict_false(vn == NULL)) + return (EINVAL); + buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN); + if (!error) + *freebuf = buf; + else + free(buf, M_TEMP); + return (error); +} + +int +vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen) +{ + struct vnode *dvp; + struct namecache *ncp; + struct mtx *vlp; + int error; + + vlp = VP2VNODELOCK(*vp); + mtx_lock(vlp); + TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { + if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) + break; + } + if (ncp != NULL) { + if (*buflen < ncp->nc_nlen) { + mtx_unlock(vlp); + vrele(*vp); + counter_u64_add(numfullpathfail4, 1); + error = ENOMEM; + SDT_PROBE3(vfs, namecache, fullpath, return, error, + vp, NULL); + return (error); + } + *buflen -= ncp->nc_nlen; + memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); + SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, + ncp->nc_name, vp); + dvp = *vp; + *vp = ncp->nc_dvp; + vref(*vp); + mtx_unlock(vlp); + vrele(dvp); + return (0); + } + SDT_PROBE1(vfs, namecache, fullpath, miss, vp); + + mtx_unlock(vlp); + vn_lock(*vp, LK_SHARED | LK_RETRY); + error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); + vput(*vp); + if (error) { + counter_u64_add(numfullpathfail2, 1); + SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); + return (error); + } + + *vp = dvp; + if (dvp->v_iflag & VI_DOOMED) { + /* forced unmount */ + vrele(dvp); + error = ENOENT; + SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); + return (error); + } + /* + * *vp has its use count incremented still. + */ + + return (0); +} + +/* + * The magic behind kern___getcwd() and vn_fullpath(). + */ +static int +vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, + char *buf, char **retbuf, u_int buflen) +{ + int error, slash_prefixed; +#ifdef KDTRACE_HOOKS + struct vnode *startvp = vp; +#endif + struct vnode *vp1; + + buflen--; + buf[buflen] = '\0'; + error = 0; + slash_prefixed = 0; + + SDT_PROBE1(vfs, namecache, fullpath, entry, vp); + counter_u64_add(numfullpathcalls, 1); + vref(vp); + if (vp->v_type != VDIR) { + error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); + if (error) + return (error); + if (buflen == 0) { + vrele(vp); + return (ENOMEM); + } + buf[--buflen] = '/'; + slash_prefixed = 1; + } + while (vp != rdir && vp != rootvnode) { + /* + * The vp vnode must be already fully constructed, + * since it is either found in namecache or obtained + * from VOP_VPTOCNP(). We may test for VV_ROOT safely + * without obtaining the vnode lock. + */ + if ((vp->v_vflag & VV_ROOT) != 0) { + vn_lock(vp, LK_RETRY | LK_SHARED); + + /* + * With the vnode locked, check for races with + * unmount, forced or not. Note that we + * already verified that vp is not equal to + * the root vnode, which means that + * mnt_vnodecovered can be NULL only for the + * case of unmount. 
+ */ + if ((vp->v_iflag & VI_DOOMED) != 0 || + (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || + vp1->v_mountedhere != vp->v_mount) { + vput(vp); + error = ENOENT; + SDT_PROBE3(vfs, namecache, fullpath, return, + error, vp, NULL); + break; + } + + vref(vp1); + vput(vp); + vp = vp1; + continue; + } + if (vp->v_type != VDIR) { + vrele(vp); + counter_u64_add(numfullpathfail1, 1); + error = ENOTDIR; + SDT_PROBE3(vfs, namecache, fullpath, return, + error, vp, NULL); + break; + } + error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); + if (error) + break; + if (buflen == 0) { + vrele(vp); + error = ENOMEM; + SDT_PROBE3(vfs, namecache, fullpath, return, error, + startvp, NULL); + break; + } + buf[--buflen] = '/'; + slash_prefixed = 1; + } + if (error) + return (error); + if (!slash_prefixed) { + if (buflen == 0) { + vrele(vp); + counter_u64_add(numfullpathfail4, 1); + SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, + startvp, NULL); + return (ENOMEM); + } + buf[--buflen] = '/'; + } + counter_u64_add(numfullpathfound, 1); + vrele(vp); + + SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen); + *retbuf = buf + buflen; + return (0); +} + +struct vnode * +vn_dir_dd_ino(struct vnode *vp) +{ + struct namecache *ncp; + struct vnode *ddvp; + struct mtx *vlp; + + ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); + vlp = VP2VNODELOCK(vp); + mtx_lock(vlp); + TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) + continue; + ddvp = ncp->nc_dvp; + vhold(ddvp); + mtx_unlock(vlp); + if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread)) + return (NULL); + return (ddvp); + } + mtx_unlock(vlp); + return (NULL); +} + +int +vn_commname(struct vnode *vp, char *buf, u_int buflen) +{ + struct namecache *ncp; + struct mtx *vlp; + int l; + + vlp = VP2VNODELOCK(vp); + mtx_lock(vlp); + TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) + if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) + break; + if (ncp == NULL) { + mtx_unlock(vlp); + return (ENOENT); + } + l = min(ncp->nc_nlen, buflen - 1); + memcpy(buf, ncp->nc_name, l); + mtx_unlock(vlp); + buf[l] = '\0'; + return (0); +} + +/* ABI compat shims for old kernel modules. */ +#undef cache_enter + +void cache_enter(struct vnode *dvp, struct vnode *vp, + struct componentname *cnp); + +void +cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) +{ + + cache_enter_time(dvp, vp, cnp, NULL, NULL); +} + +/* + * This function updates path string to vnode's full global path + * and checks the size of the new path string against the pathlen argument. + * + * Requires a locked, referenced vnode. + * Vnode is re-locked on success or ENODEV, otherwise unlocked. + * + * If sysctl debug.disablefullpath is set, ENODEV is returned, + * vnode is left locked and path remain untouched. + * + * If vp is a directory, the call to vn_fullpath_global() always succeeds + * because it falls back to the ".." lookup if the namecache lookup fails. + */ +int +vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, + u_int pathlen) +{ + struct nameidata nd; + struct vnode *vp1; + char *rpath, *fbuf; + int error; + + ASSERT_VOP_ELOCKED(vp, __func__); + + /* Return ENODEV if sysctl debug.disablefullpath==1 */ + if (__predict_false(disablefullpath)) + return (ENODEV); + + /* Construct global filesystem path from vp. 
*/ + VOP_UNLOCK(vp, 0); + error = vn_fullpath_global(td, vp, &rpath, &fbuf); + + if (error != 0) { + vrele(vp); + return (error); + } + + if (strlen(rpath) >= pathlen) { + vrele(vp); + error = ENAMETOOLONG; + goto out; + } + + /* + * Re-lookup the vnode by path to detect a possible rename. + * As a side effect, the vnode is relocked. + * If vnode was renamed, return ENOENT. + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, + UIO_SYSSPACE, path, td); + error = namei(&nd); + if (error != 0) { + vrele(vp); + goto out; + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vp1 = nd.ni_vp; + vrele(vp); + if (vp1 == vp) + strcpy(path, rpath); + else { + vput(vp1); + error = ENOENT; + } + +out: + free(fbuf, M_TEMP); + return (error); +} + +#ifdef DDB +static void +db_print_vpath(struct vnode *vp) +{ + + while (vp != NULL) { + db_printf("%p: ", vp); + if (vp == rootvnode) { + db_printf("/"); + vp = NULL; + } else { + if (vp->v_vflag & VV_ROOT) { + db_printf(""); + vp = vp->v_mount->mnt_vnodecovered; + } else { + struct namecache *ncp; + char *ncn; + int i; + + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (ncp != NULL) { + ncn = ncp->nc_name; + for (i = 0; i < ncp->nc_nlen; i++) + db_printf("%c", *ncn++); + vp = ncp->nc_dvp; + } else { + vp = NULL; + } + } + } + db_printf("\n"); + } + + return; +} + +DB_SHOW_COMMAND(vpath, db_show_vpath) +{ + struct vnode *vp; + + if (!have_addr) { + db_printf("usage: show vpath \n"); + return; + } + + vp = (struct vnode *)addr; + db_print_vpath(vp); +} + +#endif diff --git a/freebsd/sys/kern/vfs_cluster.c b/freebsd/sys/kern/vfs_cluster.c new file mode 100644 index 00000000..1ebe4a56 --- /dev/null +++ b/freebsd/sys/kern/vfs_cluster.c @@ -0,0 +1,1086 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * Modifications/enhancements: + * Copyright (c) 1995 John S. Dyson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_debug_cluster.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CLUSTERDEBUG) +static int rcluster= 0; +SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, + "Debug VFS clustering code"); +#endif + +static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer"); + +static struct cluster_save *cluster_collectbufs(struct vnode *vp, + struct buf *last_bp, int gbflags); +static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize, + daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, + struct buf *fbp); +static void cluster_callback(struct buf *); + +static int write_behind = 1; +SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, + "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); + +static int read_max = 64; +SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0, + "Cluster read-ahead max block count"); + +static int read_min = 1; +SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0, + "Cluster read min block count"); + +/* + * Read data to a buf, including read-ahead if we find this to be beneficial. + * cluster_read replaces bread. + */ +int +cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, + struct ucred *cred, long totread, int seqcount, int gbflags, + struct buf **bpp) +{ + struct buf *bp, *rbp, *reqbp; + struct bufobj *bo; + struct thread *td; + daddr_t blkno, origblkno; + int maxra, racluster; + int error, ncontig; + int i; + + error = 0; + td = curthread; + bo = &vp->v_bufobj; + if (!unmapped_buf_allowed) + gbflags &= ~GB_UNMAPPED; + + /* + * Try to limit the amount of read-ahead by a few + * ad-hoc parameters. This needs work!!! + */ + racluster = vp->v_mount->mnt_iosize_max / size; + maxra = seqcount; + maxra = min(read_max, maxra); + maxra = min(nbuf/8, maxra); + if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) + maxra = (filesize / size) - lblkno; + + /* + * get the requested block + */ + error = getblkx(vp, lblkno, size, 0, 0, gbflags, &bp); + if (error != 0) { + *bpp = NULL; + return (error); + } + gbflags &= ~GB_NOSPARSE; + origblkno = lblkno; + *bpp = reqbp = bp; + + /* + * if it is in the cache, then check to see if the reads have been + * sequential. If they have, then try some read-ahead, otherwise + * back-off on prospective read-aheads. + */ + if (bp->b_flags & B_CACHE) { + if (!seqcount) { + return 0; + } else if ((bp->b_flags & B_RAM) == 0) { + return 0; + } else { + bp->b_flags &= ~B_RAM; + BO_RLOCK(bo); + for (i = 1; i < maxra; i++) { + /* + * Stop if the buffer does not exist or it + * is invalid (about to go away?) + */ + rbp = gbincore(&vp->v_bufobj, lblkno+i); + if (rbp == NULL || (rbp->b_flags & B_INVAL)) + break; + + /* + * Set another read-ahead mark so we know + * to check again. (If we can lock the + * buffer without waiting) + */ + if ((((i % racluster) == (racluster - 1)) || + (i == (maxra - 1))) + && (0 == BUF_LOCK(rbp, + LK_EXCLUSIVE | LK_NOWAIT, NULL))) { + rbp->b_flags |= B_RAM; + BUF_UNLOCK(rbp); + } + } + BO_RUNLOCK(bo); + if (i >= maxra) { + return 0; + } + lblkno += i; + } + reqbp = bp = NULL; + /* + * If it isn't in the cache, then get a chunk from + * disk if sequential, otherwise just get the block. 
+ */ + } else { + off_t firstread = bp->b_offset; + int nblks; + long minread; + + KASSERT(bp->b_offset != NOOFFSET, + ("cluster_read: no buffer offset")); + + ncontig = 0; + + /* + * Adjust totread if needed + */ + minread = read_min * size; + if (minread > totread) + totread = minread; + + /* + * Compute the total number of blocks that we should read + * synchronously. + */ + if (firstread + totread > filesize) + totread = filesize - firstread; + nblks = howmany(totread, size); + if (nblks > racluster) + nblks = racluster; + + /* + * Now compute the number of contiguous blocks. + */ + if (nblks > 1) { + error = VOP_BMAP(vp, lblkno, NULL, + &blkno, &ncontig, NULL); + /* + * If this failed to map just do the original block. + */ + if (error || blkno == -1) + ncontig = 0; + } + + /* + * If we have contiguous data available do a cluster + * otherwise just read the requested block. + */ + if (ncontig) { + /* Account for our first block. */ + ncontig = min(ncontig + 1, nblks); + if (ncontig < nblks) + nblks = ncontig; + bp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, nblks, gbflags, bp); + lblkno += (bp->b_bufsize / size); + } else { + bp->b_flags |= B_RAM; + bp->b_iocmd = BIO_READ; + lblkno += 1; + } + } + + /* + * handle the synchronous read so that it is available ASAP. + */ + if (bp) { + if ((bp->b_flags & B_CLUSTER) == 0) { + vfs_busy_pages(bp, 0); + } + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) + BUF_KERNPROC(bp); + bp->b_iooffset = dbtob(bp->b_blkno); + bstrategy(bp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_add_buf(td->td_proc, bp, 0); + PROC_UNLOCK(td->td_proc); + } +#endif /* RACCT */ + td->td_ru.ru_inblock++; + } + + /* + * If we have been doing sequential I/O, then do some read-ahead. + */ + while (lblkno < (origblkno + maxra)) { + error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); + if (error) + break; + + if (blkno == -1) + break; + + /* + * We could throttle ncontig here by maxra but we might as + * well read the data if it is contiguous. We're throttled + * by racluster anyway. + */ + if (ncontig) { + ncontig = min(ncontig + 1, racluster); + rbp = cluster_rbuild(vp, filesize, lblkno, blkno, + size, ncontig, gbflags, NULL); + lblkno += (rbp->b_bufsize / size); + if (rbp->b_flags & B_DELWRI) { + bqrelse(rbp); + continue; + } + } else { + rbp = getblk(vp, lblkno, size, 0, 0, gbflags); + lblkno += 1; + if (rbp->b_flags & B_DELWRI) { + bqrelse(rbp); + continue; + } + rbp->b_flags |= B_ASYNC | B_RAM; + rbp->b_iocmd = BIO_READ; + rbp->b_blkno = blkno; + } + if (rbp->b_flags & B_CACHE) { + rbp->b_flags &= ~B_ASYNC; + bqrelse(rbp); + continue; + } + if ((rbp->b_flags & B_CLUSTER) == 0) { + vfs_busy_pages(rbp, 0); + } + rbp->b_flags &= ~B_INVAL; + rbp->b_ioflags &= ~BIO_ERROR; + if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) + BUF_KERNPROC(rbp); + rbp->b_iooffset = dbtob(rbp->b_blkno); + bstrategy(rbp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(td->td_proc); + racct_add_buf(td->td_proc, rbp, 0); + PROC_UNLOCK(td->td_proc); + } +#endif /* RACCT */ + td->td_ru.ru_inblock++; + } + + if (reqbp) { + /* + * Like bread, always brelse() the buffer when + * returning an error. + */ + error = bufwait(reqbp); + if (error != 0) { + brelse(reqbp); + *bpp = NULL; + } + } + return (error); +} + +/* + * If blocks are contiguous on disk, use this to provide clustered + * read ahead. 
We will read as many blocks as possible sequentially + * and then parcel them up into logical blocks in the buffer hash table. + */ +static struct buf * +cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, + daddr_t blkno, long size, int run, int gbflags, struct buf *fbp) +{ + struct buf *bp, *tbp; + daddr_t bn; + off_t off; + long tinc, tsize; + int i, inc, j, k, toff; + + KASSERT(size == vp->v_mount->mnt_stat.f_iosize, + ("cluster_rbuild: size %ld != f_iosize %jd\n", + size, (intmax_t)vp->v_mount->mnt_stat.f_iosize)); + + /* + * avoid a division + */ + while ((u_quad_t) size * (lbn + run) > filesize) { + --run; + } + + if (fbp) { + tbp = fbp; + tbp->b_iocmd = BIO_READ; + } else { + tbp = getblk(vp, lbn, size, 0, 0, gbflags); + if (tbp->b_flags & B_CACHE) + return tbp; + tbp->b_flags |= B_ASYNC | B_RAM; + tbp->b_iocmd = BIO_READ; + } + tbp->b_blkno = blkno; + if( (tbp->b_flags & B_MALLOC) || + ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) + return tbp; + + bp = trypbuf(&cluster_pbuf_freecnt); + if (bp == NULL) + return tbp; + + /* + * We are synthesizing a buffer out of vm_page_t's, but + * if the block size is not page aligned then the starting + * address may not be either. Inherit the b_data offset + * from the original buffer. + */ + bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; + if ((gbflags & GB_UNMAPPED) != 0) { + bp->b_data = unmapped_buf; + } else { + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + } + bp->b_iocmd = BIO_READ; + bp->b_iodone = cluster_callback; + bp->b_blkno = blkno; + bp->b_lblkno = lbn; + bp->b_offset = tbp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); + pbgetvp(vp, bp); + + TAILQ_INIT(&bp->b_cluster.cluster_head); + + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + + inc = btodb(size); + for (bn = blkno, i = 0; i < run; ++i, bn += inc) { + if (i == 0) { + VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); + vfs_drain_busy_pages(tbp); + vm_object_pip_add(tbp->b_bufobj->bo_object, + tbp->b_npages); + for (k = 0; k < tbp->b_npages; k++) + vm_page_sbusy(tbp->b_pages[k]); + VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); + } else { + if ((bp->b_npages * PAGE_SIZE) + + round_page(size) > vp->v_mount->mnt_iosize_max) { + break; + } + + tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT | + (gbflags & GB_UNMAPPED)); + + /* Don't wait around for locked bufs. */ + if (tbp == NULL) + break; + + /* + * Stop scanning if the buffer is fully valid + * (marked B_CACHE), or locked (may be doing a + * background write), or if the buffer is not + * VMIO backed. The clustering code can only deal + * with VMIO-backed buffers. The bo lock is not + * required for the BKGRDINPROG check since it + * can not be set without the buf lock. + */ + if ((tbp->b_vflags & BV_BKGRDINPROG) || + (tbp->b_flags & B_CACHE) || + (tbp->b_flags & B_VMIO) == 0) { + bqrelse(tbp); + break; + } + + /* + * The buffer must be completely invalid in order to + * take part in the cluster. If it is partially valid + * then we stop. 
+ */ + off = tbp->b_offset; + tsize = size; + VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); + for (j = 0; tsize > 0; j++) { + toff = off & PAGE_MASK; + tinc = tsize; + if (toff + tinc > PAGE_SIZE) + tinc = PAGE_SIZE - toff; + VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object); + if ((tbp->b_pages[j]->valid & + vm_page_bits(toff, tinc)) != 0) + break; + if (vm_page_xbusied(tbp->b_pages[j])) + break; + vm_object_pip_add(tbp->b_bufobj->bo_object, 1); + vm_page_sbusy(tbp->b_pages[j]); + off += tinc; + tsize -= tinc; + } + if (tsize > 0) { +clean_sbusy: + vm_object_pip_add(tbp->b_bufobj->bo_object, -j); + for (k = 0; k < j; k++) + vm_page_sunbusy(tbp->b_pages[k]); + VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); + bqrelse(tbp); + break; + } + VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); + + /* + * Set a read-ahead mark as appropriate + */ + if ((fbp && (i == 1)) || (i == (run - 1))) + tbp->b_flags |= B_RAM; + + /* + * Set the buffer up for an async read (XXX should + * we do this only if we do not wind up brelse()ing?). + * Set the block number if it isn't set, otherwise + * if it is make sure it matches the block number we + * expect. + */ + tbp->b_flags |= B_ASYNC; + tbp->b_iocmd = BIO_READ; + if (tbp->b_blkno == tbp->b_lblkno) { + tbp->b_blkno = bn; + } else if (tbp->b_blkno != bn) { + VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); + goto clean_sbusy; + } + } + /* + * XXX fbp from caller may not be B_ASYNC, but we are going + * to biodone() it in cluster_callback() anyway + */ + BUF_KERNPROC(tbp); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); + for (j = 0; j < tbp->b_npages; j += 1) { + vm_page_t m; + m = tbp->b_pages[j]; + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages-1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + if (m->valid == VM_PAGE_BITS_ALL) + tbp->b_pages[j] = bogus_page; + } + VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); + /* + * Don't inherit tbp->b_bufsize as it may be larger due to + * a non-page-aligned size. Instead just aggregate using + * 'size'. + */ + if (tbp->b_bcount != size) + printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); + if (tbp->b_bufsize != size) + printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); + bp->b_bcount += size; + bp->b_bufsize += size; + } + + /* + * Fully valid pages in the cluster are already good and do not need + * to be re-read from disk. Replace the page with bogus_page + */ + VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + for (j = 0; j < bp->b_npages; j++) { + VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object); + if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL) + bp->b_pages[j] = bogus_page; + } + VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); + if (bp->b_bufsize > bp->b_kvasize) + panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + + if (buf_mapped(bp)) { + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + } + return (bp); +} + +/* + * Cleanup after a clustered read or write. + * This is complicated by the fact that any of the buffers might have + * extra memory (if there were no empty buffer headers at allocbuf time) + * that we will need to shift around. + */ +static void +cluster_callback(struct buf *bp) +{ + struct buf *nbp, *tbp; + int error = 0; + + /* + * Must propagate errors to all the components. 
+ */ + if (bp->b_ioflags & BIO_ERROR) + error = bp->b_error; + + if (buf_mapped(bp)) { + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), + bp->b_npages); + } + /* + * Move memory from the large cluster buffer into the component + * buffers and mark IO as done on these. + */ + for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); + tbp; tbp = nbp) { + nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); + if (error) { + tbp->b_ioflags |= BIO_ERROR; + tbp->b_error = error; + } else { + tbp->b_dirtyoff = tbp->b_dirtyend = 0; + tbp->b_flags &= ~B_INVAL; + tbp->b_ioflags &= ~BIO_ERROR; + /* + * XXX the bdwrite()/bqrelse() issued during + * cluster building clears B_RELBUF (see bqrelse() + * comment). If direct I/O was specified, we have + * to restore it here to allow the buffer and VM + * to be freed. + */ + if (tbp->b_flags & B_DIRECT) + tbp->b_flags |= B_RELBUF; + } + bufdone(tbp); + } + pbrelvp(bp); + relpbuf(bp, &cluster_pbuf_freecnt); +} + +/* + * cluster_wbuild_wb: + * + * Implement modified write build for cluster. + * + * write_behind = 0 write behind disabled + * write_behind = 1 write behind normal (default) + * write_behind = 2 write behind backed-off + */ + +static __inline int +cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len, + int gbflags) +{ + int r = 0; + + switch (write_behind) { + case 2: + if (start_lbn < len) + break; + start_lbn -= len; + /* FALLTHROUGH */ + case 1: + r = cluster_wbuild(vp, size, start_lbn, len, gbflags); + /* FALLTHROUGH */ + default: + /* FALLTHROUGH */ + break; + } + return(r); +} + +/* + * Do clustered write for FFS. + * + * Three cases: + * 1. Write is not sequential (write asynchronously) + * Write is sequential: + * 2. beginning of cluster - begin cluster + * 3. middle of a cluster - add to cluster + * 4. end of a cluster - asynchronously write cluster + */ +void +cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount, + int gbflags) +{ + daddr_t lbn; + int maxclen, cursize; + int lblocksize; + int async; + + if (!unmapped_buf_allowed) + gbflags &= ~GB_UNMAPPED; + + if (vp->v_type == VREG) { + async = DOINGASYNC(vp); + lblocksize = vp->v_mount->mnt_stat.f_iosize; + } else { + async = 0; + lblocksize = bp->b_bufsize; + } + lbn = bp->b_lblkno; + KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); + + /* Initialize vnode to beginning of file. */ + if (lbn == 0) + vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + + if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || + (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { + maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; + if (vp->v_clen != 0) { + /* + * Next block is not sequential. + * + * If we are not writing at end of file, the process + * seeked to another point in the file since its last + * write, or we have reached our maximum cluster size, + * then push the previous cluster. Otherwise try + * reallocating to make it sequential. + * + * Change to algorithm: only push previous cluster if + * it was sequential from the point of view of the + * seqcount heuristic, otherwise leave the buffer + * intact so we can potentially optimize the I/O + * later on in the buf_daemon or update daemon + * flush. 
+ */ + cursize = vp->v_lastw - vp->v_cstart + 1; + if (((u_quad_t) bp->b_offset + lblocksize) != filesize || + lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { + if (!async && seqcount > 0) { + cluster_wbuild_wb(vp, lblocksize, + vp->v_cstart, cursize, gbflags); + } + } else { + struct buf **bpp, **endbp; + struct cluster_save *buflist; + + buflist = cluster_collectbufs(vp, bp, gbflags); + if (buflist == NULL) { + /* + * Cluster build failed so just write + * it now. + */ + bawrite(bp); + return; + } + endbp = &buflist->bs_children + [buflist->bs_nchildren - 1]; + if (VOP_REALLOCBLKS(vp, buflist)) { + /* + * Failed, push the previous cluster + * if *really* writing sequentially + * in the logical file (seqcount > 1), + * otherwise delay it in the hopes that + * the low level disk driver can + * optimize the write ordering. + */ + for (bpp = buflist->bs_children; + bpp < endbp; bpp++) + brelse(*bpp); + free(buflist, M_SEGMENT); + if (seqcount > 1) { + cluster_wbuild_wb(vp, + lblocksize, vp->v_cstart, + cursize, gbflags); + } + } else { + /* + * Succeeded, keep building cluster. + */ + for (bpp = buflist->bs_children; + bpp <= endbp; bpp++) + bdwrite(*bpp); + free(buflist, M_SEGMENT); + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; + return; + } + } + } + /* + * Consider beginning a cluster. If at end of file, make + * cluster as large as possible, otherwise find size of + * existing cluster. + */ + if ((vp->v_type == VREG) && + ((u_quad_t) bp->b_offset + lblocksize) != filesize && + (bp->b_blkno == bp->b_lblkno) && + (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || + bp->b_blkno == -1)) { + bawrite(bp); + vp->v_clen = 0; + vp->v_lasta = bp->b_blkno; + vp->v_cstart = lbn + 1; + vp->v_lastw = lbn; + return; + } + vp->v_clen = maxclen; + if (!async && maxclen == 0) { /* I/O not contiguous */ + vp->v_cstart = lbn + 1; + bawrite(bp); + } else { /* Wait for rest of cluster */ + vp->v_cstart = lbn; + bdwrite(bp); + } + } else if (lbn == vp->v_cstart + vp->v_clen) { + /* + * At end of cluster, write it out if seqcount tells us we + * are operating sequentially, otherwise let the buf or + * update daemon handle it. + */ + bdwrite(bp); + if (seqcount > 1) { + cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, + vp->v_clen + 1, gbflags); + } + vp->v_clen = 0; + vp->v_cstart = lbn + 1; + } else if (vm_page_count_severe()) { + /* + * We are low on memory, get it going NOW + */ + bawrite(bp); + } else { + /* + * In the middle of a cluster, so just delay the I/O for now. + */ + bdwrite(bp); + } + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; +} + + +/* + * This is an awful lot like cluster_rbuild...wish they could be combined. + * The last lbn argument is the current block on which I/O is being + * performed. Check to see that it doesn't fall in the middle of + * the current block (if last_bp == NULL). + */ +int +cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, + int gbflags) +{ + struct buf *bp, *tbp; + struct bufobj *bo; + int i, j; + int totalwritten = 0; + int dbsize = btodb(size); + + if (!unmapped_buf_allowed) + gbflags &= ~GB_UNMAPPED; + + bo = &vp->v_bufobj; + while (len > 0) { + /* + * If the buffer is not delayed-write (i.e. dirty), or it + * is delayed-write but either locked or inval, it cannot + * partake in the clustered write. 
+ */ + BO_LOCK(bo); + if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL || + (tbp->b_vflags & BV_BKGRDINPROG)) { + BO_UNLOCK(bo); + ++start_lbn; + --len; + continue; + } + if (BUF_LOCK(tbp, + LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) { + ++start_lbn; + --len; + continue; + } + if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) { + BUF_UNLOCK(tbp); + ++start_lbn; + --len; + continue; + } + bremfree(tbp); + tbp->b_flags &= ~B_DONE; + + /* + * Extra memory in the buffer, punt on this buffer. + * XXX we could handle this in most cases, but we would + * have to push the extra memory down to after our max + * possible cluster size and then potentially pull it back + * up if the cluster was terminated prematurely--too much + * hassle. + */ + if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != + (B_CLUSTEROK | B_VMIO)) || + (tbp->b_bcount != tbp->b_bufsize) || + (tbp->b_bcount != size) || + (len == 1) || + ((bp = (vp->v_vflag & VV_MD) != 0 ? + trypbuf(&cluster_pbuf_freecnt) : + getpbuf(&cluster_pbuf_freecnt)) == NULL)) { + totalwritten += tbp->b_bufsize; + bawrite(tbp); + ++start_lbn; + --len; + continue; + } + + /* + * We got a pbuf to make the cluster in. + * so initialise it. + */ + TAILQ_INIT(&bp->b_cluster.cluster_head); + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + if (tbp->b_wcred != NOCRED) + bp->b_wcred = crhold(tbp->b_wcred); + + bp->b_blkno = tbp->b_blkno; + bp->b_lblkno = tbp->b_lblkno; + bp->b_offset = tbp->b_offset; + + /* + * We are synthesizing a buffer out of vm_page_t's, but + * if the block size is not page aligned then the starting + * address may not be either. Inherit the b_data offset + * from the original buffer. + */ + if ((gbflags & GB_UNMAPPED) == 0 || + (tbp->b_flags & B_VMIO) == 0) { + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + } else { + bp->b_data = unmapped_buf; + } + bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | + B_NEEDCOMMIT)); + bp->b_iodone = cluster_callback; + pbgetvp(vp, bp); + /* + * From this location in the file, scan forward to see + * if there are buffers with adjacent data that need to + * be written as well. + */ + for (i = 0; i < len; ++i, ++start_lbn) { + if (i != 0) { /* If not the first buffer */ + /* + * If the adjacent data is not even in core it + * can't need to be written. + */ + BO_LOCK(bo); + if ((tbp = gbincore(bo, start_lbn)) == NULL || + (tbp->b_vflags & BV_BKGRDINPROG)) { + BO_UNLOCK(bo); + break; + } + + /* + * If it IS in core, but has different + * characteristics, or is locked (which + * means it could be undergoing a background + * I/O or be in a weird state), then don't + * cluster with it. + */ + if (BUF_LOCK(tbp, + LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, + BO_LOCKPTR(bo))) + break; + + if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | + B_INVAL | B_DELWRI | B_NEEDCOMMIT)) + != (B_DELWRI | B_CLUSTEROK | + (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || + tbp->b_wcred != bp->b_wcred) { + BUF_UNLOCK(tbp); + break; + } + + /* + * Check that the combined cluster + * would make sense with regard to pages + * and would not be too large + */ + if ((tbp->b_bcount != size) || + ((bp->b_blkno + (dbsize * i)) != + tbp->b_blkno) || + ((tbp->b_npages + bp->b_npages) > + (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { + BUF_UNLOCK(tbp); + break; + } + + /* + * Ok, it's passed all the tests, + * so remove it from the free list + * and mark it busy. We will use it. 
+ */ + bremfree(tbp); + tbp->b_flags &= ~B_DONE; + } /* end of code for non-first buffers only */ + /* + * If the IO is via the VM then we do some + * special VM hackery (yuck). Since the buffer's + * block size may not be page-aligned it is possible + * for a page to be shared between two buffers. We + * have to get rid of the duplication when building + * the cluster. + */ + if (tbp->b_flags & B_VMIO) { + vm_page_t m; + + VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); + if (i == 0) { + vfs_drain_busy_pages(tbp); + } else { /* if not first buffer */ + for (j = 0; j < tbp->b_npages; j += 1) { + m = tbp->b_pages[j]; + if (vm_page_xbusied(m)) { + VM_OBJECT_WUNLOCK( + tbp->b_object); + bqrelse(tbp); + goto finishcluster; + } + } + } + for (j = 0; j < tbp->b_npages; j += 1) { + m = tbp->b_pages[j]; + vm_page_sbusy(m); + vm_object_pip_add(m->object, 1); + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages - 1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + } + VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); + } + bp->b_bcount += size; + bp->b_bufsize += size; + /* + * If any of the clustered buffers have their + * B_BARRIER flag set, transfer that request to + * the cluster. + */ + bp->b_flags |= (tbp->b_flags & B_BARRIER); + tbp->b_flags &= ~(B_DONE | B_BARRIER); + tbp->b_flags |= B_ASYNC; + tbp->b_ioflags &= ~BIO_ERROR; + tbp->b_iocmd = BIO_WRITE; + bundirty(tbp); + reassignbuf(tbp); /* put on clean list */ + bufobj_wref(tbp->b_bufobj); + BUF_KERNPROC(tbp); + buf_track(tbp, __func__); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + } + finishcluster: + if (buf_mapped(bp)) { + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + } + if (bp->b_bufsize > bp->b_kvasize) + panic( + "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + totalwritten += bp->b_bufsize; + bp->b_dirtyoff = 0; + bp->b_dirtyend = bp->b_bufsize; + bawrite(bp); + + len -= i; + } + return totalwritten; +} + +/* + * Collect together all the buffers in a cluster. + * Plus add one additional buffer. + */ +static struct cluster_save * +cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags) +{ + struct cluster_save *buflist; + struct buf *bp; + daddr_t lbn; + int i, j, len, error; + + len = vp->v_lastw - vp->v_cstart + 1; + buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), + M_SEGMENT, M_WAITOK); + buflist->bs_nchildren = 0; + buflist->bs_children = (struct buf **) (buflist + 1); + for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { + error = bread_gb(vp, lbn, last_bp->b_bcount, NOCRED, + gbflags, &bp); + if (error != 0) { + /* + * If read fails, release collected buffers + * and return failure. + */ + for (j = 0; j < i; j++) + brelse(buflist->bs_children[j]); + free(buflist, M_SEGMENT); + return (NULL); + } + buflist->bs_children[i] = bp; + if (bp->b_blkno == bp->b_lblkno) + VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL, NULL); + } + buflist->bs_children[i] = bp = last_bp; + if (bp->b_blkno == bp->b_lblkno) + VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); + buflist->bs_nchildren = i + 1; + return (buflist); +} diff --git a/freebsd/sys/kern/vfs_default.c b/freebsd/sys/kern/vfs_default.c new file mode 100644 index 00000000..40041c9d --- /dev/null +++ b/freebsd/sys/kern/vfs_default.c @@ -0,0 +1,1286 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993 + * The Regents of the University of California. 
All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +static int vop_nolookup(struct vop_lookup_args *); +static int vop_norename(struct vop_rename_args *); +static int vop_nostrategy(struct vop_strategy_args *); +static int get_next_dirent(struct vnode *vp, struct dirent **dpp, + char *dirbuf, int dirbuflen, off_t *off, + char **cpos, int *len, int *eofflag, + struct thread *td); +static int dirent_exists(struct vnode *vp, const char *dirname, + struct thread *td); + +#define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4) + +static int vop_stdis_text(struct vop_is_text_args *ap); +static int vop_stdunset_text(struct vop_unset_text_args *ap); +static int vop_stdadd_writecount(struct vop_add_writecount_args *ap); +static int vop_stdfdatasync(struct vop_fdatasync_args *ap); +static int vop_stdgetpages_async(struct vop_getpages_async_args *ap); + +/* + * This vnode table stores what we want to do if the filesystem doesn't + * implement a particular VOP. + * + * If there is no specific entry here, we will return EOPNOTSUPP. + * + * Note that every filesystem has to implement either vop_access + * or vop_accessx; failing to do so will result in immediate crash + * due to stack overflow, as vop_stdaccess() calls vop_stdaccessx(), + * which calls vop_stdaccess() etc. 
+ */ + +struct vop_vector default_vnodeops = { + .vop_default = NULL, + .vop_bypass = VOP_EOPNOTSUPP, + + .vop_access = vop_stdaccess, + .vop_accessx = vop_stdaccessx, + .vop_advise = vop_stdadvise, + .vop_advlock = vop_stdadvlock, + .vop_advlockasync = vop_stdadvlockasync, + .vop_advlockpurge = vop_stdadvlockpurge, + .vop_allocate = vop_stdallocate, + .vop_bmap = vop_stdbmap, + .vop_close = VOP_NULL, + .vop_fsync = VOP_NULL, + .vop_fdatasync = vop_stdfdatasync, + .vop_getpages = vop_stdgetpages, + .vop_getpages_async = vop_stdgetpages_async, + .vop_getwritemount = vop_stdgetwritemount, + .vop_inactive = VOP_NULL, + .vop_ioctl = VOP_ENOTTY, + .vop_kqfilter = vop_stdkqfilter, + .vop_islocked = vop_stdislocked, + .vop_lock1 = vop_stdlock, + .vop_lookup = vop_nolookup, + .vop_open = VOP_NULL, + .vop_pathconf = VOP_EINVAL, + .vop_poll = vop_nopoll, + .vop_putpages = vop_stdputpages, + .vop_readlink = VOP_EINVAL, + .vop_rename = vop_norename, + .vop_revoke = VOP_PANIC, + .vop_strategy = vop_nostrategy, + .vop_unlock = vop_stdunlock, + .vop_vptocnp = vop_stdvptocnp, + .vop_vptofh = vop_stdvptofh, + .vop_unp_bind = vop_stdunp_bind, + .vop_unp_connect = vop_stdunp_connect, + .vop_unp_detach = vop_stdunp_detach, + .vop_is_text = vop_stdis_text, + .vop_set_text = vop_stdset_text, + .vop_unset_text = vop_stdunset_text, + .vop_add_writecount = vop_stdadd_writecount, +}; + +/* + * Series of placeholder functions for various error returns for + * VOPs. + */ + +int +vop_eopnotsupp(struct vop_generic_args *ap) +{ + /* + printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); + */ + + return (EOPNOTSUPP); +} + +int +vop_ebadf(struct vop_generic_args *ap) +{ + + return (EBADF); +} + +int +vop_enotty(struct vop_generic_args *ap) +{ + + return (ENOTTY); +} + +int +vop_einval(struct vop_generic_args *ap) +{ + + return (EINVAL); +} + +int +vop_enoent(struct vop_generic_args *ap) +{ + + return (ENOENT); +} + +int +vop_null(struct vop_generic_args *ap) +{ + + return (0); +} + +/* + * Helper function to panic on some bad VOPs in some filesystems. + */ +int +vop_panic(struct vop_generic_args *ap) +{ + + panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name); +} + +/* + * vop_std and vop_no are default functions for use by + * filesystems that need the "default reasonable" implementation for a + * particular operation. + * + * The documentation for the operations they implement exists (if it exists) + * in the VOP_(9) manpage (all uppercase). + */ + +/* + * Default vop for filesystems that do not support name lookup + */ +static int +vop_nolookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + + *ap->a_vpp = NULL; + return (ENOTDIR); +} + +/* + * vop_norename: + * + * Handle unlock and reference counting for arguments of vop_rename + * for filesystems that do not implement rename operation. + */ +static int +vop_norename(struct vop_rename_args *ap) +{ + + vop_rename_fail(ap); + return (EOPNOTSUPP); +} + +/* + * vop_nostrategy: + * + * Strategy routine for VFS devices that have none. + * + * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy + * routine. Typically this is done for a BIO_READ strategy call. + * Typically B_INVAL is assumed to already be clear prior to a write + * and should not be cleared manually unless you just made the buffer + * invalid. BIO_ERROR should be cleared either way. 
+ */ + +static int +vop_nostrategy (struct vop_strategy_args *ap) +{ + printf("No strategy for buffer at %p\n", ap->a_bp); + vn_printf(ap->a_vp, "vnode "); + ap->a_bp->b_ioflags |= BIO_ERROR; + ap->a_bp->b_error = EOPNOTSUPP; + bufdone(ap->a_bp); + return (EOPNOTSUPP); +} + +static int +get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf, + int dirbuflen, off_t *off, char **cpos, int *len, + int *eofflag, struct thread *td) +{ + int error, reclen; + struct uio uio; + struct iovec iov; + struct dirent *dp; + + KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp)); + KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp)); + + if (*len == 0) { + iov.iov_base = dirbuf; + iov.iov_len = dirbuflen; + + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = *off; + uio.uio_resid = dirbuflen; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_rw = UIO_READ; + uio.uio_td = td; + + *eofflag = 0; + +#ifdef MAC + error = mac_vnode_check_readdir(td->td_ucred, vp); + if (error == 0) +#endif + error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag, + NULL, NULL); + if (error) + return (error); + + *off = uio.uio_offset; + + *cpos = dirbuf; + *len = (dirbuflen - uio.uio_resid); + + if (*len == 0) + return (ENOENT); + } + + dp = (struct dirent *)(*cpos); + reclen = dp->d_reclen; + *dpp = dp; + + /* check for malformed directory.. */ + if (reclen < DIRENT_MINSIZE) + return (EINVAL); + + *cpos += reclen; + *len -= reclen; + + return (0); +} + +/* + * Check if a named file exists in a given directory vnode. + */ +static int +dirent_exists(struct vnode *vp, const char *dirname, struct thread *td) +{ + char *dirbuf, *cpos; + int error, eofflag, dirbuflen, len, found; + off_t off; + struct dirent *dp; + struct vattr va; + + KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp)); + KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp)); + + found = 0; + + error = VOP_GETATTR(vp, &va, td->td_ucred); + if (error) + return (found); + + dirbuflen = DEV_BSIZE; + if (dirbuflen < va.va_blocksize) + dirbuflen = va.va_blocksize; + dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); + + off = 0; + len = 0; + do { + error = get_next_dirent(vp, &dp, dirbuf, dirbuflen, &off, + &cpos, &len, &eofflag, td); + if (error) + goto out; + + if (dp->d_type != DT_WHT && dp->d_fileno != 0 && + strcmp(dp->d_name, dirname) == 0) { + found = 1; + goto out; + } + } while (len > 0 || !eofflag); + +out: + free(dirbuf, M_TEMP); + return (found); +} + +int +vop_stdaccess(struct vop_access_args *ap) +{ + + KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | + VAPPEND)) == 0, ("invalid bit in accmode")); + + return (VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred, ap->a_td)); +} + +int +vop_stdaccessx(struct vop_accessx_args *ap) +{ + int error; + accmode_t accmode = ap->a_accmode; + + error = vfs_unixify_accmode(&accmode); + if (error != 0) + return (error); + + if (accmode == 0) + return (0); + + return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td)); +} + +/* + * Advisory record locking support + */ +int +vop_stdadvlock(struct vop_advlock_args *ap) +{ + struct vnode *vp; + struct vattr vattr; + int error; + + vp = ap->a_vp; + if (ap->a_fl->l_whence == SEEK_END) { + /* + * The NFSv4 server must avoid doing a vn_lock() here, since it + * can deadlock the nfsd threads, due to a LOR. Fortunately + * the NFSv4 server always uses SEEK_SET and this code is + * only required for the SEEK_END case. 
+ */ + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &vattr, curthread->td_ucred); + VOP_UNLOCK(vp, 0); + if (error) + return (error); + } else + vattr.va_size = 0; + + return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size)); +} + +int +vop_stdadvlockasync(struct vop_advlockasync_args *ap) +{ + struct vnode *vp; + struct vattr vattr; + int error; + + vp = ap->a_vp; + if (ap->a_fl->l_whence == SEEK_END) { + /* The size argument is only needed for SEEK_END. */ + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &vattr, curthread->td_ucred); + VOP_UNLOCK(vp, 0); + if (error) + return (error); + } else + vattr.va_size = 0; + + return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size)); +} + +int +vop_stdadvlockpurge(struct vop_advlockpurge_args *ap) +{ + struct vnode *vp; + + vp = ap->a_vp; + lf_purgelocks(vp, &vp->v_lockf); + return (0); +} + +/* + * vop_stdpathconf: + * + * Standard implementation of POSIX pathconf, to get information about limits + * for a filesystem. + * Override per filesystem for the case where the filesystem has smaller + * limits. + */ +int +vop_stdpathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_ASYNC_IO: + *ap->a_retval = _POSIX_ASYNCHRONOUS_IO; + return (0); + case _PC_PATH_MAX: + *ap->a_retval = PATH_MAX; + return (0); + case _PC_ACL_EXTENDED: + case _PC_ACL_NFS4: + case _PC_CAP_PRESENT: + case _PC_INF_PRESENT: + case _PC_MAC_PRESENT: + *ap->a_retval = 0; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Standard lock, unlock and islocked functions. + */ +int +vop_stdlock(ap) + struct vop_lock1_args /* { + struct vnode *a_vp; + int a_flags; + char *file; + int line; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct mtx *ilk; + + ilk = VI_MTX(vp); + return (lockmgr_lock_fast_path(vp->v_vnlock, ap->a_flags, + &ilk->lock_object, ap->a_file, ap->a_line)); +} + +/* See above. */ +int +vop_stdunlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + int a_flags; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct mtx *ilk; + + ilk = VI_MTX(vp); + return (lockmgr_unlock_fast_path(vp->v_vnlock, ap->a_flags, + &ilk->lock_object)); +} + +/* See above. */ +int +vop_stdislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (lockstatus(ap->a_vp->v_vnlock)); +} + +/* + * Return true for select/poll. + */ +int +vop_nopoll(ap) + struct vop_poll_args /* { + struct vnode *a_vp; + int a_events; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + + return (poll_no_poll(ap->a_events)); +} + +/* + * Implement poll for local filesystems that support it. + */ +int +vop_stdpoll(ap) + struct vop_poll_args /* { + struct vnode *a_vp; + int a_events; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + if (ap->a_events & ~POLLSTANDARD) + return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events)); + return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +/* + * Return our mount point, as we will take charge of the writes. + */ +int +vop_stdgetwritemount(ap) + struct vop_getwritemount_args /* { + struct vnode *a_vp; + struct mount **a_mpp; + } */ *ap; +{ + struct mount *mp; + + /* + * XXX Since this is called unlocked we may be recycled while + * attempting to ref the mount. If this is the case or mountpoint + * will be set to NULL. We only have to prevent this call from + * returning with a ref to an incorrect mountpoint. 
It is not + * harmful to return with a ref to our previous mountpoint. + */ + mp = ap->a_vp->v_mount; + if (mp != NULL) { + vfs_ref(mp); + if (mp != ap->a_vp->v_mount) { + vfs_rel(mp); + mp = NULL; + } + } + *(ap->a_mpp) = mp; + return (0); +} + +/* + * If the file system doesn't implement VOP_BMAP, then return sensible defaults: + * - Return the vnode's bufobj instead of any underlying device's bufobj + * - Calculate the physical block number as if there were equal size + * consecutive blocks, but + * - Report no contiguous runs of blocks. + */ +int +vop_stdbmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct bufobj **a_bop; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; + } */ *ap; +{ + + if (ap->a_bop != NULL) + *ap->a_bop = &ap->a_vp->v_bufobj; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize); + if (ap->a_runp != NULL) + *ap->a_runp = 0; + if (ap->a_runb != NULL) + *ap->a_runb = 0; + return (0); +} + +int +vop_stdfsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + int a_waitfor; + struct thread *a_td; + } */ *ap; +{ + + return (vn_fsync_buf(ap->a_vp, ap->a_waitfor)); +} + +static int +vop_stdfdatasync(struct vop_fdatasync_args *ap) +{ + + return (VOP_FSYNC(ap->a_vp, MNT_WAIT, ap->a_td)); +} + +int +vop_stdfdatasync_buf(struct vop_fdatasync_args *ap) +{ + + return (vn_fsync_buf(ap->a_vp, MNT_WAIT)); +} + +/* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */ +int +vop_stdgetpages(ap) + struct vop_getpages_args /* { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int *a_rbehind; + int *a_rahead; + } */ *ap; +{ + + return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, + ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL); +} + +static int +vop_stdgetpages_async(struct vop_getpages_async_args *ap) +{ + int error; + + error = VOP_GETPAGES(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, + ap->a_rahead); + ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); + return (error); +} + +int +vop_stdkqfilter(struct vop_kqfilter_args *ap) +{ + return vfs_kqfilter(ap); +} + +/* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). 
*/ +int +vop_stdputpages(ap) + struct vop_putpages_args /* { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int a_sync; + int *a_rtvals; + } */ *ap; +{ + + return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, + ap->a_sync, ap->a_rtvals); +} + +int +vop_stdvptofh(struct vop_vptofh_args *ap) +{ + return (EOPNOTSUPP); +} + +int +vop_stdvptocnp(struct vop_vptocnp_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct vnode **dvp = ap->a_vpp; + struct ucred *cred = ap->a_cred; + char *buf = ap->a_buf; + int *buflen = ap->a_buflen; + char *dirbuf, *cpos; + int i, error, eofflag, dirbuflen, flags, locked, len, covered; + off_t off; + ino_t fileno; + struct vattr va; + struct nameidata nd; + struct thread *td; + struct dirent *dp; + struct vnode *mvp; + + i = *buflen; + error = 0; + covered = 0; + td = curthread; + + if (vp->v_type != VDIR) + return (ENOENT); + + error = VOP_GETATTR(vp, &va, cred); + if (error) + return (error); + + VREF(vp); + locked = VOP_ISLOCKED(vp); + VOP_UNLOCK(vp, 0); + NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, + "..", vp, td); + flags = FREAD; + error = vn_open_cred(&nd, &flags, 0, VN_OPEN_NOAUDIT, cred, NULL); + if (error) { + vn_lock(vp, locked | LK_RETRY); + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + + mvp = *dvp = nd.ni_vp; + + if (vp->v_mount != (*dvp)->v_mount && + ((*dvp)->v_vflag & VV_ROOT) && + ((*dvp)->v_mount->mnt_flag & MNT_UNION)) { + *dvp = (*dvp)->v_mount->mnt_vnodecovered; + VREF(mvp); + VOP_UNLOCK(mvp, 0); + vn_close(mvp, FREAD, cred, td); + VREF(*dvp); + vn_lock(*dvp, LK_SHARED | LK_RETRY); + covered = 1; + } + + fileno = va.va_fileid; + + dirbuflen = DEV_BSIZE; + if (dirbuflen < va.va_blocksize) + dirbuflen = va.va_blocksize; + dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); + + if ((*dvp)->v_type != VDIR) { + error = ENOENT; + goto out; + } + + off = 0; + len = 0; + do { + /* call VOP_READDIR of parent */ + error = get_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off, + &cpos, &len, &eofflag, td); + if (error) + goto out; + + if ((dp->d_type != DT_WHT) && + (dp->d_fileno == fileno)) { + if (covered) { + VOP_UNLOCK(*dvp, 0); + vn_lock(mvp, LK_SHARED | LK_RETRY); + if (dirent_exists(mvp, dp->d_name, td)) { + error = ENOENT; + VOP_UNLOCK(mvp, 0); + vn_lock(*dvp, LK_SHARED | LK_RETRY); + goto out; + } + VOP_UNLOCK(mvp, 0); + vn_lock(*dvp, LK_SHARED | LK_RETRY); + } + i -= dp->d_namlen; + + if (i < 0) { + error = ENOMEM; + goto out; + } + if (dp->d_namlen == 1 && dp->d_name[0] == '.') { + error = ENOENT; + } else { + bcopy(dp->d_name, buf + i, dp->d_namlen); + error = 0; + } + goto out; + } + } while (len > 0 || !eofflag); + error = ENOENT; + +out: + free(dirbuf, M_TEMP); + if (!error) { + *buflen = i; + vref(*dvp); + } + if (covered) { + vput(*dvp); + vrele(mvp); + } else { + VOP_UNLOCK(mvp, 0); + vn_close(mvp, FREAD, cred, td); + } + vn_lock(vp, locked | LK_RETRY); + return (error); +} + +int +vop_stdallocate(struct vop_allocate_args *ap) +{ +#ifdef __notyet__ + struct statfs *sfs; + off_t maxfilesize = 0; +#endif + struct iovec aiov; + struct vattr vattr, *vap; + struct uio auio; + off_t fsize, len, cur, offset; + uint8_t *buf; + struct thread *td; + struct vnode *vp; + size_t iosize; + int error; + + buf = NULL; + error = 0; + td = curthread; + vap = &vattr; + vp = ap->a_vp; + len = *ap->a_len; + offset = *ap->a_offset; + + error = VOP_GETATTR(vp, vap, td->td_ucred); + if (error != 0) + goto out; + fsize = vap->va_size; + iosize = vap->va_blocksize; + if (iosize == 0) + iosize = 
BLKDEV_IOSIZE; + if (iosize > MAXPHYS) + iosize = MAXPHYS; + buf = malloc(iosize, M_TEMP, M_WAITOK); + +#ifdef __notyet__ + /* + * Check if the filesystem sets f_maxfilesize; if not use + * VOP_SETATTR to perform the check. + */ + sfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = VFS_STATFS(vp->v_mount, sfs, td); + if (error == 0) + maxfilesize = sfs->f_maxfilesize; + free(sfs, M_STATFS); + if (error != 0) + goto out; + if (maxfilesize) { + if (offset > maxfilesize || len > maxfilesize || + offset + len > maxfilesize) { + error = EFBIG; + goto out; + } + } else +#endif + if (offset + len > vap->va_size) { + /* + * Test offset + len against the filesystem's maxfilesize. + */ + VATTR_NULL(vap); + vap->va_size = offset + len; + error = VOP_SETATTR(vp, vap, td->td_ucred); + if (error != 0) + goto out; + VATTR_NULL(vap); + vap->va_size = fsize; + error = VOP_SETATTR(vp, vap, td->td_ucred); + if (error != 0) + goto out; + } + + for (;;) { + /* + * Read and write back anything below the nominal file + * size. There's currently no way outside the filesystem + * to know whether this area is sparse or not. + */ + cur = iosize; + if ((offset % iosize) != 0) + cur -= (offset % iosize); + if (cur > len) + cur = len; + if (offset < fsize) { + aiov.iov_base = buf; + aiov.iov_len = cur; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = offset; + auio.uio_resid = cur; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_td = td; + error = VOP_READ(vp, &auio, 0, td->td_ucred); + if (error != 0) + break; + if (auio.uio_resid > 0) { + bzero(buf + cur - auio.uio_resid, + auio.uio_resid); + } + } else { + bzero(buf, cur); + } + + aiov.iov_base = buf; + aiov.iov_len = cur; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = offset; + auio.uio_resid = cur; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_td = td; + + error = VOP_WRITE(vp, &auio, 0, td->td_ucred); + if (error != 0) + break; + + len -= cur; + offset += cur; + if (len == 0) + break; + if (should_yield()) + break; + } + + out: + *ap->a_len = len; + *ap->a_offset = offset; + free(buf, M_TEMP); + return (error); +} + +int +vop_stdadvise(struct vop_advise_args *ap) +{ + struct vnode *vp; + struct bufobj *bo; + daddr_t startn, endn; + off_t start, end; + int bsize, error; + + vp = ap->a_vp; + switch (ap->a_advice) { + case POSIX_FADV_WILLNEED: + /* + * Do nothing for now. Filesystems should provide a + * custom method which starts an asynchronous read of + * the requested region. + */ + error = 0; + break; + case POSIX_FADV_DONTNEED: + error = 0; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (vp->v_iflag & VI_DOOMED) { + VOP_UNLOCK(vp, 0); + break; + } + + /* + * Deactivate pages in the specified range from the backing VM + * object. Pages that are resident in the buffer cache will + * remain wired until their corresponding buffers are released + * below. 
+ */ + if (vp->v_object != NULL) { + start = trunc_page(ap->a_start); + end = round_page(ap->a_end); + VM_OBJECT_RLOCK(vp->v_object); + vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start), + OFF_TO_IDX(end)); + VM_OBJECT_RUNLOCK(vp->v_object); + } + + bo = &vp->v_bufobj; + BO_RLOCK(bo); + bsize = vp->v_bufobj.bo_bsize; + startn = ap->a_start / bsize; + endn = ap->a_end / bsize; + error = bnoreuselist(&bo->bo_clean, bo, startn, endn); + if (error == 0) + error = bnoreuselist(&bo->bo_dirty, bo, startn, endn); + BO_RUNLOCK(bo); + VOP_UNLOCK(vp, 0); + break; + default: + error = EINVAL; + break; + } + return (error); +} + +int +vop_stdunp_bind(struct vop_unp_bind_args *ap) +{ + + ap->a_vp->v_unpcb = ap->a_unpcb; + return (0); +} + +int +vop_stdunp_connect(struct vop_unp_connect_args *ap) +{ + + *ap->a_unpcb = ap->a_vp->v_unpcb; + return (0); +} + +int +vop_stdunp_detach(struct vop_unp_detach_args *ap) +{ + + ap->a_vp->v_unpcb = NULL; + return (0); +} + +static int +vop_stdis_text(struct vop_is_text_args *ap) +{ + + return (ap->a_vp->v_writecount < 0); +} + +int +vop_stdset_text(struct vop_set_text_args *ap) +{ + struct vnode *vp; + struct mount *mp; + int error; + + vp = ap->a_vp; + VI_LOCK(vp); + if (vp->v_writecount > 0) { + error = ETXTBSY; + } else { + /* + * If requested by fs, keep a use reference to the + * vnode until the last text reference is released. + */ + mp = vp->v_mount; + if (mp != NULL && (mp->mnt_kern_flag & MNTK_TEXT_REFS) != 0 && + vp->v_writecount == 0) { + vp->v_iflag |= VI_TEXT_REF; + vrefl(vp); + } + + vp->v_writecount--; + error = 0; + } + VI_UNLOCK(vp); + return (error); +} + +static int +vop_stdunset_text(struct vop_unset_text_args *ap) +{ + struct vnode *vp; + int error; + bool last; + + vp = ap->a_vp; + last = false; + VI_LOCK(vp); + if (vp->v_writecount < 0) { + if ((vp->v_iflag & VI_TEXT_REF) != 0 && + vp->v_writecount == -1) { + last = true; + vp->v_iflag &= ~VI_TEXT_REF; + } + vp->v_writecount++; + error = 0; + } else { + error = EINVAL; + } + VI_UNLOCK(vp); + if (last) + vunref(vp); + return (error); +} + +static int +vop_stdadd_writecount(struct vop_add_writecount_args *ap) +{ + struct vnode *vp; + int error; + + vp = ap->a_vp; + VI_LOCK_FLAGS(vp, MTX_DUPOK); + if (vp->v_writecount < 0) { + error = ETXTBSY; + } else { + VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp, + ("neg writecount increment %d", ap->a_inc)); + vp->v_writecount += ap->a_inc; + error = 0; + } + VI_UNLOCK(vp); + return (error); +} + +/* + * vfs default ops + * used to fill the vfs function table to get reasonable default return values. + */ +int +vfs_stdroot (mp, flags, vpp) + struct mount *mp; + int flags; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +vfs_stdstatfs (mp, sbp) + struct mount *mp; + struct statfs *sbp; +{ + + return (EOPNOTSUPP); +} + +int +vfs_stdquotactl (mp, cmds, uid, arg) + struct mount *mp; + int cmds; + uid_t uid; + void *arg; +{ + + return (EOPNOTSUPP); +} + +int +vfs_stdsync(mp, waitfor) + struct mount *mp; + int waitfor; +{ + struct vnode *vp, *mvp; + struct thread *td; + int error, lockreq, allerror = 0; + + td = curthread; + lockreq = LK_EXCLUSIVE | LK_INTERLOCK; + if (waitfor != MNT_WAIT) + lockreq |= LK_NOWAIT; + /* + * Force stale buffer cache information to be flushed. 
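+ *
+ * Only vnodes that have dirty buffers are visited. With MNT_WAIT
+ * the vget() below blocks for each vnode lock; otherwise LK_NOWAIT
+ * (set above) skips vnodes that are currently busy. The most
+ * recent VOP_FSYNC() error, if any, is what gets returned.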
+ */ +loop: + MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { + if (vp->v_bufobj.bo_dirty.bv_cnt == 0) { + VI_UNLOCK(vp); + continue; + } + if ((error = vget(vp, lockreq, td)) != 0) { + if (error == ENOENT) { + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + goto loop; + } + continue; + } + error = VOP_FSYNC(vp, waitfor, td); + if (error) + allerror = error; + vput(vp); + } + return (allerror); +} + +int +vfs_stdnosync (mp, waitfor) + struct mount *mp; + int waitfor; +{ + + return (0); +} + +int +vfs_stdvget (mp, ino, flags, vpp) + struct mount *mp; + ino_t ino; + int flags; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +vfs_stdfhtovp (mp, fhp, flags, vpp) + struct mount *mp; + struct fid *fhp; + int flags; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +vfs_stdinit (vfsp) + struct vfsconf *vfsp; +{ + + return (0); +} + +int +vfs_stduninit (vfsp) + struct vfsconf *vfsp; +{ + + return(0); +} + +int +vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname) + struct mount *mp; + int cmd; + struct vnode *filename_vp; + int attrnamespace; + const char *attrname; +{ + + if (filename_vp != NULL) + VOP_UNLOCK(filename_vp, 0); + return (EOPNOTSUPP); +} + +int +vfs_stdsysctl(mp, op, req) + struct mount *mp; + fsctlop_t op; + struct sysctl_req *req; +{ + + return (EOPNOTSUPP); +} + +/* end of vfs default ops */ diff --git a/freebsd/sys/kern/vfs_export.c b/freebsd/sys/kern/vfs_export.c new file mode 100644 index 00000000..669d4e9f --- /dev/null +++ b/freebsd/sys/kern/vfs_export.c @@ -0,0 +1,528 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static MALLOC_DEFINE(M_NETADDR, "export_host", "Export host address structure"); + +#if defined(INET) || defined(INET6) +static struct radix_node_head *vfs_create_addrlist_af( + struct radix_node_head **prnh, int off); +#endif +static void vfs_free_addrlist(struct netexport *nep); +static int vfs_free_netcred(struct radix_node *rn, void *w); +static void vfs_free_addrlist_af(struct radix_node_head **prnh); +static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep, + struct export_args *argp); +static struct netcred *vfs_export_lookup(struct mount *, struct sockaddr *); + +/* + * Network address lookup element + */ +struct netcred { + struct radix_node netc_rnodes[2]; + int netc_exflags; + struct ucred *netc_anon; + int netc_numsecflavors; + int netc_secflavors[MAXSECFLAVORS]; +}; + +/* + * Network export information + */ +struct netexport { + struct netcred ne_defexported; /* Default export */ + struct radix_node_head *ne4; + struct radix_node_head *ne6; +}; + +/* + * Build hash lists of net addresses and hang them off the mount point. + * Called by vfs_export() to set up the lists of export addresses. + */ +static int +vfs_hang_addrlist(struct mount *mp, struct netexport *nep, + struct export_args *argp) +{ + struct netcred *np; + struct radix_node_head *rnh; + int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = NULL; +#if defined(INET6) || defined(INET) + int off; +#endif + int error; + + /* + * XXX: This routine converts from a `struct xucred' + * (argp->ex_anon) to a `struct ucred' (np->netc_anon). This + * operation is questionable; for example, what should be done + * with fields like cr_uidinfo and cr_prison? Currently, this + * routine does not touch them (leaves them as NULL). 
+ */ + if (argp->ex_anon.cr_version != XUCRED_VERSION) { + vfs_mount_error(mp, "ex_anon.cr_version: %d != %d", + argp->ex_anon.cr_version, XUCRED_VERSION); + return (EINVAL); + } + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) { + vfs_mount_error(mp, + "MNT_DEFEXPORTED already set for mount %p", mp); + return (EPERM); + } + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + np->netc_anon = crget(); + np->netc_anon->cr_uid = argp->ex_anon.cr_uid; + crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups, + argp->ex_anon.cr_groups); + np->netc_anon->cr_prison = &prison0; + prison_hold(np->netc_anon->cr_prison); + np->netc_numsecflavors = argp->ex_numsecflavors; + bcopy(argp->ex_secflavors, np->netc_secflavors, + sizeof(np->netc_secflavors)); + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_DEFEXPORTED; + MNT_IUNLOCK(mp); + return (0); + } + +#if MSIZE <= 256 + if (argp->ex_addrlen > MLEN) { + vfs_mount_error(mp, "ex_addrlen %d is greater than %d", + argp->ex_addrlen, MLEN); + return (EINVAL); + } +#endif + + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); + saddr = (struct sockaddr *) (np + 1); + if ((error = copyin(argp->ex_addr, saddr, argp->ex_addrlen))) + goto out; + if (saddr->sa_family == AF_UNSPEC || saddr->sa_family > AF_MAX) { + error = EINVAL; + vfs_mount_error(mp, "Invalid saddr->sa_family: %d"); + goto out; + } + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); + error = copyin(argp->ex_mask, smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + rnh = NULL; + switch (saddr->sa_family) { +#ifdef INET + case AF_INET: + if ((rnh = nep->ne4) == NULL) { + off = offsetof(struct sockaddr_in, sin_addr) << 3; + rnh = vfs_create_addrlist_af(&nep->ne4, off); + } + break; +#endif +#ifdef INET6 + case AF_INET6: + if ((rnh = nep->ne6) == NULL) { + off = offsetof(struct sockaddr_in6, sin6_addr) << 3; + rnh = vfs_create_addrlist_af(&nep->ne6, off); + } + break; +#endif + } + if (rnh == NULL) { + error = ENOBUFS; + vfs_mount_error(mp, "%s %s %d", + "Unable to initialize radix node head ", + "for address family", saddr->sa_family); + goto out; + } + RADIX_NODE_HEAD_LOCK(rnh); + rn = (*rnh->rnh_addaddr)(saddr, smask, &rnh->rh, np->netc_rnodes); + RADIX_NODE_HEAD_UNLOCK(rnh); + if (rn == NULL || np != (struct netcred *)rn) { /* already exists */ + error = EPERM; + vfs_mount_error(mp, + "netcred already exists for given addr/mask"); + goto out; + } + np->netc_exflags = argp->ex_flags; + np->netc_anon = crget(); + np->netc_anon->cr_uid = argp->ex_anon.cr_uid; + crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups, + argp->ex_anon.cr_groups); + np->netc_anon->cr_prison = &prison0; + prison_hold(np->netc_anon->cr_prison); + np->netc_numsecflavors = argp->ex_numsecflavors; + bcopy(argp->ex_secflavors, np->netc_secflavors, + sizeof(np->netc_secflavors)); + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* Helper for vfs_free_addrlist. 
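+ * It is invoked through (*rnh->rnh_walktree)() in
+ * vfs_free_addrlist_af(): each radix node is deleted from the tree,
+ * its anonymous credential (if any) is dropped with crfree(), and
+ * the enclosing netcred is freed.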
*/ +/* ARGSUSED */ +static int +vfs_free_netcred(struct radix_node *rn, void *w) +{ + struct radix_node_head *rnh = (struct radix_node_head *) w; + struct ucred *cred; + + (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, &rnh->rh); + cred = ((struct netcred *)rn)->netc_anon; + if (cred != NULL) + crfree(cred); + free(rn, M_NETADDR); + return (0); +} + +#if defined(INET) || defined(INET6) +static struct radix_node_head * +vfs_create_addrlist_af(struct radix_node_head **prnh, int off) +{ + + if (rn_inithead((void **)prnh, off) == 0) + return (NULL); + RADIX_NODE_HEAD_LOCK_INIT(*prnh); + return (*prnh); +} +#endif + +static void +vfs_free_addrlist_af(struct radix_node_head **prnh) +{ + struct radix_node_head *rnh; + + rnh = *prnh; + RADIX_NODE_HEAD_LOCK(rnh); + (*rnh->rnh_walktree)(&rnh->rh, vfs_free_netcred, rnh); + RADIX_NODE_HEAD_UNLOCK(rnh); + RADIX_NODE_HEAD_DESTROY(rnh); + rn_detachhead((void **)prnh); + prnh = NULL; +} + +/* + * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(struct netexport *nep) +{ + struct ucred *cred; + + if (nep->ne4 != NULL) + vfs_free_addrlist_af(&nep->ne4); + if (nep->ne6 != NULL) + vfs_free_addrlist_af(&nep->ne6); + + cred = nep->ne_defexported.netc_anon; + if (cred != NULL) + crfree(cred); + +} + +/* + * High level function to manipulate export options on a mount point + * and the passed in netexport. + * Struct export_args *argp is the variable used to twiddle options, + * the structure is described in sys/mount.h + */ +int +vfs_export(struct mount *mp, struct export_args *argp) +{ + struct netexport *nep; + int error; + + if (argp->ex_numsecflavors < 0 + || argp->ex_numsecflavors >= MAXSECFLAVORS) + return (EINVAL); + + error = 0; + lockmgr(&mp->mnt_explock, LK_EXCLUSIVE, NULL); + nep = mp->mnt_export; + if (argp->ex_flags & MNT_DELEXPORT) { + if (nep == NULL) { + error = ENOENT; + goto out; + } + if (mp->mnt_flag & MNT_EXPUBLIC) { + vfs_setpublicfs(NULL, NULL, NULL); + MNT_ILOCK(mp); + mp->mnt_flag &= ~MNT_EXPUBLIC; + MNT_IUNLOCK(mp); + } + vfs_free_addrlist(nep); + mp->mnt_export = NULL; + free(nep, M_MOUNT); + nep = NULL; + MNT_ILOCK(mp); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + MNT_IUNLOCK(mp); + } + if (argp->ex_flags & MNT_EXPORTED) { + if (nep == NULL) { + nep = malloc(sizeof(struct netexport), M_MOUNT, M_WAITOK | M_ZERO); + mp->mnt_export = nep; + } + if (argp->ex_flags & MNT_EXPUBLIC) { + if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) + goto out; + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_EXPUBLIC; + MNT_IUNLOCK(mp); + } + if ((error = vfs_hang_addrlist(mp, nep, argp))) + goto out; + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_EXPORTED; + MNT_IUNLOCK(mp); + } + +out: + lockmgr(&mp->mnt_explock, LK_RELEASE, NULL); + /* + * Once we have executed the vfs_export() command, we do + * not want to keep the "export" option around in the + * options list, since that will cause subsequent MNT_UPDATE + * calls to fail. The export information is saved in + * mp->mnt_export, so we can safely delete the "export" mount option + * here. + */ + vfs_deleteopt(mp->mnt_optnew, "export"); + vfs_deleteopt(mp->mnt_opt, "export"); + return (error); +} + +/* + * Set the publicly exported filesystem (WebNFS). 
Currently, only + * one public filesystem is possible in the spec (RFC 2054 and 2055) + */ +int +vfs_setpublicfs(struct mount *mp, struct netexport *nep, + struct export_args *argp) +{ + int error; + struct vnode *rvp; + char *cp; + + /* + * mp == NULL -> invalidate the current info, the FS is + * no longer exported. May be called from either vfs_export + * or unmount, so check if it hasn't already been done. + */ + if (mp == NULL) { + if (nfs_pub.np_valid) { + nfs_pub.np_valid = 0; + if (nfs_pub.np_index != NULL) { + free(nfs_pub.np_index, M_TEMP); + nfs_pub.np_index = NULL; + } + } + return (0); + } + + /* + * Only one allowed at a time. + */ + if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) + return (EBUSY); + + /* + * Get real filehandle for root of exported FS. + */ + bzero(&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); + nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; + + if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp))) + return (error); + + if ((error = VOP_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) + return (error); + + vput(rvp); + + /* + * If an indexfile was specified, pull it in. + */ + if (argp->ex_indexfile != NULL) { + if (nfs_pub.np_index == NULL) + nfs_pub.np_index = malloc(MAXNAMLEN + 1, M_TEMP, + M_WAITOK); + error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, + MAXNAMLEN, (size_t *)0); + if (!error) { + /* + * Check for illegal filenames. + */ + for (cp = nfs_pub.np_index; *cp; cp++) { + if (*cp == '/') { + error = EINVAL; + break; + } + } + } + if (error) { + free(nfs_pub.np_index, M_TEMP); + nfs_pub.np_index = NULL; + return (error); + } + } + + nfs_pub.np_mount = mp; + nfs_pub.np_valid = 1; + return (0); +} + +/* + * Used by the filesystems to determine if a given network address + * (passed in 'nam') is present in their exports list, returns a pointer + * to struct netcred so that the filesystem can examine it for + * access rights (read/write/etc). + */ +static struct netcred * +vfs_export_lookup(struct mount *mp, struct sockaddr *nam) +{ + RADIX_NODE_HEAD_RLOCK_TRACKER; + struct netexport *nep; + struct netcred *np = NULL; + struct radix_node_head *rnh; + struct sockaddr *saddr; + + nep = mp->mnt_export; + if (nep == NULL) + return (NULL); + if ((mp->mnt_flag & MNT_EXPORTED) == 0) + return (NULL); + + /* + * Lookup in the export list + */ + if (nam != NULL) { + saddr = nam; + rnh = NULL; + switch (saddr->sa_family) { + case AF_INET: + rnh = nep->ne4; + break; + case AF_INET6: + rnh = nep->ne6; + break; + } + if (rnh != NULL) { + RADIX_NODE_HEAD_RLOCK(rnh); + np = (struct netcred *) (*rnh->rnh_matchaddr)(saddr, &rnh->rh); + RADIX_NODE_HEAD_RUNLOCK(rnh); + if (np != NULL && (np->netc_rnodes->rn_flags & RNF_ROOT) != 0) + return (NULL); + } + } + + /* + * If no address match, use the default if it exists. + */ + if (np == NULL && (mp->mnt_flag & MNT_DEFEXPORTED) != 0) + return (&nep->ne_defexported); + + return (np); +} + +/* + * XXX: This comment comes from the deprecated ufs_check_export() + * XXX: and may not entirely apply, but lacking something better: + * This is the generic part of fhtovp called after the underlying + * filesystem has validated the file handle. + * + * Verify that a host should have access to a filesystem. 
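+ *
+ * Illustrative call shape, as a consumer such as an NFS server
+ * would issue it (the local variable names here are made up):
+ *
+ *	error = VFS_CHECKEXP(mp, (struct sockaddr *)&claddr, &exflags,
+ *	    &credanon, &numsec, &secflavors);
+ *
+ * EACCES means the address is not in the export list; on success
+ * the caller owns the reference on credanon taken below and must
+ * crfree() it when finished.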
+ */ + +int +vfs_stdcheckexp(struct mount *mp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors) +{ + struct netcred *np; + + lockmgr(&mp->mnt_explock, LK_SHARED, NULL); + np = vfs_export_lookup(mp, nam); + if (np == NULL) { + lockmgr(&mp->mnt_explock, LK_RELEASE, NULL); + *credanonp = NULL; + return (EACCES); + } + *extflagsp = np->netc_exflags; + if ((*credanonp = np->netc_anon) != NULL) + crhold(*credanonp); + if (numsecflavors) + *numsecflavors = np->netc_numsecflavors; + if (secflavors) + *secflavors = np->netc_secflavors; + lockmgr(&mp->mnt_explock, LK_RELEASE, NULL); + return (0); +} + diff --git a/freebsd/sys/kern/vfs_extattr.c b/freebsd/sys/kern/vfs_extattr.c new file mode 100644 index 00000000..2903fd37 --- /dev/null +++ b/freebsd/sys/kern/vfs_extattr.c @@ -0,0 +1,757 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 1999-2001 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static int kern_extattr_set_path(struct thread *td, const char *path, + int attrnamespace, const char *attrname, void *data, + size_t nbytes, int follow); +static int kern_extattr_get_path(struct thread *td, const char *path, + int attrnamespace, const char *attrname, void *data, + size_t nbytes, int follow); +static int kern_extattr_delete_path(struct thread *td, const char *path, + int attrnamespace, const char *attrname, int follow); +static int kern_extattr_list_path(struct thread *td, const char *path, + int attrnamespace, void *data, size_t nbytes, int follow); + +/* + * Syscall to push extended attribute configuration information into the VFS. + * Accepts a path, which it converts to a mountpoint, as well as a command + * (int cmd), and attribute name and misc data. + * + * Currently this is used only by UFS1 extended attributes. 
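+ *
+ * A sketch of the userland call sequence for a UFS1 volume; the
+ * command constants come from the UFS1 backend and the paths are
+ * made up for illustration:
+ *
+ *	extattrctl("/mnt", UFS_EXTATTR_CMD_START, NULL, 0, NULL);
+ *	extattrctl("/mnt", UFS_EXTATTR_CMD_ENABLE,
+ *	    "/mnt/.attribute/user/md5", EXTATTR_NAMESPACE_USER, "md5");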
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct extattrctl_args { + const char *path; + int cmd; + const char *filename; + int attrnamespace; + const char *attrname; +}; +#endif +int +sys_extattrctl(struct thread *td, struct extattrctl_args *uap) +{ + struct vnode *filename_vp; + struct nameidata nd; + struct mount *mp, *mp_writable; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + AUDIT_ARG_CMD(uap->cmd); + AUDIT_ARG_VALUE(uap->attrnamespace); + /* + * uap->attrname is not always defined. We check again later when we + * invoke the VFS call so as to pass in NULL there if needed. + */ + if (uap->attrname != NULL) { + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, + NULL); + if (error) + return (error); + } + AUDIT_ARG_TEXT(attrname); + + mp = NULL; + filename_vp = NULL; + if (uap->filename != NULL) { + NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE2, + UIO_USERSPACE, uap->filename, td); + error = namei(&nd); + if (error) + return (error); + filename_vp = nd.ni_vp; + NDFREE(&nd, NDF_NO_VP_RELE); + } + + /* uap->path is always defined. */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, + UIO_USERSPACE, uap->path, td); + error = namei(&nd); + if (error) + goto out; + mp = nd.ni_vp->v_mount; + error = vfs_busy(mp, 0); + if (error) { + NDFREE(&nd, 0); + mp = NULL; + goto out; + } + VOP_UNLOCK(nd.ni_vp, 0); + error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH); + NDFREE(&nd, NDF_NO_VP_UNLOCK); + if (error) + goto out; + if (filename_vp != NULL) { + /* + * uap->filename is not always defined. If it is, + * grab a vnode lock, which VFS_EXTATTRCTL() will + * later release. + */ + error = vn_lock(filename_vp, LK_EXCLUSIVE); + if (error) { + vn_finished_write(mp_writable); + goto out; + } + } + + error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace, + uap->attrname != NULL ? attrname : NULL); + + vn_finished_write(mp_writable); +out: + if (mp != NULL) + vfs_unbusy(mp); + + /* + * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp, + * so vrele it if it is defined. + */ + if (filename_vp != NULL) + vrele(filename_vp); + return (error); +} + +/*- + * Set a named extended attribute on a file or directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", userspace buffer + * pointer "data", buffer length "nbytes", thread "td". 
+ * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, + void *data, size_t nbytes, struct thread *td) +{ + struct mount *mp; + struct uio auio; + struct iovec aiov; + ssize_t cnt; + int error; + + if (nbytes > IOSIZE_MAX) + return (EINVAL); + + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_resid = nbytes; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + cnt = nbytes; + +#ifdef MAC + error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace, + attrname); + if (error) + goto done; +#endif + + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, + td->td_ucred, td); + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + +#ifdef MAC +done: +#endif + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_set_fd_args { + int fd; + int attrnamespace; + const char *attrname; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_set_fd(struct thread *td, struct extattr_set_fd_args *uap) +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_VALUE(uap->attrnamespace); + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + AUDIT_ARG_TEXT(attrname); + + error = getvnode(td, uap->fd, + cap_rights_init(&rights, CAP_EXTATTR_SET), &fp); + if (error) + return (error); + + error = extattr_set_vp(fp->f_vnode, uap->attrnamespace, + attrname, uap->data, uap->nbytes, td); + fdrop(fp, td); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_set_file_args { + const char *path; + int attrnamespace; + const char *attrname; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_set_file(struct thread *td, struct extattr_set_file_args *uap) +{ + + return (kern_extattr_set_path(td, uap->path, uap->attrnamespace, + uap->attrname, uap->data, uap->nbytes, FOLLOW)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_set_link_args { + const char *path; + int attrnamespace; + const char *attrname; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_set_link(struct thread *td, struct extattr_set_link_args *uap) +{ + + return (kern_extattr_set_path(td, uap->path, uap->attrnamespace, + uap->attrname, uap->data, uap->nbytes, NOFOLLOW)); +} + +static int +kern_extattr_set_path(struct thread *td, const char *path, int attrnamespace, + const char *uattrname, void *data, size_t nbytes, int follow) +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + AUDIT_ARG_VALUE(attrnamespace); + error = copyinstr(uattrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + AUDIT_ARG_TEXT(attrname); + + NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_set_vp(nd.ni_vp, attrnamespace, attrname, data, + nbytes, td); + + vrele(nd.ni_vp); + return (error); +} + +/*- + * Get a named extended attribute on a file or directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", 
userspace buffer + * pointer "data", buffer length "nbytes", thread "td". + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, + void *data, size_t nbytes, struct thread *td) +{ + struct uio auio, *auiop; + struct iovec aiov; + ssize_t cnt; + size_t size, *sizep; + int error; + + if (nbytes > IOSIZE_MAX) + return (EINVAL); + + vn_lock(vp, LK_SHARED | LK_RETRY); + + /* + * Slightly unusual semantics: if the user provides a NULL data + * pointer, they don't want to receive the data, just the maximum + * read length. + */ + auiop = NULL; + sizep = NULL; + cnt = 0; + if (data != NULL) { + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_resid = nbytes; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auiop = &auio; + cnt = nbytes; + } else + sizep = &size; + +#ifdef MAC + error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace, + attrname); + if (error) + goto done; +#endif + + error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, + td->td_ucred, td); + + if (auiop != NULL) { + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + } else + td->td_retval[0] = size; +#ifdef MAC +done: +#endif + VOP_UNLOCK(vp, 0); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_get_fd_args { + int fd; + int attrnamespace; + const char *attrname; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_get_fd(struct thread *td, struct extattr_get_fd_args *uap) +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_VALUE(uap->attrnamespace); + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + AUDIT_ARG_TEXT(attrname); + + error = getvnode(td, uap->fd, + cap_rights_init(&rights, CAP_EXTATTR_GET), &fp); + if (error) + return (error); + + error = extattr_get_vp(fp->f_vnode, uap->attrnamespace, + attrname, uap->data, uap->nbytes, td); + + fdrop(fp, td); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_get_file_args { + const char *path; + int attrnamespace; + const char *attrname; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_get_file(struct thread *td, struct extattr_get_file_args *uap) +{ + return (kern_extattr_get_path(td, uap->path, uap->attrnamespace, + uap->attrname, uap->data, uap->nbytes, FOLLOW)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_get_link_args { + const char *path; + int attrnamespace; + const char *attrname; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_get_link(struct thread *td, struct extattr_get_link_args *uap) +{ + return (kern_extattr_get_path(td, uap->path, uap->attrnamespace, + uap->attrname, uap->data, uap->nbytes, NOFOLLOW)); +} + +static int +kern_extattr_get_path(struct thread *td, const char *path, int attrnamespace, + const char *uattrname, void *data, size_t nbytes, int follow) +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + AUDIT_ARG_VALUE(attrnamespace); + error = copyinstr(uattrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + AUDIT_ARG_TEXT(attrname); + + NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = 
extattr_get_vp(nd.ni_vp, attrnamespace, attrname, data, + nbytes, td); + + vrele(nd.ni_vp); + return (error); +} + +/* + * extattr_delete_vp(): Delete a named extended attribute on a file or + * directory + * + * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", + * kernelspace string pointer "attrname", proc "p" + * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, + struct thread *td) +{ + struct mount *mp; + int error; + + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + +#ifdef MAC + error = mac_vnode_check_deleteextattr(td->td_ucred, vp, attrnamespace, + attrname); + if (error) + goto done; +#endif + + error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred, + td); + if (error == EOPNOTSUPP) + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, + td->td_ucred, td); +#ifdef MAC +done: +#endif + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_delete_fd_args { + int fd; + int attrnamespace; + const char *attrname; +}; +#endif +int +sys_extattr_delete_fd(struct thread *td, struct extattr_delete_fd_args *uap) +{ + struct file *fp; + char attrname[EXTATTR_MAXNAMELEN]; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_VALUE(uap->attrnamespace); + error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return (error); + AUDIT_ARG_TEXT(attrname); + + error = getvnode(td, uap->fd, + cap_rights_init(&rights, CAP_EXTATTR_DELETE), &fp); + if (error) + return (error); + + error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace, + attrname, td); + fdrop(fp, td); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_delete_file_args { + const char *path; + int attrnamespace; + const char *attrname; +}; +#endif +int +sys_extattr_delete_file(struct thread *td, struct extattr_delete_file_args *uap) +{ + + return (kern_extattr_delete_path(td, uap->path, uap->attrnamespace, + uap->attrname, FOLLOW)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_delete_link_args { + const char *path; + int attrnamespace; + const char *attrname; +}; +#endif +int +sys_extattr_delete_link(struct thread *td, struct extattr_delete_link_args *uap) +{ + + return (kern_extattr_delete_path(td, uap->path, uap->attrnamespace, + uap->attrname, NOFOLLOW)); +} + +static int +kern_extattr_delete_path(struct thread *td, const char *path, int attrnamespace, + const char *uattrname, int follow) +{ + struct nameidata nd; + char attrname[EXTATTR_MAXNAMELEN]; + int error; + + AUDIT_ARG_VALUE(attrnamespace); + error = copyinstr(uattrname, attrname, EXTATTR_MAXNAMELEN, NULL); + if (error) + return(error); + AUDIT_ARG_TEXT(attrname); + + NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error) + return(error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_delete_vp(nd.ni_vp, attrnamespace, attrname, td); + vrele(nd.ni_vp); + return(error); +} + +/*- + * Retrieve a list of extended attributes on a file or directory. + * + * Arguments: unlocked vnode "vp", attribute namespace 'attrnamespace", + * userspace buffer pointer "data", buffer length "nbytes", + * thread "td". 
+ * Returns: 0 on success, an error number otherwise + * Locks: none + * References: vp must be a valid reference for the duration of the call + */ +static int +extattr_list_vp(struct vnode *vp, int attrnamespace, void *data, + size_t nbytes, struct thread *td) +{ + struct uio auio, *auiop; + size_t size, *sizep; + struct iovec aiov; + ssize_t cnt; + int error; + + if (nbytes > IOSIZE_MAX) + return (EINVAL); + + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + + auiop = NULL; + sizep = NULL; + cnt = 0; + if (data != NULL) { + aiov.iov_base = data; + aiov.iov_len = nbytes; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_resid = nbytes; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_td = td; + auiop = &auio; + cnt = nbytes; + } else + sizep = &size; + +#ifdef MAC + error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace); + if (error) + goto done; +#endif + + error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep, + td->td_ucred, td); + + if (auiop != NULL) { + cnt -= auio.uio_resid; + td->td_retval[0] = cnt; + } else + td->td_retval[0] = size; +#ifdef MAC +done: +#endif + VOP_UNLOCK(vp, 0); + return (error); +} + + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_list_fd_args { + int fd; + int attrnamespace; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_list_fd(struct thread *td, struct extattr_list_fd_args *uap) +{ + struct file *fp; + cap_rights_t rights; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_VALUE(uap->attrnamespace); + error = getvnode(td, uap->fd, + cap_rights_init(&rights, CAP_EXTATTR_LIST), &fp); + if (error) + return (error); + + error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data, + uap->nbytes, td); + + fdrop(fp, td); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_list_file_args { + const char *path; + int attrnamespace; + void *data; + size_t nbytes; +} +#endif +int +sys_extattr_list_file(struct thread *td, struct extattr_list_file_args *uap) +{ + + return (kern_extattr_list_path(td, uap->path, uap->attrnamespace, + uap->data, uap->nbytes, FOLLOW)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct extattr_list_link_args { + const char *path; + int attrnamespace; + void *data; + size_t nbytes; +}; +#endif +int +sys_extattr_list_link(struct thread *td, struct extattr_list_link_args *uap) +{ + + return (kern_extattr_list_path(td, uap->path, uap->attrnamespace, + uap->data, uap->nbytes, NOFOLLOW)); +} + +static int +kern_extattr_list_path(struct thread *td, const char *path, int attrnamespace, + void *data, size_t nbytes, int follow) +{ + struct nameidata nd; + int error; + + AUDIT_ARG_VALUE(attrnamespace); + NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path, td); + error = namei(&nd); + if (error) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = extattr_list_vp(nd.ni_vp, attrnamespace, data, nbytes, td); + + vrele(nd.ni_vp); + return (error); +} diff --git a/freebsd/sys/kern/vfs_hash.c b/freebsd/sys/kern/vfs_hash.c new file mode 100644 index 00000000..b938f485 --- /dev/null +++ b/freebsd/sys/kern/vfs_hash.c @@ -0,0 +1,234 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2005 Poul-Henning Kamp + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +static MALLOC_DEFINE(M_VFS_HASH, "vfs_hash", "VFS hash table"); + +static LIST_HEAD(vfs_hash_head, vnode) *vfs_hash_tbl; +static LIST_HEAD(,vnode) vfs_hash_side; +static u_long vfs_hash_mask; +static struct rwlock vfs_hash_lock; + +static void +vfs_hashinit(void *dummy __unused) +{ + + vfs_hash_tbl = hashinit(desiredvnodes, M_VFS_HASH, &vfs_hash_mask); + rw_init(&vfs_hash_lock, "vfs hash"); + LIST_INIT(&vfs_hash_side); +} + +/* Must be SI_ORDER_SECOND so desiredvnodes is available */ +SYSINIT(vfs_hash, SI_SUB_VFS, SI_ORDER_SECOND, vfs_hashinit, NULL); + +u_int +vfs_hash_index(struct vnode *vp) +{ + + return (vp->v_hash + vp->v_mount->mnt_hashseed); +} + +static struct vfs_hash_head * +vfs_hash_bucket(const struct mount *mp, u_int hash) +{ + + return (&vfs_hash_tbl[(hash + mp->mnt_hashseed) & vfs_hash_mask]); +} + +int +vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, + struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg) +{ + struct vnode *vp; + int error; + + while (1) { + rw_rlock(&vfs_hash_lock); + LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) { + if (vp->v_hash != hash) + continue; + if (vp->v_mount != mp) + continue; + if (fn != NULL && fn(vp, arg)) + continue; + vhold(vp); + rw_runlock(&vfs_hash_lock); + error = vget(vp, flags | LK_VNHELD, td); + if (error == ENOENT && (flags & LK_NOWAIT) == 0) + break; + if (error) + return (error); + *vpp = vp; + return (0); + } + if (vp == NULL) { + rw_runlock(&vfs_hash_lock); + *vpp = NULL; + return (0); + } + } +} + +void +vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td, + struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg) +{ + struct vnode *vp; + + while (1) { + rw_rlock(&vfs_hash_lock); + LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) { + if (vp->v_hash != hash) + continue; + if (vp->v_mount != mp) + continue; + if (fn != NULL && fn(vp, arg)) + continue; + vhold(vp); + rw_runlock(&vfs_hash_lock); + vref(vp); + vdrop(vp); + *vpp = vp; + return; + } + if (vp == NULL) { + rw_runlock(&vfs_hash_lock); + *vpp = NULL; + return; + } + } +} + +void +vfs_hash_remove(struct vnode *vp) +{ + + rw_wlock(&vfs_hash_lock); + LIST_REMOVE(vp, v_hashlist); + rw_wunlock(&vfs_hash_lock); +} + +int +vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, + struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg) +{ + struct vnode *vp2; + int error; + + *vpp = NULL; + while (1) { + 
rw_wlock(&vfs_hash_lock); + LIST_FOREACH(vp2, + vfs_hash_bucket(vp->v_mount, hash), v_hashlist) { + if (vp2->v_hash != hash) + continue; + if (vp2->v_mount != vp->v_mount) + continue; + if (fn != NULL && fn(vp2, arg)) + continue; + vhold(vp2); + rw_wunlock(&vfs_hash_lock); + error = vget(vp2, flags | LK_VNHELD, td); + if (error == ENOENT && (flags & LK_NOWAIT) == 0) + break; + rw_wlock(&vfs_hash_lock); + LIST_INSERT_HEAD(&vfs_hash_side, vp, v_hashlist); + rw_wunlock(&vfs_hash_lock); + vput(vp); + if (!error) + *vpp = vp2; + return (error); + } + if (vp2 == NULL) + break; + + } + vp->v_hash = hash; + LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist); + rw_wunlock(&vfs_hash_lock); + return (0); +} + +void +vfs_hash_rehash(struct vnode *vp, u_int hash) +{ + + rw_wlock(&vfs_hash_lock); + LIST_REMOVE(vp, v_hashlist); + LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist); + vp->v_hash = hash; + rw_wunlock(&vfs_hash_lock); +} + +void +vfs_hash_changesize(int newmaxvnodes) +{ + struct vfs_hash_head *vfs_hash_newtbl, *vfs_hash_oldtbl; + u_long vfs_hash_newmask, vfs_hash_oldmask; + struct vnode *vp; + int i; + + vfs_hash_newtbl = hashinit(newmaxvnodes, M_VFS_HASH, + &vfs_hash_newmask); + /* If same hash table size, nothing to do */ + if (vfs_hash_mask == vfs_hash_newmask) { + free(vfs_hash_newtbl, M_VFS_HASH); + return; + } + /* + * Move everything from the old hash table to the new table. + * None of the vnodes in the table can be recycled because to + * do so, they have to be removed from the hash table. + */ + rw_wlock(&vfs_hash_lock); + vfs_hash_oldtbl = vfs_hash_tbl; + vfs_hash_oldmask = vfs_hash_mask; + vfs_hash_tbl = vfs_hash_newtbl; + vfs_hash_mask = vfs_hash_newmask; + for (i = 0; i <= vfs_hash_oldmask; i++) { + while ((vp = LIST_FIRST(&vfs_hash_oldtbl[i])) != NULL) { + LIST_REMOVE(vp, v_hashlist); + LIST_INSERT_HEAD( + vfs_hash_bucket(vp->v_mount, vp->v_hash), + vp, v_hashlist); + } + } + rw_wunlock(&vfs_hash_lock); + free(vfs_hash_oldtbl, M_VFS_HASH); +} diff --git a/freebsd/sys/kern/vfs_init.c b/freebsd/sys/kern/vfs_init.c new file mode 100644 index 00000000..5eb38e6d --- /dev/null +++ b/freebsd/sys/kern/vfs_init.c @@ -0,0 +1,376 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int vfs_register(struct vfsconf *); +static int vfs_unregister(struct vfsconf *); + +MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes"); + +/* + * The highest defined VFS number. + */ +int maxvfsconf = VFS_GENERIC + 1; + +/* + * Single-linked list of configured VFSes. + * New entries are added/deleted by vfs_register()/vfs_unregister() + */ +struct vfsconfhead vfsconf = TAILQ_HEAD_INITIALIZER(vfsconf); +struct sx vfsconf_sx; +SX_SYSINIT(vfsconf, &vfsconf_sx, "vfsconf"); + +/* + * Loader.conf variable vfs.typenumhash enables setting vfc_typenum using a hash + * calculation on vfc_name, so that it doesn't change when file systems are + * loaded in a different order. This will avoid the NFS server file handles from + * changing for file systems that use vfc_typenum in their fsid. + */ +static int vfs_typenumhash = 1; +SYSCTL_INT(_vfs, OID_AUTO, typenumhash, CTLFLAG_RDTUN, &vfs_typenumhash, 0, + "Set vfc_typenum using a hash calculation on vfc_name, so that it does not" + "change when file systems are loaded in a different order."); + +/* + * A Zen vnode attribute structure. + * + * Initialized when the first filesystem registers by vfs_register(). + */ +struct vattr va_null; + +/* + * vfs_init.c + * + * Allocate and fill in operations vectors. + * + * An undocumented feature of this approach to defining operations is that + * there can be multiple entries in vfs_opv_descs for the same operations + * vector. This allows third parties to extend the set of operations + * supported by another layer in a binary compatibile way. For example, + * assume that NFS needed to be modified to support Ficus. NFS has an entry + * (probably nfs_vnopdeop_decls) declaring all the operations NFS supports by + * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_entensions) + * listing those new operations Ficus adds to NFS, all without modifying the + * NFS code. (Of couse, the OTW NFS protocol still needs to be munged, but + * that is a(whole)nother story.) This is a feature. + */ + +/* + * Routines having to do with the management of the vnode table. 
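+ * (The functions that follow manage the table of registered
+ * filesystem types.) For example, a sketch of an on-demand type
+ * lookup, with a made-up filesystem name:
+ *
+ *	vfsp = vfs_byname_kld("examplefs", td, &error);
+ *	if (vfsp == NULL)
+ *		return (error);	/* unknown type or module load failed */
+ *
+ * Note that vfs_byname_locked() silently aliases "ffs" to "ufs".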
+ */ + +static struct vfsconf * +vfs_byname_locked(const char *name) +{ + struct vfsconf *vfsp; + + sx_assert(&vfsconf_sx, SA_LOCKED); + if (!strcmp(name, "ffs")) + name = "ufs"; + TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { + if (!strcmp(name, vfsp->vfc_name)) + return (vfsp); + } + return (NULL); +} + +struct vfsconf * +vfs_byname(const char *name) +{ + struct vfsconf *vfsp; + + vfsconf_slock(); + vfsp = vfs_byname_locked(name); + vfsconf_sunlock(); + return (vfsp); +} + +struct vfsconf * +vfs_byname_kld(const char *fstype, struct thread *td, int *error) +{ + struct vfsconf *vfsp; + int fileid, loaded; + + vfsp = vfs_byname(fstype); + if (vfsp != NULL) + return (vfsp); + + /* Try to load the respective module. */ + *error = kern_kldload(td, fstype, &fileid); + loaded = (*error == 0); + if (*error == EEXIST) + *error = 0; + if (*error) + return (NULL); + + /* Look up again to see if the VFS was loaded. */ + vfsp = vfs_byname(fstype); + if (vfsp == NULL) { + if (loaded) + (void)kern_kldunload(td, fileid, LINKER_UNLOAD_FORCE); + *error = ENODEV; + return (NULL); + } + return (vfsp); +} + + +/* Register a new filesystem type in the global table */ +static int +vfs_register(struct vfsconf *vfc) +{ + struct sysctl_oid *oidp; + struct vfsops *vfsops; + static int once; + struct vfsconf *tvfc; + uint32_t hashval; + int secondpass; + + if (!once) { + vattr_null(&va_null); + once = 1; + } + + if (vfc->vfc_version != VFS_VERSION) { + printf("ERROR: filesystem %s, unsupported ABI version %x\n", + vfc->vfc_name, vfc->vfc_version); + return (EINVAL); + } + vfsconf_lock(); + if (vfs_byname_locked(vfc->vfc_name) != NULL) { + vfsconf_unlock(); + return (EEXIST); + } + + if (vfs_typenumhash != 0) { + /* + * Calculate a hash on vfc_name to use for vfc_typenum. Unless + * all of 1<->255 are assigned, it is limited to 8bits since + * that is what ZFS uses from vfc_typenum and is also the + * preferred range for vfs_getnewfsid(). + */ + hashval = fnv_32_str(vfc->vfc_name, FNV1_32_INIT); + hashval &= 0xff; + secondpass = 0; + do { + /* Look for and fix any collision. */ + TAILQ_FOREACH(tvfc, &vfsconf, vfc_list) { + if (hashval == tvfc->vfc_typenum) { + if (hashval == 255 && secondpass == 0) { + hashval = 1; + secondpass = 1; + } else + hashval++; + break; + } + } + } while (tvfc != NULL); + vfc->vfc_typenum = hashval; + if (vfc->vfc_typenum >= maxvfsconf) + maxvfsconf = vfc->vfc_typenum + 1; + } else + vfc->vfc_typenum = maxvfsconf++; + TAILQ_INSERT_TAIL(&vfsconf, vfc, vfc_list); + + /* + * Initialise unused ``struct vfsops'' fields, to use + * the vfs_std*() functions. Note, we need the mount + * and unmount operations, at the least. The check + * for vfsops available is just a debugging aid. + */ + KASSERT(vfc->vfc_vfsops != NULL, + ("Filesystem %s has no vfsops", vfc->vfc_name)); + /* + * Check the mount and unmount operations. + */ + vfsops = vfc->vfc_vfsops; + KASSERT(vfsops->vfs_mount != NULL, + ("Filesystem %s has no mount op", vfc->vfc_name)); + KASSERT(vfsops->vfs_unmount != NULL, + ("Filesystem %s has no unmount op", vfc->vfc_name)); + + if (vfsops->vfs_root == NULL) + /* return file system's root vnode */ + vfsops->vfs_root = vfs_stdroot; + if (vfsops->vfs_quotactl == NULL) + /* quota control */ + vfsops->vfs_quotactl = vfs_stdquotactl; + if (vfsops->vfs_statfs == NULL) + /* return file system's status */ + vfsops->vfs_statfs = vfs_stdstatfs; + if (vfsops->vfs_sync == NULL) + /* + * flush unwritten data (nosync) + * file systems can use vfs_stdsync + * explicitly by setting it in the + * vfsop vector. 
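+ *
+ * A filesystem that does want the generic flush can install it
+ * in its own vfsops initializer, e.g. (sketch, placeholder names):
+ *
+ *	static struct vfsops examplefs_vfsops = {
+ *		.vfs_mount	= examplefs_mount,
+ *		.vfs_unmount	= examplefs_unmount,
+ *		.vfs_sync	= vfs_stdsync,
+ *	};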
+ */ + vfsops->vfs_sync = vfs_stdnosync; + if (vfsops->vfs_vget == NULL) + /* convert an inode number to a vnode */ + vfsops->vfs_vget = vfs_stdvget; + if (vfsops->vfs_fhtovp == NULL) + /* turn an NFS file handle into a vnode */ + vfsops->vfs_fhtovp = vfs_stdfhtovp; + if (vfsops->vfs_checkexp == NULL) + /* check if file system is exported */ + vfsops->vfs_checkexp = vfs_stdcheckexp; + if (vfsops->vfs_init == NULL) + /* file system specific initialisation */ + vfsops->vfs_init = vfs_stdinit; + if (vfsops->vfs_uninit == NULL) + /* file system specific uninitialisation */ + vfsops->vfs_uninit = vfs_stduninit; + if (vfsops->vfs_extattrctl == NULL) + /* extended attribute control */ + vfsops->vfs_extattrctl = vfs_stdextattrctl; + if (vfsops->vfs_sysctl == NULL) + vfsops->vfs_sysctl = vfs_stdsysctl; + + if (vfc->vfc_flags & VFCF_JAIL) + prison_add_vfs(vfc); + + /* + * Call init function for this VFS... + */ + (*(vfc->vfc_vfsops->vfs_init))(vfc); + vfsconf_unlock(); + + /* + * If this filesystem has a sysctl node under vfs + * (i.e. vfs.xxfs), then change the oid number of that node to + * match the filesystem's type number. This allows user code + * which uses the type number to read sysctl variables defined + * by the filesystem to continue working. Since the oids are + * in a sorted list, we need to make sure the order is + * preserved by re-registering the oid after modifying its + * number. + */ + sysctl_wlock(); + SLIST_FOREACH(oidp, SYSCTL_CHILDREN(&sysctl___vfs), oid_link) { + if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) { + sysctl_unregister_oid(oidp); + oidp->oid_number = vfc->vfc_typenum; + sysctl_register_oid(oidp); + break; + } + } + sysctl_wunlock(); + + return (0); +} + + +/* Remove registration of a filesystem type */ +static int +vfs_unregister(struct vfsconf *vfc) +{ + struct vfsconf *vfsp; + int error, maxtypenum; + + vfsconf_lock(); + vfsp = vfs_byname_locked(vfc->vfc_name); + if (vfsp == NULL) { + vfsconf_unlock(); + return (EINVAL); + } + if (vfsp->vfc_refcount != 0) { + vfsconf_unlock(); + return (EBUSY); + } + if (vfc->vfc_vfsops->vfs_uninit != NULL) { + error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp); + if (error != 0) { + vfsconf_unlock(); + return (error); + } + } + TAILQ_REMOVE(&vfsconf, vfsp, vfc_list); + maxtypenum = VFS_GENERIC; + TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) + if (maxtypenum < vfsp->vfc_typenum) + maxtypenum = vfsp->vfc_typenum; + maxvfsconf = maxtypenum + 1; + vfsconf_unlock(); + return (0); +} + +/* + * Standard kernel module handling code for filesystem modules. + * Referenced from VFS_SET(). + */ +int +vfs_modevent(module_t mod, int type, void *data) +{ + struct vfsconf *vfc; + int error = 0; + + vfc = (struct vfsconf *)data; + + switch (type) { + case MOD_LOAD: + if (vfc) + error = vfs_register(vfc); + break; + + case MOD_UNLOAD: + if (vfc) + error = vfs_unregister(vfc); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} diff --git a/freebsd/sys/kern/vfs_lookup.c b/freebsd/sys/kern/vfs_lookup.c new file mode 100644 index 00000000..5ee3f219 --- /dev/null +++ b/freebsd/sys/kern/vfs_lookup.c @@ -0,0 +1,1450 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. 
and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_capsicum.h" +#include "opt_ktrace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#endif + +#include +#include + +#include + +#define NAMEI_DIAGNOSTIC 1 +#undef NAMEI_DIAGNOSTIC + +SDT_PROVIDER_DECLARE(vfs); +SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *", + "unsigned long"); +SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *"); + +/* Allocation zone for namei. */ +uma_zone_t namei_zone; + +/* Placeholder vnode for mp traversal. */ +static struct vnode *vp_crossmp; + +static int +crossmp_vop_islocked(struct vop_islocked_args *ap) +{ + + return (LK_SHARED); +} + +static int +crossmp_vop_lock1(struct vop_lock1_args *ap) +{ + struct vnode *vp; + struct lock *lk __unused; + const char *file __unused; + int flags, line __unused; + + vp = ap->a_vp; + lk = vp->v_vnlock; + flags = ap->a_flags; + file = ap->a_file; + line = ap->a_line; + + if ((flags & LK_SHARED) == 0) + panic("invalid lock request for crossmp"); + + WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, + flags & LK_INTERLOCK ? 
&VI_MTX(vp)->lock_object : NULL); + WITNESS_LOCK(&lk->lock_object, 0, file, line); + if ((flags & LK_INTERLOCK) != 0) + VI_UNLOCK(vp); + LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, line); + return (0); +} + +static int +crossmp_vop_unlock(struct vop_unlock_args *ap) +{ + struct vnode *vp; + struct lock *lk __unused; + int flags; + + vp = ap->a_vp; + lk = vp->v_vnlock; + flags = ap->a_flags; + + if ((flags & LK_INTERLOCK) != 0) + VI_UNLOCK(vp); + WITNESS_UNLOCK(&lk->lock_object, 0, LOCK_FILE, LOCK_LINE); + LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE, + LOCK_LINE); + return (0); +} + +static struct vop_vector crossmp_vnodeops = { + .vop_default = &default_vnodeops, + .vop_islocked = crossmp_vop_islocked, + .vop_lock1 = crossmp_vop_lock1, + .vop_unlock = crossmp_vop_unlock, +}; + +struct nameicap_tracker { + struct vnode *dp; + TAILQ_ENTRY(nameicap_tracker) nm_link; +}; + +/* Zone for cap mode tracker elements used for dotdot capability checks. */ +static uma_zone_t nt_zone; + +static void +nameiinit(void *dummy __unused) +{ + + namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + nt_zone = uma_zcreate("rentr", sizeof(struct nameicap_tracker), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + getnewvnode("crossmp", NULL, &crossmp_vnodeops, &vp_crossmp); +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL); + +static int lookup_cap_dotdot = 1; +SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN, + &lookup_cap_dotdot, 0, + "enables \"..\" components in path lookup in capability mode"); +static int lookup_cap_dotdot_nonlocal = 1; +SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN, + &lookup_cap_dotdot_nonlocal, 0, + "enables \"..\" components in path lookup in capability mode " + "on non-local mount"); + +static void +nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp) +{ + struct nameicap_tracker *nt; + + if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR) + return; + nt = uma_zalloc(nt_zone, M_WAITOK); + vhold(dp); + nt->dp = dp; + TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link); +} + +static void +nameicap_cleanup(struct nameidata *ndp) +{ + struct nameicap_tracker *nt, *nt1; + + KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) || + (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative")); + TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) { + TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link); + vdrop(nt->dp); + uma_zfree(nt_zone, nt); + } +} + +/* + * For dotdot lookups in capability mode, only allow the component + * lookup to succeed if the resulting directory was already traversed + * during the operation. Also fail dotdot lookups for non-local + * filesystems, where external agents might assist local lookups to + * escape the compartment. 
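+ *
+ * Illustrative sketch (userspace view, hypothetical names, not part
+ * of the imported sources), with vfs.lookup_cap_dotdot enabled:
+ *
+ *      cap_enter();
+ *      openat(dfd, "sub/../file", O_RDONLY);   (ok: ".." returns to a
+ *                                              directory walked above)
+ *      openat(dfd, "../escape", O_RDONLY);     (fails with ENOTCAPABLE)
+ *
+ * With the sysctl disabled, any ".." component in such a constrained
+ * lookup fails the check.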
+ */ +static int +nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp) +{ + struct nameicap_tracker *nt; + struct mount *mp; + + if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp == NULL || + dp->v_type != VDIR) + return (0); + mp = dp->v_mount; + if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL && + (mp->mnt_flag & MNT_LOCAL) == 0) + return (ENOTCAPABLE); + TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head, + nm_link) { + if (dp == nt->dp) + return (0); + } + return (ENOTCAPABLE); +} + +static void +namei_cleanup_cnp(struct componentname *cnp) +{ + + uma_zfree(namei_zone, cnp->cn_pnbuf); +#ifdef DIAGNOSTIC + cnp->cn_pnbuf = NULL; + cnp->cn_nameptr = NULL; +#endif +} + +static int +namei_handle_root(struct nameidata *ndp, struct vnode **dpp) +{ + struct componentname *cnp; + + cnp = &ndp->ni_cnd; + if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0) { +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_CAPFAIL)) + ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); +#endif + return (ENOTCAPABLE); + } + while (*(cnp->cn_nameptr) == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + *dpp = ndp->ni_rootdir; + vrefact(*dpp); + return (0); +} + +/* + * Convert a pathname into a pointer to a locked vnode. + * + * The FOLLOW flag is set when symbolic links are to be followed + * when they occur at the end of the name translation process. + * Symbolic links are always followed for all other pathname + * components other than the last. + * + * The segflg defines whether the name is to be copied from user + * space or kernel space. + * + * Overall outline of namei: + * + * copy in name + * get starting directory + * while (!done && !error) { + * call lookup to search path. + * if symbolic link, massage name in buffer and continue + * } + */ +int +namei(struct nameidata *ndp) +{ + struct filedesc *fdp; /* pointer to file descriptor state */ + char *cp; /* pointer into pathname argument */ + struct vnode *dp; /* the directory we are searching */ + struct iovec aiov; /* uio for reading symbolic links */ + struct componentname *cnp; + struct file *dfp; + struct thread *td; + struct proc *p; + cap_rights_t rights; + struct uio auio; + int error, linklen, startdir_used; + + cnp = &ndp->ni_cnd; + td = cnp->cn_thread; + p = td->td_proc; + ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred; + KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc")); + KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, + ("namei: nameiop contaminated with flags")); + KASSERT((cnp->cn_flags & OPMASK) == 0, + ("namei: flags contaminated with nameiops")); + MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR || + ndp->ni_startdir->v_type == VBAD); + fdp = p->p_fd; + TAILQ_INIT(&ndp->ni_cap_tracker); + ndp->ni_lcf = 0; + + /* We will set this ourselves if we need it. */ + cnp->cn_flags &= ~TRAILINGSLASH; + + /* + * Get a buffer for the name to be translated, and copy the + * name into the buffer. + */ + if ((cnp->cn_flags & HASBUF) == 0) + cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); + if (ndp->ni_segflg == UIO_SYSSPACE) + error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, + &ndp->ni_pathlen); + else + error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, + &ndp->ni_pathlen); + + /* + * Don't allow empty pathnames. 
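+ * POSIX requires this: for example, open("", O_RDONLY) must fail
+ * with ENOENT rather than resolving to the current directory.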
+ */ + if (error == 0 && *cnp->cn_pnbuf == '\0') + error = ENOENT; + +#ifdef CAPABILITY_MODE + /* + * In capability mode, lookups must be restricted to happen in + * the subtree with the root specified by the file descriptor: + * - The root must be real file descriptor, not the pseudo-descriptor + * AT_FDCWD. + * - The passed path must be relative and not absolute. + * - If lookup_cap_dotdot is disabled, path must not contain the + * '..' components. + * - If lookup_cap_dotdot is enabled, we verify that all '..' + * components lookups result in the directories which were + * previously walked by us, which prevents an escape from + * the relative root. + */ + if (error == 0 && IN_CAPABILITY_MODE(td) && + (cnp->cn_flags & NOCAPCHECK) == 0) { + ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; + if (ndp->ni_dirfd == AT_FDCWD) { +#ifdef KTRACE + if (KTRPOINT(td, KTR_CAPFAIL)) + ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); +#endif + error = ECAPMODE; + } + } +#endif + if (error != 0) { + namei_cleanup_cnp(cnp); + ndp->ni_vp = NULL; + return (error); + } + ndp->ni_loopcnt = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_NAMEI)) { + KASSERT(cnp->cn_thread == curthread, + ("namei not using curthread")); + ktrnamei(cnp->cn_pnbuf); + } +#endif + /* + * Get starting point for the translation. + */ + FILEDESC_SLOCK(fdp); + ndp->ni_rootdir = fdp->fd_rdir; + vrefact(ndp->ni_rootdir); + ndp->ni_topdir = fdp->fd_jdir; + + /* + * If we are auditing the kernel pathname, save the user pathname. + */ + if (cnp->cn_flags & AUDITVNODE1) + AUDIT_ARG_UPATH1(td, ndp->ni_dirfd, cnp->cn_pnbuf); + if (cnp->cn_flags & AUDITVNODE2) + AUDIT_ARG_UPATH2(td, ndp->ni_dirfd, cnp->cn_pnbuf); + + startdir_used = 0; + dp = NULL; + cnp->cn_nameptr = cnp->cn_pnbuf; + if (cnp->cn_pnbuf[0] == '/') { + ndp->ni_resflags |= NIRES_ABS; + error = namei_handle_root(ndp, &dp); + } else { + if (ndp->ni_startdir != NULL) { + dp = ndp->ni_startdir; + startdir_used = 1; + } else if (ndp->ni_dirfd == AT_FDCWD) { + dp = fdp->fd_cdir; + vrefact(dp); + } else { + rights = ndp->ni_rightsneeded; + cap_rights_set(&rights, CAP_LOOKUP); + + if (cnp->cn_flags & AUDITVNODE1) + AUDIT_ARG_ATFD1(ndp->ni_dirfd); + if (cnp->cn_flags & AUDITVNODE2) + AUDIT_ARG_ATFD2(ndp->ni_dirfd); + /* + * Effectively inlined fgetvp_rights, because we need to + * inspect the file as well as grabbing the vnode. + */ + error = fget_cap_locked(fdp, ndp->ni_dirfd, &rights, + &dfp, &ndp->ni_filecaps); + if (error != 0) { + /* + * Preserve the error; it should either be EBADF + * or capability-related, both of which can be + * safely returned to the caller. + */ + } else if (dfp->f_ops == &badfileops) { + error = EBADF; + } else if (dfp->f_vnode == NULL) { + error = ENOTDIR; + } else { + dp = dfp->f_vnode; + vrefact(dp); + + if ((dfp->f_flag & FSEARCH) != 0) + cnp->cn_flags |= NOEXECCHECK; + } +#ifdef CAPABILITIES + /* + * If file descriptor doesn't have all rights, + * all lookups relative to it must also be + * strictly relative. 
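+ *
+ * Illustrative sketch (hypothetical descriptor, not part of the
+ * imported sources): even outside capability mode, a relative lookup
+ * that starts from a rights-limited descriptor cannot leave the
+ * subtree below it:
+ *
+ *      cap_rights_init(&rights, CAP_LOOKUP, CAP_READ);
+ *      cap_rights_limit(dfd, &rights);
+ *      openat(dfd, "inside", O_RDONLY);        (permitted)
+ *      openat(dfd, "../outside", O_RDONLY);    (fails with ENOTCAPABLE)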
+ */ + CAP_ALL(&rights); + if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, + &rights) || + ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || + ndp->ni_filecaps.fc_nioctls != -1) { + ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; + } +#endif + } + if (error == 0 && dp->v_type != VDIR) + error = ENOTDIR; + } + FILEDESC_SUNLOCK(fdp); + if (ndp->ni_startdir != NULL && !startdir_used) + vrele(ndp->ni_startdir); + if (error != 0) { + if (dp != NULL) + vrele(dp); + goto out; + } + if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 && + lookup_cap_dotdot != 0) + ndp->ni_lcf |= NI_LCF_CAP_DOTDOT; + SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf, + cnp->cn_flags); + for (;;) { + ndp->ni_startdir = dp; + error = lookup(ndp); + if (error != 0) + goto out; + /* + * If not a symbolic link, we're done. + */ + if ((cnp->cn_flags & ISSYMLINK) == 0) { + vrele(ndp->ni_rootdir); + if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) { + namei_cleanup_cnp(cnp); + } else + cnp->cn_flags |= HASBUF; + nameicap_cleanup(ndp); + SDT_PROBE2(vfs, namei, lookup, return, 0, ndp->ni_vp); + return (0); + } + if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { + error = ELOOP; + break; + } +#ifdef MAC + if ((cnp->cn_flags & NOMACCHECK) == 0) { + error = mac_vnode_check_readlink(td->td_ucred, + ndp->ni_vp); + if (error != 0) + break; + } +#endif + if (ndp->ni_pathlen > 1) + cp = uma_zalloc(namei_zone, M_WAITOK); + else + cp = cnp->cn_pnbuf; + aiov.iov_base = cp; + aiov.iov_len = MAXPATHLEN; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_resid = MAXPATHLEN; + error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); + if (error != 0) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + break; + } + linklen = MAXPATHLEN - auio.uio_resid; + if (linklen == 0) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + error = ENOENT; + break; + } + if (linklen + ndp->ni_pathlen > MAXPATHLEN) { + if (ndp->ni_pathlen > 1) + uma_zfree(namei_zone, cp); + error = ENAMETOOLONG; + break; + } + if (ndp->ni_pathlen > 1) { + bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); + uma_zfree(namei_zone, cnp->cn_pnbuf); + cnp->cn_pnbuf = cp; + } else + cnp->cn_pnbuf[linklen] = '\0'; + ndp->ni_pathlen += linklen; + vput(ndp->ni_vp); + dp = ndp->ni_dvp; + /* + * Check if root directory should replace current directory. + */ + cnp->cn_nameptr = cnp->cn_pnbuf; + if (*(cnp->cn_nameptr) == '/') { + vrele(dp); + error = namei_handle_root(ndp, &dp); + if (error != 0) + goto out; + } + } + vput(ndp->ni_vp); + ndp->ni_vp = NULL; + vrele(ndp->ni_dvp); +out: + vrele(ndp->ni_rootdir); + namei_cleanup_cnp(cnp); + nameicap_cleanup(ndp); + SDT_PROBE2(vfs, namei, lookup, return, error, NULL); + return (error); +} + +static int +compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags) +{ + + if (mp == NULL || ((lkflags & LK_SHARED) && + (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) || + ((cnflags & ISDOTDOT) && + (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) { + lkflags &= ~LK_SHARED; + lkflags |= LK_EXCLUSIVE; + } + lkflags |= LK_NODDLKTREAT; + return (lkflags); +} + +static __inline int +needs_exclusive_leaf(struct mount *mp, int flags) +{ + + /* + * Intermediate nodes can use shared locks, we only need to + * force an exclusive lock for leaf nodes. + */ + if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF)) + return (0); + + /* Always use exclusive locks if LOCKSHARED isn't set. 
*/ + if (!(flags & LOCKSHARED)) + return (1); + + /* + * For lookups during open(), if the mount point supports + * extended shared operations, then use a shared lock for the + * leaf node, otherwise use an exclusive lock. + */ + if ((flags & ISOPEN) != 0) + return (!MNT_EXTENDED_SHARED(mp)); + + /* + * Lookup requests outside of open() that specify LOCKSHARED + * only need a shared lock on the leaf vnode. + */ + return (0); +} + +/* + * Search a pathname. + * This is a very central and rather complicated routine. + * + * The pathname is pointed to by ni_ptr and is of length ni_pathlen. + * The starting directory is taken from ni_startdir. The pathname is + * descended until done, or a symbolic link is encountered. The variable + * ni_more is clear if the path is completed; it is set to one if a + * symbolic link needing interpretation is encountered. + * + * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on + * whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it, the parent directory is returned + * locked. If flag has WANTPARENT or'ed into it, the parent directory is + * returned unlocked. Otherwise the parent directory is not returned. If + * the target of the pathname exists and LOCKLEAF is or'ed into the flag + * the target is returned locked, otherwise it is returned unlocked. + * When creating or renaming and LOCKPARENT is specified, the target may not + * be ".". When deleting and LOCKPARENT is specified, the target may be ".". + * + * Overall outline of lookup: + * + * dirloop: + * identify next component of name at ndp->ni_ptr + * handle degenerate case where name is null string + * if .. and crossing mount points and on mounted filesys, find parent + * call VOP_LOOKUP routine for next component name + * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set + * component vnode returned in ni_vp (if it exists), locked. + * if result vnode is mounted on and crossing mount points, + * find mounted on vnode + * if more components of name, do next level at dirloop + * return the answer in ni_vp, locked if LOCKLEAF set + * if LOCKPARENT set, return locked parent in ni_dvp + * if WANTPARENT set, return unlocked parent in ni_dvp + */ +int +lookup(struct nameidata *ndp) +{ + char *cp; /* pointer into pathname argument */ + char *prev_ni_next; /* saved ndp->ni_next */ + struct vnode *dp = NULL; /* the directory we are searching */ + struct vnode *tdp; /* saved dp */ + struct mount *mp; /* mount table entry */ + struct prison *pr; + size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int error = 0; + int dpunlocked = 0; /* dp has already been unlocked */ + int relookup = 0; /* do not consume the path component */ + struct componentname *cnp = &ndp->ni_cnd; + int lkflags_save; + int ni_dvp_unlocked; + + /* + * Setup: break out flag bits into variables. 
+ */ + ni_dvp_unlocked = 0; + wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); + KASSERT(cnp->cn_nameiop == LOOKUP || wantparent, + ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT.")); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE && + cnp->cn_nameiop != LOOKUP)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + cnp->cn_flags &= ~ISSYMLINK; + ndp->ni_dvp = NULL; + /* + * We use shared locks until we hit the parent of the last cn then + * we adjust based on the requesting flags. + */ + cnp->cn_lkflags = LK_SHARED; + dp = ndp->ni_startdir; + ndp->ni_startdir = NULLVP; + vn_lock(dp, + compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, + cnp->cn_flags)); + +dirloop: + /* + * Search a new directory. + * + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ + for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) + continue; + cnp->cn_namelen = cp - cnp->cn_nameptr; + if (cnp->cn_namelen > NAME_MAX) { + error = ENAMETOOLONG; + goto bad; + } +#ifdef NAMEI_DIAGNOSTIC + { char c = *cp; + *cp = '\0'; + printf("{%s}: ", cnp->cn_nameptr); + *cp = c; } +#endif + prev_ni_pathlen = ndp->ni_pathlen; + ndp->ni_pathlen -= cnp->cn_namelen; + KASSERT(ndp->ni_pathlen <= PATH_MAX, + ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); + prev_ni_next = ndp->ni_next; + ndp->ni_next = cp; + + /* + * Replace multiple slashes by a single slash and trailing slashes + * by a null. This must be done before VOP_LOOKUP() because some + * fs's don't know about trailing slashes. Remember if there were + * trailing slashes to handle symlinks, existing non-directories + * and non-existing files that won't be directories specially later. + */ + while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { + cp++; + ndp->ni_pathlen--; + if (*cp == '\0') { + *ndp->ni_next = '\0'; + cnp->cn_flags |= TRAILINGSLASH; + } + } + ndp->ni_next = cp; + + cnp->cn_flags |= MAKEENTRY; + if (*cp == '\0' && docache == 0) + cnp->cn_flags &= ~MAKEENTRY; + if (cnp->cn_namelen == 2 && + cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') + cnp->cn_flags |= ISDOTDOT; + else + cnp->cn_flags &= ~ISDOTDOT; + if (*ndp->ni_next == 0) + cnp->cn_flags |= ISLASTCN; + else + cnp->cn_flags &= ~ISLASTCN; + + if ((cnp->cn_flags & ISLASTCN) != 0 && + cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EINVAL; + goto bad; + } + + nameicap_tracker_add(ndp, dp); + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". + */ + if (cnp->cn_nameptr[0] == '\0') { + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (cnp->cn_nameiop != LOOKUP) { + error = EISDIR; + goto bad; + } + if (wantparent) { + ndp->ni_dvp = dp; + VREF(dp); + } + ndp->ni_vp = dp; + + if (cnp->cn_flags & AUDITVNODE1) + AUDIT_ARG_VNODE1(dp); + else if (cnp->cn_flags & AUDITVNODE2) + AUDIT_ARG_VNODE2(dp); + + if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) + VOP_UNLOCK(dp, 0); + /* XXX This should probably move to the top of function. */ + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + goto success; + } + + /* + * Handle "..": five special cases. + * 0. 
If doing a capability lookup and lookup_cap_dotdot is + * disabled, return ENOTCAPABLE. + * 1. Return an error if this is the last component of + * the name and the operation is DELETE or RENAME. + * 2. If at root directory (e.g. after chroot) + * or at absolute root directory + * then ignore it so can't get out. + * 3. If this vnode is the root of a mounted + * filesystem, then replace it with the + * vnode which was mounted on so we take the + * .. in the other filesystem. + * 4. If the vnode is the top directory of + * the jail or chroot, don't let them out. + * 5. If doing a capability lookup and lookup_cap_dotdot is + * enabled, return ENOTCAPABLE if the lookup would escape + * from the initial file descriptor directory. Checks are + * done by ensuring that namei() already traversed the + * result of dotdot lookup. + */ + if (cnp->cn_flags & ISDOTDOT) { + if ((ndp->ni_lcf & (NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT)) + == NI_LCF_STRICTRELATIVE) { +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_CAPFAIL)) + ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); +#endif + error = ENOTCAPABLE; + goto bad; + } + if ((cnp->cn_flags & ISLASTCN) != 0 && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EINVAL; + goto bad; + } + for (;;) { + for (pr = cnp->cn_cred->cr_prison; pr != NULL; + pr = pr->pr_parent) + if (dp == pr->pr_root) + break; + if (dp == ndp->ni_rootdir || + dp == ndp->ni_topdir || + dp == rootvnode || + pr != NULL || + ((dp->v_vflag & VV_ROOT) != 0 && + (cnp->cn_flags & NOCROSSMOUNT) != 0)) { + ndp->ni_dvp = dp; + ndp->ni_vp = dp; + VREF(dp); + goto nextname; + } + if ((dp->v_vflag & VV_ROOT) == 0) + break; + if (dp->v_iflag & VI_DOOMED) { /* forced unmount */ + error = ENOENT; + goto bad; + } + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + VREF(dp); + vput(tdp); + vn_lock(dp, + compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | + LK_RETRY, ISDOTDOT)); + error = nameicap_check_dotdot(ndp, dp); + if (error != 0) { +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_CAPFAIL)) + ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); +#endif + goto bad; + } + } + } + + /* + * We now have a segment name to search for, and a directory to search. + */ +unionlookup: +#ifdef MAC + if ((cnp->cn_flags & NOMACCHECK) == 0) { + error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp, + cnp); + if (error) + goto bad; + } +#endif + ndp->ni_dvp = dp; + ndp->ni_vp = NULL; + ASSERT_VOP_LOCKED(dp, "lookup"); + /* + * If we have a shared lock we may need to upgrade the lock for the + * last operation. + */ + if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) && + dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED) + vn_lock(dp, LK_UPGRADE|LK_RETRY); + if ((dp->v_iflag & VI_DOOMED) != 0) { + error = ENOENT; + goto bad; + } + /* + * If we're looking up the last component and we need an exclusive + * lock, adjust our lkflags. 
+ */ + if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags)) + cnp->cn_lkflags = LK_EXCLUSIVE; +#ifdef NAMEI_DIAGNOSTIC + vn_printf(dp, "lookup in "); +#endif + lkflags_save = cnp->cn_lkflags; + cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags, + cnp->cn_flags); + error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp); + cnp->cn_lkflags = lkflags_save; + if (error != 0) { + KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); +#ifdef NAMEI_DIAGNOSTIC + printf("not found\n"); +#endif + if ((error == ENOENT) && + (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) && + (dp->v_mount->mnt_flag & MNT_UNION)) { + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + VREF(dp); + vput(tdp); + vn_lock(dp, + compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | + LK_RETRY, cnp->cn_flags)); + nameicap_tracker_add(ndp, dp); + goto unionlookup; + } + + if (error == ERELOOKUP) { + vref(dp); + ndp->ni_vp = dp; + error = 0; + relookup = 1; + goto good; + } + + if (error != EJUSTRETURN) + goto bad; + /* + * At this point, we know we're at the end of the + * pathname. If creating / renaming, we can consider + * allowing the file or directory to be created / renamed, + * provided we're not on a read-only filesystem. + */ + if (rdonly) { + error = EROFS; + goto bad; + } + /* trailing slash only allowed for directories */ + if ((cnp->cn_flags & TRAILINGSLASH) && + !(cnp->cn_flags & WILLBEDIR)) { + error = ENOENT; + goto bad; + } + if ((cnp->cn_flags & LOCKPARENT) == 0) + VOP_UNLOCK(dp, 0); + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory vnode in ndp->ni_dvp. + */ + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + goto success; + } + +good: +#ifdef NAMEI_DIAGNOSTIC + printf("found\n"); +#endif + dp = ndp->ni_vp; + + /* + * Check to see if the vnode has been mounted on; + * if so find the root of the mounted filesystem. + */ + while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + if (vfs_busy(mp, 0)) + continue; + vput(dp); + if (dp != ndp->ni_dvp) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + vrefact(vp_crossmp); + ndp->ni_dvp = vp_crossmp; + error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags, + cnp->cn_flags), &tdp); + vfs_unbusy(mp); + if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT)) + panic("vp_crossmp exclusively locked or reclaimed"); + if (error) { + dpunlocked = 1; + goto bad2; + } + ndp->ni_vp = dp = tdp; + } + + /* + * Check for symbolic link + */ + if ((dp->v_type == VLNK) && + ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) || + *ndp->ni_next == '/')) { + cnp->cn_flags |= ISSYMLINK; + if (dp->v_iflag & VI_DOOMED) { + /* + * We can't know whether the directory was mounted with + * NOSYMFOLLOW, so we can't follow safely. + */ + error = ENOENT; + goto bad2; + } + if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { + error = EACCES; + goto bad2; + } + /* + * Symlink code always expects an unlocked dvp. + */ + if (ndp->ni_dvp != ndp->ni_vp) { + VOP_UNLOCK(ndp->ni_dvp, 0); + ni_dvp_unlocked = 1; + } + goto success; + } + +nextname: + /* + * Not a symbolic link that we will follow. Continue with the + * next component if there is any; otherwise, we're done. 
+ */ + KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/', + ("lookup: invalid path state.")); + if (relookup) { + relookup = 0; + ndp->ni_pathlen = prev_ni_pathlen; + ndp->ni_next = prev_ni_next; + if (ndp->ni_dvp != dp) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + goto dirloop; + } + if (cnp->cn_flags & ISDOTDOT) { + error = nameicap_check_dotdot(ndp, ndp->ni_vp); + if (error != 0) { +#ifdef KTRACE + if (KTRPOINT(curthread, KTR_CAPFAIL)) + ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); +#endif + goto bad2; + } + } + if (*ndp->ni_next == '/') { + cnp->cn_nameptr = ndp->ni_next; + while (*cnp->cn_nameptr == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + if (ndp->ni_dvp != dp) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + goto dirloop; + } + /* + * If we're processing a path with a trailing slash, + * check that the end result is a directory. + */ + if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) { + error = ENOTDIR; + goto bad2; + } + /* + * Disallow directory write attempts on read-only filesystems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto bad2; + } + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + if (!wantparent) { + ni_dvp_unlocked = 2; + if (ndp->ni_dvp != dp) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + } else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) { + VOP_UNLOCK(ndp->ni_dvp, 0); + ni_dvp_unlocked = 1; + } + + if (cnp->cn_flags & AUDITVNODE1) + AUDIT_ARG_VNODE1(dp); + else if (cnp->cn_flags & AUDITVNODE2) + AUDIT_ARG_VNODE2(dp); + + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0); +success: + /* + * Because of shared lookup we may have the vnode shared locked, but + * the caller may want it to be exclusively locked. + */ + if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) && + VOP_ISLOCKED(dp) != LK_EXCLUSIVE) { + vn_lock(dp, LK_UPGRADE | LK_RETRY); + if (dp->v_iflag & VI_DOOMED) { + error = ENOENT; + goto bad2; + } + } + return (0); + +bad2: + if (ni_dvp_unlocked != 2) { + if (dp != ndp->ni_dvp && !ni_dvp_unlocked) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + } +bad: + if (!dpunlocked) + vput(dp); + ndp->ni_vp = NULL; + return (error); +} + +/* + * relookup - lookup a path name component + * Used by lookup to re-acquire things. + */ +int +relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) +{ + struct vnode *dp = NULL; /* the directory we are searching */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int error = 0; + + KASSERT(cnp->cn_flags & ISLASTCN, + ("relookup: Not given last component.")); + /* + * Setup: break out flag bits into variables. + */ + wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); + KASSERT(wantparent, ("relookup: parent not wanted.")); + rdonly = cnp->cn_flags & RDONLY; + cnp->cn_flags &= ~ISSYMLINK; + dp = dvp; + cnp->cn_lkflags = LK_EXCLUSIVE; + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY); + + /* + * Search a new directory. + * + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ +#ifdef NAMEI_DIAGNOSTIC + printf("{%s}: ", cnp->cn_nameptr); +#endif + + /* + * Check for "" which represents the root directory after slash + * removal. 
+ */ + if (cnp->cn_nameptr[0] == '\0') { + /* + * Support only LOOKUP for "/" because lookup() + * can't succeed for CREATE, DELETE and RENAME. + */ + KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP")); + KASSERT(dp->v_type == VDIR, ("dp is not a directory")); + + if (!(cnp->cn_flags & LOCKLEAF)) + VOP_UNLOCK(dp, 0); + *vpp = dp; + /* XXX This should probably move to the top of function. */ + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + if (cnp->cn_flags & ISDOTDOT) + panic ("relookup: lookup on dot-dot"); + + /* + * We now have a segment name to search for, and a directory to search. + */ +#ifdef NAMEI_DIAGNOSTIC + vn_printf(dp, "search in "); +#endif + if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) { + KASSERT(*vpp == NULL, ("leaf should be empty")); + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. + */ + if (rdonly) { + error = EROFS; + goto bad; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + if ((cnp->cn_flags & LOCKPARENT) == 0) + VOP_UNLOCK(dp, 0); + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory vnode in ndp->ni_dvp. + */ + return (0); + } + + dp = *vpp; + + /* + * Disallow directory write attempts on read-only filesystems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + if (dvp == dp) + vrele(dvp); + else + vput(dvp); + error = EROFS; + goto bad; + } + /* + * Set the parent lock/ref state to the requested state. + */ + if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) { + if (wantparent) + VOP_UNLOCK(dvp, 0); + else + vput(dvp); + } else if (!wantparent) + vrele(dvp); + /* + * Check for symbolic link + */ + KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), + ("relookup: symlink found.\n")); + + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0); + return (0); +bad: + vput(dp); + *vpp = NULL; + return (error); +} + +void +NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, enum uio_seg segflg, + const char *namep, int dirfd, struct vnode *startdir, cap_rights_t *rightsp, + struct thread *td) +{ + + ndp->ni_cnd.cn_nameiop = op; + ndp->ni_cnd.cn_flags = flags; + ndp->ni_segflg = segflg; + ndp->ni_dirp = namep; + ndp->ni_dirfd = dirfd; + ndp->ni_startdir = startdir; + ndp->ni_resflags = 0; + if (rightsp != NULL) + ndp->ni_rightsneeded = *rightsp; + else + cap_rights_init(&ndp->ni_rightsneeded); + filecaps_init(&ndp->ni_filecaps); + ndp->ni_cnd.cn_thread = td; +} + +/* + * Free data allocated by namei(); see namei(9) for details. 
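+ *
+ * A minimal caller sketch (hypothetical, not part of the imported
+ * sources) showing the usual pairing of namei() with NDFREE():
+ *
+ *      struct nameidata nd;
+ *
+ *      NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td);
+ *      error = namei(&nd);
+ *      if (error != 0)
+ *              return (error);
+ *      ... operate on nd.ni_vp, which is referenced and locked ...
+ *      NDFREE(&nd, NDF_ONLY_PNBUF);
+ *      vput(nd.ni_vp);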
+ */ +void +NDFREE(struct nameidata *ndp, const u_int flags) +{ + int unlock_dvp; + int unlock_vp; + + unlock_dvp = 0; + unlock_vp = 0; + + if (!(flags & NDF_NO_FREE_PNBUF) && + (ndp->ni_cnd.cn_flags & HASBUF)) { + uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); + ndp->ni_cnd.cn_flags &= ~HASBUF; + } + if (!(flags & NDF_NO_VP_UNLOCK) && + (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) + unlock_vp = 1; + if (!(flags & NDF_NO_DVP_UNLOCK) && + (ndp->ni_cnd.cn_flags & LOCKPARENT) && + ndp->ni_dvp != ndp->ni_vp) + unlock_dvp = 1; + if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) { + if (unlock_vp) { + vput(ndp->ni_vp); + unlock_vp = 0; + } else + vrele(ndp->ni_vp); + ndp->ni_vp = NULL; + } + if (unlock_vp) + VOP_UNLOCK(ndp->ni_vp, 0); + if (!(flags & NDF_NO_DVP_RELE) && + (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { + if (unlock_dvp) { + vput(ndp->ni_dvp); + unlock_dvp = 0; + } else + vrele(ndp->ni_dvp); + ndp->ni_dvp = NULL; + } + if (unlock_dvp) + VOP_UNLOCK(ndp->ni_dvp, 0); + if (!(flags & NDF_NO_STARTDIR_RELE) && + (ndp->ni_cnd.cn_flags & SAVESTART)) { + vrele(ndp->ni_startdir); + ndp->ni_startdir = NULL; + } +} + +/* + * Determine if there is a suitable alternate filename under the specified + * prefix for the specified path. If the create flag is set, then the + * alternate prefix will be used so long as the parent directory exists. + * This is used by the various compatibility ABIs so that Linux binaries prefer + * files under /compat/linux for example. The chosen path (whether under + * the prefix or under /) is returned in a kernel malloc'd buffer pointed + * to by pathbuf. The caller is responsible for free'ing the buffer from + * the M_TEMP bucket if one is returned. + */ +int +kern_alternate_path(struct thread *td, const char *prefix, const char *path, + enum uio_seg pathseg, char **pathbuf, int create, int dirfd) +{ + struct nameidata nd, ndroot; + char *ptr, *buf, *cp; + size_t len, sz; + int error; + + buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + *pathbuf = buf; + + /* Copy the prefix into the new pathname as a starting point. */ + len = strlcpy(buf, prefix, MAXPATHLEN); + if (len >= MAXPATHLEN) { + *pathbuf = NULL; + free(buf, M_TEMP); + return (EINVAL); + } + sz = MAXPATHLEN - len; + ptr = buf + len; + + /* Append the filename to the prefix. */ + if (pathseg == UIO_SYSSPACE) + error = copystr(path, ptr, sz, &len); + else + error = copyinstr(path, ptr, sz, &len); + + if (error) { + *pathbuf = NULL; + free(buf, M_TEMP); + return (error); + } + + /* Only use a prefix with absolute pathnames. */ + if (*ptr != '/') { + error = EINVAL; + goto keeporig; + } + + if (dirfd != AT_FDCWD) { + /* + * We want the original because the "prefix" is + * included in the already opened dirfd. + */ + bcopy(ptr, buf, len); + return (0); + } + + /* + * We know that there is a / somewhere in this pathname. + * Search backwards for it, to find the file's parent dir + * to see if it exists in the alternate tree. If it does, + * and we want to create a file (cflag is set). We don't + * need to worry about the root comparison in this case. + */ + + if (create) { + for (cp = &ptr[len] - 1; *cp != '/'; cp--); + *cp = '\0'; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td); + error = namei(&nd); + *cp = '/'; + if (error != 0) + goto keeporig; + } else { + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td); + + error = namei(&nd); + if (error != 0) + goto keeporig; + + /* + * We now compare the vnode of the prefix to the one + * vnode asked. 
If they resolve to be the same, then we + * ignore the match so that the real root gets used. + * This avoids the problem of traversing "../.." to find the + * root directory and never finding it, because "/" resolves + * to the emulation root directory. This is expensive :-( + */ + NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix, + td); + + /* We shouldn't ever get an error from this namei(). */ + error = namei(&ndroot); + if (error == 0) { + if (nd.ni_vp == ndroot.ni_vp) + error = ENOENT; + + NDFREE(&ndroot, NDF_ONLY_PNBUF); + vrele(ndroot.ni_vp); + } + } + + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(nd.ni_vp); + +keeporig: + /* If there was an error, use the original path name. */ + if (error) + bcopy(ptr, buf, len); + return (error); +} diff --git a/freebsd/sys/kern/vfs_mount.c b/freebsd/sys/kern/vfs_mount.c new file mode 100644 index 00000000..3610763f --- /dev/null +++ b/freebsd/sys/kern/vfs_mount.c @@ -0,0 +1,2052 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1999-2004 Poul-Henning Kamp + * Copyright (c) 1999 Michael Smith + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +#define VFS_MOUNTARG_SIZE_MAX (1024 * 64) + +static int vfs_domount(struct thread *td, const char *fstype, char *fspath, + uint64_t fsflags, struct vfsoptlist **optlist); +static void free_mntarg(struct mntarg *ma); + +static int usermount = 0; +SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, + "Unprivileged users may mount and unmount file systems"); + +static bool default_autoro = false; +SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0, + "Retry failed r/w mount as r/o if no explicit ro/rw option is specified"); + +MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); +MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure"); +static uma_zone_t mount_zone; + +/* List of mounted filesystems. */ +struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); + +/* For any iteration/modification of mountlist */ +struct mtx mountlist_mtx; +MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); + +EVENTHANDLER_LIST_DEFINE(vfs_mounted); +EVENTHANDLER_LIST_DEFINE(vfs_unmounted); + +/* + * Global opts, taken by all filesystems + */ +static const char *global_opts[] = { + "errmsg", + "fstype", + "fspath", + "ro", + "rw", + "nosuid", + "noexec", + NULL +}; + +static int +mount_init(void *mem, int size, int flags) +{ + struct mount *mp; + + mp = (struct mount *)mem; + mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); + mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF); + lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); + return (0); +} + +static void +mount_fini(void *mem, int size) +{ + struct mount *mp; + + mp = (struct mount *)mem; + lockdestroy(&mp->mnt_explock); + mtx_destroy(&mp->mnt_listmtx); + mtx_destroy(&mp->mnt_mtx); +} + +static void +vfs_mount_init(void *dummy __unused) +{ + + mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, + NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); +} +SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL); + +/* + * --------------------------------------------------------------------- + * Functions for building and sanitizing the mount options + */ + +/* Remove one mount option. */ +static void +vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) +{ + + TAILQ_REMOVE(opts, opt, link); + free(opt->name, M_MOUNT); + if (opt->value != NULL) + free(opt->value, M_MOUNT); + free(opt, M_MOUNT); +} + +/* Release all resources related to the mount options. 
*/ +void +vfs_freeopts(struct vfsoptlist *opts) +{ + struct vfsopt *opt; + + while (!TAILQ_EMPTY(opts)) { + opt = TAILQ_FIRST(opts); + vfs_freeopt(opts, opt); + } + free(opts, M_MOUNT); +} + +void +vfs_deleteopt(struct vfsoptlist *opts, const char *name) +{ + struct vfsopt *opt, *temp; + + if (opts == NULL) + return; + TAILQ_FOREACH_SAFE(opt, opts, link, temp) { + if (strcmp(opt->name, name) == 0) + vfs_freeopt(opts, opt); + } +} + +static int +vfs_isopt_ro(const char *opt) +{ + + if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 || + strcmp(opt, "norw") == 0) + return (1); + return (0); +} + +static int +vfs_isopt_rw(const char *opt) +{ + + if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0) + return (1); + return (0); +} + +/* + * Check if options are equal (with or without the "no" prefix). + */ +static int +vfs_equalopts(const char *opt1, const char *opt2) +{ + char *p; + + /* "opt" vs. "opt" or "noopt" vs. "noopt" */ + if (strcmp(opt1, opt2) == 0) + return (1); + /* "noopt" vs. "opt" */ + if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) + return (1); + /* "opt" vs. "noopt" */ + if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) + return (1); + while ((p = strchr(opt1, '.')) != NULL && + !strncmp(opt1, opt2, ++p - opt1)) { + opt2 += p - opt1; + opt1 = p; + /* "foo.noopt" vs. "foo.opt" */ + if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) + return (1); + /* "foo.opt" vs. "foo.noopt" */ + if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) + return (1); + } + /* "ro" / "rdonly" / "norw" / "rw" / "noro" */ + if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) && + (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2))) + return (1); + return (0); +} + +/* + * If a mount option is specified several times, + * (with or without the "no" prefix) only keep + * the last occurrence of it. + */ +static void +vfs_sanitizeopts(struct vfsoptlist *opts) +{ + struct vfsopt *opt, *opt2, *tmp; + + TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { + opt2 = TAILQ_PREV(opt, vfsoptlist, link); + while (opt2 != NULL) { + if (vfs_equalopts(opt->name, opt2->name)) { + tmp = TAILQ_PREV(opt2, vfsoptlist, link); + vfs_freeopt(opts, opt2); + opt2 = tmp; + } else { + opt2 = TAILQ_PREV(opt2, vfsoptlist, link); + } + } + } +} + +/* + * Build a linked list of mount options from a struct uio. + */ +int +vfs_buildopts(struct uio *auio, struct vfsoptlist **options) +{ + struct vfsoptlist *opts; + struct vfsopt *opt; + size_t memused, namelen, optlen; + unsigned int i, iovcnt; + int error; + + opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); + TAILQ_INIT(opts); + memused = 0; + iovcnt = auio->uio_iovcnt; + for (i = 0; i < iovcnt; i += 2) { + namelen = auio->uio_iov[i].iov_len; + optlen = auio->uio_iov[i + 1].iov_len; + memused += sizeof(struct vfsopt) + optlen + namelen; + /* + * Avoid consuming too much memory, and attempts to overflow + * memused. + */ + if (memused > VFS_MOUNTARG_SIZE_MAX || + optlen > VFS_MOUNTARG_SIZE_MAX || + namelen > VFS_MOUNTARG_SIZE_MAX) { + error = EINVAL; + goto bad; + } + + opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); + opt->name = malloc(namelen, M_MOUNT, M_WAITOK); + opt->value = NULL; + opt->len = 0; + opt->pos = i / 2; + opt->seen = 0; + + /* + * Do this early, so jumps to "bad" will free the current + * option. 
+ */ + TAILQ_INSERT_TAIL(opts, opt, link); + + if (auio->uio_segflg == UIO_SYSSPACE) { + bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); + } else { + error = copyin(auio->uio_iov[i].iov_base, opt->name, + namelen); + if (error) + goto bad; + } + /* Ensure names are null-terminated strings. */ + if (namelen == 0 || opt->name[namelen - 1] != '\0') { + error = EINVAL; + goto bad; + } + if (optlen != 0) { + opt->len = optlen; + opt->value = malloc(optlen, M_MOUNT, M_WAITOK); + if (auio->uio_segflg == UIO_SYSSPACE) { + bcopy(auio->uio_iov[i + 1].iov_base, opt->value, + optlen); + } else { + error = copyin(auio->uio_iov[i + 1].iov_base, + opt->value, optlen); + if (error) + goto bad; + } + } + } + vfs_sanitizeopts(opts); + *options = opts; + return (0); +bad: + vfs_freeopts(opts); + return (error); +} + +/* + * Merge the old mount options with the new ones passed + * in the MNT_UPDATE case. + * + * XXX: This function will keep a "nofoo" option in the new + * options. E.g, if the option's canonical name is "foo", + * "nofoo" ends up in the mount point's active options. + */ +static void +vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts) +{ + struct vfsopt *opt, *new; + + TAILQ_FOREACH(opt, oldopts, link) { + new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); + new->name = strdup(opt->name, M_MOUNT); + if (opt->len != 0) { + new->value = malloc(opt->len, M_MOUNT, M_WAITOK); + bcopy(opt->value, new->value, opt->len); + } else + new->value = NULL; + new->len = opt->len; + new->seen = opt->seen; + TAILQ_INSERT_HEAD(toopts, new, link); + } + vfs_sanitizeopts(toopts); +} + +/* + * Mount a filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct nmount_args { + struct iovec *iovp; + unsigned int iovcnt; + int flags; +}; +#endif +int +sys_nmount(struct thread *td, struct nmount_args *uap) +{ + struct uio *auio; + int error; + u_int iovcnt; + uint64_t flags; + + /* + * Mount flags are now 64-bits. On 32-bit archtectures only + * 32-bits are passed in, but from here on everything handles + * 64-bit flags correctly. + */ + flags = uap->flags; + + AUDIT_ARG_FFLAGS(flags); + CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__, + uap->iovp, uap->iovcnt, flags); + + /* + * Filter out MNT_ROOTFS. We do not want clients of nmount() in + * userspace to set this flag, but we must filter it out if we want + * MNT_UPDATE on the root file system to work. + * MNT_ROOTFS should only be set by the kernel when mounting its + * root file system. + */ + flags &= ~MNT_ROOTFS; + + iovcnt = uap->iovcnt; + /* + * Check that we have an even number of iovec's + * and that we have at least two options. + */ + if ((iovcnt & 1) || (iovcnt < 4)) { + CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__, + uap->iovcnt); + return (EINVAL); + } + + error = copyinuio(uap->iovp, iovcnt, &auio); + if (error) { + CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno", + __func__, error); + return (error); + } + error = vfs_donmount(td, flags, auio); + + free(auio, M_IOV); + return (error); +} + +/* + * --------------------------------------------------------------------- + * Various utility functions + */ + +void +vfs_ref(struct mount *mp) +{ + + CTR2(KTR_VFS, "%s: mp %p", __func__, mp); + MNT_ILOCK(mp); + MNT_REF(mp); + MNT_IUNLOCK(mp); +} + +void +vfs_rel(struct mount *mp) +{ + + CTR2(KTR_VFS, "%s: mp %p", __func__, mp); + MNT_ILOCK(mp); + MNT_REL(mp); + MNT_IUNLOCK(mp); +} + +/* + * Allocate and initialize the mount point struct. 
+ */ +struct mount * +vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, + struct ucred *cred) +{ + struct mount *mp; + + mp = uma_zalloc(mount_zone, M_WAITOK); + bzero(&mp->mnt_startzero, + __rangeof(struct mount, mnt_startzero, mnt_endzero)); + TAILQ_INIT(&mp->mnt_nvnodelist); + mp->mnt_nvnodelistsize = 0; + TAILQ_INIT(&mp->mnt_activevnodelist); + mp->mnt_activevnodelistsize = 0; + TAILQ_INIT(&mp->mnt_tmpfreevnodelist); + mp->mnt_tmpfreevnodelistsize = 0; + mp->mnt_ref = 0; + (void) vfs_busy(mp, MBF_NOWAIT); + atomic_add_acq_int(&vfsp->vfc_refcount, 1); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_gen++; + strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_cred = crdup(cred); + mp->mnt_stat.f_owner = cred->cr_uid; + strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); + mp->mnt_iosize_max = DFLTPHYS; +#ifdef MAC + mac_mount_init(mp); + mac_mount_create(cred, mp); +#endif + arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); + TAILQ_INIT(&mp->mnt_uppers); + return (mp); +} + +/* + * Destroy the mount struct previously allocated by vfs_mount_alloc(). + */ +void +vfs_mount_destroy(struct mount *mp) +{ + + MNT_ILOCK(mp); + mp->mnt_kern_flag |= MNTK_REFEXPIRE; + if (mp->mnt_kern_flag & MNTK_MWAIT) { + mp->mnt_kern_flag &= ~MNTK_MWAIT; + wakeup(mp); + } + while (mp->mnt_ref) + msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); + KASSERT(mp->mnt_ref == 0, + ("%s: invalid refcount in the drain path @ %s:%d", __func__, + __FILE__, __LINE__)); + if (mp->mnt_writeopcount != 0) + panic("vfs_mount_destroy: nonzero writeopcount"); + if (mp->mnt_secondary_writes != 0) + panic("vfs_mount_destroy: nonzero secondary_writes"); + atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); + if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { + struct vnode *vp; + + TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) + vn_printf(vp, "dangling vnode "); + panic("unmount: dangling vnode"); + } + KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); + if (mp->mnt_nvnodelistsize != 0) + panic("vfs_mount_destroy: nonzero nvnodelistsize"); + if (mp->mnt_activevnodelistsize != 0) + panic("vfs_mount_destroy: nonzero activevnodelistsize"); + if (mp->mnt_tmpfreevnodelistsize != 0) + panic("vfs_mount_destroy: nonzero tmpfreevnodelistsize"); + if (mp->mnt_lockref != 0) + panic("vfs_mount_destroy: nonzero lock refcount"); + MNT_IUNLOCK(mp); + if (mp->mnt_vnodecovered != NULL) + vrele(mp->mnt_vnodecovered); +#ifdef MAC + mac_mount_destroy(mp); +#endif + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + crfree(mp->mnt_cred); + uma_zfree(mount_zone, mp); +} + +static bool +vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error) +{ + /* This is an upgrade of an exisiting mount. */ + if ((fsflags & MNT_UPDATE) != 0) + return (false); + /* This is already an R/O mount. */ + if ((fsflags & MNT_RDONLY) != 0) + return (false); + + switch (error) { + case ENODEV: /* generic, geom, ... */ + case EACCES: /* cam/scsi, ... */ + case EROFS: /* md, mmcsd, ... */ + /* + * These errors can be returned by the storage layer to signal + * that the media is read-only. No harm in the R/O mount + * attempt if the error was returned for some other reason. 
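+ *
+ * Illustrative sketch (hypothetical device and options, not part of
+ * the imported sources): when the request carries the "autoro"
+ * option, or vfs.default_autoro is set, e.g.
+ *
+ *      mount -o autoro /dev/da0s1a /mnt
+ *
+ * a read/write attempt that fails with one of the errors above is
+ * retried once by vfs_donmount() with MNT_RDONLY added.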
+ */ + return (true); + default: + return (false); + } +} + +int +vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) +{ + struct vfsoptlist *optlist; + struct vfsopt *opt, *tmp_opt; + char *fstype, *fspath, *errmsg; + int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; + bool autoro; + + errmsg = fspath = NULL; + errmsg_len = fspathlen = 0; + errmsg_pos = -1; + autoro = default_autoro; + + error = vfs_buildopts(fsoptions, &optlist); + if (error) + return (error); + + if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) + errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); + + /* + * We need these two options before the others, + * and they are mandatory for any filesystem. + * Ensure they are NUL terminated as well. + */ + fstypelen = 0; + error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); + if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') { + error = EINVAL; + if (errmsg != NULL) + strncpy(errmsg, "Invalid fstype", errmsg_len); + goto bail; + } + fspathlen = 0; + error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); + if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') { + error = EINVAL; + if (errmsg != NULL) + strncpy(errmsg, "Invalid fspath", errmsg_len); + goto bail; + } + + /* + * We need to see if we have the "update" option + * before we call vfs_domount(), since vfs_domount() has special + * logic based on MNT_UPDATE. This is very important + * when we want to update the root filesystem. + */ + TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { + if (strcmp(opt->name, "update") == 0) { + fsflags |= MNT_UPDATE; + vfs_freeopt(optlist, opt); + } + else if (strcmp(opt->name, "async") == 0) + fsflags |= MNT_ASYNC; + else if (strcmp(opt->name, "force") == 0) { + fsflags |= MNT_FORCE; + vfs_freeopt(optlist, opt); + } + else if (strcmp(opt->name, "reload") == 0) { + fsflags |= MNT_RELOAD; + vfs_freeopt(optlist, opt); + } + else if (strcmp(opt->name, "multilabel") == 0) + fsflags |= MNT_MULTILABEL; + else if (strcmp(opt->name, "noasync") == 0) + fsflags &= ~MNT_ASYNC; + else if (strcmp(opt->name, "noatime") == 0) + fsflags |= MNT_NOATIME; + else if (strcmp(opt->name, "atime") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("nonoatime", M_MOUNT); + } + else if (strcmp(opt->name, "noclusterr") == 0) + fsflags |= MNT_NOCLUSTERR; + else if (strcmp(opt->name, "clusterr") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("nonoclusterr", M_MOUNT); + } + else if (strcmp(opt->name, "noclusterw") == 0) + fsflags |= MNT_NOCLUSTERW; + else if (strcmp(opt->name, "clusterw") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("nonoclusterw", M_MOUNT); + } + else if (strcmp(opt->name, "noexec") == 0) + fsflags |= MNT_NOEXEC; + else if (strcmp(opt->name, "exec") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("nonoexec", M_MOUNT); + } + else if (strcmp(opt->name, "nosuid") == 0) + fsflags |= MNT_NOSUID; + else if (strcmp(opt->name, "suid") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("nonosuid", M_MOUNT); + } + else if (strcmp(opt->name, "nosymfollow") == 0) + fsflags |= MNT_NOSYMFOLLOW; + else if (strcmp(opt->name, "symfollow") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("nonosymfollow", M_MOUNT); + } + else if (strcmp(opt->name, "noro") == 0) { + fsflags &= ~MNT_RDONLY; + autoro = false; + } + else if (strcmp(opt->name, "rw") == 0) { + fsflags &= ~MNT_RDONLY; + autoro = false; + } + else if (strcmp(opt->name, "ro") == 0) { + fsflags |= MNT_RDONLY; + 
autoro = false; + } + else if (strcmp(opt->name, "rdonly") == 0) { + free(opt->name, M_MOUNT); + opt->name = strdup("ro", M_MOUNT); + fsflags |= MNT_RDONLY; + autoro = false; + } + else if (strcmp(opt->name, "autoro") == 0) { + vfs_freeopt(optlist, opt); + autoro = true; + } + else if (strcmp(opt->name, "suiddir") == 0) + fsflags |= MNT_SUIDDIR; + else if (strcmp(opt->name, "sync") == 0) + fsflags |= MNT_SYNCHRONOUS; + else if (strcmp(opt->name, "union") == 0) + fsflags |= MNT_UNION; + else if (strcmp(opt->name, "automounted") == 0) { + fsflags |= MNT_AUTOMOUNTED; + vfs_freeopt(optlist, opt); + } + } + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) { + error = ENAMETOOLONG; + goto bail; + } + + error = vfs_domount(td, fstype, fspath, fsflags, &optlist); + + /* + * See if we can mount in the read-only mode if the error code suggests + * that it could be possible and the mount options allow for that. + * Never try it if "[no]{ro|rw}" has been explicitly requested and not + * overridden by "autoro". + */ + if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) { + printf("%s: R/W mount failed, possibly R/O media," + " trying R/O mount\n", __func__); + fsflags |= MNT_RDONLY; + error = vfs_domount(td, fstype, fspath, fsflags, &optlist); + } +bail: + /* copyout the errmsg */ + if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) + && errmsg_len > 0 && errmsg != NULL) { + if (fsoptions->uio_segflg == UIO_SYSSPACE) { + bcopy(errmsg, + fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, + fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); + } else { + copyout(errmsg, + fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, + fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); + } + } + + if (optlist != NULL) + vfs_freeopts(optlist); + return (error); +} + +/* + * Old mount API. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +sys_mount(struct thread *td, struct mount_args *uap) +{ + char *fstype; + struct vfsconf *vfsp = NULL; + struct mntarg *ma = NULL; + uint64_t flags; + int error; + + /* + * Mount flags are now 64-bits. On 32-bit architectures only + * 32-bits are passed in, but from here on everything handles + * 64-bit flags correctly. + */ + flags = uap->flags; + + AUDIT_ARG_FFLAGS(flags); + + /* + * Filter out MNT_ROOTFS. We do not want clients of mount() in + * userspace to set this flag, but we must filter it out if we want + * MNT_UPDATE on the root file system to work. + * MNT_ROOTFS should only be set by the kernel when mounting its + * root file system. 
+ */ + flags &= ~MNT_ROOTFS; + + fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); + error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); + if (error) { + free(fstype, M_TEMP); + return (error); + } + + AUDIT_ARG_TEXT(fstype); + vfsp = vfs_byname_kld(fstype, td, &error); + free(fstype, M_TEMP); + if (vfsp == NULL) + return (ENOENT); + if (vfsp->vfc_vfsops->vfs_cmount == NULL) + return (EOPNOTSUPP); + + ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN); + ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); + ma = mount_argb(ma, flags & MNT_RDONLY, "noro"); + ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid"); + ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec"); + + error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags); + return (error); +} + +/* + * vfs_domount_first(): first file system mount (not update) + */ +static int +vfs_domount_first( + struct thread *td, /* Calling thread. */ + struct vfsconf *vfsp, /* File system type. */ + char *fspath, /* Mount path. */ + struct vnode *vp, /* Vnode to be covered. */ + uint64_t fsflags, /* Flags common to all filesystems. */ + struct vfsoptlist **optlist /* Options local to the filesystem. */ + ) +{ + struct vattr va; + struct mount *mp; + struct vnode *newdp; + int error, error1; + + ASSERT_VOP_ELOCKED(vp, __func__); + KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here")); + + /* + * If the jail of the calling thread lacks permission for this type of + * file system, deny immediately. + */ + if (jailed(td->td_ucred) && !prison_allow(td->td_ucred, + vfsp->vfc_prison_flag)) { + vput(vp); + return (EPERM); + } + + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. + */ + error = VOP_GETATTR(vp, &va, td->td_ucred); + if (error == 0 && va.va_uid != td->td_ucred->cr_uid) + error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0); + if (error == 0) + error = vinvalbuf(vp, V_SAVE, 0, 0); + if (error == 0 && vp->v_type != VDIR) + error = ENOTDIR; + if (error == 0) { + VI_LOCK(vp); + if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) + vp->v_iflag |= VI_MOUNT; + else + error = EBUSY; + VI_UNLOCK(vp); + } + if (error != 0) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0); + + /* Allocate and initialize the filesystem. */ + mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); + /* XXXMAC: pass to vfs_mount_alloc? */ + mp->mnt_optnew = *optlist; + /* Set the mount level flags. */ + mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY)); + + /* + * Mount the filesystem. + * XXX The final recipients of VFS_MOUNT just overwrite the ndp they + * get. No freeing of cn_pnbuf. + */ + error1 = 0; + if ((error = VFS_MOUNT(mp)) != 0 || + (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 || + (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) { + if (error1 != 0) { + error = error1; + if ((error1 = VFS_UNMOUNT(mp, 0)) != 0) + printf("VFS_UNMOUNT returned %d\n", error1); + } + vfs_unbusy(mp); + mp->mnt_vnodecovered = NULL; + vfs_mount_destroy(mp); + VI_LOCK(vp); + vp->v_iflag &= ~VI_MOUNT; + VI_UNLOCK(vp); + vrele(vp); + return (error); + } + VOP_UNLOCK(newdp, 0); + + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + mp->mnt_opt = mp->mnt_optnew; + *optlist = NULL; + + /* + * Prevent external consumers of mount options from reading mnt_optnew. 
+ */ + mp->mnt_optnew = NULL; + + MNT_ILOCK(mp); + if ((mp->mnt_flag & MNT_ASYNC) != 0 && + (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) + mp->mnt_kern_flag |= MNTK_ASYNC; + else + mp->mnt_kern_flag &= ~MNTK_ASYNC; + MNT_IUNLOCK(mp); + + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + cache_purge(vp); + VI_LOCK(vp); + vp->v_iflag &= ~VI_MOUNT; + VI_UNLOCK(vp); + vp->v_mountedhere = mp; + /* Place the new filesystem at the end of the mount list. */ + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + vfs_event_signal(NULL, VQ_MOUNT, 0); + vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY); + VOP_UNLOCK(vp, 0); + EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td); + VOP_UNLOCK(newdp, 0); + mountcheckdirs(vp, newdp); + vrele(newdp); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + vfs_allocate_syncvnode(mp); + vfs_unbusy(mp); + return (0); +} + +/* + * vfs_domount_update(): update of mounted file system + */ +static int +vfs_domount_update( + struct thread *td, /* Calling thread. */ + struct vnode *vp, /* Mount point vnode. */ + uint64_t fsflags, /* Flags common to all filesystems. */ + struct vfsoptlist **optlist /* Options local to the filesystem. */ + ) +{ + struct export_args export; + void *bufp; + struct mount *mp; + int error, export_error, len; + uint64_t flag; + + ASSERT_VOP_ELOCKED(vp, __func__); + KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here")); + mp = vp->v_mount; + + if ((vp->v_vflag & VV_ROOT) == 0) { + if (vfs_copyopt(*optlist, "export", &export, sizeof(export)) + == 0) + error = EXDEV; + else + error = EINVAL; + vput(vp); + return (error); + } + + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + flag = mp->mnt_flag; + if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + /* + * Only privileged root, or (if MNT_USER is set) the user that + * did the original mount is permitted to update it. + */ + error = vfs_suser(mp, td); + if (error != 0) { + vput(vp); + return (error); + } + if (vfs_busy(mp, MBF_NOWAIT)) { + vput(vp); + return (EBUSY); + } + VI_LOCK(vp); + if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { + VI_UNLOCK(vp); + vfs_unbusy(mp); + vput(vp); + return (EBUSY); + } + vp->v_iflag |= VI_MOUNT; + VI_UNLOCK(vp); + VOP_UNLOCK(vp, 0); + + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { + MNT_IUNLOCK(mp); + error = EBUSY; + goto end; + } + mp->mnt_flag &= ~MNT_UPDATEMASK; + mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | + MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY); + if ((mp->mnt_flag & MNT_ASYNC) == 0) + mp->mnt_kern_flag &= ~MNTK_ASYNC; + MNT_IUNLOCK(mp); + mp->mnt_optnew = *optlist; + vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); + + /* + * Mount the filesystem. + * XXX The final recipients of VFS_MOUNT just overwrite the ndp they + * get. No freeing of cn_pnbuf. + */ + error = VFS_MOUNT(mp); + + export_error = 0; + /* Process the export option. */ + if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp, + &len) == 0) { + /* Assume that there is only 1 ABI for each length. 
*/ + switch (len) { + case (sizeof(struct oexport_args)): + bzero(&export, sizeof(export)); + /* FALLTHROUGH */ + case (sizeof(export)): + bcopy(bufp, &export, len); + export_error = vfs_export(mp, &export); + break; + default: + export_error = EINVAL; + break; + } + } + + MNT_ILOCK(mp); + if (error == 0) { + mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | + MNT_SNAPSHOT); + } else { + /* + * If we fail, restore old mount flags. MNT_QUOTA is special, + * because it is not part of MNT_UPDATEMASK, but it could have + * changed in the meantime if quotactl(2) was called. + * All in all we want current value of MNT_QUOTA, not the old + * one. + */ + mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); + } + if ((mp->mnt_flag & MNT_ASYNC) != 0 && + (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) + mp->mnt_kern_flag |= MNTK_ASYNC; + else + mp->mnt_kern_flag &= ~MNTK_ASYNC; + MNT_IUNLOCK(mp); + + if (error != 0) + goto end; + + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + mp->mnt_opt = mp->mnt_optnew; + *optlist = NULL; + (void)VFS_STATFS(mp, &mp->mnt_stat); + /* + * Prevent external consumers of mount options from reading + * mnt_optnew. + */ + mp->mnt_optnew = NULL; + + if ((mp->mnt_flag & MNT_RDONLY) == 0) + vfs_allocate_syncvnode(mp); + else + vfs_deallocate_syncvnode(mp); +end: + vfs_unbusy(mp); + VI_LOCK(vp); + vp->v_iflag &= ~VI_MOUNT; + VI_UNLOCK(vp); + vrele(vp); + return (error != 0 ? error : export_error); +} + +/* + * vfs_domount(): actually attempt a filesystem mount. + */ +static int +vfs_domount( + struct thread *td, /* Calling thread. */ + const char *fstype, /* Filesystem type. */ + char *fspath, /* Mount path. */ + uint64_t fsflags, /* Flags common to all filesystems. */ + struct vfsoptlist **optlist /* Options local to the filesystem. */ + ) +{ + struct vfsconf *vfsp; + struct nameidata nd; + struct vnode *vp; + char *pathbuf; + int error; + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) + return (ENAMETOOLONG); + + if (jailed(td->td_ucred) || usermount == 0) { + if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) + return (error); + } + + /* + * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. + */ + if (fsflags & MNT_EXPORTED) { + error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); + if (error) + return (error); + } + if (fsflags & MNT_SUIDDIR) { + error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); + if (error) + return (error); + } + /* + * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. + */ + if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { + if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) + fsflags |= MNT_NOSUID | MNT_USER; + } + + /* Load KLDs before we lock the covered vnode to avoid reversals. */ + vfsp = NULL; + if ((fsflags & MNT_UPDATE) == 0) { + /* Don't try to load KLDs if we're mounting the root. */ + if (fsflags & MNT_ROOTFS) + vfsp = vfs_byname(fstype); + else + vfsp = vfs_byname_kld(fstype, td, &error); + if (vfsp == NULL) + return (ENODEV); + } + + /* + * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE. 
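The "export" option handling in vfs_domount_update() above dispatches on the option length so that one code path serves both the old oexport_args and the current export_args ABI. A stand-alone sketch of that dispatch-by-size idiom, using stand-in structures rather than the real layouts:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    struct old_args { int ex_flags; };                        /* stand-in for oexport_args */
    struct new_args { int ex_flags; int ex_numsecflavors; };  /* stand-in for export_args */

    static int
    copy_export(const void *buf, size_t len, struct new_args *out)
    {
            /* Zero first so the shorter ABI leaves the newer fields defaulted. */
            memset(out, 0, sizeof(*out));
            if (len != sizeof(struct old_args) && len != sizeof(struct new_args))
                    return (EINVAL);
            memcpy(out, buf, len);
            return (0);
    }

    int
    main(void)
    {
            struct old_args oa = { 1 };
            struct new_args na;

            if (copy_export(&oa, sizeof(oa), &na) == 0)
                    printf("flags %d, secflavors %d\n",
                        na.ex_flags, na.ex_numsecflavors);
            return (0);
    }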
+ */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, + UIO_SYSSPACE, fspath, td); + error = namei(&nd); + if (error != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + if ((fsflags & MNT_UPDATE) == 0) { + pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); + strcpy(pathbuf, fspath); + error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN); + /* debug.disablefullpath == 1 results in ENODEV */ + if (error == 0 || error == ENODEV) { + error = vfs_domount_first(td, vfsp, pathbuf, vp, + fsflags, optlist); + } + free(pathbuf, M_TEMP); + } else + error = vfs_domount_update(td, vp, fsflags, optlist); + + return (error); +} + +/* + * Unmount a filesystem. + * + * Note: unmount takes a path to the vnode mounted on as argument, not + * special file (as before). + */ +#ifndef _SYS_SYSPROTO_H_ +struct unmount_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +sys_unmount(struct thread *td, struct unmount_args *uap) +{ + struct nameidata nd; + struct mount *mp; + char *pathbuf; + int error, id0, id1; + + AUDIT_ARG_VALUE(uap->flags); + if (jailed(td->td_ucred) || usermount == 0) { + error = priv_check(td, PRIV_VFS_UNMOUNT); + if (error) + return (error); + } + + pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); + error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL); + if (error) { + free(pathbuf, M_TEMP); + return (error); + } + if (uap->flags & MNT_BYFSID) { + AUDIT_ARG_TEXT(pathbuf); + /* Decode the filesystem ID. */ + if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { + free(pathbuf, M_TEMP); + return (EINVAL); + } + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { + if (mp->mnt_stat.f_fsid.val[0] == id0 && + mp->mnt_stat.f_fsid.val[1] == id1) { + vfs_ref(mp); + break; + } + } + mtx_unlock(&mountlist_mtx); + } else { + /* + * Try to find global path for path argument. + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, + UIO_SYSSPACE, pathbuf, td); + if (namei(&nd) == 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_path_to_global_path(td, nd.ni_vp, pathbuf, + MNAMELEN); + if (error == 0 || error == ENODEV) + vput(nd.ni_vp); + } + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { + if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) { + vfs_ref(mp); + break; + } + } + mtx_unlock(&mountlist_mtx); + } + free(pathbuf, M_TEMP); + if (mp == NULL) { + /* + * Previously we returned ENOENT for a nonexistent path and + * EINVAL for a non-mountpoint. We cannot tell these apart + * now, so in the !MNT_BYFSID case return the more likely + * EINVAL for compatibility. + */ + return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL); + } + + /* + * Don't allow unmounting the root filesystem. + */ + if (mp->mnt_flag & MNT_ROOTFS) { + vfs_rel(mp); + return (EINVAL); + } + error = dounmount(mp, uap->flags, td); + return (error); +} + +/* + * Return error if any of the vnodes, ignoring the root vnode + * and the syncer vnode, have non-zero usecount. + * + * This function is purely advisory - it can return false positives + * and negatives. 
+ */ +static int +vfs_check_usecounts(struct mount *mp) +{ + struct vnode *vp, *mvp; + + MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { + if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON && + vp->v_usecount != 0) { + VI_UNLOCK(vp); + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + return (EBUSY); + } + VI_UNLOCK(vp); + } + + return (0); +} + +static void +dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags) +{ + + mtx_assert(MNT_MTX(mp), MA_OWNED); + mp->mnt_kern_flag &= ~mntkflags; + if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) { + mp->mnt_kern_flag &= ~MNTK_MWAIT; + wakeup(mp); + } + MNT_IUNLOCK(mp); + if (coveredvp != NULL) { + VOP_UNLOCK(coveredvp, 0); + vdrop(coveredvp); + } + vn_finished_write(mp); +} + +/* + * Do the actual filesystem unmount. + */ +int +dounmount(struct mount *mp, int flags, struct thread *td) +{ + struct vnode *coveredvp; + int error; + uint64_t async_flag; + int mnt_gen_r; + + if ((coveredvp = mp->mnt_vnodecovered) != NULL) { + mnt_gen_r = mp->mnt_gen; + VI_LOCK(coveredvp); + vholdl(coveredvp); + vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); + /* + * Check for mp being unmounted while waiting for the + * covered vnode lock. + */ + if (coveredvp->v_mountedhere != mp || + coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { + VOP_UNLOCK(coveredvp, 0); + vdrop(coveredvp); + vfs_rel(mp); + return (EBUSY); + } + } + + /* + * Only privileged root, or (if MNT_USER is set) the user that did the + * original mount is permitted to unmount this filesystem. + */ + error = vfs_suser(mp, td); + if (error != 0) { + if (coveredvp != NULL) { + VOP_UNLOCK(coveredvp, 0); + vdrop(coveredvp); + } + vfs_rel(mp); + return (error); + } + + vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || + (mp->mnt_flag & MNT_UPDATE) != 0 || + !TAILQ_EMPTY(&mp->mnt_uppers)) { + dounmount_cleanup(mp, coveredvp, 0); + return (EBUSY); + } + mp->mnt_kern_flag |= MNTK_UNMOUNT; + if (flags & MNT_NONBUSY) { + MNT_IUNLOCK(mp); + error = vfs_check_usecounts(mp); + MNT_ILOCK(mp); + if (error != 0) { + dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT); + return (error); + } + } + /* Allow filesystems to detect that a forced unmount is in progress. */ + if (flags & MNT_FORCE) { + mp->mnt_kern_flag |= MNTK_UNMOUNTF; + MNT_IUNLOCK(mp); + /* + * Must be done after setting MNTK_UNMOUNTF and before + * waiting for mnt_lockref to become 0. + */ + VFS_PURGE(mp); + MNT_ILOCK(mp); + } + error = 0; + if (mp->mnt_lockref) { + mp->mnt_kern_flag |= MNTK_DRAINING; + error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, + "mount drain", 0); + } + MNT_IUNLOCK(mp); + KASSERT(mp->mnt_lockref == 0, + ("%s: invalid lock refcount in the drain path @ %s:%d", + __func__, __FILE__, __LINE__)); + KASSERT(error == 0, + ("%s: invalid return value for msleep in the drain path @ %s:%d", + __func__, __FILE__, __LINE__)); + + if (mp->mnt_flag & MNT_EXPUBLIC) + vfs_setpublicfs(NULL, NULL, NULL); + + /* + * From now, we can claim that the use reference on the + * coveredvp is ours, and the ref can be released only by + * successfull unmount by us, or left for later unmount + * attempt. The previously acquired hold reference is no + * longer needed to protect the vnode from reuse. 
+ */ + if (coveredvp != NULL) + vdrop(coveredvp); + + vfs_msync(mp, MNT_WAIT); + MNT_ILOCK(mp); + async_flag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + mp->mnt_kern_flag &= ~MNTK_ASYNC; + MNT_IUNLOCK(mp); + cache_purgevfs(mp, false); /* remove cache entries for this file sys */ + vfs_deallocate_syncvnode(mp); + error = VFS_UNMOUNT(mp, flags); + vn_finished_write(mp); + /* + * If we failed to flush the dirty blocks for this mount point, + * undo all the cdir/rdir and rootvnode changes we made above. + * Unless we failed to do so because the device is reporting that + * it doesn't exist anymore. + */ + if (error && error != ENXIO) { + MNT_ILOCK(mp); + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + MNT_IUNLOCK(mp); + vfs_allocate_syncvnode(mp); + MNT_ILOCK(mp); + } + mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); + mp->mnt_flag |= async_flag; + if ((mp->mnt_flag & MNT_ASYNC) != 0 && + (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) + mp->mnt_kern_flag |= MNTK_ASYNC; + if (mp->mnt_kern_flag & MNTK_MWAIT) { + mp->mnt_kern_flag &= ~MNTK_MWAIT; + wakeup(mp); + } + MNT_IUNLOCK(mp); + if (coveredvp) + VOP_UNLOCK(coveredvp, 0); + return (error); + } + mtx_lock(&mountlist_mtx); + TAILQ_REMOVE(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td); + if (coveredvp != NULL) { + coveredvp->v_mountedhere = NULL; + VOP_UNLOCK(coveredvp, 0); + } + vfs_event_signal(NULL, VQ_UNMOUNT, 0); + if (rootvnode != NULL && mp == rootvnode->v_mount) { + vrele(rootvnode); + rootvnode = NULL; + } + if (mp == rootdevmp) + rootdevmp = NULL; + vfs_mount_destroy(mp); + return (0); +} + +/* + * Report errors during filesystem mounting. + */ +void +vfs_mount_error(struct mount *mp, const char *fmt, ...) +{ + struct vfsoptlist *moptlist = mp->mnt_optnew; + va_list ap; + int error, len; + char *errmsg; + + error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); + if (error || errmsg == NULL || len <= 0) + return; + + va_start(ap, fmt); + vsnprintf(errmsg, (size_t)len, fmt, ap); + va_end(ap); +} + +void +vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...) +{ + va_list ap; + int error, len; + char *errmsg; + + error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len); + if (error || errmsg == NULL || len <= 0) + return; + + va_start(ap, fmt); + vsnprintf(errmsg, (size_t)len, fmt, ap); + va_end(ap); +} + +/* + * --------------------------------------------------------------------- + * Functions for querying mount options/arguments from filesystems. + */ + +/* + * Check that no unknown options are given + */ +int +vfs_filteropt(struct vfsoptlist *opts, const char **legal) +{ + struct vfsopt *opt; + char errmsg[255]; + const char **t, *p, *q; + int ret = 0; + + TAILQ_FOREACH(opt, opts, link) { + p = opt->name; + q = NULL; + if (p[0] == 'n' && p[1] == 'o') + q = p + 2; + for(t = global_opts; *t != NULL; t++) { + if (strcmp(*t, p) == 0) + break; + if (q != NULL) { + if (strcmp(*t, q) == 0) + break; + } + } + if (*t != NULL) + continue; + for(t = legal; *t != NULL; t++) { + if (strcmp(*t, p) == 0) + break; + if (q != NULL) { + if (strcmp(*t, q) == 0) + break; + } + } + if (*t != NULL) + continue; + snprintf(errmsg, sizeof(errmsg), + "mount option <%s> is unknown", p); + ret = EINVAL; + } + if (ret != 0) { + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(opt->name, "errmsg") == 0) { + strncpy((char *)opt->value, errmsg, opt->len); + break; + } + } + if (opt == NULL) + printf("%s\n", errmsg); + } + return (ret); +} + +/* + * Get a mount option by its name. 
+ * + * Return 0 if the option was found, ENOENT otherwise. + * If len is non-NULL it will be filled with the length + * of the option. If buf is non-NULL, it will be filled + * with the address of the option. + */ +int +vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len) +{ + struct vfsopt *opt; + + KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) == 0) { + opt->seen = 1; + if (len != NULL) + *len = opt->len; + if (buf != NULL) + *buf = opt->value; + return (0); + } + } + return (ENOENT); +} + +int +vfs_getopt_pos(struct vfsoptlist *opts, const char *name) +{ + struct vfsopt *opt; + + if (opts == NULL) + return (-1); + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) == 0) { + opt->seen = 1; + return (opt->pos); + } + } + return (-1); +} + +int +vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value) +{ + char *opt_value, *vtp; + quad_t iv; + int error, opt_len; + + error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len); + if (error != 0) + return (error); + if (opt_len == 0 || opt_value == NULL) + return (EINVAL); + if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0') + return (EINVAL); + iv = strtoq(opt_value, &vtp, 0); + if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0')) + return (EINVAL); + if (iv < 0) + return (EINVAL); + switch (vtp[0]) { + case 't': case 'T': + iv *= 1024; + /* FALLTHROUGH */ + case 'g': case 'G': + iv *= 1024; + /* FALLTHROUGH */ + case 'm': case 'M': + iv *= 1024; + /* FALLTHROUGH */ + case 'k': case 'K': + iv *= 1024; + case '\0': + break; + default: + return (EINVAL); + } + *value = iv; + + return (0); +} + +char * +vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) +{ + struct vfsopt *opt; + + *error = 0; + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) != 0) + continue; + opt->seen = 1; + if (opt->len == 0 || + ((char *)opt->value)[opt->len - 1] != '\0') { + *error = EINVAL; + return (NULL); + } + return (opt->value); + } + *error = ENOENT; + return (NULL); +} + +int +vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, + uint64_t val) +{ + struct vfsopt *opt; + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) == 0) { + opt->seen = 1; + if (w != NULL) + *w |= val; + return (1); + } + } + if (w != NULL) + *w &= ~val; + return (0); +} + +int +vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) 
+{ + va_list ap; + struct vfsopt *opt; + int ret; + + KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) != 0) + continue; + opt->seen = 1; + if (opt->len == 0 || opt->value == NULL) + return (0); + if (((char *)opt->value)[opt->len - 1] != '\0') + return (0); + va_start(ap, fmt); + ret = vsscanf(opt->value, fmt, ap); + va_end(ap); + return (ret); + } + return (0); +} + +int +vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len) +{ + struct vfsopt *opt; + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) != 0) + continue; + opt->seen = 1; + if (opt->value == NULL) + opt->len = len; + else { + if (opt->len != len) + return (EINVAL); + bcopy(value, opt->value, len); + } + return (0); + } + return (ENOENT); +} + +int +vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len) +{ + struct vfsopt *opt; + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) != 0) + continue; + opt->seen = 1; + if (opt->value == NULL) + opt->len = len; + else { + if (opt->len < len) + return (EINVAL); + opt->len = len; + bcopy(value, opt->value, len); + } + return (0); + } + return (ENOENT); +} + +int +vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value) +{ + struct vfsopt *opt; + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) != 0) + continue; + opt->seen = 1; + if (opt->value == NULL) + opt->len = strlen(value) + 1; + else if (strlcpy(opt->value, value, opt->len) >= opt->len) + return (EINVAL); + return (0); + } + return (ENOENT); +} + +/* + * Find and copy a mount option. + * + * The size of the buffer has to be specified + * in len, if it is not the same length as the + * mount option, EINVAL is returned. + * Returns ENOENT if the option is not found. + */ +int +vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len) +{ + struct vfsopt *opt; + + KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); + + TAILQ_FOREACH(opt, opts, link) { + if (strcmp(name, opt->name) == 0) { + opt->seen = 1; + if (len != opt->len) + return (EINVAL); + bcopy(opt->value, dest, opt->len); + return (0); + } + } + return (ENOENT); +} + +int +__vfs_statfs(struct mount *mp, struct statfs *sbp) +{ + int error; + + error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat); + if (sbp != &mp->mnt_stat) + *sbp = mp->mnt_stat; + return (error); +} + +void +vfs_mountedfrom(struct mount *mp, const char *from) +{ + + bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); + strlcpy(mp->mnt_stat.f_mntfromname, from, + sizeof mp->mnt_stat.f_mntfromname); +} + +/* + * --------------------------------------------------------------------- + * This is the api for building mount args and mounting filesystems from + * inside the kernel. + * + * The API works by accumulation of individual args. First error is + * latched. + * + * XXX: should be documented in new manpage kernel_mount(9) + */ + +/* A memory allocation which must be freed when we are done */ +struct mntaarg { + SLIST_ENTRY(mntaarg) next; +}; + +/* The header for the mount arguments */ +struct mntarg { + struct iovec *v; + int len; + int error; + SLIST_HEAD(, mntaarg) list; +}; + +/* + * Add a boolean argument. + * + * flag is the boolean value. + * name must start with "no". 
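Because the option name must start with "no", mount_argb() below only has to choose between name and name + 2: a set flag strips the prefix (so "noro" becomes "ro"), while a clear flag passes the name through unchanged. A tiny stand-alone sketch of that selection:

    #include <assert.h>
    #include <string.h>

    /* Mirrors the name + (flag ? 2 : 0) selection used by mount_argb(). */
    static const char *
    bool_opt_name(int flag, const char *name)
    {
            assert(name[0] == 'n' && name[1] == 'o');
            return (flag ? name + 2 : name);
    }

    int
    main(void)
    {
            assert(strcmp(bool_opt_name(1, "noro"), "ro") == 0);
            assert(strcmp(bool_opt_name(0, "nosuid"), "nosuid") == 0);
            return (0);
    }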
+ */ +struct mntarg * +mount_argb(struct mntarg *ma, int flag, const char *name) +{ + + KASSERT(name[0] == 'n' && name[1] == 'o', + ("mount_argb(...,%s): name must start with 'no'", name)); + + return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); +} + +/* + * Add an argument printf style + */ +struct mntarg * +mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) +{ + va_list ap; + struct mntaarg *maa; + struct sbuf *sb; + int len; + + if (ma == NULL) { + ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); + SLIST_INIT(&ma->list); + } + if (ma->error) + return (ma); + + ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), + M_MOUNT, M_WAITOK); + ma->v[ma->len].iov_base = (void *)(uintptr_t)name; + ma->v[ma->len].iov_len = strlen(name) + 1; + ma->len++; + + sb = sbuf_new_auto(); + va_start(ap, fmt); + sbuf_vprintf(sb, fmt, ap); + va_end(ap); + sbuf_finish(sb); + len = sbuf_len(sb) + 1; + maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); + SLIST_INSERT_HEAD(&ma->list, maa, next); + bcopy(sbuf_data(sb), maa + 1, len); + sbuf_delete(sb); + + ma->v[ma->len].iov_base = maa + 1; + ma->v[ma->len].iov_len = len; + ma->len++; + + return (ma); +} + +/* + * Add an argument which is a userland string. + */ +struct mntarg * +mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) +{ + struct mntaarg *maa; + char *tbuf; + + if (val == NULL) + return (ma); + if (ma == NULL) { + ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); + SLIST_INIT(&ma->list); + } + if (ma->error) + return (ma); + maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); + SLIST_INSERT_HEAD(&ma->list, maa, next); + tbuf = (void *)(maa + 1); + ma->error = copyinstr(val, tbuf, len, NULL); + return (mount_arg(ma, name, tbuf, -1)); +} + +/* + * Plain argument. + * + * If length is -1, treat value as a C string. + */ +struct mntarg * +mount_arg(struct mntarg *ma, const char *name, const void *val, int len) +{ + + if (ma == NULL) { + ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); + SLIST_INIT(&ma->list); + } + if (ma->error) + return (ma); + + ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), + M_MOUNT, M_WAITOK); + ma->v[ma->len].iov_base = (void *)(uintptr_t)name; + ma->v[ma->len].iov_len = strlen(name) + 1; + ma->len++; + + ma->v[ma->len].iov_base = (void *)(uintptr_t)val; + if (len < 0) + ma->v[ma->len].iov_len = strlen(val) + 1; + else + ma->v[ma->len].iov_len = len; + ma->len++; + return (ma); +} + +/* + * Free a mntarg structure + */ +static void +free_mntarg(struct mntarg *ma) +{ + struct mntaarg *maa; + + while (!SLIST_EMPTY(&ma->list)) { + maa = SLIST_FIRST(&ma->list); + SLIST_REMOVE_HEAD(&ma->list, next); + free(maa, M_MOUNT); + } + free(ma->v, M_MOUNT); + free(ma, M_MOUNT); +} + +/* + * Mount a filesystem + */ +int +kernel_mount(struct mntarg *ma, uint64_t flags) +{ + struct uio auio; + int error; + + KASSERT(ma != NULL, ("kernel_mount NULL ma")); + KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); + KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); + + auio.uio_iov = ma->v; + auio.uio_iovcnt = ma->len; + auio.uio_segflg = UIO_SYSSPACE; + + error = ma->error; + if (!error) + error = vfs_donmount(curthread, flags, &auio); + free_mntarg(ma); + return (error); +} + +/* + * A printflike function to mount a filesystem. + */ +int +kernel_vmount(int flags, ...) 
+{ + struct mntarg *ma = NULL; + va_list ap; + const char *cp; + const void *vp; + int error; + + va_start(ap, flags); + for (;;) { + cp = va_arg(ap, const char *); + if (cp == NULL) + break; + vp = va_arg(ap, const void *); + ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0)); + } + va_end(ap); + + error = kernel_mount(ma, flags); + return (error); +} + +void +vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp) +{ + + bcopy(oexp, exp, sizeof(*oexp)); + exp->ex_numsecflavors = 0; +} diff --git a/freebsd/sys/kern/vfs_subr.c b/freebsd/sys/kern/vfs_subr.c new file mode 100644 index 00000000..f84caac0 --- /dev/null +++ b/freebsd/sys/kern/vfs_subr.c @@ -0,0 +1,5719 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + */ + +/* + * External virtual filesystem routines + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_ddb.h" +#include "opt_watchdog.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +static void delmntque(struct vnode *vp); +static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, + int slpflag, int slptimeo); +static void syncer_shutdown(void *arg, int howto); +static int vtryrecycle(struct vnode *vp); +static void v_init_counters(struct vnode *); +static void v_incr_usecount(struct vnode *); +static void v_incr_usecount_locked(struct vnode *); +static void v_incr_devcount(struct vnode *); +static void v_decr_devcount(struct vnode *); +static void vgonel(struct vnode *); +static void vfs_knllock(void *arg); +static void vfs_knlunlock(void *arg); +static void vfs_knl_assert_locked(void *arg); +static void vfs_knl_assert_unlocked(void *arg); +static void vnlru_return_batches(struct vfsops *mnt_op); +static void destroy_vpollinfo(struct vpollinfo *vi); +static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, + daddr_t startlbn, daddr_t endlbn); + +/* + * These fences are intended for cases where some synchronization is + * needed between access of v_iflags and lockless vnode refcount (v_holdcnt + * and v_usecount) updates. Access to v_iflags is generally synchronized + * by the interlock, but we have some internal assertions that check vnode + * flags without acquiring the lock. Thus, these fences are INVARIANTS-only + * for now. + */ +#ifdef INVARIANTS +#define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq() +#define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel() +#else +#define VNODE_REFCOUNT_FENCE_ACQ() +#define VNODE_REFCOUNT_FENCE_REL() +#endif + +/* + * Number of vnodes in existence. Increased whenever getnewvnode() + * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode. + */ +static unsigned long numvnodes; + +SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, + "Number of vnodes in existence"); + +static counter_u64_t vnodes_created; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, + "Number of vnodes created by getnewvnode"); + +static u_long mnt_free_list_batch = 128; +SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW, + &mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list"); + +/* + * Conversion tables for conversion from vnode types to inode formats + * and back. + */ +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON +}; +int vttoif_tab[10] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT +}; + +/* + * List of vnodes that are ready for recycling. + */ +static TAILQ_HEAD(freelst, vnode) vnode_free_list; + +/* + * "Free" vnode target. Free vnodes are rarely completely free, but are + * just ones that are cheap to recycle. 
Usually they are for files which + * have been stat'd but not read; these usually have inode and namecache + * data attached to them. This target is the preferred minimum size of a + * sub-cache consisting mostly of such files. The system balances the size + * of this sub-cache with its complement to try to prevent either from + * thrashing while the other is relatively inactive. The targets express + * a preference for the best balance. + * + * "Above" this target there are 2 further targets (watermarks) related + * to recyling of free vnodes. In the best-operating case, the cache is + * exactly full, the free list has size between vlowat and vhiwat above the + * free target, and recycling from it and normal use maintains this state. + * Sometimes the free list is below vlowat or even empty, but this state + * is even better for immediate use provided the cache is not full. + * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free + * ones) to reach one of these states. The watermarks are currently hard- + * coded as 4% and 9% of the available space higher. These and the default + * of 25% for wantfreevnodes are too large if the memory size is large. + * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim + * whenever vnlru_proc() becomes active. + */ +static u_long wantfreevnodes; +SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, + &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes"); +static u_long freevnodes; +SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, + &freevnodes, 0, "Number of \"free\" vnodes"); + +static counter_u64_t recycles_count; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, + "Number of vnodes recycled to meet vnode cache targets"); + +/* + * Various variables used for debugging the new implementation of + * reassignbuf(). + * XXX these are probably of (very) limited utility now. + */ +static int reassignbufcalls; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, + "Number of calls to reassignbuf"); + +static counter_u64_t free_owe_inact; +SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, + "Number of times free vnodes kept on active list due to VFS " + "owing inactivation"); + +/* To keep more than one thread at a time from running vfs_getnewfsid */ +static struct mtx mntid_mtx; + +/* + * Lock for any access to the following: + * vnode_free_list + * numvnodes + * freevnodes + */ +static struct mtx vnode_free_list_mtx; + +/* Publicly exported FS */ +struct nfs_public nfs_pub; + +static uma_zone_t buf_trie_zone; + +/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ +static uma_zone_t vnode_zone; +static uma_zone_t vnodepoll_zone; + +/* + * The workitem queue. + * + * It is useful to delay writes of file data and filesystem metadata + * for tens of seconds so that quickly created and deleted files need + * not waste disk bandwidth being created and removed. To realize this, + * we append vnodes to a "workitem" queue. When running with a soft + * updates implementation, most pending metadata dependencies should + * not wait for more than a few seconds. Thus, mounted on block devices + * are delayed only about a half the time that file data is delayed. + * Similarly, directory updates are more critical, so are only delayed + * about a third the time that file data is delayed. 
Thus, there are + * SYNCER_MAXDELAY queues that are processed round-robin at a rate of + * one each second (driven off the filesystem syncer process). The + * syncer_delayno variable indicates the next queue that is to be processed. + * Items that need to be processed soon are placed in this queue: + * + * syncer_workitem_pending[syncer_delayno] + * + * A delay of fifteen seconds is done by placing the request fifteen + * entries later in the queue: + * + * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] + * + */ +static int syncer_delayno; +static long syncer_mask; +LIST_HEAD(synclist, bufobj); +static struct synclist *syncer_workitem_pending; +/* + * The sync_mtx protects: + * bo->bo_synclist + * sync_vnode_count + * syncer_delayno + * syncer_state + * syncer_workitem_pending + * syncer_worklist_len + * rushjob + */ +static struct mtx sync_mtx; +static struct cv sync_wakeup; + +#define SYNCER_MAXDELAY 32 +static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ +static int syncdelay = 30; /* max time to delay syncing data */ +static int filedelay = 30; /* time to delay syncing files */ +SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, + "Time to delay syncing files (in seconds)"); +static int dirdelay = 29; /* time to delay syncing directories */ +SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, + "Time to delay syncing directories (in seconds)"); +static int metadelay = 28; /* time to delay syncing metadata */ +SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, + "Time to delay syncing metadata (in seconds)"); +static int rushjob; /* number of slots to run ASAP */ +static int stat_rush_requests; /* number of times I/O speeded up */ +SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, + "Number of times I/O speeded up (rush requests)"); + +/* + * When shutting down the syncer, run it at four times normal speed. + */ +#define SYNCER_SHUTDOWN_SPEEDUP 4 +static int sync_vnode_count; +static int syncer_worklist_len; +static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } + syncer_state; + +/* Target for maximum number of vnodes. */ +int desiredvnodes; +static int gapvnodes; /* gap between wanted and desired */ +static int vhiwat; /* enough extras after expansion */ +static int vlowat; /* minimal extras before expansion */ +static int vstir; /* nonzero to stir non-free vnodes */ +static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ + +static int +sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) +{ + int error, old_desiredvnodes; + + old_desiredvnodes = desiredvnodes; + if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0) + return (error); + if (old_desiredvnodes != desiredvnodes) { + wantfreevnodes = desiredvnodes / 4; + /* XXX locking seems to be incomplete. 
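The workitem delay ring described above is plain power-of-two modular indexing: hashinit() returns the table together with a mask, and scheduling a bufobj N seconds out adds N to the current slot and masks the result. A small illustrative sketch, assuming the default 32-entry ring:

    #include <stdio.h>

    #define SYNCER_SLOTS    32                      /* assumed ring size */
    #define SYNCER_MASK     (SYNCER_SLOTS - 1)

    static int
    syncer_slot(int delayno, int delay)
    {
            return ((delayno + delay) & SYNCER_MASK);
    }

    int
    main(void)
    {
            /* Slot 30 plus a 15 second delay wraps around to slot 13. */
            printf("%d\n", syncer_slot(30, 15));
            return (0);
    }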
*/ + vfs_hash_changesize(desiredvnodes); + cache_changesize(desiredvnodes); + } + return (0); +} + +SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0, + sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes"); +SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, + &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); +static int vnlru_nowhere; +SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, + &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); + +static int +sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) +{ + struct vnode *vp; + struct nameidata nd; + char *buf; + unsigned long ndflags; + int error; + + if (req->newptr == NULL) + return (EINVAL); + if (req->newlen >= PATH_MAX) + return (E2BIG); + + buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); + error = SYSCTL_IN(req, buf, req->newlen); + if (error != 0) + goto out; + + buf[req->newlen] = '\0'; + + ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | NOCACHE | SAVENAME; + NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread); + if ((error = namei(&nd)) != 0) + goto out; + vp = nd.ni_vp; + + if ((vp->v_iflag & VI_DOOMED) != 0) { + /* + * This vnode is being recycled. Return != 0 to let the caller + * know that the sysctl had no effect. Return EAGAIN because a + * subsequent call will likely succeed (since namei will create + * a new vnode if necessary) + */ + error = EAGAIN; + goto putvnode; + } + + counter_u64_add(recycles_count, 1); + vgone(vp); +putvnode: + NDFREE(&nd, 0); +out: + free(buf, M_TEMP); + return (error); +} + +static int +sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) +{ + struct thread *td = curthread; + struct vnode *vp; + struct file *fp; + int error; + int fd; + + if (req->newptr == NULL) + return (EBADF); + + error = sysctl_handle_int(oidp, &fd, 0, req); + if (error != 0) + return (error); + error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); + if (error != 0) + return (error); + vp = fp->f_vnode; + + error = vn_lock(vp, LK_EXCLUSIVE); + if (error != 0) + goto drop; + + counter_u64_add(recycles_count, 1); + vgone(vp); + VOP_UNLOCK(vp, 0); +drop: + fdrop(fp, td); + return (error); +} + +SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, + CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, + sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); +SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, + sysctl_ftry_reclaim_vnode, "I", + "Try to reclaim a vnode by its file descriptor"); + +/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ +static int vnsz2log; + +/* + * Support for the bufobj clean & dirty pctrie. + */ +static void * +buf_trie_alloc(struct pctrie *ptree) +{ + + return uma_zalloc(buf_trie_zone, M_NOWAIT); +} + +static void +buf_trie_free(struct pctrie *ptree, void *node) +{ + + uma_zfree(buf_trie_zone, node); +} +PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free); + +/* + * Initialize the vnode management data structures. + * + * Reevaluate the following cap on the number of vnodes after the physical + * memory size exceeds 512GB. In the limit, as the physical memory size + * grows, the ratio of the memory size in KB to vnodes approaches 64:1. + */ +#ifndef MAXVNODES_MAX +#define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */ +#endif + +/* + * Initialize a vnode as it first enters the zone. 
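The debug.try_reclaim_vnode handler above takes the new sysctl value as a path (it NUL-terminates the buffer itself and rejects anything of PATH_MAX or longer), so it can be exercised from user space with sysctlbyname(3). A minimal sketch; the path is only an example and must name an existing vnode:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
            const char *path = "/tmp/some-file";    /* example path */

            if (sysctlbyname("debug.try_reclaim_vnode", NULL, NULL,
                path, strlen(path)) != 0) {
                    perror("sysctlbyname");
                    return (1);
            }
            return (0);
    }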
+ */ +static int +vnode_init(void *mem, int size, int flags) +{ + struct vnode *vp; + + vp = mem; + bzero(vp, size); + /* + * Setup locks. + */ + vp->v_vnlock = &vp->v_lock; + mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); + /* + * By default, don't allow shared locks unless filesystems opt-in. + */ + lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, + LK_NOSHARE | LK_IS_VNODE); + /* + * Initialize bufobj. + */ + bufobj_init(&vp->v_bufobj, vp); + /* + * Initialize namecache. + */ + LIST_INIT(&vp->v_cache_src); + TAILQ_INIT(&vp->v_cache_dst); + /* + * Initialize rangelocks. + */ + rangelock_init(&vp->v_rl); + return (0); +} + +/* + * Free a vnode when it is cleared from the zone. + */ +static void +vnode_fini(void *mem, int size) +{ + struct vnode *vp; + struct bufobj *bo; + + vp = mem; + rangelock_destroy(&vp->v_rl); + lockdestroy(vp->v_vnlock); + mtx_destroy(&vp->v_interlock); + bo = &vp->v_bufobj; + rw_destroy(BO_LOCKPTR(bo)); +} + +/* + * Provide the size of NFS nclnode and NFS fh for calculation of the + * vnode memory consumption. The size is specified directly to + * eliminate dependency on NFS-private header. + * + * Other filesystems may use bigger or smaller (like UFS and ZFS) + * private inode data, but the NFS-based estimation is ample enough. + * Still, we care about differences in the size between 64- and 32-bit + * platforms. + * + * Namecache structure size is heuristically + * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. + */ +#ifdef _LP64 +#define NFS_NCLNODE_SZ (528 + 64) +#define NC_SZ 148 +#else +#define NFS_NCLNODE_SZ (360 + 32) +#define NC_SZ 92 +#endif + +static void +vntblinit(void *dummy __unused) +{ + u_int i; + int physvnodes, virtvnodes; + + /* + * Desiredvnodes is a function of the physical memory size and the + * kernel's heap size. Generally speaking, it scales with the + * physical memory size. The ratio of desiredvnodes to the physical + * memory size is 1:16 until desiredvnodes exceeds 98,304. + * Thereafter, the + * marginal ratio of desiredvnodes to the physical memory size is + * 1:64. However, desiredvnodes is limited by the kernel's heap + * size. The memory required by desiredvnodes vnodes and vm objects + * must not exceed 1/10th of the kernel's heap size. + */ + physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; + virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); + desiredvnodes = min(physvnodes, virtvnodes); + if (desiredvnodes > MAXVNODES_MAX) { + if (bootverbose) + printf("Reducing kern.maxvnodes %d -> %d\n", + desiredvnodes, MAXVNODES_MAX); + desiredvnodes = MAXVNODES_MAX; + } + wantfreevnodes = desiredvnodes / 4; + mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); + TAILQ_INIT(&vnode_free_list); + mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); + vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, + vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); + vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + /* + * Preallocate enough nodes to support one-per buf so that + * we can not fail an insert. reassignbuf() callers can not + * tolerate the insertion failure. 
+ */ + buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), + NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, + UMA_ZONE_NOFREE | UMA_ZONE_VM); + uma_prealloc(buf_trie_zone, nbuf); + + vnodes_created = counter_u64_alloc(M_WAITOK); + recycles_count = counter_u64_alloc(M_WAITOK); + free_owe_inact = counter_u64_alloc(M_WAITOK); + + /* + * Initialize the filesystem syncer. + */ + syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, + &syncer_mask); + syncer_maxdelay = syncer_mask + 1; + mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); + cv_init(&sync_wakeup, "syncer"); + for (i = 1; i <= sizeof(struct vnode); i <<= 1) + vnsz2log++; + vnsz2log--; +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); + + +/* + * Mark a mount point as busy. Used to synchronize access and to delay + * unmounting. Eventually, mountlist_mtx is not released on failure. + * + * vfs_busy() is a custom lock, it can block the caller. + * vfs_busy() only sleeps if the unmount is active on the mount point. + * For a mountpoint mp, vfs_busy-enforced lock is before lock of any + * vnode belonging to mp. + * + * Lookup uses vfs_busy() to traverse mount points. + * root fs var fs + * / vnode lock A / vnode lock (/var) D + * /var vnode lock B /log vnode lock(/var/log) E + * vfs_busy lock C vfs_busy lock F + * + * Within each file system, the lock order is C->A->B and F->D->E. + * + * When traversing across mounts, the system follows that lock order: + * + * C->A->B + * | + * +->F->D->E + * + * The lookup() process for namei("/var") illustrates the process: + * VOP_LOOKUP() obtains B while A is held + * vfs_busy() obtains a shared lock on F while A and B are held + * vput() releases lock on B + * vput() releases lock on A + * VFS_ROOT() obtains lock on D while shared lock on F is held + * vfs_unbusy() releases shared lock on F + * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. + * Attempt to lock A (instead of vp_crossmp) while D is held would + * violate the global order, causing deadlocks. + * + * dounmount() locks B while F is drained. + */ +int +vfs_busy(struct mount *mp, int flags) +{ + + MPASS((flags & ~MBF_MASK) == 0); + CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); + + MNT_ILOCK(mp); + MNT_REF(mp); + /* + * If mount point is currently being unmounted, sleep until the + * mount point fate is decided. If thread doing the unmounting fails, + * it will clear MNTK_UNMOUNT flag before waking us up, indicating + * that this mount point has survived the unmount attempt and vfs_busy + * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE + * flag in addition to MNTK_UNMOUNT, indicating that mount point is + * about to be really destroyed. vfs_busy needs to release its + * reference on the mount point in this case and return with ENOENT, + * telling the caller that mount mount it tried to busy is no longer + * valid. + */ + while (mp->mnt_kern_flag & MNTK_UNMOUNT) { + if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { + MNT_REL(mp); + MNT_IUNLOCK(mp); + CTR1(KTR_VFS, "%s: failed busying before sleeping", + __func__); + return (ENOENT); + } + if (flags & MBF_MNTLSTLOCK) + mtx_unlock(&mountlist_mtx); + mp->mnt_kern_flag |= MNTK_MWAIT; + msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); + if (flags & MBF_MNTLSTLOCK) + mtx_lock(&mountlist_mtx); + MNT_ILOCK(mp); + } + if (flags & MBF_MNTLSTLOCK) + mtx_unlock(&mountlist_mtx); + mp->mnt_lockref++; + MNT_IUNLOCK(mp); + return (0); +} + +/* + * Free a busy filesystem. 
+ */ +void +vfs_unbusy(struct mount *mp) +{ + + CTR2(KTR_VFS, "%s: mp %p", __func__, mp); + MNT_ILOCK(mp); + MNT_REL(mp); + KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref")); + mp->mnt_lockref--; + if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { + MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); + CTR1(KTR_VFS, "%s: waking up waiters", __func__); + mp->mnt_kern_flag &= ~MNTK_DRAINING; + wakeup(&mp->mnt_lockref); + } + MNT_IUNLOCK(mp); +} + +/* + * Lookup a mount point by filesystem identifier. + */ +struct mount * +vfs_getvfs(fsid_t *fsid) +{ + struct mount *mp; + + CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { + vfs_ref(mp); + mtx_unlock(&mountlist_mtx); + return (mp); + } + } + mtx_unlock(&mountlist_mtx); + CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); + return ((struct mount *) 0); +} + +/* + * Lookup a mount point by filesystem identifier, busying it before + * returning. + * + * To avoid congestion on mountlist_mtx, implement simple direct-mapped + * cache for popular filesystem identifiers. The cache is lockess, using + * the fact that struct mount's are never freed. In worst case we may + * get pointer to unmounted or even different filesystem, so we have to + * check what we got, and go slow way if so. + */ +struct mount * +vfs_busyfs(fsid_t *fsid) +{ +#define FSID_CACHE_SIZE 256 + typedef struct mount * volatile vmp_t; + static vmp_t cache[FSID_CACHE_SIZE]; + struct mount *mp; + int error; + uint32_t hash; + + CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); + hash = fsid->val[0] ^ fsid->val[1]; + hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); + mp = cache[hash]; + if (mp == NULL || + mp->mnt_stat.f_fsid.val[0] != fsid->val[0] || + mp->mnt_stat.f_fsid.val[1] != fsid->val[1]) + goto slow; + if (vfs_busy(mp, 0) != 0) { + cache[hash] = NULL; + goto slow; + } + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) + return (mp); + else + vfs_unbusy(mp); + +slow: + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { + error = vfs_busy(mp, MBF_MNTLSTLOCK); + if (error) { + cache[hash] = NULL; + mtx_unlock(&mountlist_mtx); + return (NULL); + } + cache[hash] = mp; + return (mp); + } + } + CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); + mtx_unlock(&mountlist_mtx); + return ((struct mount *) 0); +} + +/* + * Check if a user can access privileged mount options. + */ +int +vfs_suser(struct mount *mp, struct thread *td) +{ + int error; + + if (jailed(td->td_ucred)) { + /* + * If the jail of the calling thread lacks permission for + * this type of file system, deny immediately. + */ + if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) + return (EPERM); + + /* + * If the file system was mounted outside the jail of the + * calling thread, deny immediately. + */ + if (prison_check(td->td_ucred, mp->mnt_cred) != 0) + return (EPERM); + } + + /* + * If file system supports delegated administration, we don't check + * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified + * by the file system itself. + * If this is not the user that did original mount, we check for + * the PRIV_VFS_MOUNT_OWNER privilege. 
+ */ + if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && + mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { + if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) + return (error); + } + return (0); +} + +/* + * Get a new unique fsid. Try to make its val[0] unique, since this value + * will be used to create fake device numbers for stat(). Also try (but + * not so hard) make its val[0] unique mod 2^16, since some emulators only + * support 16-bit device numbers. We end up with unique val[0]'s for the + * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. + * + * Keep in mind that several mounts may be running in parallel. Starting + * the search one past where the previous search terminated is both a + * micro-optimization and a defense against returning the same fsid to + * different mounts. + */ +void +vfs_getnewfsid(struct mount *mp) +{ + static uint16_t mntid_base; + struct mount *nmp; + fsid_t tfsid; + int mtype; + + CTR2(KTR_VFS, "%s: mp %p", __func__, mp); + mtx_lock(&mntid_mtx); + mtype = mp->mnt_vfc->vfc_typenum; + tfsid.val[1] = mtype; + mtype = (mtype & 0xFF) << 24; + for (;;) { + tfsid.val[0] = makedev(255, + mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); + mntid_base++; + if ((nmp = vfs_getvfs(&tfsid)) == NULL) + break; + vfs_rel(nmp); + } + mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; + mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; + mtx_unlock(&mntid_mtx); +} + +/* + * Knob to control the precision of file timestamps: + * + * 0 = seconds only; nanoseconds zeroed. + * 1 = seconds and nanoseconds, accurate within 1/HZ. + * 2 = seconds and nanoseconds, truncated to microseconds. + * >=3 = seconds and nanoseconds, maximum precision. + */ +enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; + +static int timestamp_precision = TSP_USEC; +SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, + ×tamp_precision, 0, "File timestamp precision (0: seconds, " + "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " + "3+: sec + ns (max. precision))"); + +/* + * Get a current timestamp. + */ +void +vfs_timestamp(struct timespec *tsp) +{ + struct timeval tv; + + switch (timestamp_precision) { + case TSP_SEC: + tsp->tv_sec = time_second; + tsp->tv_nsec = 0; + break; + case TSP_HZ: + getnanotime(tsp); + break; + case TSP_USEC: + microtime(&tv); + TIMEVAL_TO_TIMESPEC(&tv, tsp); + break; + case TSP_NSEC: + default: + nanotime(tsp); + break; + } +} + +/* + * Set vnode attributes to VNOVAL + */ +void +vattr_null(struct vattr *vap) +{ + + vap->va_type = VNON; + vap->va_size = VNOVAL; + vap->va_bytes = VNOVAL; + vap->va_mode = VNOVAL; + vap->va_nlink = VNOVAL; + vap->va_uid = VNOVAL; + vap->va_gid = VNOVAL; + vap->va_fsid = VNOVAL; + vap->va_fileid = VNOVAL; + vap->va_blocksize = VNOVAL; + vap->va_rdev = VNOVAL; + vap->va_atime.tv_sec = VNOVAL; + vap->va_atime.tv_nsec = VNOVAL; + vap->va_mtime.tv_sec = VNOVAL; + vap->va_mtime.tv_nsec = VNOVAL; + vap->va_ctime.tv_sec = VNOVAL; + vap->va_ctime.tv_nsec = VNOVAL; + vap->va_birthtime.tv_sec = VNOVAL; + vap->va_birthtime.tv_nsec = VNOVAL; + vap->va_flags = VNOVAL; + vap->va_gen = VNOVAL; + vap->va_vaflags = 0; +} + +/* + * This routine is called when we have too many vnodes. It attempts + * to free vnodes and will potentially free vnodes that still + * have VM backing store (VM backing store is typically the cause + * of a vnode blowout so we want to do this). Therefore, this operation + * is not considered cheap. + * + * A number of conditions may prevent a vnode from being reclaimed. 
+ * the buffer cache may have references on the vnode, a directory + * vnode may still have references due to the namei cache representing + * underlying files, or the vnode may be in active use. It is not + * desirable to reuse such vnodes. These conditions may cause the + * number of vnodes to reach some minimum value regardless of what + * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. + * + * @param mp Try to reclaim vnodes from this mountpoint + * @param reclaim_nc_src Only reclaim directories with outgoing namecache + * entries if this argument is strue + * @param trigger Only reclaim vnodes with fewer than this many resident + * pages. + * @return The number of vnodes that were reclaimed. + */ +static int +vlrureclaim(struct mount *mp, bool reclaim_nc_src, int trigger) +{ + struct vnode *vp; + int count, done, target; + + done = 0; + vn_start_write(NULL, &mp, V_WAIT); + MNT_ILOCK(mp); + count = mp->mnt_nvnodelistsize; + target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1); + target = target / 10 + 1; + while (count != 0 && done < target) { + vp = TAILQ_FIRST(&mp->mnt_nvnodelist); + while (vp != NULL && vp->v_type == VMARKER) + vp = TAILQ_NEXT(vp, v_nmntvnodes); + if (vp == NULL) + break; + /* + * XXX LRU is completely broken for non-free vnodes. First + * by calling here in mountpoint order, then by moving + * unselected vnodes to the end here, and most grossly by + * removing the vlruvp() function that was supposed to + * maintain the order. (This function was born broken + * since syncer problems prevented it doing anything.) The + * order is closer to LRC (C = Created). + * + * LRU reclaiming of vnodes seems to have last worked in + * FreeBSD-3 where LRU wasn't mentioned under any spelling. + * Then there was no hold count, and inactive vnodes were + * simply put on the free list in LRU order. The separate + * lists also break LRU. We prefer to reclaim from the + * free list for technical reasons. This tends to thrash + * the free list to keep very unrecently used held vnodes. + * The problem is mitigated by keeping the free list large. + */ + TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + --count; + if (!VI_TRYLOCK(vp)) + goto next_iter; + /* + * If it's been deconstructed already, it's still + * referenced, or it exceeds the trigger, skip it. + * Also skip free vnodes. We are trying to make space + * to expand the free list, not reduce it. + */ + if (vp->v_usecount || + (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || + ((vp->v_iflag & VI_FREE) != 0) || + (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && + vp->v_object->resident_page_count > trigger)) { + VI_UNLOCK(vp); + goto next_iter; + } + MNT_IUNLOCK(mp); + vholdl(vp); + if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { + vdrop(vp); + goto next_iter_mntunlocked; + } + VI_LOCK(vp); + /* + * v_usecount may have been bumped after VOP_LOCK() dropped + * the vnode interlock and before it was locked again. + * + * It is not necessary to recheck VI_DOOMED because it can + * only be set by another thread that holds both the vnode + * lock and vnode interlock. If another thread has the + * vnode lock before we get to VOP_LOCK() and obtains the + * vnode interlock after VOP_LOCK() drops the vnode + * interlock, the other thread will be unable to drop the + * vnode lock before our VOP_LOCK() call fails. 
+ */ + if (vp->v_usecount || + (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || + (vp->v_iflag & VI_FREE) != 0 || + (vp->v_object != NULL && + vp->v_object->resident_page_count > trigger)) { + VOP_UNLOCK(vp, LK_INTERLOCK); + vdrop(vp); + goto next_iter_mntunlocked; + } + KASSERT((vp->v_iflag & VI_DOOMED) == 0, + ("VI_DOOMED unexpectedly detected in vlrureclaim()")); + counter_u64_add(recycles_count, 1); + vgonel(vp); + VOP_UNLOCK(vp, 0); + vdropl(vp); + done++; +next_iter_mntunlocked: + if (!should_yield()) + goto relock_mnt; + goto yield; +next_iter: + if (!should_yield()) + continue; + MNT_IUNLOCK(mp); +yield: + kern_yield(PRI_USER); +relock_mnt: + MNT_ILOCK(mp); + } + MNT_IUNLOCK(mp); + vn_finished_write(mp); + return done; +} + +static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ +SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, + 0, + "limit on vnode free requests per call to the vnlru_free routine"); + +/* + * Attempt to reduce the free list by the requested amount. + */ +static void +vnlru_free_locked(int count, struct vfsops *mnt_op) +{ + struct vnode *vp; + struct mount *mp; + bool tried_batches; + + tried_batches = false; + mtx_assert(&vnode_free_list_mtx, MA_OWNED); + if (count > max_vnlru_free) + count = max_vnlru_free; + for (; count > 0; count--) { + vp = TAILQ_FIRST(&vnode_free_list); + /* + * The list can be modified while the free_list_mtx + * has been dropped and vp could be NULL here. + */ + if (vp == NULL) { + if (tried_batches) + break; + mtx_unlock(&vnode_free_list_mtx); + vnlru_return_batches(mnt_op); + tried_batches = true; + mtx_lock(&vnode_free_list_mtx); + continue; + } + + VNASSERT(vp->v_op != NULL, vp, + ("vnlru_free: vnode already reclaimed.")); + KASSERT((vp->v_iflag & VI_FREE) != 0, + ("Removing vnode not on freelist")); + KASSERT((vp->v_iflag & VI_ACTIVE) == 0, + ("Mangling active vnode")); + TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); + + /* + * Don't recycle if our vnode is from different type + * of mount point. Note that mp is type-safe, the + * check does not reach unmapped address even if + * vnode is reclaimed. + * Don't recycle if we can't get the interlock without + * blocking. + */ + if ((mnt_op != NULL && (mp = vp->v_mount) != NULL && + mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); + continue; + } + VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0, + vp, ("vp inconsistent on freelist")); + + /* + * The clear of VI_FREE prevents activation of the + * vnode. There is no sense in putting the vnode on + * the mount point active list, only to remove it + * later during recycling. Inline the relevant part + * of vholdl(), to avoid triggering assertions or + * activating. + */ + freevnodes--; + vp->v_iflag &= ~VI_FREE; + VNODE_REFCOUNT_FENCE_REL(); + refcount_acquire(&vp->v_holdcnt); + + mtx_unlock(&vnode_free_list_mtx); + VI_UNLOCK(vp); + vtryrecycle(vp); + /* + * If the recycled succeeded this vdrop will actually free + * the vnode. If not it will simply place it back on + * the free list. + */ + vdrop(vp); + mtx_lock(&vnode_free_list_mtx); + } +} + +void +vnlru_free(int count, struct vfsops *mnt_op) +{ + + mtx_lock(&vnode_free_list_mtx); + vnlru_free_locked(count, mnt_op); + mtx_unlock(&vnode_free_list_mtx); +} + + +/* XXX some names and initialization are bad for limits and watermarks. 
*/ +static int +vspace(void) +{ + int space; + + gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); + vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ + vlowat = vhiwat / 2; + if (numvnodes > desiredvnodes) + return (0); + space = desiredvnodes - numvnodes; + if (freevnodes > wantfreevnodes) + space += freevnodes - wantfreevnodes; + return (space); +} + +static void +vnlru_return_batch_locked(struct mount *mp) +{ + struct vnode *vp; + + mtx_assert(&mp->mnt_listmtx, MA_OWNED); + + if (mp->mnt_tmpfreevnodelistsize == 0) + return; + + TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) { + VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp, + ("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist")); + vp->v_mflag &= ~VMP_TMPMNTFREELIST; + } + mtx_lock(&vnode_free_list_mtx); + TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist); + freevnodes += mp->mnt_tmpfreevnodelistsize; + mtx_unlock(&vnode_free_list_mtx); + mp->mnt_tmpfreevnodelistsize = 0; +} + +static void +vnlru_return_batch(struct mount *mp) +{ + + mtx_lock(&mp->mnt_listmtx); + vnlru_return_batch_locked(mp); + mtx_unlock(&mp->mnt_listmtx); +} + +static void +vnlru_return_batches(struct vfsops *mnt_op) +{ + struct mount *mp, *nmp; + bool need_unbusy; + + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + need_unbusy = false; + if (mnt_op != NULL && mp->mnt_op != mnt_op) + goto next; + if (mp->mnt_tmpfreevnodelistsize == 0) + goto next; + if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) { + vnlru_return_batch(mp); + need_unbusy = true; + mtx_lock(&mountlist_mtx); + } +next: + nmp = TAILQ_NEXT(mp, mnt_list); + if (need_unbusy) + vfs_unbusy(mp); + } + mtx_unlock(&mountlist_mtx); +} + +/* + * Attempt to recycle vnodes in a context that is always safe to block. + * Calling vlrurecycle() from the bowels of filesystem code has some + * interesting deadlock problems. + */ +static struct proc *vnlruproc; +static int vnlruproc_sig; + +static void +vnlru_proc(void) +{ + struct mount *mp, *nmp; + unsigned long onumvnodes; + int done, force, trigger, usevnodes, vsp; + bool reclaim_nc_src; + + EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, + SHUTDOWN_PRI_FIRST); + + force = 0; + for (;;) { + kproc_suspend_check(vnlruproc); + mtx_lock(&vnode_free_list_mtx); + /* + * If numvnodes is too large (due to desiredvnodes being + * adjusted using its sysctl, or emergency growth), first + * try to reduce it by discarding from the free list. + */ + if (numvnodes > desiredvnodes) + vnlru_free_locked(numvnodes - desiredvnodes, NULL); + /* + * Sleep if the vnode cache is in a good state. This is + * when it is not over-full and has space for about a 4% + * or 9% expansion (by growing its size or inexcessively + * reducing its free list). Otherwise, try to reclaim + * space for a 10% expansion. + */ + if (vstir && force == 0) { + force = 1; + vstir = 0; + } + vsp = vspace(); + if (vsp >= vlowat && force == 0) { + vnlruproc_sig = 0; + wakeup(&vnlruproc_sig); + msleep(vnlruproc, &vnode_free_list_mtx, + PVFS|PDROP, "vlruwt", hz); + continue; + } + mtx_unlock(&vnode_free_list_mtx); + done = 0; + onumvnodes = numvnodes; + /* + * Calculate parameters for recycling. These are the same + * throughout the loop to give some semblance of fairness. + * The trigger point is to avoid recycling vnodes with lots + * of resident pages. We aren't trying to free memory; we + * are trying to recycle or at least free vnodes. 
+ */ + if (numvnodes <= desiredvnodes) + usevnodes = numvnodes - freevnodes; + else + usevnodes = numvnodes; + if (usevnodes <= 0) + usevnodes = 1; + /* + * The trigger value is is chosen to give a conservatively + * large value to ensure that it alone doesn't prevent + * making progress. The value can easily be so large that + * it is effectively infinite in some congested and + * misconfigured cases, and this is necessary. Normally + * it is about 8 to 100 (pages), which is quite large. + */ + trigger = vm_cnt.v_page_count * 2 / usevnodes; + if (force < 2) + trigger = vsmalltrigger; + reclaim_nc_src = force >= 3; + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + done += vlrureclaim(mp, reclaim_nc_src, trigger); + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp); + } + mtx_unlock(&mountlist_mtx); + if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) + uma_reclaim(); + if (done == 0) { + if (force == 0 || force == 1) { + force = 2; + continue; + } + if (force == 2) { + force = 3; + continue; + } + force = 0; + vnlru_nowhere++; + tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); + } else + kern_yield(PRI_USER); + /* + * After becoming active to expand above low water, keep + * active until above high water. + */ + vsp = vspace(); + force = vsp < vhiwat; + } +} + +static struct kproc_desc vnlru_kp = { + "vnlru", + vnlru_proc, + &vnlruproc +}; +SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, + &vnlru_kp); + +/* + * Routines having to do with the management of the vnode table. + */ + +/* + * Try to recycle a freed vnode. We abort if anyone picks up a reference + * before we actually vgone(). This function must be called with the vnode + * held to prevent the vnode from being returned to the free list midway + * through vgone(). + */ +static int +vtryrecycle(struct vnode *vp) +{ + struct mount *vnmp; + + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + VNASSERT(vp->v_holdcnt, vp, + ("vtryrecycle: Recycling vp %p without a reference.", vp)); + /* + * This vnode may found and locked via some other list, if so we + * can't recycle it yet. + */ + if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { + CTR2(KTR_VFS, + "%s: impossible to recycle, vp %p lock is already held", + __func__, vp); + return (EWOULDBLOCK); + } + /* + * Don't recycle if its filesystem is being suspended. + */ + if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { + VOP_UNLOCK(vp, 0); + CTR2(KTR_VFS, + "%s: impossible to recycle, cannot start the write for %p", + __func__, vp); + return (EBUSY); + } + /* + * If we got this far, we need to acquire the interlock and see if + * anyone picked up this vnode from another list. If not, we will + * mark it with DOOMED via vgonel() so that anyone who does find it + * will skip over it. + */ + VI_LOCK(vp); + if (vp->v_usecount) { + VOP_UNLOCK(vp, LK_INTERLOCK); + vn_finished_write(vnmp); + CTR2(KTR_VFS, + "%s: impossible to recycle, %p is already referenced", + __func__, vp); + return (EBUSY); + } + if ((vp->v_iflag & VI_DOOMED) == 0) { + counter_u64_add(recycles_count, 1); + vgonel(vp); + } + VOP_UNLOCK(vp, LK_INTERLOCK); + vn_finished_write(vnmp); + return (0); +} + +static void +vcheckspace(void) +{ + int vsp; + + vsp = vspace(); + if (vsp < vlowat && vnlruproc_sig == 0) { + vnlruproc_sig = 1; + wakeup(vnlruproc); + } +} + +/* + * Wait if necessary for space for a new vnode. 
+ */ +static int +getnewvnode_wait(int suspended) +{ + + mtx_assert(&vnode_free_list_mtx, MA_OWNED); + if (numvnodes >= desiredvnodes) { + if (suspended) { + /* + * The file system is being suspended. We cannot + * risk a deadlock here, so allow allocation of + * another vnode even if this would give too many. + */ + return (0); + } + if (vnlruproc_sig == 0) { + vnlruproc_sig = 1; /* avoid unnecessary wakeups */ + wakeup(vnlruproc); + } + msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, + "vlruwk", hz); + } + /* Post-adjust like the pre-adjust in getnewvnode(). */ + if (numvnodes + 1 > desiredvnodes && freevnodes > 1) + vnlru_free_locked(1, NULL); + return (numvnodes >= desiredvnodes ? ENFILE : 0); +} + +/* + * This hack is fragile, and probably not needed any more now that the + * watermark handling works. + */ +void +getnewvnode_reserve(u_int count) +{ + struct thread *td; + + /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */ + /* XXX no longer so quick, but this part is not racy. */ + mtx_lock(&vnode_free_list_mtx); + if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes) + vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes, + freevnodes - wantfreevnodes), NULL); + mtx_unlock(&vnode_free_list_mtx); + + td = curthread; + /* First try to be quick and racy. */ + if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { + td->td_vp_reserv += count; + vcheckspace(); /* XXX no longer so quick, but more racy */ + return; + } else + atomic_subtract_long(&numvnodes, count); + + mtx_lock(&vnode_free_list_mtx); + while (count > 0) { + if (getnewvnode_wait(0) == 0) { + count--; + td->td_vp_reserv++; + atomic_add_long(&numvnodes, 1); + } + } + vcheckspace(); + mtx_unlock(&vnode_free_list_mtx); +} + +/* + * This hack is fragile, especially if desiredvnodes or wantvnodes are + * misconfgured or changed significantly. Reducing desiredvnodes below + * the reserved amount should cause bizarre behaviour like reducing it + * below the number of active vnodes -- the system will try to reduce + * numvnodes to match, but should fail, so the subtraction below should + * not overflow. + */ +void +getnewvnode_drop_reserve(void) +{ + struct thread *td; + + td = curthread; + atomic_subtract_long(&numvnodes, td->td_vp_reserv); + td->td_vp_reserv = 0; +} + +/* + * Return the next vnode from the free list. + */ +int +getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, + struct vnode **vpp) +{ + struct vnode *vp; + struct thread *td; + struct lock_object *lo; + static int cyclecount; + int error __unused; + + CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); + vp = NULL; + td = curthread; + if (td->td_vp_reserv > 0) { + td->td_vp_reserv -= 1; + goto alloc; + } + mtx_lock(&vnode_free_list_mtx); + if (numvnodes < desiredvnodes) + cyclecount = 0; + else if (cyclecount++ >= freevnodes) { + cyclecount = 0; + vstir = 1; + } + /* + * Grow the vnode cache if it will not be above its target max + * after growing. Otherwise, if the free list is nonempty, try + * to reclaim 1 item from it before growing the cache (possibly + * above its target max if the reclamation failed or is delayed). + * Otherwise, wait for some space. In all cases, schedule + * vnlru_proc() if we are getting short of space. The watermarks + * should be chosen so that we never wait or even reclaim from + * the free list to below its target minimum. 
+ */ + if (numvnodes + 1 <= desiredvnodes) + ; + else if (freevnodes > 0) + vnlru_free_locked(1, NULL); + else { + error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & + MNTK_SUSPEND)); +#if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ + if (error != 0) { + mtx_unlock(&vnode_free_list_mtx); + return (error); + } +#endif + } + vcheckspace(); + atomic_add_long(&numvnodes, 1); + mtx_unlock(&vnode_free_list_mtx); +alloc: + counter_u64_add(vnodes_created, 1); + vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK); + /* + * Locks are given the generic name "vnode" when created. + * Follow the historic practice of using the filesystem + * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. + * + * Locks live in a witness group keyed on their name. Thus, + * when a lock is renamed, it must also move from the witness + * group of its old name to the witness group of its new name. + * + * The change only needs to be made when the vnode moves + * from one filesystem type to another. We ensure that each + * filesystem use a single static name pointer for its tag so + * that we can compare pointers rather than doing a strcmp(). + */ + lo = &vp->v_vnlock->lock_object; + if (lo->lo_name != tag) { + lo->lo_name = tag; + WITNESS_DESTROY(lo); + WITNESS_INIT(lo, tag); + } + /* + * By default, don't allow shared locks unless filesystems opt-in. + */ + vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; + /* + * Finalize various vnode identity bits. + */ + KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); + KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); + KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); + vp->v_type = VNON; + vp->v_tag = tag; + vp->v_op = vops; + v_init_counters(vp); + vp->v_bufobj.bo_ops = &buf_ops_bio; +#ifdef DIAGNOSTIC + if (mp == NULL && vops != &dead_vnodeops) + printf("NULL mp in getnewvnode(9), tag %s\n", tag); +#endif +#ifdef MAC + mac_vnode_init(vp); + if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) + mac_vnode_associate_singlelabel(mp, vp); +#endif + if (mp != NULL) { + vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; + if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) + vp->v_vflag |= VV_NOKNOTE; + } + + /* + * For the filesystems which do not use vfs_hash_insert(), + * still initialize v_hash to have vfs_hash_index() useful. + * E.g., nullfs uses vfs_hash_index() on the lower vnode for + * its own hashing. + */ + vp->v_hash = (uintptr_t)vp >> vnsz2log; + + *vpp = vp; + return (0); +} + +/* + * Delete from old mount point vnode list, if on one. 
+ */ +static void +delmntque(struct vnode *vp) +{ + struct mount *mp; + int active; + + mp = vp->v_mount; + if (mp == NULL) + return; + MNT_ILOCK(mp); + VI_LOCK(vp); + KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, + ("Active vnode list size %d > Vnode list size %d", + mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); + active = vp->v_iflag & VI_ACTIVE; + vp->v_iflag &= ~VI_ACTIVE; + if (active) { + mtx_lock(&mp->mnt_listmtx); + TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); + mp->mnt_activevnodelistsize--; + mtx_unlock(&mp->mnt_listmtx); + } + vp->v_mount = NULL; + VI_UNLOCK(vp); + VNASSERT(mp->mnt_nvnodelistsize > 0, vp, + ("bad mount point vnode list size")); + TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + mp->mnt_nvnodelistsize--; + MNT_REL(mp); + MNT_IUNLOCK(mp); +} + +static void +insmntque_stddtr(struct vnode *vp, void *dtr_arg) +{ + + vp->v_data = NULL; + vp->v_op = &dead_vnodeops; + vgone(vp); + vput(vp); +} + +/* + * Insert into list of vnodes for the new mount point, if available. + */ +int +insmntque1(struct vnode *vp, struct mount *mp, + void (*dtr)(struct vnode *, void *), void *dtr_arg) +{ + + KASSERT(vp->v_mount == NULL, + ("insmntque: vnode already on per mount vnode list")); + VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); + ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); + + /* + * We acquire the vnode interlock early to ensure that the + * vnode cannot be recycled by another process releasing a + * holdcnt on it before we get it on both the vnode list + * and the active vnode list. The mount mutex protects only + * manipulation of the vnode list and the vnode freelist + * mutex protects only manipulation of the active vnode list. + * Hence the need to hold the vnode interlock throughout. + */ + MNT_ILOCK(mp); + VI_LOCK(vp); + if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && + ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || + mp->mnt_nvnodelistsize == 0)) && + (vp->v_vflag & VV_FORCEINSMQ) == 0) { + VI_UNLOCK(vp); + MNT_IUNLOCK(mp); + if (dtr != NULL) + dtr(vp, dtr_arg); + return (EBUSY); + } + vp->v_mount = mp; + MNT_REF(mp); + TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, + ("neg mount point vnode list size")); + mp->mnt_nvnodelistsize++; + KASSERT((vp->v_iflag & VI_ACTIVE) == 0, + ("Activating already active vnode")); + vp->v_iflag |= VI_ACTIVE; + mtx_lock(&mp->mnt_listmtx); + TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); + mp->mnt_activevnodelistsize++; + mtx_unlock(&mp->mnt_listmtx); + VI_UNLOCK(vp); + MNT_IUNLOCK(mp); + return (0); +} + +int +insmntque(struct vnode *vp, struct mount *mp) +{ + + return (insmntque1(vp, mp, insmntque_stddtr, NULL)); +} + +/* + * Flush out and invalidate all buffers associated with a bufobj + * Called with the underlying object locked. + */ +int +bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) +{ + int error; + + BO_LOCK(bo); + if (flags & V_SAVE) { + error = bufobj_wwait(bo, slpflag, slptimeo); + if (error) { + BO_UNLOCK(bo); + return (error); + } + if (bo->bo_dirty.bv_cnt > 0) { + BO_UNLOCK(bo); + if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) + return (error); + /* + * XXX We could save a lock/unlock if this was only + * enabled under INVARIANTS + */ + BO_LOCK(bo); + if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) + panic("vinvalbuf: dirty bufs"); + } + } + /* + * If you alter this loop please notice that interlock is dropped and + * reacquired in flushbuflist. 
Special care is needed to ensure that + * no race conditions occur from this. + */ + do { + error = flushbuflist(&bo->bo_clean, + flags, bo, slpflag, slptimeo); + if (error == 0 && !(flags & V_CLEANONLY)) + error = flushbuflist(&bo->bo_dirty, + flags, bo, slpflag, slptimeo); + if (error != 0 && error != EAGAIN) { + BO_UNLOCK(bo); + return (error); + } + } while (error != 0); + + /* + * Wait for I/O to complete. XXX needs cleaning up. The vnode can + * have write I/O in-progress but if there is a VM object then the + * VM object can also have read-I/O in-progress. + */ + do { + bufobj_wwait(bo, 0, 0); + if ((flags & V_VMIO) == 0) { + BO_UNLOCK(bo); + if (bo->bo_object != NULL) { + VM_OBJECT_WLOCK(bo->bo_object); + vm_object_pip_wait(bo->bo_object, "bovlbx"); + VM_OBJECT_WUNLOCK(bo->bo_object); + } + BO_LOCK(bo); + } + } while (bo->bo_numoutput > 0); + BO_UNLOCK(bo); + + /* + * Destroy the copy in the VM cache, too. + */ + if (bo->bo_object != NULL && + (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { + VM_OBJECT_WLOCK(bo->bo_object); + vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? + OBJPR_CLEANONLY : 0); + VM_OBJECT_WUNLOCK(bo->bo_object); + } + +#ifdef INVARIANTS + BO_LOCK(bo); + if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | + V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || + bo->bo_clean.bv_cnt > 0)) + panic("vinvalbuf: flush failed"); + if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && + bo->bo_dirty.bv_cnt > 0) + panic("vinvalbuf: flush dirty failed"); + BO_UNLOCK(bo); +#endif + return (0); +} + +/* + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. + */ +int +vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) +{ + + CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); + ASSERT_VOP_LOCKED(vp, "vinvalbuf"); + if (vp->v_object != NULL && vp->v_object->handle != vp) + return (0); + return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); +} + +/* + * Flush out buffers on the specified list. + * + */ +static int +flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, + int slptimeo) +{ + struct buf *bp, *nbp; + int retval, error; + daddr_t lblkno; + b_xflags_t xflags; + + ASSERT_BO_WLOCKED(bo); + + retval = 0; + TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { + if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || + ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { + continue; + } + if (nbp != NULL) { + lblkno = nbp->b_lblkno; + xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); + } + retval = EAGAIN; + error = BUF_TIMELOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), + "flushbuf", slpflag, slptimeo); + if (error) { + BO_LOCK(bo); + return (error != ENOLCK ? error : EAGAIN); + } + KASSERT(bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", + bp, bp->b_bufobj, bo)); + /* + * XXX Since there are no node locks for NFS, I + * believe there is a slight chance that a delayed + * write will occur while sleeping just above, so + * check for it. + */ + if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && + (flags & V_SAVE)) { + bremfree(bp); + bp->b_flags |= B_ASYNC; + bwrite(bp); + BO_LOCK(bo); + return (EAGAIN); /* XXX: why not loop ? 
*/ + } + bremfree(bp); + bp->b_flags |= (B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + BO_LOCK(bo); + if (nbp == NULL) + break; + nbp = gbincore(bo, lblkno); + if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) + != xflags) + break; /* nbp invalid */ + } + return (retval); +} + +int +bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) +{ + struct buf *bp; + int error; + daddr_t lblkno; + + ASSERT_BO_LOCKED(bo); + + for (lblkno = startn;;) { +again: + bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); + if (bp == NULL || bp->b_lblkno >= endn || + bp->b_lblkno < startn) + break; + error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | + LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); + if (error != 0) { + BO_RLOCK(bo); + if (error == ENOLCK) + goto again; + return (error); + } + KASSERT(bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", + bp, bp->b_bufobj, bo)); + lblkno = bp->b_lblkno + 1; + if ((bp->b_flags & B_MANAGED) == 0) + bremfree(bp); + bp->b_flags |= B_RELBUF; + /* + * In the VMIO case, use the B_NOREUSE flag to hint that the + * pages backing each buffer in the range are unlikely to be + * reused. Dirty buffers will have the hint applied once + * they've been written. + */ + if ((bp->b_flags & B_VMIO) != 0) + bp->b_flags |= B_NOREUSE; + brelse(bp); + BO_RLOCK(bo); + } + return (0); +} + +/* + * Truncate a file's buffer and pages to a specified length. This + * is in lieu of the old vinvalbuf mechanism, which performed unneeded + * sync activity. + */ +int +vtruncbuf(struct vnode *vp, off_t length, int blksize) +{ + struct buf *bp, *nbp; + struct bufobj *bo; + daddr_t startlbn; + + CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, + vp, blksize, (uintmax_t)length); + + /* + * Round up to the *next* lbn. + */ + startlbn = howmany(length, blksize); + + ASSERT_VOP_LOCKED(vp, "vtruncbuf"); + + bo = &vp->v_bufobj; +restart_unlocked: + BO_LOCK(bo); + + while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) + ; + + if (length > 0) { +restartsync: + TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { + if (bp->b_lblkno > 0) + continue; + /* + * Since we hold the vnode lock this should only + * fail if we're racing with the buf daemon. + */ + if (BUF_LOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, + BO_LOCKPTR(bo)) == ENOLCK) + goto restart_unlocked; + + VNASSERT((bp->b_flags & B_DELWRI), vp, + ("buf(%p) on dirty queue without DELWRI", bp)); + + bremfree(bp); + bawrite(bp); + BO_LOCK(bo); + goto restartsync; + } + } + + bufobj_wwait(bo, 0, 0); + BO_UNLOCK(bo); + vnode_pager_setsize(vp, length); + + return (0); +} + +/* + * Invalidate the cached pages of a file's buffer within the range of block + * numbers [startlbn, endlbn). 
+ */ +void +v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, + int blksize) +{ + struct bufobj *bo; + off_t start, end; + + ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); + + start = blksize * startlbn; + end = blksize * endlbn; + + bo = &vp->v_bufobj; + BO_LOCK(bo); + MPASS(blksize == bo->bo_bsize); + + while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) + ; + + BO_UNLOCK(bo); + vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); +} + +static int +v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, + daddr_t startlbn, daddr_t endlbn) +{ + struct buf *bp, *nbp; + bool anyfreed; + + ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); + ASSERT_BO_LOCKED(bo); + + do { + anyfreed = false; + TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { + if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) + continue; + if (BUF_LOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, + BO_LOCKPTR(bo)) == ENOLCK) { + BO_LOCK(bo); + return (EAGAIN); + } + + bremfree(bp); + bp->b_flags |= B_INVAL | B_RELBUF; + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = true; + + BO_LOCK(bo); + if (nbp != NULL && + (((nbp->b_xflags & BX_VNCLEAN) == 0) || + nbp->b_vp != vp || + (nbp->b_flags & B_DELWRI) != 0)) + return (EAGAIN); + } + + TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { + if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) + continue; + if (BUF_LOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, + BO_LOCKPTR(bo)) == ENOLCK) { + BO_LOCK(bo); + return (EAGAIN); + } + bremfree(bp); + bp->b_flags |= B_INVAL | B_RELBUF; + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = true; + + BO_LOCK(bo); + if (nbp != NULL && + (((nbp->b_xflags & BX_VNDIRTY) == 0) || + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI) == 0)) + return (EAGAIN); + } + } while (anyfreed); + return (0); +} + +static void +buf_vlist_remove(struct buf *bp) +{ + struct bufv *bv; + + KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); + ASSERT_BO_WLOCKED(bp->b_bufobj); + KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != + (BX_VNDIRTY|BX_VNCLEAN), + ("buf_vlist_remove: Buf %p is on two lists", bp)); + if (bp->b_xflags & BX_VNDIRTY) + bv = &bp->b_bufobj->bo_dirty; + else + bv = &bp->b_bufobj->bo_clean; + BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); + TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); + bv->bv_cnt--; + bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); +} + +/* + * Add the buffer to the sorted clean or dirty block list. + * + * NOTE: xflags is passed as a constant, optimizing this inline function! + */ +static void +buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) +{ + struct bufv *bv; + struct buf *n; + int error; + + ASSERT_BO_WLOCKED(bo); + KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, + ("dead bo %p", bo)); + KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, + ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); + bp->b_xflags |= xflags; + if (xflags & BX_VNDIRTY) + bv = &bo->bo_dirty; + else + bv = &bo->bo_clean; + + /* + * Keep the list ordered. Optimize empty list insertion. Assume + * we tend to grow at the tail so lookup_le should usually be cheaper + * than _ge. 
+ */ + if (bv->bv_cnt == 0 || + bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) + TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); + else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) + TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); + else + TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); + error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); + if (error) + panic("buf_vlist_add: Preallocated nodes insufficient."); + bv->bv_cnt++; +} + +/* + * Look up a buffer using the buffer tries. + */ +struct buf * +gbincore(struct bufobj *bo, daddr_t lblkno) +{ + struct buf *bp; + + ASSERT_BO_LOCKED(bo); + bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); + if (bp != NULL) + return (bp); + return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); +} + +/* + * Associate a buffer with a vnode. + */ +void +bgetvp(struct vnode *vp, struct buf *bp) +{ + struct bufobj *bo; + + bo = &vp->v_bufobj; + ASSERT_BO_WLOCKED(bo); + VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); + + CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); + VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, + ("bgetvp: bp already attached! %p", bp)); + + vhold(vp); + bp->b_vp = vp; + bp->b_bufobj = bo; + /* + * Insert onto list for new vnode. + */ + buf_vlist_add(bp, bo, BX_VNCLEAN); +} + +/* + * Disassociate a buffer from a vnode. + */ +void +brelvp(struct buf *bp) +{ + struct bufobj *bo; + struct vnode *vp; + + CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); + + /* + * Delete from old vnode list, if on one. + */ + vp = bp->b_vp; /* XXX */ + bo = bp->b_bufobj; + BO_LOCK(bo); + if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) + buf_vlist_remove(bp); + else + panic("brelvp: Buffer %p not on queue.", bp); + if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { + bo->bo_flag &= ~BO_ONWORKLST; + mtx_lock(&sync_mtx); + LIST_REMOVE(bo, bo_synclist); + syncer_worklist_len--; + mtx_unlock(&sync_mtx); + } + bp->b_vp = NULL; + bp->b_bufobj = NULL; + BO_UNLOCK(bo); + vdrop(vp); +} + +/* + * Add an item to the syncer work queue. 
+ */ +static void +vn_syncer_add_to_worklist(struct bufobj *bo, int delay) +{ + int slot; + + ASSERT_BO_WLOCKED(bo); + + mtx_lock(&sync_mtx); + if (bo->bo_flag & BO_ONWORKLST) + LIST_REMOVE(bo, bo_synclist); + else { + bo->bo_flag |= BO_ONWORKLST; + syncer_worklist_len++; + } + + if (delay > syncer_maxdelay - 2) + delay = syncer_maxdelay - 2; + slot = (syncer_delayno + delay) & syncer_mask; + + LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); + mtx_unlock(&sync_mtx); +} + +static int +sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) +{ + int error, len; + + mtx_lock(&sync_mtx); + len = syncer_worklist_len - sync_vnode_count; + mtx_unlock(&sync_mtx); + error = SYSCTL_OUT(req, &len, sizeof(len)); + return (error); +} + +SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, + sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); + +static struct proc *updateproc; +static void sched_sync(void); +static struct kproc_desc up_kp = { + "syncer", + sched_sync, + &updateproc +}; +SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); + +static int +sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) +{ + struct vnode *vp; + struct mount *mp; + + *bo = LIST_FIRST(slp); + if (*bo == NULL) + return (0); + vp = bo2vnode(*bo); + if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) + return (1); + /* + * We use vhold in case the vnode does not + * successfully sync. vhold prevents the vnode from + * going away when we unlock the sync_mtx so that + * we can acquire the vnode interlock. + */ + vholdl(vp); + mtx_unlock(&sync_mtx); + VI_UNLOCK(vp); + if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { + vdrop(vp); + mtx_lock(&sync_mtx); + return (*bo == LIST_FIRST(slp)); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + (void) VOP_FSYNC(vp, MNT_LAZY, td); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + BO_LOCK(*bo); + if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { + /* + * Put us back on the worklist. The worklist + * routine will remove us from our current + * position and then add us back in at a later + * position. + */ + vn_syncer_add_to_worklist(*bo, syncdelay); + } + BO_UNLOCK(*bo); + vdrop(vp); + mtx_lock(&sync_mtx); + return (0); +} + +static int first_printf = 1; + +/* + * System filesystem synchronizer daemon. + */ +static void +sched_sync(void) +{ + struct synclist *next, *slp; + struct bufobj *bo; + long starttime; + struct thread *td = curthread; + int last_work_seen; + int net_worklist_len; + int syncer_final_iter; + int error; + + last_work_seen = 0; + syncer_final_iter = 0; + syncer_state = SYNCER_RUNNING; + starttime = time_uptime; + td->td_pflags |= TDP_NORUNNINGBUF; + + EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, + SHUTDOWN_PRI_LAST); + + mtx_lock(&sync_mtx); + for (;;) { + if (syncer_state == SYNCER_FINAL_DELAY && + syncer_final_iter == 0) { + mtx_unlock(&sync_mtx); + kproc_suspend_check(td->td_proc); + mtx_lock(&sync_mtx); + } + net_worklist_len = syncer_worklist_len - sync_vnode_count; + if (syncer_state != SYNCER_RUNNING && + starttime != time_uptime) { + if (first_printf) { + printf("\nSyncing disks, vnodes remaining... "); + first_printf = 0; + } + printf("%d ", net_worklist_len); + } + starttime = time_uptime; + + /* + * Push files whose dirty time has expired. Be careful + * of interrupt race on slp queue. + * + * Skip over empty worklist slots when shutting down. 
+ */ + do { + slp = &syncer_workitem_pending[syncer_delayno]; + syncer_delayno += 1; + if (syncer_delayno == syncer_maxdelay) + syncer_delayno = 0; + next = &syncer_workitem_pending[syncer_delayno]; + /* + * If the worklist has wrapped since the + * it was emptied of all but syncer vnodes, + * switch to the FINAL_DELAY state and run + * for one more second. + */ + if (syncer_state == SYNCER_SHUTTING_DOWN && + net_worklist_len == 0 && + last_work_seen == syncer_delayno) { + syncer_state = SYNCER_FINAL_DELAY; + syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; + } + } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && + syncer_worklist_len > 0); + + /* + * Keep track of the last time there was anything + * on the worklist other than syncer vnodes. + * Return to the SHUTTING_DOWN state if any + * new work appears. + */ + if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) + last_work_seen = syncer_delayno; + if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) + syncer_state = SYNCER_SHUTTING_DOWN; + while (!LIST_EMPTY(slp)) { + error = sync_vnode(slp, &bo, td); + if (error == 1) { + LIST_REMOVE(bo, bo_synclist); + LIST_INSERT_HEAD(next, bo, bo_synclist); + continue; + } + + if (first_printf == 0) { + /* + * Drop the sync mutex, because some watchdog + * drivers need to sleep while patting + */ + mtx_unlock(&sync_mtx); + wdog_kern_pat(WD_LASTVAL); + mtx_lock(&sync_mtx); + } + + } + if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) + syncer_final_iter--; + /* + * The variable rushjob allows the kernel to speed up the + * processing of the filesystem syncer process. A rushjob + * value of N tells the filesystem syncer to process the next + * N seconds worth of work on its queue ASAP. Currently rushjob + * is used by the soft update code to speed up the filesystem + * syncer process when the incore state is getting so far + * ahead of the disk that the kernel memory pool is being + * threatened with exhaustion. + */ + if (rushjob > 0) { + rushjob -= 1; + continue; + } + /* + * Just sleep for a short period of time between + * iterations when shutting down to allow some I/O + * to happen. + * + * If it has taken us less than a second to process the + * current work, then wait. Otherwise start right over + * again. We can still lose time if any single round + * takes more than two seconds, but it does not really + * matter as we are just trying to generally pace the + * filesystem activity. + */ + if (syncer_state != SYNCER_RUNNING || + time_uptime == starttime) { + thread_lock(td); + sched_prio(td, PPAUSE); + thread_unlock(td); + } + if (syncer_state != SYNCER_RUNNING) + cv_timedwait(&sync_wakeup, &sync_mtx, + hz / SYNCER_SHUTDOWN_SPEEDUP); + else if (time_uptime == starttime) + cv_timedwait(&sync_wakeup, &sync_mtx, hz); + } +} + +/* + * Request the syncer daemon to speed up its work. + * We never push it to speed up more than half of its + * normal turn time, otherwise it could take over the cpu. + */ +int +speedup_syncer(void) +{ + int ret = 0; + + mtx_lock(&sync_mtx); + if (rushjob < syncdelay / 2) { + rushjob += 1; + stat_rush_requests += 1; + ret = 1; + } + mtx_unlock(&sync_mtx); + cv_broadcast(&sync_wakeup); + return (ret); +} + +/* + * Tell the syncer to speed up its work and run though its work + * list several times, then tell it to shut down. 
+ */ +static void +syncer_shutdown(void *arg, int howto) +{ + + if (howto & RB_NOSYNC) + return; + mtx_lock(&sync_mtx); + syncer_state = SYNCER_SHUTTING_DOWN; + rushjob = 0; + mtx_unlock(&sync_mtx); + cv_broadcast(&sync_wakeup); + kproc_shutdown(arg, howto); +} + +void +syncer_suspend(void) +{ + + syncer_shutdown(updateproc, 0); +} + +void +syncer_resume(void) +{ + + mtx_lock(&sync_mtx); + first_printf = 1; + syncer_state = SYNCER_RUNNING; + mtx_unlock(&sync_mtx); + cv_broadcast(&sync_wakeup); + kproc_resume(updateproc); +} + +/* + * Reassign a buffer from one vnode to another. + * Used to assign file specific control information + * (indirect blocks) to the vnode to which they belong. + */ +void +reassignbuf(struct buf *bp) +{ + struct vnode *vp; + struct bufobj *bo; + int delay; +#ifdef INVARIANTS + struct bufv *bv; +#endif + + vp = bp->b_vp; + bo = bp->b_bufobj; + ++reassignbufcalls; + + CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", + bp, bp->b_vp, bp->b_flags); + /* + * B_PAGING flagged buffers cannot be reassigned because their vp + * is not fully linked in. + */ + if (bp->b_flags & B_PAGING) + panic("cannot reassign paging buffer"); + + /* + * Delete from old vnode list, if on one. + */ + BO_LOCK(bo); + if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) + buf_vlist_remove(bp); + else + panic("reassignbuf: Buffer %p not on queue.", bp); + /* + * If dirty, put on list of dirty buffers; otherwise insert onto list + * of clean buffers. + */ + if (bp->b_flags & B_DELWRI) { + if ((bo->bo_flag & BO_ONWORKLST) == 0) { + switch (vp->v_type) { + case VDIR: + delay = dirdelay; + break; + case VCHR: + delay = metadelay; + break; + default: + delay = filedelay; + } + vn_syncer_add_to_worklist(bo, delay); + } + buf_vlist_add(bp, bo, BX_VNDIRTY); + } else { + buf_vlist_add(bp, bo, BX_VNCLEAN); + + if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { + mtx_lock(&sync_mtx); + LIST_REMOVE(bo, bo_synclist); + syncer_worklist_len--; + mtx_unlock(&sync_mtx); + bo->bo_flag &= ~BO_ONWORKLST; + } + } +#ifdef INVARIANTS + bv = &bo->bo_clean; + bp = TAILQ_FIRST(&bv->bv_hd); + KASSERT(bp == NULL || bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); + bp = TAILQ_LAST(&bv->bv_hd, buflists); + KASSERT(bp == NULL || bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); + bv = &bo->bo_dirty; + bp = TAILQ_FIRST(&bv->bv_hd); + KASSERT(bp == NULL || bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); + bp = TAILQ_LAST(&bv->bv_hd, buflists); + KASSERT(bp == NULL || bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); +#endif + BO_UNLOCK(bo); +} + +static void +v_init_counters(struct vnode *vp) +{ + + VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, + vp, ("%s called for an initialized vnode", __FUNCTION__)); + ASSERT_VI_UNLOCKED(vp, __FUNCTION__); + + refcount_init(&vp->v_holdcnt, 1); + refcount_init(&vp->v_usecount, 1); +} + +static void +v_incr_usecount_locked(struct vnode *vp) +{ + + ASSERT_VI_LOCKED(vp, __func__); + if ((vp->v_iflag & VI_OWEINACT) != 0) { + VNASSERT(vp->v_usecount == 0, vp, + ("vnode with usecount and VI_OWEINACT set")); + vp->v_iflag &= ~VI_OWEINACT; + } + refcount_acquire(&vp->v_usecount); + v_incr_devcount(vp); +} + +/* + * Increment the use count on the vnode, taking care to reference + * the driver's usecount if this is a chardev. 
+ */ +static void +v_incr_usecount(struct vnode *vp) +{ + + ASSERT_VI_UNLOCKED(vp, __func__); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + + if (vp->v_type != VCHR && + refcount_acquire_if_not_zero(&vp->v_usecount)) { + VNODE_REFCOUNT_FENCE_ACQ(); + VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, + ("vnode with usecount and VI_OWEINACT set")); + } else { + VI_LOCK(vp); + v_incr_usecount_locked(vp); + VI_UNLOCK(vp); + } +} + +/* + * Increment si_usecount of the associated device, if any. + */ +static void +v_incr_devcount(struct vnode *vp) +{ + + ASSERT_VI_LOCKED(vp, __FUNCTION__); + if (vp->v_type == VCHR && vp->v_rdev != NULL) { + dev_lock(); + vp->v_rdev->si_usecount++; + dev_unlock(); + } +} + +/* + * Decrement si_usecount of the associated device, if any. + */ +static void +v_decr_devcount(struct vnode *vp) +{ + + ASSERT_VI_LOCKED(vp, __FUNCTION__); + if (vp->v_type == VCHR && vp->v_rdev != NULL) { + dev_lock(); + vp->v_rdev->si_usecount--; + dev_unlock(); + } +} + +/* + * Grab a particular vnode from the free list, increment its + * reference count and lock it. VI_DOOMED is set if the vnode + * is being destroyed. Only callers who specify LK_RETRY will + * see doomed vnodes. If inactive processing was delayed in + * vput try to do it here. + * + * Notes on lockless counter manipulation: + * _vhold, vputx and other routines make various decisions based + * on either holdcnt or usecount being 0. As long as either counter + * is not transitioning 0->1 nor 1->0, the manipulation can be done + * with atomic operations. Otherwise the interlock is taken covering + * both the atomic and additional actions. + */ +int +vget(struct vnode *vp, int flags, struct thread *td) +{ + int error, oweinact; + + VNASSERT((flags & LK_TYPE_MASK) != 0, vp, + ("vget: invalid lock operation")); + + if ((flags & LK_INTERLOCK) != 0) + ASSERT_VI_LOCKED(vp, __func__); + else + ASSERT_VI_UNLOCKED(vp, __func__); + if ((flags & LK_VNHELD) != 0) + VNASSERT((vp->v_holdcnt > 0), vp, + ("vget: LK_VNHELD passed but vnode not held")); + + CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); + + if ((flags & LK_VNHELD) == 0) + _vhold(vp, (flags & LK_INTERLOCK) != 0); + + if ((error = vn_lock(vp, flags)) != 0) { + vdrop(vp); + CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, + vp); + return (error); + } + if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) + panic("vget: vn_lock failed to return ENOENT\n"); + /* + * We don't guarantee that any particular close will + * trigger inactive processing so just make a best effort + * here at preventing a reference to a removed file. If + * we don't succeed no harm is done. + * + * Upgrade our holdcnt to a usecount. + */ + if (vp->v_type == VCHR || + !refcount_acquire_if_not_zero(&vp->v_usecount)) { + VI_LOCK(vp); + if ((vp->v_iflag & VI_OWEINACT) == 0) { + oweinact = 0; + } else { + oweinact = 1; + vp->v_iflag &= ~VI_OWEINACT; + VNODE_REFCOUNT_FENCE_REL(); + } + refcount_acquire(&vp->v_usecount); + v_incr_devcount(vp); + if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE && + (flags & LK_NOWAIT) == 0) + vinactive(vp, td); + VI_UNLOCK(vp); + } + return (0); +} + +/* + * Increase the reference (use) and hold count of a vnode. + * This will also remove the vnode from the free list if it is presently free. 
+ */ +void +vref(struct vnode *vp) +{ + + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + _vhold(vp, false); + v_incr_usecount(vp); +} + +void +vrefl(struct vnode *vp) +{ + + ASSERT_VI_LOCKED(vp, __func__); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + _vhold(vp, true); + v_incr_usecount_locked(vp); +} + +void +vrefact(struct vnode *vp) +{ + + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + if (__predict_false(vp->v_type == VCHR)) { + VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp, + ("%s: wrong ref counts", __func__)); + vref(vp); + return; + } +#ifdef INVARIANTS + int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); + VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__)); + old = atomic_fetchadd_int(&vp->v_usecount, 1); + VNASSERT(old > 0, vp, ("%s: wrong use count", __func__)); +#else + refcount_acquire(&vp->v_holdcnt); + refcount_acquire(&vp->v_usecount); +#endif +} + +/* + * Return reference count of a vnode. + * + * The results of this call are only guaranteed when some mechanism is used to + * stop other processes from gaining references to the vnode. This may be the + * case if the caller holds the only reference. This is also useful when stale + * data is acceptable as race conditions may be accounted for by some other + * means. + */ +int +vrefcnt(struct vnode *vp) +{ + + return (vp->v_usecount); +} + +#define VPUTX_VRELE 1 +#define VPUTX_VPUT 2 +#define VPUTX_VUNREF 3 + +/* + * Decrement the use and hold counts for a vnode. + * + * See an explanation near vget() as to why atomic operation is safe. + */ +static void +vputx(struct vnode *vp, int func) +{ + int error; + + KASSERT(vp != NULL, ("vputx: null vp")); + if (func == VPUTX_VUNREF) + ASSERT_VOP_LOCKED(vp, "vunref"); + else if (func == VPUTX_VPUT) + ASSERT_VOP_LOCKED(vp, "vput"); + else + KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); + ASSERT_VI_UNLOCKED(vp, __func__); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + + if (vp->v_type != VCHR && + refcount_release_if_not_last(&vp->v_usecount)) { + if (func == VPUTX_VPUT) + VOP_UNLOCK(vp, 0); + vdrop(vp); + return; + } + + VI_LOCK(vp); + + /* + * We want to hold the vnode until the inactive finishes to + * prevent vgone() races. We drop the use count here and the + * hold count below when we're done. + */ + if (!refcount_release(&vp->v_usecount) || + (vp->v_iflag & VI_DOINGINACT)) { + if (func == VPUTX_VPUT) + VOP_UNLOCK(vp, 0); + v_decr_devcount(vp); + vdropl(vp); + return; + } + + v_decr_devcount(vp); + + error = 0; + + if (vp->v_usecount != 0) { + vn_printf(vp, "vputx: usecount not zero for vnode "); + panic("vputx: usecount not zero"); + } + + CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); + + /* + * We must call VOP_INACTIVE with the node locked. Mark + * as VI_DOINGINACT to avoid recursion. + */ + vp->v_iflag |= VI_OWEINACT; + switch (func) { + case VPUTX_VRELE: + error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); + VI_LOCK(vp); + break; + case VPUTX_VPUT: + if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { + error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | + LK_NOWAIT); + VI_LOCK(vp); + } + break; + case VPUTX_VUNREF: + if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { + error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); + VI_LOCK(vp); + } + break; + } + VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp, + ("vnode with usecount and VI_OWEINACT set")); + if (error == 0) { + if (vp->v_iflag & VI_OWEINACT) + vinactive(vp, curthread); + if (func != VPUTX_VUNREF) + VOP_UNLOCK(vp, 0); + } + vdropl(vp); +} + +/* + * Vnode put/release. 
+ * If count drops to zero, call inactive routine and return to freelist. + */ +void +vrele(struct vnode *vp) +{ + + vputx(vp, VPUTX_VRELE); +} + +/* + * Release an already locked vnode. This give the same effects as + * unlock+vrele(), but takes less time and avoids releasing and + * re-aquiring the lock (as vrele() acquires the lock internally.) + */ +void +vput(struct vnode *vp) +{ + + vputx(vp, VPUTX_VPUT); +} + +/* + * Release an exclusively locked vnode. Do not unlock the vnode lock. + */ +void +vunref(struct vnode *vp) +{ + + vputx(vp, VPUTX_VUNREF); +} + +/* + * Increase the hold count and activate if this is the first reference. + */ +void +_vhold(struct vnode *vp, bool locked) +{ + struct mount *mp; + + if (locked) + ASSERT_VI_LOCKED(vp, __func__); + else + ASSERT_VI_UNLOCKED(vp, __func__); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + if (!locked) { + if (refcount_acquire_if_not_zero(&vp->v_holdcnt)) { + VNODE_REFCOUNT_FENCE_ACQ(); + VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, + ("_vhold: vnode with holdcnt is free")); + return; + } + VI_LOCK(vp); + } + if ((vp->v_iflag & VI_FREE) == 0) { + refcount_acquire(&vp->v_holdcnt); + if (!locked) + VI_UNLOCK(vp); + return; + } + VNASSERT(vp->v_holdcnt == 0, vp, + ("%s: wrong hold count", __func__)); + VNASSERT(vp->v_op != NULL, vp, + ("%s: vnode already reclaimed.", __func__)); + /* + * Remove a vnode from the free list, mark it as in use, + * and put it on the active list. + */ + VNASSERT(vp->v_mount != NULL, vp, + ("_vhold: vnode not on per mount vnode list")); + mp = vp->v_mount; + mtx_lock(&mp->mnt_listmtx); + if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) { + TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist); + mp->mnt_tmpfreevnodelistsize--; + vp->v_mflag &= ~VMP_TMPMNTFREELIST; + } else { + mtx_lock(&vnode_free_list_mtx); + TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); + freevnodes--; + mtx_unlock(&vnode_free_list_mtx); + } + KASSERT((vp->v_iflag & VI_ACTIVE) == 0, + ("Activating already active vnode")); + vp->v_iflag &= ~VI_FREE; + vp->v_iflag |= VI_ACTIVE; + TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); + mp->mnt_activevnodelistsize++; + mtx_unlock(&mp->mnt_listmtx); + refcount_acquire(&vp->v_holdcnt); + if (!locked) + VI_UNLOCK(vp); +} + +/* + * Drop the hold count of the vnode. If this is the last reference to + * the vnode we place it on the free list unless it has been vgone'd + * (marked VI_DOOMED) in which case we will free it. + * + * Because the vnode vm object keeps a hold reference on the vnode if + * there is at least one resident non-cached page, the vnode cannot + * leave the active list without the page cleanup done. + */ +void +_vdrop(struct vnode *vp, bool locked) +{ + struct bufobj *bo; + struct mount *mp; + int active; + + if (locked) + ASSERT_VI_LOCKED(vp, __func__); + else + ASSERT_VI_UNLOCKED(vp, __func__); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + if ((int)vp->v_holdcnt <= 0) + panic("vdrop: holdcnt %d", vp->v_holdcnt); + if (!locked) { + if (refcount_release_if_not_last(&vp->v_holdcnt)) + return; + VI_LOCK(vp); + } + if (refcount_release(&vp->v_holdcnt) == 0) { + VI_UNLOCK(vp); + return; + } + if ((vp->v_iflag & VI_DOOMED) == 0) { + /* + * Mark a vnode as free: remove it from its active list + * and put it up for recycling on the freelist. 
+ */ + VNASSERT(vp->v_op != NULL, vp, + ("vdropl: vnode already reclaimed.")); + VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, + ("vnode already free")); + VNASSERT(vp->v_holdcnt == 0, vp, + ("vdropl: freeing when we shouldn't")); + active = vp->v_iflag & VI_ACTIVE; + if ((vp->v_iflag & VI_OWEINACT) == 0) { + vp->v_iflag &= ~VI_ACTIVE; + mp = vp->v_mount; + if (mp != NULL) { + mtx_lock(&mp->mnt_listmtx); + if (active) { + TAILQ_REMOVE(&mp->mnt_activevnodelist, + vp, v_actfreelist); + mp->mnt_activevnodelistsize--; + } + TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, + vp, v_actfreelist); + mp->mnt_tmpfreevnodelistsize++; + vp->v_iflag |= VI_FREE; + vp->v_mflag |= VMP_TMPMNTFREELIST; + VI_UNLOCK(vp); + if (mp->mnt_tmpfreevnodelistsize >= + mnt_free_list_batch) + vnlru_return_batch_locked(mp); + mtx_unlock(&mp->mnt_listmtx); + } else { + VNASSERT(active == 0, vp, + ("vdropl: active vnode not on per mount " + "vnode list")); + mtx_lock(&vnode_free_list_mtx); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, + v_actfreelist); + freevnodes++; + vp->v_iflag |= VI_FREE; + VI_UNLOCK(vp); + mtx_unlock(&vnode_free_list_mtx); + } + } else { + VI_UNLOCK(vp); + counter_u64_add(free_owe_inact, 1); + } + return; + } + /* + * The vnode has been marked for destruction, so free it. + * + * The vnode will be returned to the zone where it will + * normally remain until it is needed for another vnode. We + * need to cleanup (or verify that the cleanup has already + * been done) any residual data left from its current use + * so as not to contaminate the freshly allocated vnode. + */ + CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); + atomic_subtract_long(&numvnodes, 1); + bo = &vp->v_bufobj; + VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, + ("cleaned vnode still on the free list.")); + VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); + VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); + VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); + VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); + VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); + VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); + VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, + ("clean blk trie not empty")); + VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); + VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, + ("dirty blk trie not empty")); + VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); + VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); + VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); + VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, + ("Dangling rangelock waiters")); + VI_UNLOCK(vp); +#ifdef MAC + mac_vnode_destroy(vp); +#endif + if (vp->v_pollinfo != NULL) { + destroy_vpollinfo(vp->v_pollinfo); + vp->v_pollinfo = NULL; + } +#ifdef INVARIANTS + /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ + vp->v_op = NULL; +#endif + vp->v_mountedhere = NULL; + vp->v_unpcb = NULL; + vp->v_rdev = NULL; + vp->v_fifoinfo = NULL; + vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + vp->v_iflag = 0; + vp->v_vflag = 0; + bo->bo_flag = 0; + uma_zfree(vnode_zone, vp); +} + +/* + * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT + * flags. DOINGINACT prevents us from recursing in calls to vinactive. + * OWEINACT tracks whether a vnode missed a call to inactive due to a + * failed lock upgrade. 
+ */ +void +vinactive(struct vnode *vp, struct thread *td) +{ + struct vm_object *obj; + + ASSERT_VOP_ELOCKED(vp, "vinactive"); + ASSERT_VI_LOCKED(vp, "vinactive"); + VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, + ("vinactive: recursed on VI_DOINGINACT")); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + vp->v_iflag |= VI_DOINGINACT; + vp->v_iflag &= ~VI_OWEINACT; + VI_UNLOCK(vp); + /* + * Before moving off the active list, we must be sure that any + * modified pages are converted into the vnode's dirty + * buffers, since these will no longer be checked once the + * vnode is on the inactive list. + * + * The write-out of the dirty pages is asynchronous. At the + * point that VOP_INACTIVE() is called, there could still be + * pending I/O and dirty pages in the object. + */ + if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && + (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { + VM_OBJECT_WLOCK(obj); + vm_object_page_clean(obj, 0, 0, 0); + VM_OBJECT_WUNLOCK(obj); + } + VOP_INACTIVE(vp, td); + VI_LOCK(vp); + VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, + ("vinactive: lost VI_DOINGINACT")); + vp->v_iflag &= ~VI_DOINGINACT; +} + +/* + * Remove any vnodes in the vnode table belonging to mount point mp. + * + * If FORCECLOSE is not specified, there should not be any active ones, + * return error if any are found (nb: this is a user error, not a + * system error). If FORCECLOSE is specified, detach any active vnodes + * that are found. + * + * If WRITECLOSE is set, only flush out regular file vnodes open for + * writing. + * + * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. + * + * `rootrefs' specifies the base reference count for the root vnode + * of this filesystem. The root vnode is considered busy if its + * v_usecount exceeds this value. On a successful return, vflush(, td) + * will call vrele() on the root vnode exactly rootrefs times. + * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must + * be zero. + */ +#ifdef DIAGNOSTIC +static int busyprt = 0; /* print out busy vnodes */ +SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); +#endif + +int +vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) +{ + struct vnode *vp, *mvp, *rootvp = NULL; + struct vattr vattr; + int busy = 0, error; + + CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, + rootrefs, flags); + if (rootrefs > 0) { + KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, + ("vflush: bad args")); + /* + * Get the filesystem root vnode. We can vput() it + * immediately, since with rootrefs > 0, it won't go away. + */ + if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { + CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", + __func__, error); + return (error); + } + vput(rootvp); + } +loop: + MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { + vholdl(vp); + error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); + if (error) { + vdrop(vp); + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + goto loop; + } + /* + * Skip over a vnodes marked VV_SYSTEM. + */ + if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { + VOP_UNLOCK(vp, 0); + vdrop(vp); + continue; + } + /* + * If WRITECLOSE is set, flush out unlinked but still open + * files (even if open only for reading) and regular file + * vnodes open for writing. 
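+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: the usual caller of vflush() is a filesystem's
+ * unmount path, along the lines of
+ *
+ *	flags = (mntflags & MNT_FORCE) != 0 ? FORCECLOSE : 0;
+ *	error = vflush(mp, 0, flags, td);
+ *	if (error != 0)
+ *		return (error);
+ *
+ * while a downgrade to read-only typically passes WRITECLOSE.)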
+ */ + if (flags & WRITECLOSE) { + if (vp->v_object != NULL) { + VM_OBJECT_WLOCK(vp->v_object); + vm_object_page_clean(vp->v_object, 0, 0, 0); + VM_OBJECT_WUNLOCK(vp->v_object); + } + error = VOP_FSYNC(vp, MNT_WAIT, td); + if (error != 0) { + VOP_UNLOCK(vp, 0); + vdrop(vp); + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + return (error); + } + error = VOP_GETATTR(vp, &vattr, td->td_ucred); + VI_LOCK(vp); + + if ((vp->v_type == VNON || + (error == 0 && vattr.va_nlink > 0)) && + (vp->v_writecount <= 0 || vp->v_type != VREG)) { + VOP_UNLOCK(vp, 0); + vdropl(vp); + continue; + } + } else + VI_LOCK(vp); + /* + * With v_usecount == 0, all we need to do is clear out the + * vnode data structures and we are done. + * + * If FORCECLOSE is set, forcibly close the vnode. + */ + if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { + vgonel(vp); + } else { + busy++; +#ifdef DIAGNOSTIC + if (busyprt) + vn_printf(vp, "vflush: busy vnode "); +#endif + } + VOP_UNLOCK(vp, 0); + vdropl(vp); + } + if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { + /* + * If just the root vnode is busy, and if its refcount + * is equal to `rootrefs', then go ahead and kill it. + */ + VI_LOCK(rootvp); + KASSERT(busy > 0, ("vflush: not busy")); + VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, + ("vflush: usecount %d < rootrefs %d", + rootvp->v_usecount, rootrefs)); + if (busy == 1 && rootvp->v_usecount == rootrefs) { + VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); + vgone(rootvp); + VOP_UNLOCK(rootvp, 0); + busy = 0; + } else + VI_UNLOCK(rootvp); + } + if (busy) { + CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, + busy); + return (EBUSY); + } + for (; rootrefs > 0; rootrefs--) + vrele(rootvp); + return (0); +} + +/* + * Recycle an unused vnode to the front of the free list. + */ +int +vrecycle(struct vnode *vp) +{ + int recycled; + + VI_LOCK(vp); + recycled = vrecyclel(vp); + VI_UNLOCK(vp); + return (recycled); +} + +/* + * vrecycle, with the vp interlock held. + */ +int +vrecyclel(struct vnode *vp) +{ + int recycled; + + ASSERT_VOP_ELOCKED(vp, __func__); + ASSERT_VI_LOCKED(vp, __func__); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + recycled = 0; + if (vp->v_usecount == 0) { + recycled = 1; + vgonel(vp); + } + return (recycled); +} + +/* + * Eliminate all activity associated with a vnode + * in preparation for reuse. + */ +void +vgone(struct vnode *vp) +{ + VI_LOCK(vp); + vgonel(vp); + VI_UNLOCK(vp); +} + +static void +notify_lowervp_vfs_dummy(struct mount *mp __unused, + struct vnode *lowervp __unused) +{ +} + +/* + * Notify upper mounts about reclaimed or unlinked vnode. 
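+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: a stacking filesystem receives these notifications
+ * by providing the corresponding vfsops hooks, for example
+ *
+ *	static struct vfsops foofs_vfsops = {
+ *		...
+ *		.vfs_reclaim_lowervp =	foofs_reclaim_lowervp,
+ *		.vfs_unlink_lowervp =	foofs_unlink_lowervp,
+ *	};
+ *
+ * where foofs is a hypothetical upper layer and both callbacks take
+ * a (struct mount *, struct vnode *) pair, as the dummy handler
+ * notify_lowervp_vfs_dummy() above illustrates.)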
+ */ +void +vfs_notify_upper(struct vnode *vp, int event) +{ + static struct vfsops vgonel_vfsops = { + .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, + .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, + }; + struct mount *mp, *ump, *mmp; + + mp = vp->v_mount; + if (mp == NULL) + return; + + MNT_ILOCK(mp); + if (TAILQ_EMPTY(&mp->mnt_uppers)) + goto unlock; + MNT_IUNLOCK(mp); + mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); + mmp->mnt_op = &vgonel_vfsops; + mmp->mnt_kern_flag |= MNTK_MARKER; + MNT_ILOCK(mp); + mp->mnt_kern_flag |= MNTK_VGONE_UPPER; + for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { + if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { + ump = TAILQ_NEXT(ump, mnt_upper_link); + continue; + } + TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); + MNT_IUNLOCK(mp); + switch (event) { + case VFS_NOTIFY_UPPER_RECLAIM: + VFS_RECLAIM_LOWERVP(ump, vp); + break; + case VFS_NOTIFY_UPPER_UNLINK: + VFS_UNLINK_LOWERVP(ump, vp); + break; + default: + KASSERT(0, ("invalid event %d", event)); + break; + } + MNT_ILOCK(mp); + ump = TAILQ_NEXT(mmp, mnt_upper_link); + TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); + } + free(mmp, M_TEMP); + mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; + if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { + mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; + wakeup(&mp->mnt_uppers); + } +unlock: + MNT_IUNLOCK(mp); +} + +/* + * vgone, with the vp interlock held. + */ +static void +vgonel(struct vnode *vp) +{ + struct thread *td; + int oweinact; + int active; + struct mount *mp; + + ASSERT_VOP_ELOCKED(vp, "vgonel"); + ASSERT_VI_LOCKED(vp, "vgonel"); + VNASSERT(vp->v_holdcnt, vp, + ("vgonel: vp %p has no reference.", vp)); + CTR2(KTR_VFS, "%s: vp %p", __func__, vp); + td = curthread; + + /* + * Don't vgonel if we're already doomed. + */ + if (vp->v_iflag & VI_DOOMED) + return; + vp->v_iflag |= VI_DOOMED; + + /* + * Check to see if the vnode is in use. If so, we have to call + * VOP_CLOSE() and VOP_INACTIVE(). + */ + active = vp->v_usecount; + oweinact = (vp->v_iflag & VI_OWEINACT); + VI_UNLOCK(vp); + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); + + /* + * If purging an active vnode, it must be closed and + * deactivated before being reclaimed. + */ + if (active) + VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); + if (oweinact || active) { + VI_LOCK(vp); + if ((vp->v_iflag & VI_DOINGINACT) == 0) + vinactive(vp, td); + VI_UNLOCK(vp); + } + if (vp->v_type == VSOCK) + vfs_unp_reclaim(vp); + + /* + * Clean out any buffers associated with the vnode. + * If the flush fails, just toss the buffers. + */ + mp = NULL; + if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) + (void) vn_start_secondary_write(vp, &mp, V_WAIT); + if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { + while (vinvalbuf(vp, 0, 0, 0) != 0) + ; + } + + BO_LOCK(&vp->v_bufobj); + KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && + vp->v_bufobj.bo_dirty.bv_cnt == 0 && + TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && + vp->v_bufobj.bo_clean.bv_cnt == 0, + ("vp %p bufobj not invalidated", vp)); + + /* + * For VMIO bufobj, BO_DEAD is set in vm_object_terminate() + * after the object's page queue is flushed. + */ + if (vp->v_bufobj.bo_object == NULL) + vp->v_bufobj.bo_flag |= BO_DEAD; + BO_UNLOCK(&vp->v_bufobj); + + /* + * Reclaim the vnode. 
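+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: a typical VOP_RECLAIM implementation detaches the
+ * filesystem-private state so that the assertions in _vdrop() hold,
+ * roughly
+ *
+ *	static int
+ *	foofs_reclaim(struct vop_reclaim_args *ap)
+ *	{
+ *		struct vnode *vp = ap->a_vp;
+ *
+ *		vfs_hash_remove(vp);
+ *		free(vp->v_data, M_FOONODE);
+ *		vp->v_data = NULL;
+ *		return (0);
+ *	}
+ *
+ * with foofs and M_FOONODE as hypothetical names.)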
+ */ + if (VOP_RECLAIM(vp, td)) + panic("vgone: cannot reclaim"); + if (mp != NULL) + vn_finished_secondary_write(mp); + VNASSERT(vp->v_object == NULL, vp, + ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); + /* + * Clear the advisory locks and wake up waiting threads. + */ + (void)VOP_ADVLOCKPURGE(vp); + vp->v_lockf = NULL; + /* + * Delete from old mount point vnode list. + */ + delmntque(vp); + cache_purge(vp); + /* + * Done with purge, reset to the standard lock and invalidate + * the vnode. + */ + VI_LOCK(vp); + vp->v_vnlock = &vp->v_lock; + vp->v_op = &dead_vnodeops; + vp->v_tag = "none"; + vp->v_type = VBAD; +} + +/* + * Calculate the total number of references to a special device. + */ +int +vcount(struct vnode *vp) +{ + int count; + + dev_lock(); + count = vp->v_rdev->si_usecount; + dev_unlock(); + return (count); +} + +/* + * Same as above, but using the struct cdev *as argument + */ +int +count_dev(struct cdev *dev) +{ + int count; + + dev_lock(); + count = dev->si_usecount; + dev_unlock(); + return(count); +} + +/* + * Print out a description of a vnode. + */ +static char *typename[] = +{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", + "VMARKER"}; + +void +vn_printf(struct vnode *vp, const char *fmt, ...) +{ + va_list ap; + char buf[256], buf2[16]; + u_long flags; + + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); + printf("%p: ", (void *)vp); + printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); + printf(" usecount %d, writecount %d, refcount %d", + vp->v_usecount, vp->v_writecount, vp->v_holdcnt); + switch (vp->v_type) { + case VDIR: + printf(" mountedhere %p\n", vp->v_mountedhere); + break; + case VCHR: + printf(" rdev %p\n", vp->v_rdev); + break; + case VSOCK: + printf(" socket %p\n", vp->v_unpcb); + break; + case VFIFO: + printf(" fifoinfo %p\n", vp->v_fifoinfo); + break; + default: + printf("\n"); + break; + } + buf[0] = '\0'; + buf[1] = '\0'; + if (vp->v_vflag & VV_ROOT) + strlcat(buf, "|VV_ROOT", sizeof(buf)); + if (vp->v_vflag & VV_ISTTY) + strlcat(buf, "|VV_ISTTY", sizeof(buf)); + if (vp->v_vflag & VV_NOSYNC) + strlcat(buf, "|VV_NOSYNC", sizeof(buf)); + if (vp->v_vflag & VV_ETERNALDEV) + strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); + if (vp->v_vflag & VV_CACHEDLABEL) + strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); + if (vp->v_vflag & VV_COPYONWRITE) + strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); + if (vp->v_vflag & VV_SYSTEM) + strlcat(buf, "|VV_SYSTEM", sizeof(buf)); + if (vp->v_vflag & VV_PROCDEP) + strlcat(buf, "|VV_PROCDEP", sizeof(buf)); + if (vp->v_vflag & VV_NOKNOTE) + strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); + if (vp->v_vflag & VV_DELETED) + strlcat(buf, "|VV_DELETED", sizeof(buf)); + if (vp->v_vflag & VV_MD) + strlcat(buf, "|VV_MD", sizeof(buf)); + if (vp->v_vflag & VV_FORCEINSMQ) + strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); + flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | + VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | + VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); + if (flags != 0) { + snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); + strlcat(buf, buf2, sizeof(buf)); + } + if (vp->v_iflag & VI_MOUNT) + strlcat(buf, "|VI_MOUNT", sizeof(buf)); + if (vp->v_iflag & VI_DOOMED) + strlcat(buf, "|VI_DOOMED", sizeof(buf)); + if (vp->v_iflag & VI_FREE) + strlcat(buf, "|VI_FREE", sizeof(buf)); + if (vp->v_iflag & VI_ACTIVE) + strlcat(buf, "|VI_ACTIVE", sizeof(buf)); + if (vp->v_iflag & VI_DOINGINACT) + strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); + if (vp->v_iflag 
& VI_OWEINACT) + strlcat(buf, "|VI_OWEINACT", sizeof(buf)); + if (vp->v_iflag & VI_TEXT_REF) + strlcat(buf, "|VI_TEXT_REF", sizeof(buf)); + flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE | + VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT | VI_TEXT_REF); + if (flags != 0) { + snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); + strlcat(buf, buf2, sizeof(buf)); + } + printf(" flags (%s)\n", buf + 1); + if (mtx_owned(VI_MTX(vp))) + printf(" VI_LOCKed"); + if (vp->v_object != NULL) + printf(" v_object %p ref %d pages %d " + "cleanbuf %d dirtybuf %d\n", + vp->v_object, vp->v_object->ref_count, + vp->v_object->resident_page_count, + vp->v_bufobj.bo_clean.bv_cnt, + vp->v_bufobj.bo_dirty.bv_cnt); + printf(" "); + lockmgr_printinfo(vp->v_vnlock); + if (vp->v_data != NULL) + VOP_PRINT(vp); +} + +#ifdef DDB +/* + * List all of the locked vnodes in the system. + * Called when debugging the kernel. + */ +DB_SHOW_COMMAND(lockedvnods, lockedvnodes) +{ + struct mount *mp; + struct vnode *vp; + + /* + * Note: because this is DDB, we can't obey the locking semantics + * for these structures, which means we could catch an inconsistent + * state and dereference a nasty pointer. Not much to be done + * about that. + */ + db_printf("Locked vnodes\n"); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { + if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) + vn_printf(vp, "vnode "); + } + } +} + +/* + * Show details about the given vnode. + */ +DB_SHOW_COMMAND(vnode, db_show_vnode) +{ + struct vnode *vp; + + if (!have_addr) + return; + vp = (struct vnode *)addr; + vn_printf(vp, "vnode "); +} + +/* + * Show details about the given mount point. + */ +DB_SHOW_COMMAND(mount, db_show_mount) +{ + struct mount *mp; + struct vfsopt *opt; + struct statfs *sp; + struct vnode *vp; + char buf[512]; + uint64_t mflags; + u_int flags; + + if (!have_addr) { + /* No address given, print short info about all mount points. 
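+ *
+ * (Editor's note, illustrative and not part of the FreeBSD import:
+ * from the DDB prompt the two forms look like
+ *
+ *	db> show mount
+ *	db> show mount 0xfffff80004a31000
+ *
+ * where the address, a made-up value here, is taken from the short
+ * listing printed by the first form.)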
*/ + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + db_printf("%p %s on %s (%s)\n", mp, + mp->mnt_stat.f_mntfromname, + mp->mnt_stat.f_mntonname, + mp->mnt_stat.f_fstypename); + if (db_pager_quit) + break; + } + db_printf("\nMore info: show mount \n"); + return; + } + + mp = (struct mount *)addr; + db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, + mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); + + buf[0] = '\0'; + mflags = mp->mnt_flag; +#define MNT_FLAG(flag) do { \ + if (mflags & (flag)) { \ + if (buf[0] != '\0') \ + strlcat(buf, ", ", sizeof(buf)); \ + strlcat(buf, (#flag) + 4, sizeof(buf)); \ + mflags &= ~(flag); \ + } \ +} while (0) + MNT_FLAG(MNT_RDONLY); + MNT_FLAG(MNT_SYNCHRONOUS); + MNT_FLAG(MNT_NOEXEC); + MNT_FLAG(MNT_NOSUID); + MNT_FLAG(MNT_NFS4ACLS); + MNT_FLAG(MNT_UNION); + MNT_FLAG(MNT_ASYNC); + MNT_FLAG(MNT_SUIDDIR); + MNT_FLAG(MNT_SOFTDEP); + MNT_FLAG(MNT_NOSYMFOLLOW); + MNT_FLAG(MNT_GJOURNAL); + MNT_FLAG(MNT_MULTILABEL); + MNT_FLAG(MNT_ACLS); + MNT_FLAG(MNT_NOATIME); + MNT_FLAG(MNT_NOCLUSTERR); + MNT_FLAG(MNT_NOCLUSTERW); + MNT_FLAG(MNT_SUJ); + MNT_FLAG(MNT_EXRDONLY); + MNT_FLAG(MNT_EXPORTED); + MNT_FLAG(MNT_DEFEXPORTED); + MNT_FLAG(MNT_EXPORTANON); + MNT_FLAG(MNT_EXKERB); + MNT_FLAG(MNT_EXPUBLIC); + MNT_FLAG(MNT_LOCAL); + MNT_FLAG(MNT_QUOTA); + MNT_FLAG(MNT_ROOTFS); + MNT_FLAG(MNT_USER); + MNT_FLAG(MNT_IGNORE); + MNT_FLAG(MNT_UPDATE); + MNT_FLAG(MNT_DELEXPORT); + MNT_FLAG(MNT_RELOAD); + MNT_FLAG(MNT_FORCE); + MNT_FLAG(MNT_SNAPSHOT); + MNT_FLAG(MNT_BYFSID); +#undef MNT_FLAG + if (mflags != 0) { + if (buf[0] != '\0') + strlcat(buf, ", ", sizeof(buf)); + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), + "0x%016jx", mflags); + } + db_printf(" mnt_flag = %s\n", buf); + + buf[0] = '\0'; + flags = mp->mnt_kern_flag; +#define MNT_KERN_FLAG(flag) do { \ + if (flags & (flag)) { \ + if (buf[0] != '\0') \ + strlcat(buf, ", ", sizeof(buf)); \ + strlcat(buf, (#flag) + 5, sizeof(buf)); \ + flags &= ~(flag); \ + } \ +} while (0) + MNT_KERN_FLAG(MNTK_UNMOUNTF); + MNT_KERN_FLAG(MNTK_ASYNC); + MNT_KERN_FLAG(MNTK_SOFTDEP); + MNT_KERN_FLAG(MNTK_DRAINING); + MNT_KERN_FLAG(MNTK_REFEXPIRE); + MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); + MNT_KERN_FLAG(MNTK_SHARED_WRITES); + MNT_KERN_FLAG(MNTK_NO_IOPF); + MNT_KERN_FLAG(MNTK_VGONE_UPPER); + MNT_KERN_FLAG(MNTK_VGONE_WAITER); + MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); + MNT_KERN_FLAG(MNTK_MARKER); + MNT_KERN_FLAG(MNTK_USES_BCACHE); + MNT_KERN_FLAG(MNTK_NOASYNC); + MNT_KERN_FLAG(MNTK_UNMOUNT); + MNT_KERN_FLAG(MNTK_MWAIT); + MNT_KERN_FLAG(MNTK_SUSPEND); + MNT_KERN_FLAG(MNTK_SUSPEND2); + MNT_KERN_FLAG(MNTK_SUSPENDED); + MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); + MNT_KERN_FLAG(MNTK_NOKNOTE); +#undef MNT_KERN_FLAG + if (flags != 0) { + if (buf[0] != '\0') + strlcat(buf, ", ", sizeof(buf)); + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), + "0x%08x", flags); + } + db_printf(" mnt_kern_flag = %s\n", buf); + + db_printf(" mnt_opt = "); + opt = TAILQ_FIRST(mp->mnt_opt); + if (opt != NULL) { + db_printf("%s", opt->name); + opt = TAILQ_NEXT(opt, link); + while (opt != NULL) { + db_printf(", %s", opt->name); + opt = TAILQ_NEXT(opt, link); + } + } + db_printf("\n"); + + sp = &mp->mnt_stat; + db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " + "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " + "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " + "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", + (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, + (uintmax_t)sp->f_bsize, 
(uintmax_t)sp->f_iosize, + (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, + (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, + (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, + (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, + (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, + (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); + + db_printf(" mnt_cred = { uid=%u ruid=%u", + (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); + if (jailed(mp->mnt_cred)) + db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); + db_printf(" }\n"); + db_printf(" mnt_ref = %d\n", mp->mnt_ref); + db_printf(" mnt_gen = %d\n", mp->mnt_gen); + db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); + db_printf(" mnt_activevnodelistsize = %d\n", + mp->mnt_activevnodelistsize); + db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); + db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); + db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); + db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); + db_printf(" mnt_lockref = %d\n", mp->mnt_lockref); + db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); + db_printf(" mnt_secondary_accwrites = %d\n", + mp->mnt_secondary_accwrites); + db_printf(" mnt_gjprovider = %s\n", + mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); + + db_printf("\n\nList of active vnodes\n"); + TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { + if (vp->v_type != VMARKER) { + vn_printf(vp, "vnode "); + if (db_pager_quit) + break; + } + } + db_printf("\n\nList of inactive vnodes\n"); + TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { + if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { + vn_printf(vp, "vnode "); + if (db_pager_quit) + break; + } + } +} +#endif /* DDB */ + +/* + * Fill in a struct xvfsconf based on a struct vfsconf. + */ +static int +vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) +{ + struct xvfsconf xvfsp; + + bzero(&xvfsp, sizeof(xvfsp)); + strcpy(xvfsp.vfc_name, vfsp->vfc_name); + xvfsp.vfc_typenum = vfsp->vfc_typenum; + xvfsp.vfc_refcount = vfsp->vfc_refcount; + xvfsp.vfc_flags = vfsp->vfc_flags; + /* + * These are unused in userland, we keep them + * to not break binary compatibility. + */ + xvfsp.vfc_vfsops = NULL; + xvfsp.vfc_next = NULL; + return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); +} + +#ifdef COMPAT_FREEBSD32 +struct xvfsconf32 { + uint32_t vfc_vfsops; + char vfc_name[MFSNAMELEN]; + int32_t vfc_typenum; + int32_t vfc_refcount; + int32_t vfc_flags; + uint32_t vfc_next; +}; + +static int +vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) +{ + struct xvfsconf32 xvfsp; + + bzero(&xvfsp, sizeof(xvfsp)); + strcpy(xvfsp.vfc_name, vfsp->vfc_name); + xvfsp.vfc_typenum = vfsp->vfc_typenum; + xvfsp.vfc_refcount = vfsp->vfc_refcount; + xvfsp.vfc_flags = vfsp->vfc_flags; + return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); +} +#endif + +/* + * Top level filesystem related information gathering. 
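+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: userland consumes the vfs.conflist node exported
+ * below roughly as
+ *
+ *	size_t len;
+ *	struct xvfsconf *xvfsp;
+ *	unsigned int i;
+ *
+ *	sysctlbyname("vfs.conflist", NULL, &len, NULL, 0);
+ *	xvfsp = malloc(len);
+ *	sysctlbyname("vfs.conflist", xvfsp, &len, NULL, 0);
+ *	for (i = 0; i < len / sizeof(*xvfsp); i++)
+ *		printf("%s\n", xvfsp[i].vfc_name);
+ *
+ * which is essentially what getvfsbyname(3) does to map a filesystem
+ * name to its xvfsconf entry.)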
+ */ +static int +sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) +{ + struct vfsconf *vfsp; + int error; + + error = 0; + vfsconf_slock(); + TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { +#ifdef COMPAT_FREEBSD32 + if (req->flags & SCTL_MASK32) + error = vfsconf2x32(req, vfsp); + else +#endif + error = vfsconf2x(req, vfsp); + if (error) + break; + } + vfsconf_sunlock(); + return (error); +} + +SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | + CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, + "S,xvfsconf", "List of all configured filesystems"); + +#ifndef BURN_BRIDGES +static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); + +static int +vfs_sysctl(SYSCTL_HANDLER_ARGS) +{ + int *name = (int *)arg1 - 1; /* XXX */ + u_int namelen = arg2 + 1; /* XXX */ + struct vfsconf *vfsp; + + log(LOG_WARNING, "userland calling deprecated sysctl, " + "please rebuild world\n"); + +#if 1 || defined(COMPAT_PRELITE2) + /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ + if (namelen == 1) + return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); +#endif + + switch (name[1]) { + case VFS_MAXTYPENUM: + if (namelen != 2) + return (ENOTDIR); + return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); + case VFS_CONF: + if (namelen != 3) + return (ENOTDIR); /* overloaded */ + vfsconf_slock(); + TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { + if (vfsp->vfc_typenum == name[2]) + break; + } + vfsconf_sunlock(); + if (vfsp == NULL) + return (EOPNOTSUPP); +#ifdef COMPAT_FREEBSD32 + if (req->flags & SCTL_MASK32) + return (vfsconf2x32(req, vfsp)); + else +#endif + return (vfsconf2x(req, vfsp)); + } + return (EOPNOTSUPP); +} + +static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | + CTLFLAG_MPSAFE, vfs_sysctl, + "Generic filesystem"); + +#if 1 || defined(COMPAT_PRELITE2) + +static int +sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) +{ + int error; + struct vfsconf *vfsp; + struct ovfsconf ovfs; + + vfsconf_slock(); + TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { + bzero(&ovfs, sizeof(ovfs)); + ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ + strcpy(ovfs.vfc_name, vfsp->vfc_name); + ovfs.vfc_index = vfsp->vfc_typenum; + ovfs.vfc_refcount = vfsp->vfc_refcount; + ovfs.vfc_flags = vfsp->vfc_flags; + error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); + if (error != 0) { + vfsconf_sunlock(); + return (error); + } + } + vfsconf_sunlock(); + return (0); +} + +#endif /* 1 || COMPAT_PRELITE2 */ +#endif /* !BURN_BRIDGES */ + +#define KINFO_VNODESLOP 10 +#ifdef notyet +/* + * Dump vnode list (via sysctl). + */ +/* ARGSUSED */ +static int +sysctl_vnode(SYSCTL_HANDLER_ARGS) +{ + struct xvnode *xvn; + struct mount *mp; + struct vnode *vp; + int error, len, n; + + /* + * Stale numvnodes access is not fatal here. 
+ */ + req->lock = 0; + len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; + if (!req->oldptr) + /* Make an estimate */ + return (SYSCTL_OUT(req, 0, len)); + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); + n = 0; + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) + continue; + MNT_ILOCK(mp); + TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { + if (n == len) + break; + vref(vp); + xvn[n].xv_size = sizeof *xvn; + xvn[n].xv_vnode = vp; + xvn[n].xv_id = 0; /* XXX compat */ +#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field + XV_COPY(usecount); + XV_COPY(writecount); + XV_COPY(holdcnt); + XV_COPY(mount); + XV_COPY(numoutput); + XV_COPY(type); +#undef XV_COPY + xvn[n].xv_flag = vp->v_vflag; + + switch (vp->v_type) { + case VREG: + case VDIR: + case VLNK: + break; + case VBLK: + case VCHR: + if (vp->v_rdev == NULL) { + vrele(vp); + continue; + } + xvn[n].xv_dev = dev2udev(vp->v_rdev); + break; + case VSOCK: + xvn[n].xv_socket = vp->v_socket; + break; + case VFIFO: + xvn[n].xv_fifo = vp->v_fifoinfo; + break; + case VNON: + case VBAD: + default: + /* shouldn't happen? */ + vrele(vp); + continue; + } + vrele(vp); + ++n; + } + MNT_IUNLOCK(mp); + mtx_lock(&mountlist_mtx); + vfs_unbusy(mp); + if (n == len) + break; + } + mtx_unlock(&mountlist_mtx); + + error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); + free(xvn, M_TEMP); + return (error); +} + +SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | + CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", + ""); +#endif + +static void +unmount_or_warn(struct mount *mp) +{ + int error; + + error = dounmount(mp, MNT_FORCE, curthread); + if (error != 0) { + printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); + if (error == EBUSY) + printf("BUSY)\n"); + else + printf("%d)\n", error); + } +} + +/* + * Unmount all filesystems. The list is traversed in reverse order + * of mounting to avoid dependencies. + */ +void +vfs_unmountall(void) +{ + struct mount *mp, *tmp; + + CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); + + /* + * Since this only runs when rebooting, it is not interlocked. + */ + TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { + vfs_ref(mp); + + /* + * Forcibly unmounting "/dev" before "/" would prevent clean + * unmount of the latter. + */ + if (mp == rootdevmp) + continue; + + unmount_or_warn(mp); + } + + if (rootdevmp != NULL) + unmount_or_warn(rootdevmp); +} + +/* + * perform msync on all vnodes under a mount point + * the mount point must be locked. + */ +void +vfs_msync(struct mount *mp, int flags) +{ + struct vnode *vp, *mvp; + struct vm_object *obj; + + CTR2(KTR_VFS, "%s: mp %p", __func__, mp); + + vnlru_return_batch(mp); + + MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { + obj = vp->v_object; + if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && + (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { + if (!vget(vp, + LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, + curthread)) { + if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ + vput(vp); + continue; + } + + obj = vp->v_object; + if (obj != NULL) { + VM_OBJECT_WLOCK(obj); + vm_object_page_clean(obj, 0, 0, + flags == MNT_WAIT ? 
+ OBJPC_SYNC : OBJPC_NOSYNC); + VM_OBJECT_WUNLOCK(obj); + } + vput(vp); + } + } else + VI_UNLOCK(vp); + } +} + +static void +destroy_vpollinfo_free(struct vpollinfo *vi) +{ + + knlist_destroy(&vi->vpi_selinfo.si_note); + mtx_destroy(&vi->vpi_lock); + uma_zfree(vnodepoll_zone, vi); +} + +static void +destroy_vpollinfo(struct vpollinfo *vi) +{ + + knlist_clear(&vi->vpi_selinfo.si_note, 1); + seldrain(&vi->vpi_selinfo); + destroy_vpollinfo_free(vi); +} + +/* + * Initialize per-vnode helper structure to hold poll-related state. + */ +void +v_addpollinfo(struct vnode *vp) +{ + struct vpollinfo *vi; + + if (vp->v_pollinfo != NULL) + return; + vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); + mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); + knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, + vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); + VI_LOCK(vp); + if (vp->v_pollinfo != NULL) { + VI_UNLOCK(vp); + destroy_vpollinfo_free(vi); + return; + } + vp->v_pollinfo = vi; + VI_UNLOCK(vp); +} + +/* + * Record a process's interest in events which might happen to + * a vnode. Because poll uses the historic select-style interface + * internally, this routine serves as both the ``check for any + * pending events'' and the ``record my interest in future events'' + * functions. (These are done together, while the lock is held, + * to avoid race conditions.) + */ +int +vn_pollrecord(struct vnode *vp, struct thread *td, int events) +{ + + v_addpollinfo(vp); + mtx_lock(&vp->v_pollinfo->vpi_lock); + if (vp->v_pollinfo->vpi_revents & events) { + /* + * This leaves events we are not interested + * in available for the other process which + * which presumably had requested them + * (otherwise they would never have been + * recorded). + */ + events &= vp->v_pollinfo->vpi_revents; + vp->v_pollinfo->vpi_revents &= ~events; + + mtx_unlock(&vp->v_pollinfo->vpi_lock); + return (events); + } + vp->v_pollinfo->vpi_events |= events; + selrecord(td, &vp->v_pollinfo->vpi_selinfo); + mtx_unlock(&vp->v_pollinfo->vpi_lock); + return (0); +} + +/* + * Routine to create and manage a filesystem syncer vnode. + */ +#define sync_close ((int (*)(struct vop_close_args *))nullop) +static int sync_fsync(struct vop_fsync_args *); +static int sync_inactive(struct vop_inactive_args *); +static int sync_reclaim(struct vop_reclaim_args *); + +static struct vop_vector sync_vnodeops = { + .vop_bypass = VOP_EOPNOTSUPP, + .vop_close = sync_close, /* close */ + .vop_fsync = sync_fsync, /* fsync */ + .vop_inactive = sync_inactive, /* inactive */ + .vop_reclaim = sync_reclaim, /* reclaim */ + .vop_lock1 = vop_stdlock, /* lock */ + .vop_unlock = vop_stdunlock, /* unlock */ + .vop_islocked = vop_stdislocked, /* islocked */ +}; + +/* + * Create a new filesystem syncer vnode for the specified mount point. + */ +void +vfs_allocate_syncvnode(struct mount *mp) +{ + struct vnode *vp; + struct bufobj *bo; + static long start, incr, next; + int error; + + /* Allocate a new vnode */ + error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); + if (error != 0) + panic("vfs_allocate_syncvnode: getnewvnode() failed"); + vp->v_type = VNON; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + vp->v_vflag |= VV_FORCEINSMQ; + error = insmntque(vp, mp); + if (error != 0) + panic("vfs_allocate_syncvnode: insmntque() failed"); + vp->v_vflag &= ~VV_FORCEINSMQ; + VOP_UNLOCK(vp, 0); + /* + * Place the vnode onto the syncer worklist. 
We attempt to + * scatter them about on the list so that they will go off + * at evenly distributed times even if all the filesystems + * are mounted at once. + */ + next += incr; + if (next == 0 || next > syncer_maxdelay) { + start /= 2; + incr /= 2; + if (start == 0) { + start = syncer_maxdelay / 2; + incr = syncer_maxdelay; + } + next = start; + } + bo = &vp->v_bufobj; + BO_LOCK(bo); + vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); + /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ + mtx_lock(&sync_mtx); + sync_vnode_count++; + if (mp->mnt_syncer == NULL) { + mp->mnt_syncer = vp; + vp = NULL; + } + mtx_unlock(&sync_mtx); + BO_UNLOCK(bo); + if (vp != NULL) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + vgone(vp); + vput(vp); + } +} + +void +vfs_deallocate_syncvnode(struct mount *mp) +{ + struct vnode *vp; + + mtx_lock(&sync_mtx); + vp = mp->mnt_syncer; + if (vp != NULL) + mp->mnt_syncer = NULL; + mtx_unlock(&sync_mtx); + if (vp != NULL) + vrele(vp); +} + +/* + * Do a lazy sync of the filesystem. + */ +static int +sync_fsync(struct vop_fsync_args *ap) +{ + struct vnode *syncvp = ap->a_vp; + struct mount *mp = syncvp->v_mount; + int error, save; + struct bufobj *bo; + + /* + * We only need to do something if this is a lazy evaluation. + */ + if (ap->a_waitfor != MNT_LAZY) + return (0); + + /* + * Move ourselves to the back of the sync list. + */ + bo = &syncvp->v_bufobj; + BO_LOCK(bo); + vn_syncer_add_to_worklist(bo, syncdelay); + BO_UNLOCK(bo); + + /* + * Walk the list of vnodes pushing all that are dirty and + * not already on the sync list. + */ + if (vfs_busy(mp, MBF_NOWAIT) != 0) + return (0); + if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { + vfs_unbusy(mp); + return (0); + } + save = curthread_pflags_set(TDP_SYNCIO); + vfs_msync(mp, MNT_NOWAIT); + error = VFS_SYNC(mp, MNT_LAZY); + curthread_pflags_restore(save); + vn_finished_write(mp); + vfs_unbusy(mp); + return (error); +} + +/* + * The syncer vnode is no referenced. + */ +static int +sync_inactive(struct vop_inactive_args *ap) +{ + + vgone(ap->a_vp); + return (0); +} + +/* + * The syncer vnode is no longer needed and is being decommissioned. + * + * Modifications to the worklist must be protected by sync_mtx. + */ +static int +sync_reclaim(struct vop_reclaim_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct bufobj *bo; + + bo = &vp->v_bufobj; + BO_LOCK(bo); + mtx_lock(&sync_mtx); + if (vp->v_mount->mnt_syncer == vp) + vp->v_mount->mnt_syncer = NULL; + if (bo->bo_flag & BO_ONWORKLST) { + LIST_REMOVE(bo, bo_synclist); + syncer_worklist_len--; + sync_vnode_count--; + bo->bo_flag &= ~BO_ONWORKLST; + } + mtx_unlock(&sync_mtx); + BO_UNLOCK(bo); + + return (0); +} + +/* + * Check if vnode represents a disk device + */ +int +vn_isdisk(struct vnode *vp, int *errp) +{ + int error; + + if (vp->v_type != VCHR) { + error = ENOTBLK; + goto out; + } + error = 0; + dev_lock(); + if (vp->v_rdev == NULL) + error = ENXIO; + else if (vp->v_rdev->si_devsw == NULL) + error = ENXIO; + else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) + error = ENOTBLK; + dev_unlock(); +out: + if (errp != NULL) + *errp = error; + return (error == 0); +} + +/* + * Common filesystem object access control check routine. Accepts a + * vnode's type, "mode", uid and gid, requested access mode, credentials, + * and optional call-by-reference privused argument allowing vaccess() + * to indicate to the caller whether privilege was used to satisfy the + * request (obsoleted). Returns 0 on success, or an errno on failure. 
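+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: a filesystem's access VOP typically gathers the
+ * ownership and mode from its private inode and delegates the
+ * decision here, along the lines of
+ *
+ *	static int
+ *	foofs_access(struct vop_access_args *ap)
+ *	{
+ *		struct foonode *ip = VTOFOO(ap->a_vp);
+ *
+ *		return (vaccess(ap->a_vp->v_type, ip->i_mode, ip->i_uid,
+ *		    ip->i_gid, ap->a_accmode, ap->a_cred, NULL));
+ *	}
+ *
+ * with foofs, foonode and VTOFOO as hypothetical names.)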
+ */ +int +vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, + accmode_t accmode, struct ucred *cred, int *privused) +{ + accmode_t dac_granted; + accmode_t priv_granted; + + KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, + ("invalid bit in accmode")); + KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), + ("VAPPEND without VWRITE")); + + /* + * Look for a normal, non-privileged way to access the file/directory + * as requested. If it exists, go with that. + */ + + if (privused != NULL) + *privused = 0; + + dac_granted = 0; + + /* Check the owner. */ + if (cred->cr_uid == file_uid) { + dac_granted |= VADMIN; + if (file_mode & S_IXUSR) + dac_granted |= VEXEC; + if (file_mode & S_IRUSR) + dac_granted |= VREAD; + if (file_mode & S_IWUSR) + dac_granted |= (VWRITE | VAPPEND); + + if ((accmode & dac_granted) == accmode) + return (0); + + goto privcheck; + } + + /* Otherwise, check the groups (first match) */ + if (groupmember(file_gid, cred)) { + if (file_mode & S_IXGRP) + dac_granted |= VEXEC; + if (file_mode & S_IRGRP) + dac_granted |= VREAD; + if (file_mode & S_IWGRP) + dac_granted |= (VWRITE | VAPPEND); + + if ((accmode & dac_granted) == accmode) + return (0); + + goto privcheck; + } + + /* Otherwise, check everyone else. */ + if (file_mode & S_IXOTH) + dac_granted |= VEXEC; + if (file_mode & S_IROTH) + dac_granted |= VREAD; + if (file_mode & S_IWOTH) + dac_granted |= (VWRITE | VAPPEND); + if ((accmode & dac_granted) == accmode) + return (0); + +privcheck: + /* + * Build a privilege mask to determine if the set of privileges + * satisfies the requirements when combined with the granted mask + * from above. For each privilege, if the privilege is required, + * bitwise or the request type onto the priv_granted mask. + */ + priv_granted = 0; + + if (type == VDIR) { + /* + * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC + * requests, instead of PRIV_VFS_EXEC. + */ + if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && + !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) + priv_granted |= VEXEC; + } else { + /* + * Ensure that at least one execute bit is on. Otherwise, + * a privileged user will always succeed, and we don't want + * this to happen unless the file really is executable. + */ + if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && + (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && + !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) + priv_granted |= VEXEC; + } + + if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && + !priv_check_cred(cred, PRIV_VFS_READ, 0)) + priv_granted |= VREAD; + + if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && + !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) + priv_granted |= (VWRITE | VAPPEND); + + if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && + !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) + priv_granted |= VADMIN; + + if ((accmode & (priv_granted | dac_granted)) == accmode) { + /* XXX audit: privilege used */ + if (privused != NULL) + *privused = 1; + return (0); + } + + return ((accmode & VADMIN) ? EPERM : EACCES); +} + +/* + * Credential check based on process requesting service, and per-attribute + * permissions. + */ +int +extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, + struct thread *td, accmode_t accmode) +{ + + /* + * Kernel-invoked always succeeds. + */ + if (cred == NOCRED) + return (0); + + /* + * Do not allow privileged processes in jail to directly manipulate + * system attributes. 
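+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: extended attribute VOPs normally call this helper
+ * before touching the attribute, e.g. a getextattr implementation
+ * starts with
+ *
+ *	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ *	    ap->a_cred, ap->a_td, VREAD);
+ *	if (error != 0)
+ *		return (error);
+ *
+ * and the setextattr/deleteextattr paths pass VWRITE instead.)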
+ */ + switch (attrnamespace) { + case EXTATTR_NAMESPACE_SYSTEM: + /* Potentially should be: return (EPERM); */ + return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); + case EXTATTR_NAMESPACE_USER: + return (VOP_ACCESS(vp, accmode, cred, td)); + default: + return (EPERM); + } +} + +#ifdef DEBUG_VFS_LOCKS +/* + * This only exists to suppress warnings from unlocked specfs accesses. It is + * no longer ok to have an unlocked VFS. + */ +#define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ + (vp)->v_type == VCHR || (vp)->v_type == VBAD) + +int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ +SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, + "Drop into debugger on lock violation"); + +int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ +SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, + 0, "Check for interlock across VOPs"); + +int vfs_badlock_print = 1; /* Print lock violations. */ +SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, + 0, "Print lock violations"); + +int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ +SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, + 0, "Print vnode details on lock violations"); + +#ifdef KDB +int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ +SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, + &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); +#endif + +static void +vfs_badlock(const char *msg, const char *str, struct vnode *vp) +{ + +#ifdef KDB + if (vfs_badlock_backtrace) + kdb_backtrace(); +#endif + if (vfs_badlock_vnode) + vn_printf(vp, "vnode "); + if (vfs_badlock_print) + printf("%s: %p %s\n", str, (void *)vp, msg); + if (vfs_badlock_ddb) + kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); +} + +void +assert_vi_locked(struct vnode *vp, const char *str) +{ + + if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) + vfs_badlock("interlock is not locked but should be", str, vp); +} + +void +assert_vi_unlocked(struct vnode *vp, const char *str) +{ + + if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) + vfs_badlock("interlock is locked but should not be", str, vp); +} + +void +assert_vop_locked(struct vnode *vp, const char *str) +{ + int locked; + + if (!IGNORE_LOCK(vp)) { + locked = VOP_ISLOCKED(vp); + if (locked == 0 || locked == LK_EXCLOTHER) + vfs_badlock("is not locked but should be", str, vp); + } +} + +void +assert_vop_unlocked(struct vnode *vp, const char *str) +{ + + if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) + vfs_badlock("is locked but should not be", str, vp); +} + +void +assert_vop_elocked(struct vnode *vp, const char *str) +{ + + if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) + vfs_badlock("is not exclusive locked but should be", str, vp); +} +#endif /* DEBUG_VFS_LOCKS */ + +void +vop_rename_fail(struct vop_rename_args *ap) +{ + + if (ap->a_tvp != NULL) + vput(ap->a_tvp); + if (ap->a_tdvp == ap->a_tvp) + vrele(ap->a_tdvp); + else + vput(ap->a_tdvp); + vrele(ap->a_fdvp); + vrele(ap->a_fvp); +} + +void +vop_rename_pre(void *ap) +{ + struct vop_rename_args *a = ap; + +#ifdef DEBUG_VFS_LOCKS + if (a->a_tvp) + ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); + ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); + ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); + ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); + + /* Check the source (from). 
*/ + if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && + (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) + ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); + if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) + ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); + + /* Check the target. */ + if (a->a_tvp) + ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); + ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); +#endif + if (a->a_tdvp != a->a_fdvp) + vhold(a->a_fdvp); + if (a->a_tvp != a->a_fvp) + vhold(a->a_fvp); + vhold(a->a_tdvp); + if (a->a_tvp) + vhold(a->a_tvp); +} + +#ifdef DEBUG_VFS_LOCKS +void +vop_strategy_pre(void *ap) +{ + struct vop_strategy_args *a; + struct buf *bp; + + a = ap; + bp = a->a_bp; + + /* + * Cluster ops lock their component buffers but not the IO container. + */ + if ((bp->b_flags & B_CLUSTER) != 0) + return; + + if (panicstr == NULL && !BUF_ISLOCKED(bp)) { + if (vfs_badlock_print) + printf( + "VOP_STRATEGY: bp is not locked but should be\n"); + if (vfs_badlock_ddb) + kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); + } +} + +void +vop_lock_pre(void *ap) +{ + struct vop_lock1_args *a = ap; + + if ((a->a_flags & LK_INTERLOCK) == 0) + ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); + else + ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); +} + +void +vop_lock_post(void *ap, int rc) +{ + struct vop_lock1_args *a = ap; + + ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); + if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) + ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); +} + +void +vop_unlock_pre(void *ap) +{ + struct vop_unlock_args *a = ap; + + if (a->a_flags & LK_INTERLOCK) + ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); + ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); +} + +void +vop_unlock_post(void *ap, int rc) +{ + struct vop_unlock_args *a = ap; + + if (a->a_flags & LK_INTERLOCK) + ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); +} +#endif + +void +vop_create_post(void *ap, int rc) +{ + struct vop_create_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); +} + +void +vop_deleteextattr_post(void *ap, int rc) +{ + struct vop_deleteextattr_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); +} + +void +vop_link_post(void *ap, int rc) +{ + struct vop_link_args *a = ap; + + if (!rc) { + VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); + VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); + } +} + +void +vop_mkdir_post(void *ap, int rc) +{ + struct vop_mkdir_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); +} + +void +vop_mknod_post(void *ap, int rc) +{ + struct vop_mknod_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); +} + +void +vop_reclaim_post(void *ap, int rc) +{ + struct vop_reclaim_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE); +} + +void +vop_remove_post(void *ap, int rc) +{ + struct vop_remove_args *a = ap; + + if (!rc) { + VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); + VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); + } +} + +void +vop_rename_post(void *ap, int rc) +{ + struct vop_rename_args *a = ap; + long hint; + + if (!rc) { + hint = NOTE_WRITE; + if (a->a_fdvp == a->a_tdvp) { + if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) + hint |= NOTE_LINK; + VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); + VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); + } else { + hint |= NOTE_EXTEND; + if (a->a_fvp->v_type == VDIR) + hint |= NOTE_LINK; + VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); + + if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && + a->a_tvp->v_type == VDIR) + hint &= ~NOTE_LINK; + 
VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); + } + + VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); + if (a->a_tvp) + VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); + } + if (a->a_tdvp != a->a_fdvp) + vdrop(a->a_fdvp); + if (a->a_tvp != a->a_fvp) + vdrop(a->a_fvp); + vdrop(a->a_tdvp); + if (a->a_tvp) + vdrop(a->a_tvp); +} + +void +vop_rmdir_post(void *ap, int rc) +{ + struct vop_rmdir_args *a = ap; + + if (!rc) { + VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); + VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); + } +} + +void +vop_setattr_post(void *ap, int rc) +{ + struct vop_setattr_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); +} + +void +vop_setextattr_post(void *ap, int rc) +{ + struct vop_setextattr_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); +} + +void +vop_symlink_post(void *ap, int rc) +{ + struct vop_symlink_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); +} + +void +vop_open_post(void *ap, int rc) +{ + struct vop_open_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); +} + +void +vop_close_post(void *ap, int rc) +{ + struct vop_close_args *a = ap; + + if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ + (a->a_vp->v_iflag & VI_DOOMED) == 0)) { + VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? + NOTE_CLOSE_WRITE : NOTE_CLOSE); + } +} + +void +vop_read_post(void *ap, int rc) +{ + struct vop_read_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); +} + +void +vop_readdir_post(void *ap, int rc) +{ + struct vop_readdir_args *a = ap; + + if (!rc) + VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); +} + +static struct knlist fs_knlist; + +static void +vfs_event_init(void *arg) +{ + knlist_init_mtx(&fs_knlist, NULL); +} +/* XXX - correct order? */ +SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); + +void +vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) +{ + + KNOTE_UNLOCKED(&fs_knlist, event); +} + +static int filt_fsattach(struct knote *kn); +static void filt_fsdetach(struct knote *kn); +static int filt_fsevent(struct knote *kn, long hint); + +struct filterops fs_filtops = { + .f_isfd = 0, + .f_attach = filt_fsattach, + .f_detach = filt_fsdetach, + .f_event = filt_fsevent +}; + +static int +filt_fsattach(struct knote *kn) +{ + + kn->kn_flags |= EV_CLEAR; + knlist_add(&fs_knlist, kn, 0); + return (0); +} + +static void +filt_fsdetach(struct knote *kn) +{ + + knlist_remove(&fs_knlist, kn, 0); +} + +static int +filt_fsevent(struct knote *kn, long hint) +{ + + kn->kn_fflags |= hint; + return (kn->kn_fflags != 0); +} + +static int +sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) +{ + struct vfsidctl vc; + int error; + struct mount *mp; + + error = SYSCTL_IN(req, &vc, sizeof(vc)); + if (error) + return (error); + if (vc.vc_vers != VFS_CTL_VERS1) + return (EINVAL); + mp = vfs_getvfs(&vc.vc_fsid); + if (mp == NULL) + return (ENOENT); + /* ensure that a specific sysctl goes to the right filesystem. */ + if (strcmp(vc.vc_fstypename, "*") != 0 && + strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { + vfs_rel(mp); + return (EINVAL); + } + VCTLTOREQ(&vc, req); + error = VFS_SYSCTL(mp, vc.vc_op, req); + vfs_rel(mp); + return (error); +} + +SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, + NULL, 0, sysctl_vfs_ctl, "", + "Sysctl by fsid"); + +/* + * Function to initialize a va_filerev field sensibly. + * XXX: Wouldn't a random number make a lot more sense ?? 
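+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: a filesystem typically seeds its per-file revision
+ * with this helper when the in-memory inode is set up, e.g.
+ *
+ *	ip->i_modrev = init_va_filerev();
+ *
+ * bumps it on each modification, and reports it as va_filerev from
+ * VOP_GETATTR(); ip and i_modrev are hypothetical names here.)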
+ */ +u_quad_t +init_va_filerev(void) +{ + struct bintime bt; + + getbinuptime(&bt); + return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); +} + +static int filt_vfsread(struct knote *kn, long hint); +static int filt_vfswrite(struct knote *kn, long hint); +static int filt_vfsvnode(struct knote *kn, long hint); +static void filt_vfsdetach(struct knote *kn); +static struct filterops vfsread_filtops = { + .f_isfd = 1, + .f_detach = filt_vfsdetach, + .f_event = filt_vfsread +}; +static struct filterops vfswrite_filtops = { + .f_isfd = 1, + .f_detach = filt_vfsdetach, + .f_event = filt_vfswrite +}; +static struct filterops vfsvnode_filtops = { + .f_isfd = 1, + .f_detach = filt_vfsdetach, + .f_event = filt_vfsvnode +}; + +static void +vfs_knllock(void *arg) +{ + struct vnode *vp = arg; + + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); +} + +static void +vfs_knlunlock(void *arg) +{ + struct vnode *vp = arg; + + VOP_UNLOCK(vp, 0); +} + +static void +vfs_knl_assert_locked(void *arg) +{ +#ifdef DEBUG_VFS_LOCKS + struct vnode *vp = arg; + + ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); +#endif +} + +static void +vfs_knl_assert_unlocked(void *arg) +{ +#ifdef DEBUG_VFS_LOCKS + struct vnode *vp = arg; + + ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); +#endif +} + +int +vfs_kqfilter(struct vop_kqfilter_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct knote *kn = ap->a_kn; + struct knlist *knl; + + switch (kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &vfsread_filtops; + break; + case EVFILT_WRITE: + kn->kn_fop = &vfswrite_filtops; + break; + case EVFILT_VNODE: + kn->kn_fop = &vfsvnode_filtops; + break; + default: + return (EINVAL); + } + + kn->kn_hook = (caddr_t)vp; + + v_addpollinfo(vp); + if (vp->v_pollinfo == NULL) + return (ENOMEM); + knl = &vp->v_pollinfo->vpi_selinfo.si_note; + vhold(vp); + knlist_add(knl, kn, 0); + + return (0); +} + +/* + * Detach knote from vnode + */ +static void +filt_vfsdetach(struct knote *kn) +{ + struct vnode *vp = (struct vnode *)kn->kn_hook; + + KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); + knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); + vdrop(vp); +} + +/*ARGSUSED*/ +static int +filt_vfsread(struct knote *kn, long hint) +{ + struct vnode *vp = (struct vnode *)kn->kn_hook; + struct vattr va; + int res; + + /* + * filesystem is gone, so set the EOF flag and schedule + * the knote for deletion. + */ + if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { + VI_LOCK(vp); + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + VI_UNLOCK(vp); + return (1); + } + + if (VOP_GETATTR(vp, &va, curthread->td_ucred)) + return (0); + + VI_LOCK(vp); + kn->kn_data = va.va_size - kn->kn_fp->f_offset; + res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; + VI_UNLOCK(vp); + return (res); +} + +/*ARGSUSED*/ +static int +filt_vfswrite(struct knote *kn, long hint) +{ + struct vnode *vp = (struct vnode *)kn->kn_hook; + + VI_LOCK(vp); + + /* + * filesystem is gone, so set the EOF flag and schedule + * the knote for deletion. 
+ */ + if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + + kn->kn_data = 0; + VI_UNLOCK(vp); + return (1); +} + +static int +filt_vfsvnode(struct knote *kn, long hint) +{ + struct vnode *vp = (struct vnode *)kn->kn_hook; + int res; + + VI_LOCK(vp); + if (kn->kn_sfflags & hint) + kn->kn_fflags |= hint; + if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { + kn->kn_flags |= EV_EOF; + VI_UNLOCK(vp); + return (1); + } + res = (kn->kn_fflags != 0); + VI_UNLOCK(vp); + return (res); +} + +int +vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) +{ + int error; + + if (dp->d_reclen > ap->a_uio->uio_resid) + return (ENAMETOOLONG); + error = uiomove(dp, dp->d_reclen, ap->a_uio); + if (error) { + if (ap->a_ncookies != NULL) { + if (ap->a_cookies != NULL) + free(ap->a_cookies, M_TEMP); + ap->a_cookies = NULL; + *ap->a_ncookies = 0; + } + return (error); + } + if (ap->a_ncookies == NULL) + return (0); + + KASSERT(ap->a_cookies, + ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); + + *ap->a_cookies = realloc(*ap->a_cookies, + (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); + (*ap->a_cookies)[*ap->a_ncookies] = off; + *ap->a_ncookies += 1; + return (0); +} + +/* + * Mark for update the access time of the file if the filesystem + * supports VOP_MARKATIME. This functionality is used by execve and + * mmap, so we want to avoid the I/O implied by directly setting + * va_atime for the sake of efficiency. + */ +void +vfs_mark_atime(struct vnode *vp, struct ucred *cred) +{ + struct mount *mp; + + mp = vp->v_mount; + ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); + if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) + (void)VOP_MARKATIME(vp); +} + +/* + * The purpose of this routine is to remove granularity from accmode_t, + * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, + * VADMIN and VAPPEND. + * + * If it returns 0, the caller is supposed to continue with the usual + * access checks using 'accmode' as modified by this routine. If it + * returns nonzero value, the caller is supposed to return that value + * as errno. + * + * Note that after this routine runs, accmode may be zero. + */ +int +vfs_unixify_accmode(accmode_t *accmode) +{ + /* + * There is no way to specify explicit "deny" rule using + * file mode or POSIX.1e ACLs. + */ + if (*accmode & VEXPLICIT_DENY) { + *accmode = 0; + return (0); + } + + /* + * None of these can be translated into usual access bits. + * Also, the common case for NFSv4 ACLs is to not contain + * either of these bits. Caller should check for VWRITE + * on the containing directory instead. + */ + if (*accmode & (VDELETE_CHILD | VDELETE)) + return (EPERM); + + if (*accmode & VADMIN_PERMS) { + *accmode &= ~VADMIN_PERMS; + *accmode |= VADMIN; + } + + /* + * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL + * or VSYNCHRONIZE using file mode or POSIX.1e ACL. + */ + *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); + + return (0); +} + +/* + * These are helper functions for filesystems to traverse all + * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. + * + * This interface replaces MNT_VNODE_FOREACH. 
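+ *
+ * (Editor's note, an illustrative sketch that is not part of the
+ * FreeBSD import: the iterator hands back each vnode with its
+ * interlock held, so a caller looks roughly like
+ *
+ *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
+ *		vholdl(vp);
+ *		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
+ *		if (error != 0) {
+ *			vdrop(vp);
+ *			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
+ *			break;
+ *		}
+ *		... per-vnode work ...
+ *		VOP_UNLOCK(vp, 0);
+ *		vdrop(vp);
+ *	}
+ *
+ * which matches how vflush() above walks a mount's vnode list.)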
+ */ + +MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); + +struct vnode * +__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) +{ + struct vnode *vp; + + if (should_yield()) + kern_yield(PRI_USER); + MNT_ILOCK(mp); + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); + for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; + vp = TAILQ_NEXT(vp, v_nmntvnodes)) { + /* Allow a racy peek at VI_DOOMED to save a lock acquisition. */ + if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0) + continue; + VI_LOCK(vp); + if ((vp->v_iflag & VI_DOOMED) != 0) { + VI_UNLOCK(vp); + continue; + } + break; + } + if (vp == NULL) { + __mnt_vnode_markerfree_all(mvp, mp); + /* MNT_IUNLOCK(mp); -- done in above function */ + mtx_assert(MNT_MTX(mp), MA_NOTOWNED); + return (NULL); + } + TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); + TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); + MNT_IUNLOCK(mp); + return (vp); +} + +struct vnode * +__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) +{ + struct vnode *vp; + + *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); + MNT_ILOCK(mp); + MNT_REF(mp); + (*mvp)->v_mount = mp; + (*mvp)->v_type = VMARKER; + + TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { + /* Allow a racy peek at VI_DOOMED to save a lock acquisition. */ + if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0) + continue; + VI_LOCK(vp); + if ((vp->v_iflag & VI_DOOMED) != 0) { + VI_UNLOCK(vp); + continue; + } + break; + } + if (vp == NULL) { + MNT_REL(mp); + MNT_IUNLOCK(mp); + free(*mvp, M_VNODE_MARKER); + *mvp = NULL; + return (NULL); + } + TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); + MNT_IUNLOCK(mp); + return (vp); +} + +void +__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) +{ + + if (*mvp == NULL) { + MNT_IUNLOCK(mp); + return; + } + + mtx_assert(MNT_MTX(mp), MA_OWNED); + + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); + TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); + MNT_REL(mp); + MNT_IUNLOCK(mp); + free(*mvp, M_VNODE_MARKER); + *mvp = NULL; +} + +/* + * These are helper functions for filesystems to traverse their + * active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h + */ +static void +mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) +{ + + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); + + MNT_ILOCK(mp); + MNT_REL(mp); + MNT_IUNLOCK(mp); + free(*mvp, M_VNODE_MARKER); + *mvp = NULL; +} + +/* + * Relock the mp mount vnode list lock with the vp vnode interlock in the + * conventional lock order during mnt_vnode_next_active iteration. + * + * On entry, the mount vnode list lock is held and the vnode interlock is not. + * The list lock is dropped and reacquired. On success, both locks are held. + * On failure, the mount vnode list lock is held but the vnode interlock is + * not, and the procedure may have yielded. 
+ */ +static bool +mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp, + struct vnode *vp) +{ + const struct vnode *tmp; + bool held, ret; + + VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && + TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp, + ("%s: bad marker", __func__)); + VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, + ("%s: inappropriate vnode", __func__)); + ASSERT_VI_UNLOCKED(vp, __func__); + mtx_assert(&mp->mnt_listmtx, MA_OWNED); + + ret = false; + + TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist); + TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist); + + /* + * Use a hold to prevent vp from disappearing while the mount vnode + * list lock is dropped and reacquired. Normally a hold would be + * acquired with vhold(), but that might try to acquire the vnode + * interlock, which would be a LOR with the mount vnode list lock. + */ + held = refcount_acquire_if_not_zero(&vp->v_holdcnt); + mtx_unlock(&mp->mnt_listmtx); + if (!held) + goto abort; + VI_LOCK(vp); + if (!refcount_release_if_not_last(&vp->v_holdcnt)) { + vdropl(vp); + goto abort; + } + mtx_lock(&mp->mnt_listmtx); + + /* + * Determine whether the vnode is still the next one after the marker, + * excepting any other markers. If the vnode has not been doomed by + * vgone() then the hold should have ensured that it remained on the + * active list. If it has been doomed but is still on the active list, + * don't abort, but rather skip over it (avoid spinning on doomed + * vnodes). + */ + tmp = mvp; + do { + tmp = TAILQ_NEXT(tmp, v_actfreelist); + } while (tmp != NULL && tmp->v_type == VMARKER); + if (tmp != vp) { + mtx_unlock(&mp->mnt_listmtx); + VI_UNLOCK(vp); + goto abort; + } + + ret = true; + goto out; +abort: + maybe_yield(); + mtx_lock(&mp->mnt_listmtx); +out: + if (ret) + ASSERT_VI_LOCKED(vp, __func__); + else + ASSERT_VI_UNLOCKED(vp, __func__); + mtx_assert(&mp->mnt_listmtx, MA_OWNED); + return (ret); +} + +static struct vnode * +mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) +{ + struct vnode *vp, *nvp; + + mtx_assert(&mp->mnt_listmtx, MA_OWNED); + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); +restart: + vp = TAILQ_NEXT(*mvp, v_actfreelist); + while (vp != NULL) { + if (vp->v_type == VMARKER) { + vp = TAILQ_NEXT(vp, v_actfreelist); + continue; + } + /* + * Try-lock because this is the wrong lock order. If that does + * not succeed, drop the mount vnode list lock and try to + * reacquire it and the vnode interlock in the right order. 
+ */ + if (!VI_TRYLOCK(vp) && + !mnt_vnode_next_active_relock(*mvp, mp, vp)) + goto restart; + KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); + KASSERT(vp->v_mount == mp || vp->v_mount == NULL, + ("alien vnode on the active list %p %p", vp, mp)); + if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) + break; + nvp = TAILQ_NEXT(vp, v_actfreelist); + VI_UNLOCK(vp); + vp = nvp; + } + TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); + + /* Check if we are done */ + if (vp == NULL) { + mtx_unlock(&mp->mnt_listmtx); + mnt_vnode_markerfree_active(mvp, mp); + return (NULL); + } + TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); + mtx_unlock(&mp->mnt_listmtx); + ASSERT_VI_LOCKED(vp, "active iter"); + KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); + return (vp); +} + +struct vnode * +__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) +{ + + if (should_yield()) + kern_yield(PRI_USER); + mtx_lock(&mp->mnt_listmtx); + return (mnt_vnode_next_active(mvp, mp)); +} + +struct vnode * +__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) +{ + struct vnode *vp; + + *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); + MNT_ILOCK(mp); + MNT_REF(mp); + MNT_IUNLOCK(mp); + (*mvp)->v_type = VMARKER; + (*mvp)->v_mount = mp; + + mtx_lock(&mp->mnt_listmtx); + vp = TAILQ_FIRST(&mp->mnt_activevnodelist); + if (vp == NULL) { + mtx_unlock(&mp->mnt_listmtx); + mnt_vnode_markerfree_active(mvp, mp); + return (NULL); + } + TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); + return (mnt_vnode_next_active(mvp, mp)); +} + +void +__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) +{ + + if (*mvp == NULL) + return; + + mtx_lock(&mp->mnt_listmtx); + TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); + mtx_unlock(&mp->mnt_listmtx); + mnt_vnode_markerfree_active(mvp, mp); +} + +int +vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) +{ + + if ((cnp->cn_flags & NOEXECCHECK) != 0) { + cnp->cn_flags &= ~NOEXECCHECK; + return (0); + } + + return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread)); +} diff --git a/freebsd/sys/kern/vfs_syscalls.c b/freebsd/sys/kern/vfs_syscalls.c new file mode 100644 index 00000000..06aaa935 --- /dev/null +++ b/freebsd/sys/kern/vfs_syscalls.c @@ -0,0 +1,4748 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_capsicum.h" +#include "opt_ktrace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#endif + +#include + +#include +#include + +#include +#include +#include +#include + +#include + +MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information"); + +SDT_PROVIDER_DEFINE(vfs); +SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int"); +SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int"); + +static int kern_chflagsat(struct thread *td, int fd, const char *path, + enum uio_seg pathseg, u_long flags, int atflag); +static int setfflags(struct thread *td, struct vnode *, u_long); +static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); +static int getutimens(const struct timespec *, enum uio_seg, + struct timespec *, int *); +static int setutimes(struct thread *td, struct vnode *, + const struct timespec *, int, int); +static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, + struct thread *td); +static int kern_fhlinkat(struct thread *td, int fd, const char *path, + enum uio_seg pathseg, fhandle_t *fhp); +static int kern_getfhat(struct thread *td, int flags, int fd, + const char *path, enum uio_seg pathseg, fhandle_t *fhp); +static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, + size_t count, struct thread *td); +static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, + const char *path, enum uio_seg segflag); + +/* + * Sync each mounted filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct sync_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +sys_sync(struct thread *td, struct sync_args *uap) +{ + struct mount *mp, *nmp; + int save; + + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0 && + vn_start_write(NULL, &mp, V_NOWAIT) == 0) { + save = curthread_pflags_set(TDP_SYNCIO); + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_NOWAIT); + curthread_pflags_restore(save); + vn_finished_write(mp); + } + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp); + } + mtx_unlock(&mountlist_mtx); + return (0); +} + +/* + * Change filesystem quotas. 
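The loop in sys_sync() above is the canonical way to walk the global mount list, so a distilled version may help when reading the getfsstat code further down (illustrative only; the helper name is invented). As the sys_sync() code shows, vfs_busy() called with MBF_NOWAIT | MBF_MNTLSTLOCK leaves mountlist_mtx held when it fails and drops it when it succeeds, so the next pointer is always read under the list lock:

static void
example_for_each_mount(void (*fn)(struct mount *))
{
        struct mount *mp, *nmp;

        mtx_lock(&mountlist_mtx);
        for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
                if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
                        /* Busy failed: the list lock is still held. */
                        nmp = TAILQ_NEXT(mp, mnt_list);
                        continue;
                }
                /* Busy succeeded: the list lock was dropped for us. */
                fn(mp);
                mtx_lock(&mountlist_mtx);
                nmp = TAILQ_NEXT(mp, mnt_list);
                vfs_unbusy(mp);
        }
        mtx_unlock(&mountlist_mtx);
}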
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +#endif +int +sys_quotactl(struct thread *td, struct quotactl_args *uap) +{ + struct mount *mp; + struct nameidata nd; + int error; + + AUDIT_ARG_CMD(uap->cmd); + AUDIT_ARG_UID(uap->uid); + if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS)) + return (EPERM); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, + uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + mp = nd.ni_vp->v_mount; + vfs_ref(mp); + vput(nd.ni_vp); + error = vfs_busy(mp, 0); + vfs_rel(mp); + if (error != 0) + return (error); + error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg); + + /* + * Since quota on operation typically needs to open quota + * file, the Q_QUOTAON handler needs to unbusy the mount point + * before calling into namei. Otherwise, unmount might be + * started between two vfs_busy() invocations (first is our, + * second is from mount point cross-walk code in lookup()), + * causing deadlock. + * + * Require that Q_QUOTAON handles the vfs_busy() reference on + * its own, always returning with ubusied mount point. + */ + if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON && + (uap->cmd >> SUBCMDSHIFT) != Q_QUOTAOFF) + vfs_unbusy(mp); + return (error); +} + +/* + * Used by statfs conversion routines to scale the block size up if + * necessary so that all of the block counts are <= 'max_size'. Note + * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero + * value of 'n'. + */ +void +statfs_scale_blocks(struct statfs *sf, long max_size) +{ + uint64_t count; + int shift; + + KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__)); + + /* + * Attempt to scale the block counts to give a more accurate + * overview to userland of the ratio of free space to used + * space. To do this, find the largest block count and compute + * a divisor that lets it fit into a signed integer <= max_size. + */ + if (sf->f_bavail < 0) + count = -sf->f_bavail; + else + count = sf->f_bavail; + count = MAX(sf->f_blocks, MAX(sf->f_bfree, count)); + if (count <= max_size) + return; + + count >>= flsl(max_size); + shift = 0; + while (count > 0) { + shift++; + count >>=1; + } + + sf->f_bsize <<= shift; + sf->f_blocks >>= shift; + sf->f_bfree >>= shift; + sf->f_bavail >>= shift; +} + +static int +kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf) +{ + struct statfs *sp; + int error; + + if (mp == NULL) + return (EBADF); + error = vfs_busy(mp, 0); + vfs_rel(mp); + if (error != 0) + return (error); +#ifdef MAC + error = mac_mount_check_stat(td->td_ucred, mp); + if (error != 0) + goto out; +#endif + /* + * Set these in case the underlying filesystem fails to do so. + */ + sp = &mp->mnt_stat; + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = VFS_STATFS(mp, sp); + if (error != 0) + goto out; + *buf = *sp; + if (priv_check(td, PRIV_VFS_GENERATION)) { + buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0; + prison_enforce_statfs(td->td_ucred, mp, buf); + } +out: + vfs_unbusy(mp); + return (error); +} + +/* + * Get filesystem statistics. 
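A worked example of statfs_scale_blocks(), assuming a 32-bit consumer such as the COMPAT_FREEBSD4 conversion further down (max_size = LONG_MAX = 0x7fffffff): a 16 TB volume with f_bsize = 4096 has f_blocks = 2^32, which does not fit. flsl(max_size) is 31, so the residual count is 2^32 >> 31 = 2 and the shift loop yields shift = 2; the volume is then reported with f_bsize = 16384 and f_blocks = 2^30, which fits while preserving the byte totals.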
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif +int +sys_statfs(struct thread *td, struct statfs_args *uap) +{ + struct statfs *sfp; + int error; + + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); + if (error == 0) + error = copyout(sfp, uap->buf, sizeof(struct statfs)); + free(sfp, M_STATFS); + return (error); +} + +int +kern_statfs(struct thread *td, char *path, enum uio_seg pathseg, + struct statfs *buf) +{ + struct mount *mp; + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, + pathseg, path, td); + error = namei(&nd); + if (error != 0) + return (error); + mp = nd.ni_vp->v_mount; + vfs_ref(mp); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_vp); + return (kern_do_statfs(td, mp, buf)); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif +int +sys_fstatfs(struct thread *td, struct fstatfs_args *uap) +{ + struct statfs *sfp; + int error; + + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_fstatfs(td, uap->fd, sfp); + if (error == 0) + error = copyout(sfp, uap->buf, sizeof(struct statfs)); + free(sfp, M_STATFS); + return (error); +} + +int +kern_fstatfs(struct thread *td, int fd, struct statfs *buf) +{ + struct file *fp; + struct mount *mp; + struct vnode *vp; + int error; + + AUDIT_ARG_FD(fd); + error = getvnode(td, fd, &cap_fstatfs_rights, &fp); + if (error != 0) + return (error); + vp = fp->f_vnode; + vn_lock(vp, LK_SHARED | LK_RETRY); +#ifdef AUDIT + AUDIT_ARG_VNODE1(vp); +#endif + mp = vp->v_mount; + if (mp != NULL) + vfs_ref(mp); + VOP_UNLOCK(vp, 0); + fdrop(fp, td); + return (kern_do_statfs(td, mp, buf)); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int mode; +}; +#endif +int +sys_getfsstat(struct thread *td, struct getfsstat_args *uap) +{ + size_t count; + int error; + + if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX) + return (EINVAL); + error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count, + UIO_USERSPACE, uap->mode); + if (error == 0) + td->td_retval[0] = count; + return (error); +} + +/* + * If (bufsize > 0 && bufseg == UIO_SYSSPACE) + * The caller is responsible for freeing memory which will be allocated + * in '*buf'. 
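A sketch of an in-kernel caller honouring that contract (the function name and the use of SIZE_MAX as an effectively unlimited size are assumptions of the sketch): with UIO_SYSSPACE the helper allocates the table itself and the caller must free it with the M_STATFS type.

static int
example_log_mounts(struct thread *td)
{
        struct statfs *buf, *sp;
        size_t count, i;
        int error;

        error = kern_getfsstat(td, &buf, SIZE_MAX, &count, UIO_SYSSPACE,
            MNT_NOWAIT);
        if (error != 0)
                return (error);
        for (i = 0, sp = buf; i < count; i++, sp++)
                printf("%s on %s (%s)\n", sp->f_mntfromname,
                    sp->f_mntonname, sp->f_fstypename);
        free(buf, M_STATFS);            /* caller owns the buffer */
        return (0);
}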
+ */ +int +kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, + size_t *countp, enum uio_seg bufseg, int mode) +{ + struct mount *mp, *nmp; + struct statfs *sfsp, *sp, *sptmp, *tofree; + size_t count, maxcount; + int error; + + switch (mode) { + case MNT_WAIT: + case MNT_NOWAIT: + break; + default: + if (bufseg == UIO_SYSSPACE) + *buf = NULL; + return (EINVAL); + } +restart: + maxcount = bufsize / sizeof(struct statfs); + if (bufsize == 0) { + sfsp = NULL; + tofree = NULL; + } else if (bufseg == UIO_USERSPACE) { + sfsp = *buf; + tofree = NULL; + } else /* if (bufseg == UIO_SYSSPACE) */ { + count = 0; + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + count++; + } + mtx_unlock(&mountlist_mtx); + if (maxcount > count) + maxcount = count; + tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs), + M_STATFS, M_WAITOK); + } + count = 0; + mtx_lock(&mountlist_mtx); + for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { + if (prison_canseemount(td->td_ucred, mp) != 0) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } +#ifdef MAC + if (mac_mount_check_stat(td->td_ucred, mp) != 0) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } +#endif + if (mode == MNT_WAIT) { + if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) { + /* + * If vfs_busy() failed, and MBF_NOWAIT + * wasn't passed, then the mp is gone. + * Furthermore, because of MBF_MNTLSTLOCK, + * the mountlist_mtx was dropped. We have + * no other choice than to start over. + */ + mtx_unlock(&mountlist_mtx); + free(tofree, M_STATFS); + goto restart; + } + } else { + if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { + nmp = TAILQ_NEXT(mp, mnt_list); + continue; + } + } + if (sfsp != NULL && count < maxcount) { + sp = &mp->mnt_stat; + /* + * Set these in case the underlying filesystem + * fails to do so. + */ + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + /* + * If MNT_NOWAIT is specified, do not refresh + * the fsstat cache. + */ + if (mode != MNT_NOWAIT) { + error = VFS_STATFS(mp, sp); + if (error != 0) { + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp); + continue; + } + } + if (priv_check(td, PRIV_VFS_GENERATION)) { + sptmp = malloc(sizeof(struct statfs), M_STATFS, + M_WAITOK); + *sptmp = *sp; + sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0; + prison_enforce_statfs(td->td_ucred, mp, sptmp); + sp = sptmp; + } else + sptmp = NULL; + if (bufseg == UIO_SYSSPACE) { + bcopy(sp, sfsp, sizeof(*sp)); + free(sptmp, M_STATFS); + } else /* if (bufseg == UIO_USERSPACE) */ { + error = copyout(sp, sfsp, sizeof(*sp)); + free(sptmp, M_STATFS); + if (error != 0) { + vfs_unbusy(mp); + return (error); + } + } + sfsp++; + } + count++; + mtx_lock(&mountlist_mtx); + nmp = TAILQ_NEXT(mp, mnt_list); + vfs_unbusy(mp); + } + mtx_unlock(&mountlist_mtx); + if (sfsp != NULL && count > maxcount) + *countp = maxcount; + else + *countp = count; + return (0); +} + +#ifdef COMPAT_FREEBSD4 +/* + * Get old format filesystem statistics. 
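For comparison, the usual userland pattern over the same syscall, a sizing call with a NULL buffer followed by the real call (minimal error handling; illustrative only). MNT_NOWAIT corresponds to the branch above that skips refreshing the per-mount statistics:

#include <sys/param.h>
#include <sys/mount.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        struct statfs *mntbuf;
        int i, n;

        n = getfsstat(NULL, 0, MNT_NOWAIT);     /* just count the mounts */
        if (n < 0)
                err(1, "getfsstat");
        mntbuf = calloc(n, sizeof(*mntbuf));
        if (mntbuf == NULL)
                err(1, "calloc");
        n = getfsstat(mntbuf, n * sizeof(*mntbuf), MNT_NOWAIT);
        for (i = 0; i < n; i++)
                printf("%s on %s (%s)\n", mntbuf[i].f_mntfromname,
                    mntbuf[i].f_mntonname, mntbuf[i].f_fstypename);
        free(mntbuf);
        return (0);
}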
+ */ +static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *); + +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_statfs_args { + char *path; + struct ostatfs *buf; +}; +#endif +int +freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap) +{ + struct ostatfs osb; + struct statfs *sfp; + int error; + + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); + if (error == 0) { + freebsd4_cvtstatfs(sfp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + } + free(sfp, M_STATFS); + return (error); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_fstatfs_args { + int fd; + struct ostatfs *buf; +}; +#endif +int +freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap) +{ + struct ostatfs osb; + struct statfs *sfp; + int error; + + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_fstatfs(td, uap->fd, sfp); + if (error == 0) { + freebsd4_cvtstatfs(sfp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + } + free(sfp, M_STATFS); + return (error); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_getfsstat_args { + struct ostatfs *buf; + long bufsize; + int mode; +}; +#endif +int +freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap) +{ + struct statfs *buf, *sp; + struct ostatfs osb; + size_t count, size; + int error; + + if (uap->bufsize < 0) + return (EINVAL); + count = uap->bufsize / sizeof(struct ostatfs); + if (count > SIZE_MAX / sizeof(struct statfs)) + return (EINVAL); + size = count * sizeof(struct statfs); + error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, + uap->mode); + if (error == 0) + td->td_retval[0] = count; + if (size != 0) { + sp = buf; + while (count != 0 && error == 0) { + freebsd4_cvtstatfs(sp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + sp++; + uap->buf++; + count--; + } + free(buf, M_STATFS); + } + return (error); +} + +/* + * Implement fstatfs() for (NFS) file handles. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd4_fhstatfs_args { + struct fhandle *u_fhp; + struct ostatfs *buf; +}; +#endif +int +freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap) +{ + struct ostatfs osb; + struct statfs *sfp; + fhandle_t fh; + int error; + + error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); + if (error != 0) + return (error); + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_fhstatfs(td, fh, sfp); + if (error == 0) { + freebsd4_cvtstatfs(sfp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + } + free(sfp, M_STATFS); + return (error); +} + +/* + * Convert a new format statfs structure to an old format statfs structure. 
+ */ +static void +freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp) +{ + + statfs_scale_blocks(nsp, LONG_MAX); + bzero(osp, sizeof(*osp)); + osp->f_bsize = nsp->f_bsize; + osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); + osp->f_blocks = nsp->f_blocks; + osp->f_bfree = nsp->f_bfree; + osp->f_bavail = nsp->f_bavail; + osp->f_files = MIN(nsp->f_files, LONG_MAX); + osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); + osp->f_owner = nsp->f_owner; + osp->f_type = nsp->f_type; + osp->f_flags = nsp->f_flags; + osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); + osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); + osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); + osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); + strlcpy(osp->f_fstypename, nsp->f_fstypename, + MIN(MFSNAMELEN, OMFSNAMELEN)); + strlcpy(osp->f_mntonname, nsp->f_mntonname, + MIN(MNAMELEN, OMNAMELEN)); + strlcpy(osp->f_mntfromname, nsp->f_mntfromname, + MIN(MNAMELEN, OMNAMELEN)); + osp->f_fsid = nsp->f_fsid; +} +#endif /* COMPAT_FREEBSD4 */ + +#if defined(COMPAT_FREEBSD11) +/* + * Get old format filesystem statistics. + */ +static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *); + +int +freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap) +{ + struct freebsd11_statfs osb; + struct statfs *sfp; + int error; + + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); + if (error == 0) { + freebsd11_cvtstatfs(sfp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + } + free(sfp, M_STATFS); + return (error); +} + +/* + * Get filesystem statistics. + */ +int +freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap) +{ + struct freebsd11_statfs osb; + struct statfs *sfp; + int error; + + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_fstatfs(td, uap->fd, sfp); + if (error == 0) { + freebsd11_cvtstatfs(sfp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + } + free(sfp, M_STATFS); + return (error); +} + +/* + * Get statistics on all filesystems. + */ +int +freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap) +{ + struct freebsd11_statfs osb; + struct statfs *buf, *sp; + size_t count, size; + int error; + + count = uap->bufsize / sizeof(struct ostatfs); + size = count * sizeof(struct statfs); + error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, + uap->mode); + if (error == 0) + td->td_retval[0] = count; + if (size > 0) { + sp = buf; + while (count > 0 && error == 0) { + freebsd11_cvtstatfs(sp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + sp++; + uap->buf++; + count--; + } + free(buf, M_STATFS); + } + return (error); +} + +/* + * Implement fstatfs() for (NFS) file handles. + */ +int +freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap) +{ + struct freebsd11_statfs osb; + struct statfs *sfp; + fhandle_t fh; + int error; + + error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); + if (error) + return (error); + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_fhstatfs(td, fh, sfp); + if (error == 0) { + freebsd11_cvtstatfs(sfp, &osb); + error = copyout(&osb, uap->buf, sizeof(osb)); + } + free(sfp, M_STATFS); + return (error); +} + +/* + * Convert a new format statfs structure to an old format statfs structure. 
+ */ +static void +freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp) +{ + + bzero(osp, sizeof(*osp)); + osp->f_version = FREEBSD11_STATFS_VERSION; + osp->f_type = nsp->f_type; + osp->f_flags = nsp->f_flags; + osp->f_bsize = nsp->f_bsize; + osp->f_iosize = nsp->f_iosize; + osp->f_blocks = nsp->f_blocks; + osp->f_bfree = nsp->f_bfree; + osp->f_bavail = nsp->f_bavail; + osp->f_files = nsp->f_files; + osp->f_ffree = nsp->f_ffree; + osp->f_syncwrites = nsp->f_syncwrites; + osp->f_asyncwrites = nsp->f_asyncwrites; + osp->f_syncreads = nsp->f_syncreads; + osp->f_asyncreads = nsp->f_asyncreads; + osp->f_namemax = nsp->f_namemax; + osp->f_owner = nsp->f_owner; + osp->f_fsid = nsp->f_fsid; + strlcpy(osp->f_fstypename, nsp->f_fstypename, + MIN(MFSNAMELEN, sizeof(osp->f_fstypename))); + strlcpy(osp->f_mntonname, nsp->f_mntonname, + MIN(MNAMELEN, sizeof(osp->f_mntonname))); + strlcpy(osp->f_mntfromname, nsp->f_mntfromname, + MIN(MNAMELEN, sizeof(osp->f_mntfromname))); +} +#endif /* COMPAT_FREEBSD11 */ + +/* + * Change current working directory to a given file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif +int +sys_fchdir(struct thread *td, struct fchdir_args *uap) +{ + struct vnode *vp, *tdp; + struct mount *mp; + struct file *fp; + int error; + + AUDIT_ARG_FD(uap->fd); + error = getvnode(td, uap->fd, &cap_fchdir_rights, + &fp); + if (error != 0) + return (error); + vp = fp->f_vnode; + vrefact(vp); + fdrop(fp, td); + vn_lock(vp, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + error = change_dir(vp, td); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0)) + continue; + error = VFS_ROOT(mp, LK_SHARED, &tdp); + vfs_unbusy(mp); + if (error != 0) + break; + vput(vp); + vp = tdp; + } + if (error != 0) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0); + pwd_chdir(td, vp); + return (0); +} + +/* + * Change current working directory (``.''). + */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif +int +sys_chdir(struct thread *td, struct chdir_args *uap) +{ + + return (kern_chdir(td, uap->path, UIO_USERSPACE)); +} + +int +kern_chdir(struct thread *td, char *path, enum uio_seg pathseg) +{ + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, + pathseg, path, td); + if ((error = namei(&nd)) != 0) + return (error); + if ((error = change_dir(nd.ni_vp, td)) != 0) { + vput(nd.ni_vp); + NDFREE(&nd, NDF_ONLY_PNBUF); + return (error); + } + VOP_UNLOCK(nd.ni_vp, 0); + NDFREE(&nd, NDF_ONLY_PNBUF); + pwd_chdir(td, nd.ni_vp); + return (0); +} + +/* + * Change notion of root (``/'') directory. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif +int +sys_chroot(struct thread *td, struct chroot_args *uap) +{ + struct nameidata nd; + int error; + + error = priv_check(td, PRIV_VFS_CHROOT); + if (error != 0) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, + UIO_USERSPACE, uap->path, td); + error = namei(&nd); + if (error != 0) + goto error; + error = change_dir(nd.ni_vp, td); + if (error != 0) + goto e_vunlock; +#ifdef MAC + error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp); + if (error != 0) + goto e_vunlock; +#endif + VOP_UNLOCK(nd.ni_vp, 0); + error = pwd_chroot(td, nd.ni_vp); + vrele(nd.ni_vp); + NDFREE(&nd, NDF_ONLY_PNBUF); + return (error); +e_vunlock: + vput(nd.ni_vp); +error: + NDFREE(&nd, NDF_ONLY_PNBUF); + return (error); +} + +/* + * Common routine for chroot and chdir. 
Callers must provide a locked vnode + * instance. + */ +int +change_dir(struct vnode *vp, struct thread *td) +{ +#ifdef MAC + int error; +#endif + + ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); + if (vp->v_type != VDIR) + return (ENOTDIR); +#ifdef MAC + error = mac_vnode_check_chdir(td->td_ucred, vp); + if (error != 0) + return (error); +#endif + return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td)); +} + +static __inline void +flags_to_rights(int flags, cap_rights_t *rightsp) +{ + + if (flags & O_EXEC) { + cap_rights_set(rightsp, CAP_FEXECVE); + } else { + switch ((flags & O_ACCMODE)) { + case O_RDONLY: + cap_rights_set(rightsp, CAP_READ); + break; + case O_RDWR: + cap_rights_set(rightsp, CAP_READ); + /* FALLTHROUGH */ + case O_WRONLY: + cap_rights_set(rightsp, CAP_WRITE); + if (!(flags & (O_APPEND | O_TRUNC))) + cap_rights_set(rightsp, CAP_SEEK); + break; + } + } + + if (flags & O_CREAT) + cap_rights_set(rightsp, CAP_CREATE); + + if (flags & O_TRUNC) + cap_rights_set(rightsp, CAP_FTRUNCATE); + + if (flags & (O_SYNC | O_FSYNC)) + cap_rights_set(rightsp, CAP_FSYNC); + + if (flags & (O_EXLOCK | O_SHLOCK)) + cap_rights_set(rightsp, CAP_FLOCK); +} + +/* + * Check permissions, allocate an open file structure, and call the device + * open routine if any. + */ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif +int +sys_open(struct thread *td, struct open_args *uap) +{ + + return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->flags, uap->mode)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct openat_args { + int fd; + char *path; + int flag; + int mode; +}; +#endif +int +sys_openat(struct thread *td, struct openat_args *uap) +{ + + AUDIT_ARG_FD(uap->fd); + return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, + uap->mode)); +} + +int +kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + int flags, int mode) +{ + struct proc *p = td->td_proc; + struct filedesc *fdp = p->p_fd; + struct file *fp; + struct vnode *vp; + struct nameidata nd; + cap_rights_t rights; + int cmode, error, indx; + + indx = -1; + + AUDIT_ARG_FFLAGS(flags); + AUDIT_ARG_MODE(mode); + cap_rights_init(&rights, CAP_LOOKUP); + flags_to_rights(flags, &rights); + /* + * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags + * may be specified. + */ + if (flags & O_EXEC) { + if (flags & O_ACCMODE) + return (EINVAL); + } else if ((flags & O_ACCMODE) == O_ACCMODE) { + return (EINVAL); + } else { + flags = FFLAGS(flags); + } + + /* + * Allocate a file structure. The descriptor to reference it + * is allocated and set by finstall() below. + */ + error = falloc_noinstall(td, &fp); + if (error != 0) + return (error); + /* + * An extra reference on `fp' has been held for us by + * falloc_noinstall(). + */ + /* Set the flags early so the finit in devfs can pick them up. */ + fp->f_flag = flags & FMASK; + cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT; + NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, + &rights, td); + td->td_dupfd = -1; /* XXX check for fdopen */ + error = vn_open(&nd, &flags, cmode, fp); + if (error != 0) { + /* + * If the vn_open replaced the method vector, something + * wonderous happened deep below and we just pass it up + * pretending we know what we do. + */ + if (error == ENXIO && fp->f_ops != &badfileops) + goto success; + + /* + * Handle special fdopen() case. bleh. 
+ * + * Don't do this for relative (capability) lookups; we don't + * understand exactly what would happen, and we don't think + * that it ever should. + */ + if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) == 0 && + (error == ENODEV || error == ENXIO) && + td->td_dupfd >= 0) { + error = dupfdopen(td, fdp, td->td_dupfd, flags, error, + &indx); + if (error == 0) + goto success; + } + + goto bad; + } + td->td_dupfd = 0; + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + + /* + * Store the vnode, for any f_type. Typically, the vnode use + * count is decremented by direct call to vn_closefile() for + * files that switched type in the cdevsw fdopen() method. + */ + fp->f_vnode = vp; + /* + * If the file wasn't claimed by devfs bind it to the normal + * vnode operations here. + */ + if (fp->f_ops == &badfileops) { + KASSERT(vp->v_type != VFIFO, ("Unexpected fifo.")); + fp->f_seqcount = 1; + finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK), + DTYPE_VNODE, vp, &vnops); + } + + VOP_UNLOCK(vp, 0); + if (flags & O_TRUNC) { + error = fo_truncate(fp, 0, td->td_ucred, td); + if (error != 0) + goto bad; + } +success: + /* + * If we haven't already installed the FD (for dupfdopen), do so now. + */ + if (indx == -1) { + struct filecaps *fcaps; + +#ifdef CAPABILITIES + if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) != 0) + fcaps = &nd.ni_filecaps; + else +#endif + fcaps = NULL; + error = finstall(td, fp, &indx, flags, fcaps); + /* On success finstall() consumes fcaps. */ + if (error != 0) { + filecaps_free(&nd.ni_filecaps); + goto bad; + } + } else { + filecaps_free(&nd.ni_filecaps); + } + + /* + * Release our private reference, leaving the one associated with + * the descriptor table intact. + */ + fdrop(fp, td); + td->td_retval[0] = indx; + return (0); +bad: + KASSERT(indx == -1, ("indx=%d, should be -1", indx)); + fdrop(fp, td); + return (error); +} + +#ifdef COMPAT_43 +/* + * Create a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif +int +ocreat(struct thread *td, struct ocreat_args *uap) +{ + + return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + O_WRONLY | O_CREAT | O_TRUNC, uap->mode)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. 
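A small userland illustration of how the rights computed by flags_to_rights() and the filecaps handling above interact with Capsicum (helper name invented; assumes a capability-enabled FreeBSD target). A descriptor limited to CAP_LOOKUP | CAP_READ still satisfies an O_RDONLY openat(), while an O_RDWR or O_CREAT open through it should fail with ENOTCAPABLE because CAP_WRITE and CAP_CREATE are then required:

#include <sys/capsicum.h>
#include <fcntl.h>

static int
open_ro_below(int dfd, const char *name)
{
        cap_rights_t rights;

        cap_rights_init(&rights, CAP_LOOKUP, CAP_READ);
        if (cap_rights_limit(dfd, &rights) < 0)
                return (-1);
        /* flags_to_rights(O_RDONLY) asks only for CAP_READ. */
        return (openat(dfd, name, O_RDONLY));
}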
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mknodat_args { + int fd; + char *path; + mode_t mode; + dev_t dev; +}; +#endif +int +sys_mknodat(struct thread *td, struct mknodat_args *uap) +{ + + return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, + uap->dev)); +} + +#if defined(COMPAT_FREEBSD11) +int +freebsd11_mknod(struct thread *td, + struct freebsd11_mknod_args *uap) +{ + + return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->mode, uap->dev)); +} + +int +freebsd11_mknodat(struct thread *td, + struct freebsd11_mknodat_args *uap) +{ + + return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, + uap->dev)); +} +#endif /* COMPAT_FREEBSD11 */ + +int +kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + int mode, dev_t dev) +{ + struct vnode *vp; + struct mount *mp; + struct vattr vattr; + struct nameidata nd; + int error, whiteout = 0; + + AUDIT_ARG_MODE(mode); + AUDIT_ARG_DEV(dev); + switch (mode & S_IFMT) { + case S_IFCHR: + case S_IFBLK: + error = priv_check(td, PRIV_VFS_MKNOD_DEV); + if (error == 0 && dev == VNOVAL) + error = EINVAL; + break; + case S_IFWHT: + error = priv_check(td, PRIV_VFS_MKNOD_WHT); + break; + case S_IFIFO: + if (dev == 0) + return (kern_mkfifoat(td, fd, path, pathseg, mode)); + /* FALLTHROUGH */ + default: + error = EINVAL; + break; + } + if (error != 0) + return (error); +restart: + bwillwrite(); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | + NOCACHE, pathseg, path, fd, &cap_mknodat_rights, + td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EEXIST); + } else { + VATTR_NULL(&vattr); + vattr.va_mode = (mode & ALLPERMS) & + ~td->td_proc->p_fd->fd_cmask; + vattr.va_rdev = dev; + whiteout = 0; + + switch (mode & S_IFMT) { + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + panic("kern_mknod: invalid mode"); + } + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } +#ifdef MAC + if (error == 0 && !whiteout) + error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, + &nd.ni_cnd, &vattr); +#endif + if (error == 0) { + if (whiteout) + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + &nd.ni_cnd, &vattr); + if (error == 0) + vput(nd.ni_vp); + } + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + return (error); +} + +/* + * Create a named pipe. 
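A usage note grounded in the S_IFIFO branch of kern_mknodat() above (paths invented): with dev == 0 the request is forwarded to kern_mkfifoat(), so the two calls below behave identically and neither needs the PRIV_VFS_MKNOD_DEV privilege required for character and block nodes.

#include <sys/stat.h>

static void
make_fifos(void)
{
        /* Both end up in the S_IFIFO path of kern_mknodat(). */
        (void)mknod("/tmp/fifo0", S_IFIFO | 0600, 0);
        (void)mkfifo("/tmp/fifo1", 0600);
}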
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif +int +sys_mkfifo(struct thread *td, struct mkfifo_args *uap) +{ + + return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->mode)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct mkfifoat_args { + int fd; + char *path; + mode_t mode; +}; +#endif +int +sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap) +{ + + return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE, + uap->mode)); +} + +int +kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + int mode) +{ + struct mount *mp; + struct vattr vattr; + struct nameidata nd; + int error; + + AUDIT_ARG_MODE(mode); +restart: + bwillwrite(); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | + NOCACHE, pathseg, path, fd, &cap_mkfifoat_rights, + td); + if ((error = namei(&nd)) != 0) + return (error); + if (nd.ni_vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; +#ifdef MAC + error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, + &vattr); + if (error != 0) + goto out; +#endif + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + if (error == 0) + vput(nd.ni_vp); +#ifdef MAC +out: +#endif + vput(nd.ni_dvp); + vn_finished_write(mp); + NDFREE(&nd, NDF_ONLY_PNBUF); + return (error); +} + +/* + * Make a hard file link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif +int +sys_link(struct thread *td, struct link_args *uap) +{ + + return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link, + UIO_USERSPACE, FOLLOW)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct linkat_args { + int fd1; + char *path1; + int fd2; + char *path2; + int flag; +}; +#endif +int +sys_linkat(struct thread *td, struct linkat_args *uap) +{ + int flag; + + flag = uap->flag; + if (flag & ~AT_SYMLINK_FOLLOW) + return (EINVAL); + + return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2, + UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? 
FOLLOW : NOFOLLOW)); +} + +int hardlink_check_uid = 0; +SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW, + &hardlink_check_uid, 0, + "Unprivileged processes cannot create hard links to files owned by other " + "users"); +static int hardlink_check_gid = 0; +SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW, + &hardlink_check_gid, 0, + "Unprivileged processes cannot create hard links to files owned by other " + "groups"); + +static int +can_hardlink(struct vnode *vp, struct ucred *cred) +{ + struct vattr va; + int error; + + if (!hardlink_check_uid && !hardlink_check_gid) + return (0); + + error = VOP_GETATTR(vp, &va, cred); + if (error != 0) + return (error); + + if (hardlink_check_uid && cred->cr_uid != va.va_uid) { + error = priv_check_cred(cred, PRIV_VFS_LINK, 0); + if (error != 0) + return (error); + } + + if (hardlink_check_gid && !groupmember(va.va_gid, cred)) { + error = priv_check_cred(cred, PRIV_VFS_LINK, 0); + if (error != 0) + return (error); + } + + return (0); +} + +int +kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2, + enum uio_seg segflag, int follow) +{ + struct nameidata nd; + int error; + + do { + bwillwrite(); + NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, segflag, + path1, fd1, &cap_linkat_source_rights, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag); + } while (error == EAGAIN); + return (error); +} + +static int +kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path, + enum uio_seg segflag) +{ + struct nameidata nd; + struct mount *mp; + int error; + + if (vp->v_type == VDIR) { + vrele(vp); + return (EPERM); /* POSIX */ + } + NDINIT_ATRIGHTS(&nd, CREATE, + LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflag, path, fd, + &cap_linkat_target_rights, td); + if ((error = namei(&nd)) == 0) { + if (nd.ni_vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + vrele(vp); + return (EEXIST); + } else if (nd.ni_dvp->v_mount != vp->v_mount) { + /* + * Cross-device link. No need to recheck + * vp->v_type, since it cannot change, except + * to VBAD. + */ + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vrele(vp); + return (EXDEV); + } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) { + error = can_hardlink(vp, td->td_ucred); +#ifdef MAC + if (error == 0) + error = mac_vnode_check_link(td->td_ucred, + nd.ni_dvp, vp, &nd.ni_cnd); +#endif + if (error != 0) { + vput(vp); + vput(nd.ni_dvp); + NDFREE(&nd, NDF_ONLY_PNBUF); + return (error); + } + error = vn_start_write(vp, &mp, V_NOWAIT); + if (error != 0) { + vput(vp); + vput(nd.ni_dvp); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = vn_start_write(NULL, &mp, + V_XSLEEP | PCATCH); + if (error != 0) + return (error); + return (EAGAIN); + } + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + VOP_UNLOCK(vp, 0); + vput(nd.ni_dvp); + vn_finished_write(mp); + NDFREE(&nd, NDF_ONLY_PNBUF); + } else { + vput(nd.ni_dvp); + NDFREE(&nd, NDF_ONLY_PNBUF); + vrele(vp); + return (EAGAIN); + } + } + vrele(vp); + return (error); +} + +/* + * Make a symbolic link. 
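The security.bsd.hardlink_check_uid and hardlink_check_gid knobs above make can_hardlink() demand PRIV_VFS_LINK for cross-owner or cross-group links. At the system call boundary the only flag is AT_SYMLINK_FOLLOW, which selects the FOLLOW or NOFOLLOW lookup seen in sys_linkat(); a short userland sketch with invented file names:

#include <fcntl.h>
#include <unistd.h>

static void
link_both_ways(void)
{
        /* New name refers to the symlink itself (NOFOLLOW lookup). */
        (void)linkat(AT_FDCWD, "alias.lnk", AT_FDCWD, "to-symlink", 0);
        /* New name refers to whatever the symlink resolves to. */
        (void)linkat(AT_FDCWD, "alias.lnk", AT_FDCWD, "to-target",
            AT_SYMLINK_FOLLOW);
}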
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif +int +sys_symlink(struct thread *td, struct symlink_args *uap) +{ + + return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link, + UIO_USERSPACE)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct symlinkat_args { + char *path; + int fd; + char *path2; +}; +#endif +int +sys_symlinkat(struct thread *td, struct symlinkat_args *uap) +{ + + return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2, + UIO_USERSPACE)); +} + +int +kern_symlinkat(struct thread *td, char *path1, int fd, char *path2, + enum uio_seg segflg) +{ + struct mount *mp; + struct vattr vattr; + char *syspath; + struct nameidata nd; + int error; + + if (segflg == UIO_SYSSPACE) { + syspath = path1; + } else { + syspath = uma_zalloc(namei_zone, M_WAITOK); + if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0) + goto out; + } + AUDIT_ARG_TEXT(syspath); +restart: + bwillwrite(); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | + NOCACHE, segflg, path2, fd, &cap_symlinkat_rights, + td); + if ((error = namei(&nd)) != 0) + goto out; + if (nd.ni_vp) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + error = EEXIST; + goto out; + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + goto out; + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; +#ifdef MAC + vattr.va_type = VLNK; + error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, + &vattr); + if (error != 0) + goto out2; +#endif + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); + if (error == 0) + vput(nd.ni_vp); +#ifdef MAC +out2: +#endif + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); +out: + if (segflg != UIO_SYSSPACE) + uma_zfree(namei_zone, syspath); + return (error); +} + +/* + * Delete a whiteout from the filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct undelete_args { + char *path; +}; +#endif +int +sys_undelete(struct thread *td, struct undelete_args *uap) +{ + struct mount *mp; + struct nameidata nd; + int error; + +restart: + bwillwrite(); + NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1, + UIO_USERSPACE, uap->path, td); + error = namei(&nd); + if (error != 0) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + vn_finished_write(mp); + return (error); +} + +/* + * Delete a name from the filesystem. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif +int +sys_unlink(struct thread *td, struct unlink_args *uap) +{ + + return (kern_unlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct unlinkat_args { + int fd; + char *path; + int flag; +}; +#endif +int +sys_unlinkat(struct thread *td, struct unlinkat_args *uap) +{ + int flag = uap->flag; + int fd = uap->fd; + char *path = uap->path; + + if (flag & ~AT_REMOVEDIR) + return (EINVAL); + + if (flag & AT_REMOVEDIR) + return (kern_rmdirat(td, fd, path, UIO_USERSPACE)); + else + return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0)); +} + +int +kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + ino_t oldinum) +{ + struct mount *mp; + struct vnode *vp; + struct nameidata nd; + struct stat sb; + int error; + +restart: + bwillwrite(); + NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1, + pathseg, path, fd, &cap_unlinkat_rights, td); + if ((error = namei(&nd)) != 0) + return (error == EINVAL ? EPERM : error); + vp = nd.ni_vp; + if (vp->v_type == VDIR && oldinum == 0) { + error = EPERM; /* POSIX */ + } else if (oldinum != 0 && + ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) && + sb.st_ino != oldinum) { + error = EIDRM; /* Identifier removed */ + } else { + /* + * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? + */ + if (vp->v_vflag & VV_ROOT) + error = EBUSY; + } + if (error == 0) { + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (vp == nd.ni_dvp) + vrele(vp); + else + vput(vp); + if ((error = vn_start_write(NULL, &mp, + V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } +#ifdef MAC + error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, + &nd.ni_cnd); + if (error != 0) + goto out; +#endif + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); +#ifdef MAC +out: +#endif + vn_finished_write(mp); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (vp == nd.ni_dvp) + vrele(vp); + else + vput(vp); + return (error); +} + +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif +int +sys_lseek(struct thread *td, struct lseek_args *uap) +{ + + return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); +} + +int +kern_lseek(struct thread *td, int fd, off_t offset, int whence) +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(fd); + error = fget(td, fd, &cap_seek_rights, &fp); + if (error != 0) + return (error); + error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ? + fo_seek(fp, offset, whence, td) : ESPIPE; + fdrop(fp, td); + return (error); +} + +#if defined(COMPAT_43) +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif +int +olseek(struct thread *td, struct olseek_args *uap) +{ + + return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_FREEBSD6) +/* Version with the 'pad' argument */ +int +freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap) +{ + + return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); +} +#endif + +/* + * Check access permissions using passed credentials. 
+ */ +static int +vn_access(struct vnode *vp, int user_flags, struct ucred *cred, + struct thread *td) +{ + accmode_t accmode; + int error; + + /* Flags == 0 means only check for existence. */ + if (user_flags == 0) + return (0); + + accmode = 0; + if (user_flags & R_OK) + accmode |= VREAD; + if (user_flags & W_OK) + accmode |= VWRITE; + if (user_flags & X_OK) + accmode |= VEXEC; +#ifdef MAC + error = mac_vnode_check_access(cred, vp, accmode); + if (error != 0) + return (error); +#endif + if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, accmode, cred, td); + return (error); +} + +/* + * Check access permissions using "real" credentials. + */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int amode; +}; +#endif +int +sys_access(struct thread *td, struct access_args *uap) +{ + + return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + 0, uap->amode)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct faccessat_args { + int dirfd; + char *path; + int amode; + int flag; +} +#endif +int +sys_faccessat(struct thread *td, struct faccessat_args *uap) +{ + + return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, + uap->amode)); +} + +int +kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + int flag, int amode) +{ + struct ucred *cred, *usecred; + struct vnode *vp; + struct nameidata nd; + int error; + + if (flag & ~AT_EACCESS) + return (EINVAL); + if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0) + return (EINVAL); + + /* + * Create and modify a temporary credential instead of one that + * is potentially shared (if we need one). + */ + cred = td->td_ucred; + if ((flag & AT_EACCESS) == 0 && + ((cred->cr_uid != cred->cr_ruid || + cred->cr_rgid != cred->cr_groups[0]))) { + usecred = crdup(cred); + usecred->cr_uid = cred->cr_ruid; + usecred->cr_groups[0] = cred->cr_rgid; + td->td_ucred = usecred; + } else + usecred = cred; + AUDIT_ARG_VALUE(amode); + NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | + AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights, + td); + if ((error = namei(&nd)) != 0) + goto out; + vp = nd.ni_vp; + + error = vn_access(vp, amode, usecred, td); + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); +out: + if (usecred != cred) { + td->td_ucred = cred; + crfree(usecred); + } + return (error); +} + +/* + * Check access permissions using "effective" credentials. + */ +#ifndef _SYS_SYSPROTO_H_ +struct eaccess_args { + char *path; + int amode; +}; +#endif +int +sys_eaccess(struct thread *td, struct eaccess_args *uap) +{ + + return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + AT_EACCESS, uap->amode)); +} + +#if defined(COMPAT_43) +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif +int +ostat(struct thread *td, struct ostat_args *uap) +{ + struct stat sb; + struct ostat osb; + int error; + + error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, + &sb, NULL); + if (error != 0) + return (error); + cvtstat(&sb, &osb); + return (copyout(&osb, uap->ub, sizeof (osb))); +} + +/* + * Get file status; this version does not follow links. 
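The temporary credential built with crdup() in kern_accessat() is what gives these checks their real-uid semantics; AT_EACCESS skips it and tests the effective credentials instead, matching eaccess(2). A userland illustration (the path is invented, and the two calls only differ inside set-uid or set-gid programs):

#include <fcntl.h>
#include <unistd.h>

static void
check_both(void)
{
        /* Checked with a temporary real-uid/real-gid credential. */
        (void)faccessat(AT_FDCWD, "/var/db/example.db", R_OK, 0);
        /* Checked with the effective credentials. */
        (void)faccessat(AT_FDCWD, "/var/db/example.db", R_OK, AT_EACCESS);
}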
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif +int +olstat(struct thread *td, struct olstat_args *uap) +{ + struct stat sb; + struct ostat osb; + int error; + + error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, + UIO_USERSPACE, &sb, NULL); + if (error != 0) + return (error); + cvtstat(&sb, &osb); + return (copyout(&osb, uap->ub, sizeof (osb))); +} + +/* + * Convert from an old to a new stat structure. + * XXX: many values are blindly truncated. + */ +void +cvtstat(struct stat *st, struct ostat *ost) +{ + + bzero(ost, sizeof(*ost)); + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + ost->st_size = MIN(st->st_size, INT32_MAX); + ost->st_atim = st->st_atim; + ost->st_mtim = st->st_mtim; + ost->st_ctim = st->st_ctim; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) +int ino64_trunc_error; +SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW, + &ino64_trunc_error, 0, + "Error on truncation of device, file or inode number, or link count"); + +int +freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost) +{ + + ost->st_dev = st->st_dev; + if (ost->st_dev != st->st_dev) { + switch (ino64_trunc_error) { + default: + /* + * Since dev_t is almost raw, don't clamp to the + * maximum for case 2, but ignore the error. + */ + break; + case 1: + return (EOVERFLOW); + } + } + ost->st_ino = st->st_ino; + if (ost->st_ino != st->st_ino) { + switch (ino64_trunc_error) { + default: + case 0: + break; + case 1: + return (EOVERFLOW); + case 2: + ost->st_ino = UINT32_MAX; + break; + } + } + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + if (ost->st_nlink != st->st_nlink) { + switch (ino64_trunc_error) { + default: + case 0: + break; + case 1: + return (EOVERFLOW); + case 2: + ost->st_nlink = UINT16_MAX; + break; + } + } + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (ost->st_rdev != st->st_rdev) { + switch (ino64_trunc_error) { + default: + break; + case 1: + return (EOVERFLOW); + } + } + ost->st_atim = st->st_atim; + ost->st_mtim = st->st_mtim; + ost->st_ctim = st->st_ctim; + ost->st_size = st->st_size; + ost->st_blocks = st->st_blocks; + ost->st_blksize = st->st_blksize; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; + ost->st_lspare = 0; + ost->st_birthtim = st->st_birthtim; + bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim), + sizeof(*ost) - offsetof(struct freebsd11_stat, + st_birthtim) - sizeof(ost->st_birthtim)); + return (0); +} + +int +freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap) +{ + struct stat sb; + struct freebsd11_stat osb; + int error; + + error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, + &sb, NULL); + if (error != 0) + return (error); + error = freebsd11_cvtstat(&sb, &osb); + if (error == 0) + error = copyout(&osb, uap->ub, sizeof(osb)); + return (error); +} + +int +freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap) +{ + struct stat sb; + struct freebsd11_stat osb; + int error; + + error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, + UIO_USERSPACE, &sb, NULL); + if (error != 0) + return (error); + error = freebsd11_cvtstat(&sb, &osb); + if (error == 0) + 
error = copyout(&osb, uap->ub, sizeof(osb)); + return (error); +} + +int +freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap) +{ + struct fhandle fh; + struct stat sb; + struct freebsd11_stat osb; + int error; + + error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); + if (error != 0) + return (error); + error = kern_fhstat(td, fh, &sb); + if (error != 0) + return (error); + error = freebsd11_cvtstat(&sb, &osb); + if (error == 0) + error = copyout(&osb, uap->sb, sizeof(osb)); + return (error); +} + +int +freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap) +{ + struct stat sb; + struct freebsd11_stat osb; + int error; + + error = kern_statat(td, uap->flag, uap->fd, uap->path, + UIO_USERSPACE, &sb, NULL); + if (error != 0) + return (error); + error = freebsd11_cvtstat(&sb, &osb); + if (error == 0) + error = copyout(&osb, uap->buf, sizeof(osb)); + return (error); +} +#endif /* COMPAT_FREEBSD11 */ + +/* + * Get file status + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatat_args { + int fd; + char *path; + struct stat *buf; + int flag; +} +#endif +int +sys_fstatat(struct thread *td, struct fstatat_args *uap) +{ + struct stat sb; + int error; + + error = kern_statat(td, uap->flag, uap->fd, uap->path, + UIO_USERSPACE, &sb, NULL); + if (error == 0) + error = copyout(&sb, uap->buf, sizeof (sb)); + return (error); +} + +int +kern_statat(struct thread *td, int flag, int fd, char *path, + enum uio_seg pathseg, struct stat *sbp, + void (*hook)(struct vnode *vp, struct stat *sbp)) +{ + struct nameidata nd; + struct stat sb; + int error; + + if (flag & ~AT_SYMLINK_NOFOLLOW) + return (EINVAL); + + NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : + FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, + &cap_fstat_rights, td); + + if ((error = namei(&nd)) != 0) + return (error); + error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); + if (error == 0) { + SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode); + if (S_ISREG(sb.st_mode)) + SDT_PROBE2(vfs, , stat, reg, path, pathseg); + if (__predict_false(hook != NULL)) + hook(nd.ni_vp, &sb); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_vp); + if (error != 0) + return (error); +#ifdef __STAT_TIME_T_EXT + sb.st_atim_ext = 0; + sb.st_mtim_ext = 0; + sb.st_ctim_ext = 0; + sb.st_btim_ext = 0; +#endif + *sbp = sb; +#ifdef KTRACE + if (KTRPOINT(td, KTR_STRUCT)) + ktrstat(&sb); +#endif + return (0); +} + +#if defined(COMPAT_FREEBSD11) +/* + * Implementation of the NetBSD [l]stat() functions. 
+ */ +void +freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb) +{ + + bzero(nsb, sizeof(*nsb)); + nsb->st_dev = sb->st_dev; + nsb->st_ino = sb->st_ino; + nsb->st_mode = sb->st_mode; + nsb->st_nlink = sb->st_nlink; + nsb->st_uid = sb->st_uid; + nsb->st_gid = sb->st_gid; + nsb->st_rdev = sb->st_rdev; + nsb->st_atim = sb->st_atim; + nsb->st_mtim = sb->st_mtim; + nsb->st_ctim = sb->st_ctim; + nsb->st_size = sb->st_size; + nsb->st_blocks = sb->st_blocks; + nsb->st_blksize = sb->st_blksize; + nsb->st_flags = sb->st_flags; + nsb->st_gen = sb->st_gen; + nsb->st_birthtim = sb->st_birthtim; +} + +#ifndef _SYS_SYSPROTO_H_ +struct freebsd11_nstat_args { + char *path; + struct nstat *ub; +}; +#endif +int +freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap) +{ + struct stat sb; + struct nstat nsb; + int error; + + error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, + &sb, NULL); + if (error != 0) + return (error); + freebsd11_cvtnstat(&sb, &nsb); + return (copyout(&nsb, uap->ub, sizeof (nsb))); +} + +/* + * NetBSD lstat. Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct freebsd11_nlstat_args { + char *path; + struct nstat *ub; +}; +#endif +int +freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap) +{ + struct stat sb; + struct nstat nsb; + int error; + + error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, + UIO_USERSPACE, &sb, NULL); + if (error != 0) + return (error); + freebsd11_cvtnstat(&sb, &nsb); + return (copyout(&nsb, uap->ub, sizeof (nsb))); +} +#endif /* COMPAT_FREEBSD11 */ + +/* + * Get configurable pathname variables. + */ +#ifndef _SYS_SYSPROTO_H_ +struct pathconf_args { + char *path; + int name; +}; +#endif +int +sys_pathconf(struct thread *td, struct pathconf_args *uap) +{ + long value; + int error; + + error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW, + &value); + if (error == 0) + td->td_retval[0] = value; + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct lpathconf_args { + char *path; + int name; +}; +#endif +int +sys_lpathconf(struct thread *td, struct lpathconf_args *uap) +{ + long value; + int error; + + error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, + NOFOLLOW, &value); + if (error == 0) + td->td_retval[0] = value; + return (error); +} + +int +kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name, + u_long flags, long *valuep) +{ + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags, + pathseg, path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + + error = VOP_PATHCONF(nd.ni_vp, name, valuep); + vput(nd.ni_vp); + return (error); +} + +/* + * Return target name of a symbolic link. 
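+ *
+ * The lookup is done with NOFOLLOW so the link itself is inspected.
+ * A request larger than IOSIZE_MAX is rejected with EINVAL, a vnode
+ * that is neither a symlink nor marked VV_READLINK also returns
+ * EINVAL, and the number of bytes placed in the buffer is reported
+ * through td_retval[0] as count minus the remaining uio_resid.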
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct readlink_args { + char *path; + char *buf; + size_t count; +}; +#endif +int +sys_readlink(struct thread *td, struct readlink_args *uap) +{ + + return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->buf, UIO_USERSPACE, uap->count)); +} +#ifndef _SYS_SYSPROTO_H_ +struct readlinkat_args { + int fd; + char *path; + char *buf; + size_t bufsize; +}; +#endif +int +sys_readlinkat(struct thread *td, struct readlinkat_args *uap) +{ + + return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE, + uap->buf, UIO_USERSPACE, uap->bufsize)); +} + +int +kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + char *buf, enum uio_seg bufseg, size_t count) +{ + struct vnode *vp; + struct nameidata nd; + int error; + + if (count > IOSIZE_MAX) + return (EINVAL); + + NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, + pathseg, path, fd, td); + + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + + error = kern_readlink_vp(vp, buf, bufseg, count, td); + vput(vp); + + return (error); +} + +/* + * Helper function to readlink from a vnode + */ +static int +kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count, + struct thread *td) +{ + struct iovec aiov; + struct uio auio; + int error; + + ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked"); +#ifdef MAC + error = mac_vnode_check_readlink(td->td_ucred, vp); + if (error != 0) + return (error); +#endif + if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0) + return (EINVAL); + + aiov.iov_base = buf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = bufseg; + auio.uio_td = td; + auio.uio_resid = count; + error = VOP_READLINK(vp, &auio, td->td_ucred); + td->td_retval[0] = count - auio.uio_resid; + return (error); +} + +/* + * Common implementation code for chflags() and fchflags(). + */ +static int +setfflags(struct thread *td, struct vnode *vp, u_long flags) +{ + struct mount *mp; + struct vattr vattr; + int error; + + /* We can't support the value matching VNOVAL. */ + if (flags == VNOVAL) + return (EOPNOTSUPP); + + /* + * Prevent non-root users from setting flags on devices. When + * a device is reused, users can retain ownership of the device + * if they are allowed to set flags and programs assume that + * chown can't fail when done as root. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) { + error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); + if (error != 0) + return (error); + } + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + VATTR_NULL(&vattr); + vattr.va_flags = flags; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); +#ifdef MAC + error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags); + if (error == 0) +#endif + error = VOP_SETATTR(vp, &vattr, td->td_ucred); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +/* + * Change flags of a file given a path name. 
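+ *
+ * chflags(), chflagsat() and lchflags() are thin wrappers around
+ * kern_chflagsat(); they differ only in the directory descriptor
+ * (AT_FDCWD for the non-*at forms) and in whether AT_SYMLINK_NOFOLLOW
+ * is set, which selects NOFOLLOW instead of FOLLOW for the lookup.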
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + const char *path; + u_long flags; +}; +#endif +int +sys_chflags(struct thread *td, struct chflags_args *uap) +{ + + return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->flags, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct chflagsat_args { + int fd; + const char *path; + u_long flags; + int atflag; +} +#endif +int +sys_chflagsat(struct thread *td, struct chflagsat_args *uap) +{ + int fd = uap->fd; + const char *path = uap->path; + u_long flags = uap->flags; + int atflag = uap->atflag; + + if (atflag & ~AT_SYMLINK_NOFOLLOW) + return (EINVAL); + + return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag)); +} + +/* + * Same as chflags() but doesn't follow symlinks. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchflags_args { + const char *path; + u_long flags; +}; +#endif +int +sys_lchflags(struct thread *td, struct lchflags_args *uap) +{ + + return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->flags, AT_SYMLINK_NOFOLLOW)); +} + +static int +kern_chflagsat(struct thread *td, int fd, const char *path, + enum uio_seg pathseg, u_long flags, int atflag) +{ + struct nameidata nd; + int error, follow; + + AUDIT_ARG_FFLAGS(flags); + follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW; + NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd, + &cap_fchflags_rights, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfflags(td, nd.ni_vp, flags); + vrele(nd.ni_vp); + return (error); +} + +/* + * Change flags of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + u_long flags; +}; +#endif +int +sys_fchflags(struct thread *td, struct fchflags_args *uap) +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_FFLAGS(uap->flags); + error = getvnode(td, uap->fd, &cap_fchflags_rights, + &fp); + if (error != 0) + return (error); +#ifdef AUDIT + vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(fp->f_vnode); + VOP_UNLOCK(fp->f_vnode, 0); +#endif + error = setfflags(td, fp->f_vnode, uap->flags); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation code for chmod(), lchmod() and fchmod(). + */ +int +setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode) +{ + struct mount *mp; + struct vattr vattr; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + VATTR_NULL(&vattr); + vattr.va_mode = mode & ALLPERMS; +#ifdef MAC + error = mac_vnode_check_setmode(cred, vp, vattr.va_mode); + if (error == 0) +#endif + error = VOP_SETATTR(vp, &vattr, cred); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +/* + * Change mode of a file given path name. 
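+ *
+ * As with the chflags() family above, chmod(), fchmodat() and lchmod()
+ * all funnel into kern_fchmodat(); setfmode() masks the requested mode
+ * with ALLPERMS before handing it to VOP_SETATTR().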
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif +int +sys_chmod(struct thread *td, struct chmod_args *uap) +{ + + return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->mode, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct fchmodat_args { + int dirfd; + char *path; + mode_t mode; + int flag; +} +#endif +int +sys_fchmodat(struct thread *td, struct fchmodat_args *uap) +{ + int flag = uap->flag; + int fd = uap->fd; + char *path = uap->path; + mode_t mode = uap->mode; + + if (flag & ~AT_SYMLINK_NOFOLLOW) + return (EINVAL); + + return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag)); +} + +/* + * Change mode of a file given path name (don't follow links.) + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchmod_args { + char *path; + int mode; +}; +#endif +int +sys_lchmod(struct thread *td, struct lchmod_args *uap) +{ + + return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->mode, AT_SYMLINK_NOFOLLOW)); +} + +int +kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + mode_t mode, int flag) +{ + struct nameidata nd; + int error, follow; + + AUDIT_ARG_MODE(mode); + follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW; + NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd, + &cap_fchmod_rights, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfmode(td, td->td_ucred, nd.ni_vp, mode); + vrele(nd.ni_vp); + return (error); +} + +/* + * Change mode of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif +int +sys_fchmod(struct thread *td, struct fchmod_args *uap) +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_MODE(uap->mode); + + error = fget(td, uap->fd, &cap_fchmod_rights, &fp); + if (error != 0) + return (error); + error = fo_chmod(fp, uap->mode, td->td_ucred, td); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation for chown(), lchown(), and fchown() + */ +int +setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, + gid_t gid) +{ + struct mount *mp; + struct vattr vattr; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + VATTR_NULL(&vattr); + vattr.va_uid = uid; + vattr.va_gid = gid; +#ifdef MAC + error = mac_vnode_check_setowner(cred, vp, vattr.va_uid, + vattr.va_gid); + if (error == 0) +#endif + error = VOP_SETATTR(vp, &vattr, cred); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +/* + * Set ownership given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif +int +sys_chown(struct thread *td, struct chown_args *uap) +{ + + return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, + uap->gid, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct fchownat_args { + int fd; + const char * path; + uid_t uid; + gid_t gid; + int flag; +}; +#endif +int +sys_fchownat(struct thread *td, struct fchownat_args *uap) +{ + int flag; + + flag = uap->flag; + if (flag & ~AT_SYMLINK_NOFOLLOW) + return (EINVAL); + + return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid, + uap->gid, uap->flag)); +} + +int +kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + int uid, int gid, int flag) +{ + struct nameidata nd; + int error, follow; + + AUDIT_ARG_OWNER(uid, gid); + follow = (flag & AT_SYMLINK_NOFOLLOW) ? 
NOFOLLOW : FOLLOW; + NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd, + &cap_fchown_rights, td); + + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a path name, do not cross symlinks. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchown_args { + char *path; + int uid; + int gid; +}; +#endif +int +sys_lchown(struct thread *td, struct lchown_args *uap) +{ + + return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW)); +} + +/* + * Set ownership given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchown_args { + int fd; + int uid; + int gid; +}; +#endif +int +sys_fchown(struct thread *td, struct fchown_args *uap) +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(uap->fd); + AUDIT_ARG_OWNER(uap->uid, uap->gid); + error = fget(td, uap->fd, &cap_fchown_rights, &fp); + if (error != 0) + return (error); + error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td); + fdrop(fp, td); + return (error); +} + +/* + * Common implementation code for utimes(), lutimes(), and futimes(). + */ +static int +getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg, + struct timespec *tsp) +{ + struct timeval tv[2]; + const struct timeval *tvp; + int error; + + if (usrtvp == NULL) { + vfs_timestamp(&tsp[0]); + tsp[1] = tsp[0]; + } else { + if (tvpseg == UIO_SYSSPACE) { + tvp = usrtvp; + } else { + if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) + return (error); + tvp = tv; + } + + if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 || + tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000) + return (EINVAL); + TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); + TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); + } + return (0); +} + +/* + * Common implementation code for futimens(), utimensat(). + */ +#define UTIMENS_NULL 0x1 +#define UTIMENS_EXIT 0x2 +static int +getutimens(const struct timespec *usrtsp, enum uio_seg tspseg, + struct timespec *tsp, int *retflags) +{ + struct timespec tsnow; + int error; + + vfs_timestamp(&tsnow); + *retflags = 0; + if (usrtsp == NULL) { + tsp[0] = tsnow; + tsp[1] = tsnow; + *retflags |= UTIMENS_NULL; + return (0); + } + if (tspseg == UIO_SYSSPACE) { + tsp[0] = usrtsp[0]; + tsp[1] = usrtsp[1]; + } else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0) + return (error); + if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT) + *retflags |= UTIMENS_EXIT; + if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW) + *retflags |= UTIMENS_NULL; + if (tsp[0].tv_nsec == UTIME_OMIT) + tsp[0].tv_sec = VNOVAL; + else if (tsp[0].tv_nsec == UTIME_NOW) + tsp[0] = tsnow; + else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L) + return (EINVAL); + if (tsp[1].tv_nsec == UTIME_OMIT) + tsp[1].tv_sec = VNOVAL; + else if (tsp[1].tv_nsec == UTIME_NOW) + tsp[1] = tsnow; + else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L) + return (EINVAL); + + return (0); +} + +/* + * Common implementation code for utimes(), lutimes(), futimes(), futimens(), + * and utimensat(). 
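+ *
+ * setutimes() takes ts[0] as the access time and ts[1] as the
+ * modification time.  A third timespec (numtimes > 2) sets the birth
+ * time explicitly; otherwise the birth time is pulled back whenever
+ * the new modification time predates it.  The nullflag argument
+ * records the "times == NULL" case by setting VA_UTIMES_NULL in the
+ * vattr passed to VOP_SETATTR().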
+ */ +static int +setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts, + int numtimes, int nullflag) +{ + struct mount *mp; + struct vattr vattr; + int error, setbirthtime; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + setbirthtime = 0; + if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) && + timespeccmp(&ts[1], &vattr.va_birthtime, < )) + setbirthtime = 1; + VATTR_NULL(&vattr); + vattr.va_atime = ts[0]; + vattr.va_mtime = ts[1]; + if (setbirthtime) + vattr.va_birthtime = ts[1]; + if (numtimes > 2) + vattr.va_birthtime = ts[2]; + if (nullflag) + vattr.va_vaflags |= VA_UTIMES_NULL; +#ifdef MAC + error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime, + vattr.va_mtime); +#endif + if (error == 0) + error = VOP_SETATTR(vp, &vattr, td->td_ucred); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +#endif +int +sys_utimes(struct thread *td, struct utimes_args *uap) +{ + + return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->tptr, UIO_USERSPACE)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct futimesat_args { + int fd; + const char * path; + const struct timeval * times; +}; +#endif +int +sys_futimesat(struct thread *td, struct futimesat_args *uap) +{ + + return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE, + uap->times, UIO_USERSPACE)); +} + +int +kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + struct timeval *tptr, enum uio_seg tptrseg) +{ + struct nameidata nd; + struct timespec ts[2]; + int error; + + if ((error = getutimes(tptr, tptrseg, ts)) != 0) + return (error); + NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, + &cap_futimes_rights, td); + + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lutimes_args { + char *path; + struct timeval *tptr; +}; +#endif +int +sys_lutimes(struct thread *td, struct lutimes_args *uap) +{ + + return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, + UIO_USERSPACE)); +} + +int +kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg, + struct timeval *tptr, enum uio_seg tptrseg) +{ + struct timespec ts[2]; + struct nameidata nd; + int error; + + if ((error = getutimes(tptr, tptrseg, ts)) != 0) + return (error); + NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td); + if ((error = namei(&nd)) != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. 
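+ *
+ * kern_futimes() below uses the older timeval interface via
+ * getutimes(), while kern_futimens() goes through getutimens(), which
+ * understands UTIME_NOW and UTIME_OMIT and turns the call into a no-op
+ * (UTIMENS_EXIT) when both tv_nsec fields are UTIME_OMIT.  For
+ * example, tv_nsec values of { UTIME_OMIT, UTIME_NOW } leave the
+ * access time alone and set the modification time to "now".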
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct futimes_args { + int fd; + struct timeval *tptr; +}; +#endif +int +sys_futimes(struct thread *td, struct futimes_args *uap) +{ + + return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); +} + +int +kern_futimes(struct thread *td, int fd, struct timeval *tptr, + enum uio_seg tptrseg) +{ + struct timespec ts[2]; + struct file *fp; + int error; + + AUDIT_ARG_FD(fd); + error = getutimes(tptr, tptrseg, ts); + if (error != 0) + return (error); + error = getvnode(td, fd, &cap_futimes_rights, &fp); + if (error != 0) + return (error); +#ifdef AUDIT + vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(fp->f_vnode); + VOP_UNLOCK(fp->f_vnode, 0); +#endif + error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); + fdrop(fp, td); + return (error); +} + +int +sys_futimens(struct thread *td, struct futimens_args *uap) +{ + + return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE)); +} + +int +kern_futimens(struct thread *td, int fd, struct timespec *tptr, + enum uio_seg tptrseg) +{ + struct timespec ts[2]; + struct file *fp; + int error, flags; + + AUDIT_ARG_FD(fd); + error = getutimens(tptr, tptrseg, ts, &flags); + if (error != 0) + return (error); + if (flags & UTIMENS_EXIT) + return (0); + error = getvnode(td, fd, &cap_futimes_rights, &fp); + if (error != 0) + return (error); +#ifdef AUDIT + vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(fp->f_vnode); + VOP_UNLOCK(fp->f_vnode, 0); +#endif + error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL); + fdrop(fp, td); + return (error); +} + +int +sys_utimensat(struct thread *td, struct utimensat_args *uap) +{ + + return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE, + uap->times, UIO_USERSPACE, uap->flag)); +} + +int +kern_utimensat(struct thread *td, int fd, char *path, enum uio_seg pathseg, + struct timespec *tptr, enum uio_seg tptrseg, int flag) +{ + struct nameidata nd; + struct timespec ts[2]; + int error, flags; + + if (flag & ~AT_SYMLINK_NOFOLLOW) + return (EINVAL); + + if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0) + return (error); + NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : + FOLLOW) | AUDITVNODE1, pathseg, path, fd, + &cap_futimes_rights, td); + if ((error = namei(&nd)) != 0) + return (error); + /* + * We are allowed to call namei() regardless of 2xUTIME_OMIT. + * POSIX states: + * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected." + * "Search permission is denied by a component of the path prefix." + */ + NDFREE(&nd, NDF_ONLY_PNBUF); + if ((flags & UTIMENS_EXIT) == 0) + error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL); + vrele(nd.ni_vp); + return (error); +} + +/* + * Truncate a file given its path name. 
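+ *
+ * kern_truncate() rejects a negative length with EINVAL, write-locks
+ * the whole byte range of the file before starting the write sequence,
+ * returns EISDIR for directories, and otherwise sets va_size through
+ * VOP_SETATTR() once vn_writechk() and a VWRITE access check succeed.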
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif +int +sys_truncate(struct thread *td, struct truncate_args *uap) +{ + + return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); +} + +int +kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length) +{ + struct mount *mp; + struct vnode *vp; + void *rl_cookie; + struct vattr vattr; + struct nameidata nd; + int error; + + if (length < 0) + return(EINVAL); + NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + vn_rangelock_unlock(vp, rl_cookie); + vrele(vp); + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (vp->v_type == VDIR) + error = EISDIR; +#ifdef MAC + else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) { + } +#endif + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = length; + error = VOP_SETATTR(vp, &vattr, td->td_ucred); + } + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + vn_rangelock_unlock(vp, rl_cookie); + vrele(vp); + return (error); +} + +#if defined(COMPAT_43) +/* + * Truncate a file given its path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif +int +otruncate(struct thread *td, struct otruncate_args *uap) +{ + + return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_FREEBSD6) +/* Versions with the pad argument */ +int +freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap) +{ + + return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); +} + +int +freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap) +{ + + return (kern_ftruncate(td, uap->fd, uap->length)); +} +#endif + +int +kern_fsync(struct thread *td, int fd, bool fullsync) +{ + struct vnode *vp; + struct mount *mp; + struct file *fp; + int error, lock_flags; + + AUDIT_ARG_FD(fd); + error = getvnode(td, fd, &cap_fsync_rights, &fp); + if (error != 0) + return (error); + vp = fp->f_vnode; +#if 0 + if (!fullsync) + /* XXXKIB: compete outstanding aio writes */; +#endif + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + goto drop; + if (MNT_SHARED_WRITES(mp) || + ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) { + lock_flags = LK_SHARED; + } else { + lock_flags = LK_EXCLUSIVE; + } + vn_lock(vp, lock_flags | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + if (vp->v_object != NULL) { + VM_OBJECT_WLOCK(vp->v_object); + vm_object_page_clean(vp->v_object, 0, 0, 0); + VM_OBJECT_WUNLOCK(vp->v_object); + } + error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); +drop: + fdrop(fp, td); + return (error); +} + +/* + * Sync an open file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +int +sys_fsync(struct thread *td, struct fsync_args *uap) +{ + + return (kern_fsync(td, uap->fd, true)); +} + +int +sys_fdatasync(struct thread *td, struct fdatasync_args *uap) +{ + + return (kern_fsync(td, uap->fd, false)); +} + +/* + * Rename files. Source and destination must either both be directories, or + * both not be directories. If target is a directory, it must be empty. 
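+ *
+ * kern_renameat() looks up both names and retries from scratch when
+ * the filesystem cannot be started for writing without sleeping.  It
+ * enforces the directory/non-directory pairing above (ENOTDIR or
+ * EISDIR) and, when the target already exists and was resolved
+ * relative to newfd, additionally requires CAP_UNLINKAT on that
+ * descriptor.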
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif +int +sys_rename(struct thread *td, struct rename_args *uap) +{ + + return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD, + uap->to, UIO_USERSPACE)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct renameat_args { + int oldfd; + char *old; + int newfd; + char *new; +}; +#endif +int +sys_renameat(struct thread *td, struct renameat_args *uap) +{ + + return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new, + UIO_USERSPACE)); +} + +int +kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new, + enum uio_seg pathseg) +{ + struct mount *mp = NULL; + struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + +again: + bwillwrite(); +#ifdef MAC + NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | + AUDITVNODE1, pathseg, old, oldfd, + &cap_renameat_source_rights, td); +#else + NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1, + pathseg, old, oldfd, + &cap_renameat_source_rights, td); +#endif + + if ((error = namei(&fromnd)) != 0) + return (error); +#ifdef MAC + error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp, + fromnd.ni_vp, &fromnd.ni_cnd); + VOP_UNLOCK(fromnd.ni_dvp, 0); + if (fromnd.ni_dvp != fromnd.ni_vp) + VOP_UNLOCK(fromnd.ni_vp, 0); +#endif + fvp = fromnd.ni_vp; + NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | + SAVESTART | AUDITVNODE2, pathseg, new, newfd, + &cap_renameat_target_rights, td); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + if ((error = namei(&tond)) != 0) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + NDFREE(&fromnd, NDF_ONLY_PNBUF); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + error = vn_start_write(fvp, &mp, V_NOWAIT); + if (error != 0) { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + if (tvp != NULL) + vput(tvp); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + vrele(fromnd.ni_dvp); + vrele(fvp); + vrele(tond.ni_startdir); + if (fromnd.ni_startdir != NULL) + vrele(fromnd.ni_startdir); + error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); + if (error != 0) + return (error); + goto again; + } + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } +#ifdef CAPABILITIES + if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) { + /* + * If the target already exists we require CAP_UNLINKAT + * from 'newfd', when newfd was used for the lookup. + */ + error = cap_check(&tond.ni_filecaps.fc_rights, + &cap_unlinkat_rights); + if (error != 0) + goto out; + } +#endif + } + if (fvp == tdvp) { + error = EINVAL; + goto out; + } + /* + * If the source is the same as the destination (that is, if they + * are links to the same vnode), then there is nothing to do. 
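+ *
+ * The error value of -1 assigned below is a sentinel rather than an
+ * errno: it skips the VOP_RENAME() call and is translated back to 0
+ * after the out1 label.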
+ */ + if (fvp == tvp) + error = -1; +#ifdef MAC + else + error = mac_vnode_check_rename_to(td->td_ucred, tdvp, + tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); +#endif +out: + if (error == 0) { + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + } else { + NDFREE(&fromnd, NDF_ONLY_PNBUF); + NDFREE(&tond, NDF_ONLY_PNBUF); + if (tvp != NULL) + vput(tvp); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + vn_finished_write(mp); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif +int +sys_mkdir(struct thread *td, struct mkdir_args *uap) +{ + + return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE, + uap->mode)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct mkdirat_args { + int fd; + char *path; + mode_t mode; +}; +#endif +int +sys_mkdirat(struct thread *td, struct mkdirat_args *uap) +{ + + return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); +} + +int +kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg, + int mode) +{ + struct mount *mp; + struct vnode *vp; + struct vattr vattr; + struct nameidata nd; + int error; + + AUDIT_ARG_MODE(mode); +restart: + bwillwrite(); + NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | + NOCACHE, segflg, path, fd, &cap_mkdirat_rights, + td); + nd.ni_cnd.cn_flags |= WILLBEDIR; + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + NDFREE(&nd, NDF_ONLY_PNBUF); + /* + * XXX namei called with LOCKPARENT but not LOCKLEAF has + * the strange behaviour of leaving the vnode unlocked + * if the target is the same vnode as the parent. + */ + if (vp == nd.ni_dvp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EEXIST); + } + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; +#ifdef MAC + error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, + &vattr); + if (error != 0) + goto out; +#endif + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); +#ifdef MAC +out: +#endif + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if (error == 0) + vput(nd.ni_vp); + vn_finished_write(mp); + return (error); +} + +/* + * Remove a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif +int +sys_rmdir(struct thread *td, struct rmdir_args *uap) +{ + + return (kern_rmdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE)); +} + +int +kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg) +{ + struct mount *mp; + struct vnode *vp; + struct nameidata nd; + int error; + +restart: + bwillwrite(); + NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1, + pathseg, path, fd, &cap_unlinkat_rights, td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. 
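+ *
+ * A path whose final component is "." comes back from namei() with
+ * ni_dvp equal to ni_vp, which is what the check below catches.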
+ */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_vflag & VV_ROOT) { + error = EBUSY; + goto out; + } +#ifdef MAC + error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, + &nd.ni_cnd); + if (error != 0) + goto out; +#endif + if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + vn_finished_write(mp); +out: + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(vp); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) +int +freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count, + long *basep, void (*func)(struct freebsd11_dirent *)) +{ + struct freebsd11_dirent dstdp; + struct dirent *dp, *edp; + char *dirbuf; + off_t base; + ssize_t resid, ucount; + int error; + + /* XXX arbitrary sanity limit on `count'. */ + count = min(count, 64 * 1024); + + dirbuf = malloc(count, M_TEMP, M_WAITOK); + + error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid, + UIO_SYSSPACE); + if (error != 0) + goto done; + if (basep != NULL) + *basep = base; + + ucount = 0; + for (dp = (struct dirent *)dirbuf, + edp = (struct dirent *)&dirbuf[count - resid]; + ucount < count && dp < edp; ) { + if (dp->d_reclen == 0) + break; + MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0)); + if (dp->d_namlen >= sizeof(dstdp.d_name)) + continue; + dstdp.d_type = dp->d_type; + dstdp.d_namlen = dp->d_namlen; + dstdp.d_fileno = dp->d_fileno; /* truncate */ + if (dstdp.d_fileno != dp->d_fileno) { + switch (ino64_trunc_error) { + default: + case 0: + break; + case 1: + error = EOVERFLOW; + goto done; + case 2: + dstdp.d_fileno = UINT32_MAX; + break; + } + } + dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) + + ((dp->d_namlen + 1 + 3) &~ 3); + bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); + bzero(dstdp.d_name + dstdp.d_namlen, + dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) - + dstdp.d_namlen); + MPASS(dstdp.d_reclen <= dp->d_reclen); + MPASS(ucount + dstdp.d_reclen <= count); + if (func != NULL) + func(&dstdp); + error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen); + if (error != 0) + break; + dp = (struct dirent *)((char *)dp + dp->d_reclen); + ucount += dstdp.d_reclen; + } + +done: + free(dirbuf, M_TEMP); + if (error == 0) + td->td_retval[0] = ucount; + return (error); +} +#endif /* COMPAT */ + +#ifdef COMPAT_43 +static void +ogetdirentries_cvt(struct freebsd11_dirent *dp) +{ +#if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of dp->d_namlen is our dp->d_type. + * The high MBZ byte of dp->d_namlen is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +#else + /* + * The dp->d_type is the high byte of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +#endif +} + +/* + * Read a block of directory entries in a filesystem independent format. 
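+ *
+ * The historic ogetdirentries() is layered on the FreeBSD 11 compat
+ * path: freebsd11_kern_getdirentries() converts the 64-bit dirents
+ * into the old 32-bit layout, and ogetdirentries_cvt() then adjusts
+ * d_type/d_namlen as described above for old COMPAT_43 consumers.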
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +ogetdirentries(struct thread *td, struct ogetdirentries_args *uap) +{ + long loff; + int error; + + error = kern_ogetdirentries(td, uap, &loff); + if (error == 0) + error = copyout(&loff, uap->basep, sizeof(long)); + return (error); +} + +int +kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap, + long *ploff) +{ + long base; + int error; + + /* XXX arbitrary sanity limit on `count'. */ + if (uap->count > 64 * 1024) + return (EINVAL); + + error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, + &base, ogetdirentries_cvt); + + if (error == 0 && uap->basep != NULL) + error = copyout(&base, uap->basep, sizeof(long)); + + return (error); +} +#endif /* COMPAT_43 */ + +#if defined(COMPAT_FREEBSD11) +#ifndef _SYS_SYSPROTO_H_ +struct freebsd11_getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +freebsd11_getdirentries(struct thread *td, + struct freebsd11_getdirentries_args *uap) +{ + long base; + int error; + + error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, + &base, NULL); + + if (error == 0 && uap->basep != NULL) + error = copyout(&base, uap->basep, sizeof(long)); + return (error); +} + +int +freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap) +{ + struct freebsd11_getdirentries_args ap; + + ap.fd = uap->fd; + ap.buf = uap->buf; + ap.count = uap->count; + ap.basep = NULL; + return (freebsd11_getdirentries(td, &ap)); +} +#endif /* COMPAT_FREEBSD11 */ + +/* + * Read a block of directory entries in a filesystem independent format. + */ +int +sys_getdirentries(struct thread *td, struct getdirentries_args *uap) +{ + off_t base; + int error; + + error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, + NULL, UIO_USERSPACE); + if (error != 0) + return (error); + if (uap->basep != NULL) + error = copyout(&base, uap->basep, sizeof(off_t)); + return (error); +} + +int +kern_getdirentries(struct thread *td, int fd, char *buf, size_t count, + off_t *basep, ssize_t *residp, enum uio_seg bufseg) +{ + struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + off_t loff; + int error, eofflag; + off_t foffset; + + AUDIT_ARG_FD(fd); + if (count > IOSIZE_MAX) + return (EINVAL); + auio.uio_resid = count; + error = getvnode(td, fd, &cap_read_rights, &fp); + if (error != 0) + return (error); + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); + return (EBADF); + } + vp = fp->f_vnode; + foffset = foffset_lock(fp, 0); +unionread: + if (vp->v_type != VDIR) { + error = EINVAL; + goto fail; + } + aiov.iov_base = buf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = bufseg; + auio.uio_td = td; + vn_lock(vp, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + loff = auio.uio_offset = foffset; +#ifdef MAC + error = mac_vnode_check_readdir(td->td_ucred, vp); + if (error == 0) +#endif + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, + NULL); + foffset = auio.uio_offset; + if (error != 0) { + VOP_UNLOCK(vp, 0); + goto fail; + } + if (count == auio.uio_resid && + (vp->v_vflag & VV_ROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_vnode = vp; + fp->f_data = vp; + foffset = 0; + vput(tvp); + goto unionread; + } + VOP_UNLOCK(vp, 0); + *basep = loff; + if (residp != NULL) + *residp = 
auio.uio_resid; + td->td_retval[0] = count - auio.uio_resid; +fail: + foffset_unlock(fp, foffset, 0); + fdrop(fp, td); + return (error); +} + +/* + * Set the mode mask for creation of filesystem nodes. + */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif +int +sys_umask(struct thread *td, struct umask_args *uap) +{ + struct filedesc *fdp; + + fdp = td->td_proc->p_fd; + FILEDESC_XLOCK(fdp); + td->td_retval[0] = fdp->fd_cmask; + fdp->fd_cmask = uap->newmask & ALLPERMS; + FILEDESC_XUNLOCK(fdp); + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem away from + * vnode. + */ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif +int +sys_revoke(struct thread *td, struct revoke_args *uap) +{ + struct vnode *vp; + struct vattr vattr; + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, + uap->path, td); + if ((error = namei(&nd)) != 0) + return (error); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (vp->v_type != VCHR || vp->v_rdev == NULL) { + error = EINVAL; + goto out; + } +#ifdef MAC + error = mac_vnode_check_revoke(td->td_ucred, vp); + if (error != 0) + goto out; +#endif + error = VOP_GETATTR(vp, &vattr, td->td_ucred); + if (error != 0) + goto out; + if (td->td_ucred->cr_uid != vattr.va_uid) { + error = priv_check(td, PRIV_VFS_ADMIN); + if (error != 0) + goto out; + } + if (vcount(vp) > 1) + VOP_REVOKE(vp, REVOKEALL); +out: + vput(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry and check that, if it + * is a capability, the correct rights are present. A reference on the file + * entry is held upon returning. + */ +int +getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) +{ + struct file *fp; + int error; + + error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL); + if (error != 0) + return (error); + + /* + * The file could be not of the vnode type, or it may be not + * yet fully initialized, in which case the f_vnode pointer + * may be set, but f_ops is still badfileops. E.g., + * devfs_open() transiently create such situation to + * facilitate csw d_fdopen(). + * + * Dupfdopen() handling in kern_openat() installs the + * half-baked file into the process descriptor table, allowing + * other thread to dereference it. Guard against the race by + * checking f_ops. + */ + if (fp->f_vnode == NULL || fp->f_ops == &badfileops) { + fdrop(fp, td); + return (EINVAL); + } + *fpp = fp; + return (0); +} + + +/* + * Get an (NFS) file handle. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lgetfh_args { + char *fname; + fhandle_t *fhp; +}; +#endif +int +sys_lgetfh(struct thread *td, struct lgetfh_args *uap) +{ + + return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname, + UIO_USERSPACE, uap->fhp)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getfh_args { + char *fname; + fhandle_t *fhp; +}; +#endif +int +sys_getfh(struct thread *td, struct getfh_args *uap) +{ + + return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE, + uap->fhp)); +} + +/* + * syscall for the rpc.lockd to use to translate an open descriptor into + * a NFS file handle. + * + * warning: do not remove the priv_check() call or this becomes one giant + * security hole. 
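+ *
+ * getfh(), lgetfh() and getfhat() all end up in kern_getfhat(), which
+ * insists on PRIV_VFS_GETFH and builds the handle from the mount's
+ * f_fsid plus the fid returned by VOP_VPTOFH().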
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getfhat_args { + int fd; + char *path; + fhandle_t *fhp; + int flags; +}; +#endif +int +sys_getfhat(struct thread *td, struct getfhat_args *uap) +{ + + if ((uap->flags & ~(AT_SYMLINK_NOFOLLOW)) != 0) + return (EINVAL); + return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE, + uap->fhp)); +} + +static int +kern_getfhat(struct thread *td, int flags, int fd, const char *path, + enum uio_seg pathseg, fhandle_t *fhp) +{ + struct nameidata nd; + fhandle_t fh; + struct vnode *vp; + int error; + + error = priv_check(td, PRIV_VFS_GETFH); + if (error != 0) + return (error); + NDINIT_AT(&nd, LOOKUP, ((flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW : + FOLLOW) | /*((flags & AT_BENEATH) != 0 ? BENEATH : 0) |*/ LOCKLEAF | + AUDITVNODE1, pathseg, path, fd, td); + error = namei(&nd); + if (error != 0) + return (error); + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + bzero(&fh, sizeof(fh)); + fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; + error = VOP_VPTOFH(vp, &fh.fh_fid); + vput(vp); + if (error == 0) + error = copyout(&fh, fhp, sizeof (fh)); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct fhlink_args { + fhandle_t *fhp; + const char *to; +}; +#endif +int +sys_fhlink(struct thread *td, struct fhlink_args *uap) +{ + + return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct fhlinkat_args { + fhandle_t *fhp; + int tofd; + const char *to; +}; +#endif +int +sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap) +{ + + return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp)); +} + +static int +kern_fhlinkat(struct thread *td, int fd, const char *path, + enum uio_seg pathseg, fhandle_t *fhp) +{ + fhandle_t fh; + struct mount *mp; + struct vnode *vp; + int error; + + error = priv_check(td, PRIV_VFS_GETFH); + if (error != 0) + return (error); + error = copyin(fhp, &fh, sizeof(fh)); + if (error != 0) + return (error); + do { + bwillwrite(); + if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); + vfs_unbusy(mp); + if (error != 0) + return (error); + VOP_UNLOCK(vp, 0); + } while ((error = kern_linkat_vp(td, vp, fd, path, pathseg)) == EAGAIN); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct fhreadlink_args { + fhandle_t *fhp; + char *buf; + size_t bufsize; +}; +#endif +int +sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap) +{ + fhandle_t fh; + struct mount *mp; + struct vnode *vp; + int error; + + error = priv_check(td, PRIV_VFS_GETFH); + if (error != 0) + return (error); + if (uap->bufsize > IOSIZE_MAX) + return (EINVAL); + error = copyin(uap->fhp, &fh, sizeof(fh)); + if (error != 0) + return (error); + if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); + vfs_unbusy(mp); + if (error != 0) + return (error); + error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td); + vput(vp); + return (error); +} + +/* + * syscall for the rpc.lockd to use to translate a NFS file handle into an + * open descriptor. + * + * warning: do not remove the priv_check() call or this becomes one giant + * security hole. 
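+ *
+ * fhopen() requires PRIV_VFS_FHOPEN, refuses O_CREAT as well as opens
+ * that ask for neither FREAD nor FWRITE, and otherwise resolves the
+ * handle with VFS_FHTOVP() before going through the usual
+ * vn_open_vnode()/finstall() path.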
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fhopen_args { + const struct fhandle *u_fhp; + int flags; +}; +#endif +int +sys_fhopen(struct thread *td, struct fhopen_args *uap) +{ + struct mount *mp; + struct vnode *vp; + struct fhandle fhp; + struct file *fp; + int fmode, error; + int indx; + + error = priv_check(td, PRIV_VFS_FHOPEN); + if (error != 0) + return (error); + indx = -1; + fmode = FFLAGS(uap->flags); + /* why not allow a non-read/write open for our lockd? */ + if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) + return (EINVAL); + error = copyin(uap->u_fhp, &fhp, sizeof(fhp)); + if (error != 0) + return(error); + /* find the mount point */ + mp = vfs_busyfs(&fhp.fh_fsid); + if (mp == NULL) + return (ESTALE); + /* now give me my vnode, it gets returned to me locked */ + error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp); + vfs_unbusy(mp); + if (error != 0) + return (error); + + error = falloc_noinstall(td, &fp); + if (error != 0) { + vput(vp); + return (error); + } + /* + * An extra reference on `fp' has been held for us by + * falloc_noinstall(). + */ + +#ifdef INVARIANTS + td->td_dupfd = -1; +#endif + error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp); + if (error != 0) { + KASSERT(fp->f_ops == &badfileops, + ("VOP_OPEN in fhopen() set f_ops")); + KASSERT(td->td_dupfd < 0, + ("fhopen() encountered fdopen()")); + + vput(vp); + goto bad; + } +#ifdef INVARIANTS + td->td_dupfd = 0; +#endif + fp->f_vnode = vp; + fp->f_seqcount = 1; + finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp, + &vnops); + VOP_UNLOCK(vp, 0); + if ((fmode & O_TRUNC) != 0) { + error = fo_truncate(fp, 0, td->td_ucred, td); + if (error != 0) + goto bad; + } + + error = finstall(td, fp, &indx, fmode, NULL); +bad: + fdrop(fp, td); + td->td_retval[0] = indx; + return (error); +} + +/* + * Stat an (NFS) file handle. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fhstat_args { + struct fhandle *u_fhp; + struct stat *sb; +}; +#endif +int +sys_fhstat(struct thread *td, struct fhstat_args *uap) +{ + struct stat sb; + struct fhandle fh; + int error; + + error = copyin(uap->u_fhp, &fh, sizeof(fh)); + if (error != 0) + return (error); + error = kern_fhstat(td, fh, &sb); + if (error == 0) + error = copyout(&sb, uap->sb, sizeof(sb)); + return (error); +} + +int +kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb) +{ + struct mount *mp; + struct vnode *vp; + int error; + + error = priv_check(td, PRIV_VFS_FHSTAT); + if (error != 0) + return (error); + if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); + vfs_unbusy(mp); + if (error != 0) + return (error); + error = vn_stat(vp, sb, td->td_ucred, NOCRED, td); + vput(vp); + return (error); +} + +/* + * Implement fstatfs() for (NFS) file handles. 
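+ *
+ * kern_fhstatfs() busies the filesystem named by the handle, applies
+ * the prison and MAC visibility checks, pre-seeds f_version, f_namemax
+ * and f_flags, and then lets VFS_STATFS() fill in the rest.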
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fhstatfs_args { + struct fhandle *u_fhp; + struct statfs *buf; +}; +#endif +int +sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap) +{ + struct statfs *sfp; + fhandle_t fh; + int error; + + error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); + if (error != 0) + return (error); + sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); + error = kern_fhstatfs(td, fh, sfp); + if (error == 0) + error = copyout(sfp, uap->buf, sizeof(*sfp)); + free(sfp, M_STATFS); + return (error); +} + +int +kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf) +{ + struct statfs *sp; + struct mount *mp; + struct vnode *vp; + int error; + + error = priv_check(td, PRIV_VFS_FHSTATFS); + if (error != 0) + return (error); + if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) + return (ESTALE); + error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); + if (error != 0) { + vfs_unbusy(mp); + return (error); + } + vput(vp); + error = prison_canseemount(td->td_ucred, mp); + if (error != 0) + goto out; +#ifdef MAC + error = mac_mount_check_stat(td->td_ucred, mp); + if (error != 0) + goto out; +#endif + /* + * Set these in case the underlying filesystem fails to do so. + */ + sp = &mp->mnt_stat; + sp->f_version = STATFS_VERSION; + sp->f_namemax = NAME_MAX; + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = VFS_STATFS(mp, sp); + if (error == 0) + *buf = *sp; +out: + vfs_unbusy(mp); + return (error); +} + +int +kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len) +{ + struct file *fp; + struct mount *mp; + struct vnode *vp; + off_t olen, ooffset; + int error; +#ifdef AUDIT + int audited_vnode1 = 0; +#endif + + AUDIT_ARG_FD(fd); + if (offset < 0 || len <= 0) + return (EINVAL); + /* Check for wrap. */ + if (offset > OFF_MAX - len) + return (EFBIG); + AUDIT_ARG_FD(fd); + error = fget(td, fd, &cap_pwrite_rights, &fp); + if (error != 0) + return (error); + AUDIT_ARG_FILE(td->td_proc, fp); + if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { + error = ESPIPE; + goto out; + } + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + goto out; + } + if (fp->f_type != DTYPE_VNODE) { + error = ENODEV; + goto out; + } + vp = fp->f_vnode; + if (vp->v_type != VREG) { + error = ENODEV; + goto out; + } + + /* Allocating blocks may take a long time, so iterate. */ + for (;;) { + olen = len; + ooffset = offset; + + bwillwrite(); + mp = NULL; + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error != 0) + break; + error = vn_lock(vp, LK_EXCLUSIVE); + if (error != 0) { + vn_finished_write(mp); + break; + } +#ifdef AUDIT + if (!audited_vnode1) { + AUDIT_ARG_VNODE1(vp); + audited_vnode1 = 1; + } +#endif +#ifdef MAC + error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); + if (error == 0) +#endif + error = VOP_ALLOCATE(vp, &offset, &len); + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); + + if (olen + ooffset != offset + len) { + panic("offset + len changed from %jx/%jx to %jx/%jx", + ooffset, olen, offset, len); + } + if (error != 0 || len == 0) + break; + KASSERT(olen > len, ("Iteration did not make progress?")); + maybe_yield(); + } + out: + fdrop(fp, td); + return (error); +} + +int +sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap) +{ + int error; + + error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len); + return (kern_posix_error(td, error)); +} + +/* + * Unlike madvise(2), we do not make a best effort to remember every + * possible caching hint. 
Instead, we remember the last setting with + * the exception that we will allow POSIX_FADV_NORMAL to adjust the + * region of any current setting. + */ +int +kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len, + int advice) +{ + struct fadvise_info *fa, *new; + struct file *fp; + struct vnode *vp; + off_t end; + int error; + + if (offset < 0 || len < 0 || offset > OFF_MAX - len) + return (EINVAL); + AUDIT_ARG_VALUE(advice); + switch (advice) { + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_NOREUSE: + new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK); + break; + case POSIX_FADV_NORMAL: + case POSIX_FADV_WILLNEED: + case POSIX_FADV_DONTNEED: + new = NULL; + break; + default: + return (EINVAL); + } + /* XXX: CAP_POSIX_FADVISE? */ + AUDIT_ARG_FD(fd); + error = fget(td, fd, &cap_no_rights, &fp); + if (error != 0) + goto out; + AUDIT_ARG_FILE(td->td_proc, fp); + if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { + error = ESPIPE; + goto out; + } + if (fp->f_type != DTYPE_VNODE) { + error = ENODEV; + goto out; + } + vp = fp->f_vnode; + if (vp->v_type != VREG) { + error = ENODEV; + goto out; + } + if (len == 0) + end = OFF_MAX; + else + end = offset + len - 1; + switch (advice) { + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_NOREUSE: + /* + * Try to merge any existing non-standard region with + * this new region if possible, otherwise create a new + * non-standard region for this request. + */ + mtx_pool_lock(mtxpool_sleep, fp); + fa = fp->f_advice; + if (fa != NULL && fa->fa_advice == advice && + ((fa->fa_start <= end && fa->fa_end >= offset) || + (end != OFF_MAX && fa->fa_start == end + 1) || + (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) { + if (offset < fa->fa_start) + fa->fa_start = offset; + if (end > fa->fa_end) + fa->fa_end = end; + } else { + new->fa_advice = advice; + new->fa_start = offset; + new->fa_end = end; + fp->f_advice = new; + new = fa; + } + mtx_pool_unlock(mtxpool_sleep, fp); + break; + case POSIX_FADV_NORMAL: + /* + * If a the "normal" region overlaps with an existing + * non-standard region, trim or remove the + * non-standard region. + */ + mtx_pool_lock(mtxpool_sleep, fp); + fa = fp->f_advice; + if (fa != NULL) { + if (offset <= fa->fa_start && end >= fa->fa_end) { + new = fa; + fp->f_advice = NULL; + } else if (offset <= fa->fa_start && + end >= fa->fa_start) + fa->fa_start = end + 1; + else if (offset <= fa->fa_end && end >= fa->fa_end) + fa->fa_end = offset - 1; + else if (offset >= fa->fa_start && end <= fa->fa_end) { + /* + * If the "normal" region is a middle + * portion of the existing + * non-standard region, just remove + * the whole thing rather than picking + * one side or the other to + * preserve. 
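+ *
+ * Reassigning "new" hands the removed fadvise_info to the
+ * free(new, M_FADVISE) at the end of the function (no region
+ * was allocated for POSIX_FADV_NORMAL, so nothing is lost).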
+ */ + new = fa; + fp->f_advice = NULL; + } + } + mtx_pool_unlock(mtxpool_sleep, fp); + break; + case POSIX_FADV_WILLNEED: + case POSIX_FADV_DONTNEED: + error = VOP_ADVISE(vp, offset, end, advice); + break; + } +out: + if (fp != NULL) + fdrop(fp, td); + free(new, M_FADVISE); + return (error); +} + +int +sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) +{ + int error; + + error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, + uap->advice); + return (kern_posix_error(td, error)); +} diff --git a/freebsd/sys/kern/vfs_vnops.c b/freebsd/sys/kern/vfs_vnops.c new file mode 100644 index 00000000..bdd6692d --- /dev/null +++ b/freebsd/sys/kern/vfs_vnops.c @@ -0,0 +1,2607 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Copyright (c) 2012 Konstantin Belousov + * Copyright (c) 2013, 2014 The FreeBSD Foundation + * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_hwpmc_hooks.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef HWPMC_HOOKS +#include +#endif + +static fo_rdwr_t vn_read; +static fo_rdwr_t vn_write; +static fo_rdwr_t vn_io_fault; +static fo_truncate_t vn_truncate; +static fo_ioctl_t vn_ioctl; +static fo_poll_t vn_poll; +static fo_kqfilter_t vn_kqfilter; +static fo_stat_t vn_statfile; +static fo_close_t vn_closefile; +static fo_mmap_t vn_mmap; + +struct fileops vnops = { + .fo_read = vn_io_fault, + .fo_write = vn_io_fault, + .fo_truncate = vn_truncate, + .fo_ioctl = vn_ioctl, + .fo_poll = vn_poll, + .fo_kqfilter = vn_kqfilter, + .fo_stat = vn_statfile, + .fo_close = vn_closefile, + .fo_chmod = vn_chmod, + .fo_chown = vn_chown, + .fo_sendfile = vn_sendfile, + .fo_seek = vn_seek, + .fo_fill_kinfo = vn_fill_kinfo, + .fo_mmap = vn_mmap, + .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE +}; + +static const int io_hold_cnt = 16; +static int vn_io_fault_enable = 1; +SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW, + &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance"); +static int vn_io_fault_prefault = 0; +SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW, + &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting"); +static u_long vn_io_faults_cnt; +SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD, + &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers"); + +/* + * Returns true if vn_io_fault mode of handling the i/o request should + * be used. + */ +static bool +do_vn_io_fault(struct vnode *vp, struct uio *uio) +{ + struct mount *mp; + + return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG && + (mp = vp->v_mount) != NULL && + (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable); +} + +/* + * Structure used to pass arguments to vn_io_fault1(), to do either + * file- or vnode-based I/O calls. + */ +struct vn_io_fault_args { + enum { + VN_IO_FAULT_FOP, + VN_IO_FAULT_VOP + } kind; + struct ucred *cred; + int flags; + union { + struct fop_args_tag { + struct file *fp; + fo_rdwr_t *doio; + } fop_args; + struct vop_args_tag { + struct vnode *vp; + } vop_args; + } args; +}; + +static int vn_io_fault1(struct vnode *vp, struct uio *uio, + struct vn_io_fault_args *args, struct thread *td); + +int +vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp) +{ + struct thread *td = ndp->ni_cnd.cn_thread; + + return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp)); +} + +/* + * Common code for vnode open operations via a name lookup. + * Lookup the vnode and invoke VOP_CREATE if needed. + * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. + * + * Note that this does NOT free nameidata for the successful case, + * due to the NDINIT being done elsewhere. 
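+ *
+ * On success the (possibly modified) open mode is passed back through
+ * *flagp, for instance with O_TRUNC cleared after a fresh create or
+ * O_CREAT cleared when the file already existed, and the locked vnode
+ * is left in ndp->ni_vp for the caller.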
+ */ +int +vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, + struct ucred *cred, struct file *fp) +{ + struct vnode *vp; + struct mount *mp; + struct thread *td = ndp->ni_cnd.cn_thread; + struct vattr vat; + struct vattr *vap = &vat; + int fmode, error; + +restart: + fmode = *flagp; + if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT | + O_EXCL | O_DIRECTORY)) + return (EINVAL); + else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) { + ndp->ni_cnd.cn_nameiop = CREATE; + /* + * Set NOCACHE to avoid flushing the cache when + * rolling in many files at once. + */ + ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE; + if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) + ndp->ni_cnd.cn_flags |= FOLLOW; + if (!(vn_open_flags & VN_OPEN_NOAUDIT)) + ndp->ni_cnd.cn_flags |= AUDITVNODE1; + if (vn_open_flags & VN_OPEN_NOCAPCHECK) + ndp->ni_cnd.cn_flags |= NOCAPCHECK; + if ((vn_open_flags & VN_OPEN_INVFS) == 0) + bwillwrite(); + if ((error = namei(ndp)) != 0) + return (error); + if (ndp->ni_vp == NULL) { + VATTR_NULL(vap); + vap->va_type = VREG; + vap->va_mode = cmode; + if (fmode & O_EXCL) + vap->va_vaflags |= VA_EXCLUSIVE; + if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { + NDFREE(ndp, NDF_ONLY_PNBUF); + vput(ndp->ni_dvp); + if ((error = vn_start_write(NULL, &mp, + V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) + ndp->ni_cnd.cn_flags |= MAKEENTRY; +#ifdef MAC + error = mac_vnode_check_create(cred, ndp->ni_dvp, + &ndp->ni_cnd, vap); + if (error == 0) +#endif + error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, + &ndp->ni_cnd, vap); + vput(ndp->ni_dvp); + vn_finished_write(mp); + if (error) { + NDFREE(ndp, NDF_ONLY_PNBUF); + return (error); + } + fmode &= ~O_TRUNC; + vp = ndp->ni_vp; + } else { + if (ndp->ni_dvp == ndp->ni_vp) + vrele(ndp->ni_dvp); + else + vput(ndp->ni_dvp); + ndp->ni_dvp = NULL; + vp = ndp->ni_vp; + if (fmode & O_EXCL) { + error = EEXIST; + goto bad; + } + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } + fmode &= ~O_CREAT; + } + } else { + ndp->ni_cnd.cn_nameiop = LOOKUP; + ndp->ni_cnd.cn_flags = ISOPEN | + ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF; + if (!(fmode & FWRITE)) + ndp->ni_cnd.cn_flags |= LOCKSHARED; + if (!(vn_open_flags & VN_OPEN_NOAUDIT)) + ndp->ni_cnd.cn_flags |= AUDITVNODE1; + if (vn_open_flags & VN_OPEN_NOCAPCHECK) + ndp->ni_cnd.cn_flags |= NOCAPCHECK; + if ((error = namei(ndp)) != 0) + return (error); + vp = ndp->ni_vp; + } + error = vn_open_vnode(vp, fmode, cred, td, fp); + if (error) + goto bad; + *flagp = fmode; + return (0); +bad: + NDFREE(ndp, NDF_ONLY_PNBUF); + vput(vp); + *flagp = fmode; + ndp->ni_vp = NULL; + return (error); +} + +static int +vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp) +{ + struct flock lf; + int error, lock_flags, type; + + ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock"); + if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0) + return (0); + KASSERT(fp != NULL, ("open with flock requires fp")); + if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) + return (EOPNOTSUPP); + + lock_flags = VOP_ISLOCKED(vp); + VOP_UNLOCK(vp, 0); + + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = (fmode & O_EXLOCK) != 0 ? 
F_WRLCK : F_RDLCK; + type = F_FLOCK; + if ((fmode & FNONBLOCK) == 0) + type |= F_WAIT; + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type); + if (error == 0) + fp->f_flag |= FHASLOCK; + + vn_lock(vp, lock_flags | LK_RETRY); + if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) + error = ENOENT; + return (error); +} + +/* + * Common code for vnode open operations once a vnode is located. + * Check permissions, and call the VOP_OPEN routine. + */ +int +vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, + struct thread *td, struct file *fp) +{ + accmode_t accmode; + int error; + + if (vp->v_type == VLNK) + return (EMLINK); + if (vp->v_type == VSOCK) + return (EOPNOTSUPP); + if (vp->v_type != VDIR && fmode & O_DIRECTORY) + return (ENOTDIR); + accmode = 0; + if (fmode & (FWRITE | O_TRUNC)) { + if (vp->v_type == VDIR) + return (EISDIR); + accmode |= VWRITE; + } + if (fmode & FREAD) + accmode |= VREAD; + if (fmode & FEXEC) + accmode |= VEXEC; + if ((fmode & O_APPEND) && (fmode & FWRITE)) + accmode |= VAPPEND; +#ifdef MAC + if (fmode & O_CREAT) + accmode |= VCREAT; + if (fmode & O_VERIFY) + accmode |= VVERIFY; + error = mac_vnode_check_open(cred, vp, accmode); + if (error) + return (error); + + accmode &= ~(VCREAT | VVERIFY); +#endif + if ((fmode & O_CREAT) == 0 && accmode != 0) { + error = VOP_ACCESS(vp, accmode, cred, td); + if (error != 0) + return (error); + } + if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) + vn_lock(vp, LK_UPGRADE | LK_RETRY); + error = VOP_OPEN(vp, fmode, cred, td, fp); + if (error != 0) + return (error); + + error = vn_open_vnode_advlock(vp, fmode, fp); + if (error == 0 && (fmode & FWRITE) != 0) { + error = VOP_ADD_WRITECOUNT(vp, 1); + if (error == 0) { + CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", + __func__, vp, vp->v_writecount); + } + } + + /* + * Error from advlock or VOP_ADD_WRITECOUNT() still requires + * calling VOP_CLOSE() to pair with earlier VOP_OPEN(). + * Arrange for that by having fdrop() to use vn_closefile(). + */ + if (error != 0) { + fp->f_flag |= FOPENFAILED; + fp->f_vnode = vp; + if (fp->f_ops == &badfileops) { + fp->f_type = DTYPE_VNODE; + fp->f_ops = &vnops; + } + vref(vp); + } + + ASSERT_VOP_LOCKED(vp, "vn_open_vnode"); + return (error); + +} + +/* + * Check for write permissions on the specified vnode. + * Prototype text segments cannot be written. + * It is racy. + */ +int +vn_writechk(struct vnode *vp) +{ + + ASSERT_VOP_LOCKED(vp, "vn_writechk"); + /* + * If there's shared text associated with + * the vnode, try to free it up once. If + * we fail, we can't allow writing. 
+ */ + if (VOP_IS_TEXT(vp)) + return (ETXTBSY); + + return (0); +} + +/* + * Vnode close call + */ +static int +vn_close1(struct vnode *vp, int flags, struct ucred *file_cred, + struct thread *td, bool keep_ref) +{ + struct mount *mp; + int error, lock_flags; + + if (vp->v_type != VFIFO && (flags & FWRITE) == 0 && + MNT_EXTENDED_SHARED(vp->v_mount)) + lock_flags = LK_SHARED; + else + lock_flags = LK_EXCLUSIVE; + + vn_start_write(vp, &mp, V_WAIT); + vn_lock(vp, lock_flags | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) { + VOP_ADD_WRITECOUNT_CHECKED(vp, -1); + CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", + __func__, vp, vp->v_writecount); + } + error = VOP_CLOSE(vp, flags, file_cred, td); + if (keep_ref) + VOP_UNLOCK(vp, 0); + else + vput(vp); + vn_finished_write(mp); + return (error); +} + +int +vn_close(struct vnode *vp, int flags, struct ucred *file_cred, + struct thread *td) +{ + + return (vn_close1(vp, flags, file_cred, td, false)); +} + +/* + * Heuristic to detect sequential operation. + */ +static int +sequential_heuristic(struct uio *uio, struct file *fp) +{ + + ASSERT_VOP_LOCKED(fp->f_vnode, __func__); + if (fp->f_flag & FRDAHEAD) + return (fp->f_seqcount << IO_SEQSHIFT); + + /* + * Offset 0 is handled specially. open() sets f_seqcount to 1 so + * that the first I/O is normally considered to be slightly + * sequential. Seeking to offset 0 doesn't change sequentiality + * unless previous seeks have reduced f_seqcount to 0, in which + * case offset 0 is not special. + */ + if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || + uio->uio_offset == fp->f_nextoff) { + /* + * f_seqcount is in units of fixed-size blocks so that it + * depends mainly on the amount of sequential I/O and not + * much on the number of sequential I/O's. The fixed size + * of 16384 is hard-coded here since it is (not quite) just + * a magic size that works well here. This size is more + * closely related to the best I/O size for real disks than + * to any block size used by software. + */ + if (uio->uio_resid >= IO_SEQMAX * 16384) + fp->f_seqcount = IO_SEQMAX; + else { + fp->f_seqcount += howmany(uio->uio_resid, 16384); + if (fp->f_seqcount > IO_SEQMAX) + fp->f_seqcount = IO_SEQMAX; + } + return (fp->f_seqcount << IO_SEQSHIFT); + } + + /* Not sequential. Quickly draw-down sequentiality. */ + if (fp->f_seqcount > 1) + fp->f_seqcount = 1; + else + fp->f_seqcount = 0; + return (0); +} + +/* + * Package up an I/O request on a vnode into a uio and do it. 
+ */ +int +vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, + enum uio_seg segflg, int ioflg, struct ucred *active_cred, + struct ucred *file_cred, ssize_t *aresid, struct thread *td) +{ + struct uio auio; + struct iovec aiov; + struct mount *mp; + struct ucred *cred; + void *rl_cookie; + struct vn_io_fault_args args; + int error, lock_flags; + + if (offset < 0 && vp->v_type != VCHR) + return (EINVAL); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_td = td; + error = 0; + + if ((ioflg & IO_NODELOCKED) == 0) { + if ((ioflg & IO_RANGELOCKED) == 0) { + if (rw == UIO_READ) { + rl_cookie = vn_rangelock_rlock(vp, offset, + offset + len); + } else { + rl_cookie = vn_rangelock_wlock(vp, offset, + offset + len); + } + } else + rl_cookie = NULL; + mp = NULL; + if (rw == UIO_WRITE) { + if (vp->v_type != VCHR && + (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) + != 0) + goto out; + if (MNT_SHARED_WRITES(mp) || + ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) + lock_flags = LK_SHARED; + else + lock_flags = LK_EXCLUSIVE; + } else + lock_flags = LK_SHARED; + vn_lock(vp, lock_flags | LK_RETRY); + } else + rl_cookie = NULL; + + ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); +#ifdef MAC + if ((ioflg & IO_NOMACCHECK) == 0) { + if (rw == UIO_READ) + error = mac_vnode_check_read(active_cred, file_cred, + vp); + else + error = mac_vnode_check_write(active_cred, file_cred, + vp); + } +#endif + if (error == 0) { + if (file_cred != NULL) + cred = file_cred; + else + cred = active_cred; + if (do_vn_io_fault(vp, &auio)) { + args.kind = VN_IO_FAULT_VOP; + args.cred = cred; + args.flags = ioflg; + args.args.vop_args.vp = vp; + error = vn_io_fault1(vp, &auio, &args, td); + } else if (rw == UIO_READ) { + error = VOP_READ(vp, &auio, ioflg, cred); + } else /* if (rw == UIO_WRITE) */ { + error = VOP_WRITE(vp, &auio, ioflg, cred); + } + } + if (aresid) + *aresid = auio.uio_resid; + else + if (auio.uio_resid && error == 0) + error = EIO; + if ((ioflg & IO_NODELOCKED) == 0) { + VOP_UNLOCK(vp, 0); + if (mp != NULL) + vn_finished_write(mp); + } + out: + if (rl_cookie != NULL) + vn_rangelock_unlock(vp, rl_cookie); + return (error); +} + +/* + * Package up an I/O request on a vnode into a uio and do it. The I/O + * request is split up into smaller chunks and we try to avoid saturating + * the buffer cache while potentially holding a vnode locked, so we + * check bwillwrite() before calling vn_rdwr(). We also call kern_yield() + * to give other processes a chance to lock the vnode (either other processes + * core'ing the same binary, or unrelated processes scanning the directory). + */ +int +vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, + off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, + struct ucred *file_cred, size_t *aresid, struct thread *td) +{ + int error = 0; + ssize_t iaresid; + + do { + int chunk; + + /* + * Force `offset' to a multiple of MAXBSIZE except possibly + * for the first chunk, so that filesystems only need to + * write full blocks except possibly for the first and last + * chunks. 
+ */ + chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; + + if (chunk > len) + chunk = len; + if (rw != UIO_READ && vp->v_type == VREG) + bwillwrite(); + iaresid = 0; + error = vn_rdwr(rw, vp, base, chunk, offset, segflg, + ioflg, active_cred, file_cred, &iaresid, td); + len -= chunk; /* aresid calc already includes length */ + if (error) + break; + offset += chunk; + base = (char *)base + chunk; + kern_yield(PRI_USER); + } while (len); + if (aresid) + *aresid = len + iaresid; + return (error); +} + +off_t +foffset_lock(struct file *fp, int flags) +{ + struct mtx *mtxp; + off_t res; + + KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); + +#if OFF_MAX <= LONG_MAX + /* + * Caller only wants the current f_offset value. Assume that + * the long and shorter integer types reads are atomic. + */ + if ((flags & FOF_NOLOCK) != 0) + return (fp->f_offset); +#endif + + /* + * According to McKusick the vn lock was protecting f_offset here. + * It is now protected by the FOFFSET_LOCKED flag. + */ + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + if ((flags & FOF_NOLOCK) == 0) { + while (fp->f_vnread_flags & FOFFSET_LOCKED) { + fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; + msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + "vofflock", 0); + } + fp->f_vnread_flags |= FOFFSET_LOCKED; + } + res = fp->f_offset; + mtx_unlock(mtxp); + return (res); +} + +void +foffset_unlock(struct file *fp, off_t val, int flags) +{ + struct mtx *mtxp; + + KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); + +#if OFF_MAX <= LONG_MAX + if ((flags & FOF_NOLOCK) != 0) { + if ((flags & FOF_NOUPDATE) == 0) + fp->f_offset = val; + if ((flags & FOF_NEXTOFF) != 0) + fp->f_nextoff = val; + return; + } +#endif + + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + if ((flags & FOF_NOUPDATE) == 0) + fp->f_offset = val; + if ((flags & FOF_NEXTOFF) != 0) + fp->f_nextoff = val; + if ((flags & FOF_NOLOCK) == 0) { + KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0, + ("Lost FOFFSET_LOCKED")); + if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) + wakeup(&fp->f_vnread_flags); + fp->f_vnread_flags = 0; + } + mtx_unlock(mtxp); +} + +void +foffset_lock_uio(struct file *fp, struct uio *uio, int flags) +{ + + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = foffset_lock(fp, flags); +} + +void +foffset_unlock_uio(struct file *fp, struct uio *uio, int flags) +{ + + if ((flags & FOF_OFFSET) == 0) + foffset_unlock(fp, uio->uio_offset, flags); +} + +static int +get_advice(struct file *fp, struct uio *uio) +{ + struct mtx *mtxp; + int ret; + + ret = POSIX_FADV_NORMAL; + if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG) + return (ret); + + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); + if (fp->f_advice != NULL && + uio->uio_offset >= fp->f_advice->fa_start && + uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) + ret = fp->f_advice->fa_advice; + mtx_unlock(mtxp); + return (ret); +} + +/* + * File table vnode read routine. 
+ */ +static int +vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) +{ + struct vnode *vp; + off_t orig_offset; + int error, ioflag; + int advice; + + KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", + uio->uio_td, td)); + KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); + vp = fp->f_vnode; + ioflag = 0; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; + advice = get_advice(fp, uio); + vn_lock(vp, LK_SHARED | LK_RETRY); + + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_NOREUSE: + ioflag |= sequential_heuristic(uio, fp); + break; + case POSIX_FADV_RANDOM: + /* Disable read-ahead for random I/O. */ + break; + } + orig_offset = uio->uio_offset; + +#ifdef MAC + error = mac_vnode_check_read(active_cred, fp->f_cred, vp); + if (error == 0) +#endif + error = VOP_READ(vp, uio, ioflag, fp->f_cred); + fp->f_nextoff = uio->uio_offset; + VOP_UNLOCK(vp, 0); + if (error == 0 && advice == POSIX_FADV_NOREUSE && + orig_offset != uio->uio_offset) + /* + * Use POSIX_FADV_DONTNEED to flush pages and buffers + * for the backing file after a POSIX_FADV_NOREUSE + * read(2). + */ + error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, + POSIX_FADV_DONTNEED); + return (error); +} + +/* + * File table vnode write routine. + */ +static int +vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) +{ + struct vnode *vp; + struct mount *mp; + off_t orig_offset; + int error, ioflag, lock_flags; + int advice; + + KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", + uio->uio_td, td)); + KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); + vp = fp->f_vnode; + if (vp->v_type == VREG) + bwillwrite(); + ioflag = IO_UNIT; + if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) + ioflag |= IO_APPEND; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if (fp->f_flag & O_DIRECT) + ioflag |= IO_DIRECT; + if ((fp->f_flag & O_FSYNC) || + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) + ioflag |= IO_SYNC; + mp = NULL; + if (vp->v_type != VCHR && + (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto unlock; + + advice = get_advice(fp, uio); + + if (MNT_SHARED_WRITES(mp) || + (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) { + lock_flags = LK_SHARED; + } else { + lock_flags = LK_EXCLUSIVE; + } + + vn_lock(vp, lock_flags | LK_RETRY); + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_NOREUSE: + ioflag |= sequential_heuristic(uio, fp); + break; + case POSIX_FADV_RANDOM: + /* XXX: Is this correct? */ + break; + } + orig_offset = uio->uio_offset; + +#ifdef MAC + error = mac_vnode_check_write(active_cred, fp->f_cred, vp); + if (error == 0) +#endif + error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); + fp->f_nextoff = uio->uio_offset; + VOP_UNLOCK(vp, 0); + if (vp->v_type != VCHR) + vn_finished_write(mp); + if (error == 0 && advice == POSIX_FADV_NOREUSE && + orig_offset != uio->uio_offset) + /* + * Use POSIX_FADV_DONTNEED to flush pages and buffers + * for the backing file after a POSIX_FADV_NOREUSE + * write(2). + */ + error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, + POSIX_FADV_DONTNEED); +unlock: + return (error); +} + +/* + * The vn_io_fault() is a wrapper around vn_read() and vn_write() to + * prevent the following deadlock: + * + * Assume that the thread A reads from the vnode vp1 into userspace + * buffer buf1 backed by the pages of vnode vp2. 
If a page in buf1 is + * currently not resident, then system ends up with the call chain + * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] -> + * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2) + * which establishes lock order vp1->vn_lock, then vp2->vn_lock. + * If, at the same time, thread B reads from vnode vp2 into buffer buf2 + * backed by the pages of vnode vp1, and some page in buf2 is not + * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock. + * + * To prevent the lock order reversal and deadlock, vn_io_fault() does + * not allow page faults to happen during VOP_READ() or VOP_WRITE(). + * Instead, it first tries to do the whole range i/o with pagefaults + * disabled. If all pages in the i/o buffer are resident and mapped, + * VOP will succeed (ignoring the genuine filesystem errors). + * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do + * i/o in chunks, with all pages in the chunk prefaulted and held + * using vm_fault_quick_hold_pages(). + * + * Filesystems using this deadlock avoidance scheme should use the + * array of the held pages from uio, saved in the curthread->td_ma, + * instead of doing uiomove(). A helper function + * vn_io_fault_uiomove() converts uiomove request into + * uiomove_fromphys() over td_ma array. + * + * Since vnode locks do not cover the whole i/o anymore, rangelocks + * make the current i/o request atomic with respect to other i/os and + * truncations. + */ + +/* + * Decode vn_io_fault_args and perform the corresponding i/o. + */ +static int +vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio, + struct thread *td) +{ + int error, save; + + error = 0; + save = vm_fault_disable_pagefaults(); + switch (args->kind) { + case VN_IO_FAULT_FOP: + error = (args->args.fop_args.doio)(args->args.fop_args.fp, + uio, args->cred, args->flags, td); + break; + case VN_IO_FAULT_VOP: + if (uio->uio_rw == UIO_READ) { + error = VOP_READ(args->args.vop_args.vp, uio, + args->flags, args->cred); + } else if (uio->uio_rw == UIO_WRITE) { + error = VOP_WRITE(args->args.vop_args.vp, uio, + args->flags, args->cred); + } + break; + default: + panic("vn_io_fault_doio: unknown kind of io %d %d", + args->kind, uio->uio_rw); + } + vm_fault_enable_pagefaults(save); + return (error); +} + +static int +vn_io_fault_touch(char *base, const struct uio *uio) +{ + int r; + + r = fubyte(base); + if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1)) + return (EFAULT); + return (0); +} + +static int +vn_io_fault_prefault_user(const struct uio *uio) +{ + char *base; + const struct iovec *iov; + size_t len; + ssize_t resid; + int error, i; + + KASSERT(uio->uio_segflg == UIO_USERSPACE, + ("vn_io_fault_prefault userspace")); + + error = i = 0; + iov = uio->uio_iov; + resid = uio->uio_resid; + base = iov->iov_base; + len = iov->iov_len; + while (resid > 0) { + error = vn_io_fault_touch(base, uio); + if (error != 0) + break; + if (len < PAGE_SIZE) { + if (len != 0) { + error = vn_io_fault_touch(base + len - 1, uio); + if (error != 0) + break; + resid -= len; + } + if (++i >= uio->uio_iovcnt) + break; + iov = uio->uio_iov + i; + base = iov->iov_base; + len = iov->iov_len; + } else { + len -= PAGE_SIZE; + base += PAGE_SIZE; + resid -= PAGE_SIZE; + } + } + return (error); +} + +/* + * Common code for vn_io_fault(), agnostic to the kind of i/o request. + * Uses vn_io_fault_doio() to make the call to an actual i/o function. 
+ * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request + * into args and call vn_io_fault1() to handle faults during the user + * mode buffer accesses. + */ +static int +vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, + struct thread *td) +{ + vm_page_t ma[io_hold_cnt + 2]; + struct uio *uio_clone, short_uio; + struct iovec short_iovec[1]; + vm_page_t *prev_td_ma; + vm_prot_t prot; + vm_offset_t addr, end; + size_t len, resid; + ssize_t adv; + int error, cnt, saveheld, prev_td_ma_cnt; + + if (vn_io_fault_prefault) { + error = vn_io_fault_prefault_user(uio); + if (error != 0) + return (error); /* Or ignore ? */ + } + + prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; + + /* + * The UFS follows IO_UNIT directive and replays back both + * uio_offset and uio_resid if an error is encountered during the + * operation. But, since the iovec may be already advanced, + * uio is still in an inconsistent state. + * + * Cache a copy of the original uio, which is advanced to the redo + * point using UIO_NOCOPY below. + */ + uio_clone = cloneuio(uio); + resid = uio->uio_resid; + + short_uio.uio_segflg = UIO_USERSPACE; + short_uio.uio_rw = uio->uio_rw; + short_uio.uio_td = uio->uio_td; + + error = vn_io_fault_doio(args, uio, td); + if (error != EFAULT) + goto out; + + atomic_add_long(&vn_io_faults_cnt, 1); + uio_clone->uio_segflg = UIO_NOCOPY; + uiomove(NULL, resid - uio->uio_resid, uio_clone); + uio_clone->uio_segflg = uio->uio_segflg; + + saveheld = curthread_pflags_set(TDP_UIOHELD); + prev_td_ma = td->td_ma; + prev_td_ma_cnt = td->td_ma_cnt; + + while (uio_clone->uio_resid != 0) { + len = uio_clone->uio_iov->iov_len; + if (len == 0) { + KASSERT(uio_clone->uio_iovcnt >= 1, + ("iovcnt underflow")); + uio_clone->uio_iov++; + uio_clone->uio_iovcnt--; + continue; + } + if (len > io_hold_cnt * PAGE_SIZE) + len = io_hold_cnt * PAGE_SIZE; + addr = (uintptr_t)uio_clone->uio_iov->iov_base; + end = round_page(addr + len); + if (end < addr) { + error = EFAULT; + break; + } + cnt = atop(end - trunc_page(addr)); + /* + * A perfectly misaligned address and length could cause + * both the start and the end of the chunk to use partial + * page. +2 accounts for such a situation. + */ + cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map, + addr, len, prot, ma, io_hold_cnt + 2); + if (cnt == -1) { + error = EFAULT; + break; + } + short_uio.uio_iov = &short_iovec[0]; + short_iovec[0].iov_base = (void *)addr; + short_uio.uio_iovcnt = 1; + short_uio.uio_resid = short_iovec[0].iov_len = len; + short_uio.uio_offset = uio_clone->uio_offset; + td->td_ma = ma; + td->td_ma_cnt = cnt; + + error = vn_io_fault_doio(args, &short_uio, td); + vm_page_unhold_pages(ma, cnt); + adv = len - short_uio.uio_resid; + + uio_clone->uio_iov->iov_base = + (char *)uio_clone->uio_iov->iov_base + adv; + uio_clone->uio_iov->iov_len -= adv; + uio_clone->uio_resid -= adv; + uio_clone->uio_offset += adv; + + uio->uio_resid -= adv; + uio->uio_offset += adv; + + if (error != 0 || adv == 0) + break; + } + td->td_ma = prev_td_ma; + td->td_ma_cnt = prev_td_ma_cnt; + curthread_pflags_restore(saveheld); +out: + free(uio_clone, M_IOV); + return (error); +} + +static int +vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + fo_rdwr_t *doio; + struct vnode *vp; + void *rl_cookie; + struct vn_io_fault_args args; + int error; + + doio = uio->uio_rw == UIO_READ ? 
vn_read : vn_write; + vp = fp->f_vnode; + foffset_lock_uio(fp, uio, flags); + if (do_vn_io_fault(vp, uio)) { + args.kind = VN_IO_FAULT_FOP; + args.args.fop_args.fp = fp; + args.args.fop_args.doio = doio; + args.cred = active_cred; + args.flags = flags | FOF_OFFSET; + if (uio->uio_rw == UIO_READ) { + rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, + uio->uio_offset + uio->uio_resid); + } else if ((fp->f_flag & O_APPEND) != 0 || + (flags & FOF_OFFSET) == 0) { + /* For appenders, punt and lock the whole range. */ + rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); + } else { + rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, + uio->uio_offset + uio->uio_resid); + } + error = vn_io_fault1(vp, uio, &args, td); + vn_rangelock_unlock(vp, rl_cookie); + } else { + error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td); + } + foffset_unlock_uio(fp, uio, flags); + return (error); +} + +/* + * Helper function to perform the requested uiomove operation using + * the held pages for io->uio_iov[0].iov_base buffer instead of + * copyin/copyout. Access to the pages with uiomove_fromphys() + * instead of iov_base prevents page faults that could occur due to + * pmap_collect() invalidating the mapping created by + * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or + * object cleanup revoking the write access from page mappings. + * + * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove() + * instead of plain uiomove(). + */ +int +vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) +{ + struct uio transp_uio; + struct iovec transp_iov[1]; + struct thread *td; + size_t adv; + int error, pgadv; + + td = curthread; + if ((td->td_pflags & TDP_UIOHELD) == 0 || + uio->uio_segflg != UIO_USERSPACE) + return (uiomove(data, xfersize, uio)); + + KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); + transp_iov[0].iov_base = data; + transp_uio.uio_iov = &transp_iov[0]; + transp_uio.uio_iovcnt = 1; + if (xfersize > uio->uio_resid) + xfersize = uio->uio_resid; + transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; + transp_uio.uio_offset = 0; + transp_uio.uio_segflg = UIO_SYSSPACE; + /* + * Since transp_iov points to data, and td_ma page array + * corresponds to original uio->uio_iov, we need to invert the + * direction of the i/o operation as passed to + * uiomove_fromphys(). + */ + switch (uio->uio_rw) { + case UIO_WRITE: + transp_uio.uio_rw = UIO_READ; + break; + case UIO_READ: + transp_uio.uio_rw = UIO_WRITE; + break; + } + transp_uio.uio_td = uio->uio_td; + error = uiomove_fromphys(td->td_ma, + ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, + xfersize, &transp_uio); + adv = xfersize - transp_uio.uio_resid; + pgadv = + (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - + (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); + td->td_ma += pgadv; + KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, + pgadv)); + td->td_ma_cnt -= pgadv; + uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; + uio->uio_iov->iov_len -= adv; + uio->uio_resid -= adv; + uio->uio_offset += adv; + return (error); +} + +int +vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, + struct uio *uio) +{ + struct thread *td; + vm_offset_t iov_base; + int cnt, pgadv; + + td = curthread; + if ((td->td_pflags & TDP_UIOHELD) == 0 || + uio->uio_segflg != UIO_USERSPACE) + return (uiomove_fromphys(ma, offset, xfersize, uio)); + + KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); + cnt = xfersize > uio->uio_resid ? 
uio->uio_resid : xfersize; + iov_base = (vm_offset_t)uio->uio_iov->iov_base; + switch (uio->uio_rw) { + case UIO_WRITE: + pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, + offset, cnt); + break; + case UIO_READ: + pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, + cnt); + break; + } + pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); + td->td_ma += pgadv; + KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, + pgadv)); + td->td_ma_cnt -= pgadv; + uio->uio_iov->iov_base = (char *)(iov_base + cnt); + uio->uio_iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + return (0); +} + + +/* + * File table truncate routine. + */ +static int +vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) +{ + struct vattr vattr; + struct mount *mp; + struct vnode *vp; + void *rl_cookie; + int error; + + vp = fp->f_vnode; + + /* + * Lock the whole range for truncation. Otherwise split i/o + * might happen partly before and partly after the truncation. + */ + rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); + error = vn_start_write(vp, &mp, V_WAIT | PCATCH); + if (error) + goto out1; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + if (vp->v_type == VDIR) { + error = EISDIR; + goto out; + } +#ifdef MAC + error = mac_vnode_check_write(active_cred, fp->f_cred, vp); + if (error) + goto out; +#endif + error = VOP_ADD_WRITECOUNT(vp, 1); + if (error == 0) { + VATTR_NULL(&vattr); + vattr.va_size = length; + if ((fp->f_flag & O_FSYNC) != 0) + vattr.va_vaflags |= VA_SYNC; + error = VOP_SETATTR(vp, &vattr, fp->f_cred); + VOP_ADD_WRITECOUNT_CHECKED(vp, -1); + } +out: + VOP_UNLOCK(vp, 0); + vn_finished_write(mp); +out1: + vn_rangelock_unlock(vp, rl_cookie); + return (error); +} + +/* + * File table vnode stat routine. + */ +static int +vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred, + struct thread *td) +{ + struct vnode *vp = fp->f_vnode; + int error; + + vn_lock(vp, LK_SHARED | LK_RETRY); + error = vn_stat(vp, sb, active_cred, fp->f_cred, td); + VOP_UNLOCK(vp, 0); + + return (error); +} + +/* + * Stat a vnode; implementation for the stat syscall + */ +int +vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, + struct ucred *file_cred, struct thread *td) +{ + struct vattr vattr; + struct vattr *vap; + int error; + u_short mode; + + AUDIT_ARG_VNODE1(vp); +#ifdef MAC + error = mac_vnode_check_stat(active_cred, file_cred, vp); + if (error) + return (error); +#endif + + vap = &vattr; + + /* + * Initialize defaults for new and unusual fields, so that file + * systems which don't support these fields don't need to know + * about them. 
+ */ + vap->va_birthtime.tv_sec = -1; + vap->va_birthtime.tv_nsec = 0; + vap->va_fsid = VNOVAL; + vap->va_rdev = NODEV; + + error = VOP_GETATTR(vp, vap, active_cred); + if (error) + return (error); + + /* + * Zero the spare stat fields + */ + bzero(sb, sizeof *sb); + + /* + * Copy from vattr table + */ + if (vap->va_fsid != VNOVAL) + sb->st_dev = vap->va_fsid; + else + sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; + sb->st_ino = vap->va_fileid; + mode = vap->va_mode; + switch (vap->va_type) { + case VREG: + mode |= S_IFREG; + break; + case VDIR: + mode |= S_IFDIR; + break; + case VBLK: + mode |= S_IFBLK; + break; + case VCHR: + mode |= S_IFCHR; + break; + case VLNK: + mode |= S_IFLNK; + break; + case VSOCK: + mode |= S_IFSOCK; + break; + case VFIFO: + mode |= S_IFIFO; + break; + default: + return (EBADF); + } + sb->st_mode = mode; + sb->st_nlink = vap->va_nlink; + sb->st_uid = vap->va_uid; + sb->st_gid = vap->va_gid; + sb->st_rdev = vap->va_rdev; + if (vap->va_size > OFF_MAX) + return (EOVERFLOW); + sb->st_size = vap->va_size; + sb->st_atim = vap->va_atime; + sb->st_mtim = vap->va_mtime; + sb->st_ctim = vap->va_ctime; + sb->st_birthtim = vap->va_birthtime; + + /* + * According to www.opengroup.org, the meaning of st_blksize is + * "a filesystem-specific preferred I/O block size for this + * object. In some filesystem types, this may vary from file + * to file" + * Use miminum/default of PAGE_SIZE (e.g. for VCHR). + */ + + sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize); + + sb->st_flags = vap->va_flags; + if (priv_check(td, PRIV_VFS_GENERATION)) + sb->st_gen = 0; + else + sb->st_gen = vap->va_gen; + + sb->st_blocks = vap->va_bytes / S_BLKSIZE; + return (0); +} + +/* + * File table vnode ioctl routine. + */ +static int +vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, + struct thread *td) +{ + struct vattr vattr; + struct vnode *vp; + struct fiobmap2_arg *bmarg; + int error; + + vp = fp->f_vnode; + switch (vp->v_type) { + case VDIR: + case VREG: + switch (com) { + case FIONREAD: + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &vattr, active_cred); + VOP_UNLOCK(vp, 0); + if (error == 0) + *(int *)data = vattr.va_size - fp->f_offset; + return (error); + case FIOBMAP2: + bmarg = (struct fiobmap2_arg *)data; + vn_lock(vp, LK_SHARED | LK_RETRY); +#ifdef MAC + error = mac_vnode_check_read(active_cred, fp->f_cred, + vp); + if (error == 0) +#endif + error = VOP_BMAP(vp, bmarg->bn, NULL, + &bmarg->bn, &bmarg->runp, &bmarg->runb); + VOP_UNLOCK(vp, 0); + return (error); + case FIONBIO: + case FIOASYNC: + return (0); + default: + return (VOP_IOCTL(vp, com, data, fp->f_flag, + active_cred, td)); + } + break; + case VCHR: + return (VOP_IOCTL(vp, com, data, fp->f_flag, + active_cred, td)); + default: + return (ENOTTY); + } +} + +/* + * File table vnode poll routine. + */ +static int +vn_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + struct vnode *vp; + int error; + + vp = fp->f_vnode; +#ifdef MAC + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); + VOP_UNLOCK(vp, 0); + if (!error) +#endif + + error = VOP_POLL(vp, events, fp->f_cred, td); + return (error); +} + +/* + * Acquire the requested lock and then check for validity. LK_RETRY + * permits vn_lock to return doomed vnodes. 
+ */ +int +_vn_lock(struct vnode *vp, int flags, char *file, int line) +{ + int error; + + VNASSERT((flags & LK_TYPE_MASK) != 0, vp, + ("vn_lock: no locktype")); + VNASSERT(vp->v_holdcnt != 0, vp, ("vn_lock: zero hold count")); +retry: + error = VOP_LOCK1(vp, flags, file, line); + flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */ + KASSERT((flags & LK_RETRY) == 0 || error == 0, + ("vn_lock: error %d incompatible with flags %#x", error, flags)); + + if ((flags & LK_RETRY) == 0) { + if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) { + VOP_UNLOCK(vp, 0); + error = ENOENT; + } + } else if (error != 0) + goto retry; + return (error); +} + +/* + * File table vnode close routine. + */ +static int +vn_closefile(struct file *fp, struct thread *td) +{ + struct vnode *vp; + struct flock lf; + int error; + bool ref; + + vp = fp->f_vnode; + fp->f_ops = &badfileops; + ref= (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE; + + error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref); + + if (__predict_false(ref)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); + vrele(vp); + } + return (error); +} + +static bool +vn_suspendable(struct mount *mp) +{ + + return (mp->mnt_op->vfs_susp_clean != NULL); +} + +/* + * Preparing to start a filesystem write operation. If the operation is + * permitted, then we bump the count of operations in progress and + * proceed. If a suspend request is in progress, we wait until the + * suspension is over, and then proceed. + */ +static int +vn_start_write_locked(struct mount *mp, int flags) +{ + int error, mflags; + + mtx_assert(MNT_MTX(mp), MA_OWNED); + error = 0; + + /* + * Check on status of suspension. + */ + if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || + mp->mnt_susp_owner != curthread) { + mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? + (flags & PCATCH) : 0) | (PUSER - 1); + while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { + if (flags & V_NOWAIT) { + error = EWOULDBLOCK; + goto unlock; + } + error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, + "suspfs", 0); + if (error) + goto unlock; + } + } + if (flags & V_XSLEEP) + goto unlock; + mp->mnt_writeopcount++; +unlock: + if (error != 0 || (flags & V_XSLEEP) != 0) + MNT_REL(mp); + MNT_IUNLOCK(mp); + return (error); +} + +int +vn_start_write(struct vnode *vp, struct mount **mpp, int flags) +{ + struct mount *mp; + int error; + + KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), + ("V_MNTREF requires mp")); + + error = 0; + /* + * If a vnode is provided, get and return the mount point that + * to which it will write. + */ + if (vp != NULL) { + if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { + *mpp = NULL; + if (error != EOPNOTSUPP) + return (error); + return (0); + } + } + if ((mp = *mpp) == NULL) + return (0); + + if (!vn_suspendable(mp)) { + if (vp != NULL || (flags & V_MNTREF) != 0) + vfs_rel(mp); + return (0); + } + + /* + * VOP_GETWRITEMOUNT() returns with the mp refcount held through + * a vfs_ref(). + * As long as a vnode is not provided we need to acquire a + * refcount for the provided mountpoint too, in order to + * emulate a vfs_ref(). + */ + MNT_ILOCK(mp); + if (vp == NULL && (flags & V_MNTREF) == 0) + MNT_REF(mp); + + return (vn_start_write_locked(mp, flags)); +} + +/* + * Secondary suspension. Used by operations such as vop_inactive + * routines that are needed by the higher level functions. 
These + * are allowed to proceed until all the higher level functions have + * completed (indicated by mnt_writeopcount dropping to zero). At that + * time, these operations are halted until the suspension is over. + */ +int +vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags) +{ + struct mount *mp; + int error; + + KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), + ("V_MNTREF requires mp")); + + retry: + if (vp != NULL) { + if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { + *mpp = NULL; + if (error != EOPNOTSUPP) + return (error); + return (0); + } + } + /* + * If we are not suspended or have not yet reached suspended + * mode, then let the operation proceed. + */ + if ((mp = *mpp) == NULL) + return (0); + + if (!vn_suspendable(mp)) { + if (vp != NULL || (flags & V_MNTREF) != 0) + vfs_rel(mp); + return (0); + } + + /* + * VOP_GETWRITEMOUNT() returns with the mp refcount held through + * a vfs_ref(). + * As long as a vnode is not provided we need to acquire a + * refcount for the provided mountpoint too, in order to + * emulate a vfs_ref(). + */ + MNT_ILOCK(mp); + if (vp == NULL && (flags & V_MNTREF) == 0) + MNT_REF(mp); + if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { + mp->mnt_secondary_writes++; + mp->mnt_secondary_accwrites++; + MNT_IUNLOCK(mp); + return (0); + } + if (flags & V_NOWAIT) { + MNT_REL(mp); + MNT_IUNLOCK(mp); + return (EWOULDBLOCK); + } + /* + * Wait for the suspension to finish. + */ + error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP | + ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0), + "suspfs", 0); + vfs_rel(mp); + if (error == 0) + goto retry; + return (error); +} + +/* + * Filesystem write operation has completed. If we are suspending and this + * operation is the last one, notify the suspender that the suspension is + * now in effect. + */ +void +vn_finished_write(struct mount *mp) +{ + if (mp == NULL || !vn_suspendable(mp)) + return; + MNT_ILOCK(mp); + MNT_REL(mp); + mp->mnt_writeopcount--; + if (mp->mnt_writeopcount < 0) + panic("vn_finished_write: neg cnt"); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && + mp->mnt_writeopcount <= 0) + wakeup(&mp->mnt_writeopcount); + MNT_IUNLOCK(mp); +} + + +/* + * Filesystem secondary write operation has completed. If we are + * suspending and this operation is the last one, notify the suspender + * that the suspension is now in effect. + */ +void +vn_finished_secondary_write(struct mount *mp) +{ + if (mp == NULL || !vn_suspendable(mp)) + return; + MNT_ILOCK(mp); + MNT_REL(mp); + mp->mnt_secondary_writes--; + if (mp->mnt_secondary_writes < 0) + panic("vn_finished_secondary_write: neg cnt"); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && + mp->mnt_secondary_writes <= 0) + wakeup(&mp->mnt_secondary_writes); + MNT_IUNLOCK(mp); +} + + + +/* + * Request a filesystem to suspend write operations. + */ +int +vfs_write_suspend(struct mount *mp, int flags) +{ + int error; + + MPASS(vn_suspendable(mp)); + + MNT_ILOCK(mp); + if (mp->mnt_susp_owner == curthread) { + MNT_IUNLOCK(mp); + return (EALREADY); + } + while (mp->mnt_kern_flag & MNTK_SUSPEND) + msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); + + /* + * Unmount holds a write reference on the mount point. If we + * own busy reference and drain for writers, we deadlock with + * the reference draining in the unmount path. Callers of + * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if + * vfs_busy() reference is owned and caller is not in the + * unmount context. 
+ */ + if ((flags & VS_SKIP_UNMOUNT) != 0 && + (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { + MNT_IUNLOCK(mp); + return (EBUSY); + } + + mp->mnt_kern_flag |= MNTK_SUSPEND; + mp->mnt_susp_owner = curthread; + if (mp->mnt_writeopcount > 0) + (void) msleep(&mp->mnt_writeopcount, + MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); + else + MNT_IUNLOCK(mp); + if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) + vfs_write_resume(mp, 0); + return (error); +} + +/* + * Request a filesystem to resume write operations. + */ +void +vfs_write_resume(struct mount *mp, int flags) +{ + + MPASS(vn_suspendable(mp)); + + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { + KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner")); + mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | + MNTK_SUSPENDED); + mp->mnt_susp_owner = NULL; + wakeup(&mp->mnt_writeopcount); + wakeup(&mp->mnt_flag); + curthread->td_pflags &= ~TDP_IGNSUSP; + if ((flags & VR_START_WRITE) != 0) { + MNT_REF(mp); + mp->mnt_writeopcount++; + } + MNT_IUNLOCK(mp); + if ((flags & VR_NO_SUSPCLR) == 0) + VFS_SUSP_CLEAN(mp); + } else if ((flags & VR_START_WRITE) != 0) { + MNT_REF(mp); + vn_start_write_locked(mp, 0); + } else { + MNT_IUNLOCK(mp); + } +} + +/* + * Helper loop around vfs_write_suspend() for filesystem unmount VFS + * methods. + */ +int +vfs_write_suspend_umnt(struct mount *mp) +{ + int error; + + MPASS(vn_suspendable(mp)); + KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0, + ("vfs_write_suspend_umnt: recursed")); + + /* dounmount() already called vn_start_write(). */ + for (;;) { + vn_finished_write(mp); + error = vfs_write_suspend(mp, 0); + if (error != 0) { + vn_start_write(NULL, &mp, V_WAIT); + return (error); + } + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0) + break; + MNT_IUNLOCK(mp); + vn_start_write(NULL, &mp, V_WAIT); + } + mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); + wakeup(&mp->mnt_flag); + MNT_IUNLOCK(mp); + curthread->td_pflags |= TDP_IGNSUSP; + return (0); +} + +/* + * Implement kqueues for files by translating it to vnode operation. + */ +static int +vn_kqfilter(struct file *fp, struct knote *kn) +{ + + return (VOP_KQFILTER(fp->f_vnode, kn)); +} + +/* + * Simplified in-kernel wrapper calls for extended attribute access. + * Both calls pass in a NULL credential, authorizing as "kernel" access. + * Set IO_NODELOCKED in ioflg if the vnode is already locked. + */ +int +vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, + const char *attrname, int *buflen, char *buf, struct thread *td) +{ + struct uio auio; + struct iovec iov; + int error; + + iov.iov_len = *buflen; + iov.iov_base = buf; + + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_offset = 0; + auio.uio_resid = *buflen; + + if ((ioflg & IO_NODELOCKED) == 0) + vn_lock(vp, LK_SHARED | LK_RETRY); + + ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); + + /* authorize attribute retrieval as kernel */ + error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, + td); + + if ((ioflg & IO_NODELOCKED) == 0) + VOP_UNLOCK(vp, 0); + + if (error == 0) { + *buflen = *buflen - auio.uio_resid; + } + + return (error); +} + +/* + * XXX failure mode if partially written? 
+ */ +int +vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, + const char *attrname, int buflen, char *buf, struct thread *td) +{ + struct uio auio; + struct iovec iov; + struct mount *mp; + int error; + + iov.iov_len = buflen; + iov.iov_base = buf; + + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_offset = 0; + auio.uio_resid = buflen; + + if ((ioflg & IO_NODELOCKED) == 0) { + if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + } + + ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); + + /* authorize attribute setting as kernel */ + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); + + if ((ioflg & IO_NODELOCKED) == 0) { + vn_finished_write(mp); + VOP_UNLOCK(vp, 0); + } + + return (error); +} + +int +vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, + const char *attrname, struct thread *td) +{ + struct mount *mp; + int error; + + if ((ioflg & IO_NODELOCKED) == 0) { + if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) + return (error); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + } + + ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); + + /* authorize attribute removal as kernel */ + error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); + if (error == EOPNOTSUPP) + error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, + NULL, td); + + if ((ioflg & IO_NODELOCKED) == 0) { + vn_finished_write(mp); + VOP_UNLOCK(vp, 0); + } + + return (error); +} + +static int +vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags, + struct vnode **rvp) +{ + + return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp)); +} + +int +vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp) +{ + + return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, + lkflags, rvp)); +} + +int +vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, + int lkflags, struct vnode **rvp) +{ + struct mount *mp; + int ltype, error; + + ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get"); + mp = vp->v_mount; + ltype = VOP_ISLOCKED(vp); + KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED, + ("vn_vget_ino: vp not locked")); + error = vfs_busy(mp, MBF_NOWAIT); + if (error != 0) { + vfs_ref(mp); + VOP_UNLOCK(vp, 0); + error = vfs_busy(mp, 0); + vn_lock(vp, ltype | LK_RETRY); + vfs_rel(mp); + if (error != 0) + return (ENOENT); + if (vp->v_iflag & VI_DOOMED) { + vfs_unbusy(mp); + return (ENOENT); + } + } + VOP_UNLOCK(vp, 0); + error = alloc(mp, alloc_arg, lkflags, rvp); + vfs_unbusy(mp); + if (error != 0 || *rvp != vp) + vn_lock(vp, ltype | LK_RETRY); + if (vp->v_iflag & VI_DOOMED) { + if (error == 0) { + if (*rvp == vp) + vunref(vp); + else + vput(*rvp); + } + error = ENOENT; + } + return (error); +} + +int +vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, + struct thread *td) +{ + + if (vp->v_type != VREG || td == NULL) + return (0); + if ((uoff_t)uio->uio_offset + uio->uio_resid > + lim_cur(td, RLIMIT_FSIZE)) { + PROC_LOCK(td->td_proc); + kern_psignal(td->td_proc, SIGXFSZ); + PROC_UNLOCK(td->td_proc); + return (EFBIG); + } + return (0); +} + +int +vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, + struct thread *td) +{ + struct vnode *vp; + + vp = fp->f_vnode; +#ifdef AUDIT + vn_lock(vp, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + VOP_UNLOCK(vp, 0); +#endif + return (setfmode(td, active_cred, vp, mode)); +} + +int +vn_chown(struct file *fp, uid_t uid, 
gid_t gid, struct ucred *active_cred, + struct thread *td) +{ + struct vnode *vp; + + vp = fp->f_vnode; +#ifdef AUDIT + vn_lock(vp, LK_SHARED | LK_RETRY); + AUDIT_ARG_VNODE1(vp); + VOP_UNLOCK(vp, 0); +#endif + return (setfown(td, active_cred, vp, uid, gid)); +} + +void +vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) +{ + vm_object_t object; + + if ((object = vp->v_object) == NULL) + return; + VM_OBJECT_WLOCK(object); + vm_object_page_remove(object, start, end, 0); + VM_OBJECT_WUNLOCK(object); +} + +int +vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) +{ + struct vattr va; + daddr_t bn, bnp; + uint64_t bsize; + off_t noff; + int error; + + KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, + ("Wrong command %lu", cmd)); + + if (vn_lock(vp, LK_SHARED) != 0) + return (EBADF); + if (vp->v_type != VREG) { + error = ENOTTY; + goto unlock; + } + error = VOP_GETATTR(vp, &va, cred); + if (error != 0) + goto unlock; + noff = *off; + if (noff >= va.va_size) { + error = ENXIO; + goto unlock; + } + bsize = vp->v_mount->mnt_stat.f_iosize; + for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize - + noff % bsize) { + error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); + if (error == EOPNOTSUPP) { + error = ENOTTY; + goto unlock; + } + if ((bnp == -1 && cmd == FIOSEEKHOLE) || + (bnp != -1 && cmd == FIOSEEKDATA)) { + noff = bn * bsize; + if (noff < *off) + noff = *off; + goto unlock; + } + } + if (noff > va.va_size) + noff = va.va_size; + /* noff == va.va_size. There is an implicit hole at the end of file. */ + if (cmd == FIOSEEKDATA) + error = ENXIO; +unlock: + VOP_UNLOCK(vp, 0); + if (error == 0) + *off = noff; + return (error); +} + +int +vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) +{ + struct ucred *cred; + struct vnode *vp; + struct vattr vattr; + off_t foffset, size; + int error, noneg; + + cred = td->td_ucred; + vp = fp->f_vnode; + foffset = foffset_lock(fp, 0); + noneg = (vp->v_type != VCHR); + error = 0; + switch (whence) { + case L_INCR: + if (noneg && + (foffset < 0 || + (offset > 0 && foffset > OFF_MAX - offset))) { + error = EOVERFLOW; + break; + } + offset += foffset; + break; + case L_XTND: + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &vattr, cred); + VOP_UNLOCK(vp, 0); + if (error) + break; + + /* + * If the file references a disk device, then fetch + * the media size and use that to determine the ending + * offset. + */ + if (vattr.va_size == 0 && vp->v_type == VCHR && + fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) + vattr.va_size = size; + if (noneg && + (vattr.va_size > OFF_MAX || + (offset > 0 && vattr.va_size > OFF_MAX - offset))) { + error = EOVERFLOW; + break; + } + offset += vattr.va_size; + break; + case L_SET: + break; + case SEEK_DATA: + error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); + break; + case SEEK_HOLE: + error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); + break; + default: + error = EINVAL; + } + if (error == 0 && noneg && offset < 0) + error = EINVAL; + if (error != 0) + goto drop; + VFS_KNOTE_UNLOCKED(vp, 0); + td->td_uretoff.tdu_off = offset; +drop: + foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); + return (error); +} + +int +vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, + struct thread *td) +{ + int error; + + /* + * Grant permission if the caller is the owner of the file, or + * the super-user, or has ACL_WRITE_ATTRIBUTES permission on + * on the file. 
If the time pointer is null, then write + * permission on the file is also sufficient. + * + * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: + * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES + * will be allowed to set the times [..] to the current + * server time. + */ + error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); + if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) + error = VOP_ACCESS(vp, VWRITE, cred, td); + return (error); +} + +int +vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) +{ + struct vnode *vp; + int error; + + if (fp->f_type == DTYPE_FIFO) + kif->kf_type = KF_TYPE_FIFO; + else + kif->kf_type = KF_TYPE_VNODE; + vp = fp->f_vnode; + vref(vp); + FILEDESC_SUNLOCK(fdp); + error = vn_fill_kinfo_vnode(vp, kif); + vrele(vp); + FILEDESC_SLOCK(fdp); + return (error); +} + +static inline void +vn_fill_junk(struct kinfo_file *kif) +{ + size_t len, olen; + + /* + * Simulate vn_fullpath returning changing values for a given + * vp during e.g. coredump. + */ + len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1; + olen = strlen(kif->kf_path); + if (len < olen) + strcpy(&kif->kf_path[len - 1], "$"); + else + for (; olen < len; olen++) + strcpy(&kif->kf_path[olen], "A"); +} + +int +vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif) +{ + struct vattr va; + char *fullpath, *freepath; + int error; + + kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type); + freepath = NULL; + fullpath = "-"; + error = vn_fullpath(curthread, vp, &fullpath, &freepath); + if (error == 0) { + strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); + } + if (freepath != NULL) + free(freepath, M_TEMP); + + KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path, + vn_fill_junk(kif); + ); + + /* + * Retrieve vnode attributes. + */ + va.va_fsid = VNOVAL; + va.va_rdev = NODEV; + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &va, curthread->td_ucred); + VOP_UNLOCK(vp, 0); + if (error != 0) + return (error); + if (va.va_fsid != VNOVAL) + kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; + else + kif->kf_un.kf_file.kf_file_fsid = + vp->v_mount->mnt_stat.f_fsid.val[0]; + kif->kf_un.kf_file.kf_file_fsid_freebsd11 = + kif->kf_un.kf_file.kf_file_fsid; /* truncate */ + kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; + kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); + kif->kf_un.kf_file.kf_file_size = va.va_size; + kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; + kif->kf_un.kf_file.kf_file_rdev_freebsd11 = + kif->kf_un.kf_file.kf_file_rdev; /* truncate */ + return (0); +} + +int +vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, + vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, + struct thread *td) +{ +#ifdef HWPMC_HOOKS + struct pmckern_map_in pkm; +#endif + struct mount *mp; + struct vnode *vp; + vm_object_t object; + vm_prot_t maxprot; + boolean_t writecounted; + int error; + +#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ + defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) + /* + * POSIX shared-memory objects are defined to have + * kernel persistence, and are not defined to support + * read(2)/write(2) -- or even open(2). Thus, we can + * use MAP_ASYNC to trade on-disk coherence for speed. + * The shm_open(3) library routine turns on the FPOSIXSHM + * flag to request this behavior. 
+ */ + if ((fp->f_flag & FPOSIXSHM) != 0) + flags |= MAP_NOSYNC; +#endif + vp = fp->f_vnode; + + /* + * Ensure that file and memory protections are + * compatible. Note that we only worry about + * writability if mapping is shared; in this case, + * current and max prot are dictated by the open file. + * XXX use the vnode instead? Problem is: what + * credentials do we use for determination? What if + * proc does a setuid? + */ + mp = vp->v_mount; + if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { + maxprot = VM_PROT_NONE; + if ((prot & VM_PROT_EXECUTE) != 0) + return (EACCES); + } else + maxprot = VM_PROT_EXECUTE; + if ((fp->f_flag & FREAD) != 0) + maxprot |= VM_PROT_READ; + else if ((prot & VM_PROT_READ) != 0) + return (EACCES); + + /* + * If we are sharing potential changes via MAP_SHARED and we + * are trying to get write permission although we opened it + * without asking for it, bail out. + */ + if ((flags & MAP_SHARED) != 0) { + if ((fp->f_flag & FWRITE) != 0) + maxprot |= VM_PROT_WRITE; + else if ((prot & VM_PROT_WRITE) != 0) + return (EACCES); + } else { + maxprot |= VM_PROT_WRITE; + cap_maxprot |= VM_PROT_WRITE; + } + maxprot &= cap_maxprot; + + /* + * For regular files and shared memory, POSIX requires that + * the value of foff be a legitimate offset within the data + * object. In particular, negative offsets are invalid. + * Blocking negative offsets and overflows here avoids + * possible wraparound or user-level access into reserved + * ranges of the data object later. In contrast, POSIX does + * not dictate how offsets are used by device drivers, so in + * the case of a device mapping a negative offset is passed + * on. + */ + if ( +#ifdef _LP64 + size > OFF_MAX || +#endif + foff < 0 || foff > OFF_MAX - size) + return (EINVAL); + + writecounted = FALSE; + error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, + &foff, &object, &writecounted); + if (error != 0) + return (error); + error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, + foff, writecounted, td); + if (error != 0) { + /* + * If this mapping was accounted for in the vnode's + * writecount, then undo that now. + */ + if (writecounted) + vm_pager_release_writecount(object, 0, size); + vm_object_deallocate(object); + } +#ifdef HWPMC_HOOKS + /* Inform hwpmc(4) if an executable is being mapped. */ + if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) { + if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) { + pkm.pm_file = vp; + pkm.pm_address = (uintptr_t) *addr; + PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm); + } + } +#endif + return (error); +} + +void +vn_fsid(struct vnode *vp, struct vattr *va) +{ + fsid_t *f; + + f = &vp->v_mount->mnt_stat.f_fsid; + va->va_fsid = (uint32_t)f->val[1]; + va->va_fsid <<= sizeof(f->val[1]) * NBBY; + va->va_fsid += (uint32_t)f->val[0]; +} + +int +vn_fsync_buf(struct vnode *vp, int waitfor) +{ + struct buf *bp, *nbp; + struct bufobj *bo; + struct mount *mp; + int error, maxretry; + + error = 0; + maxretry = 10000; /* large, arbitrarily chosen */ + mp = NULL; + if (vp->v_type == VCHR) { + VI_LOCK(vp); + mp = vp->v_rdev->si_mountpt; + VI_UNLOCK(vp); + } + bo = &vp->v_bufobj; + BO_LOCK(bo); +loop1: + /* + * MARK/SCAN initialization to avoid infinite loops. + */ + TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { + bp->b_vflags &= ~BV_SCANNED; + bp->b_error = 0; + } + + /* + * Flush all dirty buffers associated with a vnode. 
+ */ +loop2: + TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { + if ((bp->b_vflags & BV_SCANNED) != 0) + continue; + bp->b_vflags |= BV_SCANNED; + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { + if (waitfor != MNT_WAIT) + continue; + if (BUF_LOCK(bp, + LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL, + BO_LOCKPTR(bo)) != 0) { + BO_LOCK(bo); + goto loop1; + } + BO_LOCK(bo); + } + BO_UNLOCK(bo); + KASSERT(bp->b_bufobj == bo, + ("bp %p wrong b_bufobj %p should be %p", + bp, bp->b_bufobj, bo)); + if ((bp->b_flags & B_DELWRI) == 0) + panic("fsync: not dirty"); + if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { + vfs_bio_awrite(bp); + } else { + bremfree(bp); + bawrite(bp); + } + if (maxretry < 1000) + pause("dirty", hz < 1000 ? 1 : hz / 1000); + BO_LOCK(bo); + goto loop2; + } + + /* + * If synchronous the caller expects us to completely resolve all + * dirty buffers in the system. Wait for in-progress I/O to + * complete (which could include background bitmap writes), then + * retry if dirty blocks still exist. + */ + if (waitfor == MNT_WAIT) { + bufobj_wwait(bo, 0, 0); + if (bo->bo_dirty.bv_cnt > 0) { + /* + * If we are unable to write any of these buffers + * then we fail now rather than trying endlessly + * to write them out. + */ + TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) + if ((error = bp->b_error) != 0) + break; + if ((mp != NULL && mp->mnt_secondary_writes > 0) || + (error == 0 && --maxretry >= 0)) + goto loop1; + if (error == 0) + error = EAGAIN; + } + } + BO_UNLOCK(bo); + if (error != 0) + vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error); + + return (error); +} diff --git a/freebsd/sys/sys/bio.h b/freebsd/sys/sys/bio.h new file mode 100644 index 00000000..1dab6155 --- /dev/null +++ b/freebsd/sys/sys/bio.h @@ -0,0 +1,184 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)buf.h 8.9 (Berkeley) 3/30/95 + * $FreeBSD$ + */ + +#ifndef _SYS_BIO_H_ +#define _SYS_BIO_H_ + +#include +#include + +/* bio_cmd */ +#define BIO_READ 0x01 /* Read I/O data */ +#define BIO_WRITE 0x02 /* Write I/O data */ +#define BIO_DELETE 0x03 /* TRIM or free blocks, i.e. mark as unused */ +#define BIO_GETATTR 0x04 /* Get GEOM attributes of object */ +#define BIO_FLUSH 0x05 /* Commit outstanding I/O now */ +#define BIO_CMD0 0x06 /* Available for local hacks */ +#define BIO_CMD1 0x07 /* Available for local hacks */ +#define BIO_CMD2 0x08 /* Available for local hacks */ +#define BIO_ZONE 0x09 /* Zone command */ + +/* bio_flags */ +#define BIO_ERROR 0x01 /* An error occurred processing this bio. */ +#define BIO_DONE 0x02 /* This bio is finished. */ +#define BIO_ONQUEUE 0x04 /* This bio is in a queue & not yet taken. */ +/* + * This bio must be executed after all previous bios in the queue have been + * executed, and before any successive bios can be executed. + */ +#define BIO_ORDERED 0x08 +#define BIO_UNMAPPED 0x10 +#define BIO_TRANSIENT_MAPPING 0x20 +#define BIO_VLIST 0x40 + +#ifdef _KERNEL +struct disk; +struct bio; +struct vm_map; + +/* Empty classifier tag, to prevent further classification. */ +#define BIO_NOTCLASSIFIED (void *)(~0UL) + +typedef void bio_task_t(void *); + +/* + * The bio structure describes an I/O operation in the kernel. + */ +struct bio { + uint16_t bio_cmd; /* I/O operation. */ + uint16_t bio_flags; /* General flags. */ + uint16_t bio_cflags; /* Private use by the consumer. */ + uint16_t bio_pflags; /* Private use by the provider. */ + struct cdev *bio_dev; /* Device to do I/O on. */ + struct disk *bio_disk; /* Valid below geom_disk.c only */ + off_t bio_offset; /* Offset into file. */ + long bio_bcount; /* Valid bytes in buffer. */ + caddr_t bio_data; /* Memory, superblocks, indirect etc. */ + struct vm_page **bio_ma; /* Or unmapped. */ + int bio_ma_offset; /* Offset in the first page of bio_ma. */ + int bio_ma_n; /* Number of pages in bio_ma. */ + int bio_error; /* Errno for BIO_ERROR. */ + long bio_resid; /* Remaining I/O in bytes. */ + void (*bio_done)(struct bio *); + void *bio_driver1; /* Private use by the provider. */ + void *bio_driver2; /* Private use by the provider. */ + void *bio_caller1; /* Private use by the consumer. */ + void *bio_caller2; /* Private use by the consumer. */ + TAILQ_ENTRY(bio) bio_queue; /* Disksort queue. 
*/ + const char *bio_attribute; /* Attribute for BIO_[GS]ETATTR */ + struct disk_zone_args bio_zone;/* Used for BIO_ZONE */ + struct g_consumer *bio_from; /* GEOM linkage */ + struct g_provider *bio_to; /* GEOM linkage */ + off_t bio_length; /* Like bio_bcount */ + off_t bio_completed; /* Inverse of bio_resid */ + u_int bio_children; /* Number of spawned bios */ + u_int bio_inbed; /* Children safely home by now */ + struct bio *bio_parent; /* Pointer to parent */ + struct bintime bio_t0; /* Time request started */ + + bio_task_t *bio_task; /* Task_queue handler */ + void *bio_task_arg; /* Argument to above */ + + void *bio_classifier1; /* Classifier tag. */ + void *bio_classifier2; /* Classifier tag. */ + +#ifdef DIAGNOSTIC + void *_bio_caller1; + void *_bio_caller2; + uint8_t _bio_cflags; +#endif +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) + struct buf *bio_track_bp; /* Parent buf for tracking */ +#endif + + /* XXX: these go away when bio chaining is introduced */ + daddr_t bio_pblkno; /* physical block number */ +}; + +struct uio; +struct devstat; + +struct bio_queue_head { + TAILQ_HEAD(bio_queue, bio) queue; + off_t last_offset; + struct bio *insert_point; + int total; + int batched; +}; + +extern struct vm_map *bio_transient_map; +extern int bio_transient_maxcnt; + +void biodone(struct bio *bp); +void biofinish(struct bio *bp, struct devstat *stat, int error); +int biowait(struct bio *bp, const char *wchan); + +#if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) +void biotrack_buf(struct bio *bp, const char *location); + +static __inline void +biotrack(struct bio *bp, const char *location) +{ + + if (bp->bio_track_bp != NULL) + biotrack_buf(bp, location); +} +#else +static __inline void +biotrack(struct bio *bp __unused, const char *location __unused) +{ +} +#endif + +void bioq_disksort(struct bio_queue_head *ap, struct bio *bp); +struct bio *bioq_first(struct bio_queue_head *head); +struct bio *bioq_takefirst(struct bio_queue_head *head); +void bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error); +void bioq_init(struct bio_queue_head *head); +void bioq_insert_head(struct bio_queue_head *head, struct bio *bp); +void bioq_insert_tail(struct bio_queue_head *head, struct bio *bp); +void bioq_remove(struct bio_queue_head *head, struct bio *bp); + +int physio(struct cdev *dev, struct uio *uio, int ioflag); +#define physread physio +#define physwrite physio + +#endif /* _KERNEL */ + +#endif /* !_SYS_BIO_H_ */ diff --git a/freebsd/sys/sys/namei.h b/freebsd/sys/sys/namei.h new file mode 100644 index 00000000..53814117 --- /dev/null +++ b/freebsd/sys/sys/namei.h @@ -0,0 +1,226 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1985, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)namei.h 8.5 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#ifndef _SYS_NAMEI_H_ +#define _SYS_NAMEI_H_ + +#include +#include +#include +#include + +struct componentname { + /* + * Arguments to lookup. + */ + u_long cn_nameiop; /* namei operation */ + u_int64_t cn_flags; /* flags to namei */ + struct thread *cn_thread;/* thread requesting lookup */ + struct ucred *cn_cred; /* credentials */ + int cn_lkflags; /* Lock flags LK_EXCLUSIVE or LK_SHARED */ + /* + * Shared between lookup and commit routines. + */ + char *cn_pnbuf; /* pathname buffer */ + char *cn_nameptr; /* pointer to looked up name */ + long cn_namelen; /* length of looked up component */ +}; + +struct nameicap_tracker; +TAILQ_HEAD(nameicap_tracker_head, nameicap_tracker); + +/* + * Encapsulation of namei parameters. + */ +struct nameidata { + /* + * Arguments to namei/lookup. + */ + const char *ni_dirp; /* pathname pointer */ + enum uio_seg ni_segflg; /* location of pathname */ + cap_rights_t ni_rightsneeded; /* rights required to look up vnode */ + /* + * Arguments to lookup. + */ + struct vnode *ni_startdir; /* starting directory */ + struct vnode *ni_rootdir; /* logical root directory */ + struct vnode *ni_topdir; /* logical top directory */ + int ni_dirfd; /* starting directory for *at functions */ + int ni_lcf; /* local call flags */ + /* + * Results: returned from namei + */ + struct filecaps ni_filecaps; /* rights the *at base has */ + /* + * Results: returned from/manipulated by lookup + */ + struct vnode *ni_vp; /* vnode of result */ + struct vnode *ni_dvp; /* vnode of intermediate directory */ + /* + * Results: flags returned from namei + */ + u_int ni_resflags; + /* + * Shared between namei and lookup/commit routines. + */ + size_t ni_pathlen; /* remaining chars in path */ + char *ni_next; /* next location in pathname */ + u_int ni_loopcnt; /* count of symlinks encountered */ + /* + * Lookup parameters: this structure describes the subset of + * information from the nameidata structure that is passed + * through the VOP interface. 
+ */ + struct componentname ni_cnd; + struct nameicap_tracker_head ni_cap_tracker; +}; + +#ifdef _KERNEL +/* + * namei operations + */ +#define LOOKUP 0 /* perform name lookup only */ +#define CREATE 1 /* setup for file creation */ +#define DELETE 2 /* setup for file deletion */ +#define RENAME 3 /* setup for file renaming */ +#define OPMASK 3 /* mask for operation */ +/* + * namei operational modifier flags, stored in ni_cnd.flags + */ +#define LOCKLEAF 0x0004 /* lock vnode on return */ +#define LOCKPARENT 0x0008 /* want parent vnode returned locked */ +#define WANTPARENT 0x0010 /* want parent vnode returned unlocked */ +#define NOCACHE 0x0020 /* name must not be left in cache */ +#define FOLLOW 0x0040 /* follow symbolic links */ +#define LOCKSHARED 0x0100 /* Shared lock leaf */ +#define NOFOLLOW 0x0000 /* do not follow symbolic links (pseudo) */ +#define MODMASK 0x01fc /* mask of operational modifiers */ +/* + * Namei parameter descriptors. + * + * SAVENAME may be set by either the callers of namei or by VOP_LOOKUP. + * If the caller of namei sets the flag (for example execve wants to + * know the name of the program that is being executed), then it must + * free the buffer. If VOP_LOOKUP sets the flag, then the buffer must + * be freed by either the commit routine or the VOP_ABORT routine. + * SAVESTART is set only by the callers of namei. It implies SAVENAME + * plus the addition of saving the parent directory that contains the + * name in ni_startdir. It allows repeated calls to lookup for the + * name being sought. The caller is responsible for releasing the + * buffer and for vrele'ing ni_startdir. + */ +#define RDONLY 0x00000200 /* lookup with read-only semantics */ +#define HASBUF 0x00000400 /* has allocated pathname buffer */ +#define SAVENAME 0x00000800 /* save pathname buffer */ +#define SAVESTART 0x00001000 /* save starting directory */ +#define ISDOTDOT 0x00002000 /* current component name is .. */ +#define MAKEENTRY 0x00004000 /* entry is to be added to name cache */ +#define ISLASTCN 0x00008000 /* this is last component of pathname */ +#define ISSYMLINK 0x00010000 /* symlink needs interpretation */ +#define ISWHITEOUT 0x00020000 /* found whiteout */ +#define DOWHITEOUT 0x00040000 /* do whiteouts */ +#define WILLBEDIR 0x00080000 /* new files will be dirs; allow trailing / */ +#define ISUNICODE 0x00100000 /* current component name is unicode*/ +#define ISOPEN 0x00200000 /* caller is opening; return a real vnode. */ +#define NOCROSSMOUNT 0x00400000 /* do not cross mount points */ +#define NOMACCHECK 0x00800000 /* do not perform MAC checks */ +#define AUDITVNODE1 0x04000000 /* audit the looked up vnode information */ +#define AUDITVNODE2 0x08000000 /* audit the looked up vnode information */ +#define TRAILINGSLASH 0x10000000 /* path ended in a slash */ +#define NOCAPCHECK 0x20000000 /* do not perform capability checks */ +#define NOEXECCHECK 0x40000000 /* do not perform exec check on dir */ +#define PARAMASK 0x7ffffe00 /* mask of parameter descriptors */ + +/* + * Namei results flags + */ +#define NIRES_ABS 0x00000001 /* Path was absolute */ + +/* + * Flags in ni_lcf, valid for the duration of the namei call. + */ +#define NI_LCF_STRICTRELATIVE 0x0001 /* relative lookup only */ +#define NI_LCF_CAP_DOTDOT 0x0002 /* ".." in strictrelative case */ + +/* + * Initialization of a nameidata structure. 
+ */ +#define NDINIT(ndp, op, flags, segflg, namep, td) \ + NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, NULL, 0, td) +#define NDINIT_AT(ndp, op, flags, segflg, namep, dirfd, td) \ + NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, 0, td) +#define NDINIT_ATRIGHTS(ndp, op, flags, segflg, namep, dirfd, rightsp, td) \ + NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, rightsp, td) +#define NDINIT_ATVP(ndp, op, flags, segflg, namep, vp, td) \ + NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, vp, 0, td) + +void NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, + enum uio_seg segflg, const char *namep, int dirfd, struct vnode *startdir, + cap_rights_t *rightsp, struct thread *td); + +#define NDF_NO_DVP_RELE 0x00000001 +#define NDF_NO_DVP_UNLOCK 0x00000002 +#define NDF_NO_DVP_PUT 0x00000003 +#define NDF_NO_VP_RELE 0x00000004 +#define NDF_NO_VP_UNLOCK 0x00000008 +#define NDF_NO_VP_PUT 0x0000000c +#define NDF_NO_STARTDIR_RELE 0x00000010 +#define NDF_NO_FREE_PNBUF 0x00000020 +#define NDF_ONLY_PNBUF (~NDF_NO_FREE_PNBUF) + +void NDFREE(struct nameidata *, const u_int); + +int namei(struct nameidata *ndp); +int lookup(struct nameidata *ndp); +int relookup(struct vnode *dvp, struct vnode **vpp, + struct componentname *cnp); +#endif + +/* + * Stats on usefulness of namei caches. + */ +struct nchstats { + long ncs_goodhits; /* hits that we can really use */ + long ncs_neghits; /* negative hits that we can use */ + long ncs_badhits; /* hits we must drop */ + long ncs_falsehits; /* hits with id mismatch */ + long ncs_miss; /* misses */ + long ncs_long; /* long names that ignore cache */ + long ncs_pass2; /* names found with passes == 2 */ + long ncs_2passes; /* number of times we attempt it */ +}; + +extern struct nchstats nchstats; + +#endif /* !_SYS_NAMEI_H_ */ diff --git a/freebsd/sys/sys/pctrie.h b/freebsd/sys/sys/pctrie.h new file mode 100644 index 00000000..88d5d258 --- /dev/null +++ b/freebsd/sys/sys/pctrie.h @@ -0,0 +1,152 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 EMC Corp. + * Copyright (c) 2011 Jeffrey Roberson + * Copyright (c) 2008 Mayur Shardul + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SYS_PCTRIE_H_ +#define _SYS_PCTRIE_H_ + +#include + +#ifdef _KERNEL + +#define PCTRIE_DEFINE(name, type, field, allocfn, freefn) \ + \ +CTASSERT(sizeof(((struct type *)0)->field) == sizeof(uint64_t)); \ +/* \ + * XXX This assert protects flag bits, it does not enforce natural \ + * alignment. 32bit architectures do not naturally align 64bit fields. \ + */ \ +CTASSERT((__offsetof(struct type, field) & (sizeof(uint32_t) - 1)) == 0); \ + \ +static __inline struct type * \ +name##_PCTRIE_VAL2PTR(uint64_t *val) \ +{ \ + \ + if (val == NULL) \ + return (NULL); \ + return (struct type *) \ + ((uintptr_t)val - __offsetof(struct type, field)); \ +} \ + \ +static __inline uint64_t * \ +name##_PCTRIE_PTR2VAL(struct type *ptr) \ +{ \ + \ + return &ptr->field; \ +} \ + \ +static __inline int \ +name##_PCTRIE_INSERT(struct pctrie *ptree, struct type *ptr) \ +{ \ + \ + return pctrie_insert(ptree, name##_PCTRIE_PTR2VAL(ptr), \ + allocfn); \ +} \ + \ +static __inline struct type * \ +name##_PCTRIE_LOOKUP(struct pctrie *ptree, uint64_t key) \ +{ \ + \ + return name##_PCTRIE_VAL2PTR(pctrie_lookup(ptree, key)); \ +} \ + \ +static __inline __unused struct type * \ +name##_PCTRIE_LOOKUP_LE(struct pctrie *ptree, uint64_t key) \ +{ \ + \ + return name##_PCTRIE_VAL2PTR(pctrie_lookup_le(ptree, key)); \ +} \ + \ +static __inline __unused struct type * \ +name##_PCTRIE_LOOKUP_GE(struct pctrie *ptree, uint64_t key) \ +{ \ + \ + return name##_PCTRIE_VAL2PTR(pctrie_lookup_ge(ptree, key)); \ +} \ + \ +static __inline __unused void \ +name##_PCTRIE_RECLAIM(struct pctrie *ptree) \ +{ \ + \ + pctrie_reclaim_allnodes(ptree, freefn); \ +} \ + \ +static __inline void \ +name##_PCTRIE_REMOVE(struct pctrie *ptree, uint64_t key) \ +{ \ + \ + pctrie_remove(ptree, key, freefn); \ +} + +typedef void *(*pctrie_alloc_t)(struct pctrie *ptree); +typedef void (*pctrie_free_t)(struct pctrie *ptree, void *node); + +int pctrie_insert(struct pctrie *ptree, uint64_t *val, + pctrie_alloc_t allocfn); +uint64_t *pctrie_lookup(struct pctrie *ptree, uint64_t key); +uint64_t *pctrie_lookup_ge(struct pctrie *ptree, uint64_t key); +uint64_t *pctrie_lookup_le(struct pctrie *ptree, uint64_t key); +void pctrie_reclaim_allnodes(struct pctrie *ptree, + pctrie_free_t freefn); +void pctrie_remove(struct pctrie *ptree, uint64_t key, + pctrie_free_t freefn); +size_t pctrie_node_size(void); +int pctrie_zone_init(void *mem, int size, int flags); + +static __inline void +pctrie_init(struct pctrie *ptree) +{ + + ptree->pt_root = 0; +} + +static __inline boolean_t +pctrie_is_empty(struct pctrie *ptree) +{ + + return (ptree->pt_root == 0); +} + +/* + * These widths should allow the pointers to a node's children to fit within + * a single cache line. The extra levels from a narrow width should not be + * a problem thanks to path compression. + */ +#ifdef __LP64__ +#define PCTRIE_WIDTH 4 +#else +#define PCTRIE_WIDTH 3 +#endif + +#define PCTRIE_COUNT (1 << PCTRIE_WIDTH) + +#endif /* _KERNEL */ +#endif /* !_SYS_PCTRIE_H_ */ diff --git a/freebsd/sys/sys/syscallsubr.h b/freebsd/sys/sys/syscallsubr.h new file mode 100644 index 00000000..677afdd6 --- /dev/null +++ b/freebsd/sys/sys/syscallsubr.h @@ -0,0 +1,317 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2002 Ian Dowse. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_SYSCALLSUBR_H_ +#define _SYS_SYSCALLSUBR_H_ + +#include +#include +#include +#include +#include +#include +#include + +struct __wrusage; +struct file; +struct filecaps; +enum idtype; +struct itimerval; +struct image_args; +struct jail; +struct kevent; +struct kevent_copyops; +struct kld_file_stat; +struct ksiginfo; +struct mbuf; +struct msghdr; +struct msqid_ds; +struct pollfd; +struct ogetdirentries_args; +struct rlimit; +struct rusage; +struct sched_param; +union semun; +struct sockaddr; +struct stat; +struct thr_param; +struct uio; + +typedef int (*mmap_check_fp_fn)(struct file *, int, int, int); + +int kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, + size_t buflen, size_t path_max); +int kern_accept(struct thread *td, int s, struct sockaddr **name, + socklen_t *namelen, struct file **fp); +int kern_accept4(struct thread *td, int s, struct sockaddr **name, + socklen_t *namelen, int flags, struct file **fp); +int kern_accessat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, int flags, int mode); +int kern_adjtime(struct thread *td, struct timeval *delta, + struct timeval *olddelta); +int kern_alternate_path(struct thread *td, const char *prefix, const char *path, + enum uio_seg pathseg, char **pathbuf, int create, int dirfd); +int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa); +int kern_break(struct thread *td, uintptr_t *addr); +int kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds, + size_t ncmds); +int kern_cap_rights_limit(struct thread *td, int fd, cap_rights_t *rights); +int kern_chdir(struct thread *td, char *path, enum uio_seg pathseg); +int kern_clock_getcpuclockid2(struct thread *td, id_t id, int which, + clockid_t *clk_id); +int kern_clock_getres(struct thread *td, clockid_t clock_id, + struct timespec *ts); +int kern_clock_gettime(struct thread *td, clockid_t clock_id, + struct timespec *ats); +int kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, + const struct timespec *rqtp, struct timespec *rmtp); +int kern_clock_settime(struct thread *td, clockid_t clock_id, + struct timespec *ats); +int kern_close(struct thread *td, int fd); +int kern_connectat(struct thread *td, int dirfd, int fd, + struct sockaddr *sa); +int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t 
*maskp); +int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t cpusetsize, + const cpuset_t *maskp); +int kern_cpuset_getdomain(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t domainsetsize, + domainset_t *maskp, int *policyp); +int kern_cpuset_setdomain(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, size_t domainsetsize, + const domainset_t *maskp, int policy); +int kern_cpuset_getid(struct thread *td, cpulevel_t level, + cpuwhich_t which, id_t id, cpusetid_t *setid); +int kern_cpuset_setid(struct thread *td, cpuwhich_t which, + id_t id, cpusetid_t setid); +int kern_dup(struct thread *td, u_int mode, int flags, int old, int new); +int kern_execve(struct thread *td, struct image_args *args, + struct mac *mac_p); +int kern_fchmodat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, mode_t mode, int flag); +int kern_fchownat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, int uid, int gid, int flag); +int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg); +int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg); +int kern_fhstat(struct thread *td, fhandle_t fh, struct stat *buf); +int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf); +int kern_fpathconf(struct thread *td, int fd, int name, long *valuep); +int kern_fstat(struct thread *td, int fd, struct stat *sbp); +int kern_fstatfs(struct thread *td, int fd, struct statfs *buf); +int kern_fsync(struct thread *td, int fd, bool fullsync); +int kern_ftruncate(struct thread *td, int fd, off_t length); +int kern_futimes(struct thread *td, int fd, struct timeval *tptr, + enum uio_seg tptrseg); +int kern_futimens(struct thread *td, int fd, struct timespec *tptr, + enum uio_seg tptrseg); +int kern_getdirentries(struct thread *td, int fd, char *buf, size_t count, + off_t *basep, ssize_t *residp, enum uio_seg bufseg); +int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, + size_t *countp, enum uio_seg bufseg, int mode); +int kern_getitimer(struct thread *, u_int, struct itimerval *); +int kern_getppid(struct thread *); +int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, + socklen_t *alen); +int kern_getrusage(struct thread *td, int who, struct rusage *rup); +int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, + socklen_t *alen); +int kern_getsockopt(struct thread *td, int s, int level, int name, + void *optval, enum uio_seg valseg, socklen_t *valsize); +int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data); +int kern_jail(struct thread *td, struct jail *j); +int kern_jail_get(struct thread *td, struct uio *options, int flags); +int kern_jail_set(struct thread *td, struct uio *options, int flags); +int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, + struct kevent_copyops *k_ops, const struct timespec *timeout); +int kern_kevent_anonymous(struct thread *td, int nevents, + struct kevent_copyops *k_ops); +int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, + int nevents, struct kevent_copyops *k_ops, + const struct timespec *timeout); +int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps); +int kern_kldload(struct thread *td, const char *file, int *fileid); +int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat); +int kern_kldunload(struct thread *td, int fileid, int flags); +int kern_linkat(struct thread *td, int fd1, int fd2, char *path1, + char 
*path2, enum uio_seg segflg, int follow); +int kern_listen(struct thread *td, int s, int backlog); +int kern_lseek(struct thread *td, int fd, off_t offset, int whence); +int kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg, + struct timeval *tptr, enum uio_seg tptrseg); +int kern_madvise(struct thread *td, uintptr_t addr, size_t len, int behav); +int kern_mincore(struct thread *td, uintptr_t addr, size_t len, char *vec); +int kern_mkdirat(struct thread *td, int fd, char *path, + enum uio_seg segflg, int mode); +int kern_mkfifoat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, int mode); +int kern_mknodat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, int mode, dev_t dev); +int kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr, + size_t len); +int kern_mmap(struct thread *td, uintptr_t addr, size_t size, int prot, + int flags, int fd, off_t pos); +int kern_mmap_fpcheck(struct thread *td, uintptr_t addr, size_t len, + int prot, int flags, int fd, off_t pos, + mmap_check_fp_fn check_fp_fn); +int kern_mprotect(struct thread *td, uintptr_t addr, size_t size, int prot); +int kern_msgctl(struct thread *, int, int, struct msqid_ds *); +int kern_msgrcv(struct thread *, int, void *, size_t, long, int, long *); +int kern_msgsnd(struct thread *, int, const void *, size_t, int, long); +int kern_msync(struct thread *td, uintptr_t addr, size_t size, int flags); +int kern_munlock(struct thread *td, uintptr_t addr, size_t size); +int kern_munmap(struct thread *td, uintptr_t addr, size_t size); +int kern_nanosleep(struct thread *td, struct timespec *rqt, + struct timespec *rmt); +int kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap, + long *ploff); +int kern_openat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, int flags, int mode); +int kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, + int name, u_long flags, long *valuep); +int kern_pipe(struct thread *td, int fildes[2], int flags, + struct filecaps *fcaps1, struct filecaps *fcaps2); +int kern_poll(struct thread *td, struct pollfd *fds, u_int nfds, + struct timespec *tsp, sigset_t *uset); +int kern_posix_error(struct thread *td, int error); +int kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len, + int advice); +int kern_posix_fallocate(struct thread *td, int fd, off_t offset, + off_t len); +int kern_procctl(struct thread *td, enum idtype idtype, id_t id, int com, + void *data); +int kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, + off_t offset); +int kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset); +int kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, + fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits); +int kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, + int data); +int kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte, + off_t offset); +int kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset); +int kern_readlinkat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count); +int kern_readv(struct thread *td, int fd, struct uio *auio); +int kern_recvit(struct thread *td, int s, struct msghdr *mp, + enum uio_seg fromseg, struct mbuf **controlp); +int kern_renameat(struct thread *td, int oldfd, char *old, int newfd, + char *new, enum uio_seg pathseg); +int kern_rmdirat(struct thread *td, int fd, char *path, + enum uio_seg pathseg); +int 
kern_sched_getparam(struct thread *td, struct thread *targettd, + struct sched_param *param); +int kern_sched_getscheduler(struct thread *td, struct thread *targettd, + int *policy); +int kern_sched_setparam(struct thread *td, struct thread *targettd, + struct sched_param *param); +int kern_sched_setscheduler(struct thread *td, struct thread *targettd, + int policy, struct sched_param *param); +int kern_sched_rr_get_interval(struct thread *td, pid_t pid, + struct timespec *ts); +int kern_sched_rr_get_interval_td(struct thread *td, struct thread *targettd, + struct timespec *ts); +int kern_semctl(struct thread *td, int semid, int semnum, int cmd, + union semun *arg, register_t *rval); +int kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, + fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits); +int kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags, + struct mbuf *control, enum uio_seg segflg); +int kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups); +int kern_setitimer(struct thread *, u_int, struct itimerval *, + struct itimerval *); +int kern_setrlimit(struct thread *, u_int, struct rlimit *); +int kern_setsockopt(struct thread *td, int s, int level, int name, + void *optval, enum uio_seg valseg, socklen_t valsize); +int kern_settimeofday(struct thread *td, struct timeval *tv, + struct timezone *tzp); +int kern_shm_open(struct thread *td, const char *userpath, int flags, + mode_t mode, struct filecaps *fcaps); +int kern_shmat(struct thread *td, int shmid, const void *shmaddr, + int shmflg); +int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, + size_t *bufsz); +int kern_shutdown(struct thread *td, int s, int how); +int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, + struct sigaction *oact, int flags); +int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss); +int kern_sigprocmask(struct thread *td, int how, + sigset_t *set, sigset_t *oset, int flags); +int kern_sigsuspend(struct thread *td, sigset_t mask); +int kern_sigtimedwait(struct thread *td, sigset_t waitset, + struct ksiginfo *ksi, struct timespec *timeout); +int kern_sigqueue(struct thread *td, pid_t pid, int signum, + union sigval *value); +int kern_socket(struct thread *td, int domain, int type, int protocol); +int kern_statat(struct thread *td, int flag, int fd, char *path, + enum uio_seg pathseg, struct stat *sbp, + void (*hook)(struct vnode *vp, struct stat *sbp)); +int kern_statfs(struct thread *td, char *path, enum uio_seg pathseg, + struct statfs *buf); +int kern_symlinkat(struct thread *td, char *path1, int fd, char *path2, + enum uio_seg segflg); +int kern_ktimer_create(struct thread *td, clockid_t clock_id, + struct sigevent *evp, int *timerid, int preset_id); +int kern_ktimer_delete(struct thread *, int); +int kern_ktimer_settime(struct thread *td, int timer_id, int flags, + struct itimerspec *val, struct itimerspec *oval); +int kern_ktimer_gettime(struct thread *td, int timer_id, + struct itimerspec *val); +int kern_ktimer_getoverrun(struct thread *td, int timer_id); +int kern_thr_alloc(struct proc *, int pages, struct thread **); +int kern_thr_exit(struct thread *td); +int kern_thr_new(struct thread *td, struct thr_param *param); +int kern_thr_suspend(struct thread *td, struct timespec *tsp); +int kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, + off_t length); +int kern_unlinkat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, ino_t oldinum); +int kern_utimesat(struct thread *td, int fd, 
char *path, + enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg); +int kern_utimensat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg, + int follow); +int kern_wait(struct thread *td, pid_t pid, int *status, int options, + struct rusage *rup); +int kern_wait6(struct thread *td, enum idtype idtype, id_t id, int *status, + int options, struct __wrusage *wrup, siginfo_t *sip); +int kern_writev(struct thread *td, int fd, struct uio *auio); +int kern_socketpair(struct thread *td, int domain, int type, int protocol, + int *rsv); + +/* flags for kern_sigaction */ +#define KSA_OSIGSET 0x0001 /* uses osigact_t */ +#define KSA_FREEBSD4 0x0002 /* uses ucontext4 */ + +struct freebsd11_dirent; + +int freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int + count, long *basep, void (*func)(struct freebsd11_dirent *)); + +#endif /* !_SYS_SYSCALLSUBR_H_ */ diff --git a/freebsd/sys/sys/sysent.h b/freebsd/sys/sys/sysent.h new file mode 100644 index 00000000..d1d9e99b --- /dev/null +++ b/freebsd/sys/sys/sysent.h @@ -0,0 +1,327 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1988, 1991 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SYS_SYSENT_H_ +#define _SYS_SYSENT_H_ + +#include + +struct rlimit; +struct sysent; +struct thread; +struct ksiginfo; +struct syscall_args; + +enum systrace_probe_t { + SYSTRACE_ENTRY, + SYSTRACE_RETURN, +}; + +typedef int sy_call_t(struct thread *, void *); + +typedef void (*systrace_probe_func_t)(struct syscall_args *, + enum systrace_probe_t, int); +typedef void (*systrace_args_func_t)(int, void *, uint64_t *, int *); + +#ifdef _KERNEL +extern bool systrace_enabled; +#endif +extern systrace_probe_func_t systrace_probe_func; + +struct sysent { /* system call table */ + int sy_narg; /* number of arguments */ + sy_call_t *sy_call; /* implementing function */ + au_event_t sy_auevent; /* audit event associated with syscall */ + systrace_args_func_t sy_systrace_args_func; + /* optional argument conversion function. */ + u_int32_t sy_entry; /* DTrace entry ID for systrace. */ + u_int32_t sy_return; /* DTrace return ID for systrace. */ + u_int32_t sy_flags; /* General flags for system calls. */ + u_int32_t sy_thrcnt; +}; + +/* + * A system call is permitted in capability mode. + */ +#define SYF_CAPENABLED 0x00000001 + +#define SY_THR_FLAGMASK 0x7 +#define SY_THR_STATIC 0x1 +#define SY_THR_DRAINING 0x2 +#define SY_THR_ABSENT 0x4 +#define SY_THR_INCR 0x8 + +#ifdef KLD_MODULE +#define SY_THR_STATIC_KLD 0 +#else +#define SY_THR_STATIC_KLD SY_THR_STATIC +#endif + +struct image_params; +struct __sigset; +struct trapframe; +struct vnode; + +struct sysentvec { + int sv_size; /* number of entries */ + struct sysent *sv_table; /* pointer to sysent */ + u_int sv_mask; /* optional mask to index */ + int sv_errsize; /* size of errno translation table */ + const int *sv_errtbl; /* errno translation table */ + int (*sv_transtrap)(int, int); + /* translate trap-to-signal mapping */ + int (*sv_fixup)(register_t **, struct image_params *); + /* stack fixup function */ + void (*sv_sendsig)(void (*)(int), struct ksiginfo *, struct __sigset *); + /* send signal */ + char *sv_sigcode; /* start of sigtramp code */ + int *sv_szsigcode; /* size of sigtramp code */ + char *sv_name; /* name of binary type */ + int (*sv_coredump)(struct thread *, struct vnode *, off_t, int); + /* function to dump core, or NULL */ + int (*sv_imgact_try)(struct image_params *); + void (*sv_stackgap)(struct image_params *, u_long *); + int sv_minsigstksz; /* minimum signal stack size */ + int sv_pagesize; /* spare / no longer used */ + vm_offset_t sv_minuser; /* VM_MIN_ADDRESS */ + vm_offset_t sv_maxuser; /* VM_MAXUSER_ADDRESS */ + vm_offset_t sv_usrstack; /* USRSTACK */ + vm_offset_t sv_psstrings; /* PS_STRINGS */ + int sv_stackprot; /* vm protection for stack */ + register_t *(*sv_copyout_strings)(struct image_params *); + void (*sv_setregs)(struct thread *, struct image_params *, + u_long); + void (*sv_fixlimit)(struct rlimit *, int); + u_long *sv_maxssiz; + u_int sv_flags; + void (*sv_set_syscall_retval)(struct thread *, int); + int (*sv_fetch_syscall_args)(struct thread *); + const char **sv_syscallnames; + vm_offset_t sv_timekeep_base; + vm_offset_t sv_shared_page_base; + vm_offset_t sv_shared_page_len; + vm_offset_t sv_sigcode_base; + void *sv_shared_page_obj; + void (*sv_schedtail)(struct thread *); + void (*sv_thread_detach)(struct thread *); + int (*sv_trap)(struct thread *); + u_long *sv_hwcap; /* Value passed in AT_HWCAP. */ + u_long *sv_hwcap2; /* Value passed in AT_HWCAP2. */ +}; + +#define SV_ILP32 0x000100 /* 32-bit executable. */ +#define SV_LP64 0x000200 /* 64-bit executable. 
*/ +#define SV_IA32 0x004000 /* Intel 32-bit executable. */ +#define SV_AOUT 0x008000 /* a.out executable. */ +#define SV_SHP 0x010000 /* Shared page. */ +#define SV_CAPSICUM 0x020000 /* Force cap_enter() on startup. */ +#define SV_TIMEKEEP 0x040000 /* Shared page timehands. */ +#define SV_ASLR 0x080000 /* ASLR allowed. */ + +#define SV_ABI_MASK 0xff +#define SV_ABI_ERRNO(p, e) ((p)->p_sysent->sv_errsize <= 0 ? e : \ + ((e) >= (p)->p_sysent->sv_errsize ? -1 : (p)->p_sysent->sv_errtbl[e])) +#define SV_PROC_FLAG(p, x) ((p)->p_sysent->sv_flags & (x)) +#define SV_PROC_ABI(p) ((p)->p_sysent->sv_flags & SV_ABI_MASK) +#define SV_CURPROC_FLAG(x) SV_PROC_FLAG(curproc, x) +#define SV_CURPROC_ABI() SV_PROC_ABI(curproc) +/* same as ELFOSABI_XXX, to prevent header pollution */ +#define SV_ABI_LINUX 3 +#define SV_ABI_FREEBSD 9 +#define SV_ABI_CLOUDABI 17 +#define SV_ABI_UNDEF 255 + +#ifdef _KERNEL +extern struct sysentvec aout_sysvec; +extern struct sysent sysent[]; +extern const char *syscallnames[]; + +#if defined(__amd64__) +extern int i386_read_exec; +#endif + +#define NO_SYSCALL (-1) + +struct module; + +struct syscall_module_data { + int (*chainevh)(struct module *, int, void *); /* next handler */ + void *chainarg; /* arg for next event handler */ + int *offset; /* offset into sysent */ + struct sysent *new_sysent; /* new sysent */ + struct sysent old_sysent; /* old sysent */ + int flags; /* flags for syscall_register */ +}; + +/* separate initialization vector so it can be used in a substructure */ +#define SYSENT_INIT_VALS(_syscallname) { \ + .sy_narg = (sizeof(struct _syscallname ## _args ) \ + / sizeof(register_t)), \ + .sy_call = (sy_call_t *)&sys_##_syscallname, \ + .sy_auevent = SYS_AUE_##_syscallname, \ + .sy_systrace_args_func = NULL, \ + .sy_entry = 0, \ + .sy_return = 0, \ + .sy_flags = 0, \ + .sy_thrcnt = 0 \ +} + +#define MAKE_SYSENT(syscallname) \ +static struct sysent syscallname##_sysent = SYSENT_INIT_VALS(syscallname); + +#define MAKE_SYSENT_COMPAT(syscallname) \ +static struct sysent syscallname##_sysent = { \ + (sizeof(struct syscallname ## _args ) \ + / sizeof(register_t)), \ + (sy_call_t *)& syscallname, \ + SYS_AUE_##syscallname \ +} + +#define SYSCALL_MODULE(name, offset, new_sysent, evh, arg) \ +static struct syscall_module_data name##_syscall_mod = { \ + evh, arg, offset, new_sysent, { 0, NULL, AUE_NULL } \ +}; \ + \ +static moduledata_t name##_mod = { \ + "sys/" #name, \ + syscall_module_handler, \ + &name##_syscall_mod \ +}; \ +DECLARE_MODULE(name, name##_mod, SI_SUB_SYSCALLS, SI_ORDER_MIDDLE) + +#define SYSCALL_MODULE_HELPER(syscallname) \ +static int syscallname##_syscall = SYS_##syscallname; \ +MAKE_SYSENT(syscallname); \ +SYSCALL_MODULE(syscallname, \ + & syscallname##_syscall, & syscallname##_sysent, \ + NULL, NULL) + +#define SYSCALL_MODULE_PRESENT(syscallname) \ + (sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmnosys && \ + sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmressys) + +/* + * Syscall registration helpers with resource allocation handling. 
+ */ +struct syscall_helper_data { + struct sysent new_sysent; + struct sysent old_sysent; + int syscall_no; + int registered; +}; +#define SYSCALL_INIT_HELPER_F(syscallname, flags) { \ + .new_sysent = { \ + .sy_narg = (sizeof(struct syscallname ## _args ) \ + / sizeof(register_t)), \ + .sy_call = (sy_call_t *)& sys_ ## syscallname, \ + .sy_auevent = SYS_AUE_##syscallname, \ + .sy_flags = (flags) \ + }, \ + .syscall_no = SYS_##syscallname \ +} +#define SYSCALL_INIT_HELPER_COMPAT_F(syscallname, flags) { \ + .new_sysent = { \ + .sy_narg = (sizeof(struct syscallname ## _args ) \ + / sizeof(register_t)), \ + .sy_call = (sy_call_t *)& syscallname, \ + .sy_auevent = SYS_AUE_##syscallname, \ + .sy_flags = (flags) \ + }, \ + .syscall_no = SYS_##syscallname \ +} +#define SYSCALL_INIT_HELPER(syscallname) \ + SYSCALL_INIT_HELPER_F(syscallname, 0) +#define SYSCALL_INIT_HELPER_COMPAT(syscallname) \ + SYSCALL_INIT_HELPER_COMPAT_F(syscallname, 0) +#define SYSCALL_INIT_LAST { \ + .syscall_no = NO_SYSCALL \ +} + +int syscall_module_handler(struct module *mod, int what, void *arg); +int syscall_helper_register(struct syscall_helper_data *sd, int flags); +int syscall_helper_unregister(struct syscall_helper_data *sd); +/* Implementation, exposed for COMPAT code */ +int kern_syscall_register(struct sysent *sysents, int *offset, + struct sysent *new_sysent, struct sysent *old_sysent, int flags); +int kern_syscall_deregister(struct sysent *sysents, int offset, + const struct sysent *old_sysent); +int kern_syscall_module_handler(struct sysent *sysents, + struct module *mod, int what, void *arg); +int kern_syscall_helper_register(struct sysent *sysents, + struct syscall_helper_data *sd, int flags); +int kern_syscall_helper_unregister(struct sysent *sysents, + struct syscall_helper_data *sd); + +struct proc; +const char *syscallname(struct proc *p, u_int code); + +/* Special purpose system call functions. */ +struct nosys_args; + +int lkmnosys(struct thread *, struct nosys_args *); +int lkmressys(struct thread *, struct nosys_args *); + +int _syscall_thread_enter(struct thread *td, struct sysent *se); +void _syscall_thread_exit(struct thread *td, struct sysent *se); + +static inline int +syscall_thread_enter(struct thread *td, struct sysent *se) +{ + + if (__predict_true((se->sy_thrcnt & SY_THR_STATIC) != 0)) + return (0); + return (_syscall_thread_enter(td, se)); +} + +static inline void +syscall_thread_exit(struct thread *td, struct sysent *se) +{ + + if (__predict_true((se->sy_thrcnt & SY_THR_STATIC) != 0)) + return; + _syscall_thread_exit(td, se); +} + +int shared_page_alloc(int size, int align); +int shared_page_fill(int size, int align, const void *data); +void shared_page_write(int base, int size, const void *data); +void exec_sysvec_init(void *param); +void exec_inittk(void); + +#define INIT_SYSENTVEC(name, sv) \ + SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, \ + (sysinit_cfunc_t)exec_sysvec_init, sv); + +#endif /* _KERNEL */ + +#endif /* !_SYS_SYSENT_H_ */ diff --git a/freebsd/sys/sys/vmem.h b/freebsd/sys/sys/vmem.h new file mode 100644 index 00000000..e74d1e3f --- /dev/null +++ b/freebsd/sys/sys/vmem.h @@ -0,0 +1,145 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c)2006 YAMAMOTO Takashi, + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* From $NetBSD: vmem.h,v 1.20 2013/01/29 21:26:24 para Exp $ */ + +/* $FreeBSD$ */ + +#ifndef _SYS_VMEM_H_ +#define _SYS_VMEM_H_ + +#include + +#ifdef _KERNEL + +typedef struct vmem vmem_t; + +typedef uintptr_t vmem_addr_t; +typedef size_t vmem_size_t; + +#define VMEM_ADDR_MIN 0 +#define VMEM_ADDR_QCACHE_MIN 1 +#define VMEM_ADDR_MAX (~(vmem_addr_t)0) + +typedef int (vmem_import_t)(void *, vmem_size_t, int, vmem_addr_t *); +typedef void (vmem_release_t)(void *, vmem_addr_t, vmem_size_t); +typedef void (vmem_reclaim_t)(vmem_t *, int); + +/* + * Create a vmem: + * name - Name of the region + * base - Initial span start (optional) + * size - Initial span size + * quantum - Natural unit of allocation (ie PAGE_SIZE, 1, etc) + * qcache_max - Maximum size to quantum cache. This creates a UMA + * cache for each multiple of quantum up to qcache_max. + * flags - M_* flags + */ +vmem_t *vmem_create(const char *name, vmem_addr_t base, + vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags); +vmem_t *vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, + vmem_size_t size, vmem_size_t quantum, vmem_size_t qcache_max, int flags); +void vmem_destroy(vmem_t *); + +/* + * Set callbacks for bringing in dynamic regions: + * importfn - Backing store import routine. + * releasefn - Backing store release routine. + * arg - Backing store argument + * import_quantum - Size to import from backing store + */ + +void vmem_set_import(vmem_t *vm, vmem_import_t *importfn, + vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum); + +/* + * Set a limit on the total size of a vmem. + */ + +void vmem_set_limit(vmem_t *vm, vmem_size_t limit); + +/* + * Set a callback for reclaiming memory when space is exhausted: + */ +void vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn); + +/* + * Allocate and free linear regions from a vmem. Must specify + * BESTFIT or FIRSTFIT. Free is non-blocking. These routines + * respect the quantum caches. + */ +int vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp); +void vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size); + +/* + * Constrained allocate and free routines. These bypass the quantum cache. + * size - Size in units of 1, not quantum. 
+ * align - Required alignment of the start of region + * phase - Offset from alignment + * nocross - Illegal boundary + * minaddr - Minimum allowed address for last byte + * maxaddr - Maximum allowed address for first byte + * flags - M_* flags + * addrp - result + */ +int vmem_xalloc(vmem_t *vm, vmem_size_t size, vmem_size_t align, + vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr, + vmem_addr_t maxaddr, int flags, vmem_addr_t *addrp); +void vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size); + +/* + * Add a static region to a vmem after create. This won't be freed + * until the vmem is destroyed. + */ +int vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags); + +/* + * Given roundup size to the vmem's native quantum size. + */ +vmem_size_t vmem_roundup_size(vmem_t *vm, vmem_size_t size); + +/* + * Report vmem utilization according to the requested type. + */ +vmem_size_t vmem_size(vmem_t *vm, int typemask); + +void vmem_whatis(vmem_addr_t addr, int (*fn)(const char *, ...) + __printflike(1, 2)); +void vmem_print(vmem_addr_t addr, const char *, int (*fn)(const char *, ...) + __printflike(1, 2)); +void vmem_printall(const char *, int (*fn)(const char *, ...) + __printflike(1, 2)); +void vmem_startup(void); + +/* vmem_size typemask */ +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 +#define VMEM_MAXFREE 0x10 + +#endif /* _KERNEL */ + +#endif /* !_SYS_VMEM_H_ */ diff --git a/freebsd/sys/vm/vm_meter.c b/freebsd/sys/vm/vm_meter.c new file mode 100644 index 00000000..dfd50081 --- /dev/null +++ b/freebsd/sys/vm/vm_meter.c @@ -0,0 +1,561 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct vmmeter __read_mostly vm_cnt = { + .v_swtch = EARLY_COUNTER, + .v_trap = EARLY_COUNTER, + .v_syscall = EARLY_COUNTER, + .v_intr = EARLY_COUNTER, + .v_soft = EARLY_COUNTER, + .v_vm_faults = EARLY_COUNTER, + .v_io_faults = EARLY_COUNTER, + .v_cow_faults = EARLY_COUNTER, + .v_cow_optim = EARLY_COUNTER, + .v_zfod = EARLY_COUNTER, + .v_ozfod = EARLY_COUNTER, + .v_swapin = EARLY_COUNTER, + .v_swapout = EARLY_COUNTER, + .v_swappgsin = EARLY_COUNTER, + .v_swappgsout = EARLY_COUNTER, + .v_vnodein = EARLY_COUNTER, + .v_vnodeout = EARLY_COUNTER, + .v_vnodepgsin = EARLY_COUNTER, + .v_vnodepgsout = EARLY_COUNTER, + .v_intrans = EARLY_COUNTER, + .v_reactivated = EARLY_COUNTER, + .v_pdwakeups = EARLY_COUNTER, + .v_pdpages = EARLY_COUNTER, + .v_pdshortfalls = EARLY_COUNTER, + .v_dfree = EARLY_COUNTER, + .v_pfree = EARLY_COUNTER, + .v_tfree = EARLY_COUNTER, + .v_forks = EARLY_COUNTER, + .v_vforks = EARLY_COUNTER, + .v_rforks = EARLY_COUNTER, + .v_kthreads = EARLY_COUNTER, + .v_forkpages = EARLY_COUNTER, + .v_vforkpages = EARLY_COUNTER, + .v_rforkpages = EARLY_COUNTER, + .v_kthreadpages = EARLY_COUNTER, + .v_wire_count = EARLY_COUNTER, +}; + +static void +vmcounter_startup(void) +{ + counter_u64_t *cnt = (counter_u64_t *)&vm_cnt; + + COUNTER_ARRAY_ALLOC(cnt, VM_METER_NCOUNTERS, M_WAITOK); +} +SYSINIT(counter, SI_SUB_KMEM, SI_ORDER_FIRST, vmcounter_startup, NULL); + +SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min, + CTLFLAG_RW, &vm_cnt.v_free_min, 0, "Minimum low-free-pages threshold"); +SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target, + CTLFLAG_RW, &vm_cnt.v_free_target, 0, "Desired free pages"); +SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved, + CTLFLAG_RW, &vm_cnt.v_free_reserved, 0, "Pages reserved for deadlock"); +SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target, + CTLFLAG_RW, &vm_cnt.v_inactive_target, 0, "Pages desired inactive"); +SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min, + CTLFLAG_RW, &vm_cnt.v_pageout_free_min, 0, "Min pages reserved for kernel"); +SYSCTL_UINT(_vm, OID_AUTO, v_free_severe, + CTLFLAG_RW, &vm_cnt.v_free_severe, 0, "Severe page depletion point"); + +static int +sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS) +{ + +#ifdef SCTL_MASK32 + u_int32_t la[4]; + + if (req->flags & SCTL_MASK32) { + la[0] = averunnable.ldavg[0]; + la[1] = averunnable.ldavg[1]; + la[2] = averunnable.ldavg[2]; + la[3] = averunnable.fscale; + return SYSCTL_OUT(req, la, sizeof(la)); + } else +#endif + return SYSCTL_OUT(req, &averunnable, sizeof(averunnable)); +} +SYSCTL_PROC(_vm, VM_LOADAVG, loadavg, CTLTYPE_STRUCT | CTLFLAG_RD | + CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_loadavg, "S,loadavg", + "Machine loadaverage history"); + +/* + * This function aims to determine if the object is mapped, + * specifically, if it is referenced by a vm_map_entry. Because + * objects occasionally acquire transient references that do not + * represent a mapping, the method used here is inexact. However, it + * has very low overhead and is good enough for the advisory + * vm.vmtotal sysctl. 
+ */ +static bool +is_object_active(vm_object_t obj) +{ + + return (obj->ref_count > obj->shadow_count); +} + +#if defined(COMPAT_FREEBSD11) +struct vmtotal11 { + int16_t t_rq; + int16_t t_dw; + int16_t t_pw; + int16_t t_sl; + int16_t t_sw; + int32_t t_vm; + int32_t t_avm; + int32_t t_rm; + int32_t t_arm; + int32_t t_vmshr; + int32_t t_avmshr; + int32_t t_rmshr; + int32_t t_armshr; + int32_t t_free; +}; +#endif + +static int +vmtotal(SYSCTL_HANDLER_ARGS) +{ + struct vmtotal total; +#if defined(COMPAT_FREEBSD11) + struct vmtotal11 total11; +#endif + vm_object_t object; + struct proc *p; + struct thread *td; + + if (req->oldptr == NULL) { +#if defined(COMPAT_FREEBSD11) + if (curproc->p_osrel < P_OSREL_VMTOTAL64) + return (SYSCTL_OUT(req, NULL, sizeof(total11))); +#endif + return (SYSCTL_OUT(req, NULL, sizeof(total))); + } + bzero(&total, sizeof(total)); + + /* + * Calculate process statistics. + */ + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if ((p->p_flag & P_SYSTEM) != 0) + continue; + PROC_LOCK(p); + if (p->p_state != PRS_NEW) { + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + switch (td->td_state) { + case TDS_INHIBITED: + if (TD_IS_SWAPPED(td)) + total.t_sw++; + else if (TD_IS_SLEEPING(td)) { + if (td->td_priority <= PZERO) + total.t_dw++; + else + total.t_sl++; + } + break; + case TDS_CAN_RUN: + total.t_sw++; + break; + case TDS_RUNQ: + case TDS_RUNNING: + total.t_rq++; + break; + default: + break; + } + thread_unlock(td); + } + } + PROC_UNLOCK(p); + } + sx_sunlock(&allproc_lock); + /* + * Calculate object memory usage statistics. + */ + mtx_lock(&vm_object_list_mtx); + TAILQ_FOREACH(object, &vm_object_list, object_list) { + /* + * Perform unsynchronized reads on the object. In + * this case, the lack of synchronization should not + * impair the accuracy of the reported statistics. + */ + if ((object->flags & OBJ_FICTITIOUS) != 0) { + /* + * Devices, like /dev/mem, will badly skew our totals. + */ + continue; + } + if (object->ref_count == 0) { + /* + * Also skip unreferenced objects, including + * vnodes representing mounted file systems. + */ + continue; + } + if (object->ref_count == 1 && + (object->flags & OBJ_NOSPLIT) != 0) { + /* + * Also skip otherwise unreferenced swap + * objects backing tmpfs vnodes, and POSIX or + * SysV shared memory. 
+ */ + continue; + } + total.t_vm += object->size; + total.t_rm += object->resident_page_count; + if (is_object_active(object)) { + total.t_avm += object->size; + total.t_arm += object->resident_page_count; + } + if (object->shadow_count > 1) { + /* shared object */ + total.t_vmshr += object->size; + total.t_rmshr += object->resident_page_count; + if (is_object_active(object)) { + total.t_avmshr += object->size; + total.t_armshr += object->resident_page_count; + } + } + } + mtx_unlock(&vm_object_list_mtx); + total.t_pw = vm_wait_count(); + total.t_free = vm_free_count(); +#if defined(COMPAT_FREEBSD11) + /* sysctl(8) allocates twice as much memory as reported by sysctl(3) */ + if (curproc->p_osrel < P_OSREL_VMTOTAL64 && (req->oldlen == + sizeof(total11) || req->oldlen == 2 * sizeof(total11))) { + bzero(&total11, sizeof(total11)); + total11.t_rq = total.t_rq; + total11.t_dw = total.t_dw; + total11.t_pw = total.t_pw; + total11.t_sl = total.t_sl; + total11.t_sw = total.t_sw; + total11.t_vm = total.t_vm; /* truncate */ + total11.t_avm = total.t_avm; /* truncate */ + total11.t_rm = total.t_rm; /* truncate */ + total11.t_arm = total.t_arm; /* truncate */ + total11.t_vmshr = total.t_vmshr; /* truncate */ + total11.t_avmshr = total.t_avmshr; /* truncate */ + total11.t_rmshr = total.t_rmshr; /* truncate */ + total11.t_armshr = total.t_armshr; /* truncate */ + total11.t_free = total.t_free; /* truncate */ + return (SYSCTL_OUT(req, &total11, sizeof(total11))); + } +#endif + return (SYSCTL_OUT(req, &total, sizeof(total))); +} + +SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE | CTLFLAG_RD | + CTLFLAG_MPSAFE, NULL, 0, vmtotal, "S,vmtotal", + "System virtual memory statistics"); +SYSCTL_NODE(_vm, OID_AUTO, stats, CTLFLAG_RW, 0, "VM meter stats"); +static SYSCTL_NODE(_vm_stats, OID_AUTO, sys, CTLFLAG_RW, 0, + "VM meter sys stats"); +static SYSCTL_NODE(_vm_stats, OID_AUTO, vm, CTLFLAG_RW, 0, + "VM meter vm stats"); +SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats"); + +static int +sysctl_handle_vmstat(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; +#ifdef COMPAT_FREEBSD11 + uint32_t val32; +#endif + + val = counter_u64_fetch(*(counter_u64_t *)arg1); +#ifdef COMPAT_FREEBSD11 + if (req->oldlen == sizeof(val32)) { + val32 = val; /* truncate */ + return (SYSCTL_OUT(req, &val32, sizeof(val32))); + } +#endif + return (SYSCTL_OUT(req, &val, sizeof(val))); +} + +#define VM_STATS(parent, var, descr) \ + SYSCTL_OID(parent, OID_AUTO, var, CTLTYPE_U64 | CTLFLAG_MPSAFE | \ + CTLFLAG_RD, &vm_cnt.var, 0, sysctl_handle_vmstat, "QU", descr) +#define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr) +#define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr) + +VM_STATS_SYS(v_swtch, "Context switches"); +VM_STATS_SYS(v_trap, "Traps"); +VM_STATS_SYS(v_syscall, "System calls"); +VM_STATS_SYS(v_intr, "Device interrupts"); +VM_STATS_SYS(v_soft, "Software interrupts"); +VM_STATS_VM(v_vm_faults, "Address memory faults"); +VM_STATS_VM(v_io_faults, "Page faults requiring I/O"); +VM_STATS_VM(v_cow_faults, "Copy-on-write faults"); +VM_STATS_VM(v_cow_optim, "Optimized COW faults"); +VM_STATS_VM(v_zfod, "Pages zero-filled on demand"); +VM_STATS_VM(v_ozfod, "Optimized zero fill pages"); +VM_STATS_VM(v_swapin, "Swap pager pageins"); +VM_STATS_VM(v_swapout, "Swap pager pageouts"); +VM_STATS_VM(v_swappgsin, "Swap pages swapped in"); +VM_STATS_VM(v_swappgsout, "Swap pages swapped out"); +VM_STATS_VM(v_vnodein, "Vnode pager pageins"); +VM_STATS_VM(v_vnodeout, "Vnode pager pageouts"); 
+VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in"); +VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out"); +VM_STATS_VM(v_intrans, "In transit page faults"); +VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon"); +VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups"); +VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls"); +VM_STATS_VM(v_dfree, "Pages freed by pagedaemon"); +VM_STATS_VM(v_pfree, "Pages freed by exiting processes"); +VM_STATS_VM(v_tfree, "Total pages freed"); +VM_STATS_VM(v_forks, "Number of fork() calls"); +VM_STATS_VM(v_vforks, "Number of vfork() calls"); +VM_STATS_VM(v_rforks, "Number of rfork() calls"); +VM_STATS_VM(v_kthreads, "Number of fork() calls by kernel"); +VM_STATS_VM(v_forkpages, "VM pages affected by fork()"); +VM_STATS_VM(v_vforkpages, "VM pages affected by vfork()"); +VM_STATS_VM(v_rforkpages, "VM pages affected by rfork()"); +VM_STATS_VM(v_kthreadpages, "VM pages affected by fork() by kernel"); + +static int +sysctl_handle_vmstat_proc(SYSCTL_HANDLER_ARGS) +{ + u_int (*fn)(void); + uint32_t val; + + fn = arg1; + val = fn(); + return (SYSCTL_OUT(req, &val, sizeof(val))); +} + +#define VM_STATS_PROC(var, descr, fn) \ + SYSCTL_OID(_vm_stats_vm, OID_AUTO, var, CTLTYPE_U32 | CTLFLAG_MPSAFE | \ + CTLFLAG_RD, fn, 0, sysctl_handle_vmstat_proc, "IU", descr) + +#define VM_STATS_UINT(var, descr) \ + SYSCTL_UINT(_vm_stats_vm, OID_AUTO, var, CTLFLAG_RD, &vm_cnt.var, 0, descr) + +VM_STATS_UINT(v_page_size, "Page size in bytes"); +VM_STATS_UINT(v_page_count, "Total number of pages in system"); +VM_STATS_UINT(v_free_reserved, "Pages reserved for deadlock"); +VM_STATS_UINT(v_free_target, "Pages desired free"); +VM_STATS_UINT(v_free_min, "Minimum low-free-pages threshold"); +VM_STATS_PROC(v_free_count, "Free pages", vm_free_count); +VM_STATS_PROC(v_wire_count, "Wired pages", vm_wire_count); +VM_STATS_PROC(v_active_count, "Active pages", vm_active_count); +VM_STATS_UINT(v_inactive_target, "Desired inactive pages"); +VM_STATS_PROC(v_inactive_count, "Inactive pages", vm_inactive_count); +VM_STATS_PROC(v_laundry_count, "Pages eligible for laundering", + vm_laundry_count); +VM_STATS_UINT(v_pageout_free_min, "Min pages reserved for kernel"); +VM_STATS_UINT(v_interrupt_free_min, "Reserved pages for interrupt code"); +VM_STATS_UINT(v_free_severe, "Severe page depletion point"); + +#ifdef COMPAT_FREEBSD11 +/* + * Provide compatibility sysctls for the benefit of old utilities which exit + * with an error if they cannot be found. 
+ */ +SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cache_count, CTLFLAG_RD, + SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility"); +SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_tcached, CTLFLAG_RD, + SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility"); +#endif + +u_int +vm_free_count(void) +{ + u_int v; + int i; + + v = 0; + for (i = 0; i < vm_ndomains; i++) + v += vm_dom[i].vmd_free_count; + + return (v); +} + +static u_int +vm_pagequeue_count(int pq) +{ + u_int v; + int i; + + v = 0; + for (i = 0; i < vm_ndomains; i++) + v += vm_dom[i].vmd_pagequeues[pq].pq_cnt; + + return (v); +} + +u_int +vm_active_count(void) +{ + + return (vm_pagequeue_count(PQ_ACTIVE)); +} + +u_int +vm_inactive_count(void) +{ + + return (vm_pagequeue_count(PQ_INACTIVE)); +} + +u_int +vm_laundry_count(void) +{ + + return (vm_pagequeue_count(PQ_LAUNDRY)); +} + +static int +sysctl_vm_pdpages(SYSCTL_HANDLER_ARGS) +{ + struct vm_pagequeue *pq; + uint64_t ret; + int dom, i; + + ret = counter_u64_fetch(vm_cnt.v_pdpages); + for (dom = 0; dom < vm_ndomains; dom++) + for (i = 0; i < PQ_COUNT; i++) { + pq = &VM_DOMAIN(dom)->vmd_pagequeues[i]; + ret += pq->pq_pdpages; + } + return (SYSCTL_OUT(req, &ret, sizeof(ret))); +} +SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdpages, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_vm_pdpages, "QU", + "Pages analyzed by pagedaemon"); + +static void +vm_domain_stats_init(struct vm_domain *vmd, struct sysctl_oid *parent) +{ + struct sysctl_oid *oid; + + vmd->vmd_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(parent), OID_AUTO, + vmd->vmd_name, CTLFLAG_RD, NULL, ""); + oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO, + "stats", CTLFLAG_RD, NULL, ""); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "free_count", CTLFLAG_RD, &vmd->vmd_free_count, 0, + "Free pages"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "active", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_cnt, 0, + "Active pages"); + SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "actpdpgs", CTLFLAG_RD, + &vmd->vmd_pagequeues[PQ_ACTIVE].pq_pdpages, 0, + "Active pages scanned by the page daemon"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "inactive", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt, 0, + "Inactive pages"); + SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "inactpdpgs", CTLFLAG_RD, + &vmd->vmd_pagequeues[PQ_INACTIVE].pq_pdpages, 0, + "Inactive pages scanned by the page daemon"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "laundry", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt, 0, + "laundry pages"); + SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "laundpdpgs", CTLFLAG_RD, + &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_pdpages, 0, + "Laundry pages scanned by the page daemon"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "unswappable", + CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt, 0, + "Unswappable pages"); + SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "unswppdpgs", CTLFLAG_RD, + &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_pdpages, 0, + "Unswappable pages scanned by the page daemon"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "inactive_target", CTLFLAG_RD, &vmd->vmd_inactive_target, 0, + "Target inactive pages"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "free_target", CTLFLAG_RD, &vmd->vmd_free_target, 0, + "Target free pages"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "free_reserved", CTLFLAG_RD, &vmd->vmd_free_reserved, 0, + "Reserved free pages"); 
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "free_min", CTLFLAG_RD, &vmd->vmd_free_min, 0, + "Minimum free pages"); + SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, + "free_severe", CTLFLAG_RD, &vmd->vmd_free_severe, 0, + "Severe free pages"); + +} + +static void +vm_stats_init(void *arg __unused) +{ + struct sysctl_oid *oid; + int i; + + oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm), OID_AUTO, + "domain", CTLFLAG_RD, NULL, ""); + for (i = 0; i < vm_ndomains; i++) + vm_domain_stats_init(VM_DOMAIN(i), oid); +} + +SYSINIT(vmstats_init, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_stats_init, NULL); -- cgit v1.2.3
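
For readers unfamiliar with the vmem(9) interface imported in the vmem.h hunk
above, the sketch below shows one plausible kernel-side use of vmem_xalloc()
and vmem_xfree() as declared there. It is illustrative only: the arena
pointer, the 1 MiB size, the 4 KiB alignment and the M_NOWAIT | M_BESTFIT
strategy are assumptions made for this example, not taken from the import;
VMEM_ADDR_MIN, VMEM_ADDR_MAX and M_BESTFIT are the symbols FreeBSD's
sys/vmem.h provides for an unconstrained address window and the fit policy.

/*
 * Illustrative sketch: allocate a 1 MiB, 4 KiB-aligned region from an
 * existing vmem arena and release it again.  Error handling is minimal.
 */
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/vmem.h>

static int
example_vmem_xalloc(vmem_t *arena)
{
	vmem_addr_t addr;
	int error;

	/* No phase offset, no nocross boundary, no address window limits. */
	error = vmem_xalloc(arena, 1024 * 1024, 4096, 0, 0,
	    VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, &addr);
	if (error != 0)
		return (error);

	/* ... use [addr, addr + 1 MiB) ... */

	/* Constrained allocations are returned with vmem_xfree(). */
	vmem_xfree(arena, addr, 1024 * 1024);
	return (0);
}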
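
The statistics machinery imported in vm_meter.c can be exercised from user
space through sysctl(3). The short program below is a sketch, assuming a
FreeBSD (or compatible) environment in which the vm.vmtotal and
vm.stats.vm.v_vm_faults nodes created in this file are present and
struct vmtotal comes from sys/vmmeter.h.

/*
 * Read the vm.vmtotal snapshot produced by the vmtotal() handler above and
 * one of the counter-backed vm.stats.vm statistics.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	struct vmtotal t;
	uint64_t faults;
	size_t len;

	len = sizeof(t);
	if (sysctlbyname("vm.vmtotal", &t, &len, NULL, 0) != 0)
		return (1);
	printf("runnable threads: %jd, free pages: %ju\n",
	    (intmax_t)t.t_rq, (uintmax_t)t.t_free);

	len = sizeof(faults);
	if (sysctlbyname("vm.stats.vm.v_vm_faults", &faults, &len,
	    NULL, 0) != 0)
		return (1);
	printf("vm faults: %ju\n", (uintmax_t)faults);
	return (0);
}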