diff options
author | Jennifer Averett <jennifer.averett@oarcorp.com> | 2012-11-26 09:47:09 -0600 |
---|---|---|
committer | Jennifer Averett <jennifer.averett@oarcorp.com> | 2012-11-26 09:47:09 -0600 |
commit | 0bde19eee050bbdc4511070cf14f48719e400c26 (patch) | |
tree | d7bf926c193da25600c5612fe25070c5133d9979 | |
parent | Add custom limits.h and timespec.h (diff) | |
download | rtems-libbsd-0bde19eee050bbdc4511070cf14f48719e400c26.tar.bz2 |
Switch to a version of select that is closer to bsd's version.
27 files changed, 12823 insertions, 58 deletions
@@ -12,7 +12,6 @@ CFLAGS += -I freebsd/$(RTEMS_CPU)/include CFLAGS += -I contrib/altq CFLAGS += -I contrib/pf CFLAGS += -I copied/rtemsbsd/$(RTEMS_CPU)/include -CFLAGS += -g CFLAGS += -w CFLAGS += -std=gnu99 CFLAGS += -MT $@ -MD -MP -MF $(basename $@).d @@ -38,7 +37,6 @@ C_FILES += rtemsbsd/src/rtems-bsd-lock.c C_FILES += rtemsbsd/src/rtems-bsd-log.c C_FILES += rtemsbsd/src/rtems-bsd-sx.c C_FILES += rtemsbsd/src/rtems-bsd-rwlock.c -C_FILES += rtemsbsd/src/rtems-bsd-generic.c C_FILES += rtemsbsd/src/rtems-bsd-page.c C_FILES += rtemsbsd/src/rtems-bsd-panic.c C_FILES += rtemsbsd/src/rtems-bsd-synch.c @@ -406,6 +404,9 @@ C_FILES += freebsd/netatalk/ddp_pcb.c C_FILES += freebsd/netatalk/ddp_usrreq.c C_FILES += freebsd/netatalk/at_proto.c C_FILES += freebsd/netatalk/ddp_output.c +C_FILES += freebsd/kern/sys_generic.c +C_FILES += freebsd/kern/kern_descrip.c +C_FILES += freebsd/kern/kern_mtxpool.c ifeq ($(RTEMS_CPU), i386) C_FILES += freebsd/i386/pci/pci_bus.c C_FILES += freebsd/i386/i386/legacy.c diff --git a/freebsd-to-rtems.py b/freebsd-to-rtems.py index 9fb8788c..81b1b289 100755 --- a/freebsd-to-rtems.py +++ b/freebsd-to-rtems.py @@ -659,7 +659,7 @@ rtems.addRTEMSSourceFiles( 'src/rtems-bsd-log.c', 'src/rtems-bsd-sx.c', 'src/rtems-bsd-rwlock.c', - 'src/rtems-bsd-generic.c', + #'src/rtems-bsd-generic.c', 'src/rtems-bsd-page.c', 'src/rtems-bsd-panic.c', 'src/rtems-bsd-synch.c', @@ -711,13 +711,13 @@ rtems.addEmptyHeaderFiles( 'sys/cpuset.h', 'sys/exec.h', 'sys/fail.h', - 'sys/limits.h', + #'sys/limits.h', 'sys/sleepqueue.h', 'sys/namei.h', 'sys/_pthreadtypes.h', #'sys/resourcevar.h', 'sys/sched.h', - 'sys/select.h', + #'sys/select.h', 'sys/syscallsubr.h', 'sys/sysent.h', 'sys/syslimits.h', @@ -725,7 +725,7 @@ rtems.addEmptyHeaderFiles( 'sys/stat.h', #'sys/time.h', 'time.h', - 'sys/timespec.h', + #'sys/timespec.h', 'sys/_timeval.h', #'sys/vmmeter.h', #'sys/vnode.h', @@ -1346,6 +1346,14 @@ devNic.addHeaderFiles( 'netatalk/ddp_var.h', 'netatalk/phase2.h', 'sys/mman.h', + 
'sys/buf.h', + 'sys/mqueue.h', + 'sys/tty.h', + 'sys/ttyqueue.h', + 'sys/ttydisc.h', + 'sys/ttydevsw.h', + 'sys/ttyhook.h', + 'sys/user.h', ] ) @@ -1412,6 +1420,9 @@ devNic.addSourceFiles( 'netatalk/ddp_usrreq.c', 'netatalk/at_proto.c', 'netatalk/ddp_output.c', + 'kern/sys_generic.c', + 'kern/kern_descrip.c', + 'kern/kern_mtxpool.c', ] ) diff --git a/freebsd-userspace/Makefile b/freebsd-userspace/Makefile index cef8546e..18dde49a 100644 --- a/freebsd-userspace/Makefile +++ b/freebsd-userspace/Makefile @@ -160,6 +160,9 @@ C_FILES += lib/libipsec/ipsec_dump_policy.c C_FILES += lib/libipsec/policy_token.c C_FILES += lib/libipsec/policy_parse.c +C_FILES += lib/libc_r/uthread/uthread_select.c +C_FILES += lib/libc_r/uthread/uthread_kern.c + # RTEMS Specific Files # C_FILES += rtems/rtems-net-setup.c C_FILES += rtems/syslog.c @@ -171,6 +174,9 @@ C_FILES += rtems/rtems-uthread_main_np.c C_FILES += rtems/rtems-uthread_kevent.c C_FILES += rtems/rtems-uthread_kqueue.c C_FILES += rtems/rtems-shell.c +C_FILES += rtems/rtems-syspoll.c +C_FILES += rtems/rtems-uthread_init.c +C_FILES += rtems/rtems-get_curthread.c # ping command sources C_FILES += commands/sbin/ping/ping.c @@ -278,14 +284,14 @@ GEN_FILES += commands/sbin/route/keywords.h # lib/libc/net GEN_FILES += lib/libc/net/nslexer.c GEN_FILES += lib/libc/net/nsparser.c -EXTRA_CLEAN = lib/libc/net/nsparser.i -EXTRA_CLEAN += lib/libc/net/y.tab.h +CLEAN_FILES = lib/libc/net/nsparser.i +CLEAN_FILES += lib/libc/net/y.tab.h # lib/libipsec GEN_FILES += lib/libipsec/policy_token.c GEN_FILES += lib/libipsec/policy_parse.c -EXTRA_CLEAN += lib/libipsec/policy_parse.i -EXTRA_CLEAN += lib/libipsec/y.tab.h +CLEAN_FILES += lib/libipsec/policy_parse.i +CLEAN_FILES += lib/libipsec/y.tab.h all: $(LIB) diff --git a/freebsd-userspace/commands/sbin/ping/ping.c b/freebsd-userspace/commands/sbin/ping/ping.c index 45162532..34d86325 100644 --- a/freebsd-userspace/commands/sbin/ping/ping.c +++ b/freebsd-userspace/commands/sbin/ping/ping.c @@ -106,6 
+106,10 @@ __FBSDID("$FreeBSD$"); #include <sysexits.h> #include <unistd.h> +#ifdef __rtems__ +#define select __select +#endif + #define INADDR_LEN ((int)sizeof(in_addr_t)) #define TIMEVAL_LEN ((int)sizeof(struct tv32)) #define MASK_LEN (ICMP_MASKLEN - ICMP_MINLEN) diff --git a/freebsd-userspace/lib/libc_r/uthread/uthread_kern.c b/freebsd-userspace/lib/libc_r/uthread/uthread_kern.c new file mode 100644 index 00000000..a8c8720d --- /dev/null +++ b/freebsd-userspace/lib/libc_r/uthread/uthread_kern.c @@ -0,0 +1,1157 @@ +/* + * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + */ +#include <errno.h> +#include <poll.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <unistd.h> +#include <setjmp.h> +#include <sys/param.h> +#include <sys/types.h> +#ifdef __rtems__ +#include <freebsd/sys/signalvar.h> +#include <freebsd/sys/timespec.h> +#else +#include <sys/signalvar.h> +#endif +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/socket.h> +#include <sys/uio.h> +#ifndef __rtems__ +#include <sys/syscall.h> +#endif +#include <fcntl.h> +#include <pthread.h> +#include "pthread_private.h" + +#ifdef __rtems__ +#include <rtems.h> +#endif + +/* #define DEBUG_THREAD_KERN */ +#ifdef DEBUG_THREAD_KERN +#define DBG_MSG stdout_debug +#else +#define DBG_MSG(x...) +#endif + +/* Static function prototype definitions: */ +static void +thread_kern_poll(int wait_reqd); + +static void +dequeue_signals(void); + +static inline void +thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in); + +/* Static variables: */ +static int last_tick = 0; +static int called_from_handler = 0; + +#ifndef __rtems__ +/* + * This is called when a signal handler finishes and wants to + * return to a previous frame. 
+ */ +void +_thread_kern_sched_frame(struct pthread_signal_frame *psf) +{ + struct pthread *curthread = _get_curthread(); + + /* + * Flag the pthread kernel as executing scheduler code + * to avoid a signal from interrupting this execution and + * corrupting the (soon-to-be) current frame. + */ + _thread_kern_in_sched = 1; + + /* Restore the signal frame: */ + _thread_sigframe_restore(curthread, psf); + + /* The signal mask was restored; check for any pending signals: */ + curthread->check_pending = 1; + + /* Switch to the thread scheduler: */ + ___longjmp(_thread_kern_sched_jb, 1); +} + + +void +_thread_kern_sched(ucontext_t *ucp) +{ + struct pthread *curthread = _get_curthread(); + + /* + * Flag the pthread kernel as executing scheduler code + * to avoid a scheduler signal from interrupting this + * execution and calling the scheduler again. + */ + _thread_kern_in_sched = 1; + + /* Check if this function was called from the signal handler: */ + if (ucp != NULL) { + called_from_handler = 1; + DBG_MSG("Entering scheduler due to signal\n"); + } + + /* Save the state of the current thread: */ + if (_setjmp(curthread->ctx.jb) != 0) { + DBG_MSG("Returned from ___longjmp, thread %p\n", + curthread); + /* + * This point is reached when a longjmp() is called + * to restore the state of a thread. + * + * This is the normal way out of the scheduler. + */ + _thread_kern_in_sched = 0; + + if (curthread->sig_defer_count == 0) { + if (((curthread->cancelflags & + PTHREAD_AT_CANCEL_POINT) == 0) && + ((curthread->cancelflags & + PTHREAD_CANCEL_ASYNCHRONOUS) != 0)) + /* + * Cancellations override signals. + * + * Stick a cancellation point at the + * start of each async-cancellable + * thread's resumption. + * + * We allow threads woken at cancel + * points to do their own checks. 
+ */ + pthread_testcancel(); + } + + if (_sched_switch_hook != NULL) { + /* Run the installed switch hook: */ + thread_run_switch_hook(_last_user_thread, curthread); + } + if (ucp == NULL) + return; + else { + /* + * Set the process signal mask in the context; it + * could have changed by the handler. + */ + ucp->uc_sigmask = _process_sigmask; + + /* Resume the interrupted thread: */ + __sys_sigreturn(ucp); + } + } + /* Switch to the thread scheduler: */ + ___longjmp(_thread_kern_sched_jb, 1); +} + +void +_thread_kern_sched_sig(void) +{ + struct pthread *curthread = _get_curthread(); + + curthread->check_pending = 1; + _thread_kern_sched(NULL); +} + + +void +_thread_kern_scheduler(void) +{ + struct timespec ts; + struct timeval tv; + struct pthread *curthread = _get_curthread(); + pthread_t pthread, pthread_h; + unsigned int current_tick; + int add_to_prioq; + + /* If the currently running thread is a user thread, save it: */ + if ((curthread->flags & PTHREAD_FLAGS_PRIVATE) == 0) + _last_user_thread = curthread; + + if (called_from_handler != 0) { + called_from_handler = 0; + + /* + * We were called from a signal handler; restore the process + * signal mask. + */ + if (__sys_sigprocmask(SIG_SETMASK, + &_process_sigmask, NULL) != 0) + PANIC("Unable to restore process mask after signal"); + } + + /* + * Enter a scheduling loop that finds the next thread that is + * ready to run. This loop completes when there are no more threads + * in the global list or when a thread has its state restored by + * either a sigreturn (if the state was saved as a sigcontext) or a + * longjmp (if the state was saved by a setjmp). + */ + while (!(TAILQ_EMPTY(&_thread_list))) { + /* Get the current time of day: */ + GET_CURRENT_TOD(tv); + TIMEVAL_TO_TIMESPEC(&tv, &ts); + current_tick = _sched_ticks; + + /* + * Protect the scheduling queues from access by the signal + * handler. 
+ */ + _queue_signals = 1; + add_to_prioq = 0; + + if (curthread != &_thread_kern_thread) { + /* + * This thread no longer needs to yield the CPU. + */ + curthread->yield_on_sig_undefer = 0; + + if (curthread->state != PS_RUNNING) { + /* + * Save the current time as the time that the + * thread became inactive: + */ + curthread->last_inactive = (long)current_tick; + if (curthread->last_inactive < + curthread->last_active) { + /* Account for a rollover: */ + curthread->last_inactive =+ + UINT_MAX + 1; + } + } + + /* + * Place the currently running thread into the + * appropriate queue(s). + */ + switch (curthread->state) { + case PS_DEAD: + case PS_STATE_MAX: /* to silence -Wall */ + case PS_SUSPENDED: + /* + * Dead and suspended threads are not placed + * in any queue: + */ + break; + + case PS_RUNNING: + /* + * Runnable threads can't be placed in the + * priority queue until after waiting threads + * are polled (to preserve round-robin + * scheduling). + */ + add_to_prioq = 1; + break; + + /* + * States which do not depend on file descriptor I/O + * operations or timeouts: + */ + case PS_DEADLOCK: + case PS_FDLR_WAIT: + case PS_FDLW_WAIT: + case PS_FILE_WAIT: + case PS_JOIN: + case PS_MUTEX_WAIT: + case PS_SIGSUSPEND: + case PS_SIGTHREAD: + case PS_SIGWAIT: + case PS_WAIT_WAIT: + /* No timeouts for these states: */ + curthread->wakeup_time.tv_sec = -1; + curthread->wakeup_time.tv_nsec = -1; + + /* Restart the time slice: */ + curthread->slice_usec = -1; + + /* Insert into the waiting queue: */ + PTHREAD_WAITQ_INSERT(curthread); + break; + + /* States which can timeout: */ + case PS_COND_WAIT: + case PS_SLEEP_WAIT: + /* Restart the time slice: */ + curthread->slice_usec = -1; + + /* Insert into the waiting queue: */ + PTHREAD_WAITQ_INSERT(curthread); + break; + + /* States that require periodic work: */ + case PS_SPINBLOCK: + /* No timeouts for this state: */ + curthread->wakeup_time.tv_sec = -1; + curthread->wakeup_time.tv_nsec = -1; + + /* Increment spinblock 
count: */ + _spinblock_count++; + + /* FALLTHROUGH */ + case PS_FDR_WAIT: + case PS_FDW_WAIT: + case PS_POLL_WAIT: + case PS_SELECT_WAIT: + /* Restart the time slice: */ + curthread->slice_usec = -1; + + /* Insert into the waiting queue: */ + PTHREAD_WAITQ_INSERT(curthread); + + /* Insert into the work queue: */ + PTHREAD_WORKQ_INSERT(curthread); + break; + } + + /* + * Are there pending signals for this thread? + * + * This check has to be performed after the thread + * has been placed in the queue(s) appropriate for + * its state. The process of adding pending signals + * can change a threads state, which in turn will + * attempt to add or remove the thread from any + * scheduling queue to which it belongs. + */ + if (curthread->check_pending != 0) { + curthread->check_pending = 0; + _thread_sig_check_pending(curthread); + } + } + + /* + * Avoid polling file descriptors if there are none + * waiting: + */ + if (TAILQ_EMPTY(&_workq) != 0) { + } + /* + * Poll file descriptors only if a new scheduling signal + * has occurred or if we have no more runnable threads. + */ + else if (((current_tick = _sched_ticks) != last_tick) || + ((curthread->state != PS_RUNNING) && + (PTHREAD_PRIOQ_FIRST() == NULL))) { + /* Unprotect the scheduling queues: */ + _queue_signals = 0; + + /* + * Poll file descriptors to update the state of threads + * waiting on file I/O where data may be available: + */ + thread_kern_poll(0); + + /* Protect the scheduling queues: */ + _queue_signals = 1; + } + last_tick = current_tick; + + /* + * Wake up threads that have timedout. This has to be + * done after polling in case a thread does a poll or + * select with zero time. 
+ */ + PTHREAD_WAITQ_SETACTIVE(); + while (((pthread = TAILQ_FIRST(&_waitingq)) != NULL) && + (pthread->wakeup_time.tv_sec != -1) && + (((pthread->wakeup_time.tv_sec == 0) && + (pthread->wakeup_time.tv_nsec == 0)) || + (pthread->wakeup_time.tv_sec < ts.tv_sec) || + ((pthread->wakeup_time.tv_sec == ts.tv_sec) && + (pthread->wakeup_time.tv_nsec <= ts.tv_nsec)))) { + switch (pthread->state) { + case PS_POLL_WAIT: + case PS_SELECT_WAIT: + /* Return zero file descriptors ready: */ + pthread->data.poll_data->nfds = 0; + /* FALLTHROUGH */ + default: + /* + * Remove this thread from the waiting queue + * (and work queue if necessary) and place it + * in the ready queue. + */ + PTHREAD_WAITQ_CLEARACTIVE(); + if (pthread->flags & PTHREAD_FLAGS_IN_WORKQ) + PTHREAD_WORKQ_REMOVE(pthread); + PTHREAD_NEW_STATE(pthread, PS_RUNNING); + PTHREAD_WAITQ_SETACTIVE(); + break; + } + /* + * Flag the timeout in the thread structure: + */ + pthread->timeout = 1; + } + PTHREAD_WAITQ_CLEARACTIVE(); + + /* + * Check to see if the current thread needs to be added + * to the priority queue: + */ + if (add_to_prioq != 0) { + /* + * Save the current time as the time that the + * thread became inactive: + */ + current_tick = _sched_ticks; + curthread->last_inactive = (long)current_tick; + if (curthread->last_inactive < + curthread->last_active) { + /* Account for a rollover: */ + curthread->last_inactive =+ UINT_MAX + 1; + } + + if ((curthread->slice_usec != -1) && + (curthread->attr.sched_policy != SCHED_FIFO)) { + /* + * Accumulate the number of microseconds for + * which the current thread has run: + */ + curthread->slice_usec += + (curthread->last_inactive - + curthread->last_active) * + (long)_clock_res_usec; + /* Check for time quantum exceeded: */ + if (curthread->slice_usec > TIMESLICE_USEC) + curthread->slice_usec = -1; + } + + if (curthread->slice_usec == -1) { + /* + * The thread exceeded its time + * quantum or it yielded the CPU; + * place it at the tail of the + * queue for its 
priority. + */ + PTHREAD_PRIOQ_INSERT_TAIL(curthread); + } else { + /* + * The thread hasn't exceeded its + * interval. Place it at the head + * of the queue for its priority. + */ + PTHREAD_PRIOQ_INSERT_HEAD(curthread); + } + } + + /* + * Get the highest priority thread in the ready queue. + */ + pthread_h = PTHREAD_PRIOQ_FIRST(); + + /* Check if there are no threads ready to run: */ + if (pthread_h == NULL) { + /* + * Lock the pthread kernel by changing the pointer to + * the running thread to point to the global kernel + * thread structure: + */ + _set_curthread(&_thread_kern_thread); + curthread = &_thread_kern_thread; + + DBG_MSG("No runnable threads, using kernel thread %p\n", + curthread); + + /* Unprotect the scheduling queues: */ + _queue_signals = 0; + + /* + * There are no threads ready to run, so wait until + * something happens that changes this condition: + */ + thread_kern_poll(1); + + /* + * This process' usage will likely be very small + * while waiting in a poll. Since the scheduling + * clock is based on the profiling timer, it is + * unlikely that the profiling timer will fire + * and update the time of day. To account for this, + * get the time of day after polling with a timeout. + */ + gettimeofday((struct timeval *) &_sched_tod, NULL); + + /* Check once more for a runnable thread: */ + _queue_signals = 1; + pthread_h = PTHREAD_PRIOQ_FIRST(); + _queue_signals = 0; + } + + if (pthread_h != NULL) { + /* Remove the thread from the ready queue: */ + PTHREAD_PRIOQ_REMOVE(pthread_h); + + /* Unprotect the scheduling queues: */ + _queue_signals = 0; + + /* + * Check for signals queued while the scheduling + * queues were protected: + */ + while (_sigq_check_reqd != 0) { + /* Clear before handling queued signals: */ + _sigq_check_reqd = 0; + + /* Protect the scheduling queues again: */ + _queue_signals = 1; + + dequeue_signals(); + + /* + * Check for a higher priority thread that + * became runnable due to signal handling. 
+ */ + if (((pthread = PTHREAD_PRIOQ_FIRST()) != NULL) && + (pthread->active_priority > pthread_h->active_priority)) { + /* Remove the thread from the ready queue: */ + PTHREAD_PRIOQ_REMOVE(pthread); + + /* + * Insert the lower priority thread + * at the head of its priority list: + */ + PTHREAD_PRIOQ_INSERT_HEAD(pthread_h); + + /* There's a new thread in town: */ + pthread_h = pthread; + } + + /* Unprotect the scheduling queues: */ + _queue_signals = 0; + } + + /* Make the selected thread the current thread: */ + _set_curthread(pthread_h); + curthread = pthread_h; + + /* + * Save the current time as the time that the thread + * became active: + */ + current_tick = _sched_ticks; + curthread->last_active = (long) current_tick; + + /* + * Check if this thread is running for the first time + * or running again after using its full time slice + * allocation: + */ + if (curthread->slice_usec == -1) { + /* Reset the accumulated time slice period: */ + curthread->slice_usec = 0; + } + + /* + * If we had a context switch, run any + * installed switch hooks. + */ + if ((_sched_switch_hook != NULL) && + (_last_user_thread != curthread)) { + thread_run_switch_hook(_last_user_thread, + curthread); + } + /* + * Continue the thread at its current frame: + */ +#if NOT_YET + _setcontext(&curthread->ctx.uc); +#else + ___longjmp(curthread->ctx.jb, 1); +#endif + /* This point should not be reached. */ + PANIC("Thread has returned from sigreturn or longjmp"); + } + } + + /* There are no more threads, so exit this process: */ + exit(0); +} +#endif /* __rtems__ */ + +void +_thread_kern_sched_state(enum pthread_state state, char *fname, int lineno) +{ +#ifdef __rtems__ + rtems_task_wake_after(RTEMS_YIELD_PROCESSOR); +#else + + struct pthread *curthread = _get_curthread(); + + /* + * Flag the pthread kernel as executing scheduler code + * to avoid a scheduler signal from interrupting this + * execution and calling the scheduler again. 
+ */ + _thread_kern_in_sched = 1; + + /* + * Prevent the signal handler from fiddling with this thread + * before its state is set and is placed into the proper queue. + */ + _queue_signals = 1; + + /* Change the state of the current thread: */ + curthread->state = state; + curthread->fname = fname; + curthread->lineno = lineno; + + /* Schedule the next thread that is ready: */ + _thread_kern_sched(NULL); +#endif /* __rtems__ */ +} + +#ifndef __rtems__ +void +_thread_kern_sched_state_unlock(enum pthread_state state, + spinlock_t *lock, char *fname, int lineno) +{ + struct pthread *curthread = _get_curthread(); + + /* + * Flag the pthread kernel as executing scheduler code + * to avoid a scheduler signal from interrupting this + * execution and calling the scheduler again. + */ + _thread_kern_in_sched = 1; + + /* + * Prevent the signal handler from fiddling with this thread + * before its state is set and it is placed into the proper + * queue(s). + */ + _queue_signals = 1; + + /* Change the state of the current thread: */ + curthread->state = state; + curthread->fname = fname; + curthread->lineno = lineno; + + _SPINUNLOCK(lock); + + /* Schedule the next thread that is ready: */ + _thread_kern_sched(NULL); +} + +static void +thread_kern_poll(int wait_reqd) +{ + int count = 0; + int i, found; + int kern_pipe_added = 0; + int nfds = 0; + int timeout_ms = 0; + struct pthread *pthread; + struct timespec ts; + struct timeval tv; + + /* Check if the caller wants to wait: */ + if (wait_reqd == 0) { + timeout_ms = 0; + } + else { + /* Get the current time of day: */ + GET_CURRENT_TOD(tv); + TIMEVAL_TO_TIMESPEC(&tv, &ts); + + _queue_signals = 1; + pthread = TAILQ_FIRST(&_waitingq); + _queue_signals = 0; + + if ((pthread == NULL) || (pthread->wakeup_time.tv_sec == -1)) { + /* + * Either there are no threads in the waiting queue, + * or there are no threads that can timeout. 
+ */ + timeout_ms = INFTIM; + } + else if (pthread->wakeup_time.tv_sec - ts.tv_sec > 60000) + /* Limit maximum timeout to prevent rollover. */ + timeout_ms = 60000; + else { + /* + * Calculate the time left for the next thread to + * timeout: + */ + timeout_ms = ((pthread->wakeup_time.tv_sec - ts.tv_sec) * + 1000) + ((pthread->wakeup_time.tv_nsec - ts.tv_nsec) / + 1000000); + /* + * Don't allow negative timeouts: + */ + if (timeout_ms < 0) + timeout_ms = 0; + } + } + + /* Protect the scheduling queues: */ + _queue_signals = 1; + + /* + * Check to see if the signal queue needs to be walked to look + * for threads awoken by a signal while in the scheduler. + */ + if (_sigq_check_reqd != 0) { + /* Reset flag before handling queued signals: */ + _sigq_check_reqd = 0; + + dequeue_signals(); + } + + /* + * Check for a thread that became runnable due to a signal: + */ + if (PTHREAD_PRIOQ_FIRST() != NULL) { + /* + * Since there is at least one runnable thread, + * disable the wait. + */ + timeout_ms = 0; + } + + /* + * Form the poll table: + */ + nfds = 0; + if (timeout_ms != 0) { + /* Add the kernel pipe to the poll table: */ + _thread_pfd_table[nfds].fd = _thread_kern_pipe[0]; + _thread_pfd_table[nfds].events = POLLRDNORM; + _thread_pfd_table[nfds].revents = 0; + nfds++; + kern_pipe_added = 1; + } + + PTHREAD_WAITQ_SETACTIVE(); + TAILQ_FOREACH(pthread, &_workq, qe) { + switch (pthread->state) { + case PS_SPINBLOCK: + /* + * If the lock is available, let the thread run. + */ + if (pthread->data.spinlock->access_lock == 0) { + PTHREAD_WAITQ_CLEARACTIVE(); + PTHREAD_WORKQ_REMOVE(pthread); + PTHREAD_NEW_STATE(pthread,PS_RUNNING); + PTHREAD_WAITQ_SETACTIVE(); + /* One less thread in a spinblock state: */ + _spinblock_count--; + /* + * Since there is at least one runnable + * thread, disable the wait. 
+ */ + timeout_ms = 0; + } + break; + + /* File descriptor read wait: */ + case PS_FDR_WAIT: + /* Limit number of polled files to table size: */ + if (nfds < _thread_dtablesize) { + _thread_pfd_table[nfds].events = POLLRDNORM; + _thread_pfd_table[nfds].fd = pthread->data.fd.fd; + nfds++; + } + break; + + /* File descriptor write wait: */ + case PS_FDW_WAIT: + /* Limit number of polled files to table size: */ + if (nfds < _thread_dtablesize) { + _thread_pfd_table[nfds].events = POLLWRNORM; + _thread_pfd_table[nfds].fd = pthread->data.fd.fd; + nfds++; + } + break; + + /* File descriptor poll or select wait: */ + case PS_POLL_WAIT: + case PS_SELECT_WAIT: + /* Limit number of polled files to table size: */ + if (pthread->data.poll_data->nfds + nfds < + _thread_dtablesize) { + for (i = 0; i < pthread->data.poll_data->nfds; i++) { + _thread_pfd_table[nfds + i].fd = + pthread->data.poll_data->fds[i].fd; + _thread_pfd_table[nfds + i].events = + pthread->data.poll_data->fds[i].events; + } + nfds += pthread->data.poll_data->nfds; + } + break; + + /* Other states do not depend on file I/O. 
*/ + default: + break; + } + } + PTHREAD_WAITQ_CLEARACTIVE(); + + /* + * Wait for a file descriptor to be ready for read, write, or + * an exception, or a timeout to occur: + */ + count = __sys_poll(_thread_pfd_table, nfds, timeout_ms); + + if (kern_pipe_added != 0) + /* + * Remove the pthread kernel pipe file descriptor + * from the pollfd table: + */ + nfds = 1; + else + nfds = 0; + + /* + * Check if it is possible that there are bytes in the kernel + * read pipe waiting to be read: + */ + if (count < 0 || ((kern_pipe_added != 0) && + (_thread_pfd_table[0].revents & POLLRDNORM))) { + /* + * If the kernel read pipe was included in the + * count: + */ + if (count > 0) { + /* Decrement the count of file descriptors: */ + count--; + } + + if (_sigq_check_reqd != 0) { + /* Reset flag before handling signals: */ + _sigq_check_reqd = 0; + + dequeue_signals(); + } + } + + /* + * Check if any file descriptors are ready: + */ + if (count > 0) { + /* + * Enter a loop to look for threads waiting on file + * descriptors that are flagged as available by the + * _poll syscall: + */ + PTHREAD_WAITQ_SETACTIVE(); + TAILQ_FOREACH(pthread, &_workq, qe) { + switch (pthread->state) { + case PS_SPINBLOCK: + /* + * If the lock is available, let the thread run. 
+ */ + if (pthread->data.spinlock->access_lock == 0) { + PTHREAD_WAITQ_CLEARACTIVE(); + PTHREAD_WORKQ_REMOVE(pthread); + PTHREAD_NEW_STATE(pthread,PS_RUNNING); + PTHREAD_WAITQ_SETACTIVE(); + + /* + * One less thread in a spinblock state: + */ + _spinblock_count--; + } + break; + + /* File descriptor read wait: */ + case PS_FDR_WAIT: + if ((nfds < _thread_dtablesize) && + (_thread_pfd_table[nfds].revents + & (POLLRDNORM|POLLERR|POLLHUP|POLLNVAL)) + != 0) { + PTHREAD_WAITQ_CLEARACTIVE(); + PTHREAD_WORKQ_REMOVE(pthread); + PTHREAD_NEW_STATE(pthread,PS_RUNNING); + PTHREAD_WAITQ_SETACTIVE(); + } + nfds++; + break; + + /* File descriptor write wait: */ + case PS_FDW_WAIT: + if ((nfds < _thread_dtablesize) && + (_thread_pfd_table[nfds].revents + & (POLLWRNORM|POLLERR|POLLHUP|POLLNVAL)) + != 0) { + PTHREAD_WAITQ_CLEARACTIVE(); + PTHREAD_WORKQ_REMOVE(pthread); + PTHREAD_NEW_STATE(pthread,PS_RUNNING); + PTHREAD_WAITQ_SETACTIVE(); + } + nfds++; + break; + + /* File descriptor poll or select wait: */ + case PS_POLL_WAIT: + case PS_SELECT_WAIT: + if (pthread->data.poll_data->nfds + nfds < + _thread_dtablesize) { + /* + * Enter a loop looking for I/O + * readiness: + */ + found = 0; + for (i = 0; i < pthread->data.poll_data->nfds; i++) { + if (_thread_pfd_table[nfds + i].revents != 0) { + pthread->data.poll_data->fds[i].revents = + _thread_pfd_table[nfds + i].revents; + found++; + } + } + + /* Increment before destroying: */ + nfds += pthread->data.poll_data->nfds; + + if (found != 0) { + pthread->data.poll_data->nfds = found; + PTHREAD_WAITQ_CLEARACTIVE(); + PTHREAD_WORKQ_REMOVE(pthread); + PTHREAD_NEW_STATE(pthread,PS_RUNNING); + PTHREAD_WAITQ_SETACTIVE(); + } + } + else + nfds += pthread->data.poll_data->nfds; + break; + + /* Other states do not depend on file I/O. */ + default: + break; + } + } + PTHREAD_WAITQ_CLEARACTIVE(); + } + else if (_spinblock_count != 0) { + /* + * Enter a loop to look for threads waiting on a spinlock + * that is now available. 
+ */ + PTHREAD_WAITQ_SETACTIVE(); + TAILQ_FOREACH(pthread, &_workq, qe) { + if (pthread->state == PS_SPINBLOCK) { + /* + * If the lock is available, let the thread run. + */ + if (pthread->data.spinlock->access_lock == 0) { + PTHREAD_WAITQ_CLEARACTIVE(); + PTHREAD_WORKQ_REMOVE(pthread); + PTHREAD_NEW_STATE(pthread,PS_RUNNING); + PTHREAD_WAITQ_SETACTIVE(); + + /* + * One less thread in a spinblock state: + */ + _spinblock_count--; + } + } + } + PTHREAD_WAITQ_CLEARACTIVE(); + } + + /* Unprotect the scheduling queues: */ + _queue_signals = 0; + + while (_sigq_check_reqd != 0) { + /* Handle queued signals: */ + _sigq_check_reqd = 0; + + /* Protect the scheduling queues: */ + _queue_signals = 1; + + dequeue_signals(); + + /* Unprotect the scheduling queues: */ + _queue_signals = 0; + } +} +#endif /* __rtems__ */ + +void +_thread_kern_set_timeout(const struct timespec * timeout) +{ + struct pthread *curthread = _get_curthread(); + struct timespec current_time; + struct timeval tv; + + /* Reset the timeout flag for the running thread: */ + curthread->timeout = 0; + + /* Check if the thread is to wait forever: */ + if (timeout == NULL) { + /* + * Set the wakeup time to something that can be recognised as + * different to an actual time of day: + */ + curthread->wakeup_time.tv_sec = -1; + curthread->wakeup_time.tv_nsec = -1; + } + /* Check if no waiting is required: */ + else if (timeout->tv_sec == 0 && timeout->tv_nsec == 0) { + /* Set the wake up time to 'immediately': */ + curthread->wakeup_time.tv_sec = 0; + curthread->wakeup_time.tv_nsec = 0; + } else { + /* Get the current time: */ + GET_CURRENT_TOD(tv); + TIMEVAL_TO_TIMESPEC(&tv, &current_time); + + /* Calculate the time for the current thread to wake up: */ + curthread->wakeup_time.tv_sec = current_time.tv_sec + timeout->tv_sec; + curthread->wakeup_time.tv_nsec = current_time.tv_nsec + timeout->tv_nsec; + + /* Check if the nanosecond field needs to wrap: */ + if (curthread->wakeup_time.tv_nsec >= 1000000000) { + /* Wrap 
the nanosecond field: */ + curthread->wakeup_time.tv_sec += 1; + curthread->wakeup_time.tv_nsec -= 1000000000; + } + } +} + +#ifndef __rtems__ +void +_thread_kern_sig_defer(void) +{ + struct pthread *curthread = _get_curthread(); + + /* Allow signal deferral to be recursive. */ + curthread->sig_defer_count++; +} + +void +_thread_kern_sig_undefer(void) +{ + struct pthread *curthread = _get_curthread(); + + /* + * Perform checks to yield only if we are about to undefer + * signals. + */ + if (curthread->sig_defer_count > 1) { + /* Decrement the signal deferral count. */ + curthread->sig_defer_count--; + } + else if (curthread->sig_defer_count == 1) { + /* Reenable signals: */ + curthread->sig_defer_count = 0; + + /* + * Check if there are queued signals: + */ + if (_sigq_check_reqd != 0) + _thread_kern_sched(NULL); + + /* + * Check for asynchronous cancellation before delivering any + * pending signals: + */ + if (((curthread->cancelflags & PTHREAD_AT_CANCEL_POINT) == 0) && + ((curthread->cancelflags & PTHREAD_CANCEL_ASYNCHRONOUS) != 0)) + pthread_testcancel(); + + /* + * If there are pending signals or this thread has + * to yield the CPU, call the kernel scheduler: + * + * XXX - Come back and revisit the pending signal problem + */ + if ((curthread->yield_on_sig_undefer != 0) || + SIGNOTEMPTY(curthread->sigpend)) { + curthread->yield_on_sig_undefer = 0; + _thread_kern_sched(NULL); + } + } +} + +static void +dequeue_signals(void) +{ + char bufr[128]; + int num; + + /* + * Enter a loop to clear the pthread kernel pipe: + */ + while (((num = __sys_read(_thread_kern_pipe[0], bufr, + sizeof(bufr))) > 0) || (num == -1 && errno == EINTR)) { + } + if ((num < 0) && (errno != EAGAIN)) { + /* + * The only error we should expect is if there is + * no data to read. 
+ */ + PANIC("Unable to read from thread kernel pipe"); + } + /* Handle any pending signals: */ + _thread_sig_handle_pending(); +} + +static inline void +thread_run_switch_hook(pthread_t thread_out, pthread_t thread_in) +{ + pthread_t tid_out = thread_out; + pthread_t tid_in = thread_in; + + if ((tid_out != NULL) && + (tid_out->flags & PTHREAD_FLAGS_PRIVATE) != 0) + tid_out = NULL; + if ((tid_in != NULL) && + (tid_in->flags & PTHREAD_FLAGS_PRIVATE) != 0) + tid_in = NULL; + + if ((_sched_switch_hook != NULL) && (tid_out != tid_in)) { + /* Run the scheduler switch hook: */ + _sched_switch_hook(tid_out, tid_in); + } +} + +struct pthread * +_get_curthread(void) +{ + if (_thread_initial == NULL) + _thread_init(); + + return (_thread_run); +} + +void +_set_curthread(struct pthread *newthread) +{ + _thread_run = newthread; +} +#endif /* __rtems__ */ diff --git a/freebsd-userspace/lib/libc_r/uthread/uthread_select.c b/freebsd-userspace/lib/libc_r/uthread/uthread_select.c new file mode 100644 index 00000000..632f44d2 --- /dev/null +++ b/freebsd-userspace/lib/libc_r/uthread/uthread_select.c @@ -0,0 +1,240 @@ +/* + * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ +#include <unistd.h> +#include <errno.h> +#include <poll.h> +#include <stdlib.h> +#include <string.h> +#include <sys/param.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/fcntl.h> +#include <pthread.h> +#include "pthread_private.h" + +__weak_reference(__select, select); + +#ifdef __rtems__ +#include <freebsd/sys/timespec.h> +#define realloc _bsd_realloc +#endif + + +int +_select(int numfds, fd_set * readfds, fd_set * writefds, fd_set * exceptfds, + struct timeval * timeout) +{ + struct pthread *curthread = _get_curthread(); + struct timespec ts; + int i, ret = 0, f_wait = 1; + int pfd_index, got_events = 0, fd_count = 0; + struct pthread_poll_data data; + +#ifndef __rtems__ /* XXX - NOT SURE WHAT TO DEFINE _thread_dtablesize TO. 
*/ + if (numfds > _thread_dtablesize) { + numfds = _thread_dtablesize; + } +#endif + /* Count the number of file descriptors to be polled: */ + if (readfds || writefds || exceptfds) { + for (i = 0; i < numfds; i++) { + if ((readfds && FD_ISSET(i, readfds)) || + (exceptfds && FD_ISSET(i, exceptfds)) || + (writefds && FD_ISSET(i, writefds))) { + fd_count++; + } + } + } + + /* + * Allocate memory for poll data if it hasn't already been + * allocated or if previously allocated memory is insufficient. + */ + if ((curthread->poll_data.fds == NULL) || + (curthread->poll_data.nfds < fd_count)) { + data.fds = (struct pollfd *) realloc(curthread->poll_data.fds, + sizeof(struct pollfd) * MAX(128, fd_count)); + if (data.fds == NULL) { + errno = ENOMEM; + ret = -1; + } + else { + /* + * Note that the threads poll data always + * indicates what is allocated, not what is + * currently being polled. + */ + curthread->poll_data.fds = data.fds; + curthread->poll_data.nfds = MAX(128, fd_count); + } + } + /* Check if a timeout was specified: */ + if (timeout) { + if (timeout->tv_sec < 0 || + timeout->tv_usec < 0 || timeout->tv_usec >= 1000000) { + errno = EINVAL; + return (-1); + } + + /* Convert the timeval to a timespec: */ + TIMEVAL_TO_TIMESPEC(timeout, &ts); + + /* Set the wake up time: */ + _thread_kern_set_timeout(&ts); + if (ts.tv_sec == 0 && ts.tv_nsec == 0) + f_wait = 0; + } else { + /* Wait for ever: */ + _thread_kern_set_timeout(NULL); + } + + if (ret == 0) { + /* Setup the wait data. */ + data.fds = curthread->poll_data.fds; + data.nfds = fd_count; + + /* + * Setup the array of pollfds. Optimize this by + running the loop in reverse and stopping when + * the number of selected file descriptors is reached. 
+ */ + for (i = numfds - 1, pfd_index = fd_count - 1; + (i >= 0) && (pfd_index >= 0); i--) { + data.fds[pfd_index].events = 0; + if (readfds && FD_ISSET(i, readfds)) { + data.fds[pfd_index].events = POLLRDNORM; + } + if (exceptfds && FD_ISSET(i, exceptfds)) { + data.fds[pfd_index].events |= POLLRDBAND; + } + if (writefds && FD_ISSET(i, writefds)) { + data.fds[pfd_index].events |= POLLWRNORM; + } + if (data.fds[pfd_index].events != 0) { + /* + * Set the file descriptor to be polled and + * clear revents in case of a timeout which + * leaves fds unchanged: + */ + data.fds[pfd_index].fd = i; + data.fds[pfd_index].revents = 0; + pfd_index--; + } + } + if (((ret = __sys_poll(data.fds, data.nfds, 0)) == 0) && + (f_wait != 0)) { + curthread->data.poll_data = &data; + curthread->interrupted = 0; + _thread_kern_sched_state(PS_SELECT_WAIT, __FILE__, __LINE__); + if (curthread->interrupted) { + errno = EINTR; + data.nfds = 0; + ret = -1; + } else + ret = data.nfds; + } + } + + if (ret >= 0) { + numfds = 0; + for (i = 0; i < fd_count; i++) { + /* + * Check the results of the poll and clear + * this file descriptor from the fdset if + * the requested event wasn't ready. + */ + + /* + * First check for invalid descriptor. + * If found, set errno and return -1. 
+ */ + if (data.fds[i].revents & POLLNVAL) { + errno = EBADF; + return -1; + } + + got_events = 0; + if (readfds != NULL) { + if (FD_ISSET(data.fds[i].fd, readfds)) { + if ((data.fds[i].revents & (POLLIN + | POLLRDNORM | POLLERR + | POLLHUP | POLLNVAL)) != 0) + got_events++; + else + FD_CLR(data.fds[i].fd, readfds); + } + } + if (writefds != NULL) { + if (FD_ISSET(data.fds[i].fd, writefds)) { + if ((data.fds[i].revents & (POLLOUT + | POLLWRNORM | POLLWRBAND | POLLERR + | POLLHUP | POLLNVAL)) != 0) + got_events++; + else + FD_CLR(data.fds[i].fd, + writefds); + } + } + if (exceptfds != NULL) { + if (FD_ISSET(data.fds[i].fd, exceptfds)) { + if (data.fds[i].revents & (POLLRDBAND | + POLLPRI)) + got_events++; + else + FD_CLR(data.fds[i].fd, + exceptfds); + } + } + if (got_events != 0) + numfds+=got_events; + } + ret = numfds; + } + + return (ret); +} + +int +__select(int numfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, + struct timeval *timeout) +{ + int ret; + +#ifndef __rtems__ + _thread_enter_cancellation_point(); +#endif + ret = _select(numfds, readfds, writefds, exceptfds, timeout); +#ifndef __rtems__ + _thread_leave_cancellation_point(); +#endif + + return ret; +} diff --git a/freebsd-userspace/rtems/include/pthread_private.h b/freebsd-userspace/rtems/include/pthread_private.h new file mode 100644 index 00000000..ce5ba6d8 --- /dev/null +++ b/freebsd-userspace/rtems/include/pthread_private.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Private thread definitions for the uthread kernel. + * + * $FreeBSD$ + */ + +#ifndef _PTHREAD_PRIVATE_H +#define _PTHREAD_PRIVATE_H + +/* + * Evaluate the storage class specifier. + */ +#ifdef GLOBAL_PTHREAD_PRIVATE +#define SCLASS +#else +#define SCLASS extern +#endif + +/* + * Include files. + */ +#include <setjmp.h> +#include <signal.h> +#include <stdio.h> +#include <freebsd/sys/param.h> +#include <freebsd/sys/queue.h> +#include <freebsd/sys/types.h> +#include <freebsd/sys/time.h> +#include <freebsd/sys/cdefs.h> +#include <sched.h> +#include <spinlock.h> +#include <pthread_np.h> +#include <freebsd/sys/malloc.h> + +/* + * Define a thread-safe macro to get the current time of day + * which is updated at regular intervals by the scheduling signal + * handler. 
+ */ +#define GET_CURRENT_TOD(tv) \ + do { \ + tv.tv_sec = _sched_tod.tv_sec; \ + tv.tv_usec = _sched_tod.tv_usec; \ + } while (tv.tv_sec != _sched_tod.tv_sec) + + +/* + * rtems uses the following structure to allow the method + * _thread_kern_sched_state to be called. This function + * is stubbed out to cause a processor yield. + */ + +/* + * Thread states. + */ +enum pthread_state { +#if 0 + PS_RUNNING, + PS_SIGTHREAD, + PS_MUTEX_WAIT, + PS_COND_WAIT, + PS_FDLR_WAIT, + PS_FDLW_WAIT, + PS_FDR_WAIT, + PS_FDW_WAIT, + PS_FILE_WAIT, + PS_POLL_WAIT, +#endif + PS_SELECT_WAIT, +#if 0 + PS_SLEEP_WAIT, + PS_WAIT_WAIT, + PS_SIGSUSPEND, + PS_SIGWAIT, + PS_SPINBLOCK, + PS_JOIN, + PS_SUSPENDED, + PS_DEAD, + PS_DEADLOCK, +#endif + PS_STATE_MAX +}; + +struct pthread_poll_data { + int nfds; + struct pollfd *fds; +}; + +struct pthread_wait_data { + struct pthread_poll_data *poll_data; +}; + +/* + * Thread structure. + */ +struct pthread { + + /* + * Time to wake up thread. This is used for sleeping threads and + * for any operation which may time out (such as select). + */ + struct timespec wakeup_time; + + /* TRUE if operation has timed out. */ + int timeout; + + /* Wait data. */ + struct pthread_wait_data data; + + /* + * Allocated for converting select into poll.
+ */ + struct pthread_poll_data poll_data; + + /* + * Set to TRUE if a blocking operation was + * interrupted by a signal: + */ + int interrupted; +}; + +/* Time of day at last scheduling timer signal: */ +SCLASS struct timeval volatile _sched_tod +#ifdef GLOBAL_PTHREAD_PRIVATE += { 0, 0 }; +#else +; +#endif +struct pthread *_get_curthread(void); + +#endif /* !_PTHREAD_PRIVATE_H */ diff --git a/freebsd-userspace/rtems/rtems-get_curthread.c b/freebsd-userspace/rtems/rtems-get_curthread.c new file mode 100644 index 00000000..263cac60 --- /dev/null +++ b/freebsd-userspace/rtems/rtems-get_curthread.c @@ -0,0 +1,72 @@ +#include <freebsd/machine/rtems-bsd-config.h> + +#include <freebsd/sys/malloc.h> + +#include <pthread.h> +#include "pthread_private.h" + + +static struct pthread *rtems_bsd_curpthread = NULL; + + +static void rtems_bsd_pthread_descriptor_dtor(void *td) +{ + // XXX are there other pieces to clean up? + free(td, M_TEMP); +} + +static struct pthread * +rtems_bsd_pthread_init( rtems_id id ) +{ + rtems_status_code sc = RTEMS_SUCCESSFUL; + unsigned index = 0; + struct pthread *td; + + td = _bsd_malloc( sizeof(struct pthread), M_TEMP, M_WAITOK | M_ZERO); + if (td == NULL) + return NULL; + + td->timeout = 0; + td->data.poll_data = NULL; + td->poll_data.nfds = 0; + td->poll_data.fds = NULL; + td->interrupted = 0; + rtems_bsd_curpthread = td; + + // Now add the task descriptor as a per-task variable + sc = rtems_task_variable_add( + id, + &rtems_bsd_curpthread, + rtems_bsd_pthread_descriptor_dtor + ); + if (sc != RTEMS_SUCCESSFUL) { + free(td, M_TEMP); + return NULL; + } + + return td; +} + +/* + */ + +struct pthread * +_get_curthread(void) +{ + struct pthread *td; + + /* + * If we already have a struct thread associated with this thread, + * obtain it. Otherwise, allocate and initialize one. 
+ */ + td = rtems_bsd_curpthread; + if ( td == NULL ) { + td = rtems_bsd_pthread_init( rtems_task_self() ); + if ( td == NULL ){ + panic("_get_curthread: Unable to create pthread\n"); + } + } + + return td; +} + diff --git a/freebsd-userspace/rtems/rtems-syspoll.c b/freebsd-userspace/rtems/rtems-syspoll.c new file mode 100644 index 00000000..d53d14e8 --- /dev/null +++ b/freebsd-userspace/rtems/rtems-syspoll.c @@ -0,0 +1,30 @@ +#include <unistd.h> +#include <errno.h> +#include <sys/time.h> +#include <rtems.h> +#include <rtems/error.h> +#include <freebsd/sys/poll.h> + +struct poll_args { + struct pollfd *fds; + u_int nfds; + int timeout; +}; + +int kern_poll( struct thread *td, struct poll_args *uap ); + + +int +__sys_poll(struct pollfd *fds, unsigned nfds, int timeout) +{ + struct poll_args uap; + struct thread *td = rtems_get_curthread(); + + uap.fds = fds; + uap.nfds = nfds; + uap.timeout = timeout; + + kern_poll(td, &uap); + + return -1; +} diff --git a/freebsd-userspace/rtems/rtems-uthread_init.c b/freebsd-userspace/rtems/rtems-uthread_init.c new file mode 100644 index 00000000..3226ed62 --- /dev/null +++ b/freebsd-userspace/rtems/rtems-uthread_init.c @@ -0,0 +1,8 @@ +#include <freebsd/machine/rtems-bsd-config.h> + +/* Allocate space for global thread variables here: */ +#define GLOBAL_PTHREAD_PRIVATE + +#include <freebsd/sys/types.h> +#include <pthread.h> +#include "pthread_private.h" diff --git a/freebsd/kern/kern_condvar.c b/freebsd/kern/kern_condvar.c new file mode 100644 index 00000000..34ec29cb --- /dev/null +++ b/freebsd/kern/kern_condvar.c @@ -0,0 +1,455 @@ +#include <freebsd/machine/rtems-bsd-config.h> + +/*- + * Copyright (c) 2000 Jake Burkholder <jake@freebsd.org>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <freebsd/sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <freebsd/local/opt_ktrace.h> + +#include <freebsd/sys/param.h> +#include <freebsd/sys/systm.h> +#include <freebsd/sys/lock.h> +#include <freebsd/sys/mutex.h> +#include <freebsd/sys/proc.h> +#include <freebsd/sys/kernel.h> +#include <freebsd/sys/ktr.h> +#include <freebsd/sys/condvar.h> +#include <freebsd/sys/sched.h> +#include <freebsd/sys/signalvar.h> +#include <freebsd/sys/sleepqueue.h> +#include <freebsd/sys/resourcevar.h> +#ifdef KTRACE +#include <freebsd/sys/uio.h> +#include <freebsd/sys/ktrace.h> +#endif + +/* + * Common sanity checks for cv_wait* functions. 
+ */ +#define CV_ASSERT(cvp, lock, td) do { \ + KASSERT((td) != NULL, ("%s: curthread NULL", __func__)); \ + KASSERT(TD_IS_RUNNING(td), ("%s: not TDS_RUNNING", __func__)); \ + KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \ + KASSERT((lock) != NULL, ("%s: lock NULL", __func__)); \ +} while (0) + +/* + * Initialize a condition variable. Must be called before use. + */ +void +cv_init(struct cv *cvp, const char *desc) +{ + + cvp->cv_description = desc; + cvp->cv_waiters = 0; +} + +/* + * Destroy a condition variable. The condition variable must be re-initialized + * in order to be re-used. + */ +void +cv_destroy(struct cv *cvp) +{ +#ifdef INVARIANTS + struct sleepqueue *sq; + + sleepq_lock(cvp); + sq = sleepq_lookup(cvp); + sleepq_release(cvp); + KASSERT(sq == NULL, ("%s: associated sleep queue non-empty", __func__)); +#endif +} + +/* + * Wait on a condition variable. The current thread is placed on the condition + * variable's wait queue and suspended. A cv_signal or cv_broadcast on the same + * condition variable will resume the thread. The mutex is released before + * sleeping and will be held on return. It is recommended that the mutex be + * held when cv_signal or cv_broadcast are called. + */ +void +_cv_wait(struct cv *cvp, struct lock_object *lock) +{ + WITNESS_SAVE_DECL(lock_witness); + struct lock_class *class; + struct thread *td; + int lock_state; + + td = curthread; + lock_state = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, lock, td); + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, + "Waiting on \"%s\"", cvp->cv_description); + class = LOCK_CLASS(lock); + + if (cold || panicstr) { + /* + * During autoconfiguration, just give interrupts + * a chance, then just return. Don't run any other + * thread or panic below, in case this is the idle + * process and already asleep. 
+ */ + return; + } + + sleepq_lock(cvp); + + cvp->cv_waiters++; + if (lock == &Giant.lock_object) + mtx_assert(&Giant, MA_OWNED); + DROP_GIANT(); + + sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); + if (lock != &Giant.lock_object) { + if (class->lc_flags & LC_SLEEPABLE) + sleepq_release(cvp); + WITNESS_SAVE(lock, lock_witness); + lock_state = class->lc_unlock(lock); + if (class->lc_flags & LC_SLEEPABLE) + sleepq_lock(cvp); + } + sleepq_wait(cvp, 0); + +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + if (lock != &Giant.lock_object) { + class->lc_lock(lock, lock_state); + WITNESS_RESTORE(lock, lock_witness); + } +} + +/* + * Wait on a condition variable. This function differs from cv_wait by + * not acquiring the mutex after condition variable was signaled. + */ +void +_cv_wait_unlock(struct cv *cvp, struct lock_object *lock) +{ + struct lock_class *class; + struct thread *td; + + td = curthread; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, lock, td); + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, + "Waiting on \"%s\"", cvp->cv_description); + KASSERT(lock != &Giant.lock_object, + ("cv_wait_unlock cannot be used with Giant")); + class = LOCK_CLASS(lock); + + if (cold || panicstr) { + /* + * During autoconfiguration, just give interrupts + * a chance, then just return. Don't run any other + * thread or panic below, in case this is the idle + * process and already asleep. + */ + class->lc_unlock(lock); + return; + } + + sleepq_lock(cvp); + + cvp->cv_waiters++; + DROP_GIANT(); + + sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); + if (class->lc_flags & LC_SLEEPABLE) + sleepq_release(cvp); + class->lc_unlock(lock); + if (class->lc_flags & LC_SLEEPABLE) + sleepq_lock(cvp); + sleepq_wait(cvp, 0); + +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); +} + +/* + * Wait on a condition variable, allowing interruption by signals.
Return 0 if + * the thread was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if + * a signal was caught. If ERESTART is returned the system call should be + * restarted if possible. + */ +int +_cv_wait_sig(struct cv *cvp, struct lock_object *lock) +{ + WITNESS_SAVE_DECL(lock_witness); + struct lock_class *class; + struct thread *td; + int lock_state, rval; + + td = curthread; + lock_state = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, lock, td); + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, + "Waiting on \"%s\"", cvp->cv_description); + class = LOCK_CLASS(lock); + + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, just give + * interrupts a chance, then just return; don't run any other + * procs or panic below, in case this is the idle process and + * already asleep. + */ + return (0); + } + + sleepq_lock(cvp); + + cvp->cv_waiters++; + if (lock == &Giant.lock_object) + mtx_assert(&Giant, MA_OWNED); + DROP_GIANT(); + + sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | + SLEEPQ_INTERRUPTIBLE, 0); + if (lock != &Giant.lock_object) { + if (class->lc_flags & LC_SLEEPABLE) + sleepq_release(cvp); + WITNESS_SAVE(lock, lock_witness); + lock_state = class->lc_unlock(lock); + if (class->lc_flags & LC_SLEEPABLE) + sleepq_lock(cvp); + } + rval = sleepq_wait_sig(cvp, 0); + +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + if (lock != &Giant.lock_object) { + class->lc_lock(lock, lock_state); + WITNESS_RESTORE(lock, lock_witness); + } + + return (rval); +} + +/* + * Wait on a condition variable for at most timo/hz seconds. Returns 0 if the + * process was resumed by cv_signal or cv_broadcast, EWOULDBLOCK if the timeout + * expires. 
+ */ +int +_cv_timedwait(struct cv *cvp, struct lock_object *lock, int timo) +{ + WITNESS_SAVE_DECL(lock_witness); + struct lock_class *class; + struct thread *td; + int lock_state, rval; + + td = curthread; + lock_state = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, lock, td); + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, + "Waiting on \"%s\"", cvp->cv_description); + class = LOCK_CLASS(lock); + + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, just give + * interrupts a chance, then just return; don't run any other + * thread or panic below, in case this is the idle process and + * already asleep. + */ + return 0; + } + + sleepq_lock(cvp); + + cvp->cv_waiters++; + if (lock == &Giant.lock_object) + mtx_assert(&Giant, MA_OWNED); + DROP_GIANT(); + + sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); + sleepq_set_timeout(cvp, timo); + if (lock != &Giant.lock_object) { + if (class->lc_flags & LC_SLEEPABLE) + sleepq_release(cvp); + WITNESS_SAVE(lock, lock_witness); + lock_state = class->lc_unlock(lock); + if (class->lc_flags & LC_SLEEPABLE) + sleepq_lock(cvp); + } + rval = sleepq_timedwait(cvp, 0); + +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + if (lock != &Giant.lock_object) { + class->lc_lock(lock, lock_state); + WITNESS_RESTORE(lock, lock_witness); + } + + return (rval); +} + +/* + * Wait on a condition variable for at most timo/hz seconds, allowing + * interruption by signals. Returns 0 if the thread was resumed by cv_signal + * or cv_broadcast, EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if + * a signal was caught. 
+ */ +int +_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, int timo) +{ + WITNESS_SAVE_DECL(lock_witness); + struct lock_class *class; + struct thread *td; + int lock_state, rval; + + td = curthread; + lock_state = 0; +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(1, 0); +#endif + CV_ASSERT(cvp, lock, td); + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, + "Waiting on \"%s\"", cvp->cv_description); + class = LOCK_CLASS(lock); + + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, just give + * interrupts a chance, then just return; don't run any other + * thread or panic below, in case this is the idle process and + * already asleep. + */ + return 0; + } + + sleepq_lock(cvp); + + cvp->cv_waiters++; + if (lock == &Giant.lock_object) + mtx_assert(&Giant, MA_OWNED); + DROP_GIANT(); + + sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | + SLEEPQ_INTERRUPTIBLE, 0); + sleepq_set_timeout(cvp, timo); + if (lock != &Giant.lock_object) { + if (class->lc_flags & LC_SLEEPABLE) + sleepq_release(cvp); + WITNESS_SAVE(lock, lock_witness); + lock_state = class->lc_unlock(lock); + if (class->lc_flags & LC_SLEEPABLE) + sleepq_lock(cvp); + } + rval = sleepq_timedwait_sig(cvp, 0); + +#ifdef KTRACE + if (KTRPOINT(td, KTR_CSW)) + ktrcsw(0, 0); +#endif + PICKUP_GIANT(); + if (lock != &Giant.lock_object) { + class->lc_lock(lock, lock_state); + WITNESS_RESTORE(lock, lock_witness); + } + + return (rval); +} + +/* + * Signal a condition variable, wakes up one waiting thread. Will also wakeup + * the swapper if the process is not in memory, so that it can bring the + * sleeping process in. Note that this may also result in additional threads + * being made runnable. Should be called with the same mutex as was passed to + * cv_wait held. 
+ */ +void +cv_signal(struct cv *cvp) +{ + int wakeup_swapper; + + wakeup_swapper = 0; + sleepq_lock(cvp); + if (cvp->cv_waiters > 0) { + cvp->cv_waiters--; + wakeup_swapper = sleepq_signal(cvp, SLEEPQ_CONDVAR, 0, 0); + } + sleepq_release(cvp); + if (wakeup_swapper) + kick_proc0(); +} + +/* + * Broadcast a signal to a condition variable. Wakes up all waiting threads. + * Should be called with the same mutex as was passed to cv_wait held. + */ +void +cv_broadcastpri(struct cv *cvp, int pri) +{ + int wakeup_swapper; + + /* + * XXX sleepq_broadcast pri argument changed from -1 meaning + * no pri to 0 meaning no pri. + */ + wakeup_swapper = 0; + if (pri == -1) + pri = 0; + sleepq_lock(cvp); + if (cvp->cv_waiters > 0) { + cvp->cv_waiters = 0; + wakeup_swapper = sleepq_broadcast(cvp, SLEEPQ_CONDVAR, pri, 0); + } + sleepq_release(cvp); + if (wakeup_swapper) + kick_proc0(); +} diff --git a/freebsd/kern/kern_descrip.c b/freebsd/kern/kern_descrip.c new file mode 100644 index 00000000..0ba063f5 --- /dev/null +++ b/freebsd/kern/kern_descrip.c @@ -0,0 +1,6912 @@ +#include <freebsd/machine/rtems-bsd-config.h> + +/*- + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 + */ + +#include <freebsd/sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <freebsd/local/opt_compat.h> +#include <freebsd/local/opt_ddb.h> +#include <freebsd/local/opt_ktrace.h> + +#include <freebsd/sys/param.h> +#include <freebsd/sys/systm.h> + +#include <freebsd/sys/conf.h> +#include <freebsd/sys/domain.h> +#include <freebsd/sys/fcntl.h> +#include <freebsd/sys/file.h> +#include <freebsd/sys/filedesc.h> +#include <freebsd/sys/filio.h> +#include <freebsd/sys/jail.h> +#include <freebsd/sys/kernel.h> +#include <freebsd/sys/limits.h> +#include <freebsd/sys/lock.h> +#include <freebsd/sys/malloc.h> +#include <freebsd/sys/mount.h> +#include <freebsd/sys/mqueue.h> +#include <freebsd/sys/mutex.h> +#include <freebsd/sys/namei.h> +#include <freebsd/sys/priv.h> +#include <freebsd/sys/proc.h> +#include <freebsd/sys/protosw.h> +#include <freebsd/sys/resourcevar.h> +#include <freebsd/sys/signalvar.h> +#include <freebsd/sys/socketvar.h> +#include <freebsd/sys/stat.h> +#include <freebsd/sys/sx.h> +#include <freebsd/sys/syscallsubr.h> +#include <freebsd/sys/sysctl.h> +#include <freebsd/sys/sysproto.h> +#include <freebsd/sys/tty.h> +#include <freebsd/sys/unistd.h> +#include <freebsd/sys/user.h> +#include <freebsd/sys/vnode.h> +#ifdef KTRACE +#include <freebsd/sys/ktrace.h> +#endif + +#include <freebsd/security/audit/audit.h> + +#include <freebsd/vm/uma.h> + +#include <freebsd/ddb/ddb.h> + +static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); +static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", + "file desc to leader structures"); +static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); + +static uma_zone_t file_zone; + + +/* Flags for do_dup() */ +#define DUP_FIXED 0x1 /* Force fixed allocation */ +#define DUP_FCNTL 0x2 /* fcntl()-style errors */ + +static int do_dup(struct thread *td, int flags, int old, int new, + register_t *retval); +static int fd_first_free(struct filedesc *, int, int); 
+static int fd_last_used(struct filedesc *, int, int); +static void fdgrowtable(struct filedesc *, int); +static void fdunused(struct filedesc *fdp, int fd); +static void fdused(struct filedesc *fdp, int fd); + +/* + * A process is initially started out with NDFILE descriptors stored within + * this structure, selected to be enough for typical applications based on + * the historical limit of 20 open files (and the usage of descriptors by + * shells). If these descriptors are exhausted, a larger descriptor table + * may be allocated, up to a process' resource limit; the internal arrays + * are then unused. + */ +#define NDFILE 20 +#define NDSLOTSIZE sizeof(NDSLOTTYPE) +#define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) +#define NDSLOT(x) ((x) / NDENTRIES) +#define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) +#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) + +/* + * Storage required per open file descriptor. + */ +#define OFILESIZE (sizeof(struct file *) + sizeof(char)) + +/* + * Storage to hold unused ofiles that need to be reclaimed. + */ +struct freetable { + struct file **ft_table; + SLIST_ENTRY(freetable) ft_next; +}; + +/* + * Basic allocation of descriptors: + * one of the above, plus arrays for NDFILE descriptors. + */ +struct filedesc0 { + struct filedesc fd_fd; + /* + * ofiles which need to be reclaimed on free. + */ + SLIST_HEAD(,freetable) fd_free; + /* + * These arrays are used when the number of open files is + * <= NDFILE, and are then pointed to by the pointers above. + */ + struct file *fd_dfiles[NDFILE]; + char fd_dfileflags[NDFILE]; + NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; +}; + +/* + * Descriptor management. + */ +volatile int openfiles; /* actual number of open files */ +struct mtx sigio_lock; /* mtx to protect pointers to sigio */ +#ifndef __rtems__ +void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); + +/* A mutex to protect the association between a proc and filedesc. 
*/
+static struct mtx	fdesc_mtx;
+
+/*
+ * Find the first zero bit in the given bitmap, starting at low and not
+ * exceeding size - 1.
+ *
+ * Returns low unchanged when low >= size, and size when no zero bit is
+ * found in the words examined.  Otherwise the result is computed with
+ * ffsl() from the first map word that has a clear bit.
+ */
+static int
+fd_first_free(struct filedesc *fdp, int low, int size)
+{
+	NDSLOTTYPE *map = fdp->fd_map;
+	NDSLOTTYPE mask;
+	int off, maxoff;
+
+	if (low >= size)
+		return (low);
+
+	off = NDSLOT(low);
+	if (low % NDENTRIES) {
+		/* low is not word-aligned: consider only bits at or above it. */
+		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
+		if ((mask &= ~map[off]) != 0UL)
+			return (off * NDENTRIES + ffsl(mask) - 1);
+		++off;
+	}
+	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
+		if (map[off] != ~0UL)	/* word has at least one clear bit */
+			return (off * NDENTRIES + ffsl(~map[off]) - 1);
+	return (size);
+}
+
+/*
+ * Find the highest non-zero bit in the given bitmap, starting at low and
+ * not exceeding size - 1.
+ *
+ * Returns -1 when low >= size, and low - 1 when no set bit is found in
+ * the words examined.  Otherwise the result is computed with flsl() from
+ * the highest map word that has a set bit.
+ */
+static int
+fd_last_used(struct filedesc *fdp, int low, int size)
+{
+	NDSLOTTYPE *map = fdp->fd_map;
+	NDSLOTTYPE mask;
+	int off, minoff;
+
+	if (low >= size)
+		return (-1);
+
+	off = NDSLOT(size);
+	if (size % NDENTRIES) {
+		/* size is not word-aligned: consider only bits below it. */
+		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
+		if ((mask &= map[off]) != 0)
+			return (off * NDENTRIES + flsl(mask) - 1);
+		--off;
+	}
+	for (minoff = NDSLOT(low); off >= minoff; --off)
+		if (map[off] != 0)
+			return (off * NDENTRIES + flsl(map[off]) - 1);
+	return (low - 1);
+}
+/*
+ * Report whether fd's bit is set in the descriptor bitmap (non-zero if
+ * set).  The KASSERT requires 0 <= fd < fd_nfiles.
+ */
+static int
+fdisused(struct filedesc *fdp, int fd)
+{
+        KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+            ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
+	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
+}
+
+/*
+ * Mark a file descriptor as used.
+ *
+ * Asserts that the filedesc lock is held exclusively and that fd is not
+ * already marked.  Raises fd_lastfile if fd is above it, and when fd was
+ * the fd_freefile hint, recomputes the hint by scanning upward from fd.
+ */
+static void
+fdused(struct filedesc *fdp, int fd)
+{
+
+	FILEDESC_XLOCK_ASSERT(fdp);
+	KASSERT(!fdisused(fdp, fd),
+	    ("fd already used"));
+
+	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
+	if (fd > fdp->fd_lastfile)
+		fdp->fd_lastfile = fd;
+	if (fd == fdp->fd_freefile)
+		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
+}
+
+/*
+ * Mark a file descriptor as unused.
+ */
+static void
+fdunused(struct filedesc *fdp, int fd)
+{
+
+	FILEDESC_XLOCK_ASSERT(fdp);
+	KASSERT(fdisused(fdp, fd),
+	    ("fd is already unused"));
+	KASSERT(fdp->fd_ofiles[fd] == NULL,
+	    ("fd is still in use"));
+
+	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
+	if (fd < fdp->fd_freefile)	/* lower the first-free hint */
+		fdp->fd_freefile = fd;
+	if (fd == fdp->fd_lastfile)	/* highest fd released: rescan below it */
+		fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
+}
+
+/*
+ * System calls on descriptors.
+ *
+ * getdtablesize() stores the smaller of lim_cur(p, RLIMIT_NOFILE) and
+ * maxfilesperproc in td_retval[0] and always returns 0.
+ */
+#ifndef _SYS_SYSPROTO_HH_
+struct getdtablesize_args {
+	int	dummy;
+};
+#endif
+/* ARGSUSED */
+int
+getdtablesize(struct thread *td, struct getdtablesize_args *uap)
+{
+	struct proc *p = td->td_proc;
+
+	PROC_LOCK(p);
+	td->td_retval[0] =
+	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+	PROC_UNLOCK(p);
+	return (0);
+}
+
+/*
+ * Duplicate a file descriptor to a particular value.
+ *
+ * Note: keep in mind that a potential race condition exists when closing
+ * descriptors from a shared descriptor table (via rfork).
+ *
+ * Thin wrapper: passes uap->from and uap->to to do_dup() with DUP_FIXED.
+ */
+#ifndef _SYS_SYSPROTO_HH_
+struct dup2_args {
+	u_int	from;
+	u_int	to;
+};
+#endif
+/* ARGSUSED */
+int
+dup2(struct thread *td, struct dup2_args *uap)
+{
+
+	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
+		    td->td_retval));
+}
+
+/*
+ * Duplicate a file descriptor.
+ *
+ * Thin wrapper: calls do_dup() with no flags and a requested minimum
+ * descriptor of 0.
+ */
+#ifndef _SYS_SYSPROTO_HH_
+struct dup_args {
+	u_int	fd;
+};
+#endif
+/* ARGSUSED */
+int
+dup(struct thread *td, struct dup_args *uap)
+{
+
+	return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
+}
+
+/*
+ * The file control system call.
+ *
+ * For the lock commands the lock structure is copied in from uap->arg
+ * before kern_fcntl() is called, and copied back out afterwards for
+ * F_GETLK and F_OGETLK.  The F_O* commands use struct oflock and are
+ * translated to the corresponding F_* command.  All other commands pass
+ * uap->arg through unchanged.
+ */
+#ifndef _SYS_SYSPROTO_HH_
+struct fcntl_args {
+	int	fd;
+	int	cmd;
+	long	arg;
+};
+#endif
+/* ARGSUSED */
+int
+fcntl(struct thread *td, struct fcntl_args *uap)
+{
+	struct flock fl;
+	struct oflock ofl;
+	intptr_t arg;
+	int error;
+	int cmd;
+
+	error = 0;
+	cmd = uap->cmd;
+	switch (uap->cmd) {
+	case F_OGETLK:
+	case F_OSETLK:
+	case F_OSETLKW:
+		/*
+		 * Convert old flock structure to new.
+		 * A copyin() failure is only acted on after the switch;
+		 * fl is not used before that check.
+		 */
+		error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
+		fl.l_start = ofl.l_start;
+		fl.l_len = ofl.l_len;
+		fl.l_pid = ofl.l_pid;
+		fl.l_type = ofl.l_type;
+		fl.l_whence = ofl.l_whence;
+		fl.l_sysid = 0;
+
+		switch (uap->cmd) {
+		case F_OGETLK:
+		    cmd = F_GETLK;
+		    break;
+		case F_OSETLK:
+		    cmd = F_SETLK;
+		    break;
+		case F_OSETLKW:
+		    cmd = F_SETLKW;
+		    break;
+		}
+		arg = (intptr_t)&fl;
+		break;
+        case F_GETLK:
+        case F_SETLK:
+        case F_SETLKW:
+	case F_SETLK_REMOTE:
+                error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
+                arg = (intptr_t)&fl;
+                break;
+	default:
+		arg = uap->arg;
+		break;
+	}
+	if (error)
+		return (error);
+	error = kern_fcntl(td, uap->fd, cmd, arg);
+	if (error)
+		return (error);
+	if (uap->cmd == F_OGETLK) {
+		ofl.l_start = fl.l_start;
+		ofl.l_len = fl.l_len;
+		ofl.l_pid = fl.l_pid;
+		ofl.l_type = fl.l_type;
+		ofl.l_whence = fl.l_whence;
+		error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
+	} else if (uap->cmd == F_GETLK) {
+		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
+	}
+	return (error);
+}
+/*
+ * Map a descriptor number to its struct file, or NULL when fd is outside
+ * the table or the slot is empty.  Asserts that the filedesc lock is held.
+ * The unsigned cast makes a negative fd fail the range check.
+ */
+static inline struct file *
+fdtofp(int fd, struct filedesc *fdp)
+{
+	struct file *fp;
+
+	FILEDESC_LOCK_ASSERT(fdp);
+	if ((unsigned)fd >= fdp->fd_nfiles ||
+	    (fp = fdp->fd_ofiles[fd]) == NULL)
+		return (NULL);
+	return (fp);
+}
+/*
+ * Perform fcntl command cmd on descriptor fd of td's process.
+ *
+ * arg is either an integer or, for the F_GETLK/F_SETLK family, a pointer
+ * to a struct flock that the code dereferences directly (fcntl() above
+ * passes the address of its own copied-in structure).  Returns 0 or an
+ * errno value; commands that produce a value store it in td_retval[0].
+ * Unknown commands return EINVAL.
+ *
+ * Commands that may block take a reference with fhold() and drop the
+ * filedesc lock before calling into the file or vnode layer.
+ */
+int
+kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
+{
+	struct filedesc *fdp;
+	struct flock *flp;
+	struct file *fp;
+	struct proc *p;
+	char *pop;
+	struct vnode *vp;
+	int error, flg, tmp;
+	int vfslocked;
+	u_int old, new;
+	uint64_t bsize;
+
+	vfslocked = 0;
+	error = 0;
+	flg = F_POSIX;
+	p = td->td_proc;
+	fdp = p->p_fd;
+
+	switch (cmd) {
+	case F_DUPFD:
+		tmp = arg;
+		error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
+		break;
+
+	case F_DUP2FD:
+		tmp = arg;
+		error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
+		break;
+
+	case F_GETFD:	/* report FD_CLOEXEC from the per-descriptor flags */
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		pop = &fdp->fd_ofileflags[fd];
+		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
+		FILEDESC_SUNLOCK(fdp);
+		break;
+
+	case F_SETFD:	/* set or clear UF_EXCLOSE according to FD_CLOEXEC in arg */
+		FILEDESC_XLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_XUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		pop = &fdp->fd_ofileflags[fd];
+		*pop = (*pop &~ UF_EXCLOSE) |
+		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
+		FILEDESC_XUNLOCK(fdp);
+		break;
+
+	case F_GETFL:
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		td->td_retval[0] = OFLAGS(fp->f_flag);
+		FILEDESC_SUNLOCK(fdp);
+		break;
+
+	case F_SETFL:
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		fhold(fp);
+		FILEDESC_SUNLOCK(fdp);
+		do {	/* compare-and-set loop: replace only the FCNTLFLAGS bits */
+			tmp = flg = fp->f_flag;
+			tmp &= ~FCNTLFLAGS;
+			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
+		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
+		tmp = fp->f_flag & FNONBLOCK;
+		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
+		if (error) {
+			fdrop(fp, td);
+			break;
+		}
+		tmp = fp->f_flag & FASYNC;
+		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
+		if (error == 0) {
+			fdrop(fp, td);
+			break;
+		}
+		/* FIOASYNC failed: clear FNONBLOCK and tell the file so. */
+		atomic_clear_int(&fp->f_flag, FNONBLOCK);
+		tmp = 0;
+		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
+		fdrop(fp, td);
+		break;
+
+	case F_GETOWN:
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		fhold(fp);
+		FILEDESC_SUNLOCK(fdp);
+		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
+		if (error == 0)
+			td->td_retval[0] = tmp;
+		fdrop(fp, td);
+		break;
+
+	case F_SETOWN:
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		fhold(fp);
+		FILEDESC_SUNLOCK(fdp);
+		tmp = arg;
+		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
+		fdrop(fp, td);
+		break;
+
+	case F_SETLK_REMOTE:
+		error = priv_check(td, PRIV_NFS_LOCKD);
+		if (error)
+			return (error);
+		flg = F_REMOTE;
+		goto do_setlk;
+
+	case F_SETLKW:
+		flg |= F_WAIT;
+		/* FALLTHROUGH F_SETLK */
+
+	case F_SETLK:
+	do_setlk:
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		if (fp->f_type != DTYPE_VNODE) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		flp = (struct flock *)arg;
+		if (flp->l_whence == SEEK_CUR) {
+			/* Reject offsets whose sum would exceed OFF_MAX. */
+			if (fp->f_offset < 0 ||
+			    (flp->l_start > 0 &&
+			     fp->f_offset > OFF_MAX - flp->l_start)) {
+				FILEDESC_SUNLOCK(fdp);
+				error = EOVERFLOW;
+				break;
+			}
+			flp->l_start += fp->f_offset;
+		}
+
+		/*
+		 * VOP_ADVLOCK() may block.
+		 */
+		fhold(fp);
+		FILEDESC_SUNLOCK(fdp);
+		vp = fp->f_vnode;
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		switch (flp->l_type) {
+		case F_RDLCK:
+			if ((fp->f_flag & FREAD) == 0) {
+				error = EBADF;
+				break;
+			}
+			PROC_LOCK(p->p_leader);
+			p->p_leader->p_flag |= P_ADVLOCK;
+			PROC_UNLOCK(p->p_leader);
+			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
+			    flp, flg);
+			break;
+		case F_WRLCK:
+			if ((fp->f_flag & FWRITE) == 0) {
+				error = EBADF;
+				break;
+			}
+			PROC_LOCK(p->p_leader);
+			p->p_leader->p_flag |= P_ADVLOCK;
+			PROC_UNLOCK(p->p_leader);
+			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
+			    flp, flg);
+			break;
+		case F_UNLCK:
+			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
+			    flp, flg);
+			break;
+		case F_UNLCKSYS:
+			/*
+			 * Temporary api for testing remote lock
+			 * infrastructure.
+			 */
+			if (flg != F_REMOTE) {
+				error = EINVAL;
+				break;
+			}
+			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
+			    F_UNLCKSYS, flp, flg);
+			break;
+		default:
+			error = EINVAL;
+			break;
+		}
+		VFS_UNLOCK_GIANT(vfslocked);
+		vfslocked = 0;
+		/*
+		 * Check for race with close: if fd no longer refers to fp,
+		 * issue an unlock covering the whole file (l_start 0,
+		 * l_len 0) on the vnode.
+		 */
+		FILEDESC_SLOCK(fdp);
+		if ((unsigned) fd >= fdp->fd_nfiles ||
+		    fp != fdp->fd_ofiles[fd]) {
+			FILEDESC_SUNLOCK(fdp);
+			flp->l_whence = SEEK_SET;
+			flp->l_start = 0;
+			flp->l_len = 0;
+			flp->l_type = F_UNLCK;
+			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
+					   F_UNLCK, flp, F_POSIX);
+			VFS_UNLOCK_GIANT(vfslocked);
+			vfslocked = 0;
+		} else
+			FILEDESC_SUNLOCK(fdp);
+		fdrop(fp, td);
+		break;
+
+	case F_GETLK:
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		if (fp->f_type != DTYPE_VNODE) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		flp = (struct flock *)arg;
+		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
+		    flp->l_type != F_UNLCK) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EINVAL;
+			break;
+		}
+		if (flp->l_whence == SEEK_CUR) {
+			/* Reject sums outside the OFF_MIN..OFF_MAX range. */
+			if ((flp->l_start > 0 &&
+			    fp->f_offset > OFF_MAX - flp->l_start) ||
+			    (flp->l_start < 0 &&
+			     fp->f_offset < OFF_MIN - flp->l_start)) {
+				FILEDESC_SUNLOCK(fdp);
+				error = EOVERFLOW;
+				break;
+			}
+			flp->l_start += fp->f_offset;
+		}
+		/*
+		 * VOP_ADVLOCK() may block.
+		 */
+		fhold(fp);
+		FILEDESC_SUNLOCK(fdp);
+		vp = fp->f_vnode;
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
+		    F_POSIX);
+		VFS_UNLOCK_GIANT(vfslocked);
+		vfslocked = 0;
+		fdrop(fp, td);
+		break;
+
+	case F_RDAHEAD:	/* boolean form: non-zero arg becomes 128 KiB */
+		arg = arg ? 128 * 1024: 0;
+		/* FALLTHROUGH */
+	case F_READAHEAD:
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		if (fp->f_type != DTYPE_VNODE) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		fhold(fp);
+		FILEDESC_SUNLOCK(fdp);
+		if (arg != 0) {
+			vp = fp->f_vnode;
+			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+			error = vn_lock(vp, LK_SHARED);
+			if (error != 0)
+				goto readahead_vnlock_fail;
+			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
+			VOP_UNLOCK(vp, 0);
+			fp->f_seqcount = (arg + bsize - 1) / bsize;	/* arg in units of bsize, rounded up */
+			do {
+				new = old = fp->f_flag;
+				new |= FRDAHEAD;
+			} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
+readahead_vnlock_fail:
+			VFS_UNLOCK_GIANT(vfslocked);
+			vfslocked = 0;
+		} else {
+			do {
+				new = old = fp->f_flag;
+				new &= ~FRDAHEAD;
+			} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
+		}
+		fdrop(fp, td);
+		break;
+
+	default:
+		error = EINVAL;
+		break;
+	}
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+/*
+ * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
+ *
+ * With DUP_FIXED the duplicate is placed exactly at new, replacing and
+ * closing whatever file was there; without it, new is the minimum
+ * descriptor passed to fdalloc().  DUP_FCNTL only selects which errno is
+ * returned for an out-of-range new.  On success the resulting descriptor
+ * number is stored in *retval and 0 is returned.
+ */
+static int
+do_dup(struct thread *td, int flags, int old, int new,
+    register_t *retval)
+{
+	struct filedesc *fdp;
+	struct proc *p;
+	struct file *fp;
+	struct file *delfp;
+	int error, holdleaders, maxfd;
+
+	p = td->td_proc;
+	fdp = p->p_fd;
+
+	/*
+	 * Verify we have a valid descriptor to dup from and possibly to
+	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
+	 * return EINVAL when the new descriptor is out of bounds.
+	 */
+	if (old < 0)
+		return (EBADF);
+	if (new < 0)
+		return (flags & DUP_FCNTL ? EINVAL : EBADF);
+	PROC_LOCK(p);
+	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+	PROC_UNLOCK(p);
+	if (new >= maxfd)
+		return (flags & DUP_FCNTL ? EINVAL : EMFILE);
+
+	FILEDESC_XLOCK(fdp);
+	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
+		FILEDESC_XUNLOCK(fdp);
+		return (EBADF);
+	}
+	if (flags & DUP_FIXED && old == new) {
+		*retval = new;
+		FILEDESC_XUNLOCK(fdp);
+		return (0);
+	}
+	fp = fdp->fd_ofiles[old];
+	fhold(fp);
+
+	/*
+	 * If the caller specified a file descriptor, make sure the file
+	 * table is large enough to hold it, and grab it.  Otherwise, just
+	 * allocate a new descriptor the usual way.  Since the filedesc
+	 * lock may be temporarily dropped in the process, we have to look
+	 * out for a race.
+	 */
+	if (flags & DUP_FIXED) {
+		if (new >= fdp->fd_nfiles)
+			fdgrowtable(fdp, new + 1);
+		if (fdp->fd_ofiles[new] == NULL)
+			fdused(fdp, new);
+	} else {
+		if ((error = fdalloc(td, new, &new)) != 0) {
+			FILEDESC_XUNLOCK(fdp);
+			fdrop(fp, td);
+			return (error);
+		}
+	}
+
+	/*
+	 * If the old file changed out from under us then treat it as a
+	 * bad file descriptor.  Userland should do its own locking to
+	 * avoid this case.
+	 */
+	if (fdp->fd_ofiles[old] != fp) {
+		/* we've allocated a descriptor which we won't use */
+		if (fdp->fd_ofiles[new] == NULL)
+			fdunused(fdp, new);
+		FILEDESC_XUNLOCK(fdp);
+		fdrop(fp, td);
+		return (EBADF);
+	}
+	KASSERT(old != new,
+	    ("new fd is same as old"));
+
+	/*
+	 * Save info on the descriptor being overwritten.  We cannot close
+	 * it without introducing an ownership race for the slot, since we
+	 * need to drop the filedesc lock to call closef().
+	 *
+	 * XXX this duplicates parts of close().
+	 */
+	delfp = fdp->fd_ofiles[new];
+	holdleaders = 0;
+	if (delfp != NULL) {
+		if (td->td_proc->p_fdtol != NULL) {
+			/*
+			 * Ask fdfree() to sleep to ensure that all relevant
+			 * process leaders can be traversed in closef().
+			 */
+			fdp->fd_holdleaderscount++;
+			holdleaders = 1;
+		}
+	}
+
+	/*
+	 * Duplicate the source descriptor
+	 * (the copy does not inherit UF_EXCLOSE).
+	 */
+	fdp->fd_ofiles[new] = fp;
+	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
+	if (new > fdp->fd_lastfile)
+		fdp->fd_lastfile = new;
+	*retval = new;
+
+	/*
+	 * If we dup'd over a valid file, we now own the reference to it
+	 * and must dispose of it using closef() semantics (as if a
+	 * close() were performed on it).
+	 *
+	 * XXX this duplicates parts of close().
+	 */
+	if (delfp != NULL) {
+		knote_fdclose(td, new);
+		if (delfp->f_type == DTYPE_MQUEUE)
+			mq_fdclose(td, new, delfp);
+		FILEDESC_XUNLOCK(fdp);
+		(void) closef(delfp, td);
+		if (holdleaders) {
+			FILEDESC_XLOCK(fdp);
+			fdp->fd_holdleaderscount--;
+			if (fdp->fd_holdleaderscount == 0 &&
+			    fdp->fd_holdleaderswakeup != 0) {
+				fdp->fd_holdleaderswakeup = 0;
+				wakeup(&fdp->fd_holdleaderscount);
+			}
+			FILEDESC_XUNLOCK(fdp);
+		}
+	} else {
+		FILEDESC_XUNLOCK(fdp);
+	}
+	return (0);
+}
+
+/*
+ * If sigio is on the list associated with a process or process group,
+ * disable signalling from the device, remove sigio from the list and
+ * free sigio.
+ *
+ * A NULL *sigiop is a no-op.  A negative sio_pgid selects the process
+ * group list, anything else the process list.  The back-pointer
+ * *sio_myref is cleared under SIGIO_LOCK before the structure is freed.
+ */
+void
+funsetown(struct sigio **sigiop)
+{
+	struct sigio *sigio;
+
+	SIGIO_LOCK();
+	sigio = *sigiop;
+	if (sigio == NULL) {
+		SIGIO_UNLOCK();
+		return;
+	}
+	*(sigio->sio_myref) = NULL;
+	if ((sigio)->sio_pgid < 0) {
+		struct pgrp *pg = (sigio)->sio_pgrp;
+		PGRP_LOCK(pg);
+		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
+			     sigio, sio_pgsigio);
+		PGRP_UNLOCK(pg);
+	} else {
+		struct proc *p = (sigio)->sio_proc;
+		PROC_LOCK(p);
+		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
+			     sigio, sio_pgsigio);
+		PROC_UNLOCK(p);
+	}
+	SIGIO_UNLOCK();
+	crfree(sigio->sio_ucred);
+	free(sigio, M_SIGIO);
+}
+
+/*
+ * Free a list of sigio structures.
+ * We only need to lock the SIGIO_LOCK because we have made ourselves
+ * inaccessible to callers of fsetown and therefore do not need to lock
+ * the proc or pgrp struct for the list manipulation.
+ */
+void
+funsetownlst(struct sigiolst *sigiolst)
+{
+	struct proc *p;
+	struct pgrp *pg;
+	struct sigio *sigio;
+
+	sigio = SLIST_FIRST(sigiolst);
+	if (sigio == NULL)
+		return;
+	p = NULL;
+	pg = NULL;
+
+	/*
+	 * Every entry of the list should belong
+	 * to a single proc or pgrp.
+	 * The owner is taken from the first entry and the KASSERTs in
+	 * the loop check every later entry against it.
+	 */
+	if (sigio->sio_pgid < 0) {
+		pg = sigio->sio_pgrp;
+		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
+	} else /* if (sigio->sio_pgid > 0) */ {
+		p = sigio->sio_proc;
+		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+	}
+
+	SIGIO_LOCK();
+	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
+		*(sigio->sio_myref) = NULL;
+		if (pg != NULL) {
+			KASSERT(sigio->sio_pgid < 0,
+			    ("Proc sigio in pgrp sigio list"));
+			KASSERT(sigio->sio_pgrp == pg,
+			    ("Bogus pgrp in sigio list"));
+			PGRP_LOCK(pg);
+			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
+			    sio_pgsigio);
+			PGRP_UNLOCK(pg);
+		} else /* if (p != NULL) */ {
+			KASSERT(sigio->sio_pgid > 0,
+			    ("Pgrp sigio in proc sigio list"));
+			KASSERT(sigio->sio_proc == p,
+			    ("Bogus proc in sigio list"));
+			PROC_LOCK(p);
+			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
+			    sio_pgsigio);
+			PROC_UNLOCK(p);
+		}
+		SIGIO_UNLOCK();	/* SIGIO_LOCK is released around crfree()/free() */
+		crfree(sigio->sio_ucred);
+		free(sigio, M_SIGIO);
+		SIGIO_LOCK();
+	}
+	SIGIO_UNLOCK();
+}
+
+/*
+ * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
+ *
+ * After permission checking, add a sigio structure to the sigio list for
+ * the process or process group.
+ *
+ * pgid == 0 only removes the current owner.  pgid > 0 names a process and
+ * pgid < 0 names process group -pgid.  Returns 0, ESRCH when the target is
+ * not found (or the process has P_WEXIT set), or EPERM when the target is
+ * in a different session from the caller.  On failure the newly allocated
+ * sigio and its credential reference are released.
+ */
+int
+fsetown(pid_t pgid, struct sigio **sigiop)
+{
+	struct proc *proc;
+	struct pgrp *pgrp;
+	struct sigio *sigio;
+	int ret;
+
+	if (pgid == 0) {
+		funsetown(sigiop);
+		return (0);
+	}
+
+	ret = 0;
+
+	/* Allocate and fill in the new sigio out of locks. */
+	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
+	sigio->sio_pgid = pgid;
+	sigio->sio_ucred = crhold(curthread->td_ucred);
+	sigio->sio_myref = sigiop;
+
+	sx_slock(&proctree_lock);
+	if (pgid > 0) {
+		proc = pfind(pgid);
+		if (proc == NULL) {
+			ret = ESRCH;
+			goto fail;
+		}
+
+		/*
+		 * Policy - Don't allow a process to FSETOWN a process
+		 * in another session.
+		 *
+		 * Remove this test to allow maximum flexibility or
+		 * restrict FSETOWN to the current process or process
+		 * group for maximum safety.
+		 */
+		PROC_UNLOCK(proc);
+		if (proc->p_session != curthread->td_proc->p_session) {
+			ret = EPERM;
+			goto fail;
+		}
+
+		pgrp = NULL;
+	} else /* if (pgid < 0) */ {
+		pgrp = pgfind(-pgid);
+		if (pgrp == NULL) {
+			ret = ESRCH;
+			goto fail;
+		}
+		PGRP_UNLOCK(pgrp);
+
+		/*
+		 * Policy - Don't allow a process to FSETOWN a process
+		 * in another session.
+		 *
+		 * Remove this test to allow maximum flexibility or
+		 * restrict FSETOWN to the current process or process
+		 * group for maximum safety.
+		 */
+		if (pgrp->pg_session != curthread->td_proc->p_session) {
+			ret = EPERM;
+			goto fail;
+		}
+
+		proc = NULL;
+	}
+	funsetown(sigiop);	/* drop any previous owner before linking the new one */
+	if (pgid > 0) {
+		PROC_LOCK(proc);
+		/*
+		 * Since funsetownlst() is called without the proctree
+		 * locked, we need to check for P_WEXIT.
+		 * XXX: is ESRCH correct?
+		 */
+		if ((proc->p_flag & P_WEXIT) != 0) {
+			PROC_UNLOCK(proc);
+			ret = ESRCH;
+			goto fail;
+		}
+		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
+		sigio->sio_proc = proc;
+		PROC_UNLOCK(proc);
+	} else {
+		PGRP_LOCK(pgrp);
+		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
+		sigio->sio_pgrp = pgrp;
+		PGRP_UNLOCK(pgrp);
+	}
+	sx_sunlock(&proctree_lock);
+	SIGIO_LOCK();
+	*sigiop = sigio;
+	SIGIO_UNLOCK();
+	return (0);
+
+fail:
+	sx_sunlock(&proctree_lock);
+	crfree(sigio->sio_ucred);
+	free(sigio, M_SIGIO);
+	return (ret);
+}
+
+/*
+ * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
+ */
+pid_t
+fgetown(sigiop)
+	struct sigio **sigiop;
+{
+	pid_t pgid;
+
+	SIGIO_LOCK();
+	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;	/* 0 when no owner is set */
+	SIGIO_UNLOCK();
+	return (pgid);
+}
+
+/*
+ * Close a file descriptor.
+ *
+ * Thin wrapper around kern_close().
+ */
+#ifndef _SYS_SYSPROTO_HH_
+struct close_args {
+	int     fd;
+};
+#endif
+/* ARGSUSED */
+int
+close(td, uap)
+	struct thread *td;
+	struct close_args *uap;
+{
+
+	return (kern_close(td, uap->fd));
+}
+/*
+ * Close descriptor fd of td's process.
+ *
+ * Returns EBADF when fd is out of range or not open; otherwise clears the
+ * table slot, marks the descriptor unused and returns the result of
+ * closef() on the file.  The unsigned cast makes a negative fd fail the
+ * range check.
+ */
+int
+kern_close(td, fd)
+	struct thread *td;
+	int fd;
+{
+	struct filedesc *fdp;
+	struct file *fp;
+	int error;
+	int holdleaders;
+
+	error = 0;
+	holdleaders = 0;
+	fdp = td->td_proc->p_fd;
+
+	AUDIT_SYSCLOSE(td, fd);
+
+	FILEDESC_XLOCK(fdp);
+	if ((unsigned)fd >= fdp->fd_nfiles ||
+	    (fp = fdp->fd_ofiles[fd]) == NULL) {
+		FILEDESC_XUNLOCK(fdp);
+		return (EBADF);
+	}
+	fdp->fd_ofiles[fd] = NULL;
+	fdp->fd_ofileflags[fd] = 0;
+	fdunused(fdp, fd);
+	if (td->td_proc->p_fdtol != NULL) {
+		/*
+		 * Ask fdfree() to sleep to ensure that all relevant
+		 * process leaders can be traversed in closef().
+		 */
+		fdp->fd_holdleaderscount++;
+		holdleaders = 1;
+	}
+
+	/*
+	 * We now hold the fp reference that used to be owned by the
+	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
+	 * knote_fdclose to prevent a race of the fd getting opened, a knote
+	 * added, and deleting a knote for the new fd.
+	 */
+	knote_fdclose(td, fd);
+	if (fp->f_type == DTYPE_MQUEUE)
+		mq_fdclose(td, fd, fp);
+	FILEDESC_XUNLOCK(fdp);
+
+	error = closef(fp, td);
+	if (holdleaders) {
+		/* Undo the hold taken above and wake a waiter if one asked. */
+		FILEDESC_XLOCK(fdp);
+		fdp->fd_holdleaderscount--;
+		if (fdp->fd_holdleaderscount == 0 &&
+		    fdp->fd_holdleaderswakeup != 0) {
+			fdp->fd_holdleaderswakeup = 0;
+			wakeup(&fdp->fd_holdleaderscount);
+		}
+		FILEDESC_XUNLOCK(fdp);
+	}
+	return (error);
+}
+
+/*
+ * Close open file descriptors.
+ */
+#ifndef _SYS_SYSPROTO_HH_
+struct closefrom_args {
+	int	lowfd;
+};
+#endif
+/* ARGSUSED */
+int
+closefrom(struct thread *td, struct closefrom_args *uap)
+{
+	struct filedesc *fdp;
+	int fd;
+
+	fdp = td->td_proc->p_fd;
+	AUDIT_ARG_FD(uap->lowfd);
+
+	/*
+	 * Treat negative starting file descriptor values identical to
+	 * closefrom(0) which closes all files.
+	 */
+	if (uap->lowfd < 0)
+		uap->lowfd = 0;
+	FILEDESC_SLOCK(fdp);
+	for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
+		if (fdp->fd_ofiles[fd] != NULL) {
+			/* kern_close() takes the lock itself and its result is ignored. */
+			FILEDESC_SUNLOCK(fdp);
+			(void)kern_close(td, fd);
+			FILEDESC_SLOCK(fdp);
+		}
+	}
+	FILEDESC_SUNLOCK(fdp);
+	return (0);
+}
+
+#if defined(COMPAT_43)
+/*
+ * Return status information about a file descriptor.
+ *
+ * Old-format variant: the struct stat from kern_fstat() is converted with
+ * cvtstat() and copied out as a struct ostat.
+ */
+#ifndef _SYS_SYSPROTO_HH_
+struct ofstat_args {
+	int	fd;
+	struct	ostat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+ofstat(struct thread *td, struct ofstat_args *uap)
+{
+	struct ostat oub;
+	struct stat ub;
+	int error;
+
+	error = kern_fstat(td, uap->fd, &ub);
+	if (error == 0) {
+		cvtstat(&ub, &oub);
+		error = copyout(&oub, uap->sb, sizeof(oub));
+	}
+	return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Return status information about a file descriptor.
+ *
+ * Calls kern_fstat() into a local struct stat and copies it out to
+ * uap->sb on success.
+ */
+#ifndef _SYS_SYSPROTO_HH_
+struct fstat_args {
+	int	fd;
+	struct	stat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+fstat(struct thread *td, struct fstat_args *uap)
+{
+	struct stat ub;
+	int error;
+
+	error = kern_fstat(td, uap->fd, &ub);
+	if (error == 0)
+		error = copyout(&ub, uap->sb, sizeof(ub));
+	return (error);
+}
+/*
+ * Fill *sbp with status for descriptor fd via the file's fo_stat method.
+ * Returns the fget() error if the descriptor cannot be looked up,
+ * otherwise the fo_stat() result.  The reference taken by fget() is
+ * dropped before returning.
+ */
+int
+kern_fstat(struct thread *td, int fd, struct stat *sbp)
+{
+	struct file *fp;
+	int error;
+
+	AUDIT_ARG_FD(fd);
+
+	if ((error = fget(td, fd, &fp)) != 0)
+		return (error);
+
+	AUDIT_ARG_FILE(td->td_proc, fp);
+
+	error = fo_stat(fp, sbp, td->td_ucred, td);
+	fdrop(fp, td);
+#ifdef KTRACE
+	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
+		ktrstat(sbp);
+#endif
+	return (error);
+}
+
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_HH_
+struct nfstat_args {
+	int	fd;
+	struct	nstat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+nfstat(struct thread *td, struct nfstat_args *uap)
+{
+	struct nstat nub;
+	struct stat ub;
+	int error;
+
+	error = kern_fstat(td, uap->fd, &ub);
+	if (error == 0) {
+		cvtnstat(&ub, &nub);	/* convert to the struct nstat layout */
+		error = copyout(&nub, uap->sb, sizeof(nub));
+	}
+	return (error);
+}
+
+/*
+ * Return pathconf information about a file descriptor.
+ *
+ * _PC_ASYNC_IO is answered from async_io_version for any descriptor.
+ * Files with a vnode are passed to VOP_PATHCONF().  Pipes and sockets
+ * answer only _PC_PIPE_BUF (EINVAL otherwise); any other file type
+ * returns EOPNOTSUPP.
+ */
+#ifndef _SYS_SYSPROTO_HH_
+struct fpathconf_args {
+	int	fd;
+	int	name;
+};
+#endif
+/* ARGSUSED */
+int
+fpathconf(struct thread *td, struct fpathconf_args *uap)
+{
+	struct file *fp;
+	struct vnode *vp;
+	int error;
+
+	if ((error = fget(td, uap->fd, &fp)) != 0)
+		return (error);
+
+	/* If asynchronous I/O is available, it works for all descriptors. */
+	if (uap->name == _PC_ASYNC_IO) {
+		td->td_retval[0] = async_io_version;
+		goto out;
+	}
+	vp = fp->f_vnode;
+	if (vp != NULL) {
+		int vfslocked;
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		vn_lock(vp, LK_SHARED | LK_RETRY);
+		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
+		VOP_UNLOCK(vp, 0);
+		VFS_UNLOCK_GIANT(vfslocked);
+	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
+		if (uap->name != _PC_PIPE_BUF) {
+			error = EINVAL;
+		} else {
+			td->td_retval[0] = PIPE_BUF;
+			error = 0;
+		}
+	} else {
+		error = EOPNOTSUPP;
+	}
+out:
+	fdrop(fp, td);
+	return (error);
+}
+
+/*
+ * Grow the file table to accommodate (at least) nfd descriptors.  This may
+ * block and drop the filedesc lock, but it will reacquire it before
+ * returning.
+ *
+ * The new size is nfd rounded up to a whole number of bitmap words.  One
+ * allocation holds the file pointer array, the per-descriptor flag bytes
+ * and trailing room for a struct freetable; a new bitmap is allocated
+ * only when more bitmap words are needed.
+ */
+static void
+fdgrowtable(struct filedesc *fdp, int nfd)
+{
+	struct filedesc0 *fdp0;
+	struct freetable *fo;
+	struct file **ntable;
+	struct file **otable;
+	char *nfileflags;
+	int nnfiles, onfiles;
+	NDSLOTTYPE *nmap;
+
+	FILEDESC_XLOCK_ASSERT(fdp);
+
+	KASSERT(fdp->fd_nfiles > 0,
+	    ("zero-length file table"));
+
+	/* compute the size of the new table */
+	onfiles = fdp->fd_nfiles;
+	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
+	if (nnfiles <= onfiles)
+		/* the table is already large enough */
+		return;
+
+	/* allocate a new table and (if required) new bitmaps */
+	FILEDESC_XUNLOCK(fdp);
+	ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable),
+	    M_FILEDESC, M_ZERO | M_WAITOK);
+	nfileflags = (char *)&ntable[nnfiles];	/* flag bytes follow the pointer array */
+	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
+		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE,
+		    M_FILEDESC, M_ZERO | M_WAITOK);
+	else
+		nmap = NULL;
+	FILEDESC_XLOCK(fdp);
+
+	/*
+	 * We now have new tables ready to go.  Since we dropped the
+	 * filedesc lock to call malloc(), watch out for a race.
+	 */
+	onfiles = fdp->fd_nfiles;
+	if (onfiles >= nnfiles) {
+		/* we lost the race, but that's OK */
+		free(ntable, M_FILEDESC);
+		if (nmap != NULL)
+			free(nmap, M_FILEDESC);
+		return;
+	}
+	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
+	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
+	otable = fdp->fd_ofiles;
+	fdp->fd_ofileflags = nfileflags;
+	fdp->fd_ofiles = ntable;
+	/*
+	 * We must preserve ofiles until the process exits because we can't
+	 * be certain that no threads have references to the old table via
+	 * _fget().
+	 * A table larger than NDFILE entries is queued on fd_free, using
+	 * the address just past its onfiles pointers as the freetable
+	 * header; smaller tables are not queued.
+	 */
+	if (onfiles > NDFILE) {
+		fo = (struct freetable *)&otable[onfiles];
+		fdp0 = (struct filedesc0 *)fdp;
+		fo->ft_table = otable;
+		SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next);
+	}
+	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
+		bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
+		if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))	/* old map was malloc'ed, not embedded */
+			free(fdp->fd_map, M_FILEDESC);
+		fdp->fd_map = nmap;
+	}
+	fdp->fd_nfiles = nnfiles;
+}
+
+/*
+ * Allocate a file descriptor for the process.
+ *
+ * The search starts at the larger of minfd and the fd_freefile hint.  On
+ * success the descriptor is marked used, stored in *result and 0 is
+ * returned; EMFILE is returned when the first free descriptor would be at
+ * or above min(lim_cur(p, RLIMIT_NOFILE), maxfilesperproc).  The filedesc
+ * lock must be held exclusively.
+ */
+int
+fdalloc(struct thread *td, int minfd, int *result)
+{
+	struct proc *p = td->td_proc;
+	struct filedesc *fdp = p->p_fd;
+	int fd = -1, maxfd;
+
+	FILEDESC_XLOCK_ASSERT(fdp);
+
+	if (fdp->fd_freefile > minfd)
+		minfd = fdp->fd_freefile;
+
+	PROC_LOCK(p);
+	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+	PROC_UNLOCK(p);
+
+	/*
+	 * Search the bitmap for a free descriptor.  If none is found, try
+	 * to grow the file table.  Keep at it until we either get a file
+	 * descriptor or run into process or system limits; fdgrowtable()
+	 * may drop the filedesc lock, so we're in a race.
+	 */
+	for (;;) {
+		fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
+		if (fd >= maxfd)
+			return (EMFILE);
+		if (fd < fdp->fd_nfiles)
+			break;
+		fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));	/* double, capped at maxfd */
+	}
+
+	/*
+	 * Perform some sanity checks, then mark the file descriptor as
+	 * used and return it to the caller.
+	 */
+	KASSERT(!fdisused(fdp, fd),
+	    ("fd_first_free() returned non-free descriptor"));
+	KASSERT(fdp->fd_ofiles[fd] == NULL,
+	    ("free descriptor isn't"));
+	fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
+	fdused(fdp, fd);
+	*result = fd;
+	return (0);
+}
+
+/*
+ * Check to see whether n user file descriptors are available to the process
+ * p.
+ */
+int
+fdavail(struct thread *td, int n)
+{
+	struct proc *p = td->td_proc;
+	struct filedesc *fdp = td->td_proc->p_fd;
+	struct file **fpp;
+	int i, lim, last;
+
+	FILEDESC_LOCK_ASSERT(fdp);
+
+	PROC_LOCK(p);
+	lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+	PROC_UNLOCK(p);
+	/* Room to grow the table up to lim counts as available first. */
+	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
+		return (1);
+	last = min(fdp->fd_nfiles, lim);
+	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
+	/* Then count empty slots from fd_freefile up to last. */
+	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
+		if (*fpp == NULL && --n <= 0)
+			return (1);
+	}
+	return (0);
+}
+
+/*
+ * Create a new open file structure and allocate a file descriptor for the
+ * process that refers to it.  We add one reference to the file for the
+ * descriptor table and one reference for resultfp. This is to prevent us
+ * being preempted and the entry in the descriptor table closed after we
+ * release the FILEDESC lock.
+ *
+ * resultfp and resultfd may each be NULL.  Returns ENFILE when the
+ * openfiles limit check fails, or the fdalloc() error; on either failure
+ * the new file structure is released.
+ */
+int
+falloc(struct thread *td, struct file **resultfp, int *resultfd)
+{
+	struct proc *p = td->td_proc;
+	struct file *fp;
+	int error, i;
+	int maxuserfiles = maxfiles - (maxfiles / 20);	/* maxfiles less one twentieth */
+	static struct timeval lastfail;
+	static int curfail;
+
+	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
+	if ((openfiles >= maxuserfiles &&
+	    priv_check(td, PRIV_MAXFILES) != 0) ||
+	    openfiles >= maxfiles) {
+		if (ppsratecheck(&lastfail, &curfail, 1)) {
+			printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",
+				td->td_ucred->cr_ruid);
+		}
+		uma_zfree(file_zone, fp);
+		return (ENFILE);
+	}
+	atomic_add_int(&openfiles, 1);
+
+	/*
+	 * If the process has file descriptor zero open, add the new file
+	 * descriptor to the list of open files at that point, otherwise
+	 * put it at the front of the list of open files.
+	 */
+	refcount_init(&fp->f_count, 1);
+	if (resultfp)
+		fhold(fp);
+	fp->f_cred = crhold(td->td_ucred);
+	fp->f_ops = &badfileops;
+	fp->f_data = NULL;
+	fp->f_vnode = NULL;
+	FILEDESC_XLOCK(p->p_fd);
+	if ((error = fdalloc(td, 0, &i))) {
+		FILEDESC_XUNLOCK(p->p_fd);
+		/* Drop the table reference, and the caller's if one was taken. */
+		fdrop(fp, td);
+		if (resultfp)
+			fdrop(fp, td);
+		return (error);
+	}
+	p->p_fd->fd_ofiles[i] = fp;
+	FILEDESC_XUNLOCK(p->p_fd);
+	if (resultfp)
+		*resultfp = fp;
+	if (resultfd)
+		*resultfd = i;
+	return (0);
+}
+
+/*
+ * Build a new filedesc structure from another.
+ * Copy the current, root, and jail root vnode references.
+ *
+ * fdp may be NULL, in which case no vnode references are copied.  The new
+ * structure starts with the embedded NDFILE-entry arrays, a reference
+ * count and hold count of 1, and fd_lastfile of -1.
+ */
+struct filedesc *
+fdinit(struct filedesc *fdp)
+{
+	struct filedesc0 *newfdp;
+
+	newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
+	FILEDESC_LOCK_INIT(&newfdp->fd_fd);
+	if (fdp != NULL) {
+		FILEDESC_XLOCK(fdp);
+		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
+		if (newfdp->fd_fd.fd_cdir)
+			VREF(newfdp->fd_fd.fd_cdir);
+		newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
+		if (newfdp->fd_fd.fd_rdir)
+			VREF(newfdp->fd_fd.fd_rdir);
+		newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
+		if (newfdp->fd_fd.fd_jdir)
+			VREF(newfdp->fd_fd.fd_jdir);
+		FILEDESC_XUNLOCK(fdp);
+	}
+
+	/* Create the file descriptor table. */
+	newfdp->fd_fd.fd_refcnt = 1;
+	newfdp->fd_fd.fd_holdcnt = 1;
+	newfdp->fd_fd.fd_cmask = CMASK;
+	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
+	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
+	newfdp->fd_fd.fd_nfiles = NDFILE;
+	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
+	newfdp->fd_fd.fd_lastfile = -1;
+	return (&newfdp->fd_fd);
+}
+/*
+ * Take a hold on p's filedesc under fdesc_mtx by incrementing fd_holdcnt.
+ * Returns p->p_fd, which may be NULL (no hold is taken in that case).
+ */
+static struct filedesc *
+fdhold(struct proc *p)
+{
+	struct filedesc *fdp;
+
+	mtx_lock(&fdesc_mtx);
+	fdp = p->p_fd;
+	if (fdp != NULL)
+		fdp->fd_holdcnt++;
+	mtx_unlock(&fdesc_mtx);
+	return (fdp);
+}
+/*
+ * Release a hold on fdp.  When fd_holdcnt reaches zero the lock is
+ * destroyed, every retired table queued on fd_free is freed, and then
+ * the filedesc itself is freed.
+ */
+static void
+fddrop(struct filedesc *fdp)
+{
+	struct filedesc0 *fdp0;
+	struct freetable *ft;
+	int i;
+
+	mtx_lock(&fdesc_mtx);
+	i = --fdp->fd_holdcnt;
+	mtx_unlock(&fdesc_mtx);
+	if (i > 0)
+		return;
+
+	FILEDESC_LOCK_DESTROY(fdp);
+	fdp0 = (struct filedesc0 *)fdp;
+	while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
+		SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
+		free(ft->ft_table, M_FILEDESC);
+	}
+	free(fdp, M_FILEDESC);
+}
+
+/*
+ * Share a filedesc structure.
+ *
+ * Increments fd_refcnt under the exclusive filedesc lock and returns fdp.
+ */
+struct filedesc *
+fdshare(struct filedesc *fdp)
+{
+
+	FILEDESC_XLOCK(fdp);
+	fdp->fd_refcnt++;
+	FILEDESC_XUNLOCK(fdp);
+	return (fdp);
+}
+
+/*
+ * Unshare a filedesc structure, if necessary by making a copy
+ *
+ * When fd_refcnt is greater than 1, p->p_fd is replaced by the result of
+ * fdcopy() after fdfree(td) is called; otherwise nothing changes.
+ */
+void
+fdunshare(struct proc *p, struct thread *td)
+{
+
+	FILEDESC_XLOCK(p->p_fd);
+	if (p->p_fd->fd_refcnt > 1) {
+		struct filedesc *tmp;
+
+		FILEDESC_XUNLOCK(p->p_fd);
+		tmp = fdcopy(p->p_fd);
+		fdfree(td);
+		p->p_fd = tmp;
+	} else
+		FILEDESC_XUNLOCK(p->p_fd);
+}
+
+/*
+ * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
+ * this is to ease callers, not catch errors.
+ *
+ * Entries whose file type is DTYPE_KQUEUE or whose f_ops is still
+ * badfileops are not copied.  Each copied file gains a reference via
+ * fhold().
+ */
+struct filedesc *
+fdcopy(struct filedesc *fdp)
+{
+	struct filedesc *newfdp;
+	int i;
+
+	/* Certain daemons might not have file descriptors. */
+	if (fdp == NULL)
+		return (NULL);
+
+	newfdp = fdinit(fdp);
+	FILEDESC_SLOCK(fdp);
+	/* Re-check after each grow: fdp is unlocked while newfdp is grown. */
+	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
+		FILEDESC_SUNLOCK(fdp);
+		FILEDESC_XLOCK(newfdp);
+		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
+		FILEDESC_XUNLOCK(newfdp);
+		FILEDESC_SLOCK(fdp);
+	}
+	/* copy everything except kqueue descriptors */
+	newfdp->fd_freefile = -1;
+	for (i = 0; i <= fdp->fd_lastfile; ++i) {
+		if (fdisused(fdp, i) &&
+		    fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE &&
+		    fdp->fd_ofiles[i]->f_ops != &badfileops) {
+			newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
+			newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
+			fhold(newfdp->fd_ofiles[i]);
+			newfdp->fd_lastfile = i;
+		} else {
+			if (newfdp->fd_freefile == -1)	/* remember the first gap */
+				newfdp->fd_freefile = i;
+		}
+	}
+	newfdp->fd_cmask = fdp->fd_cmask;
+	FILEDESC_SUNLOCK(fdp);
+	FILEDESC_XLOCK(newfdp);
+	for (i = 0; i <= newfdp->fd_lastfile; ++i)
+		if (newfdp->fd_ofiles[i] != NULL)
+			fdused(newfdp, i);
+	if (newfdp->fd_freefile == -1)	/* no gap found: first free is past the last copied fd */
+		newfdp->fd_freefile = i;
+	FILEDESC_XUNLOCK(newfdp);
+	return (newfdp);
+}
+
+/*
+ * Release a filedesc structure.
+ */
+void
+fdfree(struct thread *td)
+{
+	struct filedesc *fdp;
+	struct file **fpp;
+	int i, locked;
+	struct filedesc_to_leader *fdtol;
+	struct file *fp;
+	struct vnode *cdir, *jdir, *rdir, *vp;
+	struct flock lf;
+
+	/* Certain daemons might not have file descriptors.
*/ + fdp = td->td_proc->p_fd; + if (fdp == NULL) + return; + + /* Check for special need to clear POSIX style locks */ + fdtol = td->td_proc->p_fdtol; + if (fdtol != NULL) { + FILEDESC_XLOCK(fdp); + KASSERT(fdtol->fdl_refcount > 0, + ("filedesc_to_refcount botch: fdl_refcount=%d", + fdtol->fdl_refcount)); + if (fdtol->fdl_refcount == 1 && + (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { + for (i = 0, fpp = fdp->fd_ofiles; + i <= fdp->fd_lastfile; + i++, fpp++) { + if (*fpp == NULL || + (*fpp)->f_type != DTYPE_VNODE) + continue; + fp = *fpp; + fhold(fp); + FILEDESC_XUNLOCK(fdp); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = fp->f_vnode; + locked = VFS_LOCK_GIANT(vp->v_mount); + (void) VOP_ADVLOCK(vp, + (caddr_t)td->td_proc-> + p_leader, + F_UNLCK, + &lf, + F_POSIX); + VFS_UNLOCK_GIANT(locked); + FILEDESC_XLOCK(fdp); + fdrop(fp, td); + fpp = fdp->fd_ofiles + i; + } + } + retry: + if (fdtol->fdl_refcount == 1) { + if (fdp->fd_holdleaderscount > 0 && + (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { + /* + * close() or do_dup() has cleared a reference + * in a shared file descriptor table. + */ + fdp->fd_holdleaderswakeup = 1; + sx_sleep(&fdp->fd_holdleaderscount, + FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); + goto retry; + } + if (fdtol->fdl_holdcount > 0) { + /* + * Ensure that fdtol->fdl_leader remains + * valid in closef(). 
+ */ + fdtol->fdl_wakeup = 1; + sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, + "fdlhold", 0); + goto retry; + } + } + fdtol->fdl_refcount--; + if (fdtol->fdl_refcount == 0 && + fdtol->fdl_holdcount == 0) { + fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; + fdtol->fdl_prev->fdl_next = fdtol->fdl_next; + } else + fdtol = NULL; + td->td_proc->p_fdtol = NULL; + FILEDESC_XUNLOCK(fdp); + if (fdtol != NULL) + free(fdtol, M_FILEDESC_TO_LEADER); + } + FILEDESC_XLOCK(fdp); + i = --fdp->fd_refcnt; + FILEDESC_XUNLOCK(fdp); + if (i > 0) + return; + + fpp = fdp->fd_ofiles; + for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { + if (*fpp) { + FILEDESC_XLOCK(fdp); + fp = *fpp; + *fpp = NULL; + FILEDESC_XUNLOCK(fdp); + (void) closef(fp, td); + } + } + FILEDESC_XLOCK(fdp); + + /* XXX This should happen earlier. */ + mtx_lock(&fdesc_mtx); + td->td_proc->p_fd = NULL; + mtx_unlock(&fdesc_mtx); + + if (fdp->fd_nfiles > NDFILE) + free(fdp->fd_ofiles, M_FILEDESC); + if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) + free(fdp->fd_map, M_FILEDESC); + + fdp->fd_nfiles = 0; + + cdir = fdp->fd_cdir; + fdp->fd_cdir = NULL; + rdir = fdp->fd_rdir; + fdp->fd_rdir = NULL; + jdir = fdp->fd_jdir; + fdp->fd_jdir = NULL; + FILEDESC_XUNLOCK(fdp); + + if (cdir) { + locked = VFS_LOCK_GIANT(cdir->v_mount); + vrele(cdir); + VFS_UNLOCK_GIANT(locked); + } + if (rdir) { + locked = VFS_LOCK_GIANT(rdir->v_mount); + vrele(rdir); + VFS_UNLOCK_GIANT(locked); + } + if (jdir) { + locked = VFS_LOCK_GIANT(jdir->v_mount); + vrele(jdir); + VFS_UNLOCK_GIANT(locked); + } + + fddrop(fdp); +} + +/* + * For setugid programs, we don't want to people to use that setugidness + * to generate error messages which write to a file which otherwise would + * otherwise be off-limits to the process. We check for filesystems where + * the vnode can change out from under us after execve (like [lin]procfs). + * + * Since setugidsafety calls this only for fd 0, 1 and 2, this check is + * sufficient. 
We also don't check for setugidness since we know we are. + */ +static int +is_unsafe(struct file *fp) +{ + if (fp->f_type == DTYPE_VNODE) { + struct vnode *vp = fp->f_vnode; + + if ((vp->v_vflag & VV_PROCDEP) != 0) + return (1); + } + return (0); +} + +/* + * Make this setguid thing safe, if at all possible. + */ +void +setugidsafety(struct thread *td) +{ + struct filedesc *fdp; + int i; + + /* Certain daemons might not have file descriptors. */ + fdp = td->td_proc->p_fd; + if (fdp == NULL) + return; + + /* + * Note: fdp->fd_ofiles may be reallocated out from under us while + * we are blocked in a close. Be careful! + */ + FILEDESC_XLOCK(fdp); + for (i = 0; i <= fdp->fd_lastfile; i++) { + if (i > 2) + break; + if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { + struct file *fp; + + knote_fdclose(td, i); + /* + * NULL-out descriptor prior to close to avoid + * a race while close blocks. + */ + fp = fdp->fd_ofiles[i]; + fdp->fd_ofiles[i] = NULL; + fdp->fd_ofileflags[i] = 0; + fdunused(fdp, i); + FILEDESC_XUNLOCK(fdp); + (void) closef(fp, td); + FILEDESC_XLOCK(fdp); + } + } + FILEDESC_XUNLOCK(fdp); +} + +/* + * If a specific file object occupies a specific file descriptor, close the + * file descriptor entry and drop a reference on the file object. This is a + * convenience function to handle a subsequent error in a function that calls + * falloc() that handles the race that another thread might have closed the + * file descriptor out from under the thread creating the file object. + */ +void +fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td) +{ + + FILEDESC_XLOCK(fdp); + if (fdp->fd_ofiles[idx] == fp) { + fdp->fd_ofiles[idx] = NULL; + fdunused(fdp, idx); + FILEDESC_XUNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_XUNLOCK(fdp); +} + +/* + * Close any files on exec? + */ +void +fdcloseexec(struct thread *td) +{ + struct filedesc *fdp; + int i; + + /* Certain daemons might not have file descriptors. 
*/ + fdp = td->td_proc->p_fd; + if (fdp == NULL) + return; + + FILEDESC_XLOCK(fdp); + + /* + * We cannot cache fd_ofiles or fd_ofileflags since operations + * may block and rip them out from under us. + */ + for (i = 0; i <= fdp->fd_lastfile; i++) { + if (fdp->fd_ofiles[i] != NULL && + (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE || + (fdp->fd_ofileflags[i] & UF_EXCLOSE))) { + struct file *fp; + + knote_fdclose(td, i); + /* + * NULL-out descriptor prior to close to avoid + * a race while close blocks. + */ + fp = fdp->fd_ofiles[i]; + fdp->fd_ofiles[i] = NULL; + fdp->fd_ofileflags[i] = 0; + fdunused(fdp, i); + if (fp->f_type == DTYPE_MQUEUE) + mq_fdclose(td, i, fp); + FILEDESC_XUNLOCK(fdp); + (void) closef(fp, td); + FILEDESC_XLOCK(fdp); + } + } + FILEDESC_XUNLOCK(fdp); +} + +/* + * It is unsafe for set[ug]id processes to be started with file + * descriptors 0..2 closed, as these descriptors are given implicit + * significance in the Standard C library. fdcheckstd() will create a + * descriptor referencing /dev/null for each of stdin, stdout, and + * stderr that is not already open. + */ +int +fdcheckstd(struct thread *td) +{ + struct filedesc *fdp; + register_t retval, save; + int i, error, devnull; + + fdp = td->td_proc->p_fd; + if (fdp == NULL) + return (0); + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); + devnull = -1; + error = 0; + for (i = 0; i < 3; i++) { + if (fdp->fd_ofiles[i] != NULL) + continue; + if (devnull < 0) { + save = td->td_retval[0]; + error = kern_open(td, "/dev/null", UIO_SYSSPACE, + O_RDWR, 0); + devnull = td->td_retval[0]; + KASSERT(devnull == i, ("oof, we didn't get our fd")); + td->td_retval[0] = save; + if (error) + break; + } else { + error = do_dup(td, DUP_FIXED, devnull, i, &retval); + if (error != 0) + break; + } + } + return (error); +} + +/* + * Internal form of close. Decrement reference count on file structure. + * Note: td may be NULL when closing a file that was being passed in a + * message. 
+ *
+ * XXXRW: Giant is not required for the caller, but often will be held; this
+ * makes it moderately likely the Giant will be recursed in the VFS case.
+ *
+ * Returns the result of the final fdrop() on fp.
+ */
+int
+closef(struct file *fp, struct thread *td)
+{
+	struct vnode *vp;
+	struct flock lf;
+	struct filedesc_to_leader *fdtol;
+	struct filedesc *fdp;
+
+	/*
+	 * POSIX record locking dictates that any close releases ALL
+	 * locks owned by this process.  This is handled by setting
+	 * a flag in the unlock to free ONLY locks obeying POSIX
+	 * semantics, and not to free BSD-style file locks.
+	 * If the descriptor was in a message, POSIX-style locks
+	 * aren't passed with the descriptor, and the thread pointer
+	 * will be NULL.  Callers should be careful only to pass a
+	 * NULL thread pointer when there really is no owning
+	 * context that might have locks, or the locks will be
+	 * leaked.
+	 */
+	if (fp->f_type == DTYPE_VNODE && td != NULL) {
+		int vfslocked;
+
+		vp = fp->f_vnode;
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
+			lf.l_whence = SEEK_SET;
+			lf.l_start = 0;
+			lf.l_len = 0;
+			lf.l_type = F_UNLCK;
+			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
+					   F_UNLCK, &lf, F_POSIX);
+		}
+		fdtol = td->td_proc->p_fdtol;
+		if (fdtol != NULL) {
+			/*
+			 * Handle special case where file descriptor table is
+			 * shared between multiple process leaders.
+			 */
+			fdp = td->td_proc->p_fd;
+			FILEDESC_XLOCK(fdp);
+			/* Walk the circular list, skipping this process's own entry. */
+			for (fdtol = fdtol->fdl_next;
+			     fdtol != td->td_proc->p_fdtol;
+			     fdtol = fdtol->fdl_next) {
+				if ((fdtol->fdl_leader->p_flag &
+				     P_ADVLOCK) == 0)
+					continue;
+				fdtol->fdl_holdcount++;	/* held while the table lock is dropped */
+				FILEDESC_XUNLOCK(fdp);
+				lf.l_whence = SEEK_SET;
+				lf.l_start = 0;
+				lf.l_len = 0;
+				lf.l_type = F_UNLCK;
+				vp = fp->f_vnode;
+				(void) VOP_ADVLOCK(vp,
+						   (caddr_t)fdtol->fdl_leader,
+						   F_UNLCK, &lf, F_POSIX);
+				FILEDESC_XLOCK(fdp);
+				fdtol->fdl_holdcount--;
+				if (fdtol->fdl_holdcount == 0 &&
+				    fdtol->fdl_wakeup != 0) {
+					fdtol->fdl_wakeup = 0;
+					wakeup(fdtol);
+				}
+			}
+			FILEDESC_XUNLOCK(fdp);
+		}
+		VFS_UNLOCK_GIANT(vfslocked);
+	}
+	return (fdrop(fp, td));
+}
+
+/*
+ * Initialize the file pointer with the specified properties.
+ *
+ * The ops are set with release semantics to be certain that the flags, type,
+ * and data are visible when ops is.  This is to prevent ops methods from being
+ * called with bad data.
+ */
+void
+finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
+{
+	fp->f_data = data;
+	fp->f_flag = flag;
+	fp->f_type = type;
+	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
+}
+#endif /* __rtems__ */
+/*
+ * Look up descriptor fd in fdp without taking the filedesc lock and
+ * return the file with an extra reference, or NULL when fd is out of
+ * range or the slot is empty.  The caller releases it with fdrop().
+ */
+struct file *
+fget_unlocked(struct filedesc *fdp, int fd)
+{
+	struct file *fp;
+	u_int count;
+
+	if (fd < 0 || fd >= fdp->fd_nfiles)
+		return (NULL);
+	/*
+	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
+	 * never raising a refcount above 0.  To accomplish this we have
+	 * to use a cmpset loop rather than an atomic_add.  The descriptor
+	 * must be re-verified once we acquire a reference to be certain
+	 * that the identity is still correct and we did not lose a race
+	 * due to preemption.
+	 */
+	for (;;) {
+		fp = fdp->fd_ofiles[fd];
+		if (fp == NULL)
+			break;
+		count = fp->f_count;
+		if (count == 0)
+			continue;	/* being torn down: re-read the slot */
+		/*
+		 * Use an acquire barrier to prevent caching of fd_ofiles
+		 * so it is refreshed for verification.
+		 */
+		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
+			continue;
+		if (fp == fdp->fd_ofiles[fd])
+			break;
+		fdrop(fp, curthread);	/* slot changed underneath us: undo and retry */
+	}
+
+	return (fp);
+}
+
+/*
+ * Extract the file pointer associated with the specified descriptor for the
+ * current user process.
+ *
+ * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
+ * returned.
+ *
+ * If an error occurred the non-zero error is returned and *fpp is set to
+ * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
+ * responsible for fdrop().
+ */
+static __inline int
+_fget(struct thread *td, int fd, struct file **fpp, int flags)
+{
+	struct filedesc *fdp;
+	struct file *fp;
+
+	*fpp = NULL;
+	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
+		return (EBADF);
+	if ((fp = fget_unlocked(fdp, fd)) == NULL)
+		return (EBADF);
+	if (fp->f_ops == &badfileops) {
+		fdrop(fp, td);
+		return (EBADF);
+	}
+	/*
+	 * FREAD and FWRITE failure return EBADF as per POSIX.
+	 *
+	 * Only one flag, or 0, may be specified.
+	 */
+	if ((flags == FREAD && (fp->f_flag & FREAD) == 0) ||
+	    (flags == FWRITE && (fp->f_flag & FWRITE) == 0)) {
+		fdrop(fp, td);
+		return (EBADF);
+	}
+	*fpp = fp;
+	return (0);
+}
+/* _fget() with no access-mode requirement. */
+int
+fget(struct thread *td, int fd, struct file **fpp)
+{
+
+	return(_fget(td, fd, fpp, 0));
+}
+/* _fget() that additionally requires the file to be open for reading. */
+int
+fget_read(struct thread *td, int fd, struct file **fpp)
+{
+
+	return(_fget(td, fd, fpp, FREAD));
+}
+
+#ifndef __rtems__
+int
+fget_write(struct thread *td, int fd, struct file **fpp)	/* requires FWRITE */
+{
+
+	return(_fget(td, fd, fpp, FWRITE));
+}
+
+/*
+ * Like fget() but loads the underlying vnode, or returns an error if the
+ * descriptor does not represent a vnode.  Note that pipes use vnodes but
+ * never have VM objects.  The returned vnode will be vref()'d.
+ *
+ * XXX: what about the unused flags ? 
+ */ +static __inline int +_fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) +{ + struct file *fp; + int error; + + *vpp = NULL; + if ((error = _fget(td, fd, &fp, flags)) != 0) + return (error); + if (fp->f_vnode == NULL) { + error = EINVAL; + } else { + *vpp = fp->f_vnode; + vref(*vpp); + } + fdrop(fp, td); + + return (error); +} + +int +fgetvp(struct thread *td, int fd, struct vnode **vpp) +{ + + return (_fgetvp(td, fd, vpp, 0)); +} + +int +fgetvp_read(struct thread *td, int fd, struct vnode **vpp) +{ + + return (_fgetvp(td, fd, vpp, FREAD)); +} + +#ifdef notyet +int +fgetvp_write(struct thread *td, int fd, struct vnode **vpp) +{ + + return (_fgetvp(td, fd, vpp, FWRITE)); +} +#endif + +/* + * Like fget() but loads the underlying socket, or returns an error if the + * descriptor does not represent a socket. + * + * We bump the ref count on the returned socket. XXX Also obtain the SX lock + * in the future. + * + * Note: fgetsock() and fputsock() are deprecated, as consumers should rely + * on their file descriptor reference to prevent the socket from being free'd + * during use. + */ +int +fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) +{ + struct file *fp; + int error; + + *spp = NULL; + if (fflagp != NULL) + *fflagp = 0; + if ((error = _fget(td, fd, &fp, 0)) != 0) + return (error); + if (fp->f_type != DTYPE_SOCKET) { + error = ENOTSOCK; + } else { + *spp = fp->f_data; + if (fflagp) + *fflagp = fp->f_flag; + SOCK_LOCK(*spp); + soref(*spp); + SOCK_UNLOCK(*spp); + } + fdrop(fp, td); + + return (error); +} + +/* + * Drop the reference count on the socket and XXX release the SX lock in the + * future. The last reference closes the socket. + * + * Note: fputsock() is deprecated, see comment for fgetsock(). + */ +void +fputsock(struct socket *so) +{ + + ACCEPT_LOCK(); + SOCK_LOCK(so); + sorele(so); +} +#endif /* __rtems__ */ + +/* + * Handle the last reference to a file being closed. 
+ *
+ * The RTEMS build does not support this path and panics.  Otherwise the
+ * file's close method is invoked (unless it still has badfileops), any
+ * cdev private data is dropped, and the credential and the file object
+ * are released.  Returns the error from fo_close(), or 0.
+ */
+int
+_fdrop(struct file *fp, struct thread *td)
+{
+#ifdef __rtems__
+	panic("fdrop: RTEMS unsupported");
+
+#else /* __rtems__ */
+	int error;
+
+	error = 0;
+	if (fp->f_count != 0)
+		panic("fdrop: count %d", fp->f_count);
+	if (fp->f_ops != &badfileops)
+		error = fo_close(fp, td);
+	/*
+	 * The f_cdevpriv cannot be assigned non-NULL value while we
+	 * are destroying the file.
+	 */
+	if (fp->f_cdevpriv != NULL)
+		devfs_fpdrop(fp);
+	atomic_subtract_int(&openfiles, 1);
+	crfree(fp->f_cred);
+	uma_zfree(file_zone, fp);
+
+	return (error);
+#endif /* __rtems__ */
+}
+
+#ifndef __rtems__
+/*
+ * Apply an advisory lock on a file descriptor.
+ *
+ * Just attempt to get a record lock of the requested type on the entire file
+ * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
+ *
+ * Returns EOPNOTSUPP for non-vnode descriptors and EBADF when 'how'
+ * names none of LOCK_UN, LOCK_EX or LOCK_SH.
+ */
+#ifndef _SYS_SYSPROTO_HH_
+struct flock_args {
+	int	fd;
+	int	how;
+};
+#endif
+/* ARGSUSED */
+int
+flock(struct thread *td, struct flock_args *uap)
+{
+	struct file *fp;
+	struct vnode *vp;
+	struct flock lf;
+	int vfslocked;
+	int error;
+
+	if ((error = fget(td, uap->fd, &fp)) != 0)
+		return (error);
+	if (fp->f_type != DTYPE_VNODE) {
+		fdrop(fp, td);
+		return (EOPNOTSUPP);
+	}
+
+	vp = fp->f_vnode;
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	lf.l_whence = SEEK_SET;
+	lf.l_start = 0;
+	lf.l_len = 0;
+	if (uap->how & LOCK_UN) {
+		lf.l_type = F_UNLCK;
+		atomic_clear_int(&fp->f_flag, FHASLOCK);
+		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
+		goto done2;
+	}
+	if (uap->how & LOCK_EX)
+		lf.l_type = F_WRLCK;
+	else if (uap->how & LOCK_SH)
+		lf.l_type = F_RDLCK;
+	else {
+		error = EBADF;
+		goto done2;
+	}
+	atomic_set_int(&fp->f_flag, FHASLOCK);
+	/* Without LOCK_NB the request is allowed to sleep (F_WAIT). */
+	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
+	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
+done2:
+	fdrop(fp, td);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+/*
+ * Duplicate the specified descriptor to a free descriptor. 
+ *
+ * 'error' selects the behaviour: ENODEV duplicates dfd into indx, ENXIO
+ * moves dfd into indx, and any other value is returned unchanged.
+ */
+int
+dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)
+{
+	struct file *wfp;
+	struct file *fp;
+
+	/*
+	 * If the to-be-dup'd fd number is greater than the allowed number
+	 * of file descriptors, or the fd to be dup'd has already been
+	 * closed, then reject.
+	 */
+	FILEDESC_XLOCK(fdp);
+	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
+	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
+		FILEDESC_XUNLOCK(fdp);
+		return (EBADF);
+	}
+
+	/*
+	 * There are two cases of interest here.
+	 *
+	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
+	 *
+	 * For ENXIO steal away the file structure from (dfd) and store it in
+	 * (indx).  (dfd) is effectively closed by this operation.
+	 *
+	 * Any other error code is just returned.
+	 */
+	switch (error) {
+	case ENODEV:
+		/*
+		 * Check that the mode the file is being opened for is a
+		 * subset of the mode of the existing descriptor.
+		 */
+		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
+	 		FILEDESC_XUNLOCK(fdp);
+			return (EACCES);
+		}
+		fp = fdp->fd_ofiles[indx];
+		fdp->fd_ofiles[indx] = wfp;
+		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
+		if (fp == NULL)
+			fdused(fdp, indx);
+		fhold(wfp);
+		FILEDESC_XUNLOCK(fdp);
+		if (fp != NULL)
+			/*
+			 * We now own the reference to fp that the ofiles[]
+			 * array used to own.  Release it.
+			 */
+			fdrop(fp, td);
+		return (0);
+
+	case ENXIO:
+		/*
+		 * Steal away the file pointer from dfd and stuff it into indx.
+		 */
+		fp = fdp->fd_ofiles[indx];
+		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
+		fdp->fd_ofiles[dfd] = NULL;
+		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
+		fdp->fd_ofileflags[dfd] = 0;
+		fdunused(fdp, dfd);
+		if (fp == NULL)
+			fdused(fdp, indx);
+		FILEDESC_XUNLOCK(fdp);
+
+		/*
+		 * We now own the reference to fp that the ofiles[] array
+		 * used to own.  Release it.
+		 */
+		if (fp != NULL)
+			fdrop(fp, td);
+		return (0);
+
+	default:
+		FILEDESC_XUNLOCK(fdp);
+		return (error);
+	}
+	/* NOTREACHED */
+}
+
+/*
+ * Scan all active processes and prisons to see if any of them have a current
+ * or root directory of `olddp'.  If so, replace them with the new mount point.
+ *
+ * Every replacement takes a reference on newdp; the matching references
+ * on olddp are released in one batch at the end.
+ */
+void
+mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
+{
+	struct filedesc *fdp;
+	struct prison *pr;
+	struct proc *p;
+	int nrele;
+
+	if (vrefcnt(olddp) == 1)
+		return;
+	nrele = 0;
+	sx_slock(&allproc_lock);
+	FOREACH_PROC_IN_SYSTEM(p) {
+		fdp = fdhold(p);
+		if (fdp == NULL)
+			continue;
+		FILEDESC_XLOCK(fdp);
+		if (fdp->fd_cdir == olddp) {
+			vref(newdp);
+			fdp->fd_cdir = newdp;
+			nrele++;
+		}
+		if (fdp->fd_rdir == olddp) {
+			vref(newdp);
+			fdp->fd_rdir = newdp;
+			nrele++;
+		}
+		if (fdp->fd_jdir == olddp) {
+			vref(newdp);
+			fdp->fd_jdir = newdp;
+			nrele++;
+		}
+		FILEDESC_XUNLOCK(fdp);
+		fddrop(fdp);
+	}
+	sx_sunlock(&allproc_lock);
+	if (rootvnode == olddp) {
+		vref(newdp);
+		rootvnode = newdp;
+		nrele++;
+	}
+	mtx_lock(&prison0.pr_mtx);
+	if (prison0.pr_root == olddp) {
+		vref(newdp);
+		prison0.pr_root = newdp;
+		nrele++;
+	}
+	mtx_unlock(&prison0.pr_mtx);
+	sx_slock(&allprison_lock);
+	TAILQ_FOREACH(pr, &allprison, pr_list) {
+		mtx_lock(&pr->pr_mtx);
+		if (pr->pr_root == olddp) {
+			vref(newdp);
+			pr->pr_root = newdp;
+			nrele++;
+		}
+		mtx_unlock(&pr->pr_mtx);
+	}
+	sx_sunlock(&allprison_lock);
+	while (nrele--)
+		vrele(olddp);
+}
+/*
+ * Allocate a filedesc_to_leader for 'leader' with one reference.  When
+ * 'old' is given the new entry is linked after it in the circular list
+ * under fdp's exclusive lock; otherwise it forms a list of its own.
+ */
+struct filedesc_to_leader *
+filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
+{
+	struct filedesc_to_leader *fdtol;
+
+	fdtol = malloc(sizeof(struct filedesc_to_leader),
+	       M_FILEDESC_TO_LEADER,
+	       M_WAITOK);
+	fdtol->fdl_refcount = 1;
+	fdtol->fdl_holdcount = 0;
+	fdtol->fdl_wakeup = 0;
+	fdtol->fdl_leader = leader;
+	if (old != NULL) {
+		FILEDESC_XLOCK(fdp);
+		fdtol->fdl_next = old->fdl_next;
+		fdtol->fdl_prev = old;
+		old->fdl_next = fdtol;
+		fdtol->fdl_next->fdl_prev = fdtol;
+		FILEDESC_XUNLOCK(fdp);
+	} else {
+		fdtol->fdl_next = fdtol;
+		fdtol->fdl_prev = fdtol;
+	}
+	return (fdtol);
+}
+
+/*
+ * Get file structures globally.
+ *
+ * With no output buffer only a size estimate is returned; otherwise one
+ * struct xfile is emitted per open file of every process the requester
+ * is allowed to see.
+ */
+static int
+sysctl_kern_file(SYSCTL_HANDLER_ARGS)
+{
+	struct xfile xf;
+	struct filedesc *fdp;
+	struct file *fp;
+	struct proc *p;
+	int error, n;
+
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+	if (req->oldptr == NULL) {
+		n = 0;
+		sx_slock(&allproc_lock);
+		FOREACH_PROC_IN_SYSTEM(p) {
+			if (p->p_state == PRS_NEW)
+				continue;
+			fdp = fdhold(p);
+			if (fdp == NULL)
+				continue;
+			/* overestimates sparse tables. */
+			if (fdp->fd_lastfile > 0)
+				n += fdp->fd_lastfile;
+			fddrop(fdp);
+		}
+		sx_sunlock(&allproc_lock);
+		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
+	}
+	error = 0;
+	bzero(&xf, sizeof(xf));
+	xf.xf_size = sizeof(xf);
+	sx_slock(&allproc_lock);
+	FOREACH_PROC_IN_SYSTEM(p) {
+		if (p->p_state == PRS_NEW)
+			continue;
+		PROC_LOCK(p);
+		if (p_cansee(req->td, p) != 0) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		xf.xf_pid = p->p_pid;
+		xf.xf_uid = p->p_ucred->cr_uid;
+		PROC_UNLOCK(p);
+		fdp = fdhold(p);
+		if (fdp == NULL)
+			continue;
+		FILEDESC_SLOCK(fdp);
+		for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
+			if ((fp = fdp->fd_ofiles[n]) == NULL)
+				continue;
+			xf.xf_fd = n;
+			xf.xf_file = fp;
+			xf.xf_data = fp->f_data;
+			xf.xf_vnode = fp->f_vnode;
+			xf.xf_type = fp->f_type;
+			xf.xf_count = fp->f_count;
+			xf.xf_msgcount = 0;
+			xf.xf_offset = fp->f_offset;
+			xf.xf_flag = fp->f_flag;
+			error = SYSCTL_OUT(req, &xf, sizeof(xf));
+			if (error)
+				break;
+		}
+		FILEDESC_SUNLOCK(fdp);
+		fddrop(fdp);
+		if (error)
+			break;
+	}
+	sx_sunlock(&allproc_lock);
+	return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
+    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
+
+#ifdef KINFO_OFILE_SIZE
+CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
+#endif
+
+#ifdef COMPAT_FREEBSD7
+/*
+ * Emit one kinfo_ofile record describing directory vnode vp, tagged
+ * with pseudo-descriptor 'type'.  Entered and left with fdp
+ * shared-locked; the lock is dropped around the path lookup and copyout.
+ * Returns ENOTDIR when vp is not a directory.
+ */
+static int
+export_vnode_for_osysctl(struct vnode *vp, int type,
+    struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
+{
+	int error;
+	char *fullpath, *freepath;
+	int vfslocked;
+
+	bzero(kif, sizeof(*kif));
+	kif->kf_structsize = sizeof(*kif);
+
+	vref(vp);
+	kif->kf_fd = type;
+	kif->kf_type = KF_TYPE_VNODE;
+	/* This function only handles directories. */
+	if (vp->v_type != VDIR) {
+		vrele(vp);
+		return (ENOTDIR);
+	}
+	kif->kf_vnode_type = KF_VTYPE_VDIR;
+
+	/*
+	 * This is not a true file descriptor, so we set a bogus refcount
+	 * and offset to indicate these fields should be ignored.
+	 */
+	kif->kf_ref_count = -1;
+	kif->kf_offset = -1;
+
+	freepath = NULL;
+	fullpath = "-";	/* placeholder kept if vn_fullpath() leaves it untouched */
+	FILEDESC_SUNLOCK(fdp);
+	vn_fullpath(curthread, vp, &fullpath, &freepath);
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	vrele(vp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
+	if (freepath != NULL)
+		free(freepath, M_TEMP);
+	error = SYSCTL_OUT(req, kif, sizeof(*kif));
+	FILEDESC_SLOCK(fdp);
+	return (error);
+}
+
+/*
+ * Get per-process file descriptors for use by procstat(1), et al. 
+ */ +static int +sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) +{ + char *fullpath, *freepath; + struct kinfo_ofile *kif; + struct filedesc *fdp; + int error, i, *name; + struct socket *so; + struct vnode *vp; + struct file *fp; + struct proc *p; + struct tty *tp; + int vfslocked; + + name = (int *)arg1; + if ((p = pfind((pid_t)name[0])) == NULL) + return (ESRCH); + if ((error = p_candebug(curthread, p))) { + PROC_UNLOCK(p); + return (error); + } + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + return (ENOENT); + kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); + FILEDESC_SLOCK(fdp); + if (fdp->fd_cdir != NULL) + export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, + fdp, req); + if (fdp->fd_rdir != NULL) + export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, + fdp, req); + if (fdp->fd_jdir != NULL) + export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, + fdp, req); + for (i = 0; i < fdp->fd_nfiles; i++) { + if ((fp = fdp->fd_ofiles[i]) == NULL) + continue; + bzero(kif, sizeof(*kif)); + kif->kf_structsize = sizeof(*kif); + vp = NULL; + so = NULL; + tp = NULL; + kif->kf_fd = i; + switch (fp->f_type) { + case DTYPE_VNODE: + kif->kf_type = KF_TYPE_VNODE; + vp = fp->f_vnode; + break; + + case DTYPE_SOCKET: + kif->kf_type = KF_TYPE_SOCKET; + so = fp->f_data; + break; + + case DTYPE_PIPE: + kif->kf_type = KF_TYPE_PIPE; + break; + + case DTYPE_FIFO: + kif->kf_type = KF_TYPE_FIFO; + vp = fp->f_vnode; + break; + + case DTYPE_KQUEUE: + kif->kf_type = KF_TYPE_KQUEUE; + break; + + case DTYPE_CRYPTO: + kif->kf_type = KF_TYPE_CRYPTO; + break; + + case DTYPE_MQUEUE: + kif->kf_type = KF_TYPE_MQUEUE; + break; + + case DTYPE_SHM: + kif->kf_type = KF_TYPE_SHM; + break; + + case DTYPE_SEM: + kif->kf_type = KF_TYPE_SEM; + break; + + case DTYPE_PTS: + kif->kf_type = KF_TYPE_PTS; + tp = fp->f_data; + break; + + default: + kif->kf_type = KF_TYPE_UNKNOWN; + break; + } + kif->kf_ref_count = fp->f_count; + if (fp->f_flag & FREAD) + kif->kf_flags |= 
KF_FLAG_READ; + if (fp->f_flag & FWRITE) + kif->kf_flags |= KF_FLAG_WRITE; + if (fp->f_flag & FAPPEND) + kif->kf_flags |= KF_FLAG_APPEND; + if (fp->f_flag & FASYNC) + kif->kf_flags |= KF_FLAG_ASYNC; + if (fp->f_flag & FFSYNC) + kif->kf_flags |= KF_FLAG_FSYNC; + if (fp->f_flag & FNONBLOCK) + kif->kf_flags |= KF_FLAG_NONBLOCK; + if (fp->f_flag & O_DIRECT) + kif->kf_flags |= KF_FLAG_DIRECT; + if (fp->f_flag & FHASLOCK) + kif->kf_flags |= KF_FLAG_HASLOCK; + kif->kf_offset = fp->f_offset; + if (vp != NULL) { + vref(vp); + switch (vp->v_type) { + case VNON: + kif->kf_vnode_type = KF_VTYPE_VNON; + break; + case VREG: + kif->kf_vnode_type = KF_VTYPE_VREG; + break; + case VDIR: + kif->kf_vnode_type = KF_VTYPE_VDIR; + break; + case VBLK: + kif->kf_vnode_type = KF_VTYPE_VBLK; + break; + case VCHR: + kif->kf_vnode_type = KF_VTYPE_VCHR; + break; + case VLNK: + kif->kf_vnode_type = KF_VTYPE_VLNK; + break; + case VSOCK: + kif->kf_vnode_type = KF_VTYPE_VSOCK; + break; + case VFIFO: + kif->kf_vnode_type = KF_VTYPE_VFIFO; + break; + case VBAD: + kif->kf_vnode_type = KF_VTYPE_VBAD; + break; + default: + kif->kf_vnode_type = KF_VTYPE_UNKNOWN; + break; + } + /* + * It is OK to drop the filedesc lock here as we will + * re-validate and re-evaluate its properties when + * the loop continues. 
+ */ + freepath = NULL; + fullpath = "-"; + FILEDESC_SUNLOCK(fdp); + vn_fullpath(curthread, vp, &fullpath, &freepath); + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vrele(vp); + VFS_UNLOCK_GIANT(vfslocked); + strlcpy(kif->kf_path, fullpath, + sizeof(kif->kf_path)); + if (freepath != NULL) + free(freepath, M_TEMP); + FILEDESC_SLOCK(fdp); + } + if (so != NULL) { + struct sockaddr *sa; + + if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa) + == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { + bcopy(sa, &kif->kf_sa_local, sa->sa_len); + free(sa, M_SONAME); + } + if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa) + == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { + bcopy(sa, &kif->kf_sa_peer, sa->sa_len); + free(sa, M_SONAME); + } + kif->kf_sock_domain = + so->so_proto->pr_domain->dom_family; + kif->kf_sock_type = so->so_type; + kif->kf_sock_protocol = so->so_proto->pr_protocol; + } + if (tp != NULL) { + strlcpy(kif->kf_path, tty_devname(tp), + sizeof(kif->kf_path)); + } + error = SYSCTL_OUT(req, kif, sizeof(*kif)); + if (error) + break; + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); + free(kif, M_TEMP); + return (0); +} + +static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD, + sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); +#endif /* COMPAT_FREEBSD7 */ + +#ifdef KINFO_FILE_SIZE +CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); +#endif + +static int +export_vnode_for_sysctl(struct vnode *vp, int type, + struct kinfo_file *kif, struct filedesc *fdp, struct sysctl_req *req) +{ + int error; + char *fullpath, *freepath; + int vfslocked; + + bzero(kif, sizeof(*kif)); + + vref(vp); + kif->kf_fd = type; + kif->kf_type = KF_TYPE_VNODE; + /* This function only handles directories. */ + if (vp->v_type != VDIR) { + vrele(vp); + return (ENOTDIR); + } + kif->kf_vnode_type = KF_VTYPE_VDIR; + + /* + * This is not a true file descriptor, so we set a bogus refcount + * and offset to indicate these fields should be ignored. 
+ */ + kif->kf_ref_count = -1; + kif->kf_offset = -1; + + freepath = NULL; + fullpath = "-"; + FILEDESC_SUNLOCK(fdp); + vn_fullpath(curthread, vp, &fullpath, &freepath); + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vrele(vp); + VFS_UNLOCK_GIANT(vfslocked); + strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); + if (freepath != NULL) + free(freepath, M_TEMP); + /* Pack record size down */ + kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + + strlen(kif->kf_path) + 1; + kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); + error = SYSCTL_OUT(req, kif, kif->kf_structsize); + FILEDESC_SLOCK(fdp); + return (error); +} + +/* + * Get per-process file descriptors for use by procstat(1), et al. + */ +static int +sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) +{ + char *fullpath, *freepath; + struct kinfo_file *kif; + struct filedesc *fdp; + int error, i, *name; + struct socket *so; + struct vnode *vp; + struct file *fp; + struct proc *p; + struct tty *tp; + int vfslocked; + size_t oldidx; + + name = (int *)arg1; + if ((p = pfind((pid_t)name[0])) == NULL) + return (ESRCH); + if ((error = p_candebug(curthread, p))) { + PROC_UNLOCK(p); + return (error); + } + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + return (ENOENT); + kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); + FILEDESC_SLOCK(fdp); + if (fdp->fd_cdir != NULL) + export_vnode_for_sysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, + fdp, req); + if (fdp->fd_rdir != NULL) + export_vnode_for_sysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, + fdp, req); + if (fdp->fd_jdir != NULL) + export_vnode_for_sysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, + fdp, req); + for (i = 0; i < fdp->fd_nfiles; i++) { + if ((fp = fdp->fd_ofiles[i]) == NULL) + continue; + bzero(kif, sizeof(*kif)); + vp = NULL; + so = NULL; + tp = NULL; + kif->kf_fd = i; + switch (fp->f_type) { + case DTYPE_VNODE: + kif->kf_type = KF_TYPE_VNODE; + vp = fp->f_vnode; + break; + + case DTYPE_SOCKET: + kif->kf_type = KF_TYPE_SOCKET; + so = 
fp->f_data; + break; + + case DTYPE_PIPE: + kif->kf_type = KF_TYPE_PIPE; + break; + + case DTYPE_FIFO: + kif->kf_type = KF_TYPE_FIFO; + vp = fp->f_vnode; + break; + + case DTYPE_KQUEUE: + kif->kf_type = KF_TYPE_KQUEUE; + break; + + case DTYPE_CRYPTO: + kif->kf_type = KF_TYPE_CRYPTO; + break; + + case DTYPE_MQUEUE: + kif->kf_type = KF_TYPE_MQUEUE; + break; + + case DTYPE_SHM: + kif->kf_type = KF_TYPE_SHM; + break; + + case DTYPE_SEM: + kif->kf_type = KF_TYPE_SEM; + break; + + case DTYPE_PTS: + kif->kf_type = KF_TYPE_PTS; + tp = fp->f_data; + break; + + default: + kif->kf_type = KF_TYPE_UNKNOWN; + break; + } + kif->kf_ref_count = fp->f_count; + if (fp->f_flag & FREAD) + kif->kf_flags |= KF_FLAG_READ; + if (fp->f_flag & FWRITE) + kif->kf_flags |= KF_FLAG_WRITE; + if (fp->f_flag & FAPPEND) + kif->kf_flags |= KF_FLAG_APPEND; + if (fp->f_flag & FASYNC) + kif->kf_flags |= KF_FLAG_ASYNC; + if (fp->f_flag & FFSYNC) + kif->kf_flags |= KF_FLAG_FSYNC; + if (fp->f_flag & FNONBLOCK) + kif->kf_flags |= KF_FLAG_NONBLOCK; + if (fp->f_flag & O_DIRECT) + kif->kf_flags |= KF_FLAG_DIRECT; + if (fp->f_flag & FHASLOCK) + kif->kf_flags |= KF_FLAG_HASLOCK; + kif->kf_offset = fp->f_offset; + if (vp != NULL) { + vref(vp); + switch (vp->v_type) { + case VNON: + kif->kf_vnode_type = KF_VTYPE_VNON; + break; + case VREG: + kif->kf_vnode_type = KF_VTYPE_VREG; + break; + case VDIR: + kif->kf_vnode_type = KF_VTYPE_VDIR; + break; + case VBLK: + kif->kf_vnode_type = KF_VTYPE_VBLK; + break; + case VCHR: + kif->kf_vnode_type = KF_VTYPE_VCHR; + break; + case VLNK: + kif->kf_vnode_type = KF_VTYPE_VLNK; + break; + case VSOCK: + kif->kf_vnode_type = KF_VTYPE_VSOCK; + break; + case VFIFO: + kif->kf_vnode_type = KF_VTYPE_VFIFO; + break; + case VBAD: + kif->kf_vnode_type = KF_VTYPE_VBAD; + break; + default: + kif->kf_vnode_type = KF_VTYPE_UNKNOWN; + break; + } + /* + * It is OK to drop the filedesc lock here as we will + * re-validate and re-evaluate its properties when + * the loop continues. 
+ */ + freepath = NULL; + fullpath = "-"; + FILEDESC_SUNLOCK(fdp); + vn_fullpath(curthread, vp, &fullpath, &freepath); + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vrele(vp); + VFS_UNLOCK_GIANT(vfslocked); + strlcpy(kif->kf_path, fullpath, + sizeof(kif->kf_path)); + if (freepath != NULL) + free(freepath, M_TEMP); + FILEDESC_SLOCK(fdp); + } + if (so != NULL) { + struct sockaddr *sa; + + if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa) + == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { + bcopy(sa, &kif->kf_sa_local, sa->sa_len); + free(sa, M_SONAME); + } + if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa) + == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { + bcopy(sa, &kif->kf_sa_peer, sa->sa_len); + free(sa, M_SONAME); + } + kif->kf_sock_domain = + so->so_proto->pr_domain->dom_family; + kif->kf_sock_type = so->so_type; + kif->kf_sock_protocol = so->so_proto->pr_protocol; + } + if (tp != NULL) { + strlcpy(kif->kf_path, tty_devname(tp), + sizeof(kif->kf_path)); + } + /* Pack record size down */ + kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + + strlen(kif->kf_path) + 1; + kif->kf_structsize = roundup(kif->kf_structsize, + sizeof(uint64_t)); + oldidx = req->oldidx; + error = SYSCTL_OUT(req, kif, kif->kf_structsize); + if (error) { + if (error == ENOMEM) { + /* + * The hack to keep the ABI of sysctl + * kern.proc.filedesc intact, but not + * to account a partially copied + * kinfo_file into the oldidx. + */ + req->oldidx = oldidx; + error = 0; + } + break; + } + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); + free(kif, M_TEMP); + return (error); +} + +static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD, + sysctl_kern_proc_filedesc, "Process filedesc entries"); + +#ifdef DDB +/* + * For the purposes of debugging, generate a human-readable string for the + * file type. 
+ */ +static const char * +file_type_to_name(short type) +{ + + switch (type) { + case 0: + return ("zero"); + case DTYPE_VNODE: + return ("vnod"); + case DTYPE_SOCKET: + return ("sock"); + case DTYPE_PIPE: + return ("pipe"); + case DTYPE_FIFO: + return ("fifo"); + case DTYPE_KQUEUE: + return ("kque"); + case DTYPE_CRYPTO: + return ("crpt"); + case DTYPE_MQUEUE: + return ("mque"); + case DTYPE_SHM: + return ("shm"); + case DTYPE_SEM: + return ("ksem"); + default: + return ("unkn"); + } +} + +/* + * For the purposes of debugging, identify a process (if any, perhaps one of + * many) that references the passed file in its file descriptor array. Return + * NULL if none. + */ +static struct proc * +file_to_first_proc(struct file *fp) +{ + struct filedesc *fdp; + struct proc *p; + int n; + + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + fdp = p->p_fd; + if (fdp == NULL) + continue; + for (n = 0; n < fdp->fd_nfiles; n++) { + if (fp == fdp->fd_ofiles[n]) + return (p); + } + } + return (NULL); +} + +static void +db_print_file(struct file *fp, int header) +{ + struct proc *p; + + if (header) + db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", + "File", "Type", "Data", "Flag", "GCFl", "Count", + "MCount", "Vnode", "FPID", "FCmd"); + p = file_to_first_proc(fp); + db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, + file_type_to_name(fp->f_type), fp->f_data, fp->f_flag, + 0, fp->f_count, 0, fp->f_vnode, + p != NULL ? p->p_pid : -1, p != NULL ? 
p->p_comm : "-"); +} + +DB_SHOW_COMMAND(file, db_show_file) +{ + struct file *fp; + + if (!have_addr) { + db_printf("usage: show file <addr>\n"); + return; + } + fp = (struct file *)addr; + db_print_file(fp, 1); +} + +DB_SHOW_COMMAND(files, db_show_files) +{ + struct filedesc *fdp; + struct file *fp; + struct proc *p; + int header; + int n; + + header = 1; + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + if ((fdp = p->p_fd) == NULL) + continue; + for (n = 0; n < fdp->fd_nfiles; ++n) { + if ((fp = fdp->fd_ofiles[n]) == NULL) + continue; + db_print_file(fp, header); + header = 0; + } + } +} +#endif + +SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, + &maxfilesperproc, 0, "Maximum files allowed open per process"); + +SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, + &maxfiles, 0, "Maximum number of files"); + +SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, + __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); + +/* ARGSUSED*/ +static void +filelistinit(void *dummy) +{ + + file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); + mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF); +} +SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); +#endif /* __rtems__ */ + +/*-------------------------------------------------------------------*/ + +static int +badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) +{ + + return (EINVAL); +} + +static int +badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) +{ + + return (0); +} + +static 
int +badfo_kqfilter(struct file *fp, struct knote *kn) +{ + + return (EBADF); +} + +static int +badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_close(struct file *fp, struct thread *td) +{ + + return (EBADF); +} + +struct fileops badfileops = { + .fo_read = badfo_readwrite, + .fo_write = badfo_readwrite, + .fo_truncate = badfo_truncate, + .fo_ioctl = badfo_ioctl, + .fo_poll = badfo_poll, + .fo_kqfilter = badfo_kqfilter, + .fo_stat = badfo_stat, + .fo_close = badfo_close, +}; + +#ifndef __rtems__ +/*-------------------------------------------------------------------*/ + +/* + * File Descriptor pseudo-device driver (/dev/fd/). + * + * Opening minor device N dup()s the file (if any) connected to file + * descriptor N belonging to the calling process. Note that this driver + * consists of only the ``open()'' routine, because all subsequent + * references to this file will be direct to the other driver. + * + * XXX: we could give this one a cloning event handler if necessary. + */ + +/* ARGSUSED */ +static int +fdopen(struct cdev *dev, int mode, int type, struct thread *td) +{ + + /* +#include <freebsd/machine/rtems-bsd-config.h> + +/*- + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 + */ + +#include <freebsd/sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <freebsd/local/opt_compat.h> +#include <freebsd/local/opt_ddb.h> +#include <freebsd/local/opt_ktrace.h> + +#include <freebsd/sys/param.h> +#include <freebsd/sys/systm.h> + +#include <freebsd/sys/conf.h> +#include <freebsd/sys/domain.h> +#include <freebsd/sys/fcntl.h> +#include <freebsd/sys/file.h> +#include <freebsd/sys/filedesc.h> +#include <freebsd/sys/filio.h> +#include <freebsd/sys/jail.h> +#include <freebsd/sys/kernel.h> +#include <freebsd/sys/limits.h> +#include <freebsd/sys/lock.h> +#include <freebsd/sys/malloc.h> +#include <freebsd/sys/mount.h> +#include <freebsd/sys/mqueue.h> +#include <freebsd/sys/mutex.h> +#include <freebsd/sys/namei.h> +#include <freebsd/sys/priv.h> +#include <freebsd/sys/proc.h> +#include <freebsd/sys/protosw.h> +#include <freebsd/sys/resourcevar.h> +#include <freebsd/sys/signalvar.h> +#include <freebsd/sys/socketvar.h> +#include <freebsd/sys/stat.h> +#include <freebsd/sys/sx.h> +#include <freebsd/sys/syscallsubr.h> +#include <freebsd/sys/sysctl.h> +#include <freebsd/sys/sysproto.h> +#include <freebsd/sys/tty.h> +#include <freebsd/sys/unistd.h> +#include <freebsd/sys/user.h> +#include <freebsd/sys/vnode.h> +#ifdef KTRACE +#include <freebsd/sys/ktrace.h> +#endif + +#include <freebsd/security/audit/audit.h> + +#include <freebsd/vm/uma.h> + +#include <freebsd/ddb/ddb.h> + +static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); +static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", + "file desc to leader structures"); +static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); + +static uma_zone_t file_zone; + + +/* Flags for do_dup() */ +#define DUP_FIXED 0x1 /* Force fixed allocation */ +#define DUP_FCNTL 0x2 /* fcntl()-style errors */ + +static int do_dup(struct thread *td, int flags, int old, int new, + register_t *retval); +static int fd_first_free(struct filedesc *, int, int); 
+static int fd_last_used(struct filedesc *, int, int); +static void fdgrowtable(struct filedesc *, int); +static void fdunused(struct filedesc *fdp, int fd); +static void fdused(struct filedesc *fdp, int fd); + +/* + * A process is initially started out with NDFILE descriptors stored within + * this structure, selected to be enough for typical applications based on + * the historical limit of 20 open files (and the usage of descriptors by + * shells). If these descriptors are exhausted, a larger descriptor table + * may be allocated, up to a process' resource limit; the internal arrays + * are then unused. + */ +#define NDFILE 20 +#define NDSLOTSIZE sizeof(NDSLOTTYPE) +#define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) +#define NDSLOT(x) ((x) / NDENTRIES) +#define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) +#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) + +/* + * Storage required per open file descriptor. + */ +#define OFILESIZE (sizeof(struct file *) + sizeof(char)) + +/* + * Storage to hold unused ofiles that need to be reclaimed. + */ +struct freetable { + struct file **ft_table; + SLIST_ENTRY(freetable) ft_next; +}; + +/* + * Basic allocation of descriptors: + * one of the above, plus arrays for NDFILE descriptors. + */ +struct filedesc0 { + struct filedesc fd_fd; + /* + * ofiles which need to be reclaimed on free. + */ + SLIST_HEAD(,freetable) fd_free; + /* + * These arrays are used when the number of open files is + * <= NDFILE, and are then pointed to by the pointers above. + */ + struct file *fd_dfiles[NDFILE]; + char fd_dfileflags[NDFILE]; + NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; +}; + +/* + * Descriptor management. + */ +volatile int openfiles; /* actual number of open files */ +struct mtx sigio_lock; /* mtx to protect pointers to sigio */ +#ifndef __rtems__ +void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); + +/* A mutex to protect the association between a proc and filedesc. 
*/ +static struct mtx fdesc_mtx; + +/* + * Find the first zero bit in the given bitmap, starting at low and not + * exceeding size - 1. + */ +static int +fd_first_free(struct filedesc *fdp, int low, int size) +{ + NDSLOTTYPE *map = fdp->fd_map; + NDSLOTTYPE mask; + int off, maxoff; + + if (low >= size) + return (low); + + off = NDSLOT(low); + if (low % NDENTRIES) { + mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); + if ((mask &= ~map[off]) != 0UL) + return (off * NDENTRIES + ffsl(mask) - 1); + ++off; + } + for (maxoff = NDSLOTS(size); off < maxoff; ++off) + if (map[off] != ~0UL) + return (off * NDENTRIES + ffsl(~map[off]) - 1); + return (size); +} + +/* + * Find the highest non-zero bit in the given bitmap, starting at low and + * not exceeding size - 1. + */ +static int +fd_last_used(struct filedesc *fdp, int low, int size) +{ + NDSLOTTYPE *map = fdp->fd_map; + NDSLOTTYPE mask; + int off, minoff; + + if (low >= size) + return (-1); + + off = NDSLOT(size); + if (size % NDENTRIES) { + mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); + if ((mask &= map[off]) != 0) + return (off * NDENTRIES + flsl(mask) - 1); + --off; + } + for (minoff = NDSLOT(low); off >= minoff; --off) + if (map[off] != 0) + return (off * NDENTRIES + flsl(map[off]) - 1); + return (low - 1); +} + +static int +fdisused(struct filedesc *fdp, int fd) +{ + KASSERT(fd >= 0 && fd < fdp->fd_nfiles, + ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); + return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); +} + +/* + * Mark a file descriptor as used. + */ +static void +fdused(struct filedesc *fdp, int fd) +{ + + FILEDESC_XLOCK_ASSERT(fdp); + KASSERT(!fdisused(fdp, fd), + ("fd already used")); + + fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); + if (fd > fdp->fd_lastfile) + fdp->fd_lastfile = fd; + if (fd == fdp->fd_freefile) + fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); +} + +/* + * Mark a file descriptor as unused. 
+ */ +static void +fdunused(struct filedesc *fdp, int fd) +{ + + FILEDESC_XLOCK_ASSERT(fdp); + KASSERT(fdisused(fdp, fd), + ("fd is already unused")); + KASSERT(fdp->fd_ofiles[fd] == NULL, + ("fd is still in use")); + + fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); + if (fd < fdp->fd_freefile) + fdp->fd_freefile = fd; + if (fd == fdp->fd_lastfile) + fdp->fd_lastfile = fd_last_used(fdp, 0, fd); +} + +/* + * System calls on descriptors. + */ +#ifndef _SYS_SYSPROTO_HH_ +struct getdtablesize_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +getdtablesize(struct thread *td, struct getdtablesize_args *uap) +{ + struct proc *p = td->td_proc; + + PROC_LOCK(p); + td->td_retval[0] = + min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); + PROC_UNLOCK(p); + return (0); +} + +/* + * Duplicate a file descriptor to a particular value. + * + * Note: keep in mind that a potential race condition exists when closing + * descriptors from a shared descriptor table (via rfork). + */ +#ifndef _SYS_SYSPROTO_HH_ +struct dup2_args { + u_int from; + u_int to; +}; +#endif +/* ARGSUSED */ +int +dup2(struct thread *td, struct dup2_args *uap) +{ + + return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to, + td->td_retval)); +} + +/* + * Duplicate a file descriptor. + */ +#ifndef _SYS_SYSPROTO_HH_ +struct dup_args { + u_int fd; +}; +#endif +/* ARGSUSED */ +int +dup(struct thread *td, struct dup_args *uap) +{ + + return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval)); +} + +/* + * The file control system call. + */ +#ifndef _SYS_SYSPROTO_HH_ +struct fcntl_args { + int fd; + int cmd; + long arg; +}; +#endif +/* ARGSUSED */ +int +fcntl(struct thread *td, struct fcntl_args *uap) +{ + struct flock fl; + struct oflock ofl; + intptr_t arg; + int error; + int cmd; + + error = 0; + cmd = uap->cmd; + switch (uap->cmd) { + case F_OGETLK: + case F_OSETLK: + case F_OSETLKW: + /* + * Convert old flock structure to new. 
+ */ + error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl)); + fl.l_start = ofl.l_start; + fl.l_len = ofl.l_len; + fl.l_pid = ofl.l_pid; + fl.l_type = ofl.l_type; + fl.l_whence = ofl.l_whence; + fl.l_sysid = 0; + + switch (uap->cmd) { + case F_OGETLK: + cmd = F_GETLK; + break; + case F_OSETLK: + cmd = F_SETLK; + break; + case F_OSETLKW: + cmd = F_SETLKW; + break; + } + arg = (intptr_t)&fl; + break; + case F_GETLK: + case F_SETLK: + case F_SETLKW: + case F_SETLK_REMOTE: + error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); + arg = (intptr_t)&fl; + break; + default: + arg = uap->arg; + break; + } + if (error) + return (error); + error = kern_fcntl(td, uap->fd, cmd, arg); + if (error) + return (error); + if (uap->cmd == F_OGETLK) { + ofl.l_start = fl.l_start; + ofl.l_len = fl.l_len; + ofl.l_pid = fl.l_pid; + ofl.l_type = fl.l_type; + ofl.l_whence = fl.l_whence; + error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl)); + } else if (uap->cmd == F_GETLK) { + error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); + } + return (error); +} + +static inline struct file * +fdtofp(int fd, struct filedesc *fdp) +{ + struct file *fp; + + FILEDESC_LOCK_ASSERT(fdp); + if ((unsigned)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (NULL); + return (fp); +} + +int +kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) +{ + struct filedesc *fdp; + struct flock *flp; + struct file *fp; + struct proc *p; + char *pop; + struct vnode *vp; + int error, flg, tmp; + int vfslocked; + u_int old, new; + uint64_t bsize; + + vfslocked = 0; + error = 0; + flg = F_POSIX; + p = td->td_proc; + fdp = p->p_fd; + + switch (cmd) { + case F_DUPFD: + tmp = arg; + error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval); + break; + + case F_DUP2FD: + tmp = arg; + error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval); + break; + + case F_GETFD: + FILEDESC_SLOCK(fdp); + if ((fp = fdtofp(fd, fdp)) == NULL) { + FILEDESC_SUNLOCK(fdp); + error = EBADF; + 
break; + } + pop = &fdp->fd_ofileflags[fd]; + td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0; + FILEDESC_SUNLOCK(fdp); + break; + + case F_SETFD: + FILEDESC_XLOCK(fdp); + if ((fp = fdtofp(fd, fdp)) == NULL) { + FILEDESC_XUNLOCK(fdp); + error = EBADF; + break; + } + pop = &fdp->fd_ofileflags[fd]; + *pop = (*pop &~ UF_EXCLOSE) | + (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); + FILEDESC_XUNLOCK(fdp); + break; + + case F_GETFL: + FILEDESC_SLOCK(fdp); + if ((fp = fdtofp(fd, fdp)) == NULL) { + FILEDESC_SUNLOCK(fdp); + error = EBADF; + break; + } + td->td_retval[0] = OFLAGS(fp->f_flag); + FILEDESC_SUNLOCK(fdp); + break; + + case F_SETFL: + FILEDESC_SLOCK(fdp); + if ((fp = fdtofp(fd, fdp)) == NULL) { + FILEDESC_SUNLOCK(fdp); + error = EBADF; + break; + } + fhold(fp); + FILEDESC_SUNLOCK(fdp); + do { + tmp = flg = fp->f_flag; + tmp &= ~FCNTLFLAGS; + tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; + } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); + tmp = fp->f_flag & FNONBLOCK; + error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); + if (error) { + fdrop(fp, td); + break; + } + tmp = fp->f_flag & FASYNC; + error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); + if (error == 0) { + fdrop(fp, td); + break; + } + atomic_clear_int(&fp->f_flag, FNONBLOCK); + tmp = 0; + (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); + fdrop(fp, td); + break; + + case F_GETOWN: + FILEDESC_SLOCK(fdp); + if ((fp = fdtofp(fd, fdp)) == NULL) { + FILEDESC_SUNLOCK(fdp); + error = EBADF; + break; + } + fhold(fp); + FILEDESC_SUNLOCK(fdp); + error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); + if (error == 0) + td->td_retval[0] = tmp; + fdrop(fp, td); + break; + + case F_SETOWN: + FILEDESC_SLOCK(fdp); + if ((fp = fdtofp(fd, fdp)) == NULL) { + FILEDESC_SUNLOCK(fdp); + error = EBADF; + break; + } + fhold(fp); + FILEDESC_SUNLOCK(fdp); + tmp = arg; + error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); + fdrop(fp, td); + break; + + case F_SETLK_REMOTE: + error = priv_check(td, 
PRIV_NFS_LOCKD); + if (error) + return (error); + flg = F_REMOTE; + goto do_setlk; + + case F_SETLKW: + flg |= F_WAIT; + /* FALLTHROUGH F_SETLK */ + + case F_SETLK: + do_setlk: + FILEDESC_SLOCK(fdp); + if ((fp = fdtofp(fd, fdp)) == NULL) { + FILEDESC_SUNLOCK(fdp); + error = EBADF; + break; + } + if (fp->f_type != DTYPE_VNODE) { + FILEDESC_SUNLOCK(fdp); + error = EBADF; + break; + } + flp = (struct flock *)arg; + if (flp->l_whence == SEEK_CUR) { + if (fp->f_offset < 0 || + (flp->l_start > 0 && + fp->f_offset > OFF_MAX - flp->l_start)) { + FILEDESC_SUNLOCK(fdp); + error = EOVERFLOW; + break; + } + flp->l_start += fp->f_offset; + } + + /* + * VOP_ADVLOCK() may block. + */ + fhold(fp); + FILEDESC_SUNLOCK(fdp); + vp = fp->f_vnode; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + switch (flp->l_type) { + case F_RDLCK: + if ((fp->f_flag & FREAD) == 0) { + error = EBADF; + break; + } + PROC_LOCK(p->p_leader); + p->p_leader->p_flag |= P_ADVLOCK; + PROC_UNLOCK(p->p_leader); + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, + flp, flg); + break; + case F_WRLCK: + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + break; + } + PROC_LOCK(p->p_leader); + p->p_leader->p_flag |= P_ADVLOCK; + PROC_UNLOCK(p->p_leader); + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, + flp, flg); + break; + case F_UNLCK: + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, + flp, flg); + break; + case F_UNLCKSYS: + /* + * Temporary api for testing remote lock + * infrastructure. 
+ */ + if (flg != F_REMOTE) { + error = EINVAL; + break; + } + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_UNLCKSYS, flp, flg); + break; + default: + error = EINVAL; + break; + } + VFS_UNLOCK_GIANT(vfslocked); + vfslocked = 0; + /* Check for race with close */ + FILEDESC_SLOCK(fdp); + if ((unsigned) fd >= fdp->fd_nfiles || + fp != fdp->fd_ofiles[fd]) { + FILEDESC_SUNLOCK(fdp); + flp->l_whence = SEEK_SET; + flp->l_start = 0; + flp->l_len = 0; + flp->l_type = F_UNLCK; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, + F_UNLCK, flp, F_POSIX); + VFS_UNLOCK_GIANT(vfslocked); + vfslocked = 0; + } else + FILEDESC_SUNLOCK(fdp); + fdrop(fp, td); + break; + + case F_GETLK: + FILEDESC_SLOCK(fdp); + if ((fp = fdtofp(fd, fdp)) == NULL) { + FILEDESC_SUNLOCK(fdp); + error = EBADF; + break; + } + if (fp->f_type != DTYPE_VNODE) { + FILEDESC_SUNLOCK(fdp); + error = EBADF; + break; + } + flp = (struct flock *)arg; + if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && + flp->l_type != F_UNLCK) { + FILEDESC_SUNLOCK(fdp); + error = EINVAL; + break; + } + if (flp->l_whence == SEEK_CUR) { + if ((flp->l_start > 0 && + fp->f_offset > OFF_MAX - flp->l_start) || + (flp->l_start < 0 && + fp->f_offset < OFF_MIN - flp->l_start)) { + FILEDESC_SUNLOCK(fdp); + error = EOVERFLOW; + break; + } + flp->l_start += fp->f_offset; + } + /* + * VOP_ADVLOCK() may block. + */ + fhold(fp); + FILEDESC_SUNLOCK(fdp); + vp = fp->f_vnode; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, + F_POSIX); + VFS_UNLOCK_GIANT(vfslocked); + vfslocked = 0; + fdrop(fp, td); + break; + + case F_RDAHEAD: + arg = arg ? 
128 * 1024: 0; + /* FALLTHROUGH */ + case F_READAHEAD: + FILEDESC_SLOCK(fdp); + if ((fp = fdtofp(fd, fdp)) == NULL) { + FILEDESC_SUNLOCK(fdp); + error = EBADF; + break; + } + if (fp->f_type != DTYPE_VNODE) { + FILEDESC_SUNLOCK(fdp); + error = EBADF; + break; + } + fhold(fp); + FILEDESC_SUNLOCK(fdp); + if (arg != 0) { + vp = fp->f_vnode; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + error = vn_lock(vp, LK_SHARED); + if (error != 0) + goto readahead_vnlock_fail; + bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; + VOP_UNLOCK(vp, 0); + fp->f_seqcount = (arg + bsize - 1) / bsize; + do { + new = old = fp->f_flag; + new |= FRDAHEAD; + } while (!atomic_cmpset_rel_int(&fp->f_flag, old, new)); +readahead_vnlock_fail: + VFS_UNLOCK_GIANT(vfslocked); + vfslocked = 0; + } else { + do { + new = old = fp->f_flag; + new &= ~FRDAHEAD; + } while (!atomic_cmpset_rel_int(&fp->f_flag, old, new)); + } + fdrop(fp, td); + break; + + default: + error = EINVAL; + break; + } + VFS_UNLOCK_GIANT(vfslocked); + return (error); +} + +/* + * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). + */ +static int +do_dup(struct thread *td, int flags, int old, int new, + register_t *retval) +{ + struct filedesc *fdp; + struct proc *p; + struct file *fp; + struct file *delfp; + int error, holdleaders, maxfd; + + p = td->td_proc; + fdp = p->p_fd; + + /* + * Verify we have a valid descriptor to dup from and possibly to + * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should + * return EINVAL when the new descriptor is out of bounds. + */ + if (old < 0) + return (EBADF); + if (new < 0) + return (flags & DUP_FCNTL ? EINVAL : EBADF); + PROC_LOCK(p); + maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); + PROC_UNLOCK(p); + if (new >= maxfd) + return (flags & DUP_FCNTL ? 
EINVAL : EMFILE); + + FILEDESC_XLOCK(fdp); + if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { + FILEDESC_XUNLOCK(fdp); + return (EBADF); + } + if (flags & DUP_FIXED && old == new) { + *retval = new; + FILEDESC_XUNLOCK(fdp); + return (0); + } + fp = fdp->fd_ofiles[old]; + fhold(fp); + + /* + * If the caller specified a file descriptor, make sure the file + * table is large enough to hold it, and grab it. Otherwise, just + * allocate a new descriptor the usual way. Since the filedesc + * lock may be temporarily dropped in the process, we have to look + * out for a race. + */ + if (flags & DUP_FIXED) { + if (new >= fdp->fd_nfiles) + fdgrowtable(fdp, new + 1); + if (fdp->fd_ofiles[new] == NULL) + fdused(fdp, new); + } else { + if ((error = fdalloc(td, new, &new)) != 0) { + FILEDESC_XUNLOCK(fdp); + fdrop(fp, td); + return (error); + } + } + + /* + * If the old file changed out from under us then treat it as a + * bad file descriptor. Userland should do its own locking to + * avoid this case. + */ + if (fdp->fd_ofiles[old] != fp) { + /* we've allocated a descriptor which we won't use */ + if (fdp->fd_ofiles[new] == NULL) + fdunused(fdp, new); + FILEDESC_XUNLOCK(fdp); + fdrop(fp, td); + return (EBADF); + } + KASSERT(old != new, + ("new fd is same as old")); + + /* + * Save info on the descriptor being overwritten. We cannot close + * it without introducing an ownership race for the slot, since we + * need to drop the filedesc lock to call closef(). + * + * XXX this duplicates parts of close(). + */ + delfp = fdp->fd_ofiles[new]; + holdleaders = 0; + if (delfp != NULL) { + if (td->td_proc->p_fdtol != NULL) { + /* + * Ask fdfree() to sleep to ensure that all relevant + * process leaders can be traversed in closef(). 
+ */ + fdp->fd_holdleaderscount++; + holdleaders = 1; + } + } + + /* + * Duplicate the source descriptor + */ + fdp->fd_ofiles[new] = fp; + fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; + if (new > fdp->fd_lastfile) + fdp->fd_lastfile = new; + *retval = new; + + /* + * If we dup'd over a valid file, we now own the reference to it + * and must dispose of it using closef() semantics (as if a + * close() were performed on it). + * + * XXX this duplicates parts of close(). + */ + if (delfp != NULL) { + knote_fdclose(td, new); + if (delfp->f_type == DTYPE_MQUEUE) + mq_fdclose(td, new, delfp); + FILEDESC_XUNLOCK(fdp); + (void) closef(delfp, td); + if (holdleaders) { + FILEDESC_XLOCK(fdp); + fdp->fd_holdleaderscount--; + if (fdp->fd_holdleaderscount == 0 && + fdp->fd_holdleaderswakeup != 0) { + fdp->fd_holdleaderswakeup = 0; + wakeup(&fdp->fd_holdleaderscount); + } + FILEDESC_XUNLOCK(fdp); + } + } else { + FILEDESC_XUNLOCK(fdp); + } + return (0); +} + +/* + * If sigio is on the list associated with a process or process group, + * disable signalling from the device, remove sigio from the list and + * free sigio. + */ +void +funsetown(struct sigio **sigiop) +{ + struct sigio *sigio; + + SIGIO_LOCK(); + sigio = *sigiop; + if (sigio == NULL) { + SIGIO_UNLOCK(); + return; + } + *(sigio->sio_myref) = NULL; + if ((sigio)->sio_pgid < 0) { + struct pgrp *pg = (sigio)->sio_pgrp; + PGRP_LOCK(pg); + SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, + sigio, sio_pgsigio); + PGRP_UNLOCK(pg); + } else { + struct proc *p = (sigio)->sio_proc; + PROC_LOCK(p); + SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, + sigio, sio_pgsigio); + PROC_UNLOCK(p); + } + SIGIO_UNLOCK(); + crfree(sigio->sio_ucred); + free(sigio, M_SIGIO); +} + +/* + * Free a list of sigio structures. + * We only need to lock the SIGIO_LOCK because we have made ourselves + * inaccessible to callers of fsetown and therefore do not need to lock + * the proc or pgrp struct for the list manipulation. 
+ */ +void +funsetownlst(struct sigiolst *sigiolst) +{ + struct proc *p; + struct pgrp *pg; + struct sigio *sigio; + + sigio = SLIST_FIRST(sigiolst); + if (sigio == NULL) + return; + p = NULL; + pg = NULL; + + /* + * Every entry of the list should belong + * to a single proc or pgrp. + */ + if (sigio->sio_pgid < 0) { + pg = sigio->sio_pgrp; + PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); + } else /* if (sigio->sio_pgid > 0) */ { + p = sigio->sio_proc; + PROC_LOCK_ASSERT(p, MA_NOTOWNED); + } + + SIGIO_LOCK(); + while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { + *(sigio->sio_myref) = NULL; + if (pg != NULL) { + KASSERT(sigio->sio_pgid < 0, + ("Proc sigio in pgrp sigio list")); + KASSERT(sigio->sio_pgrp == pg, + ("Bogus pgrp in sigio list")); + PGRP_LOCK(pg); + SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, + sio_pgsigio); + PGRP_UNLOCK(pg); + } else /* if (p != NULL) */ { + KASSERT(sigio->sio_pgid > 0, + ("Pgrp sigio in proc sigio list")); + KASSERT(sigio->sio_proc == p, + ("Bogus proc in sigio list")); + PROC_LOCK(p); + SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, + sio_pgsigio); + PROC_UNLOCK(p); + } + SIGIO_UNLOCK(); + crfree(sigio->sio_ucred); + free(sigio, M_SIGIO); + SIGIO_LOCK(); + } + SIGIO_UNLOCK(); +} + +/* + * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). + * + * After permission checking, add a sigio structure to the sigio list for + * the process or process group. + */ +int +fsetown(pid_t pgid, struct sigio **sigiop) +{ + struct proc *proc; + struct pgrp *pgrp; + struct sigio *sigio; + int ret; + + if (pgid == 0) { + funsetown(sigiop); + return (0); + } + + ret = 0; + + /* Allocate and fill in the new sigio out of locks. 
*/ + sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); + sigio->sio_pgid = pgid; + sigio->sio_ucred = crhold(curthread->td_ucred); + sigio->sio_myref = sigiop; + + sx_slock(&proctree_lock); + if (pgid > 0) { + proc = pfind(pgid); + if (proc == NULL) { + ret = ESRCH; + goto fail; + } + + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + PROC_UNLOCK(proc); + if (proc->p_session != curthread->td_proc->p_session) { + ret = EPERM; + goto fail; + } + + pgrp = NULL; + } else /* if (pgid < 0) */ { + pgrp = pgfind(-pgid); + if (pgrp == NULL) { + ret = ESRCH; + goto fail; + } + PGRP_UNLOCK(pgrp); + + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + if (pgrp->pg_session != curthread->td_proc->p_session) { + ret = EPERM; + goto fail; + } + + proc = NULL; + } + funsetown(sigiop); + if (pgid > 0) { + PROC_LOCK(proc); + /* + * Since funsetownlst() is called without the proctree + * locked, we need to check for P_WEXIT. + * XXX: is ESRCH correct? + */ + if ((proc->p_flag & P_WEXIT) != 0) { + PROC_UNLOCK(proc); + ret = ESRCH; + goto fail; + } + SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); + sigio->sio_proc = proc; + PROC_UNLOCK(proc); + } else { + PGRP_LOCK(pgrp); + SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); + sigio->sio_pgrp = pgrp; + PGRP_UNLOCK(pgrp); + } + sx_sunlock(&proctree_lock); + SIGIO_LOCK(); + *sigiop = sigio; + SIGIO_UNLOCK(); + return (0); + +fail: + sx_sunlock(&proctree_lock); + crfree(sigio->sio_ucred); + free(sigio, M_SIGIO); + return (ret); +} + +/* + * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). 
+ */ +pid_t +fgetown(sigiop) + struct sigio **sigiop; +{ + pid_t pgid; + + SIGIO_LOCK(); + pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; + SIGIO_UNLOCK(); + return (pgid); +} + +/* + * Close a file descriptor. + */ +#ifndef _SYS_SYSPROTO_HH_ +struct close_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +close(td, uap) + struct thread *td; + struct close_args *uap; +{ + + return (kern_close(td, uap->fd)); +} + +int +kern_close(td, fd) + struct thread *td; + int fd; +{ + struct filedesc *fdp; + struct file *fp; + int error; + int holdleaders; + + error = 0; + holdleaders = 0; + fdp = td->td_proc->p_fd; + + AUDIT_SYSCLOSE(td, fd); + + FILEDESC_XLOCK(fdp); + if ((unsigned)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) { + FILEDESC_XUNLOCK(fdp); + return (EBADF); + } + fdp->fd_ofiles[fd] = NULL; + fdp->fd_ofileflags[fd] = 0; + fdunused(fdp, fd); + if (td->td_proc->p_fdtol != NULL) { + /* + * Ask fdfree() to sleep to ensure that all relevant + * process leaders can be traversed in closef(). + */ + fdp->fd_holdleaderscount++; + holdleaders = 1; + } + + /* + * We now hold the fp reference that used to be owned by the + * descriptor array. We have to unlock the FILEDESC *AFTER* + * knote_fdclose to prevent a race of the fd getting opened, a knote + * added, and deleteing a knote for the new fd. + */ + knote_fdclose(td, fd); + if (fp->f_type == DTYPE_MQUEUE) + mq_fdclose(td, fd, fp); + FILEDESC_XUNLOCK(fdp); + + error = closef(fp, td); + if (holdleaders) { + FILEDESC_XLOCK(fdp); + fdp->fd_holdleaderscount--; + if (fdp->fd_holdleaderscount == 0 && + fdp->fd_holdleaderswakeup != 0) { + fdp->fd_holdleaderswakeup = 0; + wakeup(&fdp->fd_holdleaderscount); + } + FILEDESC_XUNLOCK(fdp); + } + return (error); +} + +/* + * Close open file descriptors. 
+ */ +#ifndef _SYS_SYSPROTO_HH_ +struct closefrom_args { + int lowfd; +}; +#endif +/* ARGSUSED */ +int +closefrom(struct thread *td, struct closefrom_args *uap) +{ + struct filedesc *fdp; + int fd; + + fdp = td->td_proc->p_fd; + AUDIT_ARG_FD(uap->lowfd); + + /* + * Treat negative starting file descriptor values identical to + * closefrom(0) which closes all files. + */ + if (uap->lowfd < 0) + uap->lowfd = 0; + FILEDESC_SLOCK(fdp); + for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) { + if (fdp->fd_ofiles[fd] != NULL) { + FILEDESC_SUNLOCK(fdp); + (void)kern_close(td, fd); + FILEDESC_SLOCK(fdp); + } + } + FILEDESC_SUNLOCK(fdp); + return (0); +} + +#if defined(COMPAT_43) +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_HH_ +struct ofstat_args { + int fd; + struct ostat *sb; +}; +#endif +/* ARGSUSED */ +int +ofstat(struct thread *td, struct ofstat_args *uap) +{ + struct ostat oub; + struct stat ub; + int error; + + error = kern_fstat(td, uap->fd, &ub); + if (error == 0) { + cvtstat(&ub, &oub); + error = copyout(&oub, uap->sb, sizeof(oub)); + } + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_HH_ +struct fstat_args { + int fd; + struct stat *sb; +}; +#endif +/* ARGSUSED */ +int +fstat(struct thread *td, struct fstat_args *uap) +{ + struct stat ub; + int error; + + error = kern_fstat(td, uap->fd, &ub); + if (error == 0) + error = copyout(&ub, uap->sb, sizeof(ub)); + return (error); +} + +int +kern_fstat(struct thread *td, int fd, struct stat *sbp) +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(fd); + + if ((error = fget(td, fd, &fp)) != 0) + return (error); + + AUDIT_ARG_FILE(td->td_proc, fp); + + error = fo_stat(fp, sbp, td->td_ucred, td); + fdrop(fp, td); +#ifdef KTRACE + if (error == 0 && KTRPOINT(td, KTR_STRUCT)) + ktrstat(sbp); +#endif + return (error); +} + +/* + * Return status information about a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_HH_ +struct nfstat_args { + int fd; + struct nstat *sb; +}; +#endif +/* ARGSUSED */ +int +nfstat(struct thread *td, struct nfstat_args *uap) +{ + struct nstat nub; + struct stat ub; + int error; + + error = kern_fstat(td, uap->fd, &ub); + if (error == 0) { + cvtnstat(&ub, &nub); + error = copyout(&nub, uap->sb, sizeof(nub)); + } + return (error); +} + +/* + * Return pathconf information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_HH_ +struct fpathconf_args { + int fd; + int name; +}; +#endif +/* ARGSUSED */ +int +fpathconf(struct thread *td, struct fpathconf_args *uap) +{ + struct file *fp; + struct vnode *vp; + int error; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + + /* If asynchronous I/O is available, it works for all descriptors. */ + if (uap->name == _PC_ASYNC_IO) { + td->td_retval[0] = async_io_version; + goto out; + } + vp = fp->f_vnode; + if (vp != NULL) { + int vfslocked; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_PATHCONF(vp, uap->name, td->td_retval); + VOP_UNLOCK(vp, 0); + VFS_UNLOCK_GIANT(vfslocked); + } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { + if (uap->name != _PC_PIPE_BUF) { + error = EINVAL; + } else { + td->td_retval[0] = PIPE_BUF; + error = 0; + } + } else { + error = EOPNOTSUPP; + } +out: + fdrop(fp, td); + return (error); +} + +/* + * Grow the file table to accomodate (at least) nfd descriptors. This may + * block and drop the filedesc lock, but it will reacquire it before + * returning. 
+ */ +static void +fdgrowtable(struct filedesc *fdp, int nfd) +{ + struct filedesc0 *fdp0; + struct freetable *fo; + struct file **ntable; + struct file **otable; + char *nfileflags; + int nnfiles, onfiles; + NDSLOTTYPE *nmap; + + FILEDESC_XLOCK_ASSERT(fdp); + + KASSERT(fdp->fd_nfiles > 0, + ("zero-length file table")); + + /* compute the size of the new table */ + onfiles = fdp->fd_nfiles; + nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ + if (nnfiles <= onfiles) + /* the table is already large enough */ + return; + + /* allocate a new table and (if required) new bitmaps */ + FILEDESC_XUNLOCK(fdp); + ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable), + M_FILEDESC, M_ZERO | M_WAITOK); + nfileflags = (char *)&ntable[nnfiles]; + if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) + nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, + M_FILEDESC, M_ZERO | M_WAITOK); + else + nmap = NULL; + FILEDESC_XLOCK(fdp); + + /* + * We now have new tables ready to go. Since we dropped the + * filedesc lock to call malloc(), watch out for a race. + */ + onfiles = fdp->fd_nfiles; + if (onfiles >= nnfiles) { + /* we lost the race, but that's OK */ + free(ntable, M_FILEDESC); + if (nmap != NULL) + free(nmap, M_FILEDESC); + return; + } + bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable)); + bcopy(fdp->fd_ofileflags, nfileflags, onfiles); + otable = fdp->fd_ofiles; + fdp->fd_ofileflags = nfileflags; + fdp->fd_ofiles = ntable; + /* + * We must preserve ofiles until the process exits because we can't + * be certain that no threads have references to the old table via + * _fget(). 
+ */ + if (onfiles > NDFILE) { + fo = (struct freetable *)&otable[onfiles]; + fdp0 = (struct filedesc0 *)fdp; + fo->ft_table = otable; + SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next); + } + if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { + bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap)); + if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) + free(fdp->fd_map, M_FILEDESC); + fdp->fd_map = nmap; + } + fdp->fd_nfiles = nnfiles; +} + +/* + * Allocate a file descriptor for the process. + */ +int +fdalloc(struct thread *td, int minfd, int *result) +{ + struct proc *p = td->td_proc; + struct filedesc *fdp = p->p_fd; + int fd = -1, maxfd; + + FILEDESC_XLOCK_ASSERT(fdp); + + if (fdp->fd_freefile > minfd) + minfd = fdp->fd_freefile; + + PROC_LOCK(p); + maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); + PROC_UNLOCK(p); + + /* + * Search the bitmap for a free descriptor. If none is found, try + * to grow the file table. Keep at it until we either get a file + * descriptor or run into process or system limits; fdgrowtable() + * may drop the filedesc lock, so we're in a race. + */ + for (;;) { + fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); + if (fd >= maxfd) + return (EMFILE); + if (fd < fdp->fd_nfiles) + break; + fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd)); + } + + /* + * Perform some sanity checks, then mark the file descriptor as + * used and return it to the caller. + */ + KASSERT(!fdisused(fdp, fd), + ("fd_first_free() returned non-free descriptor")); + KASSERT(fdp->fd_ofiles[fd] == NULL, + ("free descriptor isn't")); + fdp->fd_ofileflags[fd] = 0; /* XXX needed? */ + fdused(fdp, fd); + *result = fd; + return (0); +} + +/* + * Check to see whether n user file descriptors are available to the process + * p. 
+ */ +int +fdavail(struct thread *td, int n) +{ + struct proc *p = td->td_proc; + struct filedesc *fdp = td->td_proc->p_fd; + struct file **fpp; + int i, lim, last; + + FILEDESC_LOCK_ASSERT(fdp); + + PROC_LOCK(p); + lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); + PROC_UNLOCK(p); + if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) + return (1); + last = min(fdp->fd_nfiles, lim); + fpp = &fdp->fd_ofiles[fdp->fd_freefile]; + for (i = last - fdp->fd_freefile; --i >= 0; fpp++) { + if (*fpp == NULL && --n <= 0) + return (1); + } + return (0); +} + +/* + * Create a new open file structure and allocate a file decriptor for the + * process that refers to it. We add one reference to the file for the + * descriptor table and one reference for resultfp. This is to prevent us + * being preempted and the entry in the descriptor table closed after we + * release the FILEDESC lock. + */ +int +falloc(struct thread *td, struct file **resultfp, int *resultfd) +{ + struct proc *p = td->td_proc; + struct file *fp; + int error, i; + int maxuserfiles = maxfiles - (maxfiles / 20); + static struct timeval lastfail; + static int curfail; + + fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); + if ((openfiles >= maxuserfiles && + priv_check(td, PRIV_MAXFILES) != 0) || + openfiles >= maxfiles) { + if (ppsratecheck(&lastfail, &curfail, 1)) { + printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n", + td->td_ucred->cr_ruid); + } + uma_zfree(file_zone, fp); + return (ENFILE); + } + atomic_add_int(&openfiles, 1); + + /* + * If the process has file descriptor zero open, add the new file + * descriptor to the list of open files at that point, otherwise + * put it at the front of the list of open files. 
+ */ + refcount_init(&fp->f_count, 1); + if (resultfp) + fhold(fp); + fp->f_cred = crhold(td->td_ucred); + fp->f_ops = &badfileops; + fp->f_data = NULL; + fp->f_vnode = NULL; + FILEDESC_XLOCK(p->p_fd); + if ((error = fdalloc(td, 0, &i))) { + FILEDESC_XUNLOCK(p->p_fd); + + fdrop(fp, td); + if (resultfp) + fdrop(fp, td); + return (error); + } + p->p_fd->fd_ofiles[i] = fp; + FILEDESC_XUNLOCK(p->p_fd); + if (resultfp) + *resultfp = fp; + if (resultfd) + *resultfd = i; + return (0); +} + +/* + * Build a new filedesc structure from another. + * Copy the current, root, and jail root vnode references. + */ +struct filedesc * +fdinit(struct filedesc *fdp) +{ + struct filedesc0 *newfdp; + + newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO); + FILEDESC_LOCK_INIT(&newfdp->fd_fd); + if (fdp != NULL) { + FILEDESC_XLOCK(fdp); + newfdp->fd_fd.fd_cdir = fdp->fd_cdir; + if (newfdp->fd_fd.fd_cdir) + VREF(newfdp->fd_fd.fd_cdir); + newfdp->fd_fd.fd_rdir = fdp->fd_rdir; + if (newfdp->fd_fd.fd_rdir) + VREF(newfdp->fd_fd.fd_rdir); + newfdp->fd_fd.fd_jdir = fdp->fd_jdir; + if (newfdp->fd_fd.fd_jdir) + VREF(newfdp->fd_fd.fd_jdir); + FILEDESC_XUNLOCK(fdp); + } + + /* Create the file descriptor table. 
*/ + newfdp->fd_fd.fd_refcnt = 1; + newfdp->fd_fd.fd_holdcnt = 1; + newfdp->fd_fd.fd_cmask = CMASK; + newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; + newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; + newfdp->fd_fd.fd_nfiles = NDFILE; + newfdp->fd_fd.fd_map = newfdp->fd_dmap; + newfdp->fd_fd.fd_lastfile = -1; + return (&newfdp->fd_fd); +} + +static struct filedesc * +fdhold(struct proc *p) +{ + struct filedesc *fdp; + + mtx_lock(&fdesc_mtx); + fdp = p->p_fd; + if (fdp != NULL) + fdp->fd_holdcnt++; + mtx_unlock(&fdesc_mtx); + return (fdp); +} + +static void +fddrop(struct filedesc *fdp) +{ + struct filedesc0 *fdp0; + struct freetable *ft; + int i; + + mtx_lock(&fdesc_mtx); + i = --fdp->fd_holdcnt; + mtx_unlock(&fdesc_mtx); + if (i > 0) + return; + + FILEDESC_LOCK_DESTROY(fdp); + fdp0 = (struct filedesc0 *)fdp; + while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) { + SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next); + free(ft->ft_table, M_FILEDESC); + } + free(fdp, M_FILEDESC); +} + +/* + * Share a filedesc structure. + */ +struct filedesc * +fdshare(struct filedesc *fdp) +{ + + FILEDESC_XLOCK(fdp); + fdp->fd_refcnt++; + FILEDESC_XUNLOCK(fdp); + return (fdp); +} + +/* + * Unshare a filedesc structure, if necessary by making a copy + */ +void +fdunshare(struct proc *p, struct thread *td) +{ + + FILEDESC_XLOCK(p->p_fd); + if (p->p_fd->fd_refcnt > 1) { + struct filedesc *tmp; + + FILEDESC_XUNLOCK(p->p_fd); + tmp = fdcopy(p->p_fd); + fdfree(td); + p->p_fd = tmp; + } else + FILEDESC_XUNLOCK(p->p_fd); +} + +/* + * Copy a filedesc structure. A NULL pointer in returns a NULL reference, + * this is to ease callers, not catch errors. + */ +struct filedesc * +fdcopy(struct filedesc *fdp) +{ + struct filedesc *newfdp; + int i; + + /* Certain daemons might not have file descriptors. 
*/ + if (fdp == NULL) + return (NULL); + + newfdp = fdinit(fdp); + FILEDESC_SLOCK(fdp); + while (fdp->fd_lastfile >= newfdp->fd_nfiles) { + FILEDESC_SUNLOCK(fdp); + FILEDESC_XLOCK(newfdp); + fdgrowtable(newfdp, fdp->fd_lastfile + 1); + FILEDESC_XUNLOCK(newfdp); + FILEDESC_SLOCK(fdp); + } + /* copy everything except kqueue descriptors */ + newfdp->fd_freefile = -1; + for (i = 0; i <= fdp->fd_lastfile; ++i) { + if (fdisused(fdp, i) && + fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE && + fdp->fd_ofiles[i]->f_ops != &badfileops) { + newfdp->fd_ofiles[i] = fdp->fd_ofiles[i]; + newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; + fhold(newfdp->fd_ofiles[i]); + newfdp->fd_lastfile = i; + } else { + if (newfdp->fd_freefile == -1) + newfdp->fd_freefile = i; + } + } + newfdp->fd_cmask = fdp->fd_cmask; + FILEDESC_SUNLOCK(fdp); + FILEDESC_XLOCK(newfdp); + for (i = 0; i <= newfdp->fd_lastfile; ++i) + if (newfdp->fd_ofiles[i] != NULL) + fdused(newfdp, i); + if (newfdp->fd_freefile == -1) + newfdp->fd_freefile = i; + FILEDESC_XUNLOCK(newfdp); + return (newfdp); +} + +/* + * Release a filedesc structure. + */ +void +fdfree(struct thread *td) +{ + struct filedesc *fdp; + struct file **fpp; + int i, locked; + struct filedesc_to_leader *fdtol; + struct file *fp; + struct vnode *cdir, *jdir, *rdir, *vp; + struct flock lf; + + /* Certain daemons might not have file descriptors. 
*/ + fdp = td->td_proc->p_fd; + if (fdp == NULL) + return; + + /* Check for special need to clear POSIX style locks */ + fdtol = td->td_proc->p_fdtol; + if (fdtol != NULL) { + FILEDESC_XLOCK(fdp); + KASSERT(fdtol->fdl_refcount > 0, + ("filedesc_to_refcount botch: fdl_refcount=%d", + fdtol->fdl_refcount)); + if (fdtol->fdl_refcount == 1 && + (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { + for (i = 0, fpp = fdp->fd_ofiles; + i <= fdp->fd_lastfile; + i++, fpp++) { + if (*fpp == NULL || + (*fpp)->f_type != DTYPE_VNODE) + continue; + fp = *fpp; + fhold(fp); + FILEDESC_XUNLOCK(fdp); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = fp->f_vnode; + locked = VFS_LOCK_GIANT(vp->v_mount); + (void) VOP_ADVLOCK(vp, + (caddr_t)td->td_proc-> + p_leader, + F_UNLCK, + &lf, + F_POSIX); + VFS_UNLOCK_GIANT(locked); + FILEDESC_XLOCK(fdp); + fdrop(fp, td); + fpp = fdp->fd_ofiles + i; + } + } + retry: + if (fdtol->fdl_refcount == 1) { + if (fdp->fd_holdleaderscount > 0 && + (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { + /* + * close() or do_dup() has cleared a reference + * in a shared file descriptor table. + */ + fdp->fd_holdleaderswakeup = 1; + sx_sleep(&fdp->fd_holdleaderscount, + FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); + goto retry; + } + if (fdtol->fdl_holdcount > 0) { + /* + * Ensure that fdtol->fdl_leader remains + * valid in closef(). 
+ */ + fdtol->fdl_wakeup = 1; + sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, + "fdlhold", 0); + goto retry; + } + } + fdtol->fdl_refcount--; + if (fdtol->fdl_refcount == 0 && + fdtol->fdl_holdcount == 0) { + fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; + fdtol->fdl_prev->fdl_next = fdtol->fdl_next; + } else + fdtol = NULL; + td->td_proc->p_fdtol = NULL; + FILEDESC_XUNLOCK(fdp); + if (fdtol != NULL) + free(fdtol, M_FILEDESC_TO_LEADER); + } + FILEDESC_XLOCK(fdp); + i = --fdp->fd_refcnt; + FILEDESC_XUNLOCK(fdp); + if (i > 0) + return; + + fpp = fdp->fd_ofiles; + for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { + if (*fpp) { + FILEDESC_XLOCK(fdp); + fp = *fpp; + *fpp = NULL; + FILEDESC_XUNLOCK(fdp); + (void) closef(fp, td); + } + } + FILEDESC_XLOCK(fdp); + + /* XXX This should happen earlier. */ + mtx_lock(&fdesc_mtx); + td->td_proc->p_fd = NULL; + mtx_unlock(&fdesc_mtx); + + if (fdp->fd_nfiles > NDFILE) + free(fdp->fd_ofiles, M_FILEDESC); + if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) + free(fdp->fd_map, M_FILEDESC); + + fdp->fd_nfiles = 0; + + cdir = fdp->fd_cdir; + fdp->fd_cdir = NULL; + rdir = fdp->fd_rdir; + fdp->fd_rdir = NULL; + jdir = fdp->fd_jdir; + fdp->fd_jdir = NULL; + FILEDESC_XUNLOCK(fdp); + + if (cdir) { + locked = VFS_LOCK_GIANT(cdir->v_mount); + vrele(cdir); + VFS_UNLOCK_GIANT(locked); + } + if (rdir) { + locked = VFS_LOCK_GIANT(rdir->v_mount); + vrele(rdir); + VFS_UNLOCK_GIANT(locked); + } + if (jdir) { + locked = VFS_LOCK_GIANT(jdir->v_mount); + vrele(jdir); + VFS_UNLOCK_GIANT(locked); + } + + fddrop(fdp); +} + +/* + * For setugid programs, we don't want to people to use that setugidness + * to generate error messages which write to a file which otherwise would + * otherwise be off-limits to the process. We check for filesystems where + * the vnode can change out from under us after execve (like [lin]procfs). + * + * Since setugidsafety calls this only for fd 0, 1 and 2, this check is + * sufficient. 
We also don't check for setugidness since we know we are. + */ +static int +is_unsafe(struct file *fp) +{ + if (fp->f_type == DTYPE_VNODE) { + struct vnode *vp = fp->f_vnode; + + if ((vp->v_vflag & VV_PROCDEP) != 0) + return (1); + } + return (0); +} + +/* + * Make this setguid thing safe, if at all possible. + */ +void +setugidsafety(struct thread *td) +{ + struct filedesc *fdp; + int i; + + /* Certain daemons might not have file descriptors. */ + fdp = td->td_proc->p_fd; + if (fdp == NULL) + return; + + /* + * Note: fdp->fd_ofiles may be reallocated out from under us while + * we are blocked in a close. Be careful! + */ + FILEDESC_XLOCK(fdp); + for (i = 0; i <= fdp->fd_lastfile; i++) { + if (i > 2) + break; + if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { + struct file *fp; + + knote_fdclose(td, i); + /* + * NULL-out descriptor prior to close to avoid + * a race while close blocks. + */ + fp = fdp->fd_ofiles[i]; + fdp->fd_ofiles[i] = NULL; + fdp->fd_ofileflags[i] = 0; + fdunused(fdp, i); + FILEDESC_XUNLOCK(fdp); + (void) closef(fp, td); + FILEDESC_XLOCK(fdp); + } + } + FILEDESC_XUNLOCK(fdp); +} + +/* + * If a specific file object occupies a specific file descriptor, close the + * file descriptor entry and drop a reference on the file object. This is a + * convenience function to handle a subsequent error in a function that calls + * falloc() that handles the race that another thread might have closed the + * file descriptor out from under the thread creating the file object. + */ +void +fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td) +{ + + FILEDESC_XLOCK(fdp); + if (fdp->fd_ofiles[idx] == fp) { + fdp->fd_ofiles[idx] = NULL; + fdunused(fdp, idx); + FILEDESC_XUNLOCK(fdp); + fdrop(fp, td); + } else + FILEDESC_XUNLOCK(fdp); +} + +/* + * Close any files on exec? + */ +void +fdcloseexec(struct thread *td) +{ + struct filedesc *fdp; + int i; + + /* Certain daemons might not have file descriptors. 
*/ + fdp = td->td_proc->p_fd; + if (fdp == NULL) + return; + + FILEDESC_XLOCK(fdp); + + /* + * We cannot cache fd_ofiles or fd_ofileflags since operations + * may block and rip them out from under us. + */ + for (i = 0; i <= fdp->fd_lastfile; i++) { + if (fdp->fd_ofiles[i] != NULL && + (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE || + (fdp->fd_ofileflags[i] & UF_EXCLOSE))) { + struct file *fp; + + knote_fdclose(td, i); + /* + * NULL-out descriptor prior to close to avoid + * a race while close blocks. + */ + fp = fdp->fd_ofiles[i]; + fdp->fd_ofiles[i] = NULL; + fdp->fd_ofileflags[i] = 0; + fdunused(fdp, i); + if (fp->f_type == DTYPE_MQUEUE) + mq_fdclose(td, i, fp); + FILEDESC_XUNLOCK(fdp); + (void) closef(fp, td); + FILEDESC_XLOCK(fdp); + } + } + FILEDESC_XUNLOCK(fdp); +} + +/* + * It is unsafe for set[ug]id processes to be started with file + * descriptors 0..2 closed, as these descriptors are given implicit + * significance in the Standard C library. fdcheckstd() will create a + * descriptor referencing /dev/null for each of stdin, stdout, and + * stderr that is not already open. + */ +int +fdcheckstd(struct thread *td) +{ + struct filedesc *fdp; + register_t retval, save; + int i, error, devnull; + + fdp = td->td_proc->p_fd; + if (fdp == NULL) + return (0); + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); + devnull = -1; + error = 0; + for (i = 0; i < 3; i++) { + if (fdp->fd_ofiles[i] != NULL) + continue; + if (devnull < 0) { + save = td->td_retval[0]; + error = kern_open(td, "/dev/null", UIO_SYSSPACE, + O_RDWR, 0); + devnull = td->td_retval[0]; + KASSERT(devnull == i, ("oof, we didn't get our fd")); + td->td_retval[0] = save; + if (error) + break; + } else { + error = do_dup(td, DUP_FIXED, devnull, i, &retval); + if (error != 0) + break; + } + } + return (error); +} + +/* + * Internal form of close. Decrement reference count on file structure. + * Note: td may be NULL when closing a file that was being passed in a + * message. 
+ * + * XXXRW: Giant is not required for the caller, but often will be held; this + * makes it moderately likely the Giant will be recursed in the VFS case. + */ +int +closef(struct file *fp, struct thread *td) +{ + struct vnode *vp; + struct flock lf; + struct filedesc_to_leader *fdtol; + struct filedesc *fdp; + + /* + * POSIX record locking dictates that any close releases ALL + * locks owned by this process. This is handled by setting + * a flag in the unlock to free ONLY locks obeying POSIX + * semantics, and not to free BSD-style file locks. + * If the descriptor was in a message, POSIX-style locks + * aren't passed with the descriptor, and the thread pointer + * will be NULL. Callers should be careful only to pass a + * NULL thread pointer when there really is no owning + * context that might have locks, or the locks will be + * leaked. + */ + if (fp->f_type == DTYPE_VNODE && td != NULL) { + int vfslocked; + + vp = fp->f_vnode; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, + F_UNLCK, &lf, F_POSIX); + } + fdtol = td->td_proc->p_fdtol; + if (fdtol != NULL) { + /* + * Handle special case where file descriptor table is + * shared between multiple process leaders. 
+ */ + fdp = td->td_proc->p_fd; + FILEDESC_XLOCK(fdp); + for (fdtol = fdtol->fdl_next; + fdtol != td->td_proc->p_fdtol; + fdtol = fdtol->fdl_next) { + if ((fdtol->fdl_leader->p_flag & + P_ADVLOCK) == 0) + continue; + fdtol->fdl_holdcount++; + FILEDESC_XUNLOCK(fdp); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = fp->f_vnode; + (void) VOP_ADVLOCK(vp, + (caddr_t)fdtol->fdl_leader, + F_UNLCK, &lf, F_POSIX); + FILEDESC_XLOCK(fdp); + fdtol->fdl_holdcount--; + if (fdtol->fdl_holdcount == 0 && + fdtol->fdl_wakeup != 0) { + fdtol->fdl_wakeup = 0; + wakeup(fdtol); + } + } + FILEDESC_XUNLOCK(fdp); + } + VFS_UNLOCK_GIANT(vfslocked); + } + return (fdrop(fp, td)); +} + +/* + * Initialize the file pointer with the specified properties. + * + * The ops are set with release semantics to be certain that the flags, type, + * and data are visible when ops is. This is to prevent ops methods from being + * called with bad data. + */ +void +finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) +{ + fp->f_data = data; + fp->f_flag = flag; + fp->f_type = type; + atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); +} +#endif /* __rtems__ */ + +struct file * +fget_unlocked(struct filedesc *fdp, int fd) +{ + struct file *fp; + u_int count; + + if (fd < 0 || fd >= fdp->fd_nfiles) + return (NULL); + /* + * Fetch the descriptor locklessly. We avoid fdrop() races by + * never raising a refcount above 0. To accomplish this we have + * to use a cmpset loop rather than an atomic_add. The descriptor + * must be re-verified once we acquire a reference to be certain + * that the identity is still correct and we did not lose a race + * due to preemption. + */ + for (;;) { + fp = fdp->fd_ofiles[fd]; + if (fp == NULL) + break; + count = fp->f_count; + if (count == 0) + continue; + /* + * Use an acquire barrier to prevent caching of fd_ofiles + * so it is refreshed for verification. 
+ */ + if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1) + continue; + if (fp == fdp->fd_ofiles[fd]) + break; + fdrop(fp, curthread); + } + + return (fp); +} + +/* + * Extract the file pointer associated with the specified descriptor for the + * current user process. + * + * If the descriptor doesn't exist or doesn't match 'flags', EBADF is + * returned. + * + * If an error occured the non-zero error is returned and *fpp is set to + * NULL. Otherwise *fpp is held and set and zero is returned. Caller is + * responsible for fdrop(). + */ +static __inline int +_fget(struct thread *td, int fd, struct file **fpp, int flags) +{ + struct filedesc *fdp; + struct file *fp; + + *fpp = NULL; + if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) + return (EBADF); + if ((fp = fget_unlocked(fdp, fd)) == NULL) + return (EBADF); + if (fp->f_ops == &badfileops) { + fdrop(fp, td); + return (EBADF); + } + /* + * FREAD and FWRITE failure return EBADF as per POSIX. + * + * Only one flag, or 0, may be specified. + */ + if ((flags == FREAD && (fp->f_flag & FREAD) == 0) || + (flags == FWRITE && (fp->f_flag & FWRITE) == 0)) { + fdrop(fp, td); + return (EBADF); + } + *fpp = fp; + return (0); +} + +int +fget(struct thread *td, int fd, struct file **fpp) +{ + + return(_fget(td, fd, fpp, 0)); +} + +int +fget_read(struct thread *td, int fd, struct file **fpp) +{ + + return(_fget(td, fd, fpp, FREAD)); +} + +#ifndef __rtems__ +int +fget_write(struct thread *td, int fd, struct file **fpp) +{ + + return(_fget(td, fd, fpp, FWRITE)); +} + +/* + * Like fget() but loads the underlying vnode, or returns an error if the + * descriptor does not represent a vnode. Note that pipes use vnodes but + * never have VM objects. The returned vnode will be vref()'d. + * + * XXX: what about the unused flags ? 
+ */ +static __inline int +_fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) +{ + struct file *fp; + int error; + + *vpp = NULL; + if ((error = _fget(td, fd, &fp, flags)) != 0) + return (error); + if (fp->f_vnode == NULL) { + error = EINVAL; + } else { + *vpp = fp->f_vnode; + vref(*vpp); + } + fdrop(fp, td); + + return (error); +} + +int +fgetvp(struct thread *td, int fd, struct vnode **vpp) +{ + + return (_fgetvp(td, fd, vpp, 0)); +} + +int +fgetvp_read(struct thread *td, int fd, struct vnode **vpp) +{ + + return (_fgetvp(td, fd, vpp, FREAD)); +} + +#ifdef notyet +int +fgetvp_write(struct thread *td, int fd, struct vnode **vpp) +{ + + return (_fgetvp(td, fd, vpp, FWRITE)); +} +#endif + +/* + * Like fget() but loads the underlying socket, or returns an error if the + * descriptor does not represent a socket. + * + * We bump the ref count on the returned socket. XXX Also obtain the SX lock + * in the future. + * + * Note: fgetsock() and fputsock() are deprecated, as consumers should rely + * on their file descriptor reference to prevent the socket from being free'd + * during use. + */ +int +fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) +{ + struct file *fp; + int error; + + *spp = NULL; + if (fflagp != NULL) + *fflagp = 0; + if ((error = _fget(td, fd, &fp, 0)) != 0) + return (error); + if (fp->f_type != DTYPE_SOCKET) { + error = ENOTSOCK; + } else { + *spp = fp->f_data; + if (fflagp) + *fflagp = fp->f_flag; + SOCK_LOCK(*spp); + soref(*spp); + SOCK_UNLOCK(*spp); + } + fdrop(fp, td); + + return (error); +} + +/* + * Drop the reference count on the socket and XXX release the SX lock in the + * future. The last reference closes the socket. + * + * Note: fputsock() is deprecated, see comment for fgetsock(). + */ +void +fputsock(struct socket *so) +{ + + ACCEPT_LOCK(); + SOCK_LOCK(so); + sorele(so); +} +#endif /* __rtems__ */ + +/* + * Handle the last reference to a file being closed. 
+ */ +int +_fdrop(struct file *fp, struct thread *td) +{ +#ifdef __rtems__ + panic("fdrop: RTEMS unsupported"); + +#else /* __rtems__ */ + int error; + + error = 0; + if (fp->f_count != 0) + panic("fdrop: count %d", fp->f_count); + if (fp->f_ops != &badfileops) + error = fo_close(fp, td); + /* + * The f_cdevpriv cannot be assigned non-NULL value while we + * are destroying the file. + */ + if (fp->f_cdevpriv != NULL) + devfs_fpdrop(fp); + atomic_subtract_int(&openfiles, 1); + crfree(fp->f_cred); + uma_zfree(file_zone, fp); + + return (error); +#endif /* __rtems__ */ +} + +#ifndef __rtems__ +/* + * Apply an advisory lock on a file descriptor. + * + * Just attempt to get a record lock of the requested type on the entire file + * (l_whence = SEEK_SET, l_start = 0, l_len = 0). + */ +#ifndef _SYS_SYSPROTO_HH_ +struct flock_args { + int fd; + int how; +}; +#endif +/* ARGSUSED */ +int +flock(struct thread *td, struct flock_args *uap) +{ + struct file *fp; + struct vnode *vp; + struct flock lf; + int vfslocked; + int error; + + if ((error = fget(td, uap->fd, &fp)) != 0) + return (error); + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); + return (EOPNOTSUPP); + } + + vp = fp->f_vnode; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (uap->how & LOCK_UN) { + lf.l_type = F_UNLCK; + atomic_clear_int(&fp->f_flag, FHASLOCK); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); + goto done2; + } + if (uap->how & LOCK_EX) + lf.l_type = F_WRLCK; + else if (uap->how & LOCK_SH) + lf.l_type = F_RDLCK; + else { + error = EBADF; + goto done2; + } + atomic_set_int(&fp->f_flag, FHASLOCK); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); +done2: + fdrop(fp, td); + VFS_UNLOCK_GIANT(vfslocked); + return (error); +} +/* + * Duplicate the specified descriptor to a free descriptor. 
+ */ +int +dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error) +{ + struct file *wfp; + struct file *fp; + + /* + * If the to-be-dup'd fd number is greater than the allowed number + * of file descriptors, or the fd to be dup'd has already been + * closed, then reject. + */ + FILEDESC_XLOCK(fdp); + if (dfd < 0 || dfd >= fdp->fd_nfiles || + (wfp = fdp->fd_ofiles[dfd]) == NULL) { + FILEDESC_XUNLOCK(fdp); + return (EBADF); + } + + /* + * There are two cases of interest here. + * + * For ENODEV simply dup (dfd) to file descriptor (indx) and return. + * + * For ENXIO steal away the file structure from (dfd) and store it in + * (indx). (dfd) is effectively closed by this operation. + * + * Any other error code is just returned. + */ + switch (error) { + case ENODEV: + /* + * Check that the mode the file is being opened for is a + * subset of the mode of the existing descriptor. + */ + if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { + FILEDESC_XUNLOCK(fdp); + return (EACCES); + } + fp = fdp->fd_ofiles[indx]; + fdp->fd_ofiles[indx] = wfp; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + if (fp == NULL) + fdused(fdp, indx); + fhold(wfp); + FILEDESC_XUNLOCK(fdp); + if (fp != NULL) + /* + * We now own the reference to fp that the ofiles[] + * array used to own. Release it. + */ + fdrop(fp, td); + return (0); + + case ENXIO: + /* + * Steal away the file pointer from dfd and stuff it into indx. + */ + fp = fdp->fd_ofiles[indx]; + fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; + fdp->fd_ofiles[dfd] = NULL; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + fdp->fd_ofileflags[dfd] = 0; + fdunused(fdp, dfd); + if (fp == NULL) + fdused(fdp, indx); + FILEDESC_XUNLOCK(fdp); + + /* + * We now own the reference to fp that the ofiles[] array + * used to own. Release it. 
+ */ + if (fp != NULL) + fdrop(fp, td); + return (0); + + default: + FILEDESC_XUNLOCK(fdp); + return (error); + } + /* NOTREACHED */ +} + +/* + * Scan all active processes and prisons to see if any of them have a current + * or root directory of `olddp'. If so, replace them with the new mount point. + */ +void +mountcheckdirs(struct vnode *olddp, struct vnode *newdp) +{ + struct filedesc *fdp; + struct prison *pr; + struct proc *p; + int nrele; + + if (vrefcnt(olddp) == 1) + return; + nrele = 0; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + fdp = fdhold(p); + if (fdp == NULL) + continue; + FILEDESC_XLOCK(fdp); + if (fdp->fd_cdir == olddp) { + vref(newdp); + fdp->fd_cdir = newdp; + nrele++; + } + if (fdp->fd_rdir == olddp) { + vref(newdp); + fdp->fd_rdir = newdp; + nrele++; + } + if (fdp->fd_jdir == olddp) { + vref(newdp); + fdp->fd_jdir = newdp; + nrele++; + } + FILEDESC_XUNLOCK(fdp); + fddrop(fdp); + } + sx_sunlock(&allproc_lock); + if (rootvnode == olddp) { + vref(newdp); + rootvnode = newdp; + nrele++; + } + mtx_lock(&prison0.pr_mtx); + if (prison0.pr_root == olddp) { + vref(newdp); + prison0.pr_root = newdp; + nrele++; + } + mtx_unlock(&prison0.pr_mtx); + sx_slock(&allprison_lock); + TAILQ_FOREACH(pr, &allprison, pr_list) { + mtx_lock(&pr->pr_mtx); + if (pr->pr_root == olddp) { + vref(newdp); + pr->pr_root = newdp; + nrele++; + } + mtx_unlock(&pr->pr_mtx); + } + sx_sunlock(&allprison_lock); + while (nrele--) + vrele(olddp); +} + +struct filedesc_to_leader * +filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) +{ + struct filedesc_to_leader *fdtol; + + fdtol = malloc(sizeof(struct filedesc_to_leader), + M_FILEDESC_TO_LEADER, + M_WAITOK); + fdtol->fdl_refcount = 1; + fdtol->fdl_holdcount = 0; + fdtol->fdl_wakeup = 0; + fdtol->fdl_leader = leader; + if (old != NULL) { + FILEDESC_XLOCK(fdp); + fdtol->fdl_next = old->fdl_next; + fdtol->fdl_prev = old; + old->fdl_next = fdtol; + fdtol->fdl_next->fdl_prev = 
fdtol; + FILEDESC_XUNLOCK(fdp); + } else { + fdtol->fdl_next = fdtol; + fdtol->fdl_prev = fdtol; + } + return (fdtol); +} + +/* + * Get file structures globally. + */ +static int +sysctl_kern_file(SYSCTL_HANDLER_ARGS) +{ + struct xfile xf; + struct filedesc *fdp; + struct file *fp; + struct proc *p; + int error, n; + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + if (req->oldptr == NULL) { + n = 0; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + fdp = fdhold(p); + if (fdp == NULL) + continue; + /* overestimates sparse tables. */ + if (fdp->fd_lastfile > 0) + n += fdp->fd_lastfile; + fddrop(fdp); + } + sx_sunlock(&allproc_lock); + return (SYSCTL_OUT(req, 0, n * sizeof(xf))); + } + error = 0; + bzero(&xf, sizeof(xf)); + xf.xf_size = sizeof(xf); + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + PROC_LOCK(p); + if (p_cansee(req->td, p) != 0) { + PROC_UNLOCK(p); + continue; + } + xf.xf_pid = p->p_pid; + xf.xf_uid = p->p_ucred->cr_uid; + PROC_UNLOCK(p); + fdp = fdhold(p); + if (fdp == NULL) + continue; + FILEDESC_SLOCK(fdp); + for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) { + if ((fp = fdp->fd_ofiles[n]) == NULL) + continue; + xf.xf_fd = n; + xf.xf_file = fp; + xf.xf_data = fp->f_data; + xf.xf_vnode = fp->f_vnode; + xf.xf_type = fp->f_type; + xf.xf_count = fp->f_count; + xf.xf_msgcount = 0; + xf.xf_offset = fp->f_offset; + xf.xf_flag = fp->f_flag; + error = SYSCTL_OUT(req, &xf, sizeof(xf)); + if (error) + break; + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); + if (error) + break; + } + sx_sunlock(&allproc_lock); + return (error); +} + +SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); + +#ifdef KINFO_OFILE_SIZE +CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); +#endif + +#ifdef COMPAT_FREEBSD7 +static int +export_vnode_for_osysctl(struct vnode *vp, 
int type, + struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req) +{ + int error; + char *fullpath, *freepath; + int vfslocked; + + bzero(kif, sizeof(*kif)); + kif->kf_structsize = sizeof(*kif); + + vref(vp); + kif->kf_fd = type; + kif->kf_type = KF_TYPE_VNODE; + /* This function only handles directories. */ + if (vp->v_type != VDIR) { + vrele(vp); + return (ENOTDIR); + } + kif->kf_vnode_type = KF_VTYPE_VDIR; + + /* + * This is not a true file descriptor, so we set a bogus refcount + * and offset to indicate these fields should be ignored. + */ + kif->kf_ref_count = -1; + kif->kf_offset = -1; + + freepath = NULL; + fullpath = "-"; + FILEDESC_SUNLOCK(fdp); + vn_fullpath(curthread, vp, &fullpath, &freepath); + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vrele(vp); + VFS_UNLOCK_GIANT(vfslocked); + strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); + if (freepath != NULL) + free(freepath, M_TEMP); + error = SYSCTL_OUT(req, kif, sizeof(*kif)); + FILEDESC_SLOCK(fdp); + return (error); +} + +/* + * Get per-process file descriptors for use by procstat(1), et al. 
+ */ +static int +sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) +{ + char *fullpath, *freepath; + struct kinfo_ofile *kif; + struct filedesc *fdp; + int error, i, *name; + struct socket *so; + struct vnode *vp; + struct file *fp; + struct proc *p; + struct tty *tp; + int vfslocked; + + name = (int *)arg1; + if ((p = pfind((pid_t)name[0])) == NULL) + return (ESRCH); + if ((error = p_candebug(curthread, p))) { + PROC_UNLOCK(p); + return (error); + } + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + return (ENOENT); + kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); + FILEDESC_SLOCK(fdp); + if (fdp->fd_cdir != NULL) + export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, + fdp, req); + if (fdp->fd_rdir != NULL) + export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, + fdp, req); + if (fdp->fd_jdir != NULL) + export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, + fdp, req); + for (i = 0; i < fdp->fd_nfiles; i++) { + if ((fp = fdp->fd_ofiles[i]) == NULL) + continue; + bzero(kif, sizeof(*kif)); + kif->kf_structsize = sizeof(*kif); + vp = NULL; + so = NULL; + tp = NULL; + kif->kf_fd = i; + switch (fp->f_type) { + case DTYPE_VNODE: + kif->kf_type = KF_TYPE_VNODE; + vp = fp->f_vnode; + break; + + case DTYPE_SOCKET: + kif->kf_type = KF_TYPE_SOCKET; + so = fp->f_data; + break; + + case DTYPE_PIPE: + kif->kf_type = KF_TYPE_PIPE; + break; + + case DTYPE_FIFO: + kif->kf_type = KF_TYPE_FIFO; + vp = fp->f_vnode; + break; + + case DTYPE_KQUEUE: + kif->kf_type = KF_TYPE_KQUEUE; + break; + + case DTYPE_CRYPTO: + kif->kf_type = KF_TYPE_CRYPTO; + break; + + case DTYPE_MQUEUE: + kif->kf_type = KF_TYPE_MQUEUE; + break; + + case DTYPE_SHM: + kif->kf_type = KF_TYPE_SHM; + break; + + case DTYPE_SEM: + kif->kf_type = KF_TYPE_SEM; + break; + + case DTYPE_PTS: + kif->kf_type = KF_TYPE_PTS; + tp = fp->f_data; + break; + + default: + kif->kf_type = KF_TYPE_UNKNOWN; + break; + } + kif->kf_ref_count = fp->f_count; + if (fp->f_flag & FREAD) + kif->kf_flags |= 
KF_FLAG_READ; + if (fp->f_flag & FWRITE) + kif->kf_flags |= KF_FLAG_WRITE; + if (fp->f_flag & FAPPEND) + kif->kf_flags |= KF_FLAG_APPEND; + if (fp->f_flag & FASYNC) + kif->kf_flags |= KF_FLAG_ASYNC; + if (fp->f_flag & FFSYNC) + kif->kf_flags |= KF_FLAG_FSYNC; + if (fp->f_flag & FNONBLOCK) + kif->kf_flags |= KF_FLAG_NONBLOCK; + if (fp->f_flag & O_DIRECT) + kif->kf_flags |= KF_FLAG_DIRECT; + if (fp->f_flag & FHASLOCK) + kif->kf_flags |= KF_FLAG_HASLOCK; + kif->kf_offset = fp->f_offset; + if (vp != NULL) { + vref(vp); + switch (vp->v_type) { + case VNON: + kif->kf_vnode_type = KF_VTYPE_VNON; + break; + case VREG: + kif->kf_vnode_type = KF_VTYPE_VREG; + break; + case VDIR: + kif->kf_vnode_type = KF_VTYPE_VDIR; + break; + case VBLK: + kif->kf_vnode_type = KF_VTYPE_VBLK; + break; + case VCHR: + kif->kf_vnode_type = KF_VTYPE_VCHR; + break; + case VLNK: + kif->kf_vnode_type = KF_VTYPE_VLNK; + break; + case VSOCK: + kif->kf_vnode_type = KF_VTYPE_VSOCK; + break; + case VFIFO: + kif->kf_vnode_type = KF_VTYPE_VFIFO; + break; + case VBAD: + kif->kf_vnode_type = KF_VTYPE_VBAD; + break; + default: + kif->kf_vnode_type = KF_VTYPE_UNKNOWN; + break; + } + /* + * It is OK to drop the filedesc lock here as we will + * re-validate and re-evaluate its properties when + * the loop continues. 
+ */ + freepath = NULL; + fullpath = "-"; + FILEDESC_SUNLOCK(fdp); + vn_fullpath(curthread, vp, &fullpath, &freepath); + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vrele(vp); + VFS_UNLOCK_GIANT(vfslocked); + strlcpy(kif->kf_path, fullpath, + sizeof(kif->kf_path)); + if (freepath != NULL) + free(freepath, M_TEMP); + FILEDESC_SLOCK(fdp); + } + if (so != NULL) { + struct sockaddr *sa; + + if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa) + == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { + bcopy(sa, &kif->kf_sa_local, sa->sa_len); + free(sa, M_SONAME); + } + if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa) + == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { + bcopy(sa, &kif->kf_sa_peer, sa->sa_len); + free(sa, M_SONAME); + } + kif->kf_sock_domain = + so->so_proto->pr_domain->dom_family; + kif->kf_sock_type = so->so_type; + kif->kf_sock_protocol = so->so_proto->pr_protocol; + } + if (tp != NULL) { + strlcpy(kif->kf_path, tty_devname(tp), + sizeof(kif->kf_path)); + } + error = SYSCTL_OUT(req, kif, sizeof(*kif)); + if (error) + break; + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); + free(kif, M_TEMP); + return (0); +} + +static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD, + sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); +#endif /* COMPAT_FREEBSD7 */ + +#ifdef KINFO_FILE_SIZE +CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); +#endif + +static int +export_vnode_for_sysctl(struct vnode *vp, int type, + struct kinfo_file *kif, struct filedesc *fdp, struct sysctl_req *req) +{ + int error; + char *fullpath, *freepath; + int vfslocked; + + bzero(kif, sizeof(*kif)); + + vref(vp); + kif->kf_fd = type; + kif->kf_type = KF_TYPE_VNODE; + /* This function only handles directories. */ + if (vp->v_type != VDIR) { + vrele(vp); + return (ENOTDIR); + } + kif->kf_vnode_type = KF_VTYPE_VDIR; + + /* + * This is not a true file descriptor, so we set a bogus refcount + * and offset to indicate these fields should be ignored. 
+ */ + kif->kf_ref_count = -1; + kif->kf_offset = -1; + + freepath = NULL; + fullpath = "-"; + FILEDESC_SUNLOCK(fdp); + vn_fullpath(curthread, vp, &fullpath, &freepath); + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vrele(vp); + VFS_UNLOCK_GIANT(vfslocked); + strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); + if (freepath != NULL) + free(freepath, M_TEMP); + /* Pack record size down */ + kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + + strlen(kif->kf_path) + 1; + kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); + error = SYSCTL_OUT(req, kif, kif->kf_structsize); + FILEDESC_SLOCK(fdp); + return (error); +} + +/* + * Get per-process file descriptors for use by procstat(1), et al. + */ +static int +sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) +{ + char *fullpath, *freepath; + struct kinfo_file *kif; + struct filedesc *fdp; + int error, i, *name; + struct socket *so; + struct vnode *vp; + struct file *fp; + struct proc *p; + struct tty *tp; + int vfslocked; + size_t oldidx; + + name = (int *)arg1; + if ((p = pfind((pid_t)name[0])) == NULL) + return (ESRCH); + if ((error = p_candebug(curthread, p))) { + PROC_UNLOCK(p); + return (error); + } + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + return (ENOENT); + kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); + FILEDESC_SLOCK(fdp); + if (fdp->fd_cdir != NULL) + export_vnode_for_sysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, + fdp, req); + if (fdp->fd_rdir != NULL) + export_vnode_for_sysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, + fdp, req); + if (fdp->fd_jdir != NULL) + export_vnode_for_sysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, + fdp, req); + for (i = 0; i < fdp->fd_nfiles; i++) { + if ((fp = fdp->fd_ofiles[i]) == NULL) + continue; + bzero(kif, sizeof(*kif)); + vp = NULL; + so = NULL; + tp = NULL; + kif->kf_fd = i; + switch (fp->f_type) { + case DTYPE_VNODE: + kif->kf_type = KF_TYPE_VNODE; + vp = fp->f_vnode; + break; + + case DTYPE_SOCKET: + kif->kf_type = KF_TYPE_SOCKET; + so = 
fp->f_data; + break; + + case DTYPE_PIPE: + kif->kf_type = KF_TYPE_PIPE; + break; + + case DTYPE_FIFO: + kif->kf_type = KF_TYPE_FIFO; + vp = fp->f_vnode; + break; + + case DTYPE_KQUEUE: + kif->kf_type = KF_TYPE_KQUEUE; + break; + + case DTYPE_CRYPTO: + kif->kf_type = KF_TYPE_CRYPTO; + break; + + case DTYPE_MQUEUE: + kif->kf_type = KF_TYPE_MQUEUE; + break; + + case DTYPE_SHM: + kif->kf_type = KF_TYPE_SHM; + break; + + case DTYPE_SEM: + kif->kf_type = KF_TYPE_SEM; + break; + + case DTYPE_PTS: + kif->kf_type = KF_TYPE_PTS; + tp = fp->f_data; + break; + + default: + kif->kf_type = KF_TYPE_UNKNOWN; + break; + } + kif->kf_ref_count = fp->f_count; + if (fp->f_flag & FREAD) + kif->kf_flags |= KF_FLAG_READ; + if (fp->f_flag & FWRITE) + kif->kf_flags |= KF_FLAG_WRITE; + if (fp->f_flag & FAPPEND) + kif->kf_flags |= KF_FLAG_APPEND; + if (fp->f_flag & FASYNC) + kif->kf_flags |= KF_FLAG_ASYNC; + if (fp->f_flag & FFSYNC) + kif->kf_flags |= KF_FLAG_FSYNC; + if (fp->f_flag & FNONBLOCK) + kif->kf_flags |= KF_FLAG_NONBLOCK; + if (fp->f_flag & O_DIRECT) + kif->kf_flags |= KF_FLAG_DIRECT; + if (fp->f_flag & FHASLOCK) + kif->kf_flags |= KF_FLAG_HASLOCK; + kif->kf_offset = fp->f_offset; + if (vp != NULL) { + vref(vp); + switch (vp->v_type) { + case VNON: + kif->kf_vnode_type = KF_VTYPE_VNON; + break; + case VREG: + kif->kf_vnode_type = KF_VTYPE_VREG; + break; + case VDIR: + kif->kf_vnode_type = KF_VTYPE_VDIR; + break; + case VBLK: + kif->kf_vnode_type = KF_VTYPE_VBLK; + break; + case VCHR: + kif->kf_vnode_type = KF_VTYPE_VCHR; + break; + case VLNK: + kif->kf_vnode_type = KF_VTYPE_VLNK; + break; + case VSOCK: + kif->kf_vnode_type = KF_VTYPE_VSOCK; + break; + case VFIFO: + kif->kf_vnode_type = KF_VTYPE_VFIFO; + break; + case VBAD: + kif->kf_vnode_type = KF_VTYPE_VBAD; + break; + default: + kif->kf_vnode_type = KF_VTYPE_UNKNOWN; + break; + } + /* + * It is OK to drop the filedesc lock here as we will + * re-validate and re-evaluate its properties when + * the loop continues. 
+ */ + freepath = NULL; + fullpath = "-"; + FILEDESC_SUNLOCK(fdp); + vn_fullpath(curthread, vp, &fullpath, &freepath); + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + vrele(vp); + VFS_UNLOCK_GIANT(vfslocked); + strlcpy(kif->kf_path, fullpath, + sizeof(kif->kf_path)); + if (freepath != NULL) + free(freepath, M_TEMP); + FILEDESC_SLOCK(fdp); + } + if (so != NULL) { + struct sockaddr *sa; + + if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa) + == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { + bcopy(sa, &kif->kf_sa_local, sa->sa_len); + free(sa, M_SONAME); + } + if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa) + == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { + bcopy(sa, &kif->kf_sa_peer, sa->sa_len); + free(sa, M_SONAME); + } + kif->kf_sock_domain = + so->so_proto->pr_domain->dom_family; + kif->kf_sock_type = so->so_type; + kif->kf_sock_protocol = so->so_proto->pr_protocol; + } + if (tp != NULL) { + strlcpy(kif->kf_path, tty_devname(tp), + sizeof(kif->kf_path)); + } + /* Pack record size down */ + kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + + strlen(kif->kf_path) + 1; + kif->kf_structsize = roundup(kif->kf_structsize, + sizeof(uint64_t)); + oldidx = req->oldidx; + error = SYSCTL_OUT(req, kif, kif->kf_structsize); + if (error) { + if (error == ENOMEM) { + /* + * The hack to keep the ABI of sysctl + * kern.proc.filedesc intact, but not + * to account a partially copied + * kinfo_file into the oldidx. + */ + req->oldidx = oldidx; + error = 0; + } + break; + } + } + FILEDESC_SUNLOCK(fdp); + fddrop(fdp); + free(kif, M_TEMP); + return (error); +} + +static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD, + sysctl_kern_proc_filedesc, "Process filedesc entries"); + +#ifdef DDB +/* + * For the purposes of debugging, generate a human-readable string for the + * file type. 
+ */ +static const char * +file_type_to_name(short type) +{ + + switch (type) { + case 0: + return ("zero"); + case DTYPE_VNODE: + return ("vnod"); + case DTYPE_SOCKET: + return ("sock"); + case DTYPE_PIPE: + return ("pipe"); + case DTYPE_FIFO: + return ("fifo"); + case DTYPE_KQUEUE: + return ("kque"); + case DTYPE_CRYPTO: + return ("crpt"); + case DTYPE_MQUEUE: + return ("mque"); + case DTYPE_SHM: + return ("shm"); + case DTYPE_SEM: + return ("ksem"); + default: + return ("unkn"); + } +} + +/* + * For the purposes of debugging, identify a process (if any, perhaps one of + * many) that references the passed file in its file descriptor array. Return + * NULL if none. + */ +static struct proc * +file_to_first_proc(struct file *fp) +{ + struct filedesc *fdp; + struct proc *p; + int n; + + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + fdp = p->p_fd; + if (fdp == NULL) + continue; + for (n = 0; n < fdp->fd_nfiles; n++) { + if (fp == fdp->fd_ofiles[n]) + return (p); + } + } + return (NULL); +} + +static void +db_print_file(struct file *fp, int header) +{ + struct proc *p; + + if (header) + db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", + "File", "Type", "Data", "Flag", "GCFl", "Count", + "MCount", "Vnode", "FPID", "FCmd"); + p = file_to_first_proc(fp); + db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, + file_type_to_name(fp->f_type), fp->f_data, fp->f_flag, + 0, fp->f_count, 0, fp->f_vnode, + p != NULL ? p->p_pid : -1, p != NULL ? 
p->p_comm : "-"); +} + +DB_SHOW_COMMAND(file, db_show_file) +{ + struct file *fp; + + if (!have_addr) { + db_printf("usage: show file <addr>\n"); + return; + } + fp = (struct file *)addr; + db_print_file(fp, 1); +} + +DB_SHOW_COMMAND(files, db_show_files) +{ + struct filedesc *fdp; + struct file *fp; + struct proc *p; + int header; + int n; + + header = 1; + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + if ((fdp = p->p_fd) == NULL) + continue; + for (n = 0; n < fdp->fd_nfiles; ++n) { + if ((fp = fdp->fd_ofiles[n]) == NULL) + continue; + db_print_file(fp, header); + header = 0; + } + } +} +#endif + +SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, + &maxfilesperproc, 0, "Maximum files allowed open per process"); + +SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, + &maxfiles, 0, "Maximum number of files"); + +SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, + __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); + +/* ARGSUSED*/ +static void +filelistinit(void *dummy) +{ + + file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); + mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF); +} +SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); +#endif /* __rtems__ */ + +/*-------------------------------------------------------------------*/ + +static int +badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) +{ + + return (EINVAL); +} + +static int +badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) +{ + + return (0); +} + +static 
int +badfo_kqfilter(struct file *fp, struct knote *kn) +{ + + return (EBADF); +} + +static int +badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) +{ + + return (EBADF); +} + +static int +badfo_close(struct file *fp, struct thread *td) +{ + + return (EBADF); +} + +struct fileops badfileops = { + .fo_read = badfo_readwrite, + .fo_write = badfo_readwrite, + .fo_truncate = badfo_truncate, + .fo_ioctl = badfo_ioctl, + .fo_poll = badfo_poll, + .fo_kqfilter = badfo_kqfilter, + .fo_stat = badfo_stat, + .fo_close = badfo_close, +}; + +#ifndef __rtems__ +/*-------------------------------------------------------------------*/ + +/* + * File Descriptor pseudo-device driver (/dev/fd/). + * + * Opening minor device N dup()s the file (if any) connected to file + * descriptor N belonging to the calling process. Note that this driver + * consists of only the ``open()'' routine, because all subsequent + * references to this file will be direct to the other driver. + * + * XXX: we could give this one a cloning event handler if necessary. + */ + +/* ARGSUSED */ +static int +fdopen(struct cdev *dev, int mode, int type, struct thread *td) +{ + + /* + * XXX Kludge: set curthread->td_dupfd to contain the value of the + * the file descriptor being sought for duplication. The error + * return ensures that the vnode for this device will be released + * by vn_open. Open will detect this special error and take the + * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN + * will simply report the error. 
+ */ + td->td_dupfd = dev2unit(dev); + return (ENODEV); +} + +static struct cdevsw fildesc_cdevsw = { + .d_version = D_VERSION, + .d_open = fdopen, + .d_name = "FD", +}; + +static void +fildesc_drvinit(void *unused) +{ + struct cdev *dev; + + dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0"); + make_dev_alias(dev, "stdin"); + dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1"); + make_dev_alias(dev, "stdout"); + dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2"); + make_dev_alias(dev, "stderr"); +} + +SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); +#endif /* __rtems__ */ + * XXX Kludge: set curthread->td_dupfd to contain the value of the + * the file descriptor being sought for duplication. The error + * return ensures that the vnode for this device will be released + * by vn_open. Open will detect this special error and take the + * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN + * will simply report the error. + */ + td->td_dupfd = dev2unit(dev); + return (ENODEV); +} + +static struct cdevsw fildesc_cdevsw = { + .d_version = D_VERSION, + .d_open = fdopen, + .d_name = "FD", +}; + +static void +fildesc_drvinit(void *unused) +{ + struct cdev *dev; + + dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0"); + make_dev_alias(dev, "stdin"); + dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1"); + make_dev_alias(dev, "stdout"); + dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2"); + make_dev_alias(dev, "stderr"); +} + +SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); +#endif /* __rtems__ */ diff --git a/freebsd/kern/kern_mtxpool.c b/freebsd/kern/kern_mtxpool.c new file mode 100644 index 00000000..b2c0267a --- /dev/null +++ b/freebsd/kern/kern_mtxpool.c @@ -0,0 +1,220 @@ +#include <freebsd/machine/rtems-bsd-config.h> + +/*- + * Copyright (c) 2001 Matthew Dillon. All Rights Reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* Mutex pool routines. These routines are designed to be used as short + * term leaf mutexes (e.g. the last mutex you might acquire other than + * calling msleep()). They operate using a shared pool. A mutex is chosen + * from the pool based on the supplied pointer (which may or may not be + * valid). + * + * Advantages: + * - no structural overhead. Mutexes can be associated with structures + * without adding bloat to the structures. + * - mutexes can be obtained for invalid pointers, useful when using + * mutexes to interlock destructor ops. + * - no initialization/destructor overhead. + * - can be used with msleep. + * + * Disadvantages: + * - should generally only be used as leaf mutexes.
+ * - pool/pool dependency ordering cannot be depended on. + * - possible L1 cache mastership contention between cpus. + */ + +#include <freebsd/sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <freebsd/sys/param.h> +#include <freebsd/sys/proc.h> +#include <freebsd/sys/kernel.h> +#include <freebsd/sys/ktr.h> +#include <freebsd/sys/lock.h> +#include <freebsd/sys/malloc.h> +#include <freebsd/sys/mutex.h> +#include <freebsd/sys/systm.h> + + +static MALLOC_DEFINE(M_MTXPOOL, "mtx_pool", "mutex pool"); + +/* Pool sizes must be a power of two */ +#ifndef MTX_POOL_LOCKBUILDER_SIZE +#define MTX_POOL_LOCKBUILDER_SIZE 128 +#endif +#ifndef MTX_POOL_SLEEP_SIZE +#define MTX_POOL_SLEEP_SIZE 128 +#endif + +struct mtxpool_header { + int mtxpool_size; + int mtxpool_mask; + int mtxpool_shift; + int mtxpool_next; +}; + +struct mtx_pool { + struct mtxpool_header mtx_pool_header; + struct mtx mtx_pool_ary[1]; +}; + +static struct mtx_pool_lockbuilder { + struct mtxpool_header mtx_pool_header; + struct mtx mtx_pool_ary[MTX_POOL_LOCKBUILDER_SIZE]; +} lockbuilder_pool; + +#define mtx_pool_size mtx_pool_header.mtxpool_size +#define mtx_pool_mask mtx_pool_header.mtxpool_mask +#define mtx_pool_shift mtx_pool_header.mtxpool_shift +#define mtx_pool_next mtx_pool_header.mtxpool_next + +struct mtx_pool *mtxpool_sleep; +struct mtx_pool *mtxpool_lockbuilder; + +#if UINTPTR_MAX == UINT64_MAX /* 64 bits */ +# define POINTER_BITS 64 +# define HASH_MULTIPLIER 11400714819323198485u /* (2^64)*(sqrt(5)-1)/2 */ +#else /* assume 32 bits */ +# define POINTER_BITS 32 +# define HASH_MULTIPLIER 2654435769u /* (2^32)*(sqrt(5)-1)/2 */ +#endif + +/* + * Return the (shared) pool mutex associated with the specified address. + * The returned mutex is a leaf level mutex, meaning that if you obtain it + * you cannot obtain any other mutexes until you release it. You can + * legally msleep() on the mutex.
+ */ +struct mtx * +mtx_pool_find(struct mtx_pool *pool, void *ptr) +{ + int p; + + KASSERT(pool != NULL, ("_mtx_pool_find(): null pool")); + /* + * Fibonacci hash, see Knuth's + * _Art of Computer Programming, Volume 3 / Sorting and Searching_ + */ + p = ((HASH_MULTIPLIER * (uintptr_t)ptr) >> pool->mtx_pool_shift) & + pool->mtx_pool_mask; + return (&pool->mtx_pool_ary[p]); +} + +static void +mtx_pool_initialize(struct mtx_pool *pool, const char *mtx_name, int pool_size, + int opts) +{ + int i, maskbits; + + pool->mtx_pool_size = pool_size; + pool->mtx_pool_mask = pool_size - 1; + for (i = 1, maskbits = 0; (i & pool_size) == 0; i = i << 1) + maskbits++; + pool->mtx_pool_shift = POINTER_BITS - maskbits; + pool->mtx_pool_next = 0; + for (i = 0; i < pool_size; ++i) + mtx_init(&pool->mtx_pool_ary[i], mtx_name, NULL, opts); +} + +struct mtx_pool * +mtx_pool_create(const char *mtx_name, int pool_size, int opts) +{ + struct mtx_pool *pool; + + if (pool_size <= 0 || !powerof2(pool_size)) { + printf("WARNING: %s pool size is not a power of 2.\n", + mtx_name); + pool_size = 128; + } + pool = malloc(sizeof (struct mtx_pool) + + ((pool_size - 1) * sizeof (struct mtx)), + M_MTXPOOL, M_WAITOK | M_ZERO); + mtx_pool_initialize(pool, mtx_name, pool_size, opts); + return pool; +} + +void +mtx_pool_destroy(struct mtx_pool **poolp) +{ + int i; + struct mtx_pool *pool = *poolp; + + for (i = pool->mtx_pool_size - 1; i >= 0; --i) + mtx_destroy(&pool->mtx_pool_ary[i]); + free(pool, M_MTXPOOL); + *poolp = NULL; +} + +static void +mtx_pool_setup_static(void *dummy __unused) +{ + mtx_pool_initialize((struct mtx_pool *)&lockbuilder_pool, + "lockbuilder mtxpool", MTX_POOL_LOCKBUILDER_SIZE, + MTX_DEF | MTX_NOWITNESS | MTX_QUIET); + mtxpool_lockbuilder = (struct mtx_pool *)&lockbuilder_pool; +} + +static void +mtx_pool_setup_dynamic(void *dummy __unused) +{ + mtxpool_sleep = mtx_pool_create("sleep mtxpool", + MTX_POOL_SLEEP_SIZE, MTX_DEF); +} + +/* + * Obtain a (shared) mutex from the pool. 
The returned mutex is a leaf + * level mutex, meaning that if you obtain it you cannot obtain any other + * mutexes until you release it. You can legally msleep() on the mutex. + */ +struct mtx * +mtx_pool_alloc(struct mtx_pool *pool) +{ + int i; + + KASSERT(pool != NULL, ("mtx_pool_alloc(): null pool")); + /* + * mtx_pool_next is unprotected against multiple accesses, + * but simultaneous access by two CPUs should not be very + * harmful. + */ + i = pool->mtx_pool_next; + pool->mtx_pool_next = (i + 1) & pool->mtx_pool_mask; + return (&pool->mtx_pool_ary[i]); +} + +/* + * The lockbuilder pool must be initialized early because the lockmgr + * and sx locks depend on it. The sx locks are used in the kernel + * memory allocator. The lockmgr subsystem is initialized by + * SYSINIT(..., SI_SUB_LOCKMGR, ...). + * + * We can't call malloc() to dynamically allocate the sleep pool + * until after kmeminit() has been called, which is done by + * SYSINIT(..., SI_SUB_KMEM, ...). + */ +SYSINIT(mtxpooli1, SI_SUB_MTX_POOL_STATIC, SI_ORDER_FIRST, + mtx_pool_setup_static, NULL); +SYSINIT(mtxpooli2, SI_SUB_MTX_POOL_DYNAMIC, SI_ORDER_FIRST, + mtx_pool_setup_dynamic, NULL); diff --git a/freebsd/kern/kern_subr.c b/freebsd/kern/kern_subr.c index fecb91c5..9a28a7d9 100644 --- a/freebsd/kern/kern_subr.c +++ b/freebsd/kern/kern_subr.c @@ -545,7 +545,6 @@ copyiniov(struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error) return (error); } -#ifndef __rtems__ int copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop) { @@ -582,6 +581,7 @@ copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop) return (0); } +#ifndef __rtems__ struct uio * cloneuio(struct uio *uiop) { diff --git a/freebsd/kern/kern_time.c b/freebsd/kern/kern_time.c index e07abc3a..8c760b48 100644 --- a/freebsd/kern/kern_time.c +++ b/freebsd/kern/kern_time.c @@ -698,6 +698,7 @@ realitexpire(void *arg) } /*NOTREACHED*/ } +#endif /* __rtems__ */ /* * Check that a proposed value to load into the .it_value or 
@@ -716,6 +717,7 @@ itimerfix(struct timeval *tv) return (0); } +#ifndef __rtems__ /* * Decrement an interval timer by a specified number * of microseconds, which must be less than a second, @@ -756,6 +758,7 @@ expire: itp->it_value.tv_usec = 0; /* sec is already 0 */ return (0); } +#endif /* __rtems__ */ /* * Add and subtract routines for timevals. @@ -772,7 +775,6 @@ timevaladd(struct timeval *t1, const struct timeval *t2) t1->tv_usec += t2->tv_usec; timevalfix(t1); } -#endif /* __rtems__ */ void timevalsub(struct timeval *t1, const struct timeval *t2) diff --git a/freebsd/kern/sys_generic.c b/freebsd/kern/sys_generic.c new file mode 100644 index 00000000..c90d632b --- /dev/null +++ b/freebsd/kern/sys_generic.c @@ -0,0 +1,1665 @@ +#include <freebsd/machine/rtems-bsd-config.h> + +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 + */ + +#include <freebsd/sys/cdefs.h> +__FBSDID("$FreeBSD$"); + + +#include <freebsd/local/opt_compat.h> +#include <freebsd/local/opt_ktrace.h> + +#include <freebsd/sys/param.h> +#include <freebsd/sys/systm.h> +#include <freebsd/sys/sysproto.h> +#include <freebsd/sys/filedesc.h> +#include <freebsd/sys/filio.h> +#include <freebsd/sys/fcntl.h> +#include <freebsd/sys/file.h> +#include <freebsd/sys/proc.h> +#include <freebsd/sys/signalvar.h> +#include <freebsd/sys/socketvar.h> +#include <freebsd/sys/uio.h> +#include <freebsd/sys/kernel.h> +#include <freebsd/sys/ktr.h> +#include <freebsd/sys/limits.h> +#include <freebsd/sys/malloc.h> +#include <freebsd/sys/poll.h> +#include <freebsd/sys/resourcevar.h> +#include <freebsd/sys/selinfo.h> +#include <freebsd/sys/sleepqueue.h> +#include <freebsd/sys/syscallsubr.h> +#include <freebsd/sys/sysctl.h> +#include <freebsd/sys/sysent.h> +#include <freebsd/sys/vnode.h> +#include <freebsd/sys/bio.h> +#ifndef __rtems__ +#include <freebsd/sys/buf.h> +#endif +#include <freebsd/sys/condvar.h> +#ifdef KTRACE +#include <freebsd/sys/ktrace.h> +#endif + +#include <freebsd/security/audit/audit.h> + +#ifdef __rtems__ 
+typedef long fd_mask; +#include <freebsd/vm/uma.h> +#include <freebsd/sys/mutex.h> +#include <freebsd/machine/rtems-bsd-symbols.h> +#endif /* __rtems__ */ + +static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); +static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); +#ifndef __rtems__ +MALLOC_DEFINE(M_IOV, "iov", "large iov's"); +#endif /* __rtems__ */ + +static int pollout(struct thread *, struct pollfd *, struct pollfd *, + u_int); +static int pollscan(struct thread *, struct pollfd *, u_int); +static int pollrescan(struct thread *); +static int selscan(struct thread *, fd_mask **, fd_mask **, int); +static int selrescan(struct thread *, fd_mask **, fd_mask **); +static void selfdalloc(struct thread *, void *); +static void selfdfree(struct seltd *, struct selfd *); +static int dofileread(struct thread *, int, struct file *, struct uio *, + off_t, int); +static int dofilewrite(struct thread *, int, struct file *, struct uio *, + off_t, int); +static void doselwakeup(struct selinfo *, int); +static void seltdinit(struct thread *); +static int seltdwait(struct thread *, int); +static void seltdclear(struct thread *); + +/* + * One seltd per-thread allocated on demand as needed. + * + * t - protected by st_mtx + * k - Only accessed by curthread or read-only + */ +struct seltd { + STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */ + struct selfd *st_free1; /* (k) free fd for read set. */ + struct selfd *st_free2; /* (k) free fd for write set. */ + struct mtx st_mtx; /* Protects struct seltd */ + struct cv st_wait; /* (t) Wait channel. */ + int st_flags; /* (t) SELTD_ flags. */ +}; + +#define SELTD_PENDING 0x0001 /* We have pending events. */ +#define SELTD_RESCAN 0x0002 /* Doing a rescan. */ + +/* + * One selfd allocated per-thread per-file-descriptor. + * f - protected by sf_mtx + */ +struct selfd { + STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */ + TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. 
*/ + struct selinfo *sf_si; /* (f) selinfo when linked. */ + struct mtx *sf_mtx; /* Pointer to selinfo mtx. */ + struct seltd *sf_td; /* (k) owning seltd. */ + void *sf_cookie; /* (k) fd or pollfd. */ +}; + +static uma_zone_t selfd_zone; +static struct mtx_pool *mtxpool_select; + +#ifndef _SYS_SYSPROTO_H_ +struct read_args { + int fd; + void *buf; + size_t nbyte; +}; +#endif +#ifndef __rtems__ +int +read(td, uap) + struct thread *td; + struct read_args *uap; +{ + struct uio auio; + struct iovec aiov; + int error; + + if (uap->nbyte > INT_MAX) + return (EINVAL); + aiov.iov_base = uap->buf; + aiov.iov_len = uap->nbyte; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = uap->nbyte; + auio.uio_segflg = UIO_USERSPACE; + error = kern_readv(td, uap->fd, &auio); + return(error); +} +#endif + +/* + * Positioned read system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct pread_args { + int fd; + void *buf; + size_t nbyte; + int pad; + off_t offset; +}; +#endif +int +pread(td, uap) + struct thread *td; + struct pread_args *uap; +{ + struct uio auio; + struct iovec aiov; + int error; + + if (uap->nbyte > INT_MAX) + return (EINVAL); + aiov.iov_base = uap->buf; + aiov.iov_len = uap->nbyte; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = uap->nbyte; + auio.uio_segflg = UIO_USERSPACE; + error = kern_preadv(td, uap->fd, &auio, uap->offset); + return(error); +} + +#ifndef __rtems__ +int +freebsd6_pread(td, uap) + struct thread *td; + struct freebsd6_pread_args *uap; +{ + struct pread_args oargs; + + oargs.fd = uap->fd; + oargs.buf = uap->buf; + oargs.nbyte = uap->nbyte; + oargs.offset = uap->offset; + return (pread(td, &oargs)); +} +#endif /* __rtems__ */ + +/* + * Scatter read system call. 
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readv_args {
+	int	fd;
+	struct	iovec *iovp;
+	u_int	iovcnt;
+};
+#endif
+/*
+ * Bring the iovec array named by uap in with copyinuio(), pass the
+ * resulting uio to kern_readv() and release it again.  Returns the
+ * first non-zero result of either call, otherwise 0.
+ */
+int
+readv(struct thread *td, struct readv_args *uap)
+{
+	struct uio *uio;
+	int rc;
+
+	rc = copyinuio(uap->iovp, uap->iovcnt, &uio);
+	if (rc == 0) {
+		rc = kern_readv(td, uap->fd, uio);
+		free(uio, M_IOV);
+	}
+	return (rc);
+}
+
+/*
+ * Look fd up with fget_read() and run dofileread() on it with an offset
+ * of (off_t)-1 and no flags; the file reference is dropped afterwards.
+ */
+int
+kern_readv(struct thread *td, int fd, struct uio *auio)
+{
+	struct file *fp;
+	int rc;
+
+	rc = fget_read(td, fd, &fp);
+	if (rc == 0) {
+		rc = dofileread(td, fd, fp, auio, (off_t)-1, 0);
+		fdrop(fp, td);
+	}
+	return (rc);
+}
+
+/*
+ * Scatter positioned read system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct preadv_args {
+	int	fd;
+	struct	iovec *iovp;
+	u_int	iovcnt;
+	off_t	offset;
+};
+#endif
+/*
+ * Same sequence as readv(), except that uap->offset is forwarded to
+ * kern_preadv().
+ */
+int
+preadv(struct thread *td, struct preadv_args *uap)
+{
+	struct uio *uio;
+	int rc;
+
+	rc = copyinuio(uap->iovp, uap->iovcnt, &uio);
+	if (rc == 0) {
+		rc = kern_preadv(td, uap->fd, uio, uap->offset);
+		free(uio, M_IOV);
+	}
+	return (rc);
+}
+
+/*
+ * Read into auio at the given offset (FOF_OFFSET).  Fails with ESPIPE
+ * when the file's ops lack DFLAG_SEEKABLE and, outside of RTEMS builds,
+ * with EINVAL for a negative offset on anything but a VCHR vnode.
+ */
+int
+kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset)
+{
+	struct file *fp;
+	int rc;
+
+	rc = fget_read(td, fd, &fp);
+	if (rc != 0)
+		return (rc);
+
+	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
+		rc = ESPIPE;
+		goto drop;
+	}
+#ifndef __rtems__
+	if (offset < 0 && fp->f_vnode->v_type != VCHR) {
+		rc = EINVAL;
+		goto drop;
+	}
+#endif /* __rtems__ */
+	rc = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
+drop:
+	/* Every path that obtained fp releases it here. */
+	fdrop(fp, td);
+	return (rc);
+}
+
+/*
+ * Common code for readv and preadv that reads data in
+ * from a file using the passed in uio, offset, and flags.
+ */
+/*
+ * On return td->td_retval[0] holds the number of bytes moved.  An
+ * ERESTART, EINTR or EWOULDBLOCK from fo_read() is turned into success
+ * when some data was transferred before the error.
+ */
+static int
+dofileread(struct thread *td, int fd, struct file *fp, struct uio *auio,
+    off_t offset, int flags)
+{
+	ssize_t done;
+	int rc;
+#ifdef KTRACE
+	struct uio *trace_uio = NULL;
+#endif
+
+	/* A zero length read is complete before it starts. */
+	if (auio->uio_resid == 0) {
+		td->td_retval[0] = 0;
+		return (0);
+	}
+
+	auio->uio_td = td;
+	auio->uio_offset = offset;
+	auio->uio_rw = UIO_READ;
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_GENIO))
+		trace_uio = cloneuio(auio);
+#endif
+	/* 'done' starts as the requested size and ends as the byte count. */
+	done = auio->uio_resid;
+	rc = fo_read(fp, auio, td->td_ucred, flags, td);
+	if (rc != 0 && auio->uio_resid != done &&
+	    (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+		rc = 0;
+	done -= auio->uio_resid;
+#ifdef KTRACE
+	if (trace_uio != NULL) {
+		trace_uio->uio_resid = done;
+		ktrgenio(fd, UIO_READ, trace_uio, rc);
+	}
+#endif
+	td->td_retval[0] = done;
+	return (rc);
+}
+
+#ifndef __rtems__
+#ifndef _SYS_SYSPROTO_H_
+struct write_args {
+	int	fd;
+	const void *buf;
+	size_t	nbyte;
+};
+#endif
+/*
+ * Describe the user buffer with a one-segment uio and hand it to
+ * kern_writev().  Requests larger than INT_MAX are refused with EINVAL.
+ */
+int
+write(struct thread *td, struct write_args *uap)
+{
+	struct iovec vec;
+	struct uio uio;
+
+	if (uap->nbyte > INT_MAX)
+		return (EINVAL);
+
+	vec.iov_base = (void *)(uintptr_t)uap->buf;
+	vec.iov_len = uap->nbyte;
+
+	uio.uio_iov = &vec;
+	uio.uio_iovcnt = 1;
+	uio.uio_resid = uap->nbyte;
+	uio.uio_segflg = UIO_USERSPACE;
+
+	return (kern_writev(td, uap->fd, &uio));
+}
+
+/*
+ * Positioned write system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pwrite_args {
+	int	fd;
+	const void *buf;
+	size_t	nbyte;
+	int	pad;
+	off_t	offset;
+};
+#endif
+/*
+ * Describe the user buffer with a one-segment uio and pass it, together
+ * with uap->offset, to kern_pwritev().  Requests larger than INT_MAX
+ * are refused with EINVAL.
+ */
+int
+pwrite(struct thread *td, struct pwrite_args *uap)
+{
+	struct iovec vec;
+	struct uio uio;
+
+	if (uap->nbyte > INT_MAX)
+		return (EINVAL);
+
+	vec.iov_base = (void *)(uintptr_t)uap->buf;
+	vec.iov_len = uap->nbyte;
+
+	uio.uio_iov = &vec;
+	uio.uio_iovcnt = 1;
+	uio.uio_resid = uap->nbyte;
+	uio.uio_segflg = UIO_USERSPACE;
+
+	return (kern_pwritev(td, uap->fd, &uio, uap->offset));
+}
+
+/*
+ * Repack a freebsd6_pwrite_args into a pwrite_args (fd, buf, nbyte and
+ * offset are copied; pad is not set) and call pwrite().
+ */
+int
+freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap)
+{
+	struct pwrite_args args;
+
+	args.offset = uap->offset;
+	args.nbyte = uap->nbyte;
+	args.buf = uap->buf;
+	args.fd = uap->fd;
+	return (pwrite(td, &args));
+}
+
+/*
+ * Gather write system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct writev_args {
+	int	fd;
+	struct	iovec *iovp;
+	u_int	iovcnt;
+};
+#endif
+/*
+ * Bring the iovec array named by uap in with copyinuio(), pass the
+ * resulting uio to kern_writev() and release it again.
+ */
+int
+writev(struct thread *td, struct writev_args *uap)
+{
+	struct uio *uio;
+	int rc;
+
+	rc = copyinuio(uap->iovp, uap->iovcnt, &uio);
+	if (rc == 0) {
+		rc = kern_writev(td, uap->fd, uio);
+		free(uio, M_IOV);
+	}
+	return (rc);
+}
+
+/*
+ * Look fd up with fget_write() and run dofilewrite() on it with an
+ * offset of (off_t)-1 and no flags; the file reference is dropped
+ * afterwards.
+ */
+int
+kern_writev(struct thread *td, int fd, struct uio *auio)
+{
+	struct file *fp;
+	int rc;
+
+	rc = fget_write(td, fd, &fp);
+	if (rc == 0) {
+		rc = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
+		fdrop(fp, td);
+	}
+	return (rc);
+}
+
+/*
+ * Gather positioned write system call.
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct pwritev_args { + int fd; + struct iovec *iovp; + u_int iovcnt; + off_t offset; +}; +#endif +int +pwritev(struct thread *td, struct pwritev_args *uap) +{ + struct uio *auio; + int error; + + error = copyinuio(uap->iovp, uap->iovcnt, &auio); + if (error) + return (error); + error = kern_pwritev(td, uap->fd, auio, uap->offset); + free(auio, M_IOV); + return (error); +} + +int +kern_pwritev(td, fd, auio, offset) + struct thread *td; + struct uio *auio; + int fd; + off_t offset; +{ + struct file *fp; + int error; + + error = fget_write(td, fd, &fp); + if (error) + return (error); + if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) + error = ESPIPE; + else if (offset < 0 && fp->f_vnode->v_type != VCHR) + error = EINVAL; + else + error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); + fdrop(fp, td); + return (error); +} + +/* + * Common code for writev and pwritev that writes data to + * a file using the passed in uio, offset, and flags. + */ +static int +dofilewrite(td, fd, fp, auio, offset, flags) + struct thread *td; + int fd; + struct file *fp; + struct uio *auio; + off_t offset; + int flags; +{ + ssize_t cnt; + int error; +#ifdef KTRACE + struct uio *ktruio = NULL; +#endif + + auio->uio_rw = UIO_WRITE; + auio->uio_td = td; + auio->uio_offset = offset; +#ifdef KTRACE + if (KTRPOINT(td, KTR_GENIO)) + ktruio = cloneuio(auio); +#endif + cnt = auio->uio_resid; + if (fp->f_type == DTYPE_VNODE) + bwillwrite(); + if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { + if (auio->uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + /* Socket layer is responsible for issuing SIGPIPE. 
*/ + if (fp->f_type != DTYPE_SOCKET && error == EPIPE) { + PROC_LOCK(td->td_proc); + tdksignal(td, SIGPIPE, NULL); + PROC_UNLOCK(td->td_proc); + } + } + cnt -= auio->uio_resid; +#ifdef KTRACE + if (ktruio != NULL) { + ktruio->uio_resid = cnt; + ktrgenio(fd, UIO_WRITE, ktruio, error); + } +#endif + td->td_retval[0] = cnt; + return (error); +} + +/* + * Truncate a file given a file descriptor. + * + * Can't use fget_write() here, since must return EINVAL and not EBADF if the + * descriptor isn't writable. + */ +int +kern_ftruncate(td, fd, length) + struct thread *td; + int fd; + off_t length; +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(fd); + if (length < 0) + return (EINVAL); + error = fget(td, fd, &fp); + if (error) + return (error); + AUDIT_ARG_FILE(td->td_proc, fp); + if (!(fp->f_flag & FWRITE)) { + fdrop(fp, td); + return (EINVAL); + } + error = fo_truncate(fp, length, td->td_ucred, td); + fdrop(fp, td); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif +int +ftruncate(td, uap) + struct thread *td; + struct ftruncate_args *uap; +{ + + return (kern_ftruncate(td, uap->fd, uap->length)); +} + +#if defined(COMPAT_43) +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif +int +oftruncate(td, uap) + struct thread *td; + struct oftruncate_args *uap; +{ + + return (kern_ftruncate(td, uap->fd, uap->length)); +} +#endif /* COMPAT_43 */ + +#ifndef _SYS_SYSPROTO_H_ +struct ioctl_args { + int fd; + u_long com; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +ioctl(struct thread *td, struct ioctl_args *uap) +{ + u_long com; + int arg, error; + u_int size; + caddr_t data; + + if (uap->com > 0xffffffff) { + printf( + "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", + td->td_proc->p_pid, td->td_name, uap->com); + uap->com &= 0xffffffff; + } + com = uap->com; + + /* + * Interpret high order word to find amount of data to be + * copied to/from the user's 
address space. + */ + size = IOCPARM_LEN(com); + if ((size > IOCPARM_MAX) || + ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || +#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) + ((com & IOC_OUT) && size == 0) || +#else + ((com & (IOC_IN | IOC_OUT)) && size == 0) || +#endif + ((com & IOC_VOID) && size > 0 && size != sizeof(int))) + return (ENOTTY); + + if (size > 0) { + if (com & IOC_VOID) { + /* Integer argument. */ + arg = (intptr_t)uap->data; + data = (void *)&arg; + size = 0; + } else + data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); + } else + data = (void *)&uap->data; + if (com & IOC_IN) { + error = copyin(uap->data, data, (u_int)size); + if (error) { + if (size > 0) + free(data, M_IOCTLOPS); + return (error); + } + } else if (com & IOC_OUT) { + /* + * Zero the buffer so the user always + * gets back something deterministic. + */ + bzero(data, size); + } + + error = kern_ioctl(td, uap->fd, com, data); + + if (error == 0 && (com & IOC_OUT)) + error = copyout(data, uap->data, (u_int)size); + + if (size > 0) + free(data, M_IOCTLOPS); + return (error); +} + +int +kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) +{ + struct file *fp; + struct filedesc *fdp; + int error; + int tmp; + + AUDIT_ARG_FD(fd); + AUDIT_ARG_CMD(com); + if ((error = fget(td, fd, &fp)) != 0) + return (error); + if ((fp->f_flag & (FREAD | FWRITE)) == 0) { + fdrop(fp, td); + return (EBADF); + } + fdp = td->td_proc->p_fd; + switch (com) { + case FIONCLEX: + FILEDESC_XLOCK(fdp); + fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; + FILEDESC_XUNLOCK(fdp); + goto out; + case FIOCLEX: + FILEDESC_XLOCK(fdp); + fdp->fd_ofileflags[fd] |= UF_EXCLOSE; + FILEDESC_XUNLOCK(fdp); + goto out; + case FIONBIO: + if ((tmp = *(int *)data)) + atomic_set_int(&fp->f_flag, FNONBLOCK); + else + atomic_clear_int(&fp->f_flag, FNONBLOCK); + data = (void *)&tmp; + break; + case FIOASYNC: + if ((tmp = *(int *)data)) + atomic_set_int(&fp->f_flag, FASYNC); + else + 
atomic_clear_int(&fp->f_flag, FASYNC); + data = (void *)&tmp; + break; + } + + error = fo_ioctl(fp, com, data, td->td_ucred, td); +out: + fdrop(fp, td); + return (error); +} + +int +poll_no_poll(int events) +{ + /* + * Return true for read/write. If the user asked for something + * special, return POLLNVAL, so that clients have a way of + * determining reliably whether or not the extended + * functionality is present without hard-coding knowledge + * of specific filesystem implementations. + */ + if (events & ~POLLSTANDARD) + return (POLLNVAL); + + return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +int +pselect(struct thread *td, struct pselect_args *uap) +{ + struct timespec ts; + struct timeval tv, *tvp; + sigset_t set, *uset; + int error; + + if (uap->ts != NULL) { + error = copyin(uap->ts, &ts, sizeof(ts)); + if (error != 0) + return (error); + TIMESPEC_TO_TIMEVAL(&tv, &ts); + tvp = &tv; + } else + tvp = NULL; + if (uap->sm != NULL) { + error = copyin(uap->sm, &set, sizeof(set)); + if (error != 0) + return (error); + uset = &set; + } else + uset = NULL; + return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, + uset, NFDBITS)); +} + +int +kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex, + struct timeval *tvp, sigset_t *uset, int abi_nfdbits) +{ + int error; + + if (uset != NULL) { + error = kern_sigprocmask(td, SIG_SETMASK, uset, + &td->td_oldsigmask, 0); + if (error != 0) + return (error); + td->td_pflags |= TDP_OLDMASK; + /* + * Make sure that ast() is called on return to + * usermode and TDP_OLDMASK is cleared, restoring old + * sigmask. 
+ */ + thread_lock(td); + td->td_flags |= TDF_ASTPENDING; + thread_unlock(td); + } + error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct select_args { + int nd; + fd_set *in, *ou, *ex; + struct timeval *tv; +}; +#endif +int +select(struct thread *td, struct select_args *uap) +{ + struct timeval tv, *tvp; + int error; + + if (uap->tv != NULL) { + error = copyin(uap->tv, &tv, sizeof(tv)); + if (error) + return (error); + tvp = &tv; + } else + tvp = NULL; + + return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, + NFDBITS)); +} + +int +kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, + fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits) +{ + struct filedesc *fdp; + /* + * The magic 2048 here is chosen to be just enough for FD_SETSIZE + * infds with the new FD_SETSIZE of 1024, and more than enough for + * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE + * of 256. + */ + fd_mask s_selbits[howmany(2048, NFDBITS)]; + fd_mask *ibits[3], *obits[3], *selbits, *sbp; + struct timeval atv, rtv, ttv; + int error, timo; + u_int nbufbytes, ncpbytes, ncpubytes, nfdbits; + + if (nd < 0) + return (EINVAL); + fdp = td->td_proc->p_fd; + if (nd > fdp->fd_lastfile + 1) + nd = fdp->fd_lastfile + 1; + + /* + * Allocate just enough bits for the non-null fd_sets. Use the + * preallocated auto buffer if possible. + */ + nfdbits = roundup(nd, NFDBITS); + ncpbytes = nfdbits / NBBY; + ncpubytes = roundup(nd, abi_nfdbits) / NBBY; + nbufbytes = 0; + if (fd_in != NULL) + nbufbytes += 2 * ncpbytes; + if (fd_ou != NULL) + nbufbytes += 2 * ncpbytes; + if (fd_ex != NULL) + nbufbytes += 2 * ncpbytes; + if (nbufbytes <= sizeof s_selbits) + selbits = &s_selbits[0]; + else + selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); + + /* + * Assign pointers into the bit buffers and fetch the input bits. + * Put the output buffers together so that they can be bzeroed + * together. 
+ */ + sbp = selbits; +#define getbits(name, x) \ + do { \ + if (name == NULL) { \ + ibits[x] = NULL; \ + obits[x] = NULL; \ + } else { \ + ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ + obits[x] = sbp; \ + sbp += ncpbytes / sizeof *sbp; \ + error = copyin(name, ibits[x], ncpubytes); \ + if (error != 0) \ + goto done; \ + bzero((char *)ibits[x] + ncpubytes, \ + ncpbytes - ncpubytes); \ + } \ + } while (0) + getbits(fd_in, 0); + getbits(fd_ou, 1); + getbits(fd_ex, 2); +#undef getbits + +#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__) + /* + * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS, + * we are running under 32-bit emulation. This should be more + * generic. + */ +#define swizzle_fdset(bits) \ + if (abi_nfdbits != NFDBITS && bits != NULL) { \ + int i; \ + for (i = 0; i < ncpbytes / sizeof *sbp; i++) \ + bits[i] = (bits[i] >> 32) | (bits[i] << 32); \ + } +#else +#define swizzle_fdset(bits) +#endif + + /* Make sure the bit order makes it through an ABI transition */ + swizzle_fdset(ibits[0]); + swizzle_fdset(ibits[1]); + swizzle_fdset(ibits[2]); + + if (nbufbytes != 0) + bzero(selbits, nbufbytes / 2); + + if (tvp != NULL) { + atv = *tvp; + if (itimerfix(&atv)) { + error = EINVAL; + goto done; + } + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else { + atv.tv_sec = 0; + atv.tv_usec = 0; + } + timo = 0; + seltdinit(td); + /* Iterate until the timeout expires or descriptors become ready. */ + for (;;) { + error = selscan(td, ibits, obits, nd); + if (error || td->td_retval[0] != 0) + break; + if (atv.tv_sec || atv.tv_usec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + break; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + error = seltdwait(td, timo); + if (error) + break; + error = selrescan(td, ibits, obits); + if (error || td->td_retval[0] != 0) + break; + } + seltdclear(td); + +done: + /* select is not restarted after signals... 
*/ + if (error == ERESTART) + error = EINTR; + if (error == EWOULDBLOCK) + error = 0; + + /* swizzle bit order back, if necessary */ + swizzle_fdset(obits[0]); + swizzle_fdset(obits[1]); + swizzle_fdset(obits[2]); +#undef swizzle_fdset + +#define putbits(name, x) \ + if (name && (error2 = copyout(obits[x], name, ncpubytes))) \ + error = error2; + if (error == 0) { + int error2; + + putbits(fd_in, 0); + putbits(fd_ou, 1); + putbits(fd_ex, 2); +#undef putbits + } + if (selbits != &s_selbits[0]) + free(selbits, M_SELECT); + + return (error); +} +/* + * Convert a select bit set to poll flags. + * + * The backend always returns POLLHUP/POLLERR if appropriate and we + * return this as a set bit in any set. + */ +static int select_flags[3] = { + POLLRDNORM | POLLHUP | POLLERR, + POLLWRNORM | POLLHUP | POLLERR, + POLLRDBAND | POLLERR +}; + +/* + * Compute the fo_poll flags required for a fd given by the index and + * bit position in the fd_mask array. + */ +static __inline int +selflags(fd_mask **ibits, int idx, fd_mask bit) +{ + int flags; + int msk; + + flags = 0; + for (msk = 0; msk < 3; msk++) { + if (ibits[msk] == NULL) + continue; + if ((ibits[msk][idx] & bit) == 0) + continue; + flags |= select_flags[msk]; + } + return (flags); +} + +/* + * Set the appropriate output bits given a mask of fired events and the + * input bits originally requested. + */ +static __inline int +selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events) +{ + int msk; + int n; + + n = 0; + for (msk = 0; msk < 3; msk++) { + if ((events & select_flags[msk]) == 0) + continue; + if (ibits[msk] == NULL) + continue; + if ((ibits[msk][idx] & bit) == 0) + continue; + /* + * XXX Check for a duplicate set. This can occur because a + * socket calls selrecord() twice for each poll() call + * resulting in two selfds per real fd. selrescan() will + * call selsetbits twice as a result. 
+ */ + if ((obits[msk][idx] & bit) != 0) + continue; + obits[msk][idx] |= bit; + n++; + } + + return (n); +} + +/* + * Traverse the list of fds attached to this thread's seltd and check for + * completion. + */ +static int +selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) +{ + struct filedesc *fdp; + struct selinfo *si; + struct seltd *stp; + struct selfd *sfp; + struct selfd *sfn; + struct file *fp; + fd_mask bit; + int fd, ev, n, idx; + + fdp = td->td_proc->p_fd; + stp = td->td_sel; + n = 0; + STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { + fd = (int)(uintptr_t)sfp->sf_cookie; + si = sfp->sf_si; + selfdfree(stp, sfp); + /* If the selinfo wasn't cleared the event didn't fire. */ + if (si != NULL) + continue; + if ((fp = fget_unlocked(fdp, fd)) == NULL) + return (EBADF); + idx = fd / NFDBITS; + bit = (fd_mask)1 << (fd % NFDBITS); + ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td); + fdrop(fp, td); + if (ev != 0) + n += selsetbits(ibits, obits, idx, bit, ev); + } + stp->st_flags = 0; + td->td_retval[0] = n; + return (0); +} + +/* + * Perform the initial filedescriptor scan and register ourselves with + * each selinfo. + */ +static int +selscan(td, ibits, obits, nfd) + struct thread *td; + fd_mask **ibits, **obits; + int nfd; +{ + struct filedesc *fdp; + struct file *fp; + fd_mask bit; + int ev, flags, end, fd; + int n, idx; + + fdp = td->td_proc->p_fd; + n = 0; + for (idx = 0, fd = 0; fd < nfd; idx++) { + end = imin(fd + NFDBITS, nfd); + for (bit = 1; fd < end; bit <<= 1, fd++) { + /* Compute the list of events we're interested in. 
*/
+			flags = selflags(ibits, idx, bit);
+			if (flags == 0)
+				continue;
+			if ((fp = fget_unlocked(fdp, fd)) == NULL)
+				return (EBADF);
+			selfdalloc(td, (void *)(uintptr_t)fd);
+			ev = fo_poll(fp, flags, td->td_ucred, td);
+			fdrop(fp, td);
+			if (ev != 0)
+				n += selsetbits(ibits, obits, idx, bit, ev);
+		}
+	}
+
+	td->td_retval[0] = n;
+	return (0);
+}
+#endif /* __rtems__ */
+
+#ifndef _SYS_SYSPROTO_H_
+struct poll_args {
+	struct pollfd *fds;
+	u_int	nfds;
+	int	timeout;
+};
+#endif
+/*
+ * poll back end; the function is named kern_poll when __rtems__ is
+ * defined and poll otherwise.
+ *
+ * The uap->nfds pollfd entries at uap->fds are copied in (into the
+ * 32-entry on-stack array when they fit, otherwise into an M_TEMP
+ * allocation) and scanned with pollscan().  While nothing is ready and
+ * the deadline has not passed, the thread sleeps in seltdwait() and
+ * rechecks with pollrescan().  uap->timeout is in milliseconds; INFTIM
+ * disables the deadline.
+ *
+ * Returns 0 or an errno.  ERESTART is reported as EINTR and EWOULDBLOCK
+ * as success.  On success pollout() copies the revents fields back and
+ * leaves the ready count in td->td_retval[0].
+ */
+int
+#ifdef __rtems__
+kern_poll(td, uap)
+#else
+poll(td, uap)
+#endif /* __rtems__ */
+	struct thread *td;
+	struct poll_args *uap;
+{
+	struct pollfd *bits;
+	struct pollfd smallbits[32];
+	struct timeval atv, rtv, ttv;
+	int error = 0, timo;
+	u_int nfds;
+	size_t ni;
+
+	nfds = uap->nfds;
+	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
+		return (EINVAL);
+	ni = nfds * sizeof(struct pollfd);
+	if (ni > sizeof(smallbits))
+		bits = malloc(ni, M_TEMP, M_WAITOK);
+	else
+		bits = smallbits;
+	error = copyin(uap->fds, bits, ni);
+	if (error)
+		goto done;
+	if (uap->timeout != INFTIM) {
+		atv.tv_sec = uap->timeout / 1000;
+		atv.tv_usec = (uap->timeout % 1000) * 1000;
+		if (itimerfix(&atv)) {
+			error = EINVAL;
+			goto done;
+		}
+		getmicrouptime(&rtv);
+		timevaladd(&atv, &rtv);
+	} else {
+		atv.tv_sec = 0;
+		atv.tv_usec = 0;
+	}
+	timo = 0;
+	seltdinit(td);
+	/* Iterate until the timeout expires or descriptors become ready. */
+	for (;;) {
+		error = pollscan(td, bits, nfds);
+		if (error || td->td_retval[0] != 0)
+			break;
+		if (atv.tv_sec || atv.tv_usec) {
+			getmicrouptime(&rtv);
+			if (timevalcmp(&rtv, &atv, >=))
+				break;
+			ttv = atv;
+			timevalsub(&ttv, &rtv);
+			timo = ttv.tv_sec > 24 * 60 * 60 ?
+			    24 * 60 * 60 * hz : tvtohz(&ttv);
+		}
+		error = seltdwait(td, timo);
+		if (error)
+			break;
+		error = pollrescan(td);
+		if (error || td->td_retval[0] != 0)
+			break;
+	}
+	seltdclear(td);
+
+done:
+	/* poll is not restarted after signals... */
+	if (error == ERESTART)
+		error = EINTR;
+	if (error == EWOULDBLOCK)
+		error = 0;
+	if (error == 0) {
+		error = pollout(td, bits, uap->fds, nfds);
+		if (error)
+			goto out;
+	}
+out:
+	if (ni > sizeof(smallbits))
+		free(bits, M_TEMP);
+	return (error);
+}
+
+/*
+ * Second pass after a wakeup.  Every selfd queued on this thread's
+ * seltd is released; for those whose selinfo pointer had been cleared
+ * (the event fired) the file is polled again and the result stored in
+ * the pollfd the selfd's cookie points to.  td->td_retval[0] receives
+ * the number of entries with a non-zero revents.  Always returns 0.
+ */
+static int
+pollrescan(struct thread *td)
+{
+	struct seltd *stp;
+	struct selfd *sfp;
+	struct selfd *sfn;
+	struct selinfo *si;
+	struct filedesc *fdp;
+	struct file *fp;
+	struct pollfd *fd;
+	int n;
+
+	n = 0;
+	fdp = td->td_proc->p_fd;
+	stp = td->td_sel;
+	FILEDESC_SLOCK(fdp);
+	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
+		fd = (struct pollfd *)sfp->sf_cookie;
+		si = sfp->sf_si;
+		selfdfree(stp, sfp);
+		/* If the selinfo wasn't cleared the event didn't fire. */
+		if (si != NULL)
+			continue;
+		fp = fdp->fd_ofiles[fd->fd];
+		if (fp == NULL) {
+			fd->revents = POLLNVAL;
+			n++;
+			continue;
+		}
+		/*
+		 * Note: backend also returns POLLHUP and
+		 * POLLERR if appropriate.
+		 */
+		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
+		/*
+		 * POSIX requires POLLOUT to be never set simultaneously
+		 * with POLLHUP.  pollscan() enforces this on the first
+		 * pass; do the same here so the result does not depend
+		 * on whether the caller had to sleep.
+		 */
+		if ((fd->revents & POLLHUP) != 0)
+			fd->revents &= ~POLLOUT;
+		if (fd->revents != 0)
+			n++;
+	}
+	FILEDESC_SUNLOCK(fdp);
+	stp->st_flags = 0;
+	td->td_retval[0] = n;
+	return (0);
+}
+
+
+/*
+ * Copy the revents field of each of the nfd entries in fds out to the
+ * matching entry in ufds and leave the number of non-zero revents in
+ * td->td_retval[0].  Returns 0, or the first copyout() error.
+ */
+static int
+pollout(td, fds, ufds, nfd)
+	struct thread *td;
+	struct pollfd *fds;
+	struct pollfd *ufds;
+	u_int nfd;
+{
+	int error = 0;
+	u_int i = 0;
+	u_int n = 0;
+
+	for (i = 0; i < nfd; i++) {
+		error = copyout(&fds->revents, &ufds->revents,
+		    sizeof(ufds->revents));
+		if (error)
+			return (error);
+		if (fds->revents != 0)
+			n++;
+		fds++;
+		ufds++;
+	}
+	td->td_retval[0] = n;
+	return (0);
+}
+
+static int
+pollscan(td, fds, nfd)
+	struct thread *td;
+	struct pollfd *fds;
+	u_int nfd;
+{
+	struct filedesc *fdp = td->td_proc->p_fd;
+	int i;
+	struct file *fp;
+	int n = 0;
+
+	FILEDESC_SLOCK(fdp);
+	for (i = 0; i < nfd; i++, fds++) {
+		if (fds->fd >= fdp->fd_nfiles) {
+			fds->revents = POLLNVAL;
+			n++;
+		} else if (fds->fd < 0) {
+			fds->revents = 0;
+		} else {
+			fp = fdp->fd_ofiles[fds->fd];
+			if (fp == NULL) {
+				fds->revents = POLLNVAL;
+
n++;
+			} else {
+				/*
+				 * Note: backend also returns POLLHUP and
+				 * POLLERR if appropriate.
+				 */
+				selfdalloc(td, fds);
+				fds->revents = fo_poll(fp, fds->events,
+				    td->td_ucred, td);
+				/*
+				 * POSIX requires POLLOUT to be never
+				 * set simultaneously with POLLHUP.
+				 */
+				if ((fds->revents & POLLHUP) != 0)
+					fds->revents &= ~POLLOUT;
+
+				if (fds->revents != 0)
+					n++;
+			}
+		}
+	}
+	FILEDESC_SUNLOCK(fdp);
+	td->td_retval[0] = n;
+	return (0);
+}
+
+/*
+ * OpenBSD poll system call.
+ *
+ * XXX this isn't quite a true representation..  OpenBSD uses select ops.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct openbsd_poll_args {
+	struct pollfd *fds;
+	u_int	nfds;
+	int	timeout;
+};
+#endif
+int
+openbsd_poll(td, uap)
+	register struct thread *td;
+	register struct openbsd_poll_args *uap;
+{
+#ifdef __rtems__
+	return (kern_poll(td, (struct poll_args *)uap));
+#else
+	return (poll(td, (struct poll_args *)uap));
+#endif
+}
+
+/*
+ * XXX This was created specifically to support netncp and netsmb.  This
+ * allows the caller to specify a socket to wait for events on.  It returns
+ * 0 if any events matched and an error otherwise.  There is no way to
+ * determine which events fired.
+ *
+ * tvp, when not NULL, is a relative timeout; it is validated with
+ * itimerfix() (EINVAL on failure) and converted to an uptime deadline.
+ * With a NULL tvp no deadline is computed and seltdwait() is always
+ * called with a timo of 0.
+ *
+ * Returns 0 when sopoll() reports events, EWOULDBLOCK when the deadline
+ * has passed, otherwise the seltdwait() error with ERESTART mapped to 0.
+ */
+int
+selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
+{
+	struct timeval atv, rtv, ttv;
+	int error, timo;
+
+	if (tvp != NULL) {
+		atv = *tvp;
+		if (itimerfix(&atv))
+			return (EINVAL);
+		getmicrouptime(&rtv);
+		timevaladd(&atv, &rtv);
+	} else {
+		atv.tv_sec = 0;
+		atv.tv_usec = 0;
+	}
+
+	timo = 0;
+	seltdinit(td);
+	/*
+	 * Iterate until the timeout expires or the socket becomes ready.
+	 */
+	for (;;) {
+		selfdalloc(td, NULL);
+		error = sopoll(so, events, NULL, td);
+		/* error here is actually the ready events. */
+		if (error) {
+			/*
+			 * Leave through seltdclear() like every other
+			 * exit taken after seltdinit(), so that nothing
+			 * the poll routine may have recorded stays
+			 * queued on this thread.
+			 */
+			seltdclear(td);
+			return (0);
+		}
+		if (atv.tv_sec || atv.tv_usec) {
+			getmicrouptime(&rtv);
+			if (timevalcmp(&rtv, &atv, >=)) {
+				seltdclear(td);
+				return (EWOULDBLOCK);
+			}
+			ttv = atv;
+			timevalsub(&ttv, &rtv);
+			timo = ttv.tv_sec > 24 * 60 * 60 ?
+			    24 * 60 * 60 * hz : tvtohz(&ttv);
+		}
+		error = seltdwait(td, timo);
+		seltdclear(td);
+		if (error)
+			break;
+	}
+	/* XXX Duplicates ncp/smb behavior. */
+	if (error == ERESTART)
+		error = 0;
+	return (error);
+}
+
+/*
+ * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
+ * have two select sets, one for read and another for write.
+ *
+ * A slot that is still populated (st_free1/st_free2 not NULL) is reused
+ * and only has its owner and cookie rewritten.
+ */
+static void
+selfdalloc(struct thread *td, void *cookie)
+{
+	struct seltd *stp;
+
+	stp = td->td_sel;
+	if (stp->st_free1 == NULL)
+		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
+	stp->st_free1->sf_td = stp;
+	stp->st_free1->sf_cookie = cookie;
+	if (stp->st_free2 == NULL)
+		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
+	stp->st_free2->sf_td = stp;
+	stp->st_free2->sf_cookie = cookie;
+}
+
+/*
+ * Unlink sfp from the seltd's queue and, while holding sf_mtx, from the
+ * selinfo's list if it is still attached to one; then return it to
+ * selfd_zone.
+ */
+static void
+selfdfree(struct seltd *stp, struct selfd *sfp)
+{
+	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
+	mtx_lock(sfp->sf_mtx);
+	if (sfp->sf_si)
+		TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
+	mtx_unlock(sfp->sf_mtx);
+	uma_zfree(selfd_zone, sfp);
+}
+
+/*
+ * Record a select request.
+ */
+void
+selrecord(selector, sip)
+	struct thread *selector;
+	struct selinfo *sip;
+{
+	struct selfd *sfp;
+	struct seltd *stp;
+	struct mtx *mtxp;
+
+	stp = selector->td_sel;
+	/*
+	 * Don't record when doing a rescan.
+	 */
+	if (stp->st_flags & SELTD_RESCAN)
+		return;
+	/*
+	 * Grab one of the preallocated descriptors.
+	 */
+	sfp = NULL;
+	if ((sfp = stp->st_free1) != NULL)
+		stp->st_free1 = NULL;
+	else if ((sfp = stp->st_free2) != NULL)
+		stp->st_free2 = NULL;
+	else
+		panic("selrecord: No free selfd on selq");
+	mtxp = sip->si_mtx;
+	if (mtxp == NULL)
+		mtxp = mtx_pool_find(mtxpool_select, sip);
+	/*
+	 * Initialize the sfp and queue it in the thread.
+	 */
+	sfp->sf_si = sip;
+	sfp->sf_mtx = mtxp;
+	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
+	/*
+	 * Now that we've locked the sip, check for initialization.
+ */ + mtx_lock(mtxp); + if (sip->si_mtx == NULL) { + sip->si_mtx = mtxp; + TAILQ_INIT(&sip->si_tdlist); + } + /* + * Add this thread to the list of selfds listening on this selinfo. + */ + TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); + mtx_unlock(sip->si_mtx); +} + +/* Wake up a selecting thread. */ +void +selwakeup(sip) + struct selinfo *sip; +{ + doselwakeup(sip, -1); +} + +/* Wake up a selecting thread, and set its priority. */ +void +selwakeuppri(sip, pri) + struct selinfo *sip; + int pri; +{ + doselwakeup(sip, pri); +} + +/* + * Do a wakeup when a selectable event occurs. + */ +static void +doselwakeup(sip, pri) + struct selinfo *sip; + int pri; +{ + struct selfd *sfp; + struct selfd *sfn; + struct seltd *stp; + + /* If it's not initialized there can't be any waiters. */ + if (sip->si_mtx == NULL) + return; + /* + * Locking the selinfo locks all selfds associated with it. + */ + mtx_lock(sip->si_mtx); + TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { + /* + * Once we remove this sfp from the list and clear the + * sf_si seltdclear will know to ignore this si. + */ + TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads); + sfp->sf_si = NULL; + stp = sfp->sf_td; + mtx_lock(&stp->st_mtx); + stp->st_flags |= SELTD_PENDING; + cv_broadcastpri(&stp->st_wait, pri); + mtx_unlock(&stp->st_mtx); + } + mtx_unlock(sip->si_mtx); +} + +static void +seltdinit(struct thread *td) +{ + struct seltd *stp; + + if ((stp = td->td_sel) != NULL) + goto out; + td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO); + mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF); + cv_init(&stp->st_wait, "select"); +out: + stp->st_flags = 0; + STAILQ_INIT(&stp->st_selq); +} + +static int +seltdwait(struct thread *td, int timo) +{ + struct seltd *stp; + int error; + + stp = td->td_sel; + /* + * An event of interest may occur while we do not hold the seltd + * locked so check the pending flag before we sleep. 
+ */ + mtx_lock(&stp->st_mtx); + /* + * Any further calls to selrecord will be a rescan. + */ + stp->st_flags |= SELTD_RESCAN; + if (stp->st_flags & SELTD_PENDING) { + mtx_unlock(&stp->st_mtx); + return (0); + } + if (timo > 0) + error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo); + else + error = cv_wait_sig(&stp->st_wait, &stp->st_mtx); + mtx_unlock(&stp->st_mtx); + + return (error); +} + +void +seltdfini(struct thread *td) +{ + struct seltd *stp; + + stp = td->td_sel; + if (stp == NULL) + return; + if (stp->st_free1) + uma_zfree(selfd_zone, stp->st_free1); + if (stp->st_free2) + uma_zfree(selfd_zone, stp->st_free2); + td->td_sel = NULL; + free(stp, M_SELECT); +} + +/* + * Remove the references to the thread from all of the objects we were + * polling. + */ +static void +seltdclear(struct thread *td) +{ + struct seltd *stp; + struct selfd *sfp; + struct selfd *sfn; + + stp = td->td_sel; + STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) + selfdfree(stp, sfp); + stp->st_flags = 0; +} + +static void selectinit(void *); +SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); +static void +selectinit(void *dummy __unused) +{ + + selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL, + NULL, NULL, UMA_ALIGN_PTR, 0); + mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF); +} diff --git a/freebsd/sys/buf.h b/freebsd/sys/buf.h new file mode 100644 index 00000000..88e55d95 --- /dev/null +++ b/freebsd/sys/buf.h @@ -0,0 +1,526 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)buf.h 8.9 (Berkeley) 3/30/95 + * $FreeBSD$ + */ + +#ifndef _SYS_BUF_HH_ +#define _SYS_BUF_HH_ + +#include <freebsd/sys/bufobj.h> +#include <freebsd/sys/queue.h> +#include <freebsd/sys/lock.h> +#include <freebsd/sys/lockmgr.h> + +struct bio; +struct buf; +struct bufobj; +struct mount; +struct vnode; +struct uio; + +/* + * To avoid including <ufs/ffs/softdep.h> + */ +LIST_HEAD(workhead, worklist); +/* + * These are currently used only by the soft dependency code, hence + * are stored once in a global variable. If other subsystems wanted + * to use these hooks, a pointer to a set of bio_ops could be added + * to each buffer. + */ +extern struct bio_ops { + void (*io_start)(struct buf *); + void (*io_complete)(struct buf *); + void (*io_deallocate)(struct buf *); + int (*io_countdeps)(struct buf *, int); +} bioops; + +struct vm_object; + +typedef unsigned char b_xflags_t; + +/* + * The buffer header describes an I/O operation in the kernel. + * + * NOTES: + * b_bufsize, b_bcount. b_bufsize is the allocation size of the + * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the + * originally requested buffer size and can serve as a bounds check + * against EOF. For most, but not all uses, b_bcount == b_bufsize. + * + * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned + * ranges of dirty data that need to be written to backing store. + * The range is typically clipped at b_bcount ( not b_bufsize ). + * + * b_resid. Number of bytes remaining in I/O. After an I/O operation + * completes, b_resid is usually 0 indicating 100% success. 
+ * + * All fields are protected by the buffer lock except those marked: + * V - Protected by owning bufobj lock + * Q - Protected by the buf queue lock + * D - Protected by a dependency implementation specific lock + */ +struct buf { + struct bufobj *b_bufobj; + long b_bcount; + void *b_caller1; + caddr_t b_data; + int b_error; + uint8_t b_iocmd; + uint8_t b_ioflags; + off_t b_iooffset; + long b_resid; + void (*b_iodone)(struct buf *); + daddr_t b_blkno; /* Underlying physical block number. */ + off_t b_offset; /* Offset into file. */ + TAILQ_ENTRY(buf) b_bobufs; /* (V) Buffer's associated vnode. */ + struct buf *b_left; /* (V) splay tree link */ + struct buf *b_right; /* (V) splay tree link */ + uint32_t b_vflags; /* (V) BV_* flags */ + TAILQ_ENTRY(buf) b_freelist; /* (Q) Free list position inactive. */ + unsigned short b_qindex; /* (Q) buffer queue index */ + uint32_t b_flags; /* B_* flags. */ + b_xflags_t b_xflags; /* extra flags */ + struct lock b_lock; /* Buffer lock */ + long b_bufsize; /* Allocated buffer size. */ + long b_runningbufspace; /* when I/O is running, pipelining */ + caddr_t b_kvabase; /* base kva for buffer */ + int b_kvasize; /* size of kva for buffer */ + daddr_t b_lblkno; /* Logical block number. */ + struct vnode *b_vp; /* Device vnode. */ + int b_dirtyoff; /* Offset in buffer of dirty region. */ + int b_dirtyend; /* Offset of end of dirty region. */ + struct ucred *b_rcred; /* Read credentials reference. */ + struct ucred *b_wcred; /* Write credentials reference. */ + void *b_saveaddr; /* Original b_addr for physio. */ + union pager_info { + int pg_reqpage; + } b_pager; + union cluster_info { + TAILQ_HEAD(cluster_list_head, buf) cluster_head; + TAILQ_ENTRY(buf) cluster_entry; + } b_cluster; + struct vm_page *b_pages[btoc(MAXPHYS)]; + int b_npages; + struct workhead b_dep; /* (D) List of filesystem dependencies. 
*/ + void *b_fsprivate1; + void *b_fsprivate2; + void *b_fsprivate3; + int b_pin_count; +}; + +#define b_object b_bufobj->bo_object + +/* + * These flags are kept in b_flags. + * + * Notes: + * + * B_ASYNC VOP calls on bp's are usually async whether or not + * B_ASYNC is set, but some subsystems, such as NFS, like + * to know what is best for the caller so they can + * optimize the I/O. + * + * B_PAGING Indicates that bp is being used by the paging system or + * some paging system and that the bp is not linked into + * the b_vp's clean/dirty linked lists or ref counts. + * Buffer vp reassignments are illegal in this case. + * + * B_CACHE This may only be set if the buffer is entirely valid. + * The situation where B_DELWRI is set and B_CACHE is + * clear MUST be committed to disk by getblk() so + * B_DELWRI can also be cleared. See the comments for + * getblk() in kern/vfs_bio.c. If B_CACHE is clear, + * the caller is expected to clear BIO_ERROR and B_INVAL, + * set BIO_READ, and initiate an I/O. + * + * The 'entire buffer' is defined to be the range from + * 0 through b_bcount. + * + * B_MALLOC Request that the buffer be allocated from the malloc + * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. + * + * B_CLUSTEROK This flag is typically set for B_DELWRI buffers + * by filesystems that allow clustering when the buffer + * is fully dirty and indicates that it may be clustered + * with other adjacent dirty buffers. Note the clustering + * may not be used with the stage 1 data write under NFS + * but may be used for the commit rpc portion. + * + * B_VMIO Indicates that the buffer is tied into an VM object. + * The buffer's data is always PAGE_SIZE aligned even + * if b_bufsize and b_bcount are not. ( b_bufsize is + * always at least DEV_BSIZE aligned, though ). + * + * B_DIRECT Hint that we should attempt to completely free + * the pages underlying the buffer. 
B_DIRECT is + * sticky until the buffer is released and typically + * only has an effect when B_RELBUF is also set. + * + */ + +#define B_AGE 0x00000001 /* Move to age queue when I/O done. */ +#define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ +#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ +#define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ +#define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ +#define B_CACHE 0x00000020 /* Bread found us in the cache. */ +#define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */ +#define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ +#define B_PERSISTENT 0x00000100 /* Perm. ref'ed while EXT2FS mounted. */ +#define B_DONE 0x00000200 /* I/O completed. */ +#define B_EINTR 0x00000400 /* I/O was interrupted */ +#define B_00000800 0x00000800 /* Available flag. */ +#define B_00001000 0x00001000 /* Available flag. */ +#define B_INVAL 0x00002000 /* Does not contain valid info. */ +#define B_00004000 0x00004000 /* Available flag. */ +#define B_NOCACHE 0x00008000 /* Do not cache block after use. */ +#define B_MALLOC 0x00010000 /* malloced b_data */ +#define B_CLUSTEROK 0x00020000 /* May be clustered with adjacent buffers. */ +#define B_000400000 0x00040000 /* Available flag. */ +#define B_000800000 0x00080000 /* Available flag. */ +#define B_00100000 0x00100000 /* Available flag. */ +#define B_DIRTY 0x00200000 /* Needs writing later (in EXT2FS). */ +#define B_RELBUF 0x00400000 /* Release VMIO buffer. */ +#define B_00800000 0x00800000 /* Available flag. */ +#define B_01000000 0x01000000 /* Available flag. */ +#define B_NEEDSGIANT 0x02000000 /* Buffer's vnode needs giant. */ +#define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ +#define B_MANAGED 0x08000000 /* Managed by FS. 
*/ +#define B_RAM 0x10000000 /* Read ahead mark (flag) */ +#define B_VMIO 0x20000000 /* VMIO flag */ +#define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ +#define B_REMFREE 0x80000000 /* Delayed bremfree */ + +#define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34b27" \ + "\33paging\32b25\31b24\30b23\27relbuf\26dirty\25b20" \ + "\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \ + "\15b12\14b11\13eintr\12done\11persist\10delwri\7validsuspwrt" \ + "\6cache\5deferred\4direct\3async\2needcommit\1age" + +/* + * These flags are kept in b_xflags. + */ +#define BX_VNDIRTY 0x00000001 /* On vnode dirty list */ +#define BX_VNCLEAN 0x00000002 /* On vnode clean list */ +#define BX_BKGRDWRITE 0x00000010 /* Do writes in background */ +#define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */ +#define BX_ALTDATA 0x00000040 /* Holds extended data */ + +#define NOOFFSET (-1LL) /* No buffer offset calculated yet */ + +/* + * These flags are kept in b_vflags. + */ +#define BV_SCANNED 0x00000001 /* VOP_FSYNC funcs mark written bufs */ +#define BV_BKGRDINPROG 0x00000002 /* Background write in progress */ +#define BV_BKGRDWAIT 0x00000004 /* Background write waiting */ +#define BV_INFREECNT 0x80000000 /* buf is counted in numfreebufs */ + +#ifdef _KERNEL +/* + * Buffer locking + */ +extern const char *buf_wmesg; /* Default buffer lock message */ +#define BUF_WMESG "bufwait" +#include <freebsd/sys/proc.h> /* XXX for curthread */ +#include <freebsd/sys/mutex.h> + +/* + * Initialize a lock. + */ +#define BUF_LOCKINIT(bp) \ + lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) +/* + * + * Get a lock sleeping uninterruptibly until it becomes available. + */ +#define BUF_LOCK(bp, locktype, interlock) \ + _lockmgr_args(&(bp)->b_lock, (locktype), (interlock), \ + LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ + LOCK_FILE, LOCK_LINE) + +/* + * Get a lock sleeping with the specified interruptibility and timeout. 
+ */ +#define BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo) \ + _lockmgr_args(&(bp)->b_lock, (locktype) | LK_TIMELOCK, \ + (interlock), (wmesg), (PRIBIO + 4) | (catch), (timo), \ + LOCK_FILE, LOCK_LINE) + +/* + * Release a lock. Only the acquiring process may free the lock unless + * it has been handed off to biodone. + */ +#define BUF_UNLOCK(bp) do { \ + KASSERT(((bp)->b_flags & B_REMFREE) == 0, \ + ("BUF_UNLOCK %p while B_REMFREE is still set.", (bp))); \ + \ + (void)_lockmgr_args(&(bp)->b_lock, LK_RELEASE, NULL, \ + LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ + LOCK_FILE, LOCK_LINE); \ +} while (0) + +/* + * Check if a buffer lock is recursed. + */ +#define BUF_LOCKRECURSED(bp) \ + lockmgr_recursed(&(bp)->b_lock) + +/* + * Check if a buffer lock is currently held. + */ +#define BUF_ISLOCKED(bp) \ + lockstatus(&(bp)->b_lock) +/* + * Free a buffer lock. + */ +#define BUF_LOCKFREE(bp) \ + lockdestroy(&(bp)->b_lock) + +/* + * Buffer lock assertions. + */ +#if defined(INVARIANTS) && defined(INVARIANT_SUPPORT) +#define BUF_ASSERT_LOCKED(bp) \ + _lockmgr_assert(&(bp)->b_lock, KA_LOCKED, LOCK_FILE, LOCK_LINE) +#define BUF_ASSERT_SLOCKED(bp) \ + _lockmgr_assert(&(bp)->b_lock, KA_SLOCKED, LOCK_FILE, LOCK_LINE) +#define BUF_ASSERT_XLOCKED(bp) \ + _lockmgr_assert(&(bp)->b_lock, KA_XLOCKED, LOCK_FILE, LOCK_LINE) +#define BUF_ASSERT_UNLOCKED(bp) \ + _lockmgr_assert(&(bp)->b_lock, KA_UNLOCKED, LOCK_FILE, LOCK_LINE) +#define BUF_ASSERT_HELD(bp) +#define BUF_ASSERT_UNHELD(bp) +#else +#define BUF_ASSERT_LOCKED(bp) +#define BUF_ASSERT_SLOCKED(bp) +#define BUF_ASSERT_XLOCKED(bp) +#define BUF_ASSERT_UNLOCKED(bp) +#define BUF_ASSERT_HELD(bp) +#define BUF_ASSERT_UNHELD(bp) +#endif + +#ifdef _SYS_PROC_HH_ /* Avoid #include <freebsd/sys/proc.h> pollution */ +/* + * When initiating asynchronous I/O, change ownership of the lock to the + * kernel. Once done, the lock may legally be released by biodone. 
The + * original owning process can no longer acquire it recursively, but must + * wait until the I/O is completed and the lock has been freed by biodone. + */ +#define BUF_KERNPROC(bp) \ + _lockmgr_disown(&(bp)->b_lock, LOCK_FILE, LOCK_LINE) +#endif + +/* + * Find out if the lock has waiters or not. + */ +#define BUF_LOCKWAITERS(bp) \ + lockmgr_waiters(&(bp)->b_lock) + +#endif /* _KERNEL */ + +struct buf_queue_head { + TAILQ_HEAD(buf_queue, buf) queue; + daddr_t last_pblkno; + struct buf *insert_point; + struct buf *switch_point; +}; + +/* + * This structure describes a clustered I/O. It is stored in the b_saveaddr + * field of the buffer on which I/O is done. At I/O completion, the cluster + * callback uses the structure to parcel I/Os to individual buffers, and + * then frees this structure. + */ +struct cluster_save { + long bs_bcount; /* Saved b_bcount. */ + long bs_bufsize; /* Saved b_bufsize. */ + void *bs_saveaddr; /* Saved b_addr. */ + int bs_nchildren; /* Number of associated buffers. */ + struct buf **bs_children; /* List of associated buffers. 
*/ +}; + +#ifdef _KERNEL + +static __inline int +bwrite(struct buf *bp) +{ + + KASSERT(bp->b_bufobj != NULL, ("bwrite: no bufobj bp=%p", bp)); + KASSERT(bp->b_bufobj->bo_ops != NULL, ("bwrite: no bo_ops bp=%p", bp)); + KASSERT(bp->b_bufobj->bo_ops->bop_write != NULL, + ("bwrite: no bop_write bp=%p", bp)); + return (BO_WRITE(bp->b_bufobj, bp)); +} + +static __inline void +bstrategy(struct buf *bp) +{ + + KASSERT(bp->b_bufobj != NULL, ("bstrategy: no bufobj bp=%p", bp)); + KASSERT(bp->b_bufobj->bo_ops != NULL, + ("bstrategy: no bo_ops bp=%p", bp)); + KASSERT(bp->b_bufobj->bo_ops->bop_strategy != NULL, + ("bstrategy: no bop_strategy bp=%p", bp)); + BO_STRATEGY(bp->b_bufobj, bp); +} + +static __inline void +buf_start(struct buf *bp) +{ + if (bioops.io_start) + (*bioops.io_start)(bp); +} + +static __inline void +buf_complete(struct buf *bp) +{ + if (bioops.io_complete) + (*bioops.io_complete)(bp); +} + +static __inline void +buf_deallocate(struct buf *bp) +{ + if (bioops.io_deallocate) + (*bioops.io_deallocate)(bp); + BUF_LOCKFREE(bp); +} + +static __inline int +buf_countdeps(struct buf *bp, int i) +{ + if (bioops.io_countdeps) + return ((*bioops.io_countdeps)(bp, i)); + else + return (0); +} + +#endif /* _KERNEL */ + +/* + * Zero out the buffer's data area. + */ +#define clrbuf(bp) { \ + bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ + (bp)->b_resid = 0; \ +} + +/* + * Flags for getblk's last parameter. + */ +#define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ +#define GB_NOCREAT 0x0002 /* Don't create a buf if not found. 
*/ +#define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon */ + +#ifdef _KERNEL +extern int nbuf; /* The number of buffer headers */ +extern long maxswzone; /* Max KVA for swap structures */ +extern long maxbcache; /* Max KVA for buffer cache */ +extern long runningbufspace; +extern long hibufspace; +extern int dirtybufthresh; +extern int bdwriteskip; +extern int dirtybufferflushes; +extern int altbufferflushes; +extern int buf_maxio; /* nominal maximum I/O for buffer */ +extern struct buf *buf; /* The buffer headers. */ +extern char *buffers; /* The buffer contents. */ +extern int bufpages; /* Number of memory pages in the buffer pool. */ +extern struct buf *swbuf; /* Swap I/O buffer headers. */ +extern int nswbuf; /* Number of swap I/O buffer headers. */ +extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */ +extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */ + +void runningbufwakeup(struct buf *); +void waitrunningbufspace(void); +caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est); +void bufinit(void); +void bwillwrite(void); +int buf_dirty_count_severe(void); +void bremfree(struct buf *); +void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. 
*/ +int bread(struct vnode *, daddr_t, int, struct ucred *, struct buf **); +void breada(struct vnode *, daddr_t *, int *, int, struct ucred *); +int breadn(struct vnode *, daddr_t, int, daddr_t *, int *, int, + struct ucred *, struct buf **); +void bdwrite(struct buf *); +void bawrite(struct buf *); +void bdirty(struct buf *); +void bundirty(struct buf *); +void bufstrategy(struct bufobj *, struct buf *); +void brelse(struct buf *); +void bqrelse(struct buf *); +int vfs_bio_awrite(struct buf *); +struct buf * getpbuf(int *); +struct buf *incore(struct bufobj *, daddr_t); +struct buf *gbincore(struct bufobj *, daddr_t); +struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); +struct buf *geteblk(int, int); +int bufwait(struct buf *); +int bufwrite(struct buf *); +void bufdone(struct buf *); +void bufdone_finish(struct buf *); + +int cluster_read(struct vnode *, u_quad_t, daddr_t, long, + struct ucred *, long, int, struct buf **); +int cluster_wbuild(struct vnode *, long, daddr_t, int); +void cluster_write(struct vnode *, struct buf *, u_quad_t, int); +void vfs_bio_set_valid(struct buf *, int base, int size); +void vfs_bio_clrbuf(struct buf *); +void vfs_busy_pages(struct buf *, int clear_modify); +void vfs_unbusy_pages(struct buf *); +int vmapbuf(struct buf *); +void vunmapbuf(struct buf *); +void relpbuf(struct buf *, int *); +void brelvp(struct buf *); +void bgetvp(struct vnode *, struct buf *); +void pbgetbo(struct bufobj *bo, struct buf *bp); +void pbgetvp(struct vnode *, struct buf *); +void pbrelbo(struct buf *); +void pbrelvp(struct buf *); +int allocbuf(struct buf *bp, int size); +void reassignbuf(struct buf *); +struct buf *trypbuf(int *); +void bwait(struct buf *, u_char, const char *); +void bdone(struct buf *); +void bpin(struct buf *); +void bunpin(struct buf *); +void bunpin_wait(struct buf *); + +#endif /* _KERNEL */ + +#endif /* !_SYS_BUF_HH_ */ diff --git a/freebsd/sys/mqueue.h b/freebsd/sys/mqueue.h new file mode 100644 index 
00000000..80d40479 --- /dev/null +++ b/freebsd/sys/mqueue.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 2005 David Xu <davidxu@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_MQUEUE_HH_ +#define _SYS_MQUEUE_HH_ + +struct mq_attr { + long mq_flags; /* Message queue flags. */ + long mq_maxmsg; /* Maximum number of messages. */ + long mq_msgsize; /* Maximum message size. */ + long mq_curmsgs; /* Number of messages currently queued. 
*/ + long __reserved[4]; /* Ignored for input, zeroed for output */ +}; + +#ifdef _KERNEL +struct thread; +struct file; +extern void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); +#endif +#endif diff --git a/freebsd/sys/proc.h b/freebsd/sys/proc.h index 39ddd782..04022ee5 100644 --- a/freebsd/sys/proc.h +++ b/freebsd/sys/proc.h @@ -208,7 +208,9 @@ struct thread { TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ +#endif /* __rtems__ */ struct seltd *td_sel; /* Select queue/channel. */ +#ifndef __rtems__ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ @@ -478,7 +480,9 @@ struct proc { TAILQ_HEAD(, thread) p_threads; /* (c) all threads. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ +#endif /* __rtems__ */ struct filedesc *p_fd; /* (b) Open files. */ +#ifndef __rtems__ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Process limits. */ @@ -572,6 +576,9 @@ struct proc { struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */ struct cv p_pwait; /* (*) wait cv for exit/exec */ #else /* __rtems__ */ + struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ + int p_flag; /* (c) P_* flags. */ + struct proc *p_leader; /* (b) */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct mtx p_mtx; /* (n) Lock for this struct. */ rtems_id p_pid; diff --git a/freebsd/sys/tty.h b/freebsd/sys/tty.h new file mode 100644 index 00000000..a56d38fb --- /dev/null +++ b/freebsd/sys/tty.h @@ -0,0 +1,217 @@ +/*- + * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org> + * All rights reserved. 
+ * + * Portions of this software were developed under sponsorship from Snow + * B.V., the Netherlands. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_TTY_HH_ +#define _SYS_TTY_HH_ + +#include <freebsd/sys/param.h> +#include <freebsd/sys/queue.h> +#include <freebsd/sys/lock.h> +#include <freebsd/sys/mutex.h> +#include <freebsd/sys/condvar.h> +#include <freebsd/sys/selinfo.h> +#ifndef __rtems__ +#include <freebsd/sys/termios.h> +#endif +#include <freebsd/sys/ttycom.h> +#include <freebsd/sys/ttyqueue.h> + +struct cdev; +struct file; +struct pgrp; +struct session; +struct ucred; + +struct ttydevsw; + +/* + * Per-TTY structure, containing buffers, etc. 
+ * + * List of locks + * (t) locked by t_mtx + * (l) locked by tty_list_sx + * (c) const until freeing + */ +struct tty { + struct mtx *t_mtx; /* TTY lock. */ + struct mtx t_mtxobj; /* Per-TTY lock (when not borrowing). */ + TAILQ_ENTRY(tty) t_list; /* (l) TTY list entry. */ + unsigned int t_flags; /* (t) Terminal option flags. */ +/* Keep flags in sync with db_show_tty and pstat(8). */ +#define TF_NOPREFIX 0x00001 /* Don't prepend "tty" to device name. */ +#define TF_INITLOCK 0x00002 /* Create init/lock state devices. */ +#define TF_CALLOUT 0x00004 /* Create "cua" devices. */ +#define TF_OPENED_IN 0x00008 /* "tty" node is in use. */ +#define TF_OPENED_OUT 0x00010 /* "cua" node is in use. */ +#define TF_OPENED_CONS 0x00020 /* Device in use as console. */ +#define TF_OPENED (TF_OPENED_IN|TF_OPENED_OUT|TF_OPENED_CONS) +#define TF_GONE 0x00040 /* Device node is gone. */ +#define TF_OPENCLOSE 0x00080 /* Device is in open()/close(). */ +#define TF_ASYNC 0x00100 /* Asynchronous I/O enabled. */ +#define TF_LITERAL 0x00200 /* Accept the next character literally. */ +#define TF_HIWAT_IN 0x00400 /* We've reached the input watermark. */ +#define TF_HIWAT_OUT 0x00800 /* We've reached the output watermark. */ +#define TF_HIWAT (TF_HIWAT_IN|TF_HIWAT_OUT) +#define TF_STOPPED 0x01000 /* Output flow control - stopped. */ +#define TF_EXCLUDE 0x02000 /* Exclusive access. */ +#define TF_BYPASS 0x04000 /* Optimized input path. */ +#define TF_ZOMBIE 0x08000 /* Modem disconnect received. */ +#define TF_HOOK 0x10000 /* TTY has hook attached. */ +#define TF_BUSY_IN 0x20000 /* Process busy in read() -- not supported. */ +#define TF_BUSY_OUT 0x40000 /* Process busy in write(). */ +#define TF_BUSY (TF_BUSY_IN|TF_BUSY_OUT) + unsigned int t_revokecnt; /* (t) revoke() count. */ + + /* Buffering mechanisms. */ + struct ttyinq t_inq; /* (t) Input queue. */ + size_t t_inlow; /* (t) Input low watermark. */ + struct ttyoutq t_outq; /* (t) Output queue. 
*/ + size_t t_outlow; /* (t) Output low watermark. */ + + /* Sleeping mechanisms. */ + struct cv t_inwait; /* (t) Input wait queue. */ + struct cv t_outwait; /* (t) Output wait queue. */ + struct cv t_outserwait; /* (t) Serial output wait queue. */ + struct cv t_bgwait; /* (t) Background wait queue. */ + struct cv t_dcdwait; /* (t) Carrier Detect wait queue. */ + + /* Polling mechanisms. */ + struct selinfo t_inpoll; /* (t) Input poll queue. */ + struct selinfo t_outpoll; /* (t) Output poll queue. */ + struct sigio *t_sigio; /* (t) Asynchronous I/O. */ + + struct termios t_termios; /* (t) I/O processing flags. */ + struct winsize t_winsize; /* (t) Window size. */ + unsigned int t_column; /* (t) Current cursor position. */ + unsigned int t_writepos; /* (t) Where input was interrupted. */ + int t_compatflags; /* (t) COMPAT_43TTY flags. */ + + /* Init/lock-state devices. */ + struct termios t_termios_init_in; /* tty%s.init. */ + struct termios t_termios_lock_in; /* tty%s.lock. */ + struct termios t_termios_init_out; /* cua%s.init. */ + struct termios t_termios_lock_out; /* cua%s.lock. */ + + struct ttydevsw *t_devsw; /* (c) Driver hooks. */ + struct ttyhook *t_hook; /* (t) Capture/inject hook. */ + + /* Process signal delivery. */ + struct pgrp *t_pgrp; /* (t) Foreground process group. */ + struct session *t_session; /* (t) Associated session. */ + unsigned int t_sessioncnt; /* (t) Backpointing sessions. */ + + void *t_devswsoftc; /* (c) Soft config, for drivers. */ + void *t_hooksoftc; /* (t) Soft config, for hooks. */ + struct cdev *t_dev; /* (c) Primary character device. */ +}; + +/* + * Userland version of struct tty, for sysctl kern.ttys + */ +struct xtty { + size_t xt_size; /* Structure size. */ + size_t xt_insize; /* Input queue size. */ + size_t xt_incc; /* Canonicalized characters. */ + size_t xt_inlc; /* Input line characters. */ + size_t xt_inlow; /* Input low watermark. */ + size_t xt_outsize; /* Output queue size. 
*/ + size_t xt_outcc; /* Output queue usage. */ + size_t xt_outlow; /* Output low watermark. */ + unsigned int xt_column; /* Current column position. */ + pid_t xt_pgid; /* Foreground process group. */ + pid_t xt_sid; /* Session. */ + unsigned int xt_flags; /* Terminal option flags. */ + dev_t xt_dev; /* Userland device. */ +}; + +#ifdef _KERNEL + +/* Allocation and deallocation. */ +struct tty *tty_alloc(struct ttydevsw *tsw, void *softc); +struct tty *tty_alloc_mutex(struct ttydevsw *tsw, void *softc, struct mtx *mtx); +void tty_rel_pgrp(struct tty *tp, struct pgrp *pgrp); +void tty_rel_sess(struct tty *tp, struct session *sess); +void tty_rel_gone(struct tty *tp); + +#define tty_lock(tp) mtx_lock((tp)->t_mtx) +#define tty_unlock(tp) mtx_unlock((tp)->t_mtx) +#define tty_lock_assert(tp,ma) mtx_assert((tp)->t_mtx, (ma)) +#define tty_getlock(tp) ((tp)->t_mtx) + +/* Device node creation. */ +void tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...) + __printflike(3, 4); +#define tty_makealias(tp,fmt,...) \ + make_dev_alias((tp)->t_dev, fmt, ## __VA_ARGS__) + +/* Signalling processes. */ +void tty_signal_sessleader(struct tty *tp, int signal); +void tty_signal_pgrp(struct tty *tp, int signal); +/* Waking up readers/writers. */ +int tty_wait(struct tty *tp, struct cv *cv); +int tty_timedwait(struct tty *tp, struct cv *cv, int timo); +void tty_wakeup(struct tty *tp, int flags); + +/* System messages. 
*/ +int tty_checkoutq(struct tty *tp); +int tty_putchar(struct tty *tp, char c); + +int tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, + struct thread *td); +int tty_ioctl_compat(struct tty *tp, u_long cmd, caddr_t data, + int fflag, struct thread *td); +void tty_init_console(struct tty *tp, speed_t speed); +void tty_flush(struct tty *tp, int flags); +void tty_hiwat_in_block(struct tty *tp); +void tty_hiwat_in_unblock(struct tty *tp); +dev_t tty_udev(struct tty *tp); +#define tty_opened(tp) ((tp)->t_flags & TF_OPENED) +#define tty_gone(tp) ((tp)->t_flags & TF_GONE) +#define tty_softc(tp) ((tp)->t_devswsoftc) +#define tty_devname(tp) devtoname((tp)->t_dev) + +/* Status line printing. */ +void tty_info(struct tty *tp); + +/* /dev/console selection. */ +void ttyconsdev_select(const char *name); + +/* Pseudo-terminal hooks. */ +int pts_alloc_external(int fd, struct thread *td, struct file *fp, + struct cdev *dev, const char *name); + +/* Drivers and line disciplines also need to call these. */ +#include <freebsd/sys/ttydisc.h> +#include <freebsd/sys/ttydevsw.h> +#include <freebsd/sys/ttyhook.h> +#endif /* _KERNEL */ + +#endif /* !_SYS_TTY_HH_ */ diff --git a/freebsd/sys/ttydevsw.h b/freebsd/sys/ttydevsw.h new file mode 100644 index 00000000..a4035585 --- /dev/null +++ b/freebsd/sys/ttydevsw.h @@ -0,0 +1,169 @@ +/*- + * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org> + * All rights reserved. + * + * Portions of this software were developed under sponsorship from Snow + * B.V., the Netherlands. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_TTYDEVSW_HH_ +#define _SYS_TTYDEVSW_HH_ + +#ifndef _SYS_TTY_HH_ +#error "can only be included through <sys/tty.h>" +#endif /* !_SYS_TTY_HH_ */ + +/* + * Driver routines that are called from the line discipline to adjust + * hardware parameters and such. + */ +typedef int tsw_open_t(struct tty *tp); +typedef void tsw_close_t(struct tty *tp); +typedef void tsw_outwakeup_t(struct tty *tp); +typedef void tsw_inwakeup_t(struct tty *tp); +typedef int tsw_ioctl_t(struct tty *tp, u_long cmd, caddr_t data, + struct thread *td); +typedef int tsw_param_t(struct tty *tp, struct termios *t); +typedef int tsw_modem_t(struct tty *tp, int sigon, int sigoff); +typedef int tsw_mmap_t(struct tty *tp, vm_offset_t offset, + vm_paddr_t * paddr, int nprot); +typedef void tsw_pktnotify_t(struct tty *tp, char event); +typedef void tsw_free_t(void *softc); + +struct ttydevsw { + unsigned int tsw_flags; /* Default TTY flags. */ + + tsw_open_t *tsw_open; /* Device opening. 
*/ + tsw_close_t *tsw_close; /* Device closure. */ + + tsw_outwakeup_t *tsw_outwakeup; /* Output available. */ + tsw_inwakeup_t *tsw_inwakeup; /* Input can be stored again. */ + + tsw_ioctl_t *tsw_ioctl; /* ioctl() hooks. */ + tsw_param_t *tsw_param; /* TIOCSETA device parameter setting. */ + tsw_modem_t *tsw_modem; /* Modem sigon/sigoff. */ + + tsw_mmap_t *tsw_mmap; /* mmap() hooks. */ + tsw_pktnotify_t *tsw_pktnotify; /* TIOCPKT events. */ + + tsw_free_t *tsw_free; /* Destructor. */ +}; + +static __inline int +ttydevsw_open(struct tty *tp) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + return tp->t_devsw->tsw_open(tp); +} + +static __inline void +ttydevsw_close(struct tty *tp) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + tp->t_devsw->tsw_close(tp); +} + +static __inline void +ttydevsw_outwakeup(struct tty *tp) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + /* Prevent spurious wakeups. */ + if (ttydisc_getc_poll(tp) == 0) + return; + + tp->t_devsw->tsw_outwakeup(tp); +} + +static __inline void +ttydevsw_inwakeup(struct tty *tp) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + /* Prevent spurious wakeups. 
*/ + if (tp->t_flags & TF_HIWAT_IN) + return; + + tp->t_devsw->tsw_inwakeup(tp); +} + +static __inline int +ttydevsw_ioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + return tp->t_devsw->tsw_ioctl(tp, cmd, data, td); +} + +static __inline int +ttydevsw_param(struct tty *tp, struct termios *t) +{ + MPASS(!tty_gone(tp)); + + return tp->t_devsw->tsw_param(tp, t); +} + +static __inline int +ttydevsw_modem(struct tty *tp, int sigon, int sigoff) +{ + MPASS(!tty_gone(tp)); + + return tp->t_devsw->tsw_modem(tp, sigon, sigoff); +} + +static __inline int +ttydevsw_mmap(struct tty *tp, vm_offset_t offset, vm_paddr_t *paddr, int nprot) +{ + MPASS(!tty_gone(tp)); + + return tp->t_devsw->tsw_mmap(tp, offset, paddr, nprot); +} + +static __inline void +ttydevsw_pktnotify(struct tty *tp, char event) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + tp->t_devsw->tsw_pktnotify(tp, event); +} + +static __inline void +ttydevsw_free(struct tty *tp) +{ + MPASS(tty_gone(tp)); + + tp->t_devsw->tsw_free(tty_softc(tp)); +} + +#endif /* !_SYS_TTYDEVSW_HH_ */ diff --git a/freebsd/sys/ttydisc.h b/freebsd/sys/ttydisc.h new file mode 100644 index 00000000..00194988 --- /dev/null +++ b/freebsd/sys/ttydisc.h @@ -0,0 +1,86 @@ +/*- + * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org> + * All rights reserved. + * + * Portions of this software were developed under sponsorship from Snow + * B.V., the Netherlands. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_TTYDISC_HH_ +#define _SYS_TTYDISC_HH_ + +#ifndef _SYS_TTY_HH_ +#error "can only be included through <sys/tty.h>" +#endif /* !_SYS_TTY_HH_ */ + +struct cv; +struct thread; +struct tty; +struct uio; + +/* Top half routines. */ +void ttydisc_open(struct tty *tp); +void ttydisc_close(struct tty *tp); +int ttydisc_read(struct tty *tp, struct uio *uio, int ioflag); +int ttydisc_write(struct tty *tp, struct uio *uio, int ioflag); +void ttydisc_optimize(struct tty *tp); + +/* Bottom half routines. 
*/ +void ttydisc_modem(struct tty *tp, int open); +#define ttydisc_can_bypass(tp) ((tp)->t_flags & TF_BYPASS) +int ttydisc_rint(struct tty *tp, char c, int flags); +size_t ttydisc_rint_bypass(struct tty *tp, const void *buf, size_t len); +void ttydisc_rint_done(struct tty *tp); +size_t ttydisc_rint_poll(struct tty *tp); +size_t ttydisc_getc(struct tty *tp, void *buf, size_t len); +int ttydisc_getc_uio(struct tty *tp, struct uio *uio); +size_t ttydisc_getc_poll(struct tty *tp); + +/* Error codes for ttydisc_rint(). */ +#define TRE_FRAMING 0x01 +#define TRE_PARITY 0x02 +#define TRE_OVERRUN 0x04 +#define TRE_BREAK 0x08 + +static __inline size_t +ttydisc_read_poll(struct tty *tp) +{ + + tty_lock_assert(tp, MA_OWNED); + + return ttyinq_bytescanonicalized(&tp->t_inq); +} + +static __inline size_t +ttydisc_write_poll(struct tty *tp) +{ + + tty_lock_assert(tp, MA_OWNED); + + return ttyoutq_bytesleft(&tp->t_outq); +} + +#endif /* !_SYS_TTYDISC_HH_ */ diff --git a/freebsd/sys/ttyhook.h b/freebsd/sys/ttyhook.h new file mode 100644 index 00000000..a15fbbb7 --- /dev/null +++ b/freebsd/sys/ttyhook.h @@ -0,0 +1,147 @@ +/*- + * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_TTYHOOK_HH_ +#define _SYS_TTYHOOK_HH_ + +#ifndef _SYS_TTY_HH_ +#error "can only be included through <sys/tty.h>" +#endif /* !_SYS_TTY_HH_ */ + +struct tty; + +/* + * Hooks interface, which allows capturing and injecting traffic into the + * input and output paths of a TTY. + */ + +typedef int th_rint_t(struct tty *tp, char c, int flags); +typedef size_t th_rint_bypass_t(struct tty *tp, const void *buf, size_t len); +typedef void th_rint_done_t(struct tty *tp); +typedef size_t th_rint_poll_t(struct tty *tp); + +typedef size_t th_getc_inject_t(struct tty *tp, void *buf, size_t len); +typedef void th_getc_capture_t(struct tty *tp, const void *buf, size_t len); +typedef size_t th_getc_poll_t(struct tty *tp); + +typedef void th_close_t(struct tty *tp); + +struct ttyhook { + /* Character input. */ + th_rint_t *th_rint; + th_rint_bypass_t *th_rint_bypass; + th_rint_done_t *th_rint_done; + th_rint_poll_t *th_rint_poll; + + /* Character output. 
*/ + th_getc_inject_t *th_getc_inject; + th_getc_capture_t *th_getc_capture; + th_getc_poll_t *th_getc_poll; + + th_close_t *th_close; +}; + +int ttyhook_register(struct tty **, struct proc *, int, + struct ttyhook *, void *); +void ttyhook_unregister(struct tty *); +#define ttyhook_softc(tp) ((tp)->t_hooksoftc) +#define ttyhook_hashook(tp,hook) ((tp)->t_hook != NULL && \ + (tp)->t_hook->th_ ## hook != NULL) + +static __inline int +ttyhook_rint(struct tty *tp, char c, int flags) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + return tp->t_hook->th_rint(tp, c, flags); +} + +static __inline size_t +ttyhook_rint_bypass(struct tty *tp, const void *buf, size_t len) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + return tp->t_hook->th_rint_bypass(tp, buf, len); +} + +static __inline void +ttyhook_rint_done(struct tty *tp) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + tp->t_hook->th_rint_done(tp); +} + +static __inline size_t +ttyhook_rint_poll(struct tty *tp) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + return tp->t_hook->th_rint_poll(tp); +} + +static __inline size_t +ttyhook_getc_inject(struct tty *tp, void *buf, size_t len) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + return tp->t_hook->th_getc_inject(tp, buf, len); +} + +static __inline void +ttyhook_getc_capture(struct tty *tp, const void *buf, size_t len) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + tp->t_hook->th_getc_capture(tp, buf, len); +} + +static __inline size_t +ttyhook_getc_poll(struct tty *tp) +{ + tty_lock_assert(tp, MA_OWNED); + MPASS(!tty_gone(tp)); + + return tp->t_hook->th_getc_poll(tp); +} + +static __inline void +ttyhook_close(struct tty *tp) +{ + tty_lock_assert(tp, MA_OWNED); + + tp->t_hook->th_close(tp); +} + +#endif /* !_SYS_TTYHOOK_HH_ */ diff --git a/freebsd/sys/ttyqueue.h b/freebsd/sys/ttyqueue.h new file mode 100644 index 00000000..b9228bdc --- /dev/null +++ b/freebsd/sys/ttyqueue.h 
@@ -0,0 +1,178 @@ +/*- + * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org> + * All rights reserved. + * + * Portions of this software were developed under sponsorship from Snow + * B.V., the Netherlands. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_TTYQUEUE_HH_ +#define _SYS_TTYQUEUE_HH_ + +#ifndef _SYS_TTY_HH_ +#error "can only be included through <sys/tty.h>" +#endif /* !_SYS_TTY_HH_ */ + +struct tty; +struct ttyinq_block; +struct ttyoutq_block; +struct uio; + +/* Data input queue. 
*/ +struct ttyinq { + struct ttyinq_block *ti_firstblock; + struct ttyinq_block *ti_startblock; + struct ttyinq_block *ti_reprintblock; + struct ttyinq_block *ti_lastblock; + unsigned int ti_begin; + unsigned int ti_linestart; + unsigned int ti_reprint; + unsigned int ti_end; + unsigned int ti_nblocks; + unsigned int ti_quota; +}; +#define TTYINQ_DATASIZE 128 + +/* Data output queue. */ +struct ttyoutq { + struct ttyoutq_block *to_firstblock; + struct ttyoutq_block *to_lastblock; + unsigned int to_begin; + unsigned int to_end; + unsigned int to_nblocks; + unsigned int to_quota; +}; +#define TTYOUTQ_DATASIZE (256 - sizeof(struct ttyoutq_block *)) + +#ifdef _KERNEL +/* Input queue handling routines. */ +void ttyinq_setsize(struct ttyinq *ti, struct tty *tp, size_t len); +void ttyinq_free(struct ttyinq *ti); +int ttyinq_read_uio(struct ttyinq *ti, struct tty *tp, struct uio *uio, + size_t readlen, size_t flushlen); +size_t ttyinq_write(struct ttyinq *ti, const void *buf, size_t len, + int quote); +int ttyinq_write_nofrag(struct ttyinq *ti, const void *buf, size_t len, + int quote); +void ttyinq_canonicalize(struct ttyinq *ti); +size_t ttyinq_findchar(struct ttyinq *ti, const char *breakc, size_t maxlen, + char *lastc); +void ttyinq_flush(struct ttyinq *ti); +int ttyinq_peekchar(struct ttyinq *ti, char *c, int *quote); +void ttyinq_unputchar(struct ttyinq *ti); +void ttyinq_reprintpos_set(struct ttyinq *ti); +void ttyinq_reprintpos_reset(struct ttyinq *ti); + +static __inline size_t +ttyinq_getsize(struct ttyinq *ti) +{ + return (ti->ti_nblocks * TTYINQ_DATASIZE); +} + +static __inline size_t +ttyinq_getallocatedsize(struct ttyinq *ti) +{ + + return (ti->ti_quota * TTYINQ_DATASIZE); +} + +static __inline size_t +ttyinq_bytesleft(struct ttyinq *ti) +{ + size_t len; + + /* Make sure the usage never exceeds the length. 
*/ + len = ti->ti_nblocks * TTYINQ_DATASIZE; + MPASS(len >= ti->ti_end); + + return (len - ti->ti_end); +} + +static __inline size_t +ttyinq_bytescanonicalized(struct ttyinq *ti) +{ + MPASS(ti->ti_begin <= ti->ti_linestart); + + return (ti->ti_linestart - ti->ti_begin); +} + +static __inline size_t +ttyinq_bytesline(struct ttyinq *ti) +{ + MPASS(ti->ti_linestart <= ti->ti_end); + + return (ti->ti_end - ti->ti_linestart); +} + +/* Input buffer iteration. */ +typedef void ttyinq_line_iterator_t(void *data, char c, int flags); +void ttyinq_line_iterate_from_linestart(struct ttyinq *ti, + ttyinq_line_iterator_t *iterator, void *data); +void ttyinq_line_iterate_from_reprintpos(struct ttyinq *ti, + ttyinq_line_iterator_t *iterator, void *data); + +/* Output queue handling routines. */ +void ttyoutq_flush(struct ttyoutq *to); +void ttyoutq_setsize(struct ttyoutq *to, struct tty *tp, size_t len); +void ttyoutq_free(struct ttyoutq *to); +size_t ttyoutq_read(struct ttyoutq *to, void *buf, size_t len); +int ttyoutq_read_uio(struct ttyoutq *to, struct tty *tp, struct uio *uio); +size_t ttyoutq_write(struct ttyoutq *to, const void *buf, size_t len); +int ttyoutq_write_nofrag(struct ttyoutq *to, const void *buf, size_t len); + +static __inline size_t +ttyoutq_getsize(struct ttyoutq *to) +{ + return (to->to_nblocks * TTYOUTQ_DATASIZE); +} + +static __inline size_t +ttyoutq_getallocatedsize(struct ttyoutq *to) +{ + + return (to->to_quota * TTYOUTQ_DATASIZE); +} + +static __inline size_t +ttyoutq_bytesleft(struct ttyoutq *to) +{ + size_t len; + + /* Make sure the usage never exceeds the length. 
*/ + len = to->to_nblocks * TTYOUTQ_DATASIZE; + MPASS(len >= to->to_end); + + return (len - to->to_end); +} + +static __inline size_t +ttyoutq_bytesused(struct ttyoutq *to) +{ + return (to->to_end - to->to_begin); +} +#endif /* _KERNEL */ + +#endif /* !_SYS_TTYQUEUE_HH_ */ diff --git a/freebsd/sys/user.h b/freebsd/sys/user.h new file mode 100644 index 00000000..df788c06 --- /dev/null +++ b/freebsd/sys/user.h @@ -0,0 +1,414 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. + * Copyright (c) 2007 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)user.h 8.2 (Berkeley) 9/23/93 + * $FreeBSD$ + */ + +#ifndef _SYS_USER_HH_ +#define _SYS_USER_HH_ + +#include <freebsd/machine/pcb.h> +#ifndef _KERNEL +/* stuff that *used* to be included by user.h, or is now needed */ +#include <freebsd/sys/errno.h> +#include <freebsd/sys/time.h> +#include <freebsd/sys/resource.h> +#include <freebsd/sys/ucred.h> +#include <freebsd/sys/uio.h> +#include <freebsd/sys/queue.h> +#include <freebsd/sys/_lock.h> +#include <freebsd/sys/_mutex.h> +#include <freebsd/sys/proc.h> +#include <freebsd/vm/vm.h> /* XXX */ +#include <freebsd/vm/vm_param.h> /* XXX */ +#include <freebsd/vm/pmap.h> /* XXX */ +#include <freebsd/vm/vm_map.h> /* XXX */ +#endif /* !_KERNEL */ +#ifndef _SYS_RESOURCEVAR_HH_ +#include <freebsd/sys/resourcevar.h> +#endif +#ifndef _SYS_SIGNALVAR_HH_ +#include <freebsd/sys/signalvar.h> +#endif +#ifndef _SYS_SOCKET_VAR_HH_ +#include <freebsd/sys/socket.h> +#endif + +/* + * KERN_PROC subtype ops return arrays of selected proc structure entries: + * + * This struct includes several arrays of spare space, with different arrays + * for different standard C-types. When adding new variables to this struct, + * the space for byte-aligned data should be taken from the ki_sparestring, + * pointers from ki_spareptrs, word-aligned data from ki_spareints, and + * doubleword-aligned data from ki_sparelongs. 
Make sure the space for new + * variables comes from the array which matches the size and alignment of + * those variables on ALL hardware platforms, and then adjust the appropriate + * KI_NSPARE_* value(s) to match. + * + * Always verify that sizeof(struct kinfo_proc) == KINFO_PROC_SIZE on all + * platforms after you have added new variables. Note that if you change + * the value of KINFO_PROC_SIZE, then many userland programs will stop + * working until they are recompiled! + * + * Once you have added the new field, you will need to add code to initialize + * it in two places: function fill_kinfo_proc in sys/kern/kern_proc.c and + * function kvm_proclist in lib/libkvm/kvm_proc.c. + */ +#define KI_NSPARE_INT 9 +#define KI_NSPARE_LONG 12 +#define KI_NSPARE_PTR 6 + +#ifndef _KERNEL +#ifndef KINFO_PROC_SIZE +#error "Unknown architecture" +#endif +#endif /* !_KERNEL */ + +#define WMESGLEN 8 /* size of returned wchan message */ +#define LOCKNAMELEN 8 /* size of returned lock name */ +#define OCOMMLEN 16 /* size of returned thread name */ +#define COMMLEN 19 /* size of returned ki_comm name */ +#define KI_EMULNAMELEN 16 /* size of returned ki_emul */ +#define KI_NGROUPS 16 /* number of groups in ki_groups */ +#define LOGNAMELEN 17 /* size of returned ki_login */ + +/* + * Steal a bit from ki_cr_flags (cr_flags is never used) to indicate + * that the cred had more than KI_NGROUPS groups. 
+ */ +#define KI_CRF_GRP_OVERFLOW 0x80000000 + +struct kinfo_proc { + int ki_structsize; /* size of this structure */ + int ki_layout; /* reserved: layout identifier */ + struct pargs *ki_args; /* address of command arguments */ + struct proc *ki_paddr; /* address of proc */ + struct user *ki_addr; /* kernel virtual addr of u-area */ + struct vnode *ki_tracep; /* pointer to trace file */ + struct vnode *ki_textvp; /* pointer to executable file */ + struct filedesc *ki_fd; /* pointer to open file info */ + struct vmspace *ki_vmspace; /* pointer to kernel vmspace struct */ + void *ki_wchan; /* sleep address */ + pid_t ki_pid; /* Process identifier */ + pid_t ki_ppid; /* parent process id */ + pid_t ki_pgid; /* process group id */ + pid_t ki_tpgid; /* tty process group id */ + pid_t ki_sid; /* Process session ID */ + pid_t ki_tsid; /* Terminal session ID */ + short ki_jobc; /* job control counter */ + short ki_spare_short1; /* unused (just here for alignment) */ + dev_t ki_tdev; /* controlling tty dev */ + sigset_t ki_siglist; /* Signals arrived but not delivered */ + sigset_t ki_sigmask; /* Current signal mask */ + sigset_t ki_sigignore; /* Signals being ignored */ + sigset_t ki_sigcatch; /* Signals being caught by user */ + uid_t ki_uid; /* effective user id */ + uid_t ki_ruid; /* Real user id */ + uid_t ki_svuid; /* Saved effective user id */ + gid_t ki_rgid; /* Real group id */ + gid_t ki_svgid; /* Saved effective group id */ + short ki_ngroups; /* number of groups */ + short ki_spare_short2; /* unused (just here for alignment) */ + gid_t ki_groups[KI_NGROUPS]; /* groups */ + vm_size_t ki_size; /* virtual size */ + segsz_t ki_rssize; /* current resident set size in pages */ + segsz_t ki_swrss; /* resident set size before last swap */ + segsz_t ki_tsize; /* text size (pages) XXX */ + segsz_t ki_dsize; /* data size (pages) XXX */ + segsz_t ki_ssize; /* stack size (pages) */ + u_short ki_xstat; /* Exit status for wait & stop signal */ + u_short ki_acflag; /* 
Accounting flags */ + fixpt_t ki_pctcpu; /* %cpu for process during ki_swtime */ + u_int ki_estcpu; /* Time averaged value of ki_cpticks */ + u_int ki_slptime; /* Time since last blocked */ + u_int ki_swtime; /* Time swapped in or out */ + int ki_spareint1; /* unused (just here for alignment) */ + u_int64_t ki_runtime; /* Real time in microsec */ + struct timeval ki_start; /* starting time */ + struct timeval ki_childtime; /* time used by process children */ + long ki_flag; /* P_* flags */ + long ki_kiflag; /* KI_* flags (below) */ + int ki_traceflag; /* Kernel trace points */ + char ki_stat; /* S* process status */ + signed char ki_nice; /* Process "nice" value */ + char ki_lock; /* Process lock (prevent swap) count */ + char ki_rqindex; /* Run queue index */ + u_char ki_oncpu; /* Which cpu we are on */ + u_char ki_lastcpu; /* Last cpu we were on */ + char ki_ocomm[OCOMMLEN+1]; /* thread name */ + char ki_wmesg[WMESGLEN+1]; /* wchan message */ + char ki_login[LOGNAMELEN+1]; /* setlogin name */ + char ki_lockname[LOCKNAMELEN+1]; /* lock name */ + char ki_comm[COMMLEN+1]; /* command name */ + char ki_emul[KI_EMULNAMELEN+1]; /* emulation name */ + /* + * When adding new variables, take space for char-strings from the + * front of ki_sparestrings, and ints from the end of ki_spareints. + * That way the spare room from both arrays will remain contiguous. 
+ */ + char ki_sparestrings[68]; /* spare string space */ + int ki_spareints[KI_NSPARE_INT]; /* spare room for growth */ + u_int ki_cr_flags; /* Credential flags */ + int ki_jid; /* Process jail ID */ + int ki_numthreads; /* XXXKSE number of threads in total */ + lwpid_t ki_tid; /* XXXKSE thread id */ + struct priority ki_pri; /* process priority */ + struct rusage ki_rusage; /* process rusage statistics */ + /* XXX - most fields in ki_rusage_ch are not (yet) filled in */ + struct rusage ki_rusage_ch; /* rusage of children processes */ + struct pcb *ki_pcb; /* kernel virtual addr of pcb */ + void *ki_kstack; /* kernel virtual addr of stack */ + void *ki_udata; /* User convenience pointer */ + struct thread *ki_tdaddr; /* address of thread */ + /* + * When adding new variables, take space for pointers from the + * front of ki_spareptrs, and longs from the end of ki_sparelongs. + * That way the spare room from both arrays will remain contiguous. + */ + void *ki_spareptrs[KI_NSPARE_PTR]; /* spare room for growth */ + long ki_sparelongs[KI_NSPARE_LONG]; /* spare room for growth */ + long ki_sflag; /* PS_* flags */ + long ki_tdflags; /* XXXKSE kthread flag */ +}; +void fill_kinfo_proc(struct proc *, struct kinfo_proc *); +/* XXX - the following two defines are temporary */ +#define ki_childstime ki_rusage_ch.ru_stime +#define ki_childutime ki_rusage_ch.ru_utime + +/* + * Legacy PS_ flag. This moved to p_flag but is maintained for + * compatibility. + */ +#define PS_INMEM 0x00001 /* Loaded into memory. */ + +/* ki_sessflag values */ +#define KI_CTTY 0x00000001 /* controlling tty vnode active */ +#define KI_SLEADER 0x00000002 /* session leader */ +#define KI_LOCKBLOCK 0x00000004 /* proc blocked on lock ki_lockname */ + +/* + * This used to be the per-process structure containing data that + * isn't needed in core when the process is swapped out, but now it + * remains only for the benefit of a.out core dumps. 
+ */ +struct user { + struct pstats u_stats; /* *p_stats */ + struct kinfo_proc u_kproc; /* eproc */ +}; + +/* + * The KERN_PROC_FILE sysctl allows a process to dump the file descriptor + * array of another process. + */ +#define KF_TYPE_NONE 0 +#define KF_TYPE_VNODE 1 +#define KF_TYPE_SOCKET 2 +#define KF_TYPE_PIPE 3 +#define KF_TYPE_FIFO 4 +#define KF_TYPE_KQUEUE 5 +#define KF_TYPE_CRYPTO 6 +#define KF_TYPE_MQUEUE 7 +#define KF_TYPE_SHM 8 +#define KF_TYPE_SEM 9 +#define KF_TYPE_PTS 10 +#define KF_TYPE_UNKNOWN 255 + +#define KF_VTYPE_VNON 0 +#define KF_VTYPE_VREG 1 +#define KF_VTYPE_VDIR 2 +#define KF_VTYPE_VBLK 3 +#define KF_VTYPE_VCHR 4 +#define KF_VTYPE_VLNK 5 +#define KF_VTYPE_VSOCK 6 +#define KF_VTYPE_VFIFO 7 +#define KF_VTYPE_VBAD 8 +#define KF_VTYPE_UNKNOWN 255 + +#define KF_FD_TYPE_CWD -1 /* Current working directory */ +#define KF_FD_TYPE_ROOT -2 /* Root directory */ +#define KF_FD_TYPE_JAIL -3 /* Jail directory */ + +#define KF_FLAG_READ 0x00000001 +#define KF_FLAG_WRITE 0x00000002 +#define KF_FLAG_APPEND 0x00000004 +#define KF_FLAG_ASYNC 0x00000008 +#define KF_FLAG_FSYNC 0x00000010 +#define KF_FLAG_NONBLOCK 0x00000020 +#define KF_FLAG_DIRECT 0x00000040 +#define KF_FLAG_HASLOCK 0x00000080 + +/* + * Old format. Has variable hidden padding due to alignment. + * This is a compatibility hack for pre-build 7.1 packages. + */ +#if defined(__amd64__) +#define KINFO_OFILE_SIZE 1328 +#endif +#if defined(__i386__) +#define KINFO_OFILE_SIZE 1324 +#endif + +struct kinfo_ofile { + int kf_structsize; /* Size of kinfo_file. */ + int kf_type; /* Descriptor type. */ + int kf_fd; /* Array index. */ + int kf_ref_count; /* Reference count. */ + int kf_flags; /* Flags. */ + /* XXX Hidden alignment padding here on amd64 */ + off_t kf_offset; /* Seek location. */ + int kf_vnode_type; /* Vnode type. */ + int kf_sock_domain; /* Socket domain. */ + int kf_sock_type; /* Socket type. */ + int kf_sock_protocol; /* Socket protocol. 
*/ + char kf_path[PATH_MAX]; /* Path to file, if any. */ + struct sockaddr_storage kf_sa_local; /* Socket address. */ + struct sockaddr_storage kf_sa_peer; /* Peer address. */ +}; + +#if defined(__amd64__) || defined(__i386__) +#define KINFO_FILE_SIZE 1392 +#endif + +struct kinfo_file { + int kf_structsize; /* Variable size of record. */ + int kf_type; /* Descriptor type. */ + int kf_fd; /* Array index. */ + int kf_ref_count; /* Reference count. */ + int kf_flags; /* Flags. */ + int _kf_pad0; /* Round to 64 bit alignment */ + int64_t kf_offset; /* Seek location. */ + int kf_vnode_type; /* Vnode type. */ + int kf_sock_domain; /* Socket domain. */ + int kf_sock_type; /* Socket type. */ + int kf_sock_protocol; /* Socket protocol. */ + struct sockaddr_storage kf_sa_local; /* Socket address. */ + struct sockaddr_storage kf_sa_peer; /* Peer address. */ + int _kf_ispare[16]; /* Space for more stuff. */ + /* Truncated before copyout in sysctl */ + char kf_path[PATH_MAX]; /* Path to file, if any. */ +}; + +/* + * The KERN_PROC_VMMAP sysctl allows a process to dump the VM layout of + * another process as a series of entries. + */ +#define KVME_TYPE_NONE 0 +#define KVME_TYPE_DEFAULT 1 +#define KVME_TYPE_VNODE 2 +#define KVME_TYPE_SWAP 3 +#define KVME_TYPE_DEVICE 4 +#define KVME_TYPE_PHYS 5 +#define KVME_TYPE_DEAD 6 +#define KVME_TYPE_SG 7 +#define KVME_TYPE_UNKNOWN 255 + +#define KVME_PROT_READ 0x00000001 +#define KVME_PROT_WRITE 0x00000002 +#define KVME_PROT_EXEC 0x00000004 + +#define KVME_FLAG_COW 0x00000001 +#define KVME_FLAG_NEEDS_COPY 0x00000002 +#define KVME_FLAG_NOCOREDUMP 0x00000004 + +#if defined(__amd64__) +#define KINFO_OVMENTRY_SIZE 1168 +#endif +#if defined(__i386__) +#define KINFO_OVMENTRY_SIZE 1128 +#endif + +struct kinfo_ovmentry { + int kve_structsize; /* Size of kinfo_vmmapentry. */ + int kve_type; /* Type of map entry. */ + void *kve_start; /* Starting address. */ + void *kve_end; /* Finishing address. */ + int kve_flags; /* Flags on map entry. 
*/ + int kve_resident; /* Number of resident pages. */ + int kve_private_resident; /* Number of private pages. */ + int kve_protection; /* Protection bitmask. */ + int kve_ref_count; /* VM obj ref count. */ + int kve_shadow_count; /* VM obj shadow count. */ + char kve_path[PATH_MAX]; /* Path to VM obj, if any. */ + void *_kve_pspare[8]; /* Space for more stuff. */ + off_t kve_offset; /* Mapping offset in object */ + uint64_t kve_fileid; /* inode number if vnode */ + dev_t kve_fsid; /* dev_t of vnode location */ + int _kve_ispare[3]; /* Space for more stuff. */ +}; + +#if defined(__amd64__) || defined(__i386__) +#define KINFO_VMENTRY_SIZE 1160 +#endif + +struct kinfo_vmentry { + int kve_structsize; /* Variable size of record. */ + int kve_type; /* Type of map entry. */ + uint64_t kve_start; /* Starting address. */ + uint64_t kve_end; /* Finishing address. */ + uint64_t kve_offset; /* Mapping offset in object */ + uint64_t kve_fileid; /* inode number if vnode */ + uint32_t kve_fsid; /* dev_t of vnode location */ + int kve_flags; /* Flags on map entry. */ + int kve_resident; /* Number of resident pages. */ + int kve_private_resident; /* Number of private pages. */ + int kve_protection; /* Protection bitmask. */ + int kve_ref_count; /* VM obj ref count. */ + int kve_shadow_count; /* VM obj shadow count. */ + int _kve_pad0; /* 64bit align next field */ + int _kve_ispare[16]; /* Space for more stuff. */ + /* Truncated before copyout in sysctl */ + char kve_path[PATH_MAX]; /* Path to VM obj, if any. */ +}; + +/* + * The KERN_PROC_KSTACK sysctl allows a process to dump the kernel stacks of + * another process as a series of entries. Each stack is represented by a + * series of symbol names and offsets as generated by stack_sbuf_print(9). + */ +#define KKST_MAXLEN 1024 + +#define KKST_STATE_STACKOK 0 /* Stack is valid. */ +#define KKST_STATE_SWAPPED 1 /* Stack swapped out. */ +#define KKST_STATE_RUNNING 2 /* Stack ephemeral. 
*/ + +#if defined(__amd64__) || defined(__i386__) +#define KINFO_KSTACK_SIZE 1096 +#endif + +struct kinfo_kstack { + lwpid_t kkst_tid; /* ID of thread. */ + int kkst_state; /* Validity of stack. */ + char kkst_trace[KKST_MAXLEN]; /* String representing stack. */ + int _kkst_ispare[16]; /* Space for more stuff. */ +}; + +#endif diff --git a/rtemsbsd/src/rtems-bsd-condvar.c b/rtemsbsd/src/rtems-bsd-condvar.c index 7c16940d..63f30007 100644 --- a/rtemsbsd/src/rtems-bsd-condvar.c +++ b/rtemsbsd/src/rtems-bsd-condvar.c @@ -183,3 +183,20 @@ cv_broadcastpri(struct cv *cv, int pri) rv = pthread_cond_broadcast(&cv->cv_id); BSD_ASSERT_RV(rv); } +int +_cv_wait_sig(struct cv *cvp, struct lock_object *lock) +{ + /* XXX */ + _cv_wait_support(cvp, lock, 0, true); +} + +int +_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, int timo) +{ + /* XXX */ + if (timo <= 0) { + timo = 1; + } + + return _cv_wait_support(cvp, lock, timo, true); +} diff --git a/rtemsbsd/src/rtems-bsd-thread.c b/rtemsbsd/src/rtems-bsd-thread.c index fc414114..ca1ff1f2 100644 --- a/rtemsbsd/src/rtems-bsd-thread.c +++ b/rtemsbsd/src/rtems-bsd-thread.c @@ -49,6 +49,7 @@ #include <freebsd/sys/mutex.h> #include <freebsd/sys/jail.h> #include <freebsd/sys/resourcevar.h> +#include <freebsd/sys/filedesc.h> RTEMS_CHAIN_DEFINE_EMPTY(rtems_bsd_thread_chain); @@ -56,6 +57,9 @@ RTEMS_CHAIN_DEFINE_EMPTY(rtems_bsd_thread_chain); static struct ucred FIXME_ucred = { .cr_ref = 1 /* reference count */ }; +static struct filedesc FIXME_fd = { + .fd_ofiles = NULL /* file structures for open files */ +}; static struct proc FIXME_proc = { .p_ucred = NULL /* (c) Process owner's identity. 
*/ }; @@ -67,29 +71,34 @@ static struct prison FIXME_prison = { static struct uidinfo FIXME_uidinfo; /* per euid resource consumption */ static struct uidinfo FIXME_ruidinfo; /* per ruid resource consumption */ +static struct thread *rtems_bsd_current_td = NULL; + +static void rtems_bsd_thread_descriptor_dtor(void *td) +{ + // XXX are there other pieces to clean up? + free(td, M_TEMP); +} + static struct thread * -rtems_bsd_thread_init_note( rtems_id id ) +rtems_bsd_thread_init( rtems_id id ) { rtems_status_code sc = RTEMS_SUCCESSFUL; unsigned index = 0; char name [5] = "_???"; - struct thread *td = malloc(sizeof(struct thread), M_TEMP, M_WAITOK | M_ZERO); - struct proc *proc; - - if ( td == NULL ) - return td; + struct thread *td; + struct proc *proc; - sc = rtems_task_set_note( id, RTEMS_NOTEPAD_0, ( uint32_t )td ); - if (sc != RTEMS_SUCCESSFUL) { - free(td, M_TEMP); + td = malloc(sizeof(struct thread), M_TEMP, M_WAITOK | M_ZERO); + if (td == NULL) return NULL; - } + // Initialize the thread descriptor index = rtems_object_id_get_index(id); snprintf(name + 1, sizeof(name) - 1, "%03u", index); sc = rtems_object_set_name(id, name); if (sc != RTEMS_SUCCESSFUL) { - rtems_task_delete(id); + // XXX does the thread get deleted? 
Seems wrong + // rtems_task_delete(id); free(td, M_TEMP); return NULL; } @@ -98,55 +107,62 @@ rtems_bsd_thread_init_note( rtems_id id ) td->td_ucred = crhold(&FIXME_ucred); td->td_proc = &FIXME_proc; - if (td->td_proc->p_ucred != NULL) - return td; - - if (prison_init ) { - mtx_init(&FIXME_prison.pr_mtx, "prison lock", NULL, MTX_DEF | MTX_DUPOK); - - prison_init = 0; - } + if (td->td_proc->p_ucred == NULL) { + if ( prison_init ) { + mtx_init(&FIXME_prison.pr_mtx, "prison lock", NULL, MTX_DEF | MTX_DUPOK); + prison_init = 0; + } + FIXME_ucred.cr_prison = &FIXME_prison; /* jail(2) */ + FIXME_ucred.cr_uidinfo = uifind(0); + FIXME_ucred.cr_ruidinfo = uifind(0); + FIXME_ucred.cr_ngroups = 1; /* group 0 */ + + td->td_proc->p_ucred = crhold(&FIXME_ucred); + mtx_init(&td->td_proc->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); + td->td_proc->p_pid = getpid(); + td->td_proc->p_fibnum = 0; + td->td_proc->p_fd = &FIXME_fd; + sx_init_flags(&FIXME_fd.fd_sx, "config SX thread lock", SX_DUPOK); + } - FIXME_ucred.cr_prison = &FIXME_prison; /* jail(2) */ - FIXME_ucred.cr_uidinfo = uifind(0); - FIXME_ucred.cr_ruidinfo = uifind(0); - FIXME_ucred.cr_ngroups = 1; /* group 0 */ + // Actually set the global pointer + rtems_bsd_current_td = td; - td->td_proc->p_ucred = crhold(&FIXME_ucred); - mtx_init(&td->td_proc->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); - td->td_proc->p_pid = getpid(); - td->td_proc->p_fibnum = 0; + // Now add the task descriptor as a per-task variable + sc = rtems_task_variable_add( + id, + &rtems_bsd_current_td, + rtems_bsd_thread_descriptor_dtor + ); + if (sc != RTEMS_SUCCESSFUL) { + free(td, M_TEMP); + return NULL; + } - return td; + return td; } /* - * XXX Threads which delete themselves will leak this - * XXX Maybe better integrated into the TCB OR a task variable. - * XXX but this is OK for now + * Threads which delete themselves would leak the task + * descriptor so we are using the per-task variable so + * it can be cleaned up. 
*/ struct thread *rtems_get_curthread(void) { struct thread *td; - rtems_status_code sc; - rtems_id id; /* * If we already have a struct thread associated with this thread, - * obtain it + * obtain it. Otherwise, allocate and initialize one. */ - id = rtems_task_self(); - - sc = rtems_task_get_note( id, RTEMS_NOTEPAD_0, (uint32_t *) &td ); - if (sc != RTEMS_SUCCESSFUL) { - panic("rtems_get_curthread: get note Error\n"); + td = rtems_bsd_current_td; + if ( td == NULL ) { + td = rtems_bsd_thread_init( rtems_task_self() ); + if ( td == NULL ){ + panic("rtems_get_curthread: Unable to thread descriptor\n"); + } } - td = rtems_bsd_thread_init_note( id); - if ( td == NULL ){ - panic("rtems_get_curthread: Unable to generate thread note\n"); - } - return td; } @@ -163,6 +179,8 @@ rtems_bsd_thread_start(struct thread **td_ptr, void (*func)(void *), void *arg, BSD_ASSERT(pages >= 0); + memset( td, 0, sizeof(struct thread) ); + sc = rtems_task_create( rtems_build_name('_', 'T', 'S', 'K'), BSD_TASK_PRIORITY_NORMAL, @@ -177,8 +195,8 @@ rtems_bsd_thread_start(struct thread **td_ptr, void (*func)(void *), void *arg, return ENOMEM; } - td = rtems_bsd_thread_init_note( id ); - if (!td) + td = rtems_bsd_thread_init( id ); + if (!td) return ENOMEM; sc = rtems_task_start(id, (rtems_task_entry) func, (rtems_task_argument) arg); |