author     Sebastian Huber <sebastian.huber@embedded-brains.de>  2018-08-07 14:56:50 +0200
committer  Sebastian Huber <sebastian.huber@embedded-brains.de>  2018-09-21 10:29:37 +0200
commit     c37f9fba70085fedc8eede7559489d2321393005 (patch)
tree       042455ebf1fa89a277a825f72e1ed805d0b4d296 /freebsd/sys/kern
parent     Update to FreeBSD head 2017-06-01 (diff)
download   rtems-libbsd-c37f9fba70085fedc8eede7559489d2321393005.tar.bz2
Update to FreeBSD head 2017-08-01
Git mirror commit f5002f5e5f78cae9f0269d812dc0aedb0339312c. Update #3472.
Diffstat (limited to 'freebsd/sys/kern')
-rw-r--r--  freebsd/sys/kern/init_main.c      |   3
-rw-r--r--  freebsd/sys/kern/kern_event.c     | 185
-rw-r--r--  freebsd/sys/kern/kern_linker.c    |   8
-rw-r--r--  freebsd/sys/kern/kern_uuid.c      |   9
-rw-r--r--  freebsd/sys/kern/subr_blist.c     | 658
-rw-r--r--  freebsd/sys/kern/subr_prf.c       |  54
-rw-r--r--  freebsd/sys/kern/subr_sbuf.c      |   2
-rw-r--r--  freebsd/sys/kern/subr_taskqueue.c |  25
-rw-r--r--  freebsd/sys/kern/subr_uio.c       |  47
-rw-r--r--  freebsd/sys/kern/sys_socket.c     |  78
-rw-r--r--  freebsd/sys/kern/uipc_accf.c      | 111
-rw-r--r--  freebsd/sys/kern/uipc_mbuf.c      |   2
-rw-r--r--  freebsd/sys/kern/uipc_sockbuf.c   |  82
-rw-r--r--  freebsd/sys/kern/uipc_socket.c    | 939
-rw-r--r--  freebsd/sys/kern/uipc_syscalls.c  |  65
-rw-r--r--  freebsd/sys/kern/uipc_usrreq.c    | 199
16 files changed, 1460 insertions(+), 1007 deletions(-)
diff --git a/freebsd/sys/kern/init_main.c b/freebsd/sys/kern/init_main.c
index 467888b2..f211b363 100644
--- a/freebsd/sys/kern/init_main.c
+++ b/freebsd/sys/kern/init_main.c
@@ -384,8 +384,7 @@ SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2,
#endif
static int
-null_fetch_syscall_args(struct thread *td __unused,
- struct syscall_args *sa __unused)
+null_fetch_syscall_args(struct thread *td __unused)
{
panic("null_fetch_syscall_args");
diff --git a/freebsd/sys/kern/kern_event.c b/freebsd/sys/kern/kern_event.c
index 0a64adbe..2428182c 100644
--- a/freebsd/sys/kern/kern_event.c
+++ b/freebsd/sys/kern/kern_event.c
@@ -31,6 +31,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include <rtems/bsd/local/opt_compat.h>
#include <rtems/bsd/local/opt_ktrace.h>
#include <rtems/bsd/local/opt_kqueue.h>
@@ -119,6 +120,10 @@ static int kqueue_scan(struct kqueue *kq, int maxevents,
static void kqueue_wakeup(struct kqueue *kq);
static struct filterops *kqueue_fo_find(int filt);
static void kqueue_fo_release(int filt);
+struct g_kevent_args;
+static int kern_kevent_generic(struct thread *td,
+ struct g_kevent_args *uap,
+ struct kevent_copyops *k_ops);
#ifndef __rtems__
static fo_rdwr_t kqueue_read;
@@ -640,12 +645,13 @@ knote_fork(struct knlist *list, int pid)
* interval timer support code.
*/
-#define NOTE_TIMER_PRECMASK (NOTE_SECONDS|NOTE_MSECONDS|NOTE_USECONDS| \
- NOTE_NSECONDS)
+#define NOTE_TIMER_PRECMASK \
+ (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
static sbintime_t
timer2sbintime(intptr_t data, int flags)
{
+ int64_t secs;
/*
* Macros for converting to the fractional second portion of an
@@ -664,27 +670,27 @@ timer2sbintime(intptr_t data, int flags)
case NOTE_MSECONDS: /* FALLTHROUGH */
case 0:
if (data >= 1000) {
- int64_t secs = data / 1000;
+ secs = data / 1000;
#ifdef __LP64__
if (secs > (SBT_MAX / SBT_1S))
return (SBT_MAX);
#endif
return (secs << 32 | MS_TO_SBT(data % 1000));
}
- return MS_TO_SBT(data);
+ return (MS_TO_SBT(data));
case NOTE_USECONDS:
if (data >= 1000000) {
- int64_t secs = data / 1000000;
+ secs = data / 1000000;
#ifdef __LP64__
if (secs > (SBT_MAX / SBT_1S))
return (SBT_MAX);
#endif
return (secs << 32 | US_TO_SBT(data % 1000000));
}
- return US_TO_SBT(data);
+ return (US_TO_SBT(data));
case NOTE_NSECONDS:
if (data >= 1000000000) {
- int64_t secs = data / 1000000000;
+ secs = data / 1000000000;
#ifdef __LP64__
if (secs > (SBT_MAX / SBT_1S))
return (SBT_MAX);
@@ -701,7 +707,7 @@ timer2sbintime(intptr_t data, int flags)
struct kq_timer_cb_data {
struct callout c;
sbintime_t next; /* next timer event fires at */
- sbintime_t to; /* precalculated timer period */
+ sbintime_t to; /* precalculated timer period, 0 for abs */
};
static void
@@ -716,8 +722,9 @@ filt_timerexpire(void *knx)
if ((kn->kn_flags & EV_ONESHOT) != 0)
return;
-
kc = kn->kn_ptr.p_v;
+ if (kc->to == 0)
+ return;
kc->next += kc->to;
callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
PCPU_GET(cpuid), C_ABSOLUTE);
@@ -730,7 +737,8 @@ static int
filt_timerattach(struct knote *kn)
{
struct kq_timer_cb_data *kc;
- sbintime_t to;
+ struct bintime bt;
+ sbintime_t to, sbt;
unsigned int ncallouts;
if (kn->kn_sdata < 0)
@@ -738,10 +746,15 @@ filt_timerattach(struct knote *kn)
if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
kn->kn_sdata = 1;
/* Only precision unit are supported in flags so far */
- if ((kn->kn_sfflags & ~NOTE_TIMER_PRECMASK) != 0)
+ if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0)
return (EINVAL);
to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
+ if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
+ getboottimebin(&bt);
+ sbt = bttosbt(bt);
+ to -= sbt;
+ }
if (to < 0)
return (EINVAL);
@@ -751,12 +764,18 @@ filt_timerattach(struct knote *kn)
return (ENOMEM);
} while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1));
- kn->kn_flags |= EV_CLEAR; /* automatically set */
+ if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
+ kn->kn_flags |= EV_CLEAR; /* automatically set */
kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */
kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
callout_init(&kc->c, 1);
- kc->next = to + sbinuptime();
- kc->to = to;
+ if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
+ kc->next = to;
+ kc->to = 0;
+ } else {
+ kc->next = to + sbinuptime();
+ kc->to = to;
+ }
callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
PCPU_GET(cpuid), C_ABSOLUTE);
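
A minimal userspace sketch of the absolute-time timer enabled above (illustrative only; it assumes the documented kevent(2) interface with NOTE_SECONDS precision):

    #include <sys/event.h>
    #include <time.h>

    int kq = kqueue();
    struct kevent kev;

    /* Fire once when the wall clock reaches now + 5 seconds. */
    EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD,
        NOTE_ABSTIME | NOTE_SECONDS, time(NULL) + 5, NULL);
    kevent(kq, &kev, 1, NULL, 0, NULL);

    /* Blocks until the deadline; kc->to == 0 above marks the timer as
     * absolute, so filt_timerexpire() never re-arms it. */
    kevent(kq, NULL, 0, &kev, 1, NULL);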
@@ -970,25 +989,24 @@ kqueue(void)
#ifdef KTRACE
static size_t
-kev_iovlen(int n, u_int kgio)
+kev_iovlen(int n, u_int kgio, size_t kevent_size)
{
- if (n < 0 || n >= kgio / sizeof(struct kevent))
+ if (n < 0 || n >= kgio / kevent_size)
return (kgio);
- return (n * sizeof(struct kevent));
+ return (n * kevent_size);
}
#endif
-#ifndef _SYS_SYSPROTO_H_
-struct kevent_args {
+struct g_kevent_args {
int fd;
- const struct kevent *changelist;
+ void *changelist;
int nchanges;
- struct kevent *eventlist;
+ void *eventlist;
int nevents;
const struct timespec *timeout;
};
-#endif
+
#ifdef __rtems__
static int kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
struct kevent_copyops *k_ops, const struct timespec *timeout);
@@ -1001,12 +1019,29 @@ static
int
sys_kevent(struct thread *td, struct kevent_args *uap)
{
- struct timespec ts, *tsp;
struct kevent_copyops k_ops = {
.arg = uap,
.k_copyout = kevent_copyout,
.k_copyin = kevent_copyin,
+ .kevent_size = sizeof(struct kevent),
};
+ struct g_kevent_args gk_args = {
+ .fd = uap->fd,
+ .changelist = uap->changelist,
+ .nchanges = uap->nchanges,
+ .eventlist = uap->eventlist,
+ .nevents = uap->nevents,
+ .timeout = uap->timeout,
+ };
+
+ return (kern_kevent_generic(td, &gk_args, &k_ops));
+}
+
+static int
+kern_kevent_generic(struct thread *td, struct g_kevent_args *uap,
+ struct kevent_copyops *k_ops)
+{
+ struct timespec ts, *tsp;
int error;
#ifdef KTRACE
struct uio ktruio;
@@ -1028,26 +1063,30 @@ sys_kevent(struct thread *td, struct kevent_args *uap)
if (KTRPOINT(td, KTR_GENIO)) {
kgio = ktr_geniosize;
ktriov.iov_base = uap->changelist;
- ktriov.iov_len = kev_iovlen(uap->nchanges, kgio);
+ ktriov.iov_len = kev_iovlen(uap->nchanges, kgio,
+ k_ops->kevent_size);
ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
.uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
.uio_td = td };
ktruioin = cloneuio(&ktruio);
ktriov.iov_base = uap->eventlist;
- ktriov.iov_len = kev_iovlen(uap->nevents, kgio);
- ktriov.iov_len = uap->nevents * sizeof(struct kevent);
+ ktriov.iov_len = kev_iovlen(uap->nevents, kgio,
+ k_ops->kevent_size);
+ ktriov.iov_len = uap->nevents * k_ops->kevent_size;
ktruioout = cloneuio(&ktruio);
}
#endif
error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
- &k_ops, tsp);
+ k_ops, tsp);
#ifdef KTRACE
if (ktruioin != NULL) {
- ktruioin->uio_resid = kev_iovlen(uap->nchanges, kgio);
+ ktruioin->uio_resid = kev_iovlen(uap->nchanges, kgio,
+ k_ops->kevent_size);
ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
- ktruioout->uio_resid = kev_iovlen(td->td_retval[0], kgio);
+ ktruioout->uio_resid = kev_iovlen(td->td_retval[0], kgio,
+ k_ops->kevent_size);
ktrgenio(uap->fd, UIO_READ, ktruioout, error);
}
#endif
@@ -1123,6 +1162,94 @@ kevent_copyin(void *arg, struct kevent *kevp, int count)
return (error);
}
+#ifdef COMPAT_FREEBSD11
+struct kevent_freebsd11 {
+ __uintptr_t ident; /* identifier for this event */
+ short filter; /* filter for event */
+ unsigned short flags;
+ unsigned int fflags;
+ __intptr_t data;
+ void *udata; /* opaque user data identifier */
+};
+
+static int
+kevent11_copyout(void *arg, struct kevent *kevp, int count)
+{
+ struct freebsd11_kevent_args *uap;
+ struct kevent_freebsd11 kev11;
+ int error, i;
+
+ KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
+ uap = (struct freebsd11_kevent_args *)arg;
+
+ for (i = 0; i < count; i++) {
+ kev11.ident = kevp->ident;
+ kev11.filter = kevp->filter;
+ kev11.flags = kevp->flags;
+ kev11.fflags = kevp->fflags;
+ kev11.data = kevp->data;
+ kev11.udata = kevp->udata;
+ error = copyout(&kev11, uap->eventlist, sizeof(kev11));
+ if (error != 0)
+ break;
+ uap->eventlist++;
+ kevp++;
+ }
+ return (error);
+}
+
+/*
+ * Copy 'count' items from the list pointed to by uap->changelist.
+ */
+static int
+kevent11_copyin(void *arg, struct kevent *kevp, int count)
+{
+ struct freebsd11_kevent_args *uap;
+ struct kevent_freebsd11 kev11;
+ int error, i;
+
+ KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
+ uap = (struct freebsd11_kevent_args *)arg;
+
+ for (i = 0; i < count; i++) {
+ error = copyin(uap->changelist, &kev11, sizeof(kev11));
+ if (error != 0)
+ break;
+ kevp->ident = kev11.ident;
+ kevp->filter = kev11.filter;
+ kevp->flags = kev11.flags;
+ kevp->fflags = kev11.fflags;
+ kevp->data = (uintptr_t)kev11.data;
+ kevp->udata = kev11.udata;
+ bzero(&kevp->ext, sizeof(kevp->ext));
+ uap->changelist++;
+ kevp++;
+ }
+ return (error);
+}
+
+int
+freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap)
+{
+ struct kevent_copyops k_ops = {
+ .arg = uap,
+ .k_copyout = kevent11_copyout,
+ .k_copyin = kevent11_copyin,
+ .kevent_size = sizeof(struct kevent_freebsd11),
+ };
+ struct g_kevent_args gk_args = {
+ .fd = uap->fd,
+ .changelist = uap->changelist,
+ .nchanges = uap->nchanges,
+ .eventlist = uap->eventlist,
+ .nevents = uap->nevents,
+ .timeout = uap->timeout,
+ };
+
+ return (kern_kevent_generic(td, &gk_args, &k_ops));
+}
+#endif
+
int
kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
struct kevent_copyops *k_ops, const struct timespec *timeout)
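
The kevent_copyops indirection above is what lets one generic path serve several ABIs: kern_kevent_generic() never touches user memory itself, and kev_iovlen() now sizes KTRACE records from k_ops->kevent_size. A host-side sanity check of why that field is needed (a sketch, assuming FreeBSD 12 headers on a 64-bit target, where struct kevent gained the ext[] words):

    #include <sys/event.h>
    #include <stdint.h>
    #include <stdio.h>

    /* The 11.x record layout, reproduced from the compat shim above. */
    struct kevent_freebsd11 {
        uintptr_t ident;
        short filter;
        unsigned short flags;
        unsigned int fflags;
        intptr_t data;
        void *udata;
    };

    int
    main(void)
    {
        /* Typically prints "native 64, freebsd11 32": one fixed
         * sizeof(struct kevent) can no longer serve both ABIs. */
        printf("native %zu, freebsd11 %zu\n",
            sizeof(struct kevent), sizeof(struct kevent_freebsd11));
        return (0);
    }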
diff --git a/freebsd/sys/kern/kern_linker.c b/freebsd/sys/kern/kern_linker.c
index 214554d3..1c81a61c 100644
--- a/freebsd/sys/kern/kern_linker.c
+++ b/freebsd/sys/kern/kern_linker.c
@@ -1259,8 +1259,8 @@ kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat)
/* Version 1 fields: */
namelen = strlen(lf->filename) + 1;
- if (namelen > MAXPATHLEN)
- namelen = MAXPATHLEN;
+ if (namelen > sizeof(stat->name))
+ namelen = sizeof(stat->name);
bcopy(lf->filename, &stat->name[0], namelen);
stat->refs = lf->refs;
stat->id = lf->id;
@@ -1268,8 +1268,8 @@ kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat)
stat->size = lf->size;
/* Version 2 fields: */
namelen = strlen(lf->pathname) + 1;
- if (namelen > MAXPATHLEN)
- namelen = MAXPATHLEN;
+ if (namelen > sizeof(stat->pathname))
+ namelen = sizeof(stat->pathname);
bcopy(lf->pathname, &stat->pathname[0], namelen);
sx_xunlock(&kld_sx);
diff --git a/freebsd/sys/kern/kern_uuid.c b/freebsd/sys/kern/kern_uuid.c
index b6a8915f..1ac19685 100644
--- a/freebsd/sys/kern/kern_uuid.c
+++ b/freebsd/sys/kern/kern_uuid.c
@@ -60,7 +60,7 @@ CTASSERT(sizeof(struct uuid) == 16);
/* We use an alternative, more convenient representation in the generator. */
struct uuid_private {
union {
- uint64_t ll; /* internal. */
+ uint64_t ll; /* internal, for uuid_last only */
struct {
uint32_t low;
uint16_t mid;
@@ -428,3 +428,10 @@ parse_uuid(const char *str, struct uuid *uuid)
(c[3] & 0xc0) != 0x80 && /* variant 1? */
(c[3] & 0xe0) != 0xc0) ? EINVAL : 0); /* variant 2? */
}
+
+int
+uuidcmp(const struct uuid *uuid1, const struct uuid *uuid2)
+{
+
+ return (memcmp(uuid1, uuid2, sizeof(struct uuid)));
+}
diff --git a/freebsd/sys/kern/subr_blist.c b/freebsd/sys/kern/subr_blist.c
index 5af51dd4..c8e32c5b 100644
--- a/freebsd/sys/kern/subr_blist.c
+++ b/freebsd/sys/kern/subr_blist.c
@@ -30,18 +30,18 @@
* BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
*
* This module implements a general bitmap allocator/deallocator. The
- * allocator eats around 2 bits per 'block'. The module does not
- * try to interpret the meaning of a 'block' other than to return
+ * allocator eats around 2 bits per 'block'. The module does not
+ * try to interpret the meaning of a 'block' other than to return
* SWAPBLK_NONE on an allocation failure.
*
* A radix tree is used to maintain the bitmap. Two radix constants are
* involved: One for the bitmaps contained in the leaf nodes (typically
- * 32), and one for the meta nodes (typically 16). Both meta and leaf
+ * 64), and one for the meta nodes (typically 16). Both meta and leaf
* nodes have a hint field. This field gives us a hint as to the largest
* free contiguous range of blocks under the node. It may contain a
- * value that is too high, but will never contain a value that is too
+ * value that is too high, but will never contain a value that is too
* low. When the radix tree is searched, allocation failures in subtrees
- * update the hint.
+ * update the hint.
*
* The radix tree also implements two collapsed states for meta nodes:
* the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
@@ -51,7 +51,7 @@
*
* The hinting greatly increases code efficiency for allocations while
* the general radix structure optimizes both allocations and frees. The
- * radix tree should be able to operate well no matter how much
+ * radix tree should be able to operate well no matter how much
* fragmentation there is and no matter how large a bitmap is used.
*
* The blist code wires all necessary memory at creation time. Neither
@@ -63,18 +63,18 @@
* linear array. Each meta node is immediately followed (laid out
* sequentially in memory) by BLIST_META_RADIX lower level nodes. This
* is a recursive structure but one that can be easily scanned through
- * a very simple 'skip' calculation. In order to support large radixes,
- * portions of the tree may reside outside our memory allocation. We
- * handle this with an early-termination optimization (when bighint is
- * set to -1) on the scan. The memory allocation is only large enough
+ * a very simple 'skip' calculation. In order to support large radixes,
+ * portions of the tree may reside outside our memory allocation. We
+ * handle this with an early-termination optimization (when bighint is
+ * set to -1) on the scan. The memory allocation is only large enough
* to cover the number of blocks requested at creation time even if it
* must be encompassed in larger root-node radix.
*
- * NOTE: the allocator cannot currently allocate more than
- * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
- * large' if you try. This is an area that could use improvement. The
- * radix is large enough that this restriction does not effect the swap
- * system, though. Currently only the allocation code is effected by
+ * NOTE: the allocator cannot currently allocate more than
+ * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
+ * large' if you try. This is an area that could use improvement. The
+ * radix is large enough that this restriction does not effect the swap
+ * system, though. Currently only the allocation code is affected by
* this algorithmic unfeature. The freeing code can handle arbitrary
* ranges.
*
@@ -93,7 +93,7 @@ __FBSDID("$FreeBSD$");
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/proc.h>
-#include <sys/mutex.h>
+#include <sys/mutex.h>
#else
@@ -101,19 +101,18 @@ __FBSDID("$FreeBSD$");
#define BLIST_DEBUG
#endif
-#define SWAPBLK_NONE ((daddr_t)-1)
-
#include <sys/types.h>
+#include <sys/malloc.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
+#include <stdbool.h>
+#define bitcount64(x) __bitcount64((uint64_t)(x))
#define malloc(a,b,c) calloc(a, 1)
#define free(a,b) free(a)
-typedef unsigned int u_daddr_t;
-
#include <sys/blist.h>
void panic(const char *ctl, ...);
@@ -123,23 +122,23 @@ void panic(const char *ctl, ...);
/*
* static support functions
*/
-
-static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
-static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk,
- daddr_t count, daddr_t radix, int skip);
+static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count,
+ daddr_t cursor);
+static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count,
+ daddr_t radix, daddr_t skip, daddr_t cursor);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
-static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
- daddr_t radix, int skip, daddr_t blk);
-static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
- daddr_t skip, blist_t dest, daddr_t count);
-static int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
-static int blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
- daddr_t radix, int skip, daddr_t blk);
-static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix,
- int skip, daddr_t count);
+static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
+ daddr_t radix, daddr_t skip, daddr_t blk);
+static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
+ daddr_t skip, blist_t dest, daddr_t count);
+static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
+static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
+ daddr_t radix, daddr_t skip, daddr_t blk);
+static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip,
+ daddr_t count);
#ifndef _KERNEL
-static void blst_radix_print(blmeta_t *scan, daddr_t blk,
- daddr_t radix, int skip, int tab);
+static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
+ daddr_t skip, int tab);
#endif
#ifdef _KERNEL
@@ -153,35 +152,40 @@ static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
* blocks - must be greater than 0
* flags - malloc flags
*
- * The smallest blist consists of a single leaf node capable of
+ * The smallest blist consists of a single leaf node capable of
* managing BLIST_BMAP_RADIX blocks.
*/
-
-blist_t
+blist_t
blist_create(daddr_t blocks, int flags)
{
blist_t bl;
- int radix;
- int skip = 0;
+ daddr_t nodes, radix, skip;
/*
* Calculate radix and skip field used for scanning.
*/
radix = BLIST_BMAP_RADIX;
-
+ skip = 0;
while (radix < blocks) {
radix *= BLIST_META_RADIX;
skip = (skip + 1) * BLIST_META_RADIX;
}
+ nodes = 1 + blst_radix_init(NULL, radix, skip, blocks);
- bl = malloc(sizeof(struct blist), M_SWAP, flags | M_ZERO);
+ bl = malloc(sizeof(struct blist), M_SWAP, flags);
+ if (bl == NULL)
+ return (NULL);
bl->bl_blocks = blocks;
bl->bl_radix = radix;
bl->bl_skip = skip;
- bl->bl_rootblks = 1 +
- blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
- bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, flags);
+ bl->bl_cursor = 0;
+ bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
+ if (bl->bl_root == NULL) {
+ free(bl, M_SWAP);
+ return (NULL);
+ }
+ blst_radix_init(bl->bl_root, radix, skip, blocks);
#if defined(BLIST_DEBUG)
printf(
@@ -189,17 +193,16 @@ blist_create(daddr_t blocks, int flags)
", requiring %lldK of ram\n",
(long long)bl->bl_blocks,
(long long)bl->bl_blocks * 4 / 1024,
- (long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
+ (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
);
printf("BLIST raw radix tree contains %lld records\n",
- (long long)bl->bl_rootblks);
+ (long long)nodes);
#endif
- blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
- return(bl);
+ return (bl);
}
-void
+void
blist_destroy(blist_t bl)
{
free(bl->bl_root, M_SWAP);
@@ -207,25 +210,44 @@ blist_destroy(blist_t bl)
}
/*
- * blist_alloc() - reserve space in the block bitmap. Return the base
+ * blist_alloc() - reserve space in the block bitmap. Return the base
* of a contiguous region or SWAPBLK_NONE if space could
* not be allocated.
*/
-
-daddr_t
+daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
- daddr_t blk = SWAPBLK_NONE;
+ daddr_t blk;
- if (bl) {
- if (bl->bl_radix == BLIST_BMAP_RADIX)
- blk = blst_leaf_alloc(bl->bl_root, 0, count);
- else
- blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip);
- if (blk != SWAPBLK_NONE)
- bl->bl_free -= count;
+ /*
+ * This loop iterates at most twice. An allocation failure in the
+ * first iteration leads to a second iteration only if the cursor was
+ * non-zero. When the cursor is zero, an allocation failure will
+ * reduce the hint, stopping further iterations.
+ */
+ while (count <= bl->bl_root->bm_bighint) {
+ blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix,
+ bl->bl_skip, bl->bl_cursor);
+ if (blk != SWAPBLK_NONE) {
+ bl->bl_cursor = blk + count;
+ return (blk);
+ } else if (bl->bl_cursor != 0)
+ bl->bl_cursor = 0;
}
- return(blk);
+ return (SWAPBLK_NONE);
+}
+
+/*
+ * blist_avail() - return the number of free blocks.
+ */
+daddr_t
+blist_avail(blist_t bl)
+{
+
+ if (bl->bl_radix == BLIST_BMAP_RADIX)
+ return (bitcount64(bl->bl_root->u.bmu_bitmap));
+ else
+ return (bl->bl_root->u.bmu_avail);
}
/*
@@ -233,17 +255,11 @@ blist_alloc(blist_t bl, daddr_t count)
* of a contiguous region. Panic if an inconsistancy is
* found.
*/
-
-void
+void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{
- if (bl) {
- if (bl->bl_radix == BLIST_BMAP_RADIX)
- blst_leaf_free(bl->bl_root, blkno, count);
- else
- blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
- bl->bl_free += count;
- }
+
+ blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
}
/*
@@ -252,22 +268,12 @@ blist_free(blist_t bl, daddr_t blkno, daddr_t count)
* existing allocations. Return the number of blocks
* actually filled that were free before the call.
*/
-
-int
+daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
- int filled;
- if (bl) {
- if (bl->bl_radix == BLIST_BMAP_RADIX)
- filled = blst_leaf_fill(bl->bl_root, blkno, count);
- else
- filled = blst_meta_fill(bl->bl_root, blkno, count,
- bl->bl_radix, bl->bl_skip, 0);
- bl->bl_free -= filled;
- return filled;
- } else
- return 0;
+ return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix,
+ bl->bl_skip, 0));
}
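
With the radix check moved down into the blst_meta_*() helpers, the wrappers above no longer special-case a single-leaf tree, allocation is cursor-based first-fit, and blist_create() can fail cleanly under M_NOWAIT. A usage sketch (illustrative, kernel context):

    blist_t bl;
    daddr_t blk;

    bl = blist_create(1024, M_WAITOK); /* may return NULL under M_NOWAIT */
    blist_free(bl, 0, 1024);           /* a new tree starts all-allocated */
    blk = blist_alloc(bl, 16);         /* first fit at/after bl_cursor */
    if (blk != SWAPBLK_NONE)
        blist_free(bl, blk, 16);
    printf("free blocks: %jd\n", (intmax_t)blist_avail(bl));
    blist_destroy(bl);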
/*
@@ -277,7 +283,6 @@ blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
* one. When extending the tree you can specify whether
* the new blocks are to left allocated or freed.
*/
-
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
@@ -303,7 +308,6 @@ blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
/*
* blist_print() - dump radix tree
*/
-
void
blist_print(blist_t bl)
{
@@ -318,7 +322,7 @@ blist_print(blist_t bl)
* ALLOCATION SUPPORT FUNCTIONS *
************************************************************************
*
- * These support functions do all the actual work. They may seem
+ * These support functions do all the actual work. They may seem
* rather longish, but that's because I've commented them up. The
* actual code is straight forward.
*
@@ -327,77 +331,91 @@ blist_print(blist_t bl)
/*
* blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap).
*
- * This is the core of the allocator and is optimized for the 1 block
- * and the BLIST_BMAP_RADIX block allocation cases. Other cases are
- * somewhat slower. The 1 block allocation case is log2 and extremely
- * quick.
+ * This is the core of the allocator and is optimized for the
+ * BLIST_BMAP_RADIX block allocation case. Otherwise, execution
+ * time is proportional to log2(count) + log2(BLIST_BMAP_RADIX).
*/
-
static daddr_t
-blst_leaf_alloc(
- blmeta_t *scan,
- daddr_t blk,
- int count
-) {
- u_daddr_t orig = scan->u.bmu_bitmap;
-
- if (orig == 0) {
+blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, daddr_t cursor)
+{
+ u_daddr_t mask;
+ int count1, hi, lo, mid, num_shifts, range1, range_ext;
+
+ if (count == BLIST_BMAP_RADIX) {
/*
- * Optimize bitmap all-allocated case. Also, count = 1
- * case assumes at least 1 bit is free in the bitmap, so
- * we have to take care of this case here.
+ * Optimize allocation of BLIST_BMAP_RADIX bits. If this wasn't
+ * a special case, then forming the final value of 'mask' below
+ * would require special handling to avoid an invalid left shift
+ * when count equals the number of bits in mask.
*/
+ if (~scan->u.bmu_bitmap != 0) {
+ scan->bm_bighint = BLIST_BMAP_RADIX - 1;
+ return (SWAPBLK_NONE);
+ }
+ if (cursor != blk)
+ return (SWAPBLK_NONE);
+ scan->u.bmu_bitmap = 0;
scan->bm_bighint = 0;
- return(SWAPBLK_NONE);
+ return (blk);
}
- if (count == 1) {
+ range1 = 0;
+ count1 = count - 1;
+ num_shifts = fls(count1);
+ mask = scan->u.bmu_bitmap;
+ while (mask != 0 && num_shifts > 0) {
/*
- * Optimized code to allocate one bit out of the bitmap
+ * If bit i is set in mask, then bits in [i, i+range1] are set
+ * in scan->u.bmu_bitmap. The value of range1 is equal to
+ * count1 >> num_shifts. Grow range and reduce num_shifts to 0,
+ * while preserving these invariants. The updates to mask leave
+ * fewer bits set, but each bit that remains set represents a
+ * longer string of consecutive bits set in scan->u.bmu_bitmap.
*/
- u_daddr_t mask;
- int j = BLIST_BMAP_RADIX/2;
- int r = 0;
-
- mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2);
-
- while (j) {
- if ((orig & mask) == 0) {
- r += j;
- orig >>= j;
- }
- j >>= 1;
- mask >>= j;
- }
- scan->u.bmu_bitmap &= ~(1 << r);
- return(blk + r);
+ num_shifts--;
+ range_ext = range1 + ((count1 >> num_shifts) & 1);
+ mask &= mask >> range_ext;
+ range1 += range_ext;
}
- if (count <= BLIST_BMAP_RADIX) {
+ if (mask == 0) {
/*
- * non-optimized code to allocate N bits out of the bitmap.
- * The more bits, the faster the code runs. It will run
- * the slowest allocating 2 bits, but since there aren't any
- * memory ops in the core loop (or shouldn't be, anyway),
- * you probably won't notice the difference.
+ * Update bighint. There is no allocation bigger than range1
+ * available in this leaf.
*/
- int j;
- int n = BLIST_BMAP_RADIX - count;
- u_daddr_t mask;
+ scan->bm_bighint = range1;
+ return (SWAPBLK_NONE);
+ }
- mask = (u_daddr_t)-1 >> n;
+ /*
+ * Discard any candidates that appear before the cursor.
+ */
+ lo = cursor - blk;
+ mask &= ~(u_daddr_t)0 << lo;
- for (j = 0; j <= n; ++j) {
- if ((orig & mask) == mask) {
- scan->u.bmu_bitmap &= ~mask;
- return(blk + j);
- }
- mask = (mask << 1);
- }
+ if (mask == 0)
+ return (SWAPBLK_NONE);
+
+ /*
+ * The least significant set bit in mask marks the start of the first
+ * available range of sufficient size. Clear all the bits but that one,
+ * and then perform a binary search to find its position.
+ */
+ mask &= -mask;
+ hi = BLIST_BMAP_RADIX - count1;
+ while (lo + 1 < hi) {
+ mid = (lo + hi) >> 1;
+ if ((mask >> mid) != 0)
+ lo = mid;
+ else
+ hi = mid;
}
+
/*
- * We couldn't allocate count in this subtree, update bighint.
+ * Set in mask exactly the bits being allocated, and clear them from
+ * the set of available bits.
*/
- scan->bm_bighint = count - 1;
- return(SWAPBLK_NONE);
+ mask = (mask << count) - mask;
+ scan->u.bmu_bitmap &= ~mask;
+ return (blk + lo);
}
/*
@@ -408,76 +426,75 @@ blst_leaf_alloc(
* calls that hit this node. We have to check for our collapse cases
* and we have a few optimizations strewn in as well.
*/
-
static daddr_t
-blst_meta_alloc(
- blmeta_t *scan,
- daddr_t blk,
- daddr_t count,
- daddr_t radix,
- int skip
-) {
- int i;
- int next_skip = ((u_int)skip / BLIST_META_RADIX);
+blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix,
+ daddr_t skip, daddr_t cursor)
+{
+ daddr_t i, next_skip, r;
+ int child;
+ bool scan_from_start;
- if (scan->u.bmu_avail == 0) {
+ if (radix == BLIST_BMAP_RADIX)
+ return (blst_leaf_alloc(scan, blk, count, cursor));
+ if (scan->u.bmu_avail < count) {
/*
- * ALL-ALLOCATED special case
+ * The meta node's hint must be too large if the allocation
+ * exceeds the number of free blocks. Reduce the hint, and
+ * return failure.
*/
- scan->bm_bighint = count;
- return(SWAPBLK_NONE);
+ scan->bm_bighint = scan->u.bmu_avail;
+ return (SWAPBLK_NONE);
}
+ next_skip = skip / BLIST_META_RADIX;
+ /*
+ * An ALL-FREE meta node requires special handling before allocating
+ * any of its blocks.
+ */
if (scan->u.bmu_avail == radix) {
radix /= BLIST_META_RADIX;
/*
- * ALL-FREE special case, initialize uninitialize
- * sublevel.
+ * Reinitialize each of the meta node's children. An ALL-FREE
+ * meta node cannot have a terminator in any subtree.
*/
for (i = 1; i <= skip; i += next_skip) {
- if (scan[i].bm_bighint == (daddr_t)-1)
- break;
- if (next_skip == 1) {
+ if (next_skip == 1)
scan[i].u.bmu_bitmap = (u_daddr_t)-1;
- scan[i].bm_bighint = BLIST_BMAP_RADIX;
- } else {
- scan[i].bm_bighint = radix;
+ else
scan[i].u.bmu_avail = radix;
- }
+ scan[i].bm_bighint = radix;
}
} else {
radix /= BLIST_META_RADIX;
}
- for (i = 1; i <= skip; i += next_skip) {
+ if (count > radix) {
+ /*
+ * The allocation exceeds the number of blocks that are
+ * managed by a subtree of this meta node.
+ */
+ panic("allocation too large");
+ }
+ scan_from_start = cursor == blk;
+ child = (cursor - blk) / radix;
+ blk += child * radix;
+ for (i = 1 + child * next_skip; i <= skip; i += next_skip) {
if (count <= scan[i].bm_bighint) {
/*
- * count fits in object
+ * The allocation might fit in the i'th subtree.
*/
- daddr_t r;
- if (next_skip == 1) {
- r = blst_leaf_alloc(&scan[i], blk, count);
- } else {
- r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1);
- }
+ r = blst_meta_alloc(&scan[i], blk, count, radix,
+ next_skip - 1, cursor > blk ? cursor : blk);
if (r != SWAPBLK_NONE) {
scan->u.bmu_avail -= count;
- if (scan->bm_bighint > scan->u.bmu_avail)
- scan->bm_bighint = scan->u.bmu_avail;
- return(r);
+ return (r);
}
} else if (scan[i].bm_bighint == (daddr_t)-1) {
/*
* Terminator
*/
break;
- } else if (count > radix) {
- /*
- * count does not fit in object even if it were
- * complete free.
- */
- panic("blist_meta_alloc: allocation too large");
}
blk += radix;
}
@@ -485,22 +502,19 @@ blst_meta_alloc(
/*
* We couldn't allocate count in this subtree, update bighint.
*/
- if (scan->bm_bighint >= count)
+ if (scan_from_start && scan->bm_bighint >= count)
scan->bm_bighint = count - 1;
- return(SWAPBLK_NONE);
+
+ return (SWAPBLK_NONE);
}
/*
* BLST_LEAF_FREE() - free allocated block from leaf bitmap
*
*/
-
static void
-blst_leaf_free(
- blmeta_t *scan,
- daddr_t blk,
- int count
-) {
+blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
+{
/*
* free some data in this bitmap
*
@@ -521,7 +535,7 @@ blst_leaf_free(
/*
* We could probably do a better job here. We are required to make
- * bighint at least as large as the biggest contiguous block of
+ * bighint at least as large as the biggest contiguous block of
* data. If we just shoehorn it, a little extra overhead will
* be incured on the next allocation (but only that one typically).
*/
@@ -538,25 +552,18 @@ blst_leaf_free(
* range whereas the allocation code cannot allocate an arbitrary
* range).
*/
+static void
+blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, daddr_t radix,
+ daddr_t skip, daddr_t blk)
+{
+ daddr_t i, next_skip, v;
+ int child;
-static void
-blst_meta_free(
- blmeta_t *scan,
- daddr_t freeBlk,
- daddr_t count,
- daddr_t radix,
- int skip,
- daddr_t blk
-) {
- int i;
- int next_skip = ((u_int)skip / BLIST_META_RADIX);
-
-#if 0
- printf("free (%llx,%lld) FROM (%llx,%lld)\n",
- (long long)freeBlk, (long long)count,
- (long long)blk, (long long)radix
- );
-#endif
+ if (scan->bm_bighint == (daddr_t)-1)
+ panic("freeing invalid range");
+ if (radix == BLIST_BMAP_RADIX)
+ return (blst_leaf_free(scan, freeBlk, count));
+ next_skip = skip / BLIST_META_RADIX;
if (scan->u.bmu_avail == 0) {
/*
@@ -601,27 +608,16 @@ blst_meta_free(
radix /= BLIST_META_RADIX;
- i = (freeBlk - blk) / radix;
- blk += i * radix;
- i = i * next_skip + 1;
-
+ child = (freeBlk - blk) / radix;
+ blk += child * radix;
+ i = 1 + child * next_skip;
while (i <= skip && blk < freeBlk + count) {
- daddr_t v;
-
v = blk + radix - freeBlk;
if (v > count)
v = count;
-
- if (scan->bm_bighint == (daddr_t)-1)
- panic("blst_meta_free: freeing unexpected range");
-
- if (next_skip == 1) {
- blst_leaf_free(&scan[i], freeBlk, v);
- } else {
- blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
- }
+ blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
if (scan->bm_bighint < scan[i].bm_bighint)
- scan->bm_bighint = scan[i].bm_bighint;
+ scan->bm_bighint = scan[i].bm_bighint;
count -= v;
freeBlk += v;
blk += radix;
@@ -635,17 +631,11 @@ blst_meta_free(
* Locates free space in the source tree and frees it in the destination
* tree. The space may not already be free in the destination.
*/
-
-static void blst_copy(
- blmeta_t *scan,
- daddr_t blk,
- daddr_t radix,
- daddr_t skip,
- blist_t dest,
- daddr_t count
-) {
- int next_skip;
- int i;
+static void
+blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
+ blist_t dest, daddr_t count)
+{
+ daddr_t i, next_skip;
/*
* Leaf node
@@ -660,7 +650,7 @@ static void blst_copy(
int i;
for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
- if (v & (1 << i))
+ if (v & ((u_daddr_t)1 << i))
blist_free(dest, blk + i, 1);
}
}
@@ -676,7 +666,7 @@ static void blst_copy(
* Source all allocated, leave dest allocated
*/
return;
- }
+ }
if (scan->u.bmu_avail == radix) {
/*
* Source all free, free entire dest
@@ -690,32 +680,20 @@ static void blst_copy(
radix /= BLIST_META_RADIX;
- next_skip = ((u_int)skip / BLIST_META_RADIX);
+ next_skip = skip / BLIST_META_RADIX;
for (i = 1; count && i <= skip; i += next_skip) {
if (scan[i].bm_bighint == (daddr_t)-1)
break;
if (count >= radix) {
- blst_copy(
- &scan[i],
- blk,
- radix,
- next_skip - 1,
- dest,
- radix
- );
+ blst_copy(&scan[i], blk, radix, next_skip - 1, dest,
+ radix);
count -= radix;
} else {
if (count) {
- blst_copy(
- &scan[i],
- blk,
- radix,
- next_skip - 1,
- dest,
- count
- );
+ blst_copy(&scan[i], blk, radix, next_skip - 1,
+ dest, count);
}
count = 0;
}
@@ -730,24 +708,21 @@ static void blst_copy(
* regardless of any existing allocations in that range. Returns
* the number of blocks allocated by the call.
*/
-
-static int
+static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
int n = blk & (BLIST_BMAP_RADIX - 1);
- int nblks;
- u_daddr_t mask, bitmap;
+ daddr_t nblks;
+ u_daddr_t mask;
mask = ((u_daddr_t)-1 << n) &
((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
- /* Count the number of blocks we're about to allocate */
- bitmap = scan->u.bmu_bitmap & mask;
- for (nblks = 0; bitmap != 0; nblks++)
- bitmap &= bitmap - 1;
+ /* Count the number of blocks that we are allocating. */
+ nblks = bitcount64(scan->u.bmu_bitmap & mask);
scan->u.bmu_bitmap &= ~mask;
- return nblks;
+ return (nblks);
}
/*
@@ -758,80 +733,74 @@ blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
* range must be within the extent of this node. Returns the
* number of blocks allocated by the call.
*/
-static int
-blst_meta_fill(
- blmeta_t *scan,
- daddr_t allocBlk,
- daddr_t count,
- daddr_t radix,
- int skip,
- daddr_t blk
-) {
- int i;
- int next_skip = ((u_int)skip / BLIST_META_RADIX);
- int nblks = 0;
+static daddr_t
+blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, daddr_t radix,
+ daddr_t skip, daddr_t blk)
+{
+ daddr_t i, nblks, next_skip, v;
+ int child;
+ if (scan->bm_bighint == (daddr_t)-1)
+ panic("filling invalid range");
+ if (count > radix) {
+ /*
+ * The allocation exceeds the number of blocks that are
+ * managed by this node.
+ */
+ panic("fill too large");
+ }
+ if (radix == BLIST_BMAP_RADIX)
+ return (blst_leaf_fill(scan, allocBlk, count));
if (count == radix || scan->u.bmu_avail == 0) {
/*
* ALL-ALLOCATED special case
*/
nblks = scan->u.bmu_avail;
scan->u.bmu_avail = 0;
- scan->bm_bighint = count;
- return nblks;
+ scan->bm_bighint = 0;
+ return (nblks);
}
+ next_skip = skip / BLIST_META_RADIX;
+ /*
+ * An ALL-FREE meta node requires special handling before allocating
+ * any of its blocks.
+ */
if (scan->u.bmu_avail == radix) {
radix /= BLIST_META_RADIX;
/*
- * ALL-FREE special case, initialize sublevel
+ * Reinitialize each of the meta node's children. An ALL-FREE
+ * meta node cannot have a terminator in any subtree.
*/
for (i = 1; i <= skip; i += next_skip) {
- if (scan[i].bm_bighint == (daddr_t)-1)
- break;
- if (next_skip == 1) {
+ if (next_skip == 1)
scan[i].u.bmu_bitmap = (u_daddr_t)-1;
- scan[i].bm_bighint = BLIST_BMAP_RADIX;
- } else {
- scan[i].bm_bighint = radix;
+ else
scan[i].u.bmu_avail = radix;
- }
+ scan[i].bm_bighint = radix;
}
} else {
radix /= BLIST_META_RADIX;
}
- if (count > radix)
- panic("blist_meta_fill: allocation too large");
-
- i = (allocBlk - blk) / radix;
- blk += i * radix;
- i = i * next_skip + 1;
-
+ nblks = 0;
+ child = (allocBlk - blk) / radix;
+ blk += child * radix;
+ i = 1 + child * next_skip;
while (i <= skip && blk < allocBlk + count) {
- daddr_t v;
-
v = blk + radix - allocBlk;
if (v > count)
v = count;
-
- if (scan->bm_bighint == (daddr_t)-1)
- panic("blst_meta_fill: filling unexpected range");
-
- if (next_skip == 1) {
- nblks += blst_leaf_fill(&scan[i], allocBlk, v);
- } else {
- nblks += blst_meta_fill(&scan[i], allocBlk, v,
- radix, next_skip - 1, blk);
- }
+ nblks += blst_meta_fill(&scan[i], allocBlk, v, radix,
+ next_skip - 1, blk);
count -= v;
allocBlk += v;
blk += radix;
i += next_skip;
}
scan->u.bmu_avail -= nblks;
- return nblks;
+ return (nblks);
}
/*
@@ -842,13 +811,12 @@ blst_meta_fill(
* be considerably less than the calculated radix due to the large
* RADIX values we use.
*/
-
-static daddr_t
-blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
+static daddr_t
+blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip, daddr_t count)
{
- int i;
- int next_skip;
- daddr_t memindex = 0;
+ daddr_t i, memindex, next_skip;
+
+ memindex = 0;
/*
* Leaf node
@@ -859,7 +827,7 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
scan->bm_bighint = 0;
scan->u.bmu_bitmap = 0;
}
- return(memindex);
+ return (memindex);
}
/*
@@ -874,30 +842,24 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
}
radix /= BLIST_META_RADIX;
- next_skip = ((u_int)skip / BLIST_META_RADIX);
+ next_skip = skip / BLIST_META_RADIX;
for (i = 1; i <= skip; i += next_skip) {
if (count >= radix) {
/*
* Allocate the entire object
*/
- memindex = i + blst_radix_init(
- ((scan) ? &scan[i] : NULL),
- radix,
- next_skip - 1,
- radix
- );
+ memindex = i +
+ blst_radix_init(((scan) ? &scan[i] : NULL), radix,
+ next_skip - 1, radix);
count -= radix;
} else if (count > 0) {
/*
* Allocate a partial object
*/
- memindex = i + blst_radix_init(
- ((scan) ? &scan[i] : NULL),
- radix,
- next_skip - 1,
- count
- );
+ memindex = i +
+ blst_radix_init(((scan) ? &scan[i] : NULL), radix,
+ next_skip - 1, count);
count = 0;
} else {
/*
@@ -910,21 +872,20 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
}
if (memindex < i)
memindex = i;
- return(memindex);
+ return (memindex);
}
#ifdef BLIST_DEBUG
-static void
-blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
+static void
+blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
+ int tab)
{
- int i;
- int next_skip;
- int lastState = 0;
+ daddr_t i, next_skip;
if (radix == BLIST_BMAP_RADIX) {
printf(
- "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n",
+ "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
tab, tab, "",
(long long)blk, (long long)radix,
(long long)scan->u.bmu_bitmap,
@@ -962,7 +923,7 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
);
radix /= BLIST_META_RADIX;
- next_skip = ((u_int)skip / BLIST_META_RADIX);
+ next_skip = skip / BLIST_META_RADIX;
tab += 4;
for (i = 1; i <= skip; i += next_skip) {
@@ -972,16 +933,9 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
tab, tab, "",
(long long)blk, (long long)radix
);
- lastState = 0;
break;
}
- blst_radix_print(
- &scan[i],
- blk,
- radix,
- next_skip - 1,
- tab
- );
+ blst_radix_print(&scan[i], blk, radix, next_skip - 1, tab);
blk += radix;
}
tab -= 4;
@@ -1018,11 +972,10 @@ main(int ac, char **av)
for (;;) {
char buf[1024];
- daddr_t da = 0;
- daddr_t count = 0;
-
+ long long da = 0;
+ long long count = 0;
- printf("%lld/%lld/%lld> ", (long long)bl->bl_free,
+ printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
(long long)size, (long long)bl->bl_radix);
fflush(stdout);
if (fgets(buf, sizeof(buf), stdin) == NULL)
@@ -1030,7 +983,7 @@ main(int ac, char **av)
switch(buf[0]) {
case 'r':
if (sscanf(buf + 1, "%lld", &count) == 1) {
- blist_resize(&bl, count, 1);
+ blist_resize(&bl, count, 1, M_WAITOK);
} else {
printf("?\n");
}
@@ -1046,18 +999,16 @@ main(int ac, char **av)
}
break;
case 'f':
- if (sscanf(buf + 1, "%llx %lld",
- (long long *)&da, (long long *)&count) == 2) {
+ if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
blist_free(bl, da, count);
} else {
printf("?\n");
}
break;
case 'l':
- if (sscanf(buf + 1, "%llx %lld",
- (long long *)&da, (long long *)&count) == 2) {
- printf(" n=%d\n",
- blist_fill(bl, da, count));
+ if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
+ printf(" n=%jd\n",
+ (intmax_t)blist_fill(bl, da, count));
} else {
printf("?\n");
}
@@ -1094,4 +1045,3 @@ panic(const char *ctl, ...)
}
#endif
-
diff --git a/freebsd/sys/kern/subr_prf.c b/freebsd/sys/kern/subr_prf.c
index 39f5826d..0380cfec 100644
--- a/freebsd/sys/kern/subr_prf.c
+++ b/freebsd/sys/kern/subr_prf.c
@@ -411,7 +411,6 @@ log_console(struct uio *uio)
msgbuftrigger = 1;
free(uio, M_IOV);
free(consbuffer, M_TEMP);
- return;
}
#endif /* __rtems__ */
@@ -678,7 +677,7 @@ kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_lis
uintmax_t num;
int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
int cflag, hflag, jflag, tflag, zflag;
- int dwidth, upper;
+ int bconv, dwidth, upper;
char padc;
int stop = 0, retval = 0;
@@ -704,7 +703,7 @@ kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_lis
}
percent = fmt - 1;
qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0;
- sign = 0; dot = 0; dwidth = 0; upper = 0;
+ sign = 0; dot = 0; bconv = 0; dwidth = 0; upper = 0;
cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0;
reswitch: switch (ch = (u_char)*fmt++) {
case '.':
@@ -752,28 +751,9 @@ reswitch: switch (ch = (u_char)*fmt++) {
width = n;
goto reswitch;
case 'b':
- num = (u_int)va_arg(ap, int);
- p = va_arg(ap, char *);
- for (q = ksprintn(nbuf, num, *p++, NULL, 0); *q;)
- PCHAR(*q--);
-
- if (num == 0)
- break;
-
- for (tmp = 0; *p;) {
- n = *p++;
- if (num & (1 << (n - 1))) {
- PCHAR(tmp ? ',' : '<');
- for (; (n = *p) > ' '; ++p)
- PCHAR(n);
- tmp = 1;
- } else
- for (; *p > ' '; ++p)
- continue;
- }
- if (tmp)
- PCHAR('>');
- break;
+ ladjust = 1;
+ bconv = 1;
+ goto handle_nosign;
case 'c':
width -= 1;
@@ -919,6 +899,10 @@ handle_nosign:
num = (u_char)va_arg(ap, int);
else
num = va_arg(ap, u_int);
+ if (bconv) {
+ q = va_arg(ap, char *);
+ base = *q++;
+ }
goto number;
handle_sign:
if (jflag)
@@ -976,6 +960,26 @@ number:
while (*p)
PCHAR(*p--);
+ if (bconv && num != 0) {
+ /* %b conversion flag format. */
+ tmp = retval;
+ while (*q) {
+ n = *q++;
+ if (num & (1 << (n - 1))) {
+ PCHAR(retval != tmp ?
+ ',' : '<');
+ for (; (n = *q) > ' '; ++q)
+ PCHAR(n);
+ } else
+ for (; *q > ' '; ++q)
+ continue;
+ }
+ if (retval != tmp) {
+ PCHAR('>');
+ width -= retval - tmp;
+ }
+ }
+
if (ladjust)
while (width-- > 0)
PCHAR(' ');
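
With the rework above, %b is parsed as an ordinary unsigned conversion (so field width and left-adjustment now behave normally) and the bit names are decoded after the number is printed. Usage is unchanged (printf(9) semantics; the first byte of the bit string is the numeric base):

    /* Prints "state=3<READ,WRITE>": value 3 in base 8 (\10), followed
     * by the names of bits 1 and 2, which are both set. */
    printf("state=%b\n", 3, "\10\1READ\2WRITE");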
diff --git a/freebsd/sys/kern/subr_sbuf.c b/freebsd/sys/kern/subr_sbuf.c
index 680613b1..8dd11b07 100644
--- a/freebsd/sys/kern/subr_sbuf.c
+++ b/freebsd/sys/kern/subr_sbuf.c
@@ -106,7 +106,7 @@ _assert_sbuf_integrity(const char *fun, struct sbuf *s)
("%s called with a NULL sbuf pointer", fun));
KASSERT(s->s_buf != NULL,
("%s called with uninitialized or corrupt sbuf", fun));
- if (SBUF_ISFINISHED(s) && SBUF_NULINCLUDED(s)) {
+ if (SBUF_ISFINISHED(s) && SBUF_NULINCLUDED(s)) {
KASSERT(s->s_len <= s->s_size,
("wrote past end of sbuf (%jd >= %jd)",
(intmax_t)s->s_len, (intmax_t)s->s_size));
diff --git a/freebsd/sys/kern/subr_taskqueue.c b/freebsd/sys/kern/subr_taskqueue.c
index 6f1ba19a..74b9cf59 100644
--- a/freebsd/sys/kern/subr_taskqueue.c
+++ b/freebsd/sys/kern/subr_taskqueue.c
@@ -316,8 +316,8 @@ taskqueue_timeout_func(void *arg)
}
int
-taskqueue_enqueue_timeout(struct taskqueue *queue,
- struct timeout_task *timeout_task, int ticks)
+taskqueue_enqueue_timeout_sbt(struct taskqueue *queue,
+ struct timeout_task *timeout_task, sbintime_t sbt, sbintime_t pr, int flags)
{
int res;
@@ -333,7 +333,7 @@ taskqueue_enqueue_timeout(struct taskqueue *queue,
/* Do nothing */
TQ_UNLOCK(queue);
res = -1;
- } else if (ticks == 0) {
+ } else if (sbt == 0) {
taskqueue_enqueue_locked(queue, &timeout_task->t);
/* The lock is released inside. */
} else {
@@ -342,18 +342,27 @@ taskqueue_enqueue_timeout(struct taskqueue *queue,
} else {
queue->tq_callouts++;
timeout_task->f |= DT_CALLOUT_ARMED;
- if (ticks < 0)
- ticks = -ticks; /* Ignore overflow. */
+ if (sbt < 0)
+ sbt = -sbt; /* Ignore overflow. */
}
- if (ticks > 0) {
- callout_reset(&timeout_task->c, ticks,
- taskqueue_timeout_func, timeout_task);
+ if (sbt > 0) {
+ callout_reset_sbt(&timeout_task->c, sbt, pr,
+ taskqueue_timeout_func, timeout_task, flags);
}
TQ_UNLOCK(queue);
}
return (res);
}
+int
+taskqueue_enqueue_timeout(struct taskqueue *queue,
+ struct timeout_task *ttask, int ticks)
+{
+
+ return (taskqueue_enqueue_timeout_sbt(queue, ttask, ticks * tick_sbt,
+ 0, 0));
+}
+
static void
taskqueue_task_nop_fn(void *context, int pending)
{
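
The old tick-based taskqueue_enqueue_timeout() is now a thin wrapper over the sbintime_t variant, which also takes a precision and callout flags. A hedged usage sketch (my_timeout_task is hypothetical; SBT_1MS and the flags come from callout(9)):

    /* my_timeout_task: previously set up with TIMEOUT_TASK_INIT(). Run
     * it ~250 ms from now, allowing 1 ms of slop so the callout layer
     * may coalesce it with neighboring timeouts. */
    taskqueue_enqueue_timeout_sbt(taskqueue_thread, &my_timeout_task,
        250 * SBT_1MS, SBT_1MS, 0);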
diff --git a/freebsd/sys/kern/subr_uio.c b/freebsd/sys/kern/subr_uio.c
index 5740e667..904ef1f4 100644
--- a/freebsd/sys/kern/subr_uio.c
+++ b/freebsd/sys/kern/subr_uio.c
@@ -212,41 +212,37 @@ uiomove_nofault(void *cp, int n, struct uio *uio)
static int
uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
{
-#ifndef __rtems__
- struct thread *td;
-#endif /* __rtems__ */
struct iovec *iov;
size_t cnt;
- int error, newflags, save;
-
#ifndef __rtems__
- td = curthread;
+ int error, newflags, save;
+#else /* __rtems__ */
+ int error;
#endif /* __rtems__ */
+
error = 0;
+#ifndef __rtems__
KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
("uiomove: mode"));
-#ifndef __rtems__
- KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td,
+ KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
("uiomove proc"));
-#endif /* __rtems__ */
- if (!nofault)
- WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
- "Calling uiomove()");
-#ifndef __rtems__
- /* XXX does it make a sense to set TDP_DEADLKTREAT for UIO_SYSSPACE ? */
- newflags = TDP_DEADLKTREAT;
- if (uio->uio_segflg == UIO_USERSPACE && nofault) {
- /*
- * Fail if a non-spurious page fault occurs.
- */
- newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ newflags = TDP_DEADLKTREAT;
+ if (nofault) {
+ /*
+ * Fail if a non-spurious page fault occurs.
+ */
+ newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
+ } else {
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "Calling uiomove()");
+ }
+ save = curthread_pflags_set(newflags);
+ } else {
+ KASSERT(nofault == 0, ("uiomove: nofault"));
}
- save = curthread_pflags_set(newflags);
-#else /* __rtems__ */
- (void) newflags;
- (void) save;
#endif /* __rtems__ */
while (n > 0 && uio->uio_resid) {
@@ -292,7 +288,8 @@ uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
}
out:
#ifndef __rtems__
- curthread_pflags_restore(save);
+ if (uio->uio_segflg == UIO_USERSPACE)
+ curthread_pflags_restore(save);
#endif /* __rtems__ */
return (error);
}
diff --git a/freebsd/sys/kern/sys_socket.c b/freebsd/sys/kern/sys_socket.c
index 8d87c51b..9dd458f1 100644
--- a/freebsd/sys/kern/sys_socket.c
+++ b/freebsd/sys/kern/sys_socket.c
@@ -318,32 +318,36 @@ soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
break;
case FIOASYNC:
- /*
- * XXXRW: This code separately acquires SOCK_LOCK(so) and
- * SOCKBUF_LOCK(&so->so_rcv) even though they are the same
- * mutex to avoid introducing the assumption that they are
- * the same.
- */
if (*(int *)data) {
SOCK_LOCK(so);
so->so_state |= SS_ASYNC;
+ if (SOLISTENING(so)) {
+ so->sol_sbrcv_flags |= SB_ASYNC;
+ so->sol_sbsnd_flags |= SB_ASYNC;
+ } else {
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_flags |= SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_flags |= SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ }
SOCK_UNLOCK(so);
- SOCKBUF_LOCK(&so->so_rcv);
- so->so_rcv.sb_flags |= SB_ASYNC;
- SOCKBUF_UNLOCK(&so->so_rcv);
- SOCKBUF_LOCK(&so->so_snd);
- so->so_snd.sb_flags |= SB_ASYNC;
- SOCKBUF_UNLOCK(&so->so_snd);
} else {
SOCK_LOCK(so);
so->so_state &= ~SS_ASYNC;
+ if (SOLISTENING(so)) {
+ so->sol_sbrcv_flags &= ~SB_ASYNC;
+ so->sol_sbsnd_flags &= ~SB_ASYNC;
+ } else {
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_flags &= ~SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_flags &= ~SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ }
SOCK_UNLOCK(so);
- SOCKBUF_LOCK(&so->so_rcv);
- so->so_rcv.sb_flags &= ~SB_ASYNC;
- SOCKBUF_UNLOCK(&so->so_rcv);
- SOCKBUF_LOCK(&so->so_snd);
- so->so_snd.sb_flags &= ~SB_ASYNC;
- SOCKBUF_UNLOCK(&so->so_snd);
}
break;
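
What drives this path from userspace is unchanged; only the kernel side now distinguishes listening sockets, whose buffer flags live in sol_sbrcv_flags/sol_sbsnd_flags until a connection is spun off. A sketch (standard fcntl(2)/ioctl(2)):

    int on = 1;

    fcntl(s, F_SETOWN, getpid()); /* route SIGIO to this process */
    ioctl(s, FIOASYNC, &on);      /* sets SS_ASYNC and SB_ASYNC above */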
@@ -477,7 +481,6 @@ static int
soo_stat(struct socket *so, struct stat *ub)
{
#endif /* __rtems__ */
- struct sockbuf *sb;
#ifdef MAC
int error;
#endif
@@ -491,22 +494,26 @@ soo_stat(struct socket *so, struct stat *ub)
if (error)
return (error);
#endif
- /*
- * If SBS_CANTRCVMORE is set, but there's still data left in the
- * receive buffer, the socket is still readable.
- */
- sb = &so->so_rcv;
- SOCKBUF_LOCK(sb);
- if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb))
- ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
- ub->st_size = sbavail(sb) - sb->sb_ctl;
- SOCKBUF_UNLOCK(sb);
+ if (!SOLISTENING(so)) {
+ struct sockbuf *sb;
- sb = &so->so_snd;
- SOCKBUF_LOCK(sb);
- if ((sb->sb_state & SBS_CANTSENDMORE) == 0)
- ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
- SOCKBUF_UNLOCK(sb);
+ /*
+ * If SBS_CANTRCVMORE is set, but there's still data left
+ * in the receive buffer, the socket is still readable.
+ */
+ sb = &so->so_rcv;
+ SOCKBUF_LOCK(sb);
+ if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb))
+ ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
+ ub->st_size = sbavail(sb) - sb->sb_ctl;
+ SOCKBUF_UNLOCK(sb);
+
+ sb = &so->so_snd;
+ SOCKBUF_LOCK(sb);
+ if ((sb->sb_state & SBS_CANTSENDMORE) == 0)
+ ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
+ SOCKBUF_UNLOCK(sb);
+ }
#ifndef __rtems__
ub->st_uid = so->so_cred->cr_uid;
ub->st_gid = so->so_cred->cr_gid;
@@ -916,6 +923,7 @@ soaio_process_sb(struct socket *so, struct sockbuf *sb)
{
struct kaiocb *job;
+ CURVNET_SET(so->so_vnet);
SOCKBUF_LOCK(sb);
while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) {
job = TAILQ_FIRST(&sb->sb_aiojobq);
@@ -936,9 +944,9 @@ soaio_process_sb(struct socket *so, struct sockbuf *sb)
sb->sb_flags &= ~SB_AIO_RUNNING;
SOCKBUF_UNLOCK(sb);
- ACCEPT_LOCK();
SOCK_LOCK(so);
sorele(so);
+ CURVNET_RESTORE();
}
void
diff --git a/freebsd/sys/kern/uipc_accf.c b/freebsd/sys/kern/uipc_accf.c
index a766adf8..8a0e14e3 100644
--- a/freebsd/sys/kern/uipc_accf.c
+++ b/freebsd/sys/kern/uipc_accf.c
@@ -132,8 +132,7 @@ accept_filt_generic_mod_event(module_t mod, int event, void *data)
switch (event) {
case MOD_LOAD:
- p = malloc(sizeof(*p), M_ACCF,
- M_WAITOK);
+ p = malloc(sizeof(*p), M_ACCF, M_WAITOK);
bcopy(accfp, p, sizeof(*p));
error = accept_filt_add(p);
break;
@@ -164,26 +163,25 @@ accept_filt_generic_mod_event(module_t mod, int event, void *data)
}
int
-do_getopt_accept_filter(struct socket *so, struct sockopt *sopt)
+accept_filt_getopt(struct socket *so, struct sockopt *sopt)
{
struct accept_filter_arg *afap;
int error;
error = 0;
- afap = malloc(sizeof(*afap), M_TEMP,
- M_WAITOK | M_ZERO);
+ afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO);
SOCK_LOCK(so);
if ((so->so_options & SO_ACCEPTCONN) == 0) {
error = EINVAL;
goto out;
}
- if ((so->so_options & SO_ACCEPTFILTER) == 0) {
+ if (so->sol_accept_filter == NULL) {
error = EINVAL;
goto out;
}
- strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
- if (so->so_accf->so_accept_filter_str != NULL)
- strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
+ strcpy(afap->af_name, so->sol_accept_filter->accf_name);
+ if (so->sol_accept_filter_str != NULL)
+ strcpy(afap->af_arg, so->sol_accept_filter_str);
out:
SOCK_UNLOCK(so);
if (error == 0)
@@ -193,35 +191,61 @@ out:
}
int
-do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
+accept_filt_setopt(struct socket *so, struct sockopt *sopt)
{
struct accept_filter_arg *afap;
struct accept_filter *afp;
- struct so_accf *newaf;
- int error = 0;
+ char *accept_filter_str = NULL;
+ void *accept_filter_arg = NULL;
+ int error;
/*
* Handle the simple delete case first.
*/
if (sopt == NULL || sopt->sopt_val == NULL) {
+ struct socket *sp, *sp1;
+ int wakeup;
+
SOCK_LOCK(so);
if ((so->so_options & SO_ACCEPTCONN) == 0) {
SOCK_UNLOCK(so);
return (EINVAL);
}
- if (so->so_accf != NULL) {
- struct so_accf *af = so->so_accf;
- if (af->so_accept_filter != NULL &&
- af->so_accept_filter->accf_destroy != NULL) {
- af->so_accept_filter->accf_destroy(so);
- }
- if (af->so_accept_filter_str != NULL)
- free(af->so_accept_filter_str, M_ACCF);
- free(af, M_ACCF);
- so->so_accf = NULL;
+ if (so->sol_accept_filter == NULL) {
+ SOCK_UNLOCK(so);
+ return (0);
}
+ if (so->sol_accept_filter->accf_destroy != NULL)
+ so->sol_accept_filter->accf_destroy(so);
+ if (so->sol_accept_filter_str != NULL)
+ free(so->sol_accept_filter_str, M_ACCF);
+ so->sol_accept_filter = NULL;
+ so->sol_accept_filter_arg = NULL;
+ so->sol_accept_filter_str = NULL;
so->so_options &= ~SO_ACCEPTFILTER;
- SOCK_UNLOCK(so);
+
+ /*
+ * Move from incomplete queue to complete only those
+ * connections, that are blocked by us.
+ */
+ wakeup = 0;
+ TAILQ_FOREACH_SAFE(sp, &so->sol_incomp, so_list, sp1) {
+ SOCK_LOCK(sp);
+ if (sp->so_options & SO_ACCEPTFILTER) {
+ TAILQ_REMOVE(&so->sol_incomp, sp, so_list);
+ TAILQ_INSERT_TAIL(&so->sol_comp, sp, so_list);
+ sp->so_qstate = SQ_COMP;
+ sp->so_options &= ~SO_ACCEPTFILTER;
+ so->sol_incqlen--;
+ so->sol_qlen++;
+ wakeup = 1;
+ }
+ SOCK_UNLOCK(sp);
+ }
+ if (wakeup)
+ solisten_wakeup(so); /* unlocks */
+ else
+ SOLISTEN_UNLOCK(so);
return (0);
}
@@ -229,8 +253,7 @@ do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
* Pre-allocate any memory we may need later to avoid blocking at
* untimely moments. This does not optimize for invalid arguments.
*/
- afap = malloc(sizeof(*afap), M_TEMP,
- M_WAITOK);
+ afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK);
error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
afap->af_name[sizeof(afap->af_name)-1] = '\0';
afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
@@ -243,19 +266,10 @@ do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
free(afap, M_TEMP);
return (ENOENT);
}
- /*
- * Allocate the new accept filter instance storage. We may
- * have to free it again later if we fail to attach it. If
- * attached properly, 'newaf' is NULLed to avoid a free()
- * while in use.
- */
- newaf = malloc(sizeof(*newaf), M_ACCF, M_WAITOK |
- M_ZERO);
if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
size_t len = strlen(afap->af_name) + 1;
- newaf->so_accept_filter_str = malloc(len, M_ACCF,
- M_WAITOK);
- strcpy(newaf->so_accept_filter_str, afap->af_name);
+ accept_filter_str = malloc(len, M_ACCF, M_WAITOK);
+ strcpy(accept_filter_str, afap->af_name);
}
/*
@@ -263,8 +277,8 @@ do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
* without first removing it.
*/
SOCK_LOCK(so);
- if (((so->so_options & SO_ACCEPTCONN) == 0) ||
- (so->so_accf != NULL)) {
+ if ((so->so_options & SO_ACCEPTCONN) == 0 ||
+ so->sol_accept_filter != NULL) {
error = EINVAL;
goto out;
}
@@ -275,25 +289,20 @@ do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
* can't block.
*/
if (afp->accf_create != NULL) {
- newaf->so_accept_filter_arg =
- afp->accf_create(so, afap->af_arg);
- if (newaf->so_accept_filter_arg == NULL) {
+ accept_filter_arg = afp->accf_create(so, afap->af_arg);
+ if (accept_filter_arg == NULL) {
error = EINVAL;
goto out;
}
}
- newaf->so_accept_filter = afp;
- so->so_accf = newaf;
+ so->sol_accept_filter = afp;
+ so->sol_accept_filter_arg = accept_filter_arg;
+ so->sol_accept_filter_str = accept_filter_str;
so->so_options |= SO_ACCEPTFILTER;
- newaf = NULL;
out:
SOCK_UNLOCK(so);
- if (newaf != NULL) {
- if (newaf->so_accept_filter_str != NULL)
- free(newaf->so_accept_filter_str, M_ACCF);
- free(newaf, M_ACCF);
- }
- if (afap != NULL)
- free(afap, M_TEMP);
+ if (accept_filter_str != NULL)
+ free(accept_filter_str, M_ACCF);
+ free(afap, M_TEMP);
return (error);
}
diff --git a/freebsd/sys/kern/uipc_mbuf.c b/freebsd/sys/kern/uipc_mbuf.c
index ba8a2d48..abc30dd3 100644
--- a/freebsd/sys/kern/uipc_mbuf.c
+++ b/freebsd/sys/kern/uipc_mbuf.c
@@ -1519,7 +1519,7 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
* the total data supplied by the uio.
*/
if (len > 0)
- total = min(uio->uio_resid, len);
+ total = (uio->uio_resid < len) ? uio->uio_resid : len;
else
total = uio->uio_resid;
diff --git a/freebsd/sys/kern/uipc_sockbuf.c b/freebsd/sys/kern/uipc_sockbuf.c
index 04193c29..4b710a2c 100644
--- a/freebsd/sys/kern/uipc_sockbuf.c
+++ b/freebsd/sys/kern/uipc_sockbuf.c
@@ -316,14 +316,14 @@ sowakeup(struct socket *so, struct sockbuf *sb)
SOCKBUF_LOCK_ASSERT(sb);
- selwakeuppri(&sb->sb_sel, PSOCK);
- if (!SEL_WAITING(&sb->sb_sel))
+ selwakeuppri(sb->sb_sel, PSOCK);
+ if (!SEL_WAITING(sb->sb_sel))
sb->sb_flags &= ~SB_SEL;
if (sb->sb_flags & SB_WAIT) {
sb->sb_flags &= ~SB_WAIT;
wakeup(&sb->sb_acc);
}
- KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
+ KNOTE_LOCKED(&sb->sb_sel->si_note, 0);
if (sb->sb_upcall != NULL) {
ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT);
if (ret == SU_ISCONNECTED) {
@@ -336,7 +336,7 @@ sowakeup(struct socket *so, struct sockbuf *sb)
if (sb->sb_flags & SB_AIO)
sowakeup_aio(so, sb);
SOCKBUF_UNLOCK(sb);
- if (ret == SU_ISCONNECTED)
+ if (ret == SU_ISCONNECTED && !(so->so_state & SS_ISDISCONNECTED))
soisconnected(so);
if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
pgsigio(&so->so_sigio, SIGIO, 0);
@@ -457,14 +457,78 @@ sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
}
int
-sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
- struct thread *td)
+sbsetopt(struct socket *so, int cmd, u_long cc)
{
+ struct sockbuf *sb;
+ short *flags;
+ u_int *hiwat, *lowat;
int error;
- SOCKBUF_LOCK(sb);
- error = sbreserve_locked(sb, cc, so, td);
- SOCKBUF_UNLOCK(sb);
+ SOCK_LOCK(so);
+ if (SOLISTENING(so)) {
+ switch (cmd) {
+ case SO_SNDLOWAT:
+ case SO_SNDBUF:
+ lowat = &so->sol_sbsnd_lowat;
+ hiwat = &so->sol_sbsnd_hiwat;
+ flags = &so->sol_sbsnd_flags;
+ break;
+ case SO_RCVLOWAT:
+ case SO_RCVBUF:
+ lowat = &so->sol_sbrcv_lowat;
+ hiwat = &so->sol_sbrcv_hiwat;
+ flags = &so->sol_sbrcv_flags;
+ break;
+ }
+ } else {
+ switch (cmd) {
+ case SO_SNDLOWAT:
+ case SO_SNDBUF:
+ sb = &so->so_snd;
+ break;
+ case SO_RCVLOWAT:
+ case SO_RCVBUF:
+ sb = &so->so_rcv;
+ break;
+ }
+ flags = &sb->sb_flags;
+ hiwat = &sb->sb_hiwat;
+ lowat = &sb->sb_lowat;
+ SOCKBUF_LOCK(sb);
+ }
+
+ error = 0;
+ switch (cmd) {
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ if (SOLISTENING(so)) {
+ if (cc > sb_max_adj) {
+ error = ENOBUFS;
+ break;
+ }
+ *hiwat = cc;
+ if (*lowat > *hiwat)
+ *lowat = *hiwat;
+ } else {
+ if (!sbreserve_locked(sb, cc, so, curthread))
+ error = ENOBUFS;
+ }
+ if (error == 0)
+ *flags &= ~SB_AUTOSIZE;
+ break;
+ case SO_SNDLOWAT:
+ case SO_RCVLOWAT:
+ /*
+ * Make sure the low-water is never greater than the
+ * high-water.
+ */
+ *lowat = (cc > *hiwat) ? *hiwat : cc;
+ break;
+ }
+
+ if (!SOLISTENING(so))
+ SOCKBUF_UNLOCK(sb);
+ SOCK_UNLOCK(so);
return (error);
}
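
For context, a hedged userspace sketch of the option traffic that now
funnels into sbsetopt(): SO_SNDBUF/SO_RCVBUF set the buffer high-water
mark, SO_SNDLOWAT/SO_RCVLOWAT the low-water mark, which the code above
clamps to the high-water. The sizes below are placeholders, not tuned
values.

#include <sys/socket.h>

static int
tune_socket_buffers(int s)
{
	int sndbuf = 256 * 1024;	/* placeholder size */
	int lowat = 4096;		/* placeholder low-water mark */

	if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &sndbuf,
	    sizeof(sndbuf)) != 0)
		return (-1);
	/* The kernel clamps this to the high-water mark if larger. */
	return (setsockopt(s, SOL_SOCKET, SO_SNDLOWAT, &lowat,
	    sizeof(lowat)));
}
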
diff --git a/freebsd/sys/kern/uipc_socket.c b/freebsd/sys/kern/uipc_socket.c
index c52a543c..1773606d 100644
--- a/freebsd/sys/kern/uipc_socket.c
+++ b/freebsd/sys/kern/uipc_socket.c
@@ -108,6 +108,7 @@ __FBSDID("$FreeBSD$");
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_compat.h>
+#include <rtems/bsd/local/opt_sctp.h>
#include <sys/param.h>
#include <sys/systm.h>
@@ -160,13 +161,21 @@ __FBSDID("$FreeBSD$");
static int soreceive_rcvoob(struct socket *so, struct uio *uio,
int flags);
+static void so_rdknl_lock(void *);
+static void so_rdknl_unlock(void *);
+static void so_rdknl_assert_locked(void *);
+static void so_rdknl_assert_unlocked(void *);
+static void so_wrknl_lock(void *);
+static void so_wrknl_unlock(void *);
+static void so_wrknl_assert_locked(void *);
+static void so_wrknl_assert_unlocked(void *);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
-static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
static int filt_soempty(struct knote *kn, long hint);
+static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
#ifdef __rtems__
static
#endif /* __rtems__ */
@@ -412,8 +421,16 @@ soalloc(struct vnet *vnet)
return (NULL);
}
+ /*
+	 * The socket locking protocol allows locking two sockets at a
+	 * time; however, the first one must be a listening socket.
+	 * WITNESS lacks a feature to change the class of an existing
+	 * lock, so we use DUPOK.
+ */
+ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
+ so->so_rcv.sb_sel = &so->so_rdsel;
+ so->so_snd.sb_sel = &so->so_wrsel;
sx_init(&so->so_snd.sb_sx, "so_snd_sx");
sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
#ifndef __rtems__
@@ -465,15 +482,6 @@ sodealloc(struct socket *so)
so->so_vnet->vnet_sockcnt--;
#endif
mtx_unlock(&so_global_mtx);
- if (so->so_rcv.sb_hiwat)
- (void)chgsbsize(so->so_cred->cr_uidinfo,
- &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
- if (so->so_snd.sb_hiwat)
- (void)chgsbsize(so->so_cred->cr_uidinfo,
- &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
- /* remove accept filter if one is present. */
- if (so->so_accf != NULL)
- do_setopt_accept_filter(so, NULL);
#ifdef MAC
mac_socket_destroy(so);
#endif
@@ -481,10 +489,22 @@ sodealloc(struct socket *so)
crfree(so->so_cred);
khelp_destroy_osd(&so->osd);
- sx_destroy(&so->so_snd.sb_sx);
- sx_destroy(&so->so_rcv.sb_sx);
- SOCKBUF_LOCK_DESTROY(&so->so_snd);
- SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+ if (SOLISTENING(so)) {
+ if (so->sol_accept_filter != NULL)
+ accept_filt_setopt(so, NULL);
+ } else {
+ if (so->so_rcv.sb_hiwat)
+ (void)chgsbsize(so->so_cred->cr_uidinfo,
+ &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
+ if (so->so_snd.sb_hiwat)
+ (void)chgsbsize(so->so_cred->cr_uidinfo,
+ &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
+ sx_destroy(&so->so_snd.sb_sx);
+ sx_destroy(&so->so_rcv.sb_sx);
+ SOCKBUF_LOCK_DESTROY(&so->so_snd);
+ SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+ }
+ mtx_destroy(&so->so_lock);
uma_zfree(socket_zone, so);
}
@@ -527,8 +547,6 @@ socreate(int dom, struct socket **aso, int type, int proto,
if (so == NULL)
return (ENOBUFS);
- TAILQ_INIT(&so->so_incomp);
- TAILQ_INIT(&so->so_comp);
so->so_type = type;
so->so_cred = crhold(cred);
if ((prp->pr_domain->dom_family == PF_INET) ||
@@ -545,9 +563,10 @@ socreate(int dom, struct socket **aso, int type, int proto,
#ifdef MAC
mac_socket_create(cred, so);
#endif
- knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
- knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
- so->so_count = 1;
+ knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
+ so_rdknl_assert_locked, so_rdknl_assert_unlocked);
+ knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
+ so_wrknl_assert_locked, so_wrknl_assert_unlocked);
/*
* Auto-sizing of socket buffers is managed by the protocols and
* the appropriate flags must be set in the pru_attach function.
@@ -556,12 +575,10 @@ socreate(int dom, struct socket **aso, int type, int proto,
error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
CURVNET_RESTORE();
if (error) {
- KASSERT(so->so_count == 1, ("socreate: so_count %d",
- so->so_count));
- so->so_count = 0;
sodealloc(so);
return (error);
}
+ soref(so);
*aso = so;
return (0);
}
@@ -589,11 +606,11 @@ sonewconn(struct socket *head, int connstatus)
static int overcount;
struct socket *so;
- int over;
+ u_int over;
- ACCEPT_LOCK();
- over = (head->so_qlen > 3 * head->so_qlimit / 2);
- ACCEPT_UNLOCK();
+ SOLISTEN_LOCK(head);
+ over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
+ SOLISTEN_UNLOCK(head);
#ifdef REGRESSION
if (regression_sonewconn_earlytest && over) {
#else
@@ -605,15 +622,15 @@ sonewconn(struct socket *head, int connstatus)
log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
"%i already in queue awaiting acceptance "
"(%d occurrences)\n",
- __func__, head->so_pcb, head->so_qlen, overcount);
+ __func__, head->so_pcb, head->sol_qlen, overcount);
overcount = 0;
}
return (NULL);
}
- VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
- __func__, __LINE__, head));
+ VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
+ __func__, head));
so = soalloc(head->so_vnet);
if (so == NULL) {
log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
@@ -621,11 +638,8 @@ sonewconn(struct socket *head, int connstatus)
__func__, head->so_pcb);
return (NULL);
}
- if ((head->so_options & SO_ACCEPTFILTER) != 0)
- connstatus = 0;
- so->so_head = head;
+ so->so_listen = head;
so->so_type = head->so_type;
- so->so_options = head->so_options &~ SO_ACCEPTCONN;
so->so_linger = head->so_linger;
so->so_state = head->so_state | SS_NOFDREF;
so->so_fibnum = head->so_fibnum;
@@ -634,10 +648,12 @@ sonewconn(struct socket *head, int connstatus)
#ifdef MAC
mac_socket_newconn(head, so);
#endif
- knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
- knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
+ knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
+ so_rdknl_assert_locked, so_rdknl_assert_unlocked);
+ knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
+ so_wrknl_assert_locked, so_wrknl_assert_unlocked);
VNET_SO_ASSERT(head);
- if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
+ if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
sodealloc(so);
log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
__func__, head->so_pcb);
@@ -649,32 +665,24 @@ sonewconn(struct socket *head, int connstatus)
__func__, head->so_pcb);
return (NULL);
}
- so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
- so->so_snd.sb_lowat = head->so_snd.sb_lowat;
- so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
- so->so_snd.sb_timeo = head->so_snd.sb_timeo;
- so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
- so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
+ so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
+ so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
+ so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
+ so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
+ so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE;
+ so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE;
+
+ SOLISTEN_LOCK(head);
+ if (head->sol_accept_filter != NULL)
+ connstatus = 0;
so->so_state |= connstatus;
- ACCEPT_LOCK();
- /*
- * The accept socket may be tearing down but we just
- * won a race on the ACCEPT_LOCK.
- * However, if sctp_peeloff() is called on a 1-to-many
- * style socket, the SO_ACCEPTCONN doesn't need to be set.
- */
- if (!(head->so_options & SO_ACCEPTCONN) &&
- ((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
- (head->so_type != SOCK_SEQPACKET))) {
- SOCK_LOCK(so);
- so->so_head = NULL;
- sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */
- return (NULL);
- }
+ so->so_options = head->so_options & ~SO_ACCEPTCONN;
+ soref(head); /* A socket on (in)complete queue refs head. */
if (connstatus) {
- TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
- so->so_qstate |= SQ_COMP;
- head->so_qlen++;
+ TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
+ so->so_qstate = SQ_COMP;
+ head->sol_qlen++;
+ solisten_wakeup(head); /* unlocks */
} else {
/*
* Keep removing sockets from the head until there's room for
@@ -683,28 +691,86 @@ sonewconn(struct socket *head, int connstatus)
* threads and soabort() requires dropping locks, we must
* loop waiting for the condition to be true.
*/
- while (head->so_incqlen > head->so_qlimit) {
+ while (head->sol_incqlen > head->sol_qlimit) {
struct socket *sp;
- sp = TAILQ_FIRST(&head->so_incomp);
- TAILQ_REMOVE(&head->so_incomp, sp, so_list);
- head->so_incqlen--;
- sp->so_qstate &= ~SQ_INCOMP;
- sp->so_head = NULL;
- ACCEPT_UNLOCK();
+
+ sp = TAILQ_FIRST(&head->sol_incomp);
+ TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
+ head->sol_incqlen--;
+ SOCK_LOCK(sp);
+ sp->so_qstate = SQ_NONE;
+ sp->so_listen = NULL;
+ SOCK_UNLOCK(sp);
+ sorele(head); /* does SOLISTEN_UNLOCK, head stays */
soabort(sp);
- ACCEPT_LOCK();
+ SOLISTEN_LOCK(head);
}
- TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
- so->so_qstate |= SQ_INCOMP;
- head->so_incqlen++;
+ TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
+ so->so_qstate = SQ_INCOMP;
+ head->sol_incqlen++;
+ SOLISTEN_UNLOCK(head);
}
- ACCEPT_UNLOCK();
- if (connstatus) {
- sorwakeup(head);
- wakeup_one(&head->so_timeo);
+ return (so);
+}
+
+#ifdef SCTP
+/*
+ * Socket part of sctp_peeloff(). Detach a new socket from an
+ * association. The new socket is returned with a reference.
+ */
+struct socket *
+sopeeloff(struct socket *head)
+{
+ struct socket *so;
+
+ VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
+ __func__, __LINE__, head));
+ so = soalloc(head->so_vnet);
+ if (so == NULL) {
+ log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
+ "limit reached or out of memory\n",
+ __func__, head->so_pcb);
+ return (NULL);
}
+ so->so_type = head->so_type;
+ so->so_options = head->so_options;
+ so->so_linger = head->so_linger;
+ so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
+ so->so_fibnum = head->so_fibnum;
+ so->so_proto = head->so_proto;
+ so->so_cred = crhold(head->so_cred);
+#ifdef MAC
+ mac_socket_newconn(head, so);
+#endif
+ knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
+ so_rdknl_assert_locked, so_rdknl_assert_unlocked);
+ knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
+ so_wrknl_assert_locked, so_wrknl_assert_unlocked);
+ VNET_SO_ASSERT(head);
+ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
+ sodealloc(so);
+ log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
+ __func__, head->so_pcb);
+ return (NULL);
+ }
+ if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+ sodealloc(so);
+ log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
+ __func__, head->so_pcb);
+ return (NULL);
+ }
+ so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
+ so->so_snd.sb_lowat = head->so_snd.sb_lowat;
+ so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
+ so->so_snd.sb_timeo = head->so_snd.sb_timeo;
+ so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
+ so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
+
+ soref(so);
+
return (so);
}
+#endif /* SCTP */
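
The userspace counterpart of sopeeloff(), sketched for orientation
(assuming a one-to-many SCTP socket and a known association id; the
function name is illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/sctp.h>

/* Branch one association off a one-to-many socket into its own
 * one-to-one socket; in the kernel this reaches sopeeloff() above. */
static int
branch_association(int sd, sctp_assoc_t assoc_id)
{

	return (sctp_peeloff(sd, assoc_id));
}
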
int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
@@ -766,13 +832,140 @@ solisten_proto_check(struct socket *so)
void
solisten_proto(struct socket *so, int backlog)
{
+ int sbrcv_lowat, sbsnd_lowat;
+ u_int sbrcv_hiwat, sbsnd_hiwat;
+ short sbrcv_flags, sbsnd_flags;
+ sbintime_t sbrcv_timeo, sbsnd_timeo;
SOCK_LOCK_ASSERT(so);
+ if (SOLISTENING(so))
+ goto listening;
+
+ /*
+	 * Change this socket to the listening state.
+ */
+ sbrcv_lowat = so->so_rcv.sb_lowat;
+ sbsnd_lowat = so->so_snd.sb_lowat;
+ sbrcv_hiwat = so->so_rcv.sb_hiwat;
+ sbsnd_hiwat = so->so_snd.sb_hiwat;
+ sbrcv_flags = so->so_rcv.sb_flags;
+ sbsnd_flags = so->so_snd.sb_flags;
+ sbrcv_timeo = so->so_rcv.sb_timeo;
+ sbsnd_timeo = so->so_snd.sb_timeo;
+
+ sbdestroy(&so->so_snd, so);
+ sbdestroy(&so->so_rcv, so);
+ sx_destroy(&so->so_snd.sb_sx);
+ sx_destroy(&so->so_rcv.sb_sx);
+ SOCKBUF_LOCK_DESTROY(&so->so_snd);
+ SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+
+#ifdef INVARIANTS
+ bzero(&so->so_rcv,
+ sizeof(struct socket) - offsetof(struct socket, so_rcv));
+#endif
+
+ so->sol_sbrcv_lowat = sbrcv_lowat;
+ so->sol_sbsnd_lowat = sbsnd_lowat;
+ so->sol_sbrcv_hiwat = sbrcv_hiwat;
+ so->sol_sbsnd_hiwat = sbsnd_hiwat;
+ so->sol_sbrcv_flags = sbrcv_flags;
+ so->sol_sbsnd_flags = sbsnd_flags;
+ so->sol_sbrcv_timeo = sbrcv_timeo;
+ so->sol_sbsnd_timeo = sbsnd_timeo;
+
+ so->sol_qlen = so->sol_incqlen = 0;
+ TAILQ_INIT(&so->sol_incomp);
+ TAILQ_INIT(&so->sol_comp);
+
+ so->sol_accept_filter = NULL;
+ so->sol_accept_filter_arg = NULL;
+ so->sol_accept_filter_str = NULL;
+
+ so->sol_upcall = NULL;
+ so->sol_upcallarg = NULL;
+
+ so->so_options |= SO_ACCEPTCONN;
+
+listening:
if (backlog < 0 || backlog > somaxconn)
backlog = somaxconn;
- so->so_qlimit = backlog;
- so->so_options |= SO_ACCEPTCONN;
+ so->sol_qlimit = backlog;
+}
+
+/*
+ * Wake up listeners/subsystems once we have a complete connection.
+ * Called with the listening socket locked; returns with it unlocked.
+ */
+void
+solisten_wakeup(struct socket *sol)
+{
+
+ if (sol->sol_upcall != NULL)
+		(void)sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
+ else {
+ selwakeuppri(&sol->so_rdsel, PSOCK);
+ KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
+ }
+ SOLISTEN_UNLOCK(sol);
+ wakeup_one(&sol->sol_comp);
+}
+
+/*
+ * Return a single connection off a listening socket queue. The main
+ * consumer of the function is kern_accept4(). Some modules that do
+ * their own accept management also use the function.
+ *
+ * The listening socket must be locked on entry and is returned
+ * unlocked on return.
+ * The flags argument is a set of accept4(2) flags and ACCEPT4_INHERIT.
+ */
+int
+solisten_dequeue(struct socket *head, struct socket **ret, int flags)
+{
+ struct socket *so;
+ int error;
+
+ SOLISTEN_LOCK_ASSERT(head);
+
+ while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
+ head->so_error == 0) {
+ error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH,
+ "accept", 0);
+ if (error != 0) {
+ SOLISTEN_UNLOCK(head);
+ return (error);
+ }
+ }
+ if (head->so_error) {
+ error = head->so_error;
+ head->so_error = 0;
+ SOLISTEN_UNLOCK(head);
+ return (error);
+ }
+ if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) {
+ SOLISTEN_UNLOCK(head);
+ return (EWOULDBLOCK);
+ }
+ so = TAILQ_FIRST(&head->sol_comp);
+ SOCK_LOCK(so);
+ KASSERT(so->so_qstate == SQ_COMP,
+ ("%s: so %p not SQ_COMP", __func__, so));
+ soref(so);
+ head->sol_qlen--;
+ so->so_qstate = SQ_NONE;
+ so->so_listen = NULL;
+ TAILQ_REMOVE(&head->sol_comp, so, so_list);
+ if (flags & ACCEPT4_INHERIT)
+ so->so_state |= (head->so_state & SS_NBIO);
+ else
+ so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
+ SOCK_UNLOCK(so);
+ sorele(head);
+
+ *ret = so;
+ return (0);
}
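
A minimal sketch of the consumer pattern the comment above describes,
assuming kernel context and the symbols introduced in this change (the
function name is illustrative):

/* Dequeue one ready connection from a listening socket. */
static int
example_accept_one(struct socket *head, struct socket **sop)
{
	int error;

	SOLISTEN_LOCK(head);
	error = solisten_dequeue(head, sop, 0);
	/* On success, *sop is referenced; both sockets return unlocked. */
	return (error);
}
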
/*
@@ -799,44 +992,62 @@ void
sofree(struct socket *so)
{
struct protosw *pr = so->so_proto;
- struct socket *head;
- ACCEPT_LOCK_ASSERT();
SOCK_LOCK_ASSERT(so);
if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
- (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
+ (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
SOCK_UNLOCK(so);
- ACCEPT_UNLOCK();
return;
}
- head = so->so_head;
- if (head != NULL) {
- KASSERT((so->so_qstate & SQ_COMP) != 0 ||
- (so->so_qstate & SQ_INCOMP) != 0,
- ("sofree: so_head != NULL, but neither SQ_COMP nor "
- "SQ_INCOMP"));
- KASSERT((so->so_qstate & SQ_COMP) == 0 ||
- (so->so_qstate & SQ_INCOMP) == 0,
- ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
- TAILQ_REMOVE(&head->so_incomp, so, so_list);
- head->so_incqlen--;
- so->so_qstate &= ~SQ_INCOMP;
- so->so_head = NULL;
- }
- KASSERT((so->so_qstate & SQ_COMP) == 0 &&
- (so->so_qstate & SQ_INCOMP) == 0,
- ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
- so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
- if (so->so_options & SO_ACCEPTCONN) {
- KASSERT((TAILQ_EMPTY(&so->so_comp)),
- ("sofree: so_comp populated"));
- KASSERT((TAILQ_EMPTY(&so->so_incomp)),
- ("sofree: so_incomp populated"));
+ if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
+ struct socket *sol;
+
+ sol = so->so_listen;
+ KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));
+
+ /*
+		 * To solve the race between the close of a listening socket
+		 * and a socket on its incomplete queue, we need to lock
+		 * both.  The order is first the listening socket, then the
+		 * regular one.  Since we have neither SS_NOFDREF nor
+		 * SS_PROTOREF, this function and the listening socket are
+		 * the only pointers to so.  To preserve so and sol, we
+		 * reference both and then relock.
+		 * After relocking, the socket may not move to so_comp since
+		 * it no longer has a PCB, but it may be removed from
+		 * so_incomp.  If that happens, we share responsibility for
+		 * freeing the socket, but soclose() has already removed it
+		 * from the queue.
+ */
+ soref(sol);
+ soref(so);
+ SOCK_UNLOCK(so);
+ SOLISTEN_LOCK(sol);
+ SOCK_LOCK(so);
+ if (so->so_qstate == SQ_INCOMP) {
+ KASSERT(so->so_listen == sol,
+ ("%s: so %p migrated out of sol %p",
+ __func__, so, sol));
+ TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
+ sol->sol_incqlen--;
+			/* This is guaranteed not to be the last. */
+ refcount_release(&sol->so_count);
+ so->so_qstate = SQ_NONE;
+ so->so_listen = NULL;
+ } else
+ KASSERT(so->so_listen == NULL,
+ ("%s: so %p not on (in)comp with so_listen",
+ __func__, so));
+ sorele(sol);
+ KASSERT(so->so_count == 1,
+ ("%s: so %p count %u", __func__, so, so->so_count));
+ so->so_count = 0;
}
+ if (SOLISTENING(so))
+ so->so_error = ECONNABORTED;
SOCK_UNLOCK(so);
- ACCEPT_UNLOCK();
VNET_SO_ASSERT(so);
if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
@@ -858,12 +1069,14 @@ sofree(struct socket *so)
	 * before calling pru_detach. This means that protocols should not
	 * assume they can perform socket wakeups, etc., in their detach code.
*/
- sbdestroy(&so->so_snd, so);
- sbdestroy(&so->so_rcv, so);
- seldrain(&so->so_snd.sb_sel);
- seldrain(&so->so_rcv.sb_sel);
- knlist_destroy(&so->so_rcv.sb_sel.si_note);
- knlist_destroy(&so->so_snd.sb_sel.si_note);
+ if (!SOLISTENING(so)) {
+ sbdestroy(&so->so_snd, so);
+ sbdestroy(&so->so_rcv, so);
+ }
+ seldrain(&so->so_rdsel);
+ seldrain(&so->so_wrsel);
+ knlist_destroy(&so->so_rdsel.si_note);
+ knlist_destroy(&so->so_wrsel.si_note);
sodealloc(so);
}
@@ -878,6 +1091,8 @@ sofree(struct socket *so)
int
soclose(struct socket *so)
{
+ struct accept_queue lqueue;
+ bool listening;
int error = 0;
KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
@@ -910,41 +1125,42 @@ soclose(struct socket *so)
drop:
if (so->so_proto->pr_usrreqs->pru_close != NULL)
(*so->so_proto->pr_usrreqs->pru_close)(so);
- ACCEPT_LOCK();
- if (so->so_options & SO_ACCEPTCONN) {
+
+ SOCK_LOCK(so);
+ if ((listening = (so->so_options & SO_ACCEPTCONN))) {
struct socket *sp;
- /*
- * Prevent new additions to the accept queues due
- * to ACCEPT_LOCK races while we are draining them.
- */
- so->so_options &= ~SO_ACCEPTCONN;
- while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
- TAILQ_REMOVE(&so->so_incomp, sp, so_list);
- so->so_incqlen--;
- sp->so_qstate &= ~SQ_INCOMP;
- sp->so_head = NULL;
- ACCEPT_UNLOCK();
- soabort(sp);
- ACCEPT_LOCK();
- }
- while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
- TAILQ_REMOVE(&so->so_comp, sp, so_list);
- so->so_qlen--;
- sp->so_qstate &= ~SQ_COMP;
- sp->so_head = NULL;
- ACCEPT_UNLOCK();
- soabort(sp);
- ACCEPT_LOCK();
+
+ TAILQ_INIT(&lqueue);
+ TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
+ TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);
+
+ so->sol_qlen = so->sol_incqlen = 0;
+
+ TAILQ_FOREACH(sp, &lqueue, so_list) {
+ SOCK_LOCK(sp);
+ sp->so_qstate = SQ_NONE;
+ sp->so_listen = NULL;
+ SOCK_UNLOCK(sp);
+ /* Guaranteed not to be the last. */
+ refcount_release(&so->so_count);
}
- KASSERT((TAILQ_EMPTY(&so->so_comp)),
- ("%s: so_comp populated", __func__));
- KASSERT((TAILQ_EMPTY(&so->so_incomp)),
- ("%s: so_incomp populated", __func__));
}
- SOCK_LOCK(so);
KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
so->so_state |= SS_NOFDREF;
- sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */
+ sorele(so);
+ if (listening) {
+ struct socket *sp;
+
+ TAILQ_FOREACH(sp, &lqueue, so_list) {
+ SOCK_LOCK(sp);
+ if (sp->so_count == 0) {
+ SOCK_UNLOCK(sp);
+ soabort(sp);
+ } else
+ /* sp is now in sofree() */
+ SOCK_UNLOCK(sp);
+ }
+ }
CURVNET_RESTORE();
return (error);
}
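
The queue handling in the soclose() hunk above follows a common idiom:
swap the shared queue into a local head under the lock, then walk the
local copy without holding it. A self-contained sketch with sys/queue.h
(all names here are illustrative):

#include <sys/queue.h>

struct item {
	TAILQ_ENTRY(item) link;
};
TAILQ_HEAD(itemq, item);

/* Steal the whole queue in O(1) while the caller holds the lock
 * protecting it, then dispose of the entries without the lock. */
static void
drain_queue(struct itemq *shared, void (*dispose)(struct item *))
{
	struct itemq lqueue;
	struct item *it, *tmp;

	TAILQ_INIT(&lqueue);
	TAILQ_SWAP(&lqueue, shared, item, link);
	TAILQ_FOREACH_SAFE(it, &lqueue, link, tmp)
		dispose(it);
}
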
@@ -976,13 +1192,11 @@ soabort(struct socket *so)
KASSERT(so->so_count == 0, ("soabort: so_count"));
KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
- KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
- KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
+ KASSERT(so->so_qstate == SQ_NONE, ("soabort: !SQ_NONE"));
VNET_SO_ASSERT(so);
if (so->so_proto->pr_usrreqs->pru_abort != NULL)
(*so->so_proto->pr_usrreqs->pru_abort)(so);
- ACCEPT_LOCK();
SOCK_LOCK(so);
sofree(so);
}
@@ -1431,8 +1645,14 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
int error;
CURVNET_SET(so->so_vnet);
- error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
- control, flags, td);
+ if (!SOLISTENING(so))
+ error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio,
+ top, control, flags, td);
+ else {
+ m_freem(top);
+ m_freem(control);
+ error = ENOTCONN;
+ }
CURVNET_RESTORE();
return (error);
}
@@ -2368,8 +2588,11 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
int error;
CURVNET_SET(so->so_vnet);
- error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
- controlp, flagsp));
+ if (!SOLISTENING(so))
+ error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio,
+ mp0, controlp, flagsp));
+ else
+ error = ENOTCONN;
CURVNET_RESTORE();
return (error);
}
@@ -2565,7 +2788,7 @@ sosetopt(struct socket *so, struct sockopt *sopt)
} else {
switch (sopt->sopt_name) {
case SO_ACCEPTFILTER:
- error = do_setopt_accept_filter(so, sopt);
+ error = accept_filt_setopt(so, sopt);
if (error)
goto bad;
break;
@@ -2653,38 +2876,7 @@ sosetopt(struct socket *so, struct sockopt *sopt)
goto bad;
}
- switch (sopt->sopt_name) {
- case SO_SNDBUF:
- case SO_RCVBUF:
- if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
- &so->so_snd : &so->so_rcv, (u_long)optval,
- so, curthread) == 0) {
- error = ENOBUFS;
- goto bad;
- }
- (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
- &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
- break;
-
- /*
- * Make sure the low-water is never greater than the
- * high-water.
- */
- case SO_SNDLOWAT:
- SOCKBUF_LOCK(&so->so_snd);
- so->so_snd.sb_lowat =
- (optval > so->so_snd.sb_hiwat) ?
- so->so_snd.sb_hiwat : optval;
- SOCKBUF_UNLOCK(&so->so_snd);
- break;
- case SO_RCVLOWAT:
- SOCKBUF_LOCK(&so->so_rcv);
- so->so_rcv.sb_lowat =
- (optval > so->so_rcv.sb_hiwat) ?
- so->so_rcv.sb_hiwat : optval;
- SOCKBUF_UNLOCK(&so->so_rcv);
- break;
- }
+ error = sbsetopt(so, sopt->sopt_name, optval);
break;
case SO_SNDTIMEO:
@@ -2825,7 +3017,7 @@ sogetopt(struct socket *so, struct sockopt *sopt)
} else {
switch (sopt->sopt_name) {
case SO_ACCEPTFILTER:
- error = do_getopt_accept_filter(so, sopt);
+ error = accept_filt_getopt(so, sopt);
break;
case SO_LINGER:
@@ -2869,19 +3061,23 @@ integer:
goto integer;
case SO_SNDBUF:
- optval = so->so_snd.sb_hiwat;
+ optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
+ so->so_snd.sb_hiwat;
goto integer;
case SO_RCVBUF:
- optval = so->so_rcv.sb_hiwat;
+ optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
+ so->so_rcv.sb_hiwat;
goto integer;
case SO_SNDLOWAT:
- optval = so->so_snd.sb_lowat;
+ optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
+ so->so_snd.sb_lowat;
goto integer;
case SO_RCVLOWAT:
- optval = so->so_rcv.sb_lowat;
+ optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
+ so->so_rcv.sb_lowat;
goto integer;
case SO_SNDTIMEO:
@@ -2933,15 +3129,15 @@ integer:
break;
case SO_LISTENQLIMIT:
- optval = so->so_qlimit;
+ optval = SOLISTENING(so) ? so->sol_qlimit : 0;
goto integer;
case SO_LISTENQLEN:
- optval = so->so_qlen;
+ optval = SOLISTENING(so) ? so->sol_qlen : 0;
goto integer;
case SO_LISTENINCQLEN:
- optval = so->so_incqlen;
+ optval = SOLISTENING(so) ? so->sol_incqlen : 0;
goto integer;
case SO_TS_CLOCK:
@@ -3092,7 +3288,7 @@ sohasoutofband(struct socket *so)
if (so->so_sigio != NULL)
pgsigio(&so->so_sigio, SIGURG, 0);
#endif /* __rtems__ */
- selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
+ selwakeuppri(&so->so_rdsel, PSOCK);
}
int
@@ -3112,44 +3308,54 @@ int
sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
struct thread *td)
{
- int revents = 0;
-
- SOCKBUF_LOCK(&so->so_snd);
- SOCKBUF_LOCK(&so->so_rcv);
- if (events & (POLLIN | POLLRDNORM))
- if (soreadabledata(so))
- revents |= events & (POLLIN | POLLRDNORM);
+ int revents;
- if (events & (POLLOUT | POLLWRNORM))
- if (sowriteable(so))
- revents |= events & (POLLOUT | POLLWRNORM);
-
- if (events & (POLLPRI | POLLRDBAND))
- if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
- revents |= events & (POLLPRI | POLLRDBAND);
-
- if ((events & POLLINIGNEOF) == 0) {
- if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
- revents |= events & (POLLIN | POLLRDNORM);
- if (so->so_snd.sb_state & SBS_CANTSENDMORE)
- revents |= POLLHUP;
+ SOCK_LOCK(so);
+ if (SOLISTENING(so)) {
+ if (!(events & (POLLIN | POLLRDNORM)))
+ revents = 0;
+ else if (!TAILQ_EMPTY(&so->sol_comp))
+ revents = events & (POLLIN | POLLRDNORM);
+ else {
+ selrecord(td, &so->so_rdsel);
+ revents = 0;
}
- }
-
- if (revents == 0) {
- if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
- selrecord(td, &so->so_rcv.sb_sel);
- so->so_rcv.sb_flags |= SB_SEL;
+ } else {
+ revents = 0;
+ SOCKBUF_LOCK(&so->so_snd);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (events & (POLLIN | POLLRDNORM))
+ if (soreadabledata(so))
+ revents |= events & (POLLIN | POLLRDNORM);
+ if (events & (POLLOUT | POLLWRNORM))
+ if (sowriteable(so))
+ revents |= events & (POLLOUT | POLLWRNORM);
+ if (events & (POLLPRI | POLLRDBAND))
+ if (so->so_oobmark ||
+ (so->so_rcv.sb_state & SBS_RCVATMARK))
+ revents |= events & (POLLPRI | POLLRDBAND);
+ if ((events & POLLINIGNEOF) == 0) {
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ revents |= events & (POLLIN | POLLRDNORM);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE)
+ revents |= POLLHUP;
+ }
}
-
- if (events & (POLLOUT | POLLWRNORM)) {
- selrecord(td, &so->so_snd.sb_sel);
- so->so_snd.sb_flags |= SB_SEL;
+ if (revents == 0) {
+ if (events &
+ (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
+ selrecord(td, &so->so_rdsel);
+ so->so_rcv.sb_flags |= SB_SEL;
+ }
+ if (events & (POLLOUT | POLLWRNORM)) {
+ selrecord(td, &so->so_wrsel);
+ so->so_snd.sb_flags |= SB_SEL;
+ }
}
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_snd);
}
-
- SOCKBUF_UNLOCK(&so->so_rcv);
- SOCKBUF_UNLOCK(&so->so_snd);
+ SOCK_UNLOCK(so);
return (revents);
}
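
Seen from userspace, the listening branch of sopoll_generic() above means
POLLIN on a listener reports a non-empty complete queue. A hedged sketch
(names and timeout are illustrative):

#include <poll.h>

/* Returns >0 when at least one completed connection is queued. */
static int
wait_for_connection(int lsock, int timeout_ms)
{
	struct pollfd pfd = { .fd = lsock, .events = POLLIN };

	return (poll(&pfd, 1, timeout_ms));
}
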
@@ -3158,28 +3364,38 @@ soo_kqfilter(struct file *fp, struct knote *kn)
{
struct socket *so = kn->kn_fp->f_data;
struct sockbuf *sb;
+ struct knlist *knl;
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &soread_filtops;
+ knl = &so->so_rdsel.si_note;
sb = &so->so_rcv;
break;
case EVFILT_WRITE:
kn->kn_fop = &sowrite_filtops;
+ knl = &so->so_wrsel.si_note;
sb = &so->so_snd;
break;
case EVFILT_EMPTY:
kn->kn_fop = &soempty_filtops;
+ knl = &so->so_wrsel.si_note;
sb = &so->so_snd;
break;
default:
return (EINVAL);
}
- SOCKBUF_LOCK(sb);
- knlist_add(&sb->sb_sel.si_note, kn, 1);
- sb->sb_flags |= SB_KNOTE;
- SOCKBUF_UNLOCK(sb);
+ SOCK_LOCK(so);
+ if (SOLISTENING(so)) {
+ knlist_add(knl, kn, 1);
+ } else {
+ SOCKBUF_LOCK(sb);
+ knlist_add(knl, kn, 1);
+ sb->sb_flags |= SB_KNOTE;
+ SOCKBUF_UNLOCK(sb);
+ }
+ SOCK_UNLOCK(so);
return (0);
}
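
The corresponding kevent(2) view, sketched under the same assumptions:
EVFILT_READ on a listening socket fires when sol_comp is non-empty, and
filt_soread() below reports sol_qlen in the data field.

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
watch_listener(int kq, int lsock)
{
	struct kevent kev;

	EV_SET(&kev, lsock, EVFILT_READ, EV_ADD, 0, 0, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}
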
#ifdef __rtems__
@@ -3367,11 +3583,11 @@ filt_sordetach(struct knote *kn)
{
struct socket *so = kn->kn_fp->f_data;
- SOCKBUF_LOCK(&so->so_rcv);
- knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
- if (knlist_empty(&so->so_rcv.sb_sel.si_note))
+ so_rdknl_lock(so);
+ knlist_remove(&so->so_rdsel.si_note, kn, 1);
+ if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
so->so_rcv.sb_flags &= ~SB_KNOTE;
- SOCKBUF_UNLOCK(&so->so_rcv);
+ so_rdknl_unlock(so);
}
/*ARGSUSED*/
@@ -3381,11 +3597,13 @@ filt_soread(struct knote *kn, long hint)
struct socket *so;
so = kn->kn_fp->f_data;
- if (so->so_options & SO_ACCEPTCONN) {
- kn->kn_data = so->so_qlen;
- return (!TAILQ_EMPTY(&so->so_comp));
+ if (SOLISTENING(so)) {
+ SOCK_LOCK_ASSERT(so);
+ kn->kn_data = so->sol_qlen;
+ return (!TAILQ_EMPTY(&so->sol_comp));
}
+
SOCKBUF_LOCK_ASSERT(&so->so_rcv);
kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
@@ -3411,11 +3629,11 @@ filt_sowdetach(struct knote *kn)
{
struct socket *so = kn->kn_fp->f_data;
- SOCKBUF_LOCK(&so->so_snd);
- knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
- if (knlist_empty(&so->so_snd.sb_sel.si_note))
+ so_wrknl_lock(so);
+ knlist_remove(&so->so_wrsel.si_note, kn, 1);
+ if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
so->so_snd.sb_flags &= ~SB_KNOTE;
- SOCKBUF_UNLOCK(&so->so_snd);
+ so_wrknl_unlock(so);
}
/*ARGSUSED*/
@@ -3425,6 +3643,10 @@ filt_sowrite(struct knote *kn, long hint)
struct socket *so;
so = kn->kn_fp->f_data;
+
+ if (SOLISTENING(so))
+ return (0);
+
SOCKBUF_LOCK_ASSERT(&so->so_snd);
kn->kn_data = sbspace(&so->so_snd);
@@ -3451,6 +3673,10 @@ filt_soempty(struct knote *kn, long hint)
struct socket *so;
so = kn->kn_fp->f_data;
+
+ if (SOLISTENING(so))
+ return (1);
+
SOCKBUF_LOCK_ASSERT(&so->so_snd);
kn->kn_data = sbused(&so->so_snd);
@@ -3521,42 +3747,52 @@ soisconnected(struct socket *so)
struct socket *head;
int ret;
+ /*
+ * XXXGL: this is the only place where we acquire socket locks
+ * in reverse order: first child, then listening socket. To
+	 * avoid a possible LOR (lock-order reversal), use try semantics.
+ */
restart:
- ACCEPT_LOCK();
SOCK_LOCK(so);
+ if ((head = so->so_listen) != NULL &&
+ __predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
+ SOCK_UNLOCK(so);
+ goto restart;
+ }
so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
so->so_state |= SS_ISCONNECTED;
- head = so->so_head;
- if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
+ if (head != NULL && (so->so_qstate == SQ_INCOMP)) {
+again:
if ((so->so_options & SO_ACCEPTFILTER) == 0) {
+ TAILQ_REMOVE(&head->sol_incomp, so, so_list);
+ head->sol_incqlen--;
+ TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
+ head->sol_qlen++;
+ so->so_qstate = SQ_COMP;
SOCK_UNLOCK(so);
- TAILQ_REMOVE(&head->so_incomp, so, so_list);
- head->so_incqlen--;
- so->so_qstate &= ~SQ_INCOMP;
- TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
- head->so_qlen++;
- so->so_qstate |= SQ_COMP;
- ACCEPT_UNLOCK();
- sorwakeup(head);
- wakeup_one(&head->so_timeo);
+ solisten_wakeup(head); /* unlocks */
} else {
- ACCEPT_UNLOCK();
+ SOCKBUF_LOCK(&so->so_rcv);
soupcall_set(so, SO_RCV,
- head->so_accf->so_accept_filter->accf_callback,
- head->so_accf->so_accept_filter_arg);
+ head->sol_accept_filter->accf_callback,
+ head->sol_accept_filter_arg);
so->so_options &= ~SO_ACCEPTFILTER;
- ret = head->so_accf->so_accept_filter->accf_callback(so,
- head->so_accf->so_accept_filter_arg, M_NOWAIT);
- if (ret == SU_ISCONNECTED)
+ ret = head->sol_accept_filter->accf_callback(so,
+ head->sol_accept_filter_arg, M_NOWAIT);
+ if (ret == SU_ISCONNECTED) {
soupcall_clear(so, SO_RCV);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto again;
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
SOCK_UNLOCK(so);
- if (ret == SU_ISCONNECTED)
- goto restart;
+ SOLISTEN_UNLOCK(head);
}
return;
}
+ if (head != NULL)
+ SOLISTEN_UNLOCK(head);
SOCK_UNLOCK(so);
- ACCEPT_UNLOCK();
wakeup(&so->so_timeo);
sorwakeup(so);
sowwakeup(so);
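
The restart loop above is a general lock-order-reversal avoidance idiom:
take the lower lock, trylock the upper one, and back off completely on
failure. A self-contained illustration with POSIX threads (not the
kernel's code):

#include <pthread.h>

/* Acquire child then parent without risking deadlock against threads
 * that lock parent first: trylock the parent and retry on failure. */
static void
lock_pair_reverse(pthread_mutex_t *child, pthread_mutex_t *parent)
{
	for (;;) {
		pthread_mutex_lock(child);
		if (pthread_mutex_trylock(parent) == 0)
			return;			/* both locks held */
		pthread_mutex_unlock(child);	/* back off and retry */
	}
}
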
@@ -3566,16 +3802,17 @@ void
soisdisconnecting(struct socket *so)
{
- /*
- * Note: This code assumes that SOCK_LOCK(so) and
- * SOCKBUF_LOCK(&so->so_rcv) are the same.
- */
- SOCKBUF_LOCK(&so->so_rcv);
+ SOCK_LOCK(so);
so->so_state &= ~SS_ISCONNECTING;
so->so_state |= SS_ISDISCONNECTING;
- socantrcvmore_locked(so);
- SOCKBUF_LOCK(&so->so_snd);
- socantsendmore_locked(so);
+
+ if (!SOLISTENING(so)) {
+ SOCKBUF_LOCK(&so->so_rcv);
+ socantrcvmore_locked(so);
+ SOCKBUF_LOCK(&so->so_snd);
+ socantsendmore_locked(so);
+ }
+ SOCK_UNLOCK(so);
wakeup(&so->so_timeo);
}
@@ -3583,17 +3820,18 @@ void
soisdisconnected(struct socket *so)
{
- /*
- * Note: This code assumes that SOCK_LOCK(so) and
- * SOCKBUF_LOCK(&so->so_rcv) are the same.
- */
- SOCKBUF_LOCK(&so->so_rcv);
+ SOCK_LOCK(so);
so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= SS_ISDISCONNECTED;
- socantrcvmore_locked(so);
- SOCKBUF_LOCK(&so->so_snd);
- sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
- socantsendmore_locked(so);
+
+ if (!SOLISTENING(so)) {
+ SOCKBUF_LOCK(&so->so_rcv);
+ socantrcvmore_locked(so);
+ SOCKBUF_LOCK(&so->so_snd);
+ sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
+ socantsendmore_locked(so);
+ }
+ SOCK_UNLOCK(so);
wakeup(&so->so_timeo);
}
@@ -3615,11 +3853,12 @@ sodupsockaddr(const struct sockaddr *sa, int mflags)
* Register per-socket buffer upcalls.
*/
void
-soupcall_set(struct socket *so, int which,
- int (*func)(struct socket *, void *, int), void *arg)
+soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg)
{
struct sockbuf *sb;
+ KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
+
switch (which) {
case SO_RCV:
sb = &so->so_rcv;
@@ -3631,10 +3870,6 @@ soupcall_set(struct socket *so, int which,
panic("soupcall_set: bad which");
}
SOCKBUF_LOCK_ASSERT(sb);
-#if 0
- /* XXX: accf_http actually wants to do this on purpose. */
- KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
-#endif
sb->sb_upcall = func;
sb->sb_upcallarg = arg;
sb->sb_flags |= SB_UPCALL;
@@ -3645,6 +3880,8 @@ soupcall_clear(struct socket *so, int which)
{
struct sockbuf *sb;
+ KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
+
switch (which) {
case SO_RCV:
sb = &so->so_rcv;
@@ -3656,12 +3893,110 @@ soupcall_clear(struct socket *so, int which)
panic("soupcall_clear: bad which");
}
SOCKBUF_LOCK_ASSERT(sb);
- KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
+ KASSERT(sb->sb_upcall != NULL,
+ ("%s: so %p no upcall to clear", __func__, so));
sb->sb_upcall = NULL;
sb->sb_upcallarg = NULL;
sb->sb_flags &= ~SB_UPCALL;
}
+void
+solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
+{
+
+ SOLISTEN_LOCK_ASSERT(so);
+ so->sol_upcall = func;
+ so->sol_upcallarg = arg;
+}
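
A hedged in-kernel sketch of registering a listen upcall via the new
interface; the callback body and names are illustrative, and SU_OK is
assumed as the conventional "no further action" upcall return value.

/* Called from solisten_wakeup() when a connection completes. */
static int
example_listen_upcall(struct socket *head, void *arg, int waitflag)
{

	/* Defer real work; this runs from wakeup context. */
	return (SU_OK);
}

static void
example_attach_upcall(struct socket *so, void *ctx)
{

	SOLISTEN_LOCK(so);
	solisten_upcall_set(so, example_listen_upcall, ctx);
	SOLISTEN_UNLOCK(so);
}
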
+
+static void
+so_rdknl_lock(void *arg)
+{
+ struct socket *so = arg;
+
+ if (SOLISTENING(so))
+ SOCK_LOCK(so);
+ else
+ SOCKBUF_LOCK(&so->so_rcv);
+}
+
+static void
+so_rdknl_unlock(void *arg)
+{
+ struct socket *so = arg;
+
+ if (SOLISTENING(so))
+ SOCK_UNLOCK(so);
+ else
+ SOCKBUF_UNLOCK(&so->so_rcv);
+}
+
+static void
+so_rdknl_assert_locked(void *arg)
+{
+ struct socket *so = arg;
+
+ if (SOLISTENING(so))
+ SOCK_LOCK_ASSERT(so);
+ else
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+}
+
+static void
+so_rdknl_assert_unlocked(void *arg)
+{
+ struct socket *so = arg;
+
+ if (SOLISTENING(so))
+ SOCK_UNLOCK_ASSERT(so);
+ else
+ SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
+}
+
+static void
+so_wrknl_lock(void *arg)
+{
+ struct socket *so = arg;
+
+ if (SOLISTENING(so))
+ SOCK_LOCK(so);
+ else
+ SOCKBUF_LOCK(&so->so_snd);
+}
+
+static void
+so_wrknl_unlock(void *arg)
+{
+ struct socket *so = arg;
+
+ if (SOLISTENING(so))
+ SOCK_UNLOCK(so);
+ else
+ SOCKBUF_UNLOCK(&so->so_snd);
+}
+
+static void
+so_wrknl_assert_locked(void *arg)
+{
+ struct socket *so = arg;
+
+ if (SOLISTENING(so))
+ SOCK_LOCK_ASSERT(so);
+ else
+ SOCKBUF_LOCK_ASSERT(&so->so_snd);
+}
+
+static void
+so_wrknl_assert_unlocked(void *arg)
+{
+ struct socket *so = arg;
+
+ if (SOLISTENING(so))
+ SOCK_UNLOCK_ASSERT(so);
+ else
+ SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
+}
+
/*
* Create an external-format (``xsocket'') structure using the information in
* the kernel-format socket structure pointed to by so. This is done to
@@ -3683,36 +4018,28 @@ sotoxsocket(struct socket *so, struct xsocket *xso)
xso->so_pcb = so->so_pcb;
xso->xso_protocol = so->so_proto->pr_protocol;
xso->xso_family = so->so_proto->pr_domain->dom_family;
- xso->so_qlen = so->so_qlen;
- xso->so_incqlen = so->so_incqlen;
- xso->so_qlimit = so->so_qlimit;
xso->so_timeo = so->so_timeo;
xso->so_error = so->so_error;
- xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
- xso->so_oobmark = so->so_oobmark;
- sbtoxsockbuf(&so->so_snd, &xso->so_snd);
- sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
#ifndef __rtems__
xso->so_uid = so->so_cred->cr_uid;
#else /* __rtems__ */
xso->so_uid = BSD_DEFAULT_UID;
#endif /* __rtems__ */
-}
-
-
-/*
- * Socket accessor functions to provide external consumers with
- * a safe interface to socket state
- *
- */
-
-void
-so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *),
- void *arg)
-{
-
- TAILQ_FOREACH(so, &so->so_comp, so_list)
- func(so, arg);
+ xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
+ if (SOLISTENING(so)) {
+ xso->so_qlen = so->sol_qlen;
+ xso->so_incqlen = so->sol_incqlen;
+ xso->so_qlimit = so->sol_qlimit;
+ xso->so_oobmark = 0;
+ bzero(&xso->so_snd, sizeof(xso->so_snd));
+ bzero(&xso->so_rcv, sizeof(xso->so_rcv));
+ } else {
+ xso->so_state |= so->so_qstate;
+ xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
+ xso->so_oobmark = so->so_oobmark;
+ sbtoxsockbuf(&so->so_snd, &xso->so_snd);
+ sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
+ }
}
struct sockbuf *
diff --git a/freebsd/sys/kern/uipc_syscalls.c b/freebsd/sys/kern/uipc_syscalls.c
index f301c12c..5a9a381f 100644
--- a/freebsd/sys/kern/uipc_syscalls.c
+++ b/freebsd/sys/kern/uipc_syscalls.c
@@ -70,13 +70,6 @@ __FBSDID("$FreeBSD$");
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
-/*
- * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
- * and SOCK_NONBLOCK.
- */
-#define ACCEPT4_INHERIT 0x1
-#define ACCEPT4_COMPAT 0x2
-
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
@@ -524,59 +517,22 @@ kern_accept4(struct thread *td, int s, struct sockaddr **name,
(flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps);
if (error != 0)
goto done;
- ACCEPT_LOCK();
- if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
- ACCEPT_UNLOCK();
- error = EWOULDBLOCK;
- goto noconnection;
- }
- while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
- if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
- head->so_error = ECONNABORTED;
- break;
- }
- error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
- "accept", 0);
- if (error != 0) {
- ACCEPT_UNLOCK();
- goto noconnection;
- }
- }
- if (head->so_error) {
- error = head->so_error;
- head->so_error = 0;
- ACCEPT_UNLOCK();
+ SOCK_LOCK(head);
+ if (!SOLISTENING(head)) {
+ SOCK_UNLOCK(head);
+ error = EINVAL;
goto noconnection;
}
- so = TAILQ_FIRST(&head->so_comp);
- KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
- KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
- /*
- * Before changing the flags on the socket, we have to bump the
- * reference count. Otherwise, if the protocol calls sofree(),
- * the socket will be released due to a zero refcount.
- */
- SOCK_LOCK(so); /* soref() and so_state update */
- soref(so); /* file descriptor reference */
-
- TAILQ_REMOVE(&head->so_comp, so, so_list);
- head->so_qlen--;
- if (flags & ACCEPT4_INHERIT)
- so->so_state |= (head->so_state & SS_NBIO);
- else
- so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
- so->so_qstate &= ~SQ_COMP;
- so->so_head = NULL;
-
- SOCK_UNLOCK(so);
- ACCEPT_UNLOCK();
+ error = solisten_dequeue(head, &so, flags);
+ if (error != 0)
+ goto noconnection;
/* An extra reference on `nfp' has been held for us by falloc(). */
td->td_retval[0] = fd;
- /* connection has been removed from the listen queue */
- KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
+ /* Connection has been removed from the listen queue. */
+ KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0);
if (flags & ACCEPT4_INHERIT) {
pgid = fgetown(&head->so_sigio);
@@ -594,7 +550,6 @@ kern_accept4(struct thread *td, int s, struct sockaddr **name,
(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
tmp = fflag & FASYNC;
(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
- sa = NULL;
error = soaccept(so, &sa);
if (error != 0)
goto noconnection;
@@ -769,7 +724,7 @@ kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
}
SOCK_LOCK(so);
while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
- error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
+ error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH,
"connec", 0);
if (error != 0) {
if (error == EINTR || error == ERESTART)
diff --git a/freebsd/sys/kern/uipc_usrreq.c b/freebsd/sys/kern/uipc_usrreq.c
index 8e60f227..7237956a 100644
--- a/freebsd/sys/kern/uipc_usrreq.c
+++ b/freebsd/sys/kern/uipc_usrreq.c
@@ -202,10 +202,9 @@ SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
/*
* Locking and synchronization:
*
- * Three types of locks exit in the local domain socket implementation: a
- * global list mutex, a global linkage rwlock, and per-unpcb mutexes. Of the
- * global locks, the list lock protects the socket count, global generation
- * number, and stream/datagram global lists. The linkage lock protects the
+ * Two types of locks exist in the local domain socket implementation:
+ * a global linkage rwlock and per-unpcb mutexes. The linkage lock protects
+ * the socket count, global generation number, stream/datagram global lists and
* interconnection of unpcbs, the v_socket and unp_vnode pointers, and can be
* held exclusively over the acquisition of multiple unpcb locks to prevent
* deadlock.
@@ -246,7 +245,6 @@ SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
* to perform namei() and other file system operations.
*/
static struct rwlock unp_link_rwlock;
-static struct mtx unp_list_lock;
static struct mtx unp_defers_lock;
#define UNP_LINK_LOCK_INIT() rw_init(&unp_link_rwlock, \
@@ -263,11 +261,7 @@ static struct mtx unp_defers_lock;
#define UNP_LINK_WUNLOCK() rw_wunlock(&unp_link_rwlock)
#define UNP_LINK_WLOCK_ASSERT() rw_assert(&unp_link_rwlock, \
RA_WLOCKED)
-
-#define UNP_LIST_LOCK_INIT() mtx_init(&unp_list_lock, \
- "unp_list_lock", NULL, MTX_DEF)
-#define UNP_LIST_LOCK() mtx_lock(&unp_list_lock)
-#define UNP_LIST_UNLOCK() mtx_unlock(&unp_list_lock)
+#define UNP_LINK_WOWNED() rw_wowned(&unp_link_rwlock)
#define UNP_DEFERRED_LOCK_INIT() mtx_init(&unp_defers_lock, \
"unp_defer", NULL, MTX_DEF)
@@ -417,6 +411,7 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
u_long sendspace, recvspace;
struct unpcb *unp;
int error;
+ bool locked;
KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
@@ -451,10 +446,12 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
unp->unp_socket = so;
so->so_pcb = unp;
unp->unp_refcount = 1;
- if (so->so_head != NULL)
+ if (so->so_listen != NULL)
unp->unp_flags |= UNP_NASCENT;
- UNP_LIST_LOCK();
+ if ((locked = UNP_LINK_WOWNED()) == false)
+ UNP_LINK_WLOCK();
+
unp->unp_gencnt = ++unp_gencnt;
unp_count++;
switch (so->so_type) {
@@ -473,7 +470,9 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
default:
panic("uipc_attach");
}
- UNP_LIST_UNLOCK();
+
+ if (locked == false)
+ UNP_LINK_WUNLOCK();
return (0);
}
@@ -516,6 +515,14 @@ static const IMFS_node_control rtems_uipc_imfs_control =
static const IMFS_node_control rtems_uipc_imfs_zombi_control =
IMFS_GENERIC_INITIALIZER(&rtems_filesystem_handlers_default, NULL,
IMFS_node_destroy_default);
+
+static void
+VOP_UNP_DETACH(IMFS_generic_t *vp)
+{
+
+ vp->Node.control = &rtems_uipc_imfs_zombi_control;
+ vp->context = NULL;
+}
#endif /* __rtems__ */
static int
uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
@@ -630,7 +637,7 @@ restart:
UNP_LINK_WLOCK();
UNP_PCB_LOCK(unp);
#ifndef __rtems__
- VOP_UNP_BIND(vp, unp->unp_socket);
+ VOP_UNP_BIND(vp, unp);
unp->unp_vnode = vp;
#endif /* __rtems__ */
unp->unp_addr = soun;
@@ -690,6 +697,11 @@ static void
uipc_close(struct socket *so)
{
struct unpcb *unp, *unp2;
+#ifndef __rtems__
+	struct vnode *vp = NULL;
+#else /* __rtems__ */
+	IMFS_generic_t *vp = NULL;
+#endif /* __rtems__ */
unp = sotounpcb(so);
KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
@@ -702,8 +714,16 @@ uipc_close(struct socket *so)
unp_disconnect(unp, unp2);
UNP_PCB_UNLOCK(unp2);
}
+ if (SOLISTENING(so) && ((vp = unp->unp_vnode) != NULL)) {
+ VOP_UNP_DETACH(vp);
+ unp->unp_vnode = NULL;
+ }
UNP_PCB_UNLOCK(unp);
UNP_LINK_WUNLOCK();
+#ifndef __rtems__
+ if (vp)
+ vrele(vp);
+#endif /* __rtems__ */
}
static int
@@ -747,29 +767,16 @@ uipc_detach(struct socket *so)
local_unp_rights = 0;
#endif /* __rtems__ */
- UNP_LIST_LOCK();
+ UNP_LINK_WLOCK();
LIST_REMOVE(unp, unp_link);
unp->unp_gencnt = ++unp_gencnt;
--unp_count;
- UNP_LIST_UNLOCK();
-
- if ((unp->unp_flags & UNP_NASCENT) != 0) {
- UNP_PCB_LOCK(unp);
- goto teardown;
- }
- UNP_LINK_WLOCK();
UNP_PCB_LOCK(unp);
+ if ((unp->unp_flags & UNP_NASCENT) != 0)
+ goto teardown;
- /*
- * XXXRW: Should assert vp->v_socket == so.
- */
if ((vp = unp->unp_vnode) != NULL) {
-#ifndef __rtems__
VOP_UNP_DETACH(vp);
-#else /* __rtems__ */
- vp->Node.control = &rtems_uipc_imfs_zombi_control;
- vp->context = NULL;
-#endif /* __rtems__ */
unp->unp_vnode = NULL;
}
unp2 = unp->unp_conn;
@@ -793,8 +800,8 @@ uipc_detach(struct socket *so)
#ifndef __rtems__
local_unp_rights = unp_rights;
#endif /* __rtems__ */
- UNP_LINK_WUNLOCK();
teardown:
+ UNP_LINK_WUNLOCK();
unp->unp_socket->so_pcb = NULL;
saved_unp_addr = unp->unp_addr;
unp->unp_addr = NULL;
@@ -860,7 +867,6 @@ uipc_listen(struct socket *so, int backlog, struct thread *td)
error = solisten_proto_check(so);
if (error == 0) {
cru2x(td->td_ucred, &unp->unp_peercred);
- unp->unp_flags |= UNP_HAVEPCCACHED;
solisten_proto(so, backlog);
}
SOCK_UNLOCK(so);
@@ -1439,7 +1445,7 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
#else /* __rtems__ */
struct IMFS_jnode_tt *vp;
#endif /* __rtems__ */
- struct socket *so2, *so3;
+ struct socket *so2;
struct unpcb *unp, *unp2, *unp3;
#ifndef __rtems__
struct nameidata nd;
@@ -1450,7 +1456,9 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
const rtems_filesystem_location_info_t *currentloc;
#endif /* __rtems__ */
struct sockaddr *sa;
+#ifndef __rtems__
cap_rights_t rights;
+#endif /* __rtems__ */
int error, len;
if (nam->sa_family != AF_UNIX)
@@ -1535,34 +1543,38 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
*/
UNP_LINK_WLOCK();
#ifndef __rtems__
- VOP_UNP_CONNECT(vp, &so2);
+ VOP_UNP_CONNECT(vp, &unp2);
+ if (unp2 == NULL) {
+ error = ECONNREFUSED;
+ goto bad2;
+ }
+ so2 = unp2->unp_socket;
#else /* __rtems__ */
so2 = IMFS_generic_get_context_by_node(vp);
-#endif /* __rtems__ */
if (so2 == NULL) {
error = ECONNREFUSED;
goto bad2;
}
+ unp2 = sotounpcb(so2);
+#endif /* __rtems__ */
if (so->so_type != so2->so_type) {
error = EPROTOTYPE;
goto bad2;
}
+ UNP_PCB_LOCK(unp);
+ UNP_PCB_LOCK(unp2);
if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
if (so2->so_options & SO_ACCEPTCONN) {
CURVNET_SET(so2->so_vnet);
- so3 = sonewconn(so2, 0);
+ so2 = sonewconn(so2, 0);
CURVNET_RESTORE();
} else
- so3 = NULL;
- if (so3 == NULL) {
+ so2 = NULL;
+ if (so2 == NULL) {
error = ECONNREFUSED;
- goto bad2;
+ goto bad3;
}
- unp = sotounpcb(so);
- unp2 = sotounpcb(so2);
- unp3 = sotounpcb(so3);
- UNP_PCB_LOCK(unp);
- UNP_PCB_LOCK(unp2);
+ unp3 = sotounpcb(so2);
UNP_PCB_LOCK(unp3);
if (unp2->unp_addr != NULL) {
bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
@@ -1583,30 +1595,24 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
* listen(); uipc_listen() cached that process's credentials
* at that time so we can use them now.
*/
- KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
- ("unp_connect: listener without cached peercred"));
memcpy(&unp->unp_peercred, &unp2->unp_peercred,
sizeof(unp->unp_peercred));
unp->unp_flags |= UNP_HAVEPC;
if (unp2->unp_flags & UNP_WANTCRED)
unp3->unp_flags |= UNP_WANTCRED;
- UNP_PCB_UNLOCK(unp3);
UNP_PCB_UNLOCK(unp2);
- UNP_PCB_UNLOCK(unp);
+ unp2 = unp3;
#ifdef MAC
- mac_socketpeer_set_from_socket(so, so3);
- mac_socketpeer_set_from_socket(so3, so);
+ mac_socketpeer_set_from_socket(so, so2);
+ mac_socketpeer_set_from_socket(so2, so);
#endif
-
- so2 = so3;
}
- unp = sotounpcb(so);
- KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
- unp2 = sotounpcb(so2);
- KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL"));
- UNP_PCB_LOCK(unp);
- UNP_PCB_LOCK(unp2);
+
+ KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 &&
+ sotounpcb(so2) == unp2,
+ ("%s: unp2 %p so2 %p", __func__, unp2, so2));
error = unp_connect2(so, so2, PRU_CONNECT);
+bad3:
UNP_PCB_UNLOCK(unp2);
UNP_PCB_UNLOCK(unp);
bad2:
@@ -1750,10 +1756,10 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
* OK, now we're committed to doing something.
*/
xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
- UNP_LIST_LOCK();
+ UNP_LINK_RLOCK();
gencnt = unp_gencnt;
n = unp_count;
- UNP_LIST_UNLOCK();
+ UNP_LINK_RUNLOCK();
xug->xug_len = sizeof *xug;
xug->xug_count = n;
@@ -1767,7 +1773,7 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
- UNP_LIST_LOCK();
+ UNP_LINK_RLOCK();
for (unp = LIST_FIRST(head), i = 0; unp && i < n;
unp = LIST_NEXT(unp, unp_link)) {
UNP_PCB_LOCK(unp);
@@ -1782,7 +1788,7 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
}
UNP_PCB_UNLOCK(unp);
}
- UNP_LIST_UNLOCK();
+ UNP_LINK_RUNLOCK();
n = i; /* In case we lost some during malloc. */
error = 0;
@@ -2044,7 +2050,6 @@ unp_init(void)
TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL);
#endif /* __rtems__ */
UNP_LINK_LOCK_INIT();
- UNP_LIST_LOCK_INIT();
UNP_DEFERRED_LOCK_INIT();
}
@@ -2396,8 +2401,7 @@ unp_accessable(struct filedescent **fdep, int fdcount)
static void
unp_gc_process(struct unpcb *unp)
{
- struct socket *soa;
- struct socket *so;
+ struct socket *so, *soa;
struct file *fp;
/* Already processed. */
@@ -2417,28 +2421,30 @@ unp_gc_process(struct unpcb *unp)
return;
}
- /*
- * Mark all sockets we reference with RIGHTS.
- */
so = unp->unp_socket;
- if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) {
- SOCKBUF_LOCK(&so->so_rcv);
- unp_scan(so->so_rcv.sb_mb, unp_accessable);
- SOCKBUF_UNLOCK(&so->so_rcv);
- }
-
- /*
- * Mark all sockets in our accept queue.
- */
- ACCEPT_LOCK();
- TAILQ_FOREACH(soa, &so->so_comp, so_list) {
- if ((sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) != 0)
- continue;
- SOCKBUF_LOCK(&soa->so_rcv);
- unp_scan(soa->so_rcv.sb_mb, unp_accessable);
- SOCKBUF_UNLOCK(&soa->so_rcv);
+ SOCK_LOCK(so);
+ if (SOLISTENING(so)) {
+ /*
+ * Mark all sockets in our accept queue.
+ */
+ TAILQ_FOREACH(soa, &so->sol_comp, so_list) {
+ if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS)
+ continue;
+ SOCKBUF_LOCK(&soa->so_rcv);
+ unp_scan(soa->so_rcv.sb_mb, unp_accessable);
+ SOCKBUF_UNLOCK(&soa->so_rcv);
+ }
+ } else {
+ /*
+ * Mark all sockets we reference with RIGHTS.
+ */
+ if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) {
+ SOCKBUF_LOCK(&so->so_rcv);
+ unp_scan(so->so_rcv.sb_mb, unp_accessable);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ }
}
- ACCEPT_UNLOCK();
+ SOCK_UNLOCK(so);
unp->unp_gcflag |= UNPGC_SCANNED;
}
@@ -2461,7 +2467,7 @@ unp_gc(__unused void *arg, int pending)
int i, total;
unp_taskcount++;
- UNP_LIST_LOCK();
+ UNP_LINK_RLOCK();
/*
* First clear all gc flags from previous runs, apart from
* UNPGC_IGNORE_RIGHTS.
@@ -2484,7 +2490,7 @@ unp_gc(__unused void *arg, int pending)
LIST_FOREACH(unp, *head, unp_link)
unp_gc_process(unp);
} while (unp_marked);
- UNP_LIST_UNLOCK();
+ UNP_LINK_RUNLOCK();
if (unp_unreachable == 0)
return;
@@ -2499,7 +2505,6 @@ unp_gc(__unused void *arg, int pending)
	 * as unreachable and store them locally.
*/
UNP_LINK_RLOCK();
- UNP_LIST_LOCK();
for (total = 0, head = heads; *head != NULL; head++)
LIST_FOREACH(unp, *head, unp_link)
if ((unp->unp_gcflag & UNPGC_DEAD) != 0) {
@@ -2512,7 +2517,6 @@ unp_gc(__unused void *arg, int pending)
KASSERT(total <= unp_unreachable,
("unp_gc: incorrect unreachable count."));
}
- UNP_LIST_UNLOCK();
UNP_LINK_RUNLOCK();
/*
@@ -2555,10 +2559,11 @@ unp_dispose(struct socket *so)
struct unpcb *unp;
unp = sotounpcb(so);
- UNP_LIST_LOCK();
+ UNP_LINK_WLOCK();
unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS;
- UNP_LIST_UNLOCK();
- unp_dispose_mbuf(so->so_rcv.sb_mb);
+ UNP_LINK_WUNLOCK();
+ if (!SOLISTENING(so))
+ unp_dispose_mbuf(so->so_rcv.sb_mb);
}
static void
@@ -2613,7 +2618,6 @@ unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
void
vfs_unp_reclaim(struct vnode *vp)
{
- struct socket *so;
struct unpcb *unp;
int active;
@@ -2623,10 +2627,7 @@ vfs_unp_reclaim(struct vnode *vp)
active = 0;
UNP_LINK_WLOCK();
- VOP_UNP_CONNECT(vp, &so);
- if (so == NULL)
- goto done;
- unp = sotounpcb(so);
+ VOP_UNP_CONNECT(vp, &unp);
if (unp == NULL)
goto done;
UNP_PCB_LOCK(unp);
@@ -2663,10 +2664,6 @@ db_print_unpflags(int unp_flags)
db_printf("%sUNP_HAVEPC", comma ? ", " : "");
comma = 1;
}
- if (unp_flags & UNP_HAVEPCCACHED) {
- db_printf("%sUNP_HAVEPCCACHED", comma ? ", " : "");
- comma = 1;
- }
if (unp_flags & UNP_WANTCRED) {
db_printf("%sUNP_WANTCRED", comma ? ", " : "");
comma = 1;