diff options
author | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2018-08-07 14:56:50 +0200 |
---|---|---|
committer | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2018-09-21 10:29:37 +0200 |
commit | c37f9fba70085fedc8eede7559489d2321393005 (patch) | |
tree | 042455ebf1fa89a277a825f72e1ed805d0b4d296 /freebsd/sys/kern | |
parent | Update to FreeBSD head 2017-06-01 (diff) | |
download | rtems-libbsd-c37f9fba70085fedc8eede7559489d2321393005.tar.bz2 |
Update to FreeBSD head 2017-08-01
Git mirror commit f5002f5e5f78cae9f0269d812dc0aedb0339312c.
Update #3472.
Diffstat (limited to 'freebsd/sys/kern')
-rw-r--r-- | freebsd/sys/kern/init_main.c | 3 | ||||
-rw-r--r-- | freebsd/sys/kern/kern_event.c | 185 | ||||
-rw-r--r-- | freebsd/sys/kern/kern_linker.c | 8 | ||||
-rw-r--r-- | freebsd/sys/kern/kern_uuid.c | 9 | ||||
-rw-r--r-- | freebsd/sys/kern/subr_blist.c | 658 | ||||
-rw-r--r-- | freebsd/sys/kern/subr_prf.c | 54 | ||||
-rw-r--r-- | freebsd/sys/kern/subr_sbuf.c | 2 | ||||
-rw-r--r-- | freebsd/sys/kern/subr_taskqueue.c | 25 | ||||
-rw-r--r-- | freebsd/sys/kern/subr_uio.c | 47 | ||||
-rw-r--r-- | freebsd/sys/kern/sys_socket.c | 78 | ||||
-rw-r--r-- | freebsd/sys/kern/uipc_accf.c | 111 | ||||
-rw-r--r-- | freebsd/sys/kern/uipc_mbuf.c | 2 | ||||
-rw-r--r-- | freebsd/sys/kern/uipc_sockbuf.c | 82 | ||||
-rw-r--r-- | freebsd/sys/kern/uipc_socket.c | 939 | ||||
-rw-r--r-- | freebsd/sys/kern/uipc_syscalls.c | 65 | ||||
-rw-r--r-- | freebsd/sys/kern/uipc_usrreq.c | 199 |
16 files changed, 1460 insertions, 1007 deletions
diff --git a/freebsd/sys/kern/init_main.c b/freebsd/sys/kern/init_main.c index 467888b2..f211b363 100644 --- a/freebsd/sys/kern/init_main.c +++ b/freebsd/sys/kern/init_main.c @@ -384,8 +384,7 @@ SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2, #endif static int -null_fetch_syscall_args(struct thread *td __unused, - struct syscall_args *sa __unused) +null_fetch_syscall_args(struct thread *td __unused) { panic("null_fetch_syscall_args"); diff --git a/freebsd/sys/kern/kern_event.c b/freebsd/sys/kern/kern_event.c index 0a64adbe..2428182c 100644 --- a/freebsd/sys/kern/kern_event.c +++ b/freebsd/sys/kern/kern_event.c @@ -31,6 +31,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <rtems/bsd/local/opt_compat.h> #include <rtems/bsd/local/opt_ktrace.h> #include <rtems/bsd/local/opt_kqueue.h> @@ -119,6 +120,10 @@ static int kqueue_scan(struct kqueue *kq, int maxevents, static void kqueue_wakeup(struct kqueue *kq); static struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); +struct g_kevent_args; +static int kern_kevent_generic(struct thread *td, + struct g_kevent_args *uap, + struct kevent_copyops *k_ops); #ifndef __rtems__ static fo_rdwr_t kqueue_read; @@ -640,12 +645,13 @@ knote_fork(struct knlist *list, int pid) * interval timer support code. */ -#define NOTE_TIMER_PRECMASK (NOTE_SECONDS|NOTE_MSECONDS|NOTE_USECONDS| \ - NOTE_NSECONDS) +#define NOTE_TIMER_PRECMASK \ + (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) static sbintime_t timer2sbintime(intptr_t data, int flags) { + int64_t secs; /* * Macros for converting to the fractional second portion of an @@ -664,27 +670,27 @@ timer2sbintime(intptr_t data, int flags) case NOTE_MSECONDS: /* FALLTHROUGH */ case 0: if (data >= 1000) { - int64_t secs = data / 1000; + secs = data / 1000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | MS_TO_SBT(data % 1000)); } - return MS_TO_SBT(data); + return (MS_TO_SBT(data)); case NOTE_USECONDS: if (data >= 1000000) { - int64_t secs = data / 1000000; + secs = data / 1000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000)); } - return US_TO_SBT(data); + return (US_TO_SBT(data)); case NOTE_NSECONDS: if (data >= 1000000000) { - int64_t secs = data / 1000000000; + secs = data / 1000000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); @@ -701,7 +707,7 @@ timer2sbintime(intptr_t data, int flags) struct kq_timer_cb_data { struct callout c; sbintime_t next; /* next timer event fires at */ - sbintime_t to; /* precalculated timer period */ + sbintime_t to; /* precalculated timer period, 0 for abs */ }; static void @@ -716,8 +722,9 @@ filt_timerexpire(void *knx) if ((kn->kn_flags & EV_ONESHOT) != 0) return; - kc = kn->kn_ptr.p_v; + if (kc->to == 0) + return; kc->next += kc->to; callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); @@ -730,7 +737,8 @@ static int filt_timerattach(struct knote *kn) { struct kq_timer_cb_data *kc; - sbintime_t to; + struct bintime bt; + sbintime_t to, sbt; unsigned int ncallouts; if (kn->kn_sdata < 0) @@ -738,10 +746,15 @@ filt_timerattach(struct knote *kn) if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) kn->kn_sdata = 1; /* Only precision unit are supported in flags so far */ - if ((kn->kn_sfflags & ~NOTE_TIMER_PRECMASK) != 0) + if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) return (EINVAL); to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); + if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { + getboottimebin(&bt); + sbt = bttosbt(bt); + to -= sbt; + } if (to < 0) return (EINVAL); @@ -751,12 +764,18 @@ filt_timerattach(struct knote *kn) return (ENOMEM); } while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1)); - kn->kn_flags |= EV_CLEAR; /* automatically set */ + if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) + kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); callout_init(&kc->c, 1); - kc->next = to + sbinuptime(); - kc->to = to; + if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { + kc->next = to; + kc->to = 0; + } else { + kc->next = to + sbinuptime(); + kc->to = to; + } callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); @@ -970,25 +989,24 @@ kqueue(void) #ifdef KTRACE static size_t -kev_iovlen(int n, u_int kgio) +kev_iovlen(int n, u_int kgio, size_t kevent_size) { - if (n < 0 || n >= kgio / sizeof(struct kevent)) + if (n < 0 || n >= kgio / kevent_size) return (kgio); - return (n * sizeof(struct kevent)); + return (n * kevent_size); } #endif -#ifndef _SYS_SYSPROTO_H_ -struct kevent_args { +struct g_kevent_args { int fd; - const struct kevent *changelist; + void *changelist; int nchanges; - struct kevent *eventlist; + void *eventlist; int nevents; const struct timespec *timeout; }; -#endif + #ifdef __rtems__ static int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout); @@ -1001,12 +1019,29 @@ static int sys_kevent(struct thread *td, struct kevent_args *uap) { - struct timespec ts, *tsp; struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent_copyout, .k_copyin = kevent_copyin, + .kevent_size = sizeof(struct kevent), }; + struct g_kevent_args gk_args = { + .fd = uap->fd, + .changelist = uap->changelist, + .nchanges = uap->nchanges, + .eventlist = uap->eventlist, + .nevents = uap->nevents, + .timeout = uap->timeout, + }; + + return (kern_kevent_generic(td, &gk_args, &k_ops)); +} + +static int +kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, + struct kevent_copyops *k_ops) +{ + struct timespec ts, *tsp; int error; #ifdef KTRACE struct uio ktruio; @@ -1028,26 +1063,30 @@ sys_kevent(struct thread *td, struct kevent_args *uap) if (KTRPOINT(td, KTR_GENIO)) { kgio = ktr_geniosize; ktriov.iov_base = uap->changelist; - ktriov.iov_len = kev_iovlen(uap->nchanges, kgio); + ktriov.iov_len = kev_iovlen(uap->nchanges, kgio, + k_ops->kevent_size); ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1, .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ, .uio_td = td }; ktruioin = cloneuio(&ktruio); ktriov.iov_base = uap->eventlist; - ktriov.iov_len = kev_iovlen(uap->nevents, kgio); - ktriov.iov_len = uap->nevents * sizeof(struct kevent); + ktriov.iov_len = kev_iovlen(uap->nevents, kgio, + k_ops->kevent_size); + ktriov.iov_len = uap->nevents * k_ops->kevent_size; ktruioout = cloneuio(&ktruio); } #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, - &k_ops, tsp); + k_ops, tsp); #ifdef KTRACE if (ktruioin != NULL) { - ktruioin->uio_resid = kev_iovlen(uap->nchanges, kgio); + ktruioin->uio_resid = kev_iovlen(uap->nchanges, kgio, + k_ops->kevent_size); ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0); - ktruioout->uio_resid = kev_iovlen(td->td_retval[0], kgio); + ktruioout->uio_resid = kev_iovlen(td->td_retval[0], kgio, + k_ops->kevent_size); ktrgenio(uap->fd, UIO_READ, ktruioout, error); } #endif @@ -1123,6 +1162,94 @@ kevent_copyin(void *arg, struct kevent *kevp, int count) return (error); } +#ifdef COMPAT_FREEBSD11 +struct kevent_freebsd11 { + __uintptr_t ident; /* identifier for this event */ + short filter; /* filter for event */ + unsigned short flags; + unsigned int fflags; + __intptr_t data; + void *udata; /* opaque user data identifier */ +}; + +static int +kevent11_copyout(void *arg, struct kevent *kevp, int count) +{ + struct freebsd11_kevent_args *uap; + struct kevent_freebsd11 kev11; + int error, i; + + KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); + uap = (struct freebsd11_kevent_args *)arg; + + for (i = 0; i < count; i++) { + kev11.ident = kevp->ident; + kev11.filter = kevp->filter; + kev11.flags = kevp->flags; + kev11.fflags = kevp->fflags; + kev11.data = kevp->data; + kev11.udata = kevp->udata; + error = copyout(&kev11, uap->eventlist, sizeof(kev11)); + if (error != 0) + break; + uap->eventlist++; + kevp++; + } + return (error); +} + +/* + * Copy 'count' items from the list pointed to by uap->changelist. + */ +static int +kevent11_copyin(void *arg, struct kevent *kevp, int count) +{ + struct freebsd11_kevent_args *uap; + struct kevent_freebsd11 kev11; + int error, i; + + KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); + uap = (struct freebsd11_kevent_args *)arg; + + for (i = 0; i < count; i++) { + error = copyin(uap->changelist, &kev11, sizeof(kev11)); + if (error != 0) + break; + kevp->ident = kev11.ident; + kevp->filter = kev11.filter; + kevp->flags = kev11.flags; + kevp->fflags = kev11.fflags; + kevp->data = (uintptr_t)kev11.data; + kevp->udata = kev11.udata; + bzero(&kevp->ext, sizeof(kevp->ext)); + uap->changelist++; + kevp++; + } + return (error); +} + +int +freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) +{ + struct kevent_copyops k_ops = { + .arg = uap, + .k_copyout = kevent11_copyout, + .k_copyin = kevent11_copyin, + .kevent_size = sizeof(struct kevent_freebsd11), + }; + struct g_kevent_args gk_args = { + .fd = uap->fd, + .changelist = uap->changelist, + .nchanges = uap->nchanges, + .eventlist = uap->eventlist, + .nevents = uap->nevents, + .timeout = uap->timeout, + }; + + return (kern_kevent_generic(td, &gk_args, &k_ops)); +} +#endif + int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) diff --git a/freebsd/sys/kern/kern_linker.c b/freebsd/sys/kern/kern_linker.c index 214554d3..1c81a61c 100644 --- a/freebsd/sys/kern/kern_linker.c +++ b/freebsd/sys/kern/kern_linker.c @@ -1259,8 +1259,8 @@ kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat) /* Version 1 fields: */ namelen = strlen(lf->filename) + 1; - if (namelen > MAXPATHLEN) - namelen = MAXPATHLEN; + if (namelen > sizeof(stat->name)) + namelen = sizeof(stat->name); bcopy(lf->filename, &stat->name[0], namelen); stat->refs = lf->refs; stat->id = lf->id; @@ -1268,8 +1268,8 @@ kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat) stat->size = lf->size; /* Version 2 fields: */ namelen = strlen(lf->pathname) + 1; - if (namelen > MAXPATHLEN) - namelen = MAXPATHLEN; + if (namelen > sizeof(stat->pathname)) + namelen = sizeof(stat->pathname); bcopy(lf->pathname, &stat->pathname[0], namelen); sx_xunlock(&kld_sx); diff --git a/freebsd/sys/kern/kern_uuid.c b/freebsd/sys/kern/kern_uuid.c index b6a8915f..1ac19685 100644 --- a/freebsd/sys/kern/kern_uuid.c +++ b/freebsd/sys/kern/kern_uuid.c @@ -60,7 +60,7 @@ CTASSERT(sizeof(struct uuid) == 16); /* We use an alternative, more convenient representation in the generator. */ struct uuid_private { union { - uint64_t ll; /* internal. */ + uint64_t ll; /* internal, for uuid_last only */ struct { uint32_t low; uint16_t mid; @@ -428,3 +428,10 @@ parse_uuid(const char *str, struct uuid *uuid) (c[3] & 0xc0) != 0x80 && /* variant 1? */ (c[3] & 0xe0) != 0xc0) ? EINVAL : 0); /* variant 2? */ } + +int +uuidcmp(const struct uuid *uuid1, const struct uuid *uuid2) +{ + + return (memcmp(uuid1, uuid2, sizeof(struct uuid))); +} diff --git a/freebsd/sys/kern/subr_blist.c b/freebsd/sys/kern/subr_blist.c index 5af51dd4..c8e32c5b 100644 --- a/freebsd/sys/kern/subr_blist.c +++ b/freebsd/sys/kern/subr_blist.c @@ -30,18 +30,18 @@ * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting * * This module implements a general bitmap allocator/deallocator. The - * allocator eats around 2 bits per 'block'. The module does not - * try to interpret the meaning of a 'block' other than to return + * allocator eats around 2 bits per 'block'. The module does not + * try to interpret the meaning of a 'block' other than to return * SWAPBLK_NONE on an allocation failure. * * A radix tree is used to maintain the bitmap. Two radix constants are * involved: One for the bitmaps contained in the leaf nodes (typically - * 32), and one for the meta nodes (typically 16). Both meta and leaf + * 64), and one for the meta nodes (typically 16). Both meta and leaf * nodes have a hint field. This field gives us a hint as to the largest * free contiguous range of blocks under the node. It may contain a - * value that is too high, but will never contain a value that is too + * value that is too high, but will never contain a value that is too * low. When the radix tree is searched, allocation failures in subtrees - * update the hint. + * update the hint. * * The radix tree also implements two collapsed states for meta nodes: * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is @@ -51,7 +51,7 @@ * * The hinting greatly increases code efficiency for allocations while * the general radix structure optimizes both allocations and frees. The - * radix tree should be able to operate well no matter how much + * radix tree should be able to operate well no matter how much * fragmentation there is and no matter how large a bitmap is used. * * The blist code wires all necessary memory at creation time. Neither @@ -63,18 +63,18 @@ * linear array. Each meta node is immediately followed (laid out * sequentially in memory) by BLIST_META_RADIX lower level nodes. This * is a recursive structure but one that can be easily scanned through - * a very simple 'skip' calculation. In order to support large radixes, - * portions of the tree may reside outside our memory allocation. We - * handle this with an early-termination optimization (when bighint is - * set to -1) on the scan. The memory allocation is only large enough + * a very simple 'skip' calculation. In order to support large radixes, + * portions of the tree may reside outside our memory allocation. We + * handle this with an early-termination optimization (when bighint is + * set to -1) on the scan. The memory allocation is only large enough * to cover the number of blocks requested at creation time even if it * must be encompassed in larger root-node radix. * - * NOTE: the allocator cannot currently allocate more than - * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too - * large' if you try. This is an area that could use improvement. The - * radix is large enough that this restriction does not effect the swap - * system, though. Currently only the allocation code is effected by + * NOTE: the allocator cannot currently allocate more than + * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too + * large' if you try. This is an area that could use improvement. The + * radix is large enough that this restriction does not effect the swap + * system, though. Currently only the allocation code is affected by * this algorithmic unfeature. The freeing code can handle arbitrary * ranges. * @@ -93,7 +93,7 @@ __FBSDID("$FreeBSD$"); #include <sys/blist.h> #include <sys/malloc.h> #include <sys/proc.h> -#include <sys/mutex.h> +#include <sys/mutex.h> #else @@ -101,19 +101,18 @@ __FBSDID("$FreeBSD$"); #define BLIST_DEBUG #endif -#define SWAPBLK_NONE ((daddr_t)-1) - #include <sys/types.h> +#include <sys/malloc.h> #include <stdio.h> #include <string.h> #include <stdlib.h> #include <stdarg.h> +#include <stdbool.h> +#define bitcount64(x) __bitcount64((uint64_t)(x)) #define malloc(a,b,c) calloc(a, 1) #define free(a,b) free(a) -typedef unsigned int u_daddr_t; - #include <sys/blist.h> void panic(const char *ctl, ...); @@ -123,23 +122,23 @@ void panic(const char *ctl, ...); /* * static support functions */ - -static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count); -static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk, - daddr_t count, daddr_t radix, int skip); +static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, + daddr_t cursor); +static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, + daddr_t radix, daddr_t skip, daddr_t cursor); static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); -static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, - daddr_t radix, int skip, daddr_t blk); -static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, - daddr_t skip, blist_t dest, daddr_t count); -static int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count); -static int blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, - daddr_t radix, int skip, daddr_t blk); -static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, - int skip, daddr_t count); +static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, + daddr_t radix, daddr_t skip, daddr_t blk); +static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, + daddr_t skip, blist_t dest, daddr_t count); +static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count); +static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, + daddr_t radix, daddr_t skip, daddr_t blk); +static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip, + daddr_t count); #ifndef _KERNEL -static void blst_radix_print(blmeta_t *scan, daddr_t blk, - daddr_t radix, int skip, int tab); +static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, + daddr_t skip, int tab); #endif #ifdef _KERNEL @@ -153,35 +152,40 @@ static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); * blocks - must be greater than 0 * flags - malloc flags * - * The smallest blist consists of a single leaf node capable of + * The smallest blist consists of a single leaf node capable of * managing BLIST_BMAP_RADIX blocks. */ - -blist_t +blist_t blist_create(daddr_t blocks, int flags) { blist_t bl; - int radix; - int skip = 0; + daddr_t nodes, radix, skip; /* * Calculate radix and skip field used for scanning. */ radix = BLIST_BMAP_RADIX; - + skip = 0; while (radix < blocks) { radix *= BLIST_META_RADIX; skip = (skip + 1) * BLIST_META_RADIX; } + nodes = 1 + blst_radix_init(NULL, radix, skip, blocks); - bl = malloc(sizeof(struct blist), M_SWAP, flags | M_ZERO); + bl = malloc(sizeof(struct blist), M_SWAP, flags); + if (bl == NULL) + return (NULL); bl->bl_blocks = blocks; bl->bl_radix = radix; bl->bl_skip = skip; - bl->bl_rootblks = 1 + - blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks); - bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, flags); + bl->bl_cursor = 0; + bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags); + if (bl->bl_root == NULL) { + free(bl, M_SWAP); + return (NULL); + } + blst_radix_init(bl->bl_root, radix, skip, blocks); #if defined(BLIST_DEBUG) printf( @@ -189,17 +193,16 @@ blist_create(daddr_t blocks, int flags) ", requiring %lldK of ram\n", (long long)bl->bl_blocks, (long long)bl->bl_blocks * 4 / 1024, - (long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024 + (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024 ); printf("BLIST raw radix tree contains %lld records\n", - (long long)bl->bl_rootblks); + (long long)nodes); #endif - blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks); - return(bl); + return (bl); } -void +void blist_destroy(blist_t bl) { free(bl->bl_root, M_SWAP); @@ -207,25 +210,44 @@ blist_destroy(blist_t bl) } /* - * blist_alloc() - reserve space in the block bitmap. Return the base + * blist_alloc() - reserve space in the block bitmap. Return the base * of a contiguous region or SWAPBLK_NONE if space could * not be allocated. */ - -daddr_t +daddr_t blist_alloc(blist_t bl, daddr_t count) { - daddr_t blk = SWAPBLK_NONE; + daddr_t blk; - if (bl) { - if (bl->bl_radix == BLIST_BMAP_RADIX) - blk = blst_leaf_alloc(bl->bl_root, 0, count); - else - blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip); - if (blk != SWAPBLK_NONE) - bl->bl_free -= count; + /* + * This loop iterates at most twice. An allocation failure in the + * first iteration leads to a second iteration only if the cursor was + * non-zero. When the cursor is zero, an allocation failure will + * reduce the hint, stopping further iterations. + */ + while (count <= bl->bl_root->bm_bighint) { + blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, + bl->bl_skip, bl->bl_cursor); + if (blk != SWAPBLK_NONE) { + bl->bl_cursor = blk + count; + return (blk); + } else if (bl->bl_cursor != 0) + bl->bl_cursor = 0; } - return(blk); + return (SWAPBLK_NONE); +} + +/* + * blist_avail() - return the number of free blocks. + */ +daddr_t +blist_avail(blist_t bl) +{ + + if (bl->bl_radix == BLIST_BMAP_RADIX) + return (bitcount64(bl->bl_root->u.bmu_bitmap)); + else + return (bl->bl_root->u.bmu_avail); } /* @@ -233,17 +255,11 @@ blist_alloc(blist_t bl, daddr_t count) * of a contiguous region. Panic if an inconsistancy is * found. */ - -void +void blist_free(blist_t bl, daddr_t blkno, daddr_t count) { - if (bl) { - if (bl->bl_radix == BLIST_BMAP_RADIX) - blst_leaf_free(bl->bl_root, blkno, count); - else - blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0); - bl->bl_free += count; - } + + blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0); } /* @@ -252,22 +268,12 @@ blist_free(blist_t bl, daddr_t blkno, daddr_t count) * existing allocations. Return the number of blocks * actually filled that were free before the call. */ - -int +daddr_t blist_fill(blist_t bl, daddr_t blkno, daddr_t count) { - int filled; - if (bl) { - if (bl->bl_radix == BLIST_BMAP_RADIX) - filled = blst_leaf_fill(bl->bl_root, blkno, count); - else - filled = blst_meta_fill(bl->bl_root, blkno, count, - bl->bl_radix, bl->bl_skip, 0); - bl->bl_free -= filled; - return filled; - } else - return 0; + return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix, + bl->bl_skip, 0)); } /* @@ -277,7 +283,6 @@ blist_fill(blist_t bl, daddr_t blkno, daddr_t count) * one. When extending the tree you can specify whether * the new blocks are to left allocated or freed. */ - void blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) { @@ -303,7 +308,6 @@ blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) /* * blist_print() - dump radix tree */ - void blist_print(blist_t bl) { @@ -318,7 +322,7 @@ blist_print(blist_t bl) * ALLOCATION SUPPORT FUNCTIONS * ************************************************************************ * - * These support functions do all the actual work. They may seem + * These support functions do all the actual work. They may seem * rather longish, but that's because I've commented them up. The * actual code is straight forward. * @@ -327,77 +331,91 @@ blist_print(blist_t bl) /* * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap). * - * This is the core of the allocator and is optimized for the 1 block - * and the BLIST_BMAP_RADIX block allocation cases. Other cases are - * somewhat slower. The 1 block allocation case is log2 and extremely - * quick. + * This is the core of the allocator and is optimized for the + * BLIST_BMAP_RADIX block allocation case. Otherwise, execution + * time is proportional to log2(count) + log2(BLIST_BMAP_RADIX). */ - static daddr_t -blst_leaf_alloc( - blmeta_t *scan, - daddr_t blk, - int count -) { - u_daddr_t orig = scan->u.bmu_bitmap; - - if (orig == 0) { +blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, daddr_t cursor) +{ + u_daddr_t mask; + int count1, hi, lo, mid, num_shifts, range1, range_ext; + + if (count == BLIST_BMAP_RADIX) { /* - * Optimize bitmap all-allocated case. Also, count = 1 - * case assumes at least 1 bit is free in the bitmap, so - * we have to take care of this case here. + * Optimize allocation of BLIST_BMAP_RADIX bits. If this wasn't + * a special case, then forming the final value of 'mask' below + * would require special handling to avoid an invalid left shift + * when count equals the number of bits in mask. */ + if (~scan->u.bmu_bitmap != 0) { + scan->bm_bighint = BLIST_BMAP_RADIX - 1; + return (SWAPBLK_NONE); + } + if (cursor != blk) + return (SWAPBLK_NONE); + scan->u.bmu_bitmap = 0; scan->bm_bighint = 0; - return(SWAPBLK_NONE); + return (blk); } - if (count == 1) { + range1 = 0; + count1 = count - 1; + num_shifts = fls(count1); + mask = scan->u.bmu_bitmap; + while (mask != 0 && num_shifts > 0) { /* - * Optimized code to allocate one bit out of the bitmap + * If bit i is set in mask, then bits in [i, i+range1] are set + * in scan->u.bmu_bitmap. The value of range1 is equal to + * count1 >> num_shifts. Grow range and reduce num_shifts to 0, + * while preserving these invariants. The updates to mask leave + * fewer bits set, but each bit that remains set represents a + * longer string of consecutive bits set in scan->u.bmu_bitmap. */ - u_daddr_t mask; - int j = BLIST_BMAP_RADIX/2; - int r = 0; - - mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2); - - while (j) { - if ((orig & mask) == 0) { - r += j; - orig >>= j; - } - j >>= 1; - mask >>= j; - } - scan->u.bmu_bitmap &= ~(1 << r); - return(blk + r); + num_shifts--; + range_ext = range1 + ((count1 >> num_shifts) & 1); + mask &= mask >> range_ext; + range1 += range_ext; } - if (count <= BLIST_BMAP_RADIX) { + if (mask == 0) { /* - * non-optimized code to allocate N bits out of the bitmap. - * The more bits, the faster the code runs. It will run - * the slowest allocating 2 bits, but since there aren't any - * memory ops in the core loop (or shouldn't be, anyway), - * you probably won't notice the difference. + * Update bighint. There is no allocation bigger than range1 + * available in this leaf. */ - int j; - int n = BLIST_BMAP_RADIX - count; - u_daddr_t mask; + scan->bm_bighint = range1; + return (SWAPBLK_NONE); + } - mask = (u_daddr_t)-1 >> n; + /* + * Discard any candidates that appear before the cursor. + */ + lo = cursor - blk; + mask &= ~(u_daddr_t)0 << lo; - for (j = 0; j <= n; ++j) { - if ((orig & mask) == mask) { - scan->u.bmu_bitmap &= ~mask; - return(blk + j); - } - mask = (mask << 1); - } + if (mask == 0) + return (SWAPBLK_NONE); + + /* + * The least significant set bit in mask marks the start of the first + * available range of sufficient size. Clear all the bits but that one, + * and then perform a binary search to find its position. + */ + mask &= -mask; + hi = BLIST_BMAP_RADIX - count1; + while (lo + 1 < hi) { + mid = (lo + hi) >> 1; + if ((mask >> mid) != 0) + lo = mid; + else + hi = mid; } + /* - * We couldn't allocate count in this subtree, update bighint. + * Set in mask exactly the bits being allocated, and clear them from + * the set of available bits. */ - scan->bm_bighint = count - 1; - return(SWAPBLK_NONE); + mask = (mask << count) - mask; + scan->u.bmu_bitmap &= ~mask; + return (blk + lo); } /* @@ -408,76 +426,75 @@ blst_leaf_alloc( * calls that hit this node. We have to check for our collapse cases * and we have a few optimizations strewn in as well. */ - static daddr_t -blst_meta_alloc( - blmeta_t *scan, - daddr_t blk, - daddr_t count, - daddr_t radix, - int skip -) { - int i; - int next_skip = ((u_int)skip / BLIST_META_RADIX); +blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix, + daddr_t skip, daddr_t cursor) +{ + daddr_t i, next_skip, r; + int child; + bool scan_from_start; - if (scan->u.bmu_avail == 0) { + if (radix == BLIST_BMAP_RADIX) + return (blst_leaf_alloc(scan, blk, count, cursor)); + if (scan->u.bmu_avail < count) { /* - * ALL-ALLOCATED special case + * The meta node's hint must be too large if the allocation + * exceeds the number of free blocks. Reduce the hint, and + * return failure. */ - scan->bm_bighint = count; - return(SWAPBLK_NONE); + scan->bm_bighint = scan->u.bmu_avail; + return (SWAPBLK_NONE); } + next_skip = skip / BLIST_META_RADIX; + /* + * An ALL-FREE meta node requires special handling before allocating + * any of its blocks. + */ if (scan->u.bmu_avail == radix) { radix /= BLIST_META_RADIX; /* - * ALL-FREE special case, initialize uninitialize - * sublevel. + * Reinitialize each of the meta node's children. An ALL-FREE + * meta node cannot have a terminator in any subtree. */ for (i = 1; i <= skip; i += next_skip) { - if (scan[i].bm_bighint == (daddr_t)-1) - break; - if (next_skip == 1) { + if (next_skip == 1) scan[i].u.bmu_bitmap = (u_daddr_t)-1; - scan[i].bm_bighint = BLIST_BMAP_RADIX; - } else { - scan[i].bm_bighint = radix; + else scan[i].u.bmu_avail = radix; - } + scan[i].bm_bighint = radix; } } else { radix /= BLIST_META_RADIX; } - for (i = 1; i <= skip; i += next_skip) { + if (count > radix) { + /* + * The allocation exceeds the number of blocks that are + * managed by a subtree of this meta node. + */ + panic("allocation too large"); + } + scan_from_start = cursor == blk; + child = (cursor - blk) / radix; + blk += child * radix; + for (i = 1 + child * next_skip; i <= skip; i += next_skip) { if (count <= scan[i].bm_bighint) { /* - * count fits in object + * The allocation might fit in the i'th subtree. */ - daddr_t r; - if (next_skip == 1) { - r = blst_leaf_alloc(&scan[i], blk, count); - } else { - r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1); - } + r = blst_meta_alloc(&scan[i], blk, count, radix, + next_skip - 1, cursor > blk ? cursor : blk); if (r != SWAPBLK_NONE) { scan->u.bmu_avail -= count; - if (scan->bm_bighint > scan->u.bmu_avail) - scan->bm_bighint = scan->u.bmu_avail; - return(r); + return (r); } } else if (scan[i].bm_bighint == (daddr_t)-1) { /* * Terminator */ break; - } else if (count > radix) { - /* - * count does not fit in object even if it were - * complete free. - */ - panic("blist_meta_alloc: allocation too large"); } blk += radix; } @@ -485,22 +502,19 @@ blst_meta_alloc( /* * We couldn't allocate count in this subtree, update bighint. */ - if (scan->bm_bighint >= count) + if (scan_from_start && scan->bm_bighint >= count) scan->bm_bighint = count - 1; - return(SWAPBLK_NONE); + + return (SWAPBLK_NONE); } /* * BLST_LEAF_FREE() - free allocated block from leaf bitmap * */ - static void -blst_leaf_free( - blmeta_t *scan, - daddr_t blk, - int count -) { +blst_leaf_free(blmeta_t *scan, daddr_t blk, int count) +{ /* * free some data in this bitmap * @@ -521,7 +535,7 @@ blst_leaf_free( /* * We could probably do a better job here. We are required to make - * bighint at least as large as the biggest contiguous block of + * bighint at least as large as the biggest contiguous block of * data. If we just shoehorn it, a little extra overhead will * be incured on the next allocation (but only that one typically). */ @@ -538,25 +552,18 @@ blst_leaf_free( * range whereas the allocation code cannot allocate an arbitrary * range). */ +static void +blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, daddr_t radix, + daddr_t skip, daddr_t blk) +{ + daddr_t i, next_skip, v; + int child; -static void -blst_meta_free( - blmeta_t *scan, - daddr_t freeBlk, - daddr_t count, - daddr_t radix, - int skip, - daddr_t blk -) { - int i; - int next_skip = ((u_int)skip / BLIST_META_RADIX); - -#if 0 - printf("free (%llx,%lld) FROM (%llx,%lld)\n", - (long long)freeBlk, (long long)count, - (long long)blk, (long long)radix - ); -#endif + if (scan->bm_bighint == (daddr_t)-1) + panic("freeing invalid range"); + if (radix == BLIST_BMAP_RADIX) + return (blst_leaf_free(scan, freeBlk, count)); + next_skip = skip / BLIST_META_RADIX; if (scan->u.bmu_avail == 0) { /* @@ -601,27 +608,16 @@ blst_meta_free( radix /= BLIST_META_RADIX; - i = (freeBlk - blk) / radix; - blk += i * radix; - i = i * next_skip + 1; - + child = (freeBlk - blk) / radix; + blk += child * radix; + i = 1 + child * next_skip; while (i <= skip && blk < freeBlk + count) { - daddr_t v; - v = blk + radix - freeBlk; if (v > count) v = count; - - if (scan->bm_bighint == (daddr_t)-1) - panic("blst_meta_free: freeing unexpected range"); - - if (next_skip == 1) { - blst_leaf_free(&scan[i], freeBlk, v); - } else { - blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk); - } + blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk); if (scan->bm_bighint < scan[i].bm_bighint) - scan->bm_bighint = scan[i].bm_bighint; + scan->bm_bighint = scan[i].bm_bighint; count -= v; freeBlk += v; blk += radix; @@ -635,17 +631,11 @@ blst_meta_free( * Locates free space in the source tree and frees it in the destination * tree. The space may not already be free in the destination. */ - -static void blst_copy( - blmeta_t *scan, - daddr_t blk, - daddr_t radix, - daddr_t skip, - blist_t dest, - daddr_t count -) { - int next_skip; - int i; +static void +blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip, + blist_t dest, daddr_t count) +{ + daddr_t i, next_skip; /* * Leaf node @@ -660,7 +650,7 @@ static void blst_copy( int i; for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) { - if (v & (1 << i)) + if (v & ((u_daddr_t)1 << i)) blist_free(dest, blk + i, 1); } } @@ -676,7 +666,7 @@ static void blst_copy( * Source all allocated, leave dest allocated */ return; - } + } if (scan->u.bmu_avail == radix) { /* * Source all free, free entire dest @@ -690,32 +680,20 @@ static void blst_copy( radix /= BLIST_META_RADIX; - next_skip = ((u_int)skip / BLIST_META_RADIX); + next_skip = skip / BLIST_META_RADIX; for (i = 1; count && i <= skip; i += next_skip) { if (scan[i].bm_bighint == (daddr_t)-1) break; if (count >= radix) { - blst_copy( - &scan[i], - blk, - radix, - next_skip - 1, - dest, - radix - ); + blst_copy(&scan[i], blk, radix, next_skip - 1, dest, + radix); count -= radix; } else { if (count) { - blst_copy( - &scan[i], - blk, - radix, - next_skip - 1, - dest, - count - ); + blst_copy(&scan[i], blk, radix, next_skip - 1, + dest, count); } count = 0; } @@ -730,24 +708,21 @@ static void blst_copy( * regardless of any existing allocations in that range. Returns * the number of blocks allocated by the call. */ - -static int +static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count) { int n = blk & (BLIST_BMAP_RADIX - 1); - int nblks; - u_daddr_t mask, bitmap; + daddr_t nblks; + u_daddr_t mask; mask = ((u_daddr_t)-1 << n) & ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n)); - /* Count the number of blocks we're about to allocate */ - bitmap = scan->u.bmu_bitmap & mask; - for (nblks = 0; bitmap != 0; nblks++) - bitmap &= bitmap - 1; + /* Count the number of blocks that we are allocating. */ + nblks = bitcount64(scan->u.bmu_bitmap & mask); scan->u.bmu_bitmap &= ~mask; - return nblks; + return (nblks); } /* @@ -758,80 +733,74 @@ blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count) * range must be within the extent of this node. Returns the * number of blocks allocated by the call. */ -static int -blst_meta_fill( - blmeta_t *scan, - daddr_t allocBlk, - daddr_t count, - daddr_t radix, - int skip, - daddr_t blk -) { - int i; - int next_skip = ((u_int)skip / BLIST_META_RADIX); - int nblks = 0; +static daddr_t +blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, daddr_t radix, + daddr_t skip, daddr_t blk) +{ + daddr_t i, nblks, next_skip, v; + int child; + if (scan->bm_bighint == (daddr_t)-1) + panic("filling invalid range"); + if (count > radix) { + /* + * The allocation exceeds the number of blocks that are + * managed by this node. + */ + panic("fill too large"); + } + if (radix == BLIST_BMAP_RADIX) + return (blst_leaf_fill(scan, allocBlk, count)); if (count == radix || scan->u.bmu_avail == 0) { /* * ALL-ALLOCATED special case */ nblks = scan->u.bmu_avail; scan->u.bmu_avail = 0; - scan->bm_bighint = count; - return nblks; + scan->bm_bighint = 0; + return (nblks); } + next_skip = skip / BLIST_META_RADIX; + /* + * An ALL-FREE meta node requires special handling before allocating + * any of its blocks. + */ if (scan->u.bmu_avail == radix) { radix /= BLIST_META_RADIX; /* - * ALL-FREE special case, initialize sublevel + * Reinitialize each of the meta node's children. An ALL-FREE + * meta node cannot have a terminator in any subtree. */ for (i = 1; i <= skip; i += next_skip) { - if (scan[i].bm_bighint == (daddr_t)-1) - break; - if (next_skip == 1) { + if (next_skip == 1) scan[i].u.bmu_bitmap = (u_daddr_t)-1; - scan[i].bm_bighint = BLIST_BMAP_RADIX; - } else { - scan[i].bm_bighint = radix; + else scan[i].u.bmu_avail = radix; - } + scan[i].bm_bighint = radix; } } else { radix /= BLIST_META_RADIX; } - if (count > radix) - panic("blist_meta_fill: allocation too large"); - - i = (allocBlk - blk) / radix; - blk += i * radix; - i = i * next_skip + 1; - + nblks = 0; + child = (allocBlk - blk) / radix; + blk += child * radix; + i = 1 + child * next_skip; while (i <= skip && blk < allocBlk + count) { - daddr_t v; - v = blk + radix - allocBlk; if (v > count) v = count; - - if (scan->bm_bighint == (daddr_t)-1) - panic("blst_meta_fill: filling unexpected range"); - - if (next_skip == 1) { - nblks += blst_leaf_fill(&scan[i], allocBlk, v); - } else { - nblks += blst_meta_fill(&scan[i], allocBlk, v, - radix, next_skip - 1, blk); - } + nblks += blst_meta_fill(&scan[i], allocBlk, v, radix, + next_skip - 1, blk); count -= v; allocBlk += v; blk += radix; i += next_skip; } scan->u.bmu_avail -= nblks; - return nblks; + return (nblks); } /* @@ -842,13 +811,12 @@ blst_meta_fill( * be considerably less than the calculated radix due to the large * RADIX values we use. */ - -static daddr_t -blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count) +static daddr_t +blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip, daddr_t count) { - int i; - int next_skip; - daddr_t memindex = 0; + daddr_t i, memindex, next_skip; + + memindex = 0; /* * Leaf node @@ -859,7 +827,7 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count) scan->bm_bighint = 0; scan->u.bmu_bitmap = 0; } - return(memindex); + return (memindex); } /* @@ -874,30 +842,24 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count) } radix /= BLIST_META_RADIX; - next_skip = ((u_int)skip / BLIST_META_RADIX); + next_skip = skip / BLIST_META_RADIX; for (i = 1; i <= skip; i += next_skip) { if (count >= radix) { /* * Allocate the entire object */ - memindex = i + blst_radix_init( - ((scan) ? &scan[i] : NULL), - radix, - next_skip - 1, - radix - ); + memindex = i + + blst_radix_init(((scan) ? &scan[i] : NULL), radix, + next_skip - 1, radix); count -= radix; } else if (count > 0) { /* * Allocate a partial object */ - memindex = i + blst_radix_init( - ((scan) ? &scan[i] : NULL), - radix, - next_skip - 1, - count - ); + memindex = i + + blst_radix_init(((scan) ? &scan[i] : NULL), radix, + next_skip - 1, count); count = 0; } else { /* @@ -910,21 +872,20 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count) } if (memindex < i) memindex = i; - return(memindex); + return (memindex); } #ifdef BLIST_DEBUG -static void -blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab) +static void +blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip, + int tab) { - int i; - int next_skip; - int lastState = 0; + daddr_t i, next_skip; if (radix == BLIST_BMAP_RADIX) { printf( - "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n", + "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n", tab, tab, "", (long long)blk, (long long)radix, (long long)scan->u.bmu_bitmap, @@ -962,7 +923,7 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab) ); radix /= BLIST_META_RADIX; - next_skip = ((u_int)skip / BLIST_META_RADIX); + next_skip = skip / BLIST_META_RADIX; tab += 4; for (i = 1; i <= skip; i += next_skip) { @@ -972,16 +933,9 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab) tab, tab, "", (long long)blk, (long long)radix ); - lastState = 0; break; } - blst_radix_print( - &scan[i], - blk, - radix, - next_skip - 1, - tab - ); + blst_radix_print(&scan[i], blk, radix, next_skip - 1, tab); blk += radix; } tab -= 4; @@ -1018,11 +972,10 @@ main(int ac, char **av) for (;;) { char buf[1024]; - daddr_t da = 0; - daddr_t count = 0; - + long long da = 0; + long long count = 0; - printf("%lld/%lld/%lld> ", (long long)bl->bl_free, + printf("%lld/%lld/%lld> ", (long long)blist_avail(bl), (long long)size, (long long)bl->bl_radix); fflush(stdout); if (fgets(buf, sizeof(buf), stdin) == NULL) @@ -1030,7 +983,7 @@ main(int ac, char **av) switch(buf[0]) { case 'r': if (sscanf(buf + 1, "%lld", &count) == 1) { - blist_resize(&bl, count, 1); + blist_resize(&bl, count, 1, M_WAITOK); } else { printf("?\n"); } @@ -1046,18 +999,16 @@ main(int ac, char **av) } break; case 'f': - if (sscanf(buf + 1, "%llx %lld", - (long long *)&da, (long long *)&count) == 2) { + if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) { blist_free(bl, da, count); } else { printf("?\n"); } break; case 'l': - if (sscanf(buf + 1, "%llx %lld", - (long long *)&da, (long long *)&count) == 2) { - printf(" n=%d\n", - blist_fill(bl, da, count)); + if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) { + printf(" n=%jd\n", + (intmax_t)blist_fill(bl, da, count)); } else { printf("?\n"); } @@ -1094,4 +1045,3 @@ panic(const char *ctl, ...) } #endif - diff --git a/freebsd/sys/kern/subr_prf.c b/freebsd/sys/kern/subr_prf.c index 39f5826d..0380cfec 100644 --- a/freebsd/sys/kern/subr_prf.c +++ b/freebsd/sys/kern/subr_prf.c @@ -411,7 +411,6 @@ log_console(struct uio *uio) msgbuftrigger = 1; free(uio, M_IOV); free(consbuffer, M_TEMP); - return; } #endif /* __rtems__ */ @@ -678,7 +677,7 @@ kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_lis uintmax_t num; int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot; int cflag, hflag, jflag, tflag, zflag; - int dwidth, upper; + int bconv, dwidth, upper; char padc; int stop = 0, retval = 0; @@ -704,7 +703,7 @@ kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_lis } percent = fmt - 1; qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; - sign = 0; dot = 0; dwidth = 0; upper = 0; + sign = 0; dot = 0; bconv = 0; dwidth = 0; upper = 0; cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0; reswitch: switch (ch = (u_char)*fmt++) { case '.': @@ -752,28 +751,9 @@ reswitch: switch (ch = (u_char)*fmt++) { width = n; goto reswitch; case 'b': - num = (u_int)va_arg(ap, int); - p = va_arg(ap, char *); - for (q = ksprintn(nbuf, num, *p++, NULL, 0); *q;) - PCHAR(*q--); - - if (num == 0) - break; - - for (tmp = 0; *p;) { - n = *p++; - if (num & (1 << (n - 1))) { - PCHAR(tmp ? ',' : '<'); - for (; (n = *p) > ' '; ++p) - PCHAR(n); - tmp = 1; - } else - for (; *p > ' '; ++p) - continue; - } - if (tmp) - PCHAR('>'); - break; + ladjust = 1; + bconv = 1; + goto handle_nosign; case 'c': width -= 1; @@ -919,6 +899,10 @@ handle_nosign: num = (u_char)va_arg(ap, int); else num = va_arg(ap, u_int); + if (bconv) { + q = va_arg(ap, char *); + base = *q++; + } goto number; handle_sign: if (jflag) @@ -976,6 +960,26 @@ number: while (*p) PCHAR(*p--); + if (bconv && num != 0) { + /* %b conversion flag format. */ + tmp = retval; + while (*q) { + n = *q++; + if (num & (1 << (n - 1))) { + PCHAR(retval != tmp ? + ',' : '<'); + for (; (n = *q) > ' '; ++q) + PCHAR(n); + } else + for (; *q > ' '; ++q) + continue; + } + if (retval != tmp) { + PCHAR('>'); + width -= retval - tmp; + } + } + if (ladjust) while (width-- > 0) PCHAR(' '); diff --git a/freebsd/sys/kern/subr_sbuf.c b/freebsd/sys/kern/subr_sbuf.c index 680613b1..8dd11b07 100644 --- a/freebsd/sys/kern/subr_sbuf.c +++ b/freebsd/sys/kern/subr_sbuf.c @@ -106,7 +106,7 @@ _assert_sbuf_integrity(const char *fun, struct sbuf *s) ("%s called with a NULL sbuf pointer", fun)); KASSERT(s->s_buf != NULL, ("%s called with uninitialized or corrupt sbuf", fun)); - if (SBUF_ISFINISHED(s) && SBUF_NULINCLUDED(s)) { + if (SBUF_ISFINISHED(s) && SBUF_NULINCLUDED(s)) { KASSERT(s->s_len <= s->s_size, ("wrote past end of sbuf (%jd >= %jd)", (intmax_t)s->s_len, (intmax_t)s->s_size)); diff --git a/freebsd/sys/kern/subr_taskqueue.c b/freebsd/sys/kern/subr_taskqueue.c index 6f1ba19a..74b9cf59 100644 --- a/freebsd/sys/kern/subr_taskqueue.c +++ b/freebsd/sys/kern/subr_taskqueue.c @@ -316,8 +316,8 @@ taskqueue_timeout_func(void *arg) } int -taskqueue_enqueue_timeout(struct taskqueue *queue, - struct timeout_task *timeout_task, int ticks) +taskqueue_enqueue_timeout_sbt(struct taskqueue *queue, + struct timeout_task *timeout_task, sbintime_t sbt, sbintime_t pr, int flags) { int res; @@ -333,7 +333,7 @@ taskqueue_enqueue_timeout(struct taskqueue *queue, /* Do nothing */ TQ_UNLOCK(queue); res = -1; - } else if (ticks == 0) { + } else if (sbt == 0) { taskqueue_enqueue_locked(queue, &timeout_task->t); /* The lock is released inside. */ } else { @@ -342,18 +342,27 @@ taskqueue_enqueue_timeout(struct taskqueue *queue, } else { queue->tq_callouts++; timeout_task->f |= DT_CALLOUT_ARMED; - if (ticks < 0) - ticks = -ticks; /* Ignore overflow. */ + if (sbt < 0) + sbt = -sbt; /* Ignore overflow. */ } - if (ticks > 0) { - callout_reset(&timeout_task->c, ticks, - taskqueue_timeout_func, timeout_task); + if (sbt > 0) { + callout_reset_sbt(&timeout_task->c, sbt, pr, + taskqueue_timeout_func, timeout_task, flags); } TQ_UNLOCK(queue); } return (res); } +int +taskqueue_enqueue_timeout(struct taskqueue *queue, + struct timeout_task *ttask, int ticks) +{ + + return (taskqueue_enqueue_timeout_sbt(queue, ttask, ticks * tick_sbt, + 0, 0)); +} + static void taskqueue_task_nop_fn(void *context, int pending) { diff --git a/freebsd/sys/kern/subr_uio.c b/freebsd/sys/kern/subr_uio.c index 5740e667..904ef1f4 100644 --- a/freebsd/sys/kern/subr_uio.c +++ b/freebsd/sys/kern/subr_uio.c @@ -212,41 +212,37 @@ uiomove_nofault(void *cp, int n, struct uio *uio) static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault) { -#ifndef __rtems__ - struct thread *td; -#endif /* __rtems__ */ struct iovec *iov; size_t cnt; - int error, newflags, save; - #ifndef __rtems__ - td = curthread; + int error, newflags, save; +#else /* __rtems__ */ + int error; #endif /* __rtems__ */ + error = 0; +#ifndef __rtems__ KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, ("uiomove: mode")); -#ifndef __rtems__ - KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td, + KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread, ("uiomove proc")); -#endif /* __rtems__ */ - if (!nofault) - WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, - "Calling uiomove()"); -#ifndef __rtems__ - /* XXX does it make a sense to set TDP_DEADLKTREAT for UIO_SYSSPACE ? */ - newflags = TDP_DEADLKTREAT; - if (uio->uio_segflg == UIO_USERSPACE && nofault) { - /* - * Fail if a non-spurious page fault occurs. - */ - newflags |= TDP_NOFAULTING | TDP_RESETSPUR; + if (uio->uio_segflg == UIO_USERSPACE) { + newflags = TDP_DEADLKTREAT; + if (nofault) { + /* + * Fail if a non-spurious page fault occurs. + */ + newflags |= TDP_NOFAULTING | TDP_RESETSPUR; + } else { + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, + "Calling uiomove()"); + } + save = curthread_pflags_set(newflags); + } else { + KASSERT(nofault == 0, ("uiomove: nofault")); } - save = curthread_pflags_set(newflags); -#else /* __rtems__ */ - (void) newflags; - (void) save; #endif /* __rtems__ */ while (n > 0 && uio->uio_resid) { @@ -292,7 +288,8 @@ uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault) } out: #ifndef __rtems__ - curthread_pflags_restore(save); + if (uio->uio_segflg == UIO_USERSPACE) + curthread_pflags_restore(save); #endif /* __rtems__ */ return (error); } diff --git a/freebsd/sys/kern/sys_socket.c b/freebsd/sys/kern/sys_socket.c index 8d87c51b..9dd458f1 100644 --- a/freebsd/sys/kern/sys_socket.c +++ b/freebsd/sys/kern/sys_socket.c @@ -318,32 +318,36 @@ soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, break; case FIOASYNC: - /* - * XXXRW: This code separately acquires SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) even though they are the same - * mutex to avoid introducing the assumption that they are - * the same. - */ if (*(int *)data) { SOCK_LOCK(so); so->so_state |= SS_ASYNC; + if (SOLISTENING(so)) { + so->sol_sbrcv_flags |= SB_ASYNC; + so->sol_sbsnd_flags |= SB_ASYNC; + } else { + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_flags |= SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_flags |= SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_snd); + } SOCK_UNLOCK(so); - SOCKBUF_LOCK(&so->so_rcv); - so->so_rcv.sb_flags |= SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_LOCK(&so->so_snd); - so->so_snd.sb_flags |= SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_snd); } else { SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; + if (SOLISTENING(so)) { + so->sol_sbrcv_flags &= ~SB_ASYNC; + so->sol_sbsnd_flags &= ~SB_ASYNC; + } else { + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_flags &= ~SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_flags &= ~SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_snd); + } SOCK_UNLOCK(so); - SOCKBUF_LOCK(&so->so_rcv); - so->so_rcv.sb_flags &= ~SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_LOCK(&so->so_snd); - so->so_snd.sb_flags &= ~SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_snd); } break; @@ -477,7 +481,6 @@ static int soo_stat(struct socket *so, struct stat *ub) { #endif /* __rtems__ */ - struct sockbuf *sb; #ifdef MAC int error; #endif @@ -491,22 +494,26 @@ soo_stat(struct socket *so, struct stat *ub) if (error) return (error); #endif - /* - * If SBS_CANTRCVMORE is set, but there's still data left in the - * receive buffer, the socket is still readable. - */ - sb = &so->so_rcv; - SOCKBUF_LOCK(sb); - if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb)) - ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; - ub->st_size = sbavail(sb) - sb->sb_ctl; - SOCKBUF_UNLOCK(sb); + if (!SOLISTENING(so)) { + struct sockbuf *sb; - sb = &so->so_snd; - SOCKBUF_LOCK(sb); - if ((sb->sb_state & SBS_CANTSENDMORE) == 0) - ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; - SOCKBUF_UNLOCK(sb); + /* + * If SBS_CANTRCVMORE is set, but there's still data left + * in the receive buffer, the socket is still readable. + */ + sb = &so->so_rcv; + SOCKBUF_LOCK(sb); + if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb)) + ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; + ub->st_size = sbavail(sb) - sb->sb_ctl; + SOCKBUF_UNLOCK(sb); + + sb = &so->so_snd; + SOCKBUF_LOCK(sb); + if ((sb->sb_state & SBS_CANTSENDMORE) == 0) + ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; + SOCKBUF_UNLOCK(sb); + } #ifndef __rtems__ ub->st_uid = so->so_cred->cr_uid; ub->st_gid = so->so_cred->cr_gid; @@ -916,6 +923,7 @@ soaio_process_sb(struct socket *so, struct sockbuf *sb) { struct kaiocb *job; + CURVNET_SET(so->so_vnet); SOCKBUF_LOCK(sb); while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) { job = TAILQ_FIRST(&sb->sb_aiojobq); @@ -936,9 +944,9 @@ soaio_process_sb(struct socket *so, struct sockbuf *sb) sb->sb_flags &= ~SB_AIO_RUNNING; SOCKBUF_UNLOCK(sb); - ACCEPT_LOCK(); SOCK_LOCK(so); sorele(so); + CURVNET_RESTORE(); } void diff --git a/freebsd/sys/kern/uipc_accf.c b/freebsd/sys/kern/uipc_accf.c index a766adf8..8a0e14e3 100644 --- a/freebsd/sys/kern/uipc_accf.c +++ b/freebsd/sys/kern/uipc_accf.c @@ -132,8 +132,7 @@ accept_filt_generic_mod_event(module_t mod, int event, void *data) switch (event) { case MOD_LOAD: - p = malloc(sizeof(*p), M_ACCF, - M_WAITOK); + p = malloc(sizeof(*p), M_ACCF, M_WAITOK); bcopy(accfp, p, sizeof(*p)); error = accept_filt_add(p); break; @@ -164,26 +163,25 @@ accept_filt_generic_mod_event(module_t mod, int event, void *data) } int -do_getopt_accept_filter(struct socket *so, struct sockopt *sopt) +accept_filt_getopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; int error; error = 0; - afap = malloc(sizeof(*afap), M_TEMP, - M_WAITOK | M_ZERO); + afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO); SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto out; } - if ((so->so_options & SO_ACCEPTFILTER) == 0) { + if (so->sol_accept_filter == NULL) { error = EINVAL; goto out; } - strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); - if (so->so_accf->so_accept_filter_str != NULL) - strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); + strcpy(afap->af_name, so->sol_accept_filter->accf_name); + if (so->sol_accept_filter_str != NULL) + strcpy(afap->af_arg, so->sol_accept_filter_str); out: SOCK_UNLOCK(so); if (error == 0) @@ -193,35 +191,61 @@ out: } int -do_setopt_accept_filter(struct socket *so, struct sockopt *sopt) +accept_filt_setopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; struct accept_filter *afp; - struct so_accf *newaf; - int error = 0; + char *accept_filter_str = NULL; + void *accept_filter_arg = NULL; + int error; /* * Handle the simple delete case first. */ if (sopt == NULL || sopt->sopt_val == NULL) { + struct socket *sp, *sp1; + int wakeup; + SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { SOCK_UNLOCK(so); return (EINVAL); } - if (so->so_accf != NULL) { - struct so_accf *af = so->so_accf; - if (af->so_accept_filter != NULL && - af->so_accept_filter->accf_destroy != NULL) { - af->so_accept_filter->accf_destroy(so); - } - if (af->so_accept_filter_str != NULL) - free(af->so_accept_filter_str, M_ACCF); - free(af, M_ACCF); - so->so_accf = NULL; + if (so->sol_accept_filter == NULL) { + SOCK_UNLOCK(so); + return (0); } + if (so->sol_accept_filter->accf_destroy != NULL) + so->sol_accept_filter->accf_destroy(so); + if (so->sol_accept_filter_str != NULL) + free(so->sol_accept_filter_str, M_ACCF); + so->sol_accept_filter = NULL; + so->sol_accept_filter_arg = NULL; + so->sol_accept_filter_str = NULL; so->so_options &= ~SO_ACCEPTFILTER; - SOCK_UNLOCK(so); + + /* + * Move from incomplete queue to complete only those + * connections, that are blocked by us. + */ + wakeup = 0; + TAILQ_FOREACH_SAFE(sp, &so->sol_incomp, so_list, sp1) { + SOCK_LOCK(sp); + if (sp->so_options & SO_ACCEPTFILTER) { + TAILQ_REMOVE(&so->sol_incomp, sp, so_list); + TAILQ_INSERT_TAIL(&so->sol_comp, sp, so_list); + sp->so_qstate = SQ_COMP; + sp->so_options &= ~SO_ACCEPTFILTER; + so->sol_incqlen--; + so->sol_qlen++; + wakeup = 1; + } + SOCK_UNLOCK(sp); + } + if (wakeup) + solisten_wakeup(so); /* unlocks */ + else + SOLISTEN_UNLOCK(so); return (0); } @@ -229,8 +253,7 @@ do_setopt_accept_filter(struct socket *so, struct sockopt *sopt) * Pre-allocate any memory we may need later to avoid blocking at * untimely moments. This does not optimize for invalid arguments. */ - afap = malloc(sizeof(*afap), M_TEMP, - M_WAITOK); + afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK); error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); afap->af_name[sizeof(afap->af_name)-1] = '\0'; afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; @@ -243,19 +266,10 @@ do_setopt_accept_filter(struct socket *so, struct sockopt *sopt) free(afap, M_TEMP); return (ENOENT); } - /* - * Allocate the new accept filter instance storage. We may - * have to free it again later if we fail to attach it. If - * attached properly, 'newaf' is NULLed to avoid a free() - * while in use. - */ - newaf = malloc(sizeof(*newaf), M_ACCF, M_WAITOK | - M_ZERO); if (afp->accf_create != NULL && afap->af_name[0] != '\0') { size_t len = strlen(afap->af_name) + 1; - newaf->so_accept_filter_str = malloc(len, M_ACCF, - M_WAITOK); - strcpy(newaf->so_accept_filter_str, afap->af_name); + accept_filter_str = malloc(len, M_ACCF, M_WAITOK); + strcpy(accept_filter_str, afap->af_name); } /* @@ -263,8 +277,8 @@ do_setopt_accept_filter(struct socket *so, struct sockopt *sopt) * without first removing it. */ SOCK_LOCK(so); - if (((so->so_options & SO_ACCEPTCONN) == 0) || - (so->so_accf != NULL)) { + if ((so->so_options & SO_ACCEPTCONN) == 0 || + so->sol_accept_filter != NULL) { error = EINVAL; goto out; } @@ -275,25 +289,20 @@ do_setopt_accept_filter(struct socket *so, struct sockopt *sopt) * can't block. */ if (afp->accf_create != NULL) { - newaf->so_accept_filter_arg = - afp->accf_create(so, afap->af_arg); - if (newaf->so_accept_filter_arg == NULL) { + accept_filter_arg = afp->accf_create(so, afap->af_arg); + if (accept_filter_arg == NULL) { error = EINVAL; goto out; } } - newaf->so_accept_filter = afp; - so->so_accf = newaf; + so->sol_accept_filter = afp; + so->sol_accept_filter_arg = accept_filter_arg; + so->sol_accept_filter_str = accept_filter_str; so->so_options |= SO_ACCEPTFILTER; - newaf = NULL; out: SOCK_UNLOCK(so); - if (newaf != NULL) { - if (newaf->so_accept_filter_str != NULL) - free(newaf->so_accept_filter_str, M_ACCF); - free(newaf, M_ACCF); - } - if (afap != NULL) - free(afap, M_TEMP); + if (accept_filter_str != NULL) + free(accept_filter_str, M_ACCF); + free(afap, M_TEMP); return (error); } diff --git a/freebsd/sys/kern/uipc_mbuf.c b/freebsd/sys/kern/uipc_mbuf.c index ba8a2d48..abc30dd3 100644 --- a/freebsd/sys/kern/uipc_mbuf.c +++ b/freebsd/sys/kern/uipc_mbuf.c @@ -1519,7 +1519,7 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) * the total data supplied by the uio. */ if (len > 0) - total = min(uio->uio_resid, len); + total = (uio->uio_resid < len) ? uio->uio_resid : len; else total = uio->uio_resid; diff --git a/freebsd/sys/kern/uipc_sockbuf.c b/freebsd/sys/kern/uipc_sockbuf.c index 04193c29..4b710a2c 100644 --- a/freebsd/sys/kern/uipc_sockbuf.c +++ b/freebsd/sys/kern/uipc_sockbuf.c @@ -316,14 +316,14 @@ sowakeup(struct socket *so, struct sockbuf *sb) SOCKBUF_LOCK_ASSERT(sb); - selwakeuppri(&sb->sb_sel, PSOCK); - if (!SEL_WAITING(&sb->sb_sel)) + selwakeuppri(sb->sb_sel, PSOCK); + if (!SEL_WAITING(sb->sb_sel)) sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_acc); } - KNOTE_LOCKED(&sb->sb_sel.si_note, 0); + KNOTE_LOCKED(&sb->sb_sel->si_note, 0); if (sb->sb_upcall != NULL) { ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT); if (ret == SU_ISCONNECTED) { @@ -336,7 +336,7 @@ sowakeup(struct socket *so, struct sockbuf *sb) if (sb->sb_flags & SB_AIO) sowakeup_aio(so, sb); SOCKBUF_UNLOCK(sb); - if (ret == SU_ISCONNECTED) + if (ret == SU_ISCONNECTED && !(so->so_state & SS_ISDISCONNECTED)) soisconnected(so); if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGIO, 0); @@ -457,14 +457,78 @@ sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so, } int -sbreserve(struct sockbuf *sb, u_long cc, struct socket *so, - struct thread *td) +sbsetopt(struct socket *so, int cmd, u_long cc) { + struct sockbuf *sb; + short *flags; + u_int *hiwat, *lowat; int error; - SOCKBUF_LOCK(sb); - error = sbreserve_locked(sb, cc, so, td); - SOCKBUF_UNLOCK(sb); + SOCK_LOCK(so); + if (SOLISTENING(so)) { + switch (cmd) { + case SO_SNDLOWAT: + case SO_SNDBUF: + lowat = &so->sol_sbsnd_lowat; + hiwat = &so->sol_sbsnd_hiwat; + flags = &so->sol_sbsnd_flags; + break; + case SO_RCVLOWAT: + case SO_RCVBUF: + lowat = &so->sol_sbrcv_lowat; + hiwat = &so->sol_sbrcv_hiwat; + flags = &so->sol_sbrcv_flags; + break; + } + } else { + switch (cmd) { + case SO_SNDLOWAT: + case SO_SNDBUF: + sb = &so->so_snd; + break; + case SO_RCVLOWAT: + case SO_RCVBUF: + sb = &so->so_rcv; + break; + } + flags = &sb->sb_flags; + hiwat = &sb->sb_hiwat; + lowat = &sb->sb_lowat; + SOCKBUF_LOCK(sb); + } + + error = 0; + switch (cmd) { + case SO_SNDBUF: + case SO_RCVBUF: + if (SOLISTENING(so)) { + if (cc > sb_max_adj) { + error = ENOBUFS; + break; + } + *hiwat = cc; + if (*lowat > *hiwat) + *lowat = *hiwat; + } else { + if (!sbreserve_locked(sb, cc, so, curthread)) + error = ENOBUFS; + } + if (error == 0) + *flags &= ~SB_AUTOSIZE; + break; + case SO_SNDLOWAT: + case SO_RCVLOWAT: + /* + * Make sure the low-water is never greater than the + * high-water. + */ + *lowat = (cc > *hiwat) ? *hiwat : cc; + break; + } + + if (!SOLISTENING(so)) + SOCKBUF_UNLOCK(sb); + SOCK_UNLOCK(so); return (error); } diff --git a/freebsd/sys/kern/uipc_socket.c b/freebsd/sys/kern/uipc_socket.c index c52a543c..1773606d 100644 --- a/freebsd/sys/kern/uipc_socket.c +++ b/freebsd/sys/kern/uipc_socket.c @@ -108,6 +108,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_compat.h> +#include <rtems/bsd/local/opt_sctp.h> #include <sys/param.h> #include <sys/systm.h> @@ -160,13 +161,21 @@ __FBSDID("$FreeBSD$"); static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); +static void so_rdknl_lock(void *); +static void so_rdknl_unlock(void *); +static void so_rdknl_assert_locked(void *); +static void so_rdknl_assert_unlocked(void *); +static void so_wrknl_lock(void *); +static void so_wrknl_unlock(void *); +static void so_wrknl_assert_locked(void *); +static void so_wrknl_assert_unlocked(void *); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); -static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); static int filt_soempty(struct knote *kn, long hint); +static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); #ifdef __rtems__ static #endif /* __rtems__ */ @@ -412,8 +421,16 @@ soalloc(struct vnet *vnet) return (NULL); } + /* + * The socket locking protocol allows to lock 2 sockets at a time, + * however, the first one must be a listening socket. WITNESS lacks + * a feature to change class of an existing lock, so we use DUPOK. + */ + mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); + so->so_rcv.sb_sel = &so->so_rdsel; + so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); #ifndef __rtems__ @@ -465,15 +482,6 @@ sodealloc(struct socket *so) so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); - if (so->so_rcv.sb_hiwat) - (void)chgsbsize(so->so_cred->cr_uidinfo, - &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); - if (so->so_snd.sb_hiwat) - (void)chgsbsize(so->so_cred->cr_uidinfo, - &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); - /* remove accept filter if one is present. */ - if (so->so_accf != NULL) - do_setopt_accept_filter(so, NULL); #ifdef MAC mac_socket_destroy(so); #endif @@ -481,10 +489,22 @@ sodealloc(struct socket *so) crfree(so->so_cred); khelp_destroy_osd(&so->osd); - sx_destroy(&so->so_snd.sb_sx); - sx_destroy(&so->so_rcv.sb_sx); - SOCKBUF_LOCK_DESTROY(&so->so_snd); - SOCKBUF_LOCK_DESTROY(&so->so_rcv); + if (SOLISTENING(so)) { + if (so->sol_accept_filter != NULL) + accept_filt_setopt(so, NULL); + } else { + if (so->so_rcv.sb_hiwat) + (void)chgsbsize(so->so_cred->cr_uidinfo, + &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); + if (so->so_snd.sb_hiwat) + (void)chgsbsize(so->so_cred->cr_uidinfo, + &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); + sx_destroy(&so->so_snd.sb_sx); + sx_destroy(&so->so_rcv.sb_sx); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); + } + mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } @@ -527,8 +547,6 @@ socreate(int dom, struct socket **aso, int type, int proto, if (so == NULL) return (ENOBUFS); - TAILQ_INIT(&so->so_incomp); - TAILQ_INIT(&so->so_comp); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || @@ -545,9 +563,10 @@ socreate(int dom, struct socket **aso, int type, int proto, #ifdef MAC mac_socket_create(cred, so); #endif - knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); - knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); - so->so_count = 1; + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. @@ -556,12 +575,10 @@ socreate(int dom, struct socket **aso, int type, int proto, error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { - KASSERT(so->so_count == 1, ("socreate: so_count %d", - so->so_count)); - so->so_count = 0; sodealloc(so); return (error); } + soref(so); *aso = so; return (0); } @@ -589,11 +606,11 @@ sonewconn(struct socket *head, int connstatus) static int overcount; struct socket *so; - int over; + u_int over; - ACCEPT_LOCK(); - over = (head->so_qlen > 3 * head->so_qlimit / 2); - ACCEPT_UNLOCK(); + SOLISTEN_LOCK(head); + over = (head->sol_qlen > 3 * head->sol_qlimit / 2); + SOLISTEN_UNLOCK(head); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else @@ -605,15 +622,15 @@ sonewconn(struct socket *head, int connstatus) log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", - __func__, head->so_pcb, head->so_qlen, overcount); + __func__, head->so_pcb, head->sol_qlen, overcount); overcount = 0; } return (NULL); } - VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", - __func__, __LINE__, head)); + VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", + __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " @@ -621,11 +638,8 @@ sonewconn(struct socket *head, int connstatus) __func__, head->so_pcb); return (NULL); } - if ((head->so_options & SO_ACCEPTFILTER) != 0) - connstatus = 0; - so->so_head = head; + so->so_listen = head; so->so_type = head->so_type; - so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_fibnum = head->so_fibnum; @@ -634,10 +648,12 @@ sonewconn(struct socket *head, int connstatus) #ifdef MAC mac_socket_newconn(head, so); #endif - knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); - knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); VNET_SO_ASSERT(head); - if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); @@ -649,32 +665,24 @@ sonewconn(struct socket *head, int connstatus) __func__, head->so_pcb); return (NULL); } - so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; - so->so_snd.sb_lowat = head->so_snd.sb_lowat; - so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; - so->so_snd.sb_timeo = head->so_snd.sb_timeo; - so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; - so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; + so->so_snd.sb_lowat = head->sol_sbsnd_lowat; + so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; + so->so_snd.sb_timeo = head->sol_sbsnd_timeo; + so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; + + SOLISTEN_LOCK(head); + if (head->sol_accept_filter != NULL) + connstatus = 0; so->so_state |= connstatus; - ACCEPT_LOCK(); - /* - * The accept socket may be tearing down but we just - * won a race on the ACCEPT_LOCK. - * However, if sctp_peeloff() is called on a 1-to-many - * style socket, the SO_ACCEPTCONN doesn't need to be set. - */ - if (!(head->so_options & SO_ACCEPTCONN) && - ((head->so_proto->pr_protocol != IPPROTO_SCTP) || - (head->so_type != SOCK_SEQPACKET))) { - SOCK_LOCK(so); - so->so_head = NULL; - sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */ - return (NULL); - } + so->so_options = head->so_options & ~SO_ACCEPTCONN; + soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - so->so_qstate |= SQ_COMP; - head->so_qlen++; + TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); + so->so_qstate = SQ_COMP; + head->sol_qlen++; + solisten_wakeup(head); /* unlocks */ } else { /* * Keep removing sockets from the head until there's room for @@ -683,28 +691,86 @@ sonewconn(struct socket *head, int connstatus) * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ - while (head->so_incqlen > head->so_qlimit) { + while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; - sp = TAILQ_FIRST(&head->so_incomp); - TAILQ_REMOVE(&head->so_incomp, sp, so_list); - head->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); + + sp = TAILQ_FIRST(&head->sol_incomp); + TAILQ_REMOVE(&head->sol_incomp, sp, so_list); + head->sol_incqlen--; + SOCK_LOCK(sp); + sp->so_qstate = SQ_NONE; + sp->so_listen = NULL; + SOCK_UNLOCK(sp); + sorele(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); - ACCEPT_LOCK(); + SOLISTEN_LOCK(head); } - TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); - so->so_qstate |= SQ_INCOMP; - head->so_incqlen++; + TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); + so->so_qstate = SQ_INCOMP; + head->sol_incqlen++; + SOLISTEN_UNLOCK(head); } - ACCEPT_UNLOCK(); - if (connstatus) { - sorwakeup(head); - wakeup_one(&head->so_timeo); + return (so); +} + +#ifdef SCTP +/* + * Socket part of sctp_peeloff(). Detach a new socket from an + * association. The new socket is returned with a reference. + */ +struct socket * +sopeeloff(struct socket *head) +{ + struct socket *so; + + VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", + __func__, __LINE__, head)); + so = soalloc(head->so_vnet); + if (so == NULL) { + log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " + "limit reached or out of memory\n", + __func__, head->so_pcb); + return (NULL); } + so->so_type = head->so_type; + so->so_options = head->so_options; + so->so_linger = head->so_linger; + so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; + so->so_fibnum = head->so_fibnum; + so->so_proto = head->so_proto; + so->so_cred = crhold(head->so_cred); +#ifdef MAC + mac_socket_newconn(head, so); +#endif + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); + VNET_SO_ASSERT(head); + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", + __func__, head->so_pcb); + return (NULL); + } + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", + __func__, head->so_pcb); + return (NULL); + } + so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; + so->so_snd.sb_lowat = head->so_snd.sb_lowat; + so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; + so->so_snd.sb_timeo = head->so_snd.sb_timeo; + so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + + soref(so); + return (so); } +#endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) @@ -766,13 +832,140 @@ solisten_proto_check(struct socket *so) void solisten_proto(struct socket *so, int backlog) { + int sbrcv_lowat, sbsnd_lowat; + u_int sbrcv_hiwat, sbsnd_hiwat; + short sbrcv_flags, sbsnd_flags; + sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); + if (SOLISTENING(so)) + goto listening; + + /* + * Change this socket to listening state. + */ + sbrcv_lowat = so->so_rcv.sb_lowat; + sbsnd_lowat = so->so_snd.sb_lowat; + sbrcv_hiwat = so->so_rcv.sb_hiwat; + sbsnd_hiwat = so->so_snd.sb_hiwat; + sbrcv_flags = so->so_rcv.sb_flags; + sbsnd_flags = so->so_snd.sb_flags; + sbrcv_timeo = so->so_rcv.sb_timeo; + sbsnd_timeo = so->so_snd.sb_timeo; + + sbdestroy(&so->so_snd, so); + sbdestroy(&so->so_rcv, so); + sx_destroy(&so->so_snd.sb_sx); + sx_destroy(&so->so_rcv.sb_sx); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); + +#ifdef INVARIANTS + bzero(&so->so_rcv, + sizeof(struct socket) - offsetof(struct socket, so_rcv)); +#endif + + so->sol_sbrcv_lowat = sbrcv_lowat; + so->sol_sbsnd_lowat = sbsnd_lowat; + so->sol_sbrcv_hiwat = sbrcv_hiwat; + so->sol_sbsnd_hiwat = sbsnd_hiwat; + so->sol_sbrcv_flags = sbrcv_flags; + so->sol_sbsnd_flags = sbsnd_flags; + so->sol_sbrcv_timeo = sbrcv_timeo; + so->sol_sbsnd_timeo = sbsnd_timeo; + + so->sol_qlen = so->sol_incqlen = 0; + TAILQ_INIT(&so->sol_incomp); + TAILQ_INIT(&so->sol_comp); + + so->sol_accept_filter = NULL; + so->sol_accept_filter_arg = NULL; + so->sol_accept_filter_str = NULL; + + so->sol_upcall = NULL; + so->sol_upcallarg = NULL; + + so->so_options |= SO_ACCEPTCONN; + +listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; - so->so_qlimit = backlog; - so->so_options |= SO_ACCEPTCONN; + so->sol_qlimit = backlog; +} + +/* + * Wakeup listeners/subsystems once we have a complete connection. + * Enters with lock, returns unlocked. + */ +void +solisten_wakeup(struct socket *sol) +{ + + if (sol->sol_upcall != NULL) + (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); + else { + selwakeuppri(&sol->so_rdsel, PSOCK); + KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); + } + SOLISTEN_UNLOCK(sol); + wakeup_one(&sol->sol_comp); +} + +/* + * Return single connection off a listening socket queue. Main consumer of + * the function is kern_accept4(). Some modules, that do their own accept + * management also use the function. + * + * Listening socket must be locked on entry and is returned unlocked on + * return. + * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. + */ +int +solisten_dequeue(struct socket *head, struct socket **ret, int flags) +{ + struct socket *so; + int error; + + SOLISTEN_LOCK_ASSERT(head); + + while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && + head->so_error == 0) { + error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH, + "accept", 0); + if (error != 0) { + SOLISTEN_UNLOCK(head); + return (error); + } + } + if (head->so_error) { + error = head->so_error; + head->so_error = 0; + SOLISTEN_UNLOCK(head); + return (error); + } + if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) { + SOLISTEN_UNLOCK(head); + return (EWOULDBLOCK); + } + so = TAILQ_FIRST(&head->sol_comp); + SOCK_LOCK(so); + KASSERT(so->so_qstate == SQ_COMP, + ("%s: so %p not SQ_COMP", __func__, so)); + soref(so); + head->sol_qlen--; + so->so_qstate = SQ_NONE; + so->so_listen = NULL; + TAILQ_REMOVE(&head->sol_comp, so, so_list); + if (flags & ACCEPT4_INHERIT) + so->so_state |= (head->so_state & SS_NBIO); + else + so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; + SOCK_UNLOCK(so); + sorele(head); + + *ret = so; + return (0); } /* @@ -799,44 +992,62 @@ void sofree(struct socket *so) { struct protosw *pr = so->so_proto; - struct socket *head; - ACCEPT_LOCK_ASSERT(); SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || - (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { + (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) { SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); return; } - head = so->so_head; - if (head != NULL) { - KASSERT((so->so_qstate & SQ_COMP) != 0 || - (so->so_qstate & SQ_INCOMP) != 0, - ("sofree: so_head != NULL, but neither SQ_COMP nor " - "SQ_INCOMP")); - KASSERT((so->so_qstate & SQ_COMP) == 0 || - (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - so->so_head = NULL; - } - KASSERT((so->so_qstate & SQ_COMP) == 0 && - (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", - so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); - if (so->so_options & SO_ACCEPTCONN) { - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("sofree: so_comp populated")); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("sofree: so_incomp populated")); + if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) { + struct socket *sol; + + sol = so->so_listen; + KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so)); + + /* + * To solve race between close of a listening socket and + * a socket on its incomplete queue, we need to lock both. + * The order is first listening socket, then regular. + * Since we don't have SS_NOFDREF neither SS_PROTOREF, this + * function and the listening socket are the only pointers + * to so. To preserve so and sol, we reference both and then + * relock. + * After relock the socket may not move to so_comp since it + * doesn't have PCB already, but it may be removed from + * so_incomp. If that happens, we share responsiblity on + * freeing the socket, but soclose() has already removed + * it from queue. + */ + soref(sol); + soref(so); + SOCK_UNLOCK(so); + SOLISTEN_LOCK(sol); + SOCK_LOCK(so); + if (so->so_qstate == SQ_INCOMP) { + KASSERT(so->so_listen == sol, + ("%s: so %p migrated out of sol %p", + __func__, so, sol)); + TAILQ_REMOVE(&sol->sol_incomp, so, so_list); + sol->sol_incqlen--; + /* This is guarenteed not to be the last. */ + refcount_release(&sol->so_count); + so->so_qstate = SQ_NONE; + so->so_listen = NULL; + } else + KASSERT(so->so_listen == NULL, + ("%s: so %p not on (in)comp with so_listen", + __func__, so)); + sorele(sol); + KASSERT(so->so_count == 1, + ("%s: so %p count %u", __func__, so, so->so_count)); + so->so_count = 0; } + if (SOLISTENING(so)) + so->so_error = ECONNABORTED; SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) @@ -858,12 +1069,14 @@ sofree(struct socket *so) * before calling pru_detach. This means that protocols shold not * assume they can perform socket wakeups, etc, in their detach code. */ - sbdestroy(&so->so_snd, so); - sbdestroy(&so->so_rcv, so); - seldrain(&so->so_snd.sb_sel); - seldrain(&so->so_rcv.sb_sel); - knlist_destroy(&so->so_rcv.sb_sel.si_note); - knlist_destroy(&so->so_snd.sb_sel.si_note); + if (!SOLISTENING(so)) { + sbdestroy(&so->so_snd, so); + sbdestroy(&so->so_rcv, so); + } + seldrain(&so->so_rdsel); + seldrain(&so->so_wrsel); + knlist_destroy(&so->so_rdsel.si_note); + knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } @@ -878,6 +1091,8 @@ sofree(struct socket *so) int soclose(struct socket *so) { + struct accept_queue lqueue; + bool listening; int error = 0; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); @@ -910,41 +1125,42 @@ soclose(struct socket *so) drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); - ACCEPT_LOCK(); - if (so->so_options & SO_ACCEPTCONN) { + + SOCK_LOCK(so); + if ((listening = (so->so_options & SO_ACCEPTCONN))) { struct socket *sp; - /* - * Prevent new additions to the accept queues due - * to ACCEPT_LOCK races while we are draining them. - */ - so->so_options &= ~SO_ACCEPTCONN; - while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { - TAILQ_REMOVE(&so->so_incomp, sp, so_list); - so->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); - } - while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { - TAILQ_REMOVE(&so->so_comp, sp, so_list); - so->so_qlen--; - sp->so_qstate &= ~SQ_COMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); + + TAILQ_INIT(&lqueue); + TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); + TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); + + so->sol_qlen = so->sol_incqlen = 0; + + TAILQ_FOREACH(sp, &lqueue, so_list) { + SOCK_LOCK(sp); + sp->so_qstate = SQ_NONE; + sp->so_listen = NULL; + SOCK_UNLOCK(sp); + /* Guaranteed not to be the last. */ + refcount_release(&so->so_count); } - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("%s: so_comp populated", __func__)); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("%s: so_incomp populated", __func__)); } - SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; - sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */ + sorele(so); + if (listening) { + struct socket *sp; + + TAILQ_FOREACH(sp, &lqueue, so_list) { + SOCK_LOCK(sp); + if (sp->so_count == 0) { + SOCK_UNLOCK(sp); + soabort(sp); + } else + /* sp is now in sofree() */ + SOCK_UNLOCK(sp); + } + } CURVNET_RESTORE(); return (error); } @@ -976,13 +1192,11 @@ soabort(struct socket *so) KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); - KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); - KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); + KASSERT(so->so_qstate == SQ_NONE, ("soabort: !SQ_NONE")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); - ACCEPT_LOCK(); SOCK_LOCK(so); sofree(so); } @@ -1431,8 +1645,14 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, int error; CURVNET_SET(so->so_vnet); - error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, - control, flags, td); + if (!SOLISTENING(so)) + error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, + top, control, flags, td); + else { + m_freem(top); + m_freem(control); + error = ENOTCONN; + } CURVNET_RESTORE(); return (error); } @@ -2368,8 +2588,11 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, int error; CURVNET_SET(so->so_vnet); - error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0, - controlp, flagsp)); + if (!SOLISTENING(so)) + error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, + mp0, controlp, flagsp)); + else + error = ENOTCONN; CURVNET_RESTORE(); return (error); } @@ -2565,7 +2788,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: - error = do_setopt_accept_filter(so, sopt); + error = accept_filt_setopt(so, sopt); if (error) goto bad; break; @@ -2653,38 +2876,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) goto bad; } - switch (sopt->sopt_name) { - case SO_SNDBUF: - case SO_RCVBUF: - if (sbreserve(sopt->sopt_name == SO_SNDBUF ? - &so->so_snd : &so->so_rcv, (u_long)optval, - so, curthread) == 0) { - error = ENOBUFS; - goto bad; - } - (sopt->sopt_name == SO_SNDBUF ? &so->so_snd : - &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE; - break; - - /* - * Make sure the low-water is never greater than the - * high-water. - */ - case SO_SNDLOWAT: - SOCKBUF_LOCK(&so->so_snd); - so->so_snd.sb_lowat = - (optval > so->so_snd.sb_hiwat) ? - so->so_snd.sb_hiwat : optval; - SOCKBUF_UNLOCK(&so->so_snd); - break; - case SO_RCVLOWAT: - SOCKBUF_LOCK(&so->so_rcv); - so->so_rcv.sb_lowat = - (optval > so->so_rcv.sb_hiwat) ? - so->so_rcv.sb_hiwat : optval; - SOCKBUF_UNLOCK(&so->so_rcv); - break; - } + error = sbsetopt(so, sopt->sopt_name, optval); break; case SO_SNDTIMEO: @@ -2825,7 +3017,7 @@ sogetopt(struct socket *so, struct sockopt *sopt) } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: - error = do_getopt_accept_filter(so, sopt); + error = accept_filt_getopt(so, sopt); break; case SO_LINGER: @@ -2869,19 +3061,23 @@ integer: goto integer; case SO_SNDBUF: - optval = so->so_snd.sb_hiwat; + optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : + so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: - optval = so->so_rcv.sb_hiwat; + optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : + so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: - optval = so->so_snd.sb_lowat; + optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : + so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: - optval = so->so_rcv.sb_lowat; + optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : + so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: @@ -2933,15 +3129,15 @@ integer: break; case SO_LISTENQLIMIT: - optval = so->so_qlimit; + optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: - optval = so->so_qlen; + optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: - optval = so->so_incqlen; + optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: @@ -3092,7 +3288,7 @@ sohasoutofband(struct socket *so) if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); #endif /* __rtems__ */ - selwakeuppri(&so->so_rcv.sb_sel, PSOCK); + selwakeuppri(&so->so_rdsel, PSOCK); } int @@ -3112,44 +3308,54 @@ int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { - int revents = 0; - - SOCKBUF_LOCK(&so->so_snd); - SOCKBUF_LOCK(&so->so_rcv); - if (events & (POLLIN | POLLRDNORM)) - if (soreadabledata(so)) - revents |= events & (POLLIN | POLLRDNORM); + int revents; - if (events & (POLLOUT | POLLWRNORM)) - if (sowriteable(so)) - revents |= events & (POLLOUT | POLLWRNORM); - - if (events & (POLLPRI | POLLRDBAND)) - if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) - revents |= events & (POLLPRI | POLLRDBAND); - - if ((events & POLLINIGNEOF) == 0) { - if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { - revents |= events & (POLLIN | POLLRDNORM); - if (so->so_snd.sb_state & SBS_CANTSENDMORE) - revents |= POLLHUP; + SOCK_LOCK(so); + if (SOLISTENING(so)) { + if (!(events & (POLLIN | POLLRDNORM))) + revents = 0; + else if (!TAILQ_EMPTY(&so->sol_comp)) + revents = events & (POLLIN | POLLRDNORM); + else { + selrecord(td, &so->so_rdsel); + revents = 0; } - } - - if (revents == 0) { - if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { - selrecord(td, &so->so_rcv.sb_sel); - so->so_rcv.sb_flags |= SB_SEL; + } else { + revents = 0; + SOCKBUF_LOCK(&so->so_snd); + SOCKBUF_LOCK(&so->so_rcv); + if (events & (POLLIN | POLLRDNORM)) + if (soreadabledata(so)) + revents |= events & (POLLIN | POLLRDNORM); + if (events & (POLLOUT | POLLWRNORM)) + if (sowriteable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || + (so->so_rcv.sb_state & SBS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + if ((events & POLLINIGNEOF) == 0) { + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + revents |= events & (POLLIN | POLLRDNORM); + if (so->so_snd.sb_state & SBS_CANTSENDMORE) + revents |= POLLHUP; + } } - - if (events & (POLLOUT | POLLWRNORM)) { - selrecord(td, &so->so_snd.sb_sel); - so->so_snd.sb_flags |= SB_SEL; + if (revents == 0) { + if (events & + (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { + selrecord(td, &so->so_rdsel); + so->so_rcv.sb_flags |= SB_SEL; + } + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(td, &so->so_wrsel); + so->so_snd.sb_flags |= SB_SEL; + } } + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_UNLOCK(&so->so_snd); } - - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_UNLOCK(so); return (revents); } @@ -3158,28 +3364,38 @@ soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; + struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; + knl = &so->so_rdsel.si_note; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; + knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; + knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; default: return (EINVAL); } - SOCKBUF_LOCK(sb); - knlist_add(&sb->sb_sel.si_note, kn, 1); - sb->sb_flags |= SB_KNOTE; - SOCKBUF_UNLOCK(sb); + SOCK_LOCK(so); + if (SOLISTENING(so)) { + knlist_add(knl, kn, 1); + } else { + SOCKBUF_LOCK(sb); + knlist_add(knl, kn, 1); + sb->sb_flags |= SB_KNOTE; + SOCKBUF_UNLOCK(sb); + } + SOCK_UNLOCK(so); return (0); } #ifdef __rtems__ @@ -3367,11 +3583,11 @@ filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; - SOCKBUF_LOCK(&so->so_rcv); - knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_rcv.sb_sel.si_note)) + so_rdknl_lock(so); + knlist_remove(&so->so_rdsel.si_note, kn, 1); + if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_rcv); + so_rdknl_unlock(so); } /*ARGSUSED*/ @@ -3381,11 +3597,13 @@ filt_soread(struct knote *kn, long hint) struct socket *so; so = kn->kn_fp->f_data; - if (so->so_options & SO_ACCEPTCONN) { - kn->kn_data = so->so_qlen; - return (!TAILQ_EMPTY(&so->so_comp)); + if (SOLISTENING(so)) { + SOCK_LOCK_ASSERT(so); + kn->kn_data = so->sol_qlen; + return (!TAILQ_EMPTY(&so->sol_comp)); } + SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; @@ -3411,11 +3629,11 @@ filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; - SOCKBUF_LOCK(&so->so_snd); - knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_snd.sb_sel.si_note)) + so_wrknl_lock(so); + knlist_remove(&so->so_wrsel.si_note, kn, 1); + if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_snd); + so_wrknl_unlock(so); } /*ARGSUSED*/ @@ -3425,6 +3643,10 @@ filt_sowrite(struct knote *kn, long hint) struct socket *so; so = kn->kn_fp->f_data; + + if (SOLISTENING(so)) + return (0); + SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); @@ -3451,6 +3673,10 @@ filt_soempty(struct knote *kn, long hint) struct socket *so; so = kn->kn_fp->f_data; + + if (SOLISTENING(so)) + return (1); + SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); @@ -3521,42 +3747,52 @@ soisconnected(struct socket *so) struct socket *head; int ret; + /* + * XXXGL: this is the only place where we acquire socket locks + * in reverse order: first child, then listening socket. To + * avoid possible LOR, use try semantics. + */ restart: - ACCEPT_LOCK(); SOCK_LOCK(so); + if ((head = so->so_listen) != NULL && + __predict_false(SOLISTEN_TRYLOCK(head) == 0)) { + SOCK_UNLOCK(so); + goto restart; + } so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; - head = so->so_head; - if (head != NULL && (so->so_qstate & SQ_INCOMP)) { + if (head != NULL && (so->so_qstate == SQ_INCOMP)) { +again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { + TAILQ_REMOVE(&head->sol_incomp, so, so_list); + head->sol_incqlen--; + TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); + head->sol_qlen++; + so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - head->so_qlen++; - so->so_qstate |= SQ_COMP; - ACCEPT_UNLOCK(); - sorwakeup(head); - wakeup_one(&head->so_timeo); + solisten_wakeup(head); /* unlocks */ } else { - ACCEPT_UNLOCK(); + SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, - head->so_accf->so_accept_filter->accf_callback, - head->so_accf->so_accept_filter_arg); + head->sol_accept_filter->accf_callback, + head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; - ret = head->so_accf->so_accept_filter->accf_callback(so, - head->so_accf->so_accept_filter_arg, M_NOWAIT); - if (ret == SU_ISCONNECTED) + ret = head->sol_accept_filter->accf_callback(so, + head->sol_accept_filter_arg, M_NOWAIT); + if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); + SOCKBUF_UNLOCK(&so->so_rcv); + goto again; + } + SOCKBUF_UNLOCK(&so->so_rcv); SOCK_UNLOCK(so); - if (ret == SU_ISCONNECTED) - goto restart; + SOLISTEN_UNLOCK(head); } return; } + if (head != NULL) + SOLISTEN_UNLOCK(head); SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); @@ -3566,16 +3802,17 @@ void soisdisconnecting(struct socket *so) { - /* - * Note: This code assumes that SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) are the same. - */ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; - socantrcvmore_locked(so); - SOCKBUF_LOCK(&so->so_snd); - socantsendmore_locked(so); + + if (!SOLISTENING(so)) { + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); + socantsendmore_locked(so); + } + SOCK_UNLOCK(so); wakeup(&so->so_timeo); } @@ -3583,17 +3820,18 @@ void soisdisconnected(struct socket *so) { - /* - * Note: This code assumes that SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) are the same. - */ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; - socantrcvmore_locked(so); - SOCKBUF_LOCK(&so->so_snd); - sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); - socantsendmore_locked(so); + + if (!SOLISTENING(so)) { + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); + sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); + socantsendmore_locked(so); + } + SOCK_UNLOCK(so); wakeup(&so->so_timeo); } @@ -3615,11 +3853,12 @@ sodupsockaddr(const struct sockaddr *sa, int mflags) * Register per-socket buffer upcalls. */ void -soupcall_set(struct socket *so, int which, - int (*func)(struct socket *, void *, int), void *arg) +soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg) { struct sockbuf *sb; + KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); + switch (which) { case SO_RCV: sb = &so->so_rcv; @@ -3631,10 +3870,6 @@ soupcall_set(struct socket *so, int which, panic("soupcall_set: bad which"); } SOCKBUF_LOCK_ASSERT(sb); -#if 0 - /* XXX: accf_http actually wants to do this on purpose. */ - KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall")); -#endif sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; @@ -3645,6 +3880,8 @@ soupcall_clear(struct socket *so, int which) { struct sockbuf *sb; + KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); + switch (which) { case SO_RCV: sb = &so->so_rcv; @@ -3656,12 +3893,110 @@ soupcall_clear(struct socket *so, int which) panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); - KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear")); + KASSERT(sb->sb_upcall != NULL, + ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } +void +solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) +{ + + SOLISTEN_LOCK_ASSERT(so); + so->sol_upcall = func; + so->sol_upcallarg = arg; +} + +static void +so_rdknl_lock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK(so); + else + SOCKBUF_LOCK(&so->so_rcv); +} + +static void +so_rdknl_unlock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); +} + +static void +so_rdknl_assert_locked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK_ASSERT(so); + else + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +} + +static void +so_rdknl_assert_unlocked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK_ASSERT(so); + else + SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); +} + +static void +so_wrknl_lock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK(so); + else + SOCKBUF_LOCK(&so->so_snd); +} + +static void +so_wrknl_unlock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK(so); + else + SOCKBUF_UNLOCK(&so->so_snd); +} + +static void +so_wrknl_assert_locked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK_ASSERT(so); + else + SOCKBUF_LOCK_ASSERT(&so->so_snd); +} + +static void +so_wrknl_assert_unlocked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK_ASSERT(so); + else + SOCKBUF_UNLOCK_ASSERT(&so->so_snd); +} + /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to @@ -3683,36 +4018,28 @@ sotoxsocket(struct socket *so, struct xsocket *xso) xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; - xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; - xso->so_oobmark = so->so_oobmark; - sbtoxsockbuf(&so->so_snd, &xso->so_snd); - sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); #ifndef __rtems__ xso->so_uid = so->so_cred->cr_uid; #else /* __rtems__ */ xso->so_uid = BSD_DEFAULT_UID; #endif /* __rtems__ */ -} - - -/* - * Socket accessor functions to provide external consumers with - * a safe interface to socket state - * - */ - -void -so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), - void *arg) -{ - - TAILQ_FOREACH(so, &so->so_comp, so_list) - func(so, arg); + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + if (SOLISTENING(so)) { + xso->so_qlen = so->sol_qlen; + xso->so_incqlen = so->sol_incqlen; + xso->so_qlimit = so->sol_qlimit; + xso->so_oobmark = 0; + bzero(&xso->so_snd, sizeof(xso->so_snd)); + bzero(&xso->so_rcv, sizeof(xso->so_rcv)); + } else { + xso->so_state |= so->so_qstate; + xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + } } struct sockbuf * diff --git a/freebsd/sys/kern/uipc_syscalls.c b/freebsd/sys/kern/uipc_syscalls.c index f301c12c..5a9a381f 100644 --- a/freebsd/sys/kern/uipc_syscalls.c +++ b/freebsd/sys/kern/uipc_syscalls.c @@ -70,13 +70,6 @@ __FBSDID("$FreeBSD$"); #include <security/audit/audit.h> #include <security/mac/mac_framework.h> -/* - * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC - * and SOCK_NONBLOCK. - */ -#define ACCEPT4_INHERIT 0x1 -#define ACCEPT4_COMPAT 0x2 - static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); @@ -524,59 +517,22 @@ kern_accept4(struct thread *td, int s, struct sockaddr **name, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; - ACCEPT_LOCK(); - if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { - ACCEPT_UNLOCK(); - error = EWOULDBLOCK; - goto noconnection; - } - while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { - if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { - head->so_error = ECONNABORTED; - break; - } - error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, - "accept", 0); - if (error != 0) { - ACCEPT_UNLOCK(); - goto noconnection; - } - } - if (head->so_error) { - error = head->so_error; - head->so_error = 0; - ACCEPT_UNLOCK(); + SOCK_LOCK(head); + if (!SOLISTENING(head)) { + SOCK_UNLOCK(head); + error = EINVAL; goto noconnection; } - so = TAILQ_FIRST(&head->so_comp); - KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - if (flags & ACCEPT4_INHERIT) - so->so_state |= (head->so_state & SS_NBIO); - else - so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); + error = solisten_dequeue(head, &so, flags); + if (error != 0) + goto noconnection; /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; - /* connection has been removed from the listen queue */ - KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); + /* Connection has been removed from the listen queue. */ + KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); @@ -594,7 +550,6 @@ kern_accept4(struct thread *td, int s, struct sockaddr **name, (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); - sa = NULL; error = soaccept(so, &sa); if (error != 0) goto noconnection; @@ -769,7 +724,7 @@ kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { - error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, + error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH, "connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) diff --git a/freebsd/sys/kern/uipc_usrreq.c b/freebsd/sys/kern/uipc_usrreq.c index 8e60f227..7237956a 100644 --- a/freebsd/sys/kern/uipc_usrreq.c +++ b/freebsd/sys/kern/uipc_usrreq.c @@ -202,10 +202,9 @@ SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD, /* * Locking and synchronization: * - * Three types of locks exit in the local domain socket implementation: a - * global list mutex, a global linkage rwlock, and per-unpcb mutexes. Of the - * global locks, the list lock protects the socket count, global generation - * number, and stream/datagram global lists. The linkage lock protects the + * Two types of locks exist in the local domain socket implementation: a + * a global linkage rwlock and per-unpcb mutexes. The linkage lock protects + * the socket count, global generation number, stream/datagram global lists and * interconnection of unpcbs, the v_socket and unp_vnode pointers, and can be * held exclusively over the acquisition of multiple unpcb locks to prevent * deadlock. @@ -246,7 +245,6 @@ SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD, * to perform namei() and other file system operations. */ static struct rwlock unp_link_rwlock; -static struct mtx unp_list_lock; static struct mtx unp_defers_lock; #define UNP_LINK_LOCK_INIT() rw_init(&unp_link_rwlock, \ @@ -263,11 +261,7 @@ static struct mtx unp_defers_lock; #define UNP_LINK_WUNLOCK() rw_wunlock(&unp_link_rwlock) #define UNP_LINK_WLOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_WLOCKED) - -#define UNP_LIST_LOCK_INIT() mtx_init(&unp_list_lock, \ - "unp_list_lock", NULL, MTX_DEF) -#define UNP_LIST_LOCK() mtx_lock(&unp_list_lock) -#define UNP_LIST_UNLOCK() mtx_unlock(&unp_list_lock) +#define UNP_LINK_WOWNED() rw_wowned(&unp_link_rwlock) #define UNP_DEFERRED_LOCK_INIT() mtx_init(&unp_defers_lock, \ "unp_defer", NULL, MTX_DEF) @@ -417,6 +411,7 @@ uipc_attach(struct socket *so, int proto, struct thread *td) u_long sendspace, recvspace; struct unpcb *unp; int error; + bool locked; KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { @@ -451,10 +446,12 @@ uipc_attach(struct socket *so, int proto, struct thread *td) unp->unp_socket = so; so->so_pcb = unp; unp->unp_refcount = 1; - if (so->so_head != NULL) + if (so->so_listen != NULL) unp->unp_flags |= UNP_NASCENT; - UNP_LIST_LOCK(); + if ((locked = UNP_LINK_WOWNED()) == false) + UNP_LINK_WLOCK(); + unp->unp_gencnt = ++unp_gencnt; unp_count++; switch (so->so_type) { @@ -473,7 +470,9 @@ uipc_attach(struct socket *so, int proto, struct thread *td) default: panic("uipc_attach"); } - UNP_LIST_UNLOCK(); + + if (locked == false) + UNP_LINK_WUNLOCK(); return (0); } @@ -516,6 +515,14 @@ static const IMFS_node_control rtems_uipc_imfs_control = static const IMFS_node_control rtems_uipc_imfs_zombi_control = IMFS_GENERIC_INITIALIZER(&rtems_filesystem_handlers_default, NULL, IMFS_node_destroy_default); + +static void +VOP_UNP_DETACH(IMFS_generic_t *vp) +{ + + vp->Node.control = &rtems_uipc_imfs_zombi_control; + vp->context = NULL; +} #endif /* __rtems__ */ static int uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) @@ -630,7 +637,7 @@ restart: UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); #ifndef __rtems__ - VOP_UNP_BIND(vp, unp->unp_socket); + VOP_UNP_BIND(vp, unp); unp->unp_vnode = vp; #endif /* __rtems__ */ unp->unp_addr = soun; @@ -690,6 +697,11 @@ static void uipc_close(struct socket *so) { struct unpcb *unp, *unp2; +#ifndef __rtems__ + struct vnode *vp; +#else /* __rtems__ */ + IMFS_generic_t *vp; +#endif /* __rtems__ */ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_close: unp == NULL")); @@ -702,8 +714,16 @@ uipc_close(struct socket *so) unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } + if (SOLISTENING(so) && ((vp = unp->unp_vnode) != NULL)) { + VOP_UNP_DETACH(vp); + unp->unp_vnode = NULL; + } UNP_PCB_UNLOCK(unp); UNP_LINK_WUNLOCK(); +#ifndef __rtems__ + if (vp) + vrele(vp); +#endif /* __rtems__ */ } static int @@ -747,29 +767,16 @@ uipc_detach(struct socket *so) local_unp_rights = 0; #endif /* __rtems__ */ - UNP_LIST_LOCK(); + UNP_LINK_WLOCK(); LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; --unp_count; - UNP_LIST_UNLOCK(); - - if ((unp->unp_flags & UNP_NASCENT) != 0) { - UNP_PCB_LOCK(unp); - goto teardown; - } - UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); + if ((unp->unp_flags & UNP_NASCENT) != 0) + goto teardown; - /* - * XXXRW: Should assert vp->v_socket == so. - */ if ((vp = unp->unp_vnode) != NULL) { -#ifndef __rtems__ VOP_UNP_DETACH(vp); -#else /* __rtems__ */ - vp->Node.control = &rtems_uipc_imfs_zombi_control; - vp->context = NULL; -#endif /* __rtems__ */ unp->unp_vnode = NULL; } unp2 = unp->unp_conn; @@ -793,8 +800,8 @@ uipc_detach(struct socket *so) #ifndef __rtems__ local_unp_rights = unp_rights; #endif /* __rtems__ */ - UNP_LINK_WUNLOCK(); teardown: + UNP_LINK_WUNLOCK(); unp->unp_socket->so_pcb = NULL; saved_unp_addr = unp->unp_addr; unp->unp_addr = NULL; @@ -860,7 +867,6 @@ uipc_listen(struct socket *so, int backlog, struct thread *td) error = solisten_proto_check(so); if (error == 0) { cru2x(td->td_ucred, &unp->unp_peercred); - unp->unp_flags |= UNP_HAVEPCCACHED; solisten_proto(so, backlog); } SOCK_UNLOCK(so); @@ -1439,7 +1445,7 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam, #else /* __rtems__ */ struct IMFS_jnode_tt *vp; #endif /* __rtems__ */ - struct socket *so2, *so3; + struct socket *so2; struct unpcb *unp, *unp2, *unp3; #ifndef __rtems__ struct nameidata nd; @@ -1450,7 +1456,9 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam, const rtems_filesystem_location_info_t *currentloc; #endif /* __rtems__ */ struct sockaddr *sa; +#ifndef __rtems__ cap_rights_t rights; +#endif /* __rtems__ */ int error, len; if (nam->sa_family != AF_UNIX) @@ -1535,34 +1543,38 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam, */ UNP_LINK_WLOCK(); #ifndef __rtems__ - VOP_UNP_CONNECT(vp, &so2); + VOP_UNP_CONNECT(vp, &unp2); + if (unp2 == NULL) { + error = ECONNREFUSED; + goto bad2; + } + so2 = unp2->unp_socket; #else /* __rtems__ */ so2 = IMFS_generic_get_context_by_node(vp); -#endif /* __rtems__ */ if (so2 == NULL) { error = ECONNREFUSED; goto bad2; } + unp2 = sotounpcb(so2); +#endif /* __rtems__ */ if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad2; } + UNP_PCB_LOCK(unp); + UNP_PCB_LOCK(unp2); if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if (so2->so_options & SO_ACCEPTCONN) { CURVNET_SET(so2->so_vnet); - so3 = sonewconn(so2, 0); + so2 = sonewconn(so2, 0); CURVNET_RESTORE(); } else - so3 = NULL; - if (so3 == NULL) { + so2 = NULL; + if (so2 == NULL) { error = ECONNREFUSED; - goto bad2; + goto bad3; } - unp = sotounpcb(so); - unp2 = sotounpcb(so2); - unp3 = sotounpcb(so3); - UNP_PCB_LOCK(unp); - UNP_PCB_LOCK(unp2); + unp3 = sotounpcb(so2); UNP_PCB_LOCK(unp3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); @@ -1583,30 +1595,24 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam, * listen(); uipc_listen() cached that process's credentials * at that time so we can use them now. */ - KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, - ("unp_connect: listener without cached peercred")); memcpy(&unp->unp_peercred, &unp2->unp_peercred, sizeof(unp->unp_peercred)); unp->unp_flags |= UNP_HAVEPC; if (unp2->unp_flags & UNP_WANTCRED) unp3->unp_flags |= UNP_WANTCRED; - UNP_PCB_UNLOCK(unp3); UNP_PCB_UNLOCK(unp2); - UNP_PCB_UNLOCK(unp); + unp2 = unp3; #ifdef MAC - mac_socketpeer_set_from_socket(so, so3); - mac_socketpeer_set_from_socket(so3, so); + mac_socketpeer_set_from_socket(so, so2); + mac_socketpeer_set_from_socket(so2, so); #endif - - so2 = so3; } - unp = sotounpcb(so); - KASSERT(unp != NULL, ("unp_connect: unp == NULL")); - unp2 = sotounpcb(so2); - KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL")); - UNP_PCB_LOCK(unp); - UNP_PCB_LOCK(unp2); + + KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 && + sotounpcb(so2) == unp2, + ("%s: unp2 %p so2 %p", __func__, unp2, so2)); error = unp_connect2(so, so2, PRU_CONNECT); +bad3: UNP_PCB_UNLOCK(unp2); UNP_PCB_UNLOCK(unp); bad2: @@ -1750,10 +1756,10 @@ unp_pcblist(SYSCTL_HANDLER_ARGS) * OK, now we're committed to doing something. */ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); - UNP_LIST_LOCK(); + UNP_LINK_RLOCK(); gencnt = unp_gencnt; n = unp_count; - UNP_LIST_UNLOCK(); + UNP_LINK_RUNLOCK(); xug->xug_len = sizeof *xug; xug->xug_count = n; @@ -1767,7 +1773,7 @@ unp_pcblist(SYSCTL_HANDLER_ARGS) unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); - UNP_LIST_LOCK(); + UNP_LINK_RLOCK(); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { UNP_PCB_LOCK(unp); @@ -1782,7 +1788,7 @@ unp_pcblist(SYSCTL_HANDLER_ARGS) } UNP_PCB_UNLOCK(unp); } - UNP_LIST_UNLOCK(); + UNP_LINK_RUNLOCK(); n = i; /* In case we lost some during malloc. */ error = 0; @@ -2044,7 +2050,6 @@ unp_init(void) TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL); #endif /* __rtems__ */ UNP_LINK_LOCK_INIT(); - UNP_LIST_LOCK_INIT(); UNP_DEFERRED_LOCK_INIT(); } @@ -2396,8 +2401,7 @@ unp_accessable(struct filedescent **fdep, int fdcount) static void unp_gc_process(struct unpcb *unp) { - struct socket *soa; - struct socket *so; + struct socket *so, *soa; struct file *fp; /* Already processed. */ @@ -2417,28 +2421,30 @@ unp_gc_process(struct unpcb *unp) return; } - /* - * Mark all sockets we reference with RIGHTS. - */ so = unp->unp_socket; - if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { - SOCKBUF_LOCK(&so->so_rcv); - unp_scan(so->so_rcv.sb_mb, unp_accessable); - SOCKBUF_UNLOCK(&so->so_rcv); - } - - /* - * Mark all sockets in our accept queue. - */ - ACCEPT_LOCK(); - TAILQ_FOREACH(soa, &so->so_comp, so_list) { - if ((sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) != 0) - continue; - SOCKBUF_LOCK(&soa->so_rcv); - unp_scan(soa->so_rcv.sb_mb, unp_accessable); - SOCKBUF_UNLOCK(&soa->so_rcv); + SOCK_LOCK(so); + if (SOLISTENING(so)) { + /* + * Mark all sockets in our accept queue. + */ + TAILQ_FOREACH(soa, &so->sol_comp, so_list) { + if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) + continue; + SOCKBUF_LOCK(&soa->so_rcv); + unp_scan(soa->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&soa->so_rcv); + } + } else { + /* + * Mark all sockets we reference with RIGHTS. + */ + if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { + SOCKBUF_LOCK(&so->so_rcv); + unp_scan(so->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&so->so_rcv); + } } - ACCEPT_UNLOCK(); + SOCK_UNLOCK(so); unp->unp_gcflag |= UNPGC_SCANNED; } @@ -2461,7 +2467,7 @@ unp_gc(__unused void *arg, int pending) int i, total; unp_taskcount++; - UNP_LIST_LOCK(); + UNP_LINK_RLOCK(); /* * First clear all gc flags from previous runs, apart from * UNPGC_IGNORE_RIGHTS. @@ -2484,7 +2490,7 @@ unp_gc(__unused void *arg, int pending) LIST_FOREACH(unp, *head, unp_link) unp_gc_process(unp); } while (unp_marked); - UNP_LIST_UNLOCK(); + UNP_LINK_RUNLOCK(); if (unp_unreachable == 0) return; @@ -2499,7 +2505,6 @@ unp_gc(__unused void *arg, int pending) * as as unreachable and store them locally. */ UNP_LINK_RLOCK(); - UNP_LIST_LOCK(); for (total = 0, head = heads; *head != NULL; head++) LIST_FOREACH(unp, *head, unp_link) if ((unp->unp_gcflag & UNPGC_DEAD) != 0) { @@ -2512,7 +2517,6 @@ unp_gc(__unused void *arg, int pending) KASSERT(total <= unp_unreachable, ("unp_gc: incorrect unreachable count.")); } - UNP_LIST_UNLOCK(); UNP_LINK_RUNLOCK(); /* @@ -2555,10 +2559,11 @@ unp_dispose(struct socket *so) struct unpcb *unp; unp = sotounpcb(so); - UNP_LIST_LOCK(); + UNP_LINK_WLOCK(); unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS; - UNP_LIST_UNLOCK(); - unp_dispose_mbuf(so->so_rcv.sb_mb); + UNP_LINK_WUNLOCK(); + if (!SOLISTENING(so)) + unp_dispose_mbuf(so->so_rcv.sb_mb); } static void @@ -2613,7 +2618,6 @@ unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int)) void vfs_unp_reclaim(struct vnode *vp) { - struct socket *so; struct unpcb *unp; int active; @@ -2623,10 +2627,7 @@ vfs_unp_reclaim(struct vnode *vp) active = 0; UNP_LINK_WLOCK(); - VOP_UNP_CONNECT(vp, &so); - if (so == NULL) - goto done; - unp = sotounpcb(so); + VOP_UNP_CONNECT(vp, &unp); if (unp == NULL) goto done; UNP_PCB_LOCK(unp); @@ -2663,10 +2664,6 @@ db_print_unpflags(int unp_flags) db_printf("%sUNP_HAVEPC", comma ? ", " : ""); comma = 1; } - if (unp_flags & UNP_HAVEPCCACHED) { - db_printf("%sUNP_HAVEPCCACHED", comma ? ", " : ""); - comma = 1; - } if (unp_flags & UNP_WANTCRED) { db_printf("%sUNP_WANTCRED", comma ? ", " : ""); comma = 1; |