From a5ddb0ea69f21c16b7697a935d7a0c16bb3cffcf Mon Sep 17 00:00:00 2001 From: Sebastian Huber Date: Tue, 24 Sep 2019 11:05:03 +0200 Subject: Update to FreeBSD head 2019-09-24 Git mirror commit 6b0307a0a5184339393f555d5d424190d8a8277a. --- freebsd/sys/kern/init_main.c | 25 +- freebsd/sys/kern/kern_conf.c | 2 +- freebsd/sys/kern/kern_event.c | 1 + freebsd/sys/kern/kern_intr.c | 37 +- freebsd/sys/kern/kern_mbuf.c | 470 +++++++++++++++++++++++- freebsd/sys/kern/kern_mib.c | 101 ++++-- freebsd/sys/kern/kern_mtxpool.c | 4 +- freebsd/sys/kern/kern_synch.c | 83 +++++ freebsd/sys/kern/kern_sysctl.c | 683 +++++++++++++++++++++++++++++++++-- freebsd/sys/kern/kern_time.c | 12 +- freebsd/sys/kern/kern_timeout.c | 57 ++- freebsd/sys/kern/kern_uuid.c | 2 +- freebsd/sys/kern/subr_blist.c | 344 +++++++++++------- freebsd/sys/kern/subr_bus.c | 135 ++++++- freebsd/sys/kern/subr_eventhandler.c | 1 + freebsd/sys/kern/subr_gtaskqueue.c | 101 +++--- freebsd/sys/kern/subr_kobj.c | 95 ++--- freebsd/sys/kern/subr_lock.c | 42 +-- freebsd/sys/kern/subr_pcpu.c | 12 +- freebsd/sys/kern/subr_prf.c | 45 +++ freebsd/sys/kern/subr_sbuf.c | 159 +++++--- freebsd/sys/kern/subr_sleepqueue.c | 55 ++- freebsd/sys/kern/subr_taskqueue.c | 2 +- freebsd/sys/kern/sys_generic.c | 6 +- freebsd/sys/kern/sys_pipe.c | 95 ++--- freebsd/sys/kern/tty.c | 8 +- freebsd/sys/kern/uipc_mbuf.c | 292 ++++++++++++++- freebsd/sys/kern/uipc_mbuf2.c | 2 +- freebsd/sys/kern/uipc_sockbuf.c | 139 ++++++- freebsd/sys/kern/uipc_socket.c | 136 +++++-- freebsd/sys/kern/uipc_syscalls.c | 45 ++- freebsd/sys/kern/uipc_usrreq.c | 67 +++- 32 files changed, 2708 insertions(+), 550 deletions(-) (limited to 'freebsd/sys/kern') diff --git a/freebsd/sys/kern/init_main.c b/freebsd/sys/kern/init_main.c index c6a9e310..2265d89a 100644 --- a/freebsd/sys/kern/init_main.c +++ b/freebsd/sys/kern/init_main.c @@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -110,6 +111,14 @@ struct thread0_storage thread0_st __aligned(32); struct vmspace vmspace0; struct proc *initproc; +int +linux_alloc_current_noop(struct thread *td __unused, int flags __unused) +{ + return (0); +} +int (*lkpi_alloc_current)(struct thread *, int) = linux_alloc_current_noop; + + #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif @@ -155,11 +164,6 @@ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; -EVENTHANDLER_LIST_DECLARE(process_init); -EVENTHANDLER_LIST_DECLARE(thread_init); -EVENTHANDLER_LIST_DECLARE(process_ctor); -EVENTHANDLER_LIST_DECLARE(thread_ctor); - /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. @@ -440,7 +444,6 @@ struct sysentvec null_sysvec = { .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, - .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, @@ -482,7 +485,7 @@ proc0_init(void *dummy __unused) GIANT_REQUIRED; p = &proc0; td = &thread0; - + /* * Initialize magic number and osrel. */ @@ -822,11 +825,9 @@ start_init(void *dummy) } /* - * Like kproc_create(), but runs in its own address space. - * We do this early to reserve pid 1. - * - * Note special case - do not make it runnable yet. Other work - * in progress will change this more. + * Like kproc_create(), but runs in its own address space. We do this + * early to reserve pid 1. 
Note special case - do not make it + * runnable yet, init execution is started when userspace can be served. */ static void create_init(const void *udata __unused) diff --git a/freebsd/sys/kern/kern_conf.c b/freebsd/sys/kern/kern_conf.c index 560a450a..26718648 100644 --- a/freebsd/sys/kern/kern_conf.c +++ b/freebsd/sys/kern/kern_conf.c @@ -656,7 +656,7 @@ prep_cdevsw(struct cdevsw *devsw, int flags) return (0); } - if (devsw->d_version != D_VERSION_03) { + if (devsw->d_version != D_VERSION_04) { printf( "WARNING: Device driver \"%s\" has wrong version %s\n", devsw->d_name == NULL ? "???" : devsw->d_name, diff --git a/freebsd/sys/kern/kern_event.c b/freebsd/sys/kern/kern_event.c index 5c75c657..f0700e55 100644 --- a/freebsd/sys/kern/kern_event.c +++ b/freebsd/sys/kern/kern_event.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include diff --git a/freebsd/sys/kern/kern_intr.c b/freebsd/sys/kern/kern_intr.c index 122e82bd..65b633f6 100644 --- a/freebsd/sys/kern/kern_intr.c +++ b/freebsd/sys/kern/kern_intr.c @@ -101,7 +101,7 @@ struct proc *intrproc; static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); -static int intr_storm_threshold = 1000; +static int intr_storm_threshold = 0; SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RWTUN, &intr_storm_threshold, 0, "Number of consecutive interrupts before storm protection is enabled"); @@ -231,10 +231,20 @@ intr_event_update(struct intr_event *ie) } /* - * If the handler names were too long, add +'s to indicate missing - * names. If we run out of room and still have +'s to add, change - * the last character from a + to a *. + * If there is only one handler and its name is too long, just copy in + * as much of the end of the name (includes the unit number) as will + * fit. Otherwise, we have multiple handlers and not all of the names + * will fit. Add +'s to indicate missing names. If we run out of room + * and still have +'s to add, change the last character from a + to a *. */ + if (missed == 1 && space == 1) { + ih = CK_SLIST_FIRST(&ie->ie_handlers); + missed = strlen(ie->ie_fullname) + strlen(ih->ih_name) + 2 - + sizeof(ie->ie_fullname); + strcat(ie->ie_fullname, (missed == 0) ? " " : "-"); + strcat(ie->ie_fullname, &ih->ih_name[missed]); + missed = 0; + } last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2]; while (missed-- > 0) { if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) { @@ -393,6 +403,25 @@ intr_event_bind_ithread(struct intr_event *ie, int cpu) return (_intr_event_bind(ie, cpu, false, true)); } +/* + * Bind an interrupt event's ithread to the specified cpuset. 
+ */ +int +intr_event_bind_ithread_cpuset(struct intr_event *ie, cpuset_t *cs) +{ + lwpid_t id; + + mtx_lock(&ie->ie_lock); + if (ie->ie_thread != NULL) { + id = ie->ie_thread->it_thread->td_tid; + mtx_unlock(&ie->ie_lock); + return (cpuset_setthread(id, cs)); + } else { + mtx_unlock(&ie->ie_lock); + } + return (ENODEV); +} + static struct intr_event * intr_lookup(int irq) { diff --git a/freebsd/sys/kern/kern_mbuf.c b/freebsd/sys/kern/kern_mbuf.c index f94eda5b..85846acd 100644 --- a/freebsd/sys/kern/kern_mbuf.c +++ b/freebsd/sys/kern/kern_mbuf.c @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include @@ -43,13 +44,20 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include +#include +#include #include +#include #include +#include +#include + #include #include #include @@ -112,11 +120,20 @@ int nmbjumbop; /* limits number of page size jumbo clusters */ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ +bool mb_use_ext_pgs; /* use EXT_PGS mbufs for sendfile & TLS */ +SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN, + &mb_use_ext_pgs, 0, + "Use unmapped mbufs for sendfile(2) and TLS offload"); + static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types"); +static counter_u64_t snd_tag_count; +SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW, + &snd_tag_count, "# of active mbuf send tags"); + /* * tunable_mbinit() has to be run before any mbuf allocations are done. */ @@ -285,6 +302,7 @@ uma_zone_t zone_pack; uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; +uma_zone_t zone_extpgs; /* * Local prototypes. @@ -302,6 +320,9 @@ static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); +_Static_assert(sizeof(struct mbuf_ext_pgs) == 256, + "mbuf_ext_pgs size mismatch"); + /* * Initialize FreeBSD Network buffer allocation. */ @@ -383,6 +404,15 @@ mbuf_init(void *dummy) uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); + zone_extpgs = uma_zcreate(MBUF_EXTPGS_MEM_NAME, + sizeof(struct mbuf_ext_pgs), +#ifdef INVARIANTS + trash_ctor, trash_dtor, trash_init, trash_fini, +#else + NULL, NULL, NULL, NULL, +#endif + UMA_ALIGN_CACHE, 0); + /* * Hook event handler for low-memory situation, used to * drain protocols and push data back to the caches (UMA @@ -392,6 +422,8 @@ mbuf_init(void *dummy) EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, EVENTHANDLER_PRI_FIRST); #endif /* __rtems__ */ + + snd_tag_count = counter_u64_alloc(M_WAITOK); } SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); @@ -697,14 +729,14 @@ mb_dtor_pack(void *mem, int size, void *arg) #endif /* * If there are processes blocked on zone_clust, waiting for pages - * to be freed up, * cause them to be woken up by draining the - * packet zone. We are exposed to a race here * (in the check for + * to be freed up, cause them to be woken up by draining the + * packet zone. We are exposed to a race here (in the check for * the UMA_ZFLAG_FULL) where we might miss the flag set, but that * is deliberate. We don't want to acquire the zone lock for every * mbuf free. 
*/ if (uma_zone_exhausted_nolock(zone_clust)) - zone_drain(zone_pack); + uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); } /* @@ -831,6 +863,384 @@ mb_reclaim(uma_zone_t zone __unused, int pending __unused) (*pr->pr_drain)(); } +/* + * Free "count" units of I/O from an mbuf chain. They could be held + * in EXT_PGS or just as a normal mbuf. This code is intended to be + * called in an error path (I/O error, closed connection, etc). + */ +void +mb_free_notready(struct mbuf *m, int count) +{ + int i; + + for (i = 0; i < count && m != NULL; i++) { +#ifndef __rtems__ + if ((m->m_flags & M_EXT) != 0 && + m->m_ext.ext_type == EXT_PGS) { + m->m_ext.ext_pgs->nrdy--; + if (m->m_ext.ext_pgs->nrdy != 0) + continue; + } +#endif /* __rtems__ */ + m = m_free(m); + } + KASSERT(i == count, ("Removed only %d items from %p", i, m)); +} + +#ifndef __rtems__ +/* + * Compress an unmapped mbuf into a simple mbuf when it holds a small + * amount of data. This is used as a DOS defense to avoid having + * small packets tie up wired pages, an ext_pgs structure, and an + * mbuf. Since this converts the existing mbuf in place, it can only + * be used if there are no other references to 'm'. + */ +int +mb_unmapped_compress(struct mbuf *m) +{ + volatile u_int *refcnt; + struct mbuf m_temp; + + /* + * Assert that 'm' does not have a packet header. If 'm' had + * a packet header, it would only be able to hold MHLEN bytes + * and m_data would have to be initialized differently. + */ + KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXT) && + m->m_ext.ext_type == EXT_PGS, + ("%s: m %p !M_EXT or !EXT_PGS or M_PKTHDR", __func__, m)); + KASSERT(m->m_len <= MLEN, ("m_len too large %p", m)); + + if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { + refcnt = &m->m_ext.ext_count; + } else { + KASSERT(m->m_ext.ext_cnt != NULL, + ("%s: no refcounting pointer on %p", __func__, m)); + refcnt = m->m_ext.ext_cnt; + } + + if (*refcnt != 1) + return (EBUSY); + + /* + * Copy mbuf header and m_ext portion of 'm' to 'm_temp' to + * create a "fake" EXT_PGS mbuf that can be used with + * m_copydata() as well as the ext_free callback. + */ + memcpy(&m_temp, m, offsetof(struct mbuf, m_ext) + sizeof (m->m_ext)); + m_temp.m_next = NULL; + m_temp.m_nextpkt = NULL; + + /* Turn 'm' into a "normal" mbuf. */ + m->m_flags &= ~(M_EXT | M_RDONLY | M_NOMAP); + m->m_data = m->m_dat; + + /* Copy data from template's ext_pgs. */ + m_copydata(&m_temp, 0, m_temp.m_len, mtod(m, caddr_t)); + + /* Free the backing pages. */ + m_temp.m_ext.ext_free(&m_temp); + + /* Finally, free the ext_pgs struct. */ + uma_zfree(zone_extpgs, m_temp.m_ext.ext_pgs); + return (0); +} + +/* + * These next few routines are used to permit downgrading an unmapped + * mbuf to a chain of mapped mbufs. This is used when an interface + * doesn't supported unmapped mbufs or if checksums need to be + * computed in software. + * + * Each unmapped mbuf is converted to a chain of mbufs. First, any + * TLS header data is stored in a regular mbuf. Second, each page of + * unmapped data is stored in an mbuf with an EXT_SFBUF external + * cluster. These mbufs use an sf_buf to provide a valid KVA for the + * associated physical page. They also hold a reference on the + * original EXT_PGS mbuf to ensure the physical page doesn't go away. + * Finally, any TLS trailer data is stored in a regular mbuf. + * + * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF + * mbufs. It frees the associated sf_buf and releases its reference + * on the original EXT_PGS mbuf. 
+ * + * _mb_unmapped_to_ext() is a helper function that converts a single + * unmapped mbuf into a chain of mbufs. + * + * mb_unmapped_to_ext() is the public function that walks an mbuf + * chain converting any unmapped mbufs to mapped mbufs. It returns + * the new chain of unmapped mbufs on success. On failure it frees + * the original mbuf chain and returns NULL. + */ +static void +mb_unmapped_free_mext(struct mbuf *m) +{ + struct sf_buf *sf; + struct mbuf *old_m; + + sf = m->m_ext.ext_arg1; + sf_buf_free(sf); + + /* Drop the reference on the backing EXT_PGS mbuf. */ + old_m = m->m_ext.ext_arg2; + mb_free_ext(old_m); +} + +static struct mbuf * +_mb_unmapped_to_ext(struct mbuf *m) +{ + struct mbuf_ext_pgs *ext_pgs; + struct mbuf *m_new, *top, *prev, *mref; + struct sf_buf *sf; + vm_page_t pg; + int i, len, off, pglen, pgoff, seglen, segoff; + volatile u_int *refcnt; + u_int ref_inc = 0; + + MBUF_EXT_PGS_ASSERT(m); + ext_pgs = m->m_ext.ext_pgs; + len = m->m_len; + KASSERT(ext_pgs->tls == NULL, ("%s: can't convert TLS mbuf %p", + __func__, m)); + + /* See if this is the mbuf that holds the embedded refcount. */ + if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { + refcnt = &m->m_ext.ext_count; + mref = m; + } else { + KASSERT(m->m_ext.ext_cnt != NULL, + ("%s: no refcounting pointer on %p", __func__, m)); + refcnt = m->m_ext.ext_cnt; + mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); + } + + /* Skip over any data removed from the front. */ + off = mtod(m, vm_offset_t); + + top = NULL; + if (ext_pgs->hdr_len != 0) { + if (off >= ext_pgs->hdr_len) { + off -= ext_pgs->hdr_len; + } else { + seglen = ext_pgs->hdr_len - off; + segoff = off; + seglen = min(seglen, len); + off = 0; + len -= seglen; + m_new = m_get(M_NOWAIT, MT_DATA); + if (m_new == NULL) + goto fail; + m_new->m_len = seglen; + prev = top = m_new; + memcpy(mtod(m_new, void *), &ext_pgs->hdr[segoff], + seglen); + } + } + pgoff = ext_pgs->first_pg_off; + for (i = 0; i < ext_pgs->npgs && len > 0; i++) { + pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff); + if (off >= pglen) { + off -= pglen; + pgoff = 0; + continue; + } + seglen = pglen - off; + segoff = pgoff + off; + off = 0; + seglen = min(seglen, len); + len -= seglen; + + pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); + m_new = m_get(M_NOWAIT, MT_DATA); + if (m_new == NULL) + goto fail; + if (top == NULL) { + top = prev = m_new; + } else { + prev->m_next = m_new; + prev = m_new; + } + sf = sf_buf_alloc(pg, SFB_NOWAIT); + if (sf == NULL) + goto fail; + + ref_inc++; + m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE, + mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF); + m_new->m_data += segoff; + m_new->m_len = seglen; + + pgoff = 0; + }; + if (len != 0) { + KASSERT((off + len) <= ext_pgs->trail_len, + ("off + len > trail (%d + %d > %d)", off, len, + ext_pgs->trail_len)); + m_new = m_get(M_NOWAIT, MT_DATA); + if (m_new == NULL) + goto fail; + if (top == NULL) + top = m_new; + else + prev->m_next = m_new; + m_new->m_len = len; + memcpy(mtod(m_new, void *), &ext_pgs->trail[off], len); + } + + if (ref_inc != 0) { + /* + * Obtain an additional reference on the old mbuf for + * each created EXT_SFBUF mbuf. They will be dropped + * in mb_unmapped_free_mext(). + */ + if (*refcnt == 1) + *refcnt += ref_inc; + else + atomic_add_int(refcnt, ref_inc); + } + m_free(m); + return (top); + +fail: + if (ref_inc != 0) { + /* + * Obtain an additional reference on the old mbuf for + * each created EXT_SFBUF mbuf. They will be + * immediately dropped when these mbufs are freed + * below. 
+ */ + if (*refcnt == 1) + *refcnt += ref_inc; + else + atomic_add_int(refcnt, ref_inc); + } + m_free(m); + m_freem(top); + return (NULL); +} + +struct mbuf * +mb_unmapped_to_ext(struct mbuf *top) +{ + struct mbuf *m, *next, *prev = NULL; + + prev = NULL; + for (m = top; m != NULL; m = next) { + /* m might be freed, so cache the next pointer. */ + next = m->m_next; + if (m->m_flags & M_NOMAP) { + if (prev != NULL) { + /* + * Remove 'm' from the new chain so + * that the 'top' chain terminates + * before 'm' in case 'top' is freed + * due to an error. + */ + prev->m_next = NULL; + } + m = _mb_unmapped_to_ext(m); + if (m == NULL) { + m_freem(top); + m_freem(next); + return (NULL); + } + if (prev == NULL) { + top = m; + } else { + prev->m_next = m; + } + + /* + * Replaced one mbuf with a chain, so we must + * find the end of chain. + */ + prev = m_last(m); + } else { + if (prev != NULL) { + prev->m_next = m; + } + prev = m; + } + } + return (top); +} + +/* + * Allocate an empty EXT_PGS mbuf. The ext_free routine is + * responsible for freeing any pages backing this mbuf when it is + * freed. + */ +struct mbuf * +mb_alloc_ext_pgs(int how, bool pkthdr, m_ext_free_t ext_free) +{ + struct mbuf *m; + struct mbuf_ext_pgs *ext_pgs; + + if (pkthdr) + m = m_gethdr(how, MT_DATA); + else + m = m_get(how, MT_DATA); + if (m == NULL) + return (NULL); + + ext_pgs = uma_zalloc(zone_extpgs, how); + if (ext_pgs == NULL) { + m_free(m); + return (NULL); + } + ext_pgs->npgs = 0; + ext_pgs->nrdy = 0; + ext_pgs->first_pg_off = 0; + ext_pgs->last_pg_len = 0; + ext_pgs->hdr_len = 0; + ext_pgs->trail_len = 0; + ext_pgs->tls = NULL; + ext_pgs->so = NULL; + m->m_data = NULL; + m->m_flags |= (M_EXT | M_RDONLY | M_NOMAP); + m->m_ext.ext_type = EXT_PGS; + m->m_ext.ext_flags = EXT_FLAG_EMBREF; + m->m_ext.ext_count = 1; + m->m_ext.ext_pgs = ext_pgs; + m->m_ext.ext_size = 0; + m->m_ext.ext_free = ext_free; + return (m); +} + +#ifdef INVARIANT_SUPPORT +void +mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs) +{ + + /* + * NB: This expects a non-empty buffer (npgs > 0 and + * last_pg_len > 0). + */ + KASSERT(ext_pgs->npgs > 0, + ("ext_pgs with no valid pages: %p", ext_pgs)); + KASSERT(ext_pgs->npgs <= nitems(ext_pgs->pa), + ("ext_pgs with too many pages: %p", ext_pgs)); + KASSERT(ext_pgs->nrdy <= ext_pgs->npgs, + ("ext_pgs with too many ready pages: %p", ext_pgs)); + KASSERT(ext_pgs->first_pg_off < PAGE_SIZE, + ("ext_pgs with too large page offset: %p", ext_pgs)); + KASSERT(ext_pgs->last_pg_len > 0, + ("ext_pgs with zero last page length: %p", ext_pgs)); + KASSERT(ext_pgs->last_pg_len <= PAGE_SIZE, + ("ext_pgs with too large last page length: %p", ext_pgs)); + if (ext_pgs->npgs == 1) { + KASSERT(ext_pgs->first_pg_off + ext_pgs->last_pg_len <= + PAGE_SIZE, ("ext_pgs with single page too large: %p", + ext_pgs)); + } + KASSERT(ext_pgs->hdr_len <= sizeof(ext_pgs->hdr), + ("ext_pgs with too large header length: %p", ext_pgs)); + KASSERT(ext_pgs->trail_len <= sizeof(ext_pgs->trail), + ("ext_pgs with too large header length: %p", ext_pgs)); +} +#endif +#endif /* __rtems__ */ + /* * Clean up after mbufs with M_EXT storage attached to them if the * reference count hits 1. 
@@ -865,7 +1275,8 @@ mb_free_ext(struct mbuf *m) */ if (m->m_flags & M_NOFREE) { freembuf = 0; - KASSERT(m->m_ext.ext_type == EXT_EXTREF, + KASSERT(m->m_ext.ext_type == EXT_EXTREF || + m->m_ext.ext_type == EXT_RXRING, ("%s: no-free mbuf %p has wrong type", __func__, m)); } else freembuf = 1; @@ -896,6 +1307,27 @@ mb_free_ext(struct mbuf *m) uma_zfree(zone_mbuf, mref); break; #ifndef __rtems__ + case EXT_PGS: { +#ifdef KERN_TLS + struct mbuf_ext_pgs *pgs; + struct ktls_session *tls; +#endif + + KASSERT(mref->m_ext.ext_free != NULL, + ("%s: ext_free not set", __func__)); + mref->m_ext.ext_free(mref); +#ifdef KERN_TLS + pgs = mref->m_ext.ext_pgs; + tls = pgs->tls; + if (tls != NULL && + !refcount_release_if_not_last(&tls->refcount)) + ktls_enqueue_to_free(pgs); + else +#endif + uma_zfree(zone_extpgs, mref->m_ext.ext_pgs); + uma_zfree(zone_mbuf, mref); + break; + } case EXT_SFBUF: #endif /* __rtems__ */ case EXT_NET_DRV: @@ -911,6 +1343,10 @@ mb_free_ext(struct mbuf *m) ("%s: ext_free not set", __func__)); m->m_ext.ext_free(m); break; + case EXT_RXRING: + KASSERT(m->m_ext.ext_free == NULL, + ("%s: ext_free is set", __func__)); + break; default: KASSERT(m->m_ext.ext_type == 0, ("%s: unknown ext_type", __func__)); @@ -950,7 +1386,7 @@ m_clget(struct mbuf *m, int how) * we might be able to loosen a few clusters up on the drain. */ if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { - zone_drain(zone_pack); + uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN); uma_zalloc_arg(zone_clust, m, how); } MBUF_PROBE2(m__clget, m, how); @@ -1051,8 +1487,7 @@ m_getjcl(int how, short type, int flags, int size) * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain - * to the existing one but still return the top of the newly allocated - * chain. + * to the existing one and return a pointer to the provided mbuf. */ struct mbuf * m_getm2(struct mbuf *m, int len, int how, short type, int flags) @@ -1165,3 +1600,24 @@ m_freem(struct mbuf *mb) while (mb != NULL) mb = m_free(mb); } + +void +m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp) +{ + + if_ref(ifp); + mst->ifp = ifp; + refcount_init(&mst->refcount, 1); + counter_u64_add(snd_tag_count, 1); +} + +void +m_snd_tag_destroy(struct m_snd_tag *mst) +{ + struct ifnet *ifp; + + ifp = mst->ifp; + ifp->if_snd_tag_free(mst); + if_rele(ifp); + counter_u64_add(snd_tag_count, -1); +} diff --git a/freebsd/sys/kern/kern_mib.c b/freebsd/sys/kern/kern_mib.c index 52aa32fb..cd9e6285 100644 --- a/freebsd/sys/kern/kern_mib.c +++ b/freebsd/sys/kern/kern_mib.c @@ -46,8 +46,10 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include +#include #include #include #include @@ -147,7 +149,7 @@ SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 0, "Whether saved set-group/user ID is available"); #endif -char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ +char kernelname[MAXPATHLEN] = PATH_KERNEL; /* XXX bloat */ SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW | CTLFLAG_MPSAFE, kernelname, sizeof kernelname, "Name of kernel file booted"); @@ -170,15 +172,8 @@ sysctl_kern_arnd(SYSCTL_HANDLER_ARGS) char buf[256]; size_t len; - /*- - * This is one of the very few legitimate uses of read_random(9). - * Use of arc4random(9) is not recommended as that will ignore - * an unsafe (i.e. unseeded) random(4). 
- * - * If random(4) is not seeded, then this returns 0, so the - * sysctl will return a zero-length buffer. - */ - len = read_random(buf, MIN(req->oldlen, sizeof(buf))); + len = MIN(req->oldlen, sizeof(buf)); + read_random(buf, len); return (SYSCTL_OUT(req, buf, len)); } @@ -189,37 +184,51 @@ SYSCTL_PROC(_kern, KERN_ARND, arandom, static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { - u_long val; + u_long val, p; - val = ctob(physmem); + p = SIZE_T_MAX >> PAGE_SHIFT; + if (physmem < p) + p = physmem; + val = ctob(p); return (sysctl_handle_long(oidp, &val, 0, req)); } - SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG | CTLFLAG_RD, - 0, 0, sysctl_hw_physmem, "LU", ""); + 0, 0, sysctl_hw_physmem, "LU", + "Amount of physical memory (in bytes)"); static int sysctl_hw_realmem(SYSCTL_HANDLER_ARGS) { - u_long val; - val = ctob(realmem); + u_long val, p; + + p = SIZE_T_MAX >> PAGE_SHIFT; + if (realmem < p) + p = realmem; + val = ctob(p); return (sysctl_handle_long(oidp, &val, 0, req)); } SYSCTL_PROC(_hw, HW_REALMEM, realmem, CTLTYPE_ULONG | CTLFLAG_RD, - 0, 0, sysctl_hw_realmem, "LU", ""); + 0, 0, sysctl_hw_realmem, "LU", + "Amount of memory (in bytes) reported by the firmware"); + static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { - u_long val; + u_long val, p, p1; - val = ctob(physmem - vm_wire_count()); + p1 = physmem - vm_wire_count(); + p = SIZE_T_MAX >> PAGE_SHIFT; + if (p1 < p) + p = p1; + val = ctob(p); return (sysctl_handle_long(oidp, &val, 0, req)); } - SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG | CTLFLAG_RD, - 0, 0, sysctl_hw_usermem, "LU", ""); + 0, 0, sysctl_hw_usermem, "LU", + "Amount of memory (in bytes) which is not wired"); -SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0, ""); +SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0, + "Amount of physical memory (in pages)"); u_long pagesizes[MAXPAGESIZES] = { PAGE_SIZE }; @@ -501,6 +510,54 @@ sysctl_osreldate(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_kern, KERN_OSRELDATE, osreldate, CTLTYPE_INT | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_osreldate, "I", "Kernel release date"); + +/* + * The build-id is copied from the ELF section .note.gnu.build-id. The linker + * script defines two variables to expose the beginning and end. LLVM + * currently uses a SHA-1 hash, but other formats can be supported by checking + * the length of the section. + */ + +extern char __build_id_start[]; +extern char __build_id_end[]; + +#define BUILD_ID_HEADER_LEN 0x10 +#define BUILD_ID_HASH_MAXLEN 0x14 + +static int +sysctl_build_id(SYSCTL_HANDLER_ARGS) +{ + uintptr_t sectionlen = (uintptr_t)(__build_id_end - __build_id_start); + int hashlen; + char buf[2*BUILD_ID_HASH_MAXLEN+1]; + + /* + * The ELF note section has a four byte length for the vendor name, + * four byte length for the value, and a four byte vendor specific + * type. The name for the build id is "GNU\0". We skip the first 16 + * bytes to read the build hash. We will return the remaining bytes up + * to 20 (SHA-1) hash size. If the hash happens to be a custom number + * of bytes we will pad the value with zeros, as the section should be + * four byte aligned. 
+ */ + if (sectionlen <= BUILD_ID_HEADER_LEN || + sectionlen > (BUILD_ID_HEADER_LEN + BUILD_ID_HASH_MAXLEN)) { + return (ENOENT); + } + + + hashlen = sectionlen - BUILD_ID_HEADER_LEN; + for (int i = 0; i < hashlen; i++) { + uint8_t c = __build_id_start[i+BUILD_ID_HEADER_LEN]; + snprintf(&buf[2*i], 3, "%02x", c); + } + + return (SYSCTL_OUT(req, buf, strlen(buf) + 1)); +} + +SYSCTL_PROC(_kern, OID_AUTO, build_id, + CTLTYPE_STRING | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE, + NULL, 0, sysctl_build_id, "A", "Operating system build-id"); #endif /* __rtems__ */ SYSCTL_NODE(_kern, OID_AUTO, features, CTLFLAG_RD, 0, "Kernel Features"); diff --git a/freebsd/sys/kern/kern_mtxpool.c b/freebsd/sys/kern/kern_mtxpool.c index 7f6c4dce..bc47d826 100644 --- a/freebsd/sys/kern/kern_mtxpool.c +++ b/freebsd/sys/kern/kern_mtxpool.c @@ -64,14 +64,14 @@ static MALLOC_DEFINE(M_MTXPOOL, "mtx_pool", "mutex pool"); /* Pool sizes must be a power of two */ #ifndef MTX_POOL_SLEEP_SIZE -#define MTX_POOL_SLEEP_SIZE 128 +#define MTX_POOL_SLEEP_SIZE 1024 #endif struct mtxpool_header { int mtxpool_size; int mtxpool_mask; int mtxpool_shift; - int mtxpool_next; + int mtxpool_next __aligned(CACHE_LINE_SIZE); }; struct mtx_pool { diff --git a/freebsd/sys/kern/kern_synch.c b/freebsd/sys/kern/kern_synch.c index 2597f91d..7d24c248 100644 --- a/freebsd/sys/kern/kern_synch.c +++ b/freebsd/sys/kern/kern_synch.c @@ -54,6 +54,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -365,6 +366,75 @@ pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) #endif /* __rtems__ */ } +/* + * Potentially release the last reference for refcount. Check for + * unlikely conditions and signal the caller as to whether it was + * the final ref. + */ +bool +refcount_release_last(volatile u_int *count, u_int n, u_int old) +{ + u_int waiter; + + waiter = old & REFCOUNT_WAITER; + old = REFCOUNT_COUNT(old); + if (__predict_false(n > old || REFCOUNT_SATURATED(old))) { + /* + * Avoid multiple destructor invocations if underflow occurred. + * This is not perfect since the memory backing the containing + * object may already have been reallocated. + */ + _refcount_update_saturated(count); + return (false); + } + + /* + * Attempt to atomically clear the waiter bit. Wakeup waiters + * if we are successful. + */ + if (waiter != 0 && atomic_cmpset_int(count, REFCOUNT_WAITER, 0)) + wakeup(__DEVOLATILE(u_int *, count)); + + /* + * Last reference. Signal the user to call the destructor. + * + * Ensure that the destructor sees all updates. The fence_rel + * at the start of refcount_releasen synchronizes with this fence. + */ + atomic_thread_fence_acq(); + return (true); +} + +/* + * Wait for a refcount wakeup. This does not guarantee that the ref is still + * zero on return and may be subject to transient wakeups. Callers wanting + * a precise answer should use refcount_wait(). + */ +void +refcount_sleep(volatile u_int *count, const char *wmesg, int pri) +{ + void *wchan; + u_int old; + + if (REFCOUNT_COUNT(*count) == 0) + return; + wchan = __DEVOLATILE(void *, count); + sleepq_lock(wchan); + old = *count; + for (;;) { + if (REFCOUNT_COUNT(old) == 0) { + sleepq_release(wchan); + return; + } + if (old & REFCOUNT_WAITER) + break; + if (atomic_fcmpset_int(count, &old, old | REFCOUNT_WAITER)) + break; + } + sleepq_add(wchan, NULL, wmesg, 0, 0); + sleepq_wait(wchan, pri); +} + /* * Make all threads sleeping on the specified identifier runnable. 
*/ @@ -402,6 +472,19 @@ wakeup_one(void *ident) kick_proc0(); } +void +wakeup_any(void *ident) +{ + int wakeup_swapper; + + sleepq_lock(ident); + wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP | SLEEPQ_UNFAIR, + 0, 0); + sleepq_release(ident); + if (wakeup_swapper) + kick_proc0(); +} + #ifndef __rtems__ static void kdb_switch(void) diff --git a/freebsd/sys/kern/kern_sysctl.c b/freebsd/sys/kern/kern_sysctl.c index dc7c4c72..1135d7f3 100644 --- a/freebsd/sys/kern/kern_sysctl.c +++ b/freebsd/sys/kern/kern_sysctl.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include @@ -50,11 +51,13 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -66,6 +69,11 @@ __FBSDID("$FreeBSD$"); #include #endif +#ifdef DDB +#include +#include +#endif + #include #include @@ -326,13 +334,6 @@ sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp) } #endif /* __rtems__ */ -static int -sbuf_printf_drain(void *arg __unused, const char *data, int len) -{ - - return (printf("%.*s", len, data)); -} - /* * Locate the path to a given oid. Returns the length of the resulting path, * or -1 if the oid was not found. nodes must have room for CTL_MAXNAME @@ -940,13 +941,18 @@ SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, NULL); * (be aware though, that the proper interface isn't as obvious as it * may seem, there are various conflicting requirements. * - * {0,0} printf the entire MIB-tree. - * {0,1,...} return the name of the "..." OID. - * {0,2,...} return the next OID. - * {0,3} return the OID of the name in "new" - * {0,4,...} return the kind & format info for the "..." OID. - * {0,5,...} return the description of the "..." OID. - * {0,6,...} return the aggregation label of the "..." OID. + * {CTL_SYSCTL, CTL_SYSCTL_DEBUG} printf the entire MIB-tree. + * {CTL_SYSCTL, CTL_SYSCTL_NAME, ...} return the name of the "..." + * OID. + * {CTL_SYSCTL, CTL_SYSCTL_NEXT, ...} return the next OID. + * {CTL_SYSCTL, CTL_SYSCTL_NAME2OID} return the OID of the name in + * "new" + * {CTL_SYSCTL, CTL_SYSCTL_OIDFMT, ...} return the kind & format info + * for the "..." OID. + * {CTL_SYSCTL, CTL_SYSCTL_OIDDESCR, ...} return the description of the + * "..." OID. + * {CTL_SYSCTL, CTL_SYSCTL_OIDLABEL, ...} return the aggregation label of + * the "..." OID. */ #ifdef SYSCTL_DEBUG @@ -1014,8 +1020,8 @@ sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) return (ENOENT); } -SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE, - 0, 0, sysctl_sysctl_debug, "-", ""); +SYSCTL_PROC(_sysctl, CTL_SYSCTL_DEBUG, debug, CTLTYPE_STRING | CTLFLAG_RD | + CTLFLAG_MPSAFE, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int @@ -1080,8 +1086,8 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in * capability mode. */ -static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, - sysctl_sysctl_name, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NAME, name, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_name, ""); static int sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, @@ -1167,8 +1173,8 @@ sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in * capability mode. 
*/ -static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, - sysctl_sysctl_next, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXT, next, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static int name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp) @@ -1254,9 +1260,9 @@ sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in * capability mode. */ -SYSCTL_PROC(_sysctl, 3, name2oid, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE - | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", ""); +SYSCTL_PROC(_sysctl, CTL_SYSCTL_NAME2OID, name2oid, CTLTYPE_INT | CTLFLAG_RW | + CTLFLAG_ANYBODY | CTLFLAG_MPSAFE | CTLFLAG_CAPRW, 0, 0, + sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) @@ -1284,8 +1290,8 @@ sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) } -static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, - sysctl_sysctl_oidfmt, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDFMT, oidfmt, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) @@ -1309,8 +1315,8 @@ sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) return (error); } -static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, - sysctl_sysctl_oiddescr, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDDESCR, oiddescr, CTLFLAG_RD | + CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oiddescr, ""); static int sysctl_sysctl_oidlabel(SYSCTL_HANDLER_ARGS) @@ -1334,8 +1340,8 @@ sysctl_sysctl_oidlabel(SYSCTL_HANDLER_ARGS) return (error); } -static SYSCTL_NODE(_sysctl, 6, oidlabel, - CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDLABEL, oidlabel, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, ""); /* * Default "handler" functions. @@ -1622,9 +1628,10 @@ sysctl_handle_string(SYSCTL_HANDLER_ARGS) /* * A zero-length buffer indicates a fixed size read-only - * string: + * string. In ddb, don't worry about trying to make a malloced + * snapshot. */ - if (arg2 == 0) { + if (arg2 == 0 || kdb_active) { arg2 = strlen((char *)arg1) + 1; ro_string = 1; } @@ -1751,6 +1758,29 @@ sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS) return (0); } +/* + * Convert seconds to a struct timeval. Intended for use with + * intervals and thus does not permit negative seconds. + */ +int +sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS) +{ + struct timeval *tv; + int error, secs; + + tv = arg1; + secs = tv->tv_sec; + + error = sysctl_handle_int(oidp, &secs, 0, req); + if (error || req->newptr == NULL) + return (error); + + if (secs < 0) + return (EINVAL); + tv->tv_sec = secs; + + return (0); +} /* * Transfer functions to/from kernel space. 
@@ -1853,8 +1883,8 @@ kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, size_t oidlen, plen; int error; - oid[0] = 0; /* sysctl internal magic */ - oid[1] = 3; /* name2oid */ + oid[0] = CTL_SYSCTL; + oid[1] = CTL_SYSCTL_NAME2OID; oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, @@ -2149,6 +2179,68 @@ sys___sysctl(struct thread *td, struct sysctl_args *uap) return (error); } +int +kern___sysctlbyname(struct thread *td, const char *oname, size_t namelen, + void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, + int flags, bool inkernel) +{ + int oid[CTL_MAXNAME]; + char namebuf[16]; + char *name; + size_t oidlen; + int error; + + if (namelen > MAXPATHLEN || namelen == 0) + return (EINVAL); + name = namebuf; + if (namelen > sizeof(namebuf)) + name = malloc(namelen, M_SYSCTL, M_WAITOK); + error = copyin(oname, name, namelen); + if (error != 0) + goto out; + + oid[0] = CTL_SYSCTL; + oid[1] = CTL_SYSCTL_NAME2OID; + oidlen = sizeof(oid); + error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, namelen, + retval, flags); + if (error != 0) + goto out; + error = userland_sysctl(td, oid, *retval / sizeof(int), old, oldlenp, + inkernel, new, newlen, retval, flags); + +out: + if (namelen > sizeof(namebuf)) + free(name, M_SYSCTL); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct __sysctlbyname_args { + const char *name; + size_t namelen; + void *old; + size_t *oldlenp; + void *new; + size_t newlen; +}; +#endif +int +sys___sysctlbyname(struct thread *td, struct __sysctlbyname_args *uap) +{ + size_t rv; + int error; + + error = kern___sysctlbyname(td, uap->name, uap->namelen, uap->old, + uap->oldlenp, uap->new, uap->newlen, &rv, 0, 0); + if (error != 0) + return (error); + if (uap->oldlenp != NULL) + error = copyout(&rv, uap->oldlenp, sizeof(rv)); + + return (error); +} + /* * This is used from various compatibility syscalls too. That's why name * must be in kernel space. 
@@ -2254,3 +2346,528 @@ sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length, sbuf_set_drain(s, sbuf_sysctl_drain, req); return (s); } + +#ifdef DDB + +/* The current OID the debugger is working with */ +static struct sysctl_oid *g_ddb_oid; + +/* The current flags specified by the user */ +static int g_ddb_sysctl_flags; + +/* Check to see if the last sysctl printed */ +static int g_ddb_sysctl_printed; + +static const int ctl_sign[CTLTYPE+1] = { + [CTLTYPE_INT] = 1, + [CTLTYPE_LONG] = 1, + [CTLTYPE_S8] = 1, + [CTLTYPE_S16] = 1, + [CTLTYPE_S32] = 1, + [CTLTYPE_S64] = 1, +}; + +static const int ctl_size[CTLTYPE+1] = { + [CTLTYPE_INT] = sizeof(int), + [CTLTYPE_UINT] = sizeof(u_int), + [CTLTYPE_LONG] = sizeof(long), + [CTLTYPE_ULONG] = sizeof(u_long), + [CTLTYPE_S8] = sizeof(int8_t), + [CTLTYPE_S16] = sizeof(int16_t), + [CTLTYPE_S32] = sizeof(int32_t), + [CTLTYPE_S64] = sizeof(int64_t), + [CTLTYPE_U8] = sizeof(uint8_t), + [CTLTYPE_U16] = sizeof(uint16_t), + [CTLTYPE_U32] = sizeof(uint32_t), + [CTLTYPE_U64] = sizeof(uint64_t), +}; + +#define DB_SYSCTL_NAME_ONLY 0x001 /* Compare with -N */ +#define DB_SYSCTL_VALUE_ONLY 0x002 /* Compare with -n */ +#define DB_SYSCTL_OPAQUE 0x004 /* Compare with -o */ +#define DB_SYSCTL_HEX 0x008 /* Compare with -x */ + +#define DB_SYSCTL_SAFE_ONLY 0x100 /* Only simple types */ + +static const char db_sysctl_modifs[] = { + 'N', 'n', 'o', 'x', +}; + +static const int db_sysctl_modif_values[] = { + DB_SYSCTL_NAME_ONLY, DB_SYSCTL_VALUE_ONLY, + DB_SYSCTL_OPAQUE, DB_SYSCTL_HEX, +}; + +/* Handlers considered safe to print while recursing */ +static int (* const db_safe_handlers[])(SYSCTL_HANDLER_ARGS) = { + sysctl_handle_bool, + sysctl_handle_8, + sysctl_handle_16, + sysctl_handle_32, + sysctl_handle_64, + sysctl_handle_int, + sysctl_handle_long, + sysctl_handle_string, + sysctl_handle_opaque, +}; + +/* + * Use in place of sysctl_old_kernel to print sysctl values. 
+ * + * Compare to the output handling in show_var from sbin/sysctl/sysctl.c + */ +static int +sysctl_old_ddb(struct sysctl_req *req, const void *ptr, size_t len) +{ + const u_char *val, *p; + const char *sep1; + size_t intlen, slen; + uintmax_t umv; + intmax_t mv; + int sign, ctltype, hexlen, xflag, error; + + /* Suppress false-positive GCC uninitialized variable warnings */ + mv = 0; + umv = 0; + + slen = len; + val = p = ptr; + + if (ptr == NULL) { + error = 0; + goto out; + } + + /* We are going to print */ + g_ddb_sysctl_printed = 1; + + xflag = g_ddb_sysctl_flags & DB_SYSCTL_HEX; + + ctltype = (g_ddb_oid->oid_kind & CTLTYPE); + sign = ctl_sign[ctltype]; + intlen = ctl_size[ctltype]; + + switch (ctltype) { + case CTLTYPE_NODE: + case CTLTYPE_STRING: + db_printf("%.*s", (int) len, (const char *) p); + error = 0; + goto out; + + case CTLTYPE_INT: + case CTLTYPE_UINT: + case CTLTYPE_LONG: + case CTLTYPE_ULONG: + case CTLTYPE_S8: + case CTLTYPE_S16: + case CTLTYPE_S32: + case CTLTYPE_S64: + case CTLTYPE_U8: + case CTLTYPE_U16: + case CTLTYPE_U32: + case CTLTYPE_U64: + hexlen = 2 + (intlen * CHAR_BIT + 3) / 4; + sep1 = ""; + while (len >= intlen) { + switch (ctltype) { + case CTLTYPE_INT: + case CTLTYPE_UINT: + umv = *(const u_int *)p; + mv = *(const int *)p; + break; + case CTLTYPE_LONG: + case CTLTYPE_ULONG: + umv = *(const u_long *)p; + mv = *(const long *)p; + break; + case CTLTYPE_S8: + case CTLTYPE_U8: + umv = *(const uint8_t *)p; + mv = *(const int8_t *)p; + break; + case CTLTYPE_S16: + case CTLTYPE_U16: + umv = *(const uint16_t *)p; + mv = *(const int16_t *)p; + break; + case CTLTYPE_S32: + case CTLTYPE_U32: + umv = *(const uint32_t *)p; + mv = *(const int32_t *)p; + break; + case CTLTYPE_S64: + case CTLTYPE_U64: + umv = *(const uint64_t *)p; + mv = *(const int64_t *)p; + break; + } + + db_printf("%s", sep1); + if (xflag) + db_printf("%#0*jx", hexlen, umv); + else if (!sign) + db_printf("%ju", umv); + else if (g_ddb_oid->oid_fmt[1] == 'K') { + /* Kelvins are currently unsupported. */ + error = EOPNOTSUPP; + goto out; + } else + db_printf("%jd", mv); + + sep1 = " "; + len -= intlen; + p += intlen; + } + error = 0; + goto out; + + case CTLTYPE_OPAQUE: + /* TODO: Support struct functions. */ + + /* FALLTHROUGH */ + default: + db_printf("Format:%s Length:%zu Dump:0x", + g_ddb_oid->oid_fmt, len); + while (len-- && (xflag || p < val + 16)) + db_printf("%02x", *p++); + if (!xflag && len > 16) + db_printf("..."); + error = 0; + goto out; + } + +out: + req->oldidx += slen; + return (error); +} + +/* + * Avoid setting new sysctl values from the debugger + */ +static int +sysctl_new_ddb(struct sysctl_req *req, void *p, size_t l) +{ + + if (!req->newptr) + return (0); + + /* Changing sysctls from the debugger is currently unsupported */ + return (EPERM); +} + +/* + * Run a sysctl handler with the DDB oldfunc and newfunc attached. + * Instead of copying any output to a buffer we'll dump it right to + * the console. 
+ */ +static int +db_sysctl(struct sysctl_oid *oidp, int *name, u_int namelen, + void *old, size_t *oldlenp, size_t *retval, int flags) +{ + struct sysctl_req req; + int error; + + /* Setup the request */ + bzero(&req, sizeof req); + req.td = kdb_thread; + req.oldfunc = sysctl_old_ddb; + req.newfunc = sysctl_new_ddb; + req.lock = REQ_UNWIRED; + if (oldlenp) { + req.oldlen = *oldlenp; + } + req.validlen = req.oldlen; + if (old) { + req.oldptr = old; + } + + /* Setup our globals for sysctl_old_ddb */ + g_ddb_oid = oidp; + g_ddb_sysctl_flags = flags; + g_ddb_sysctl_printed = 0; + + error = sysctl_root(0, name, namelen, &req); + + /* Reset globals */ + g_ddb_oid = NULL; + g_ddb_sysctl_flags = 0; + + if (retval) { + if (req.oldptr && req.oldidx > req.validlen) + *retval = req.validlen; + else + *retval = req.oldidx; + } + return (error); +} + +/* + * Show a sysctl's name + */ +static void +db_show_oid_name(int *oid, size_t nlen) +{ + struct sysctl_oid *oidp; + int qoid[CTL_MAXNAME+2]; + int error; + + qoid[0] = 0; + memcpy(qoid + 2, oid, nlen * sizeof(int)); + qoid[1] = 1; + + error = sysctl_find_oid(qoid, nlen + 2, &oidp, NULL, NULL); + if (error) + db_error("sysctl name oid"); + + error = db_sysctl(oidp, qoid, nlen + 2, NULL, NULL, NULL, 0); + if (error) + db_error("sysctl name"); +} + +/* + * Check to see if an OID is safe to print from ddb. + */ +static bool +db_oid_safe(const struct sysctl_oid *oidp) +{ + for (unsigned int i = 0; i < nitems(db_safe_handlers); ++i) { + if (oidp->oid_handler == db_safe_handlers[i]) + return (true); + } + + return (false); +} + +/* + * Show a sysctl at a specific OID + * Compare to the input handling in show_var from sbin/sysctl/sysctl.c + */ +static int +db_show_oid(struct sysctl_oid *oidp, int *oid, size_t nlen, int flags) +{ + int error, xflag, oflag, Nflag, nflag; + size_t len; + + xflag = flags & DB_SYSCTL_HEX; + oflag = flags & DB_SYSCTL_OPAQUE; + nflag = flags & DB_SYSCTL_VALUE_ONLY; + Nflag = flags & DB_SYSCTL_NAME_ONLY; + + if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_OPAQUE && + (!xflag && !oflag)) + return (0); + + if (Nflag) { + db_show_oid_name(oid, nlen); + error = 0; + goto out; + } + + if (!nflag) { + db_show_oid_name(oid, nlen); + db_printf(": "); + } + + if ((flags & DB_SYSCTL_SAFE_ONLY) && !db_oid_safe(oidp)) { + db_printf("Skipping, unsafe to print while recursing."); + error = 0; + goto out; + } + + /* Try once, and ask about the size */ + len = 0; + error = db_sysctl(oidp, oid, nlen, + NULL, NULL, &len, flags); + if (error) + goto out; + + if (!g_ddb_sysctl_printed) + /* Lie about the size */ + error = db_sysctl(oidp, oid, nlen, + (void *) 1, &len, NULL, flags); + +out: + db_printf("\n"); + return (error); +} + +/* + * Show all sysctls under a specific OID + * Compare to sysctl_all from sbin/sysctl/sysctl.c + */ +static int +db_show_sysctl_all(int *oid, size_t len, int flags) +{ + struct sysctl_oid *oidp; + int name1[CTL_MAXNAME + 2], name2[CTL_MAXNAME + 2]; + size_t l1, l2; + + name1[0] = CTL_SYSCTL; + name1[1] = CTL_SYSCTL_NEXT; + l1 = 2; + if (len) { + memcpy(name1+2, oid, len * sizeof(int)); + l1 +=len; + } else { + name1[2] = 1; + l1++; + } + for (;;) { + int i, error; + + l2 = sizeof(name2); + error = kernel_sysctl(kdb_thread, name1, l1, + name2, &l2, NULL, 0, &l2, 0); + if (error != 0) { + if (error == ENOENT) + return (0); + else + db_error("sysctl(getnext)"); + } + + l2 /= sizeof(int); + + if (l2 < (unsigned int)len) + return (0); + + for (i = 0; i < len; i++) + if (name2[i] != oid[i]) + return (0); + + /* Find the OID in question 
*/ + error = sysctl_find_oid(name2, l2, &oidp, NULL, NULL); + if (error) + return (error); + + i = db_show_oid(oidp, name2, l2, flags | DB_SYSCTL_SAFE_ONLY); + + if (db_pager_quit) + return (0); + + memcpy(name1+2, name2, l2 * sizeof(int)); + l1 = 2 + l2; + } +} + +/* + * Show a sysctl by its user facing string + */ +static int +db_sysctlbyname(char *name, int flags) +{ + struct sysctl_oid *oidp; + int oid[CTL_MAXNAME]; + int error, nlen; + + error = name2oid(name, oid, &nlen, &oidp); + if (error) { + return (error); + } + + if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + db_show_sysctl_all(oid, nlen, flags); + } else { + error = db_show_oid(oidp, oid, nlen, flags); + } + + return (error); +} + +static void +db_sysctl_cmd_usage(void) +{ + db_printf( + " sysctl [/Nnox] \n" + " \n" + " The name of the sysctl to show. \n" + " \n" + " Show a sysctl by hooking into SYSCTL_IN and SYSCTL_OUT. \n" + " This will work for most sysctls, but should not be used \n" + " with sysctls that are known to malloc. \n" + " \n" + " While recursing any \"unsafe\" sysctls will be skipped. \n" + " Call sysctl directly on the sysctl to try printing the \n" + " skipped sysctl. This is unsafe and may make the ddb \n" + " session unusable. \n" + " \n" + " Arguments: \n" + " /N Display only the name of the sysctl. \n" + " /n Display only the value of the sysctl. \n" + " /o Display opaque values. \n" + " /x Display the sysctl in hex. \n" + " \n" + "For example: \n" + "sysctl vm.v_free_min \n" + "vn.v_free_min: 12669 \n" + ); +} + +/* + * Show a specific sysctl similar to sysctl (8). + */ +DB_FUNC(sysctl, db_sysctl_cmd, db_cmd_table, CS_OWN, NULL) +{ + char name[TOK_STRING_SIZE]; + int error, i, t, flags; + + /* Parse the modifiers */ + t = db_read_token(); + if (t == tSLASH || t == tMINUS) { + t = db_read_token(); + if (t != tIDENT) { + db_printf("Bad modifier\n"); + error = EINVAL; + goto out; + } + db_strcpy(modif, db_tok_string); + } + else { + db_unread_token(t); + modif[0] = '\0'; + } + + flags = 0; + for (i = 0; i < nitems(db_sysctl_modifs); i++) { + if (strchr(modif, db_sysctl_modifs[i])) { + flags |= db_sysctl_modif_values[i]; + } + } + + /* Parse the sysctl names */ + t = db_read_token(); + if (t != tIDENT) { + db_printf("Need sysctl name\n"); + error = EINVAL; + goto out; + } + + /* Copy the name into a temporary buffer */ + db_strcpy(name, db_tok_string); + + /* Ensure there is no trailing cruft */ + t = db_read_token(); + if (t != tEOL) { + db_printf("Unexpected sysctl argument\n"); + error = EINVAL; + goto out; + } + + error = db_sysctlbyname(name, flags); + if (error == ENOENT) { + db_printf("unknown oid: '%s'\n", db_tok_string); + goto out; + } else if (error) { + db_printf("%s: error: %d\n", db_tok_string, error); + goto out; + } + +out: + /* Ensure we eat all of our text */ + db_flush_lex(); + + if (error == EINVAL) { + db_sysctl_cmd_usage(); + } +} + +#endif /* DDB */ diff --git a/freebsd/sys/kern/kern_time.c b/freebsd/sys/kern/kern_time.c index 74b144cb..47eb9032 100644 --- a/freebsd/sys/kern/kern_time.c +++ b/freebsd/sys/kern/kern_time.c @@ -422,7 +422,9 @@ kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats) if (ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000 || ats->tv_sec < 0) return (EINVAL); - if (!allow_insane_settime && ats->tv_sec > 8000ULL * 365 * 24 * 60 * 60) + if (!allow_insane_settime && + (ats->tv_sec > 8000ULL * 365 * 24 * 60 * 60 || + ats->tv_sec < utc_offset())) return (EINVAL); /* XXX Don't convert nsec->usec and back */ TIMESPEC_TO_TIMEVAL(&atv, ats); 
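As an aside on the kern_sysctl.c hunks above: they replace the magic OID prefixes 0/3 with the named constants CTL_SYSCTL and CTL_SYSCTL_NAME2OID and add a __sysctlbyname(2) syscall that performs the name-to-OID translation plus the actual query in one kernel entry. A minimal userland sketch of the classic two-step protocol, assuming a sys/sysctl.h new enough to define the CTL_SYSCTL_* constants (older headers use the literal values 0 and 3):

#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *name = "kern.osreldate";
	int qoid[2], oid[CTL_MAXNAME], osreldate;
	size_t oidlen, len;

	/* Step 1: {CTL_SYSCTL, CTL_SYSCTL_NAME2OID} maps a name to an OID. */
	qoid[0] = CTL_SYSCTL;		/* 0 */
	qoid[1] = CTL_SYSCTL_NAME2OID;	/* 3 */
	oidlen = sizeof(oid);
	if (sysctl(qoid, 2, oid, &oidlen, name, strlen(name)) == -1)
		err(1, "name2oid");

	/* Step 2: query the resolved numeric OID. */
	len = sizeof(osreldate);
	if (sysctl(oid, oidlen / sizeof(int), &osreldate, &len, NULL, 0) == -1)
		err(1, "%s", name);
	printf("%s = %d\n", name, osreldate);
	return (0);
}

sysctlbyname(3) has traditionally issued exactly these two sysctl(2) calls from libc; the new syscall lets the same lookup happen with a single trip into the kernel, which is what kern___sysctlbyname() implements in the hunk above.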
@@ -673,8 +675,8 @@ sys_gettimeofday(struct thread *td, struct gettimeofday_args *uap) error = copyout(&atv, uap->tp, sizeof (atv)); } if (error == 0 && uap->tzp != NULL) { - rtz.tz_minuteswest = tz_minuteswest; - rtz.tz_dsttime = tz_dsttime; + rtz.tz_minuteswest = 0; + rtz.tz_dsttime = 0; error = copyout(&rtz, uap->tzp, sizeof (rtz)); } return (error); @@ -726,10 +728,6 @@ kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp) return (EINVAL); error = settime(td, tv); } - if (tzp && error == 0) { - tz_minuteswest = tzp->tz_minuteswest; - tz_dsttime = tzp->tz_dsttime; - } return (error); } diff --git a/freebsd/sys/kern/kern_timeout.c b/freebsd/sys/kern/kern_timeout.c index 2f478afc..983abba2 100644 --- a/freebsd/sys/kern/kern_timeout.c +++ b/freebsd/sys/kern/kern_timeout.c @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -67,6 +68,7 @@ __FBSDID("$FreeBSD$"); #ifdef DDB #include +#include #include #endif @@ -154,12 +156,16 @@ u_int callwheelsize, callwheelmask; struct cc_exec { struct callout *cc_curr; void (*cc_drain)(void *); +#ifndef __rtems__ + void *cc_last_func; + void *cc_last_arg; +#endif /* __rtems__ */ #ifdef SMP void (*ce_migration_func)(void *); void *ce_migration_arg; - int ce_migration_cpu; sbintime_t ce_migration_time; sbintime_t ce_migration_prec; + int ce_migration_cpu; #endif bool cc_cancel; bool cc_waiting; @@ -200,6 +206,8 @@ struct callout_cpu { #ifndef __rtems__ #define cc_exec_curr(cc, dir) cc->cc_exec_entity[dir].cc_curr +#define cc_exec_last_func(cc, dir) cc->cc_exec_entity[dir].cc_last_func +#define cc_exec_last_arg(cc, dir) cc->cc_exec_entity[dir].cc_last_arg #define cc_exec_drain(cc, dir) cc->cc_exec_entity[dir].cc_drain #else /* __rtems__ */ #define cc_exec_curr(cc, dir) cc->cc_exec_entity.cc_curr @@ -426,8 +434,9 @@ callout_cpu_init(struct callout_cpu *cc, int cpu) SLIST_INIT(&cc->cc_callfree); cc->cc_inited = 1; #ifndef __rtems__ - cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize, - M_CALLOUT, M_WAITOK); + cc->cc_callwheel = malloc_domainset(sizeof(struct callout_list) * + callwheelsize, M_CALLOUT, + DOMAINSET_PREF(pcpu_find(cpu)->pc_domain), M_WAITOK); #endif /* __rtems__ */ for (i = 0; i < callwheelsize; i++) LIST_INIT(&cc->cc_callwheel[i]); @@ -821,6 +830,10 @@ softclock_call_cc(struct callout *c, struct callout_cpu *cc, c->c_iflags &= ~CALLOUT_PENDING; cc_exec_curr(cc, direct) = c; +#ifndef __rtems__ + cc_exec_last_func(cc, direct) = c_func; + cc_exec_last_arg(cc, direct) = c_arg; +#endif /* __rtems__ */ cc_exec_cancel(cc, direct) = false; cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); @@ -1876,4 +1889,42 @@ DB_SHOW_COMMAND(callout, db_show_callout) _show_callout((struct callout *)addr); } + +static void +_show_last_callout(int cpu, int direct, const char *dirstr) +{ + struct callout_cpu *cc; + void *func, *arg; + + cc = CC_CPU(cpu); + func = cc_exec_last_func(cc, direct); + arg = cc_exec_last_arg(cc, direct); + db_printf("cpu %d last%s callout function: %p ", cpu, dirstr, func); + db_printsym((db_expr_t)func, DB_STGY_ANY); + db_printf("\ncpu %d last%s callout argument: %p\n", cpu, dirstr, arg); +} + +DB_SHOW_COMMAND(callout_last, db_show_callout_last) +{ + int cpu, last; + + if (have_addr) { + if (addr < 0 || addr > mp_maxid || CPU_ABSENT(addr)) { + db_printf("no such cpu: %d\n", (int)addr); + return; + } + cpu = last = addr; + } else { + cpu = 0; + last = mp_maxid; + } + + while (cpu <= last) { + if (!CPU_ABSENT(cpu)) { + _show_last_callout(cpu, 0, 
""); + _show_last_callout(cpu, 1, " direct"); + } + cpu++; + } +} #endif /* DDB */ diff --git a/freebsd/sys/kern/kern_uuid.c b/freebsd/sys/kern/kern_uuid.c index a2316b16..c2a5986a 100644 --- a/freebsd/sys/kern/kern_uuid.c +++ b/freebsd/sys/kern/kern_uuid.c @@ -301,7 +301,7 @@ sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid) char buf[38]; snprintf_uuid(buf, sizeof(buf), uuid); - return (sbuf_printf(sb, "%s", buf)); + return (sbuf_cat(sb, buf)); } /* diff --git a/freebsd/sys/kern/subr_blist.c b/freebsd/sys/kern/subr_blist.c index 807a7f3c..8b073bf8 100644 --- a/freebsd/sys/kern/subr_blist.c +++ b/freebsd/sys/kern/subr_blist.c @@ -111,6 +111,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -122,19 +123,20 @@ __FBSDID("$FreeBSD$"); #define malloc(a,b,c) calloc(a, 1) #define free(a,b) free(a) #define ummin(a,b) ((a) < (b) ? (a) : (b)) +#define imin(a,b) ((a) < (b) ? (a) : (b)) +#define KASSERT(a,b) assert(a) #include -void panic(const char *ctl, ...); - #endif /* * static support functions */ -static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count); -static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, - u_daddr_t radix); +static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, + int *count, int maxcount); +static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, + int maxcount, u_daddr_t radix); static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix); @@ -194,30 +196,40 @@ bitrange(int n, int count) /* - * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t. - * Assumes that the argument has only one bit set. + * Find the first bit set in a u_daddr_t. */ static inline int -bitpos(u_daddr_t mask) +generic_bitpos(u_daddr_t mask) { int hi, lo, mid; + lo = 0; + hi = BLIST_BMAP_RADIX; + while (lo + 1 < hi) { + mid = (lo + hi) >> 1; + if (mask & bitrange(0, mid)) + hi = mid; + else + lo = mid; + } + return (lo); +} + +static inline int +bitpos(u_daddr_t mask) +{ + switch (sizeof(mask)) { #ifdef HAVE_INLINE_FFSLL case sizeof(long long): return (ffsll(mask) - 1); +#endif +#ifdef HAVE_INLINE_FFS + case sizeof(int): + return (ffs(mask) - 1); #endif default: - lo = 0; - hi = BLIST_BMAP_RADIX; - while (lo + 1 < hi) { - mid = (lo + hi) >> 1; - if ((mask >> mid) != 0) - lo = mid; - else - hi = mid; - } - return (lo); + return (generic_bitpos(mask)); } } @@ -237,8 +249,7 @@ blist_create(daddr_t blocks, int flags) blist_t bl; u_daddr_t nodes, radix; - if (blocks == 0) - panic("invalid block count"); + KASSERT(blocks > 0, ("invalid block count")); /* * Calculate the radix and node count used for scanning. @@ -286,12 +297,14 @@ blist_destroy(blist_t bl) * not be allocated. */ daddr_t -blist_alloc(blist_t bl, daddr_t count) +blist_alloc(blist_t bl, int *count, int maxcount) { - daddr_t blk; + daddr_t blk, cursor; - if (count > BLIST_MAX_ALLOC) - panic("allocation too large"); + KASSERT(*count <= maxcount, + ("invalid parameters %d > %d", *count, maxcount)); + KASSERT(*count <= BLIST_MAX_ALLOC, + ("minimum allocation too large: %d", *count)); /* * This loop iterates at most twice. An allocation failure in the @@ -299,18 +312,18 @@ blist_alloc(blist_t bl, daddr_t count) * non-zero. When the cursor is zero, an allocation failure will * stop further iterations. 
*/ - for (;;) { - blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count, + for (cursor = bl->bl_cursor;; cursor = 0) { + blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount, bl->bl_radix); if (blk != SWAPBLK_NONE) { - bl->bl_avail -= count; - bl->bl_cursor = blk + count; + bl->bl_avail -= *count; + bl->bl_cursor = blk + *count; if (bl->bl_cursor == bl->bl_blocks) bl->bl_cursor = 0; return (blk); - } else if (bl->bl_cursor == 0) + } + if (cursor == 0) return (SWAPBLK_NONE); - bl->bl_cursor = 0; } } @@ -326,15 +339,15 @@ blist_avail(blist_t bl) /* * blist_free() - free up space in the block bitmap. Return the base - * of a contiguous region. Panic if an inconsistancy is - * found. + * of a contiguous region. */ void blist_free(blist_t bl, daddr_t blkno, daddr_t count) { - if (blkno < 0 || blkno + count > bl->bl_blocks) - panic("freeing invalid range"); + KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, + ("freeing invalid range: blkno %jx, count %d, blocks %jd", + (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix); bl->bl_avail += count; } @@ -350,8 +363,9 @@ blist_fill(blist_t bl, daddr_t blkno, daddr_t count) { daddr_t filled; - if (blkno < 0 || blkno + count > bl->bl_blocks) - panic("filling invalid range"); + KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, + ("filling invalid range: blkno %jx, count %d, blocks %jd", + (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix); bl->bl_avail -= filled; return (filled); @@ -533,7 +547,8 @@ blist_stats(blist_t bl, struct sbuf *s) struct gap_stats gstats; struct gap_stats *stats = &gstats; daddr_t i, nodes, radix; - u_daddr_t bit, diff, mask; + u_daddr_t diff, mask; + int digit; init_gap_stats(stats); nodes = 0; @@ -571,9 +586,9 @@ blist_stats(blist_t bl, struct sbuf *s) if (gap_stats_counting(stats)) diff ^= 1; while (diff != 0) { - bit = diff & -diff; - update_gap_stats(stats, i + bitpos(bit)); - diff ^= bit; + digit = bitpos(diff); + update_gap_stats(stats, i + digit); + diff ^= bitrange(digit, 1); } } nodes += radix_to_skip(radix); @@ -594,53 +609,104 @@ blist_stats(blist_t bl, struct sbuf *s) */ /* - * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf. + * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf. * - * 'scan' is a leaf node, associated with a block containing 'blk'. - * The next leaf node could be adjacent, or several nodes away if the - * least common ancestor of 'scan' and its neighbor is several levels - * up. Use 'blk' to determine how many meta-nodes lie between the - * leaves. If the next leaf has enough initial bits set, clear them - * and clear the bits in the meta nodes on the path up to the least - * common ancestor to mark any subtrees made completely empty. + * 'scan' is a leaf node, and its first block is at address 'start'. The + * next leaf node could be adjacent, or several nodes away if the least + * common ancestor of 'scan' and its neighbor is several levels up. Use + * addresses to determine how many meta-nodes lie between the leaves. If + * sequence of leaves starting with the next one has enough initial bits + * set, clear them and clear the bits in the meta nodes on the path up to + * the least common ancestor to mark any subtrees made completely empty. 
*/ static int -blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) +blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount) { - blmeta_t *next; - daddr_t skip; u_daddr_t radix; - int digit; + daddr_t blk; + int avail, digit; - next = scan + 1; - blk += BLIST_BMAP_RADIX; - radix = BLIST_BMAP_RADIX; - while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0 && - (next->bm_bitmap & 1) == 1) { - next++; - radix *= BLIST_META_RADIX; - } - if (((next->bm_bitmap + 1) & ~((u_daddr_t)-1 << count)) != 0) { - /* - * The next leaf doesn't have enough free blocks at the - * beginning to complete the spanning allocation. - */ - return (ENOMEM); + start += BLIST_BMAP_RADIX; + for (blk = start; blk - start < maxcount; blk += BLIST_BMAP_RADIX) { + /* Skip meta-nodes, as long as they promise more free blocks. */ + radix = BLIST_BMAP_RADIX; + while (((++scan)->bm_bitmap & 1) == 1 && + ((blk / radix) & BLIST_META_MASK) == 0) + radix *= BLIST_META_RADIX; + if (~scan->bm_bitmap != 0) { + /* + * Either there is no next leaf with any free blocks, + * or we've reached the next leaf and found that some + * of its blocks are not free. In the first case, + * bitpos() returns zero here. + */ + avail = blk - start + bitpos(~scan->bm_bitmap); + if (avail < count || avail == 0) { + /* + * There isn't a next leaf with enough free + * blocks at its beginning to bother + * allocating. + */ + return (avail); + } + maxcount = imin(avail, maxcount); + if (maxcount % BLIST_BMAP_RADIX == 0) { + /* + * There was no next leaf. Back scan up to + * last leaf. + */ + --scan; + while (radix != BLIST_BMAP_RADIX) { + radix /= BLIST_META_RADIX; + --scan; + } + blk -= BLIST_BMAP_RADIX; + } + } } - /* Clear the first 'count' bits in the next leaf to allocate. */ - next->bm_bitmap &= (u_daddr_t)-1 << count; - + /* - * Update bitmaps of next-ancestors, up to least common ancestor. + * 'scan' is the last leaf that provides blocks. Clear from 1 to + * BLIST_BMAP_RADIX bits to represent the allocation of those last + * blocks. */ - skip = radix_to_skip(radix); - while (radix != BLIST_BMAP_RADIX && next->bm_bitmap == 0) { - (--next)->bm_bitmap ^= 1; - radix /= BLIST_META_RADIX; + if (maxcount % BLIST_BMAP_RADIX != 0) + scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_BMAP_RADIX); + else + scan->bm_bitmap = 0; + + for (;;) { + /* Back up over meta-nodes, clearing bits if necessary. */ + blk -= BLIST_BMAP_RADIX; + radix = BLIST_BMAP_RADIX; + while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0) { + if ((scan--)->bm_bitmap == 0) + scan->bm_bitmap ^= 1; + radix *= BLIST_META_RADIX; + } + if ((scan--)->bm_bitmap == 0) + scan[-digit * radix_to_skip(radix)].bm_bitmap ^= + (u_daddr_t)1 << digit; + + if (blk == start) + break; + /* Clear all the bits of this leaf. */ + scan->bm_bitmap = 0; } - if (next->bm_bitmap == 0) - scan[-digit * skip].bm_bitmap ^= (u_daddr_t)1 << digit; - return (0); + return (maxcount); +} + +/* + * Given a bitmask, flip all the bits from the least-significant 1-bit to the + * most significant bit. If the result is non-zero, then the least-significant + * 1-bit of the result is in the same position as the least-signification 0-bit + * in mask that is followed by a 1-bit. + */ +static inline u_daddr_t +flip_hibits(u_daddr_t mask) +{ + + return (-mask & ~mask); } /* @@ -651,16 +717,16 @@ blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) * crosses a leaf boundary. 
*/ static daddr_t -blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) +blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount) { u_daddr_t cursor_mask, mask; int count1, hi, lo, num_shifts, range1, range_ext; range1 = 0; - count1 = count - 1; + count1 = *count - 1; num_shifts = fls(count1); mask = scan->bm_bitmap; - while ((-mask & ~mask) != 0 && num_shifts > 0) { + while (flip_hibits(mask) != 0 && num_shifts > 0) { /* * If bit i is set in mask, then bits in [i, i+range1] are set * in scan->bm_bitmap. The value of range1 is equal to count1 @@ -712,40 +778,50 @@ blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) /* * The least significant set bit in mask marks the start of the first - * available range of sufficient size. Clear all the bits but that one, - * and then find its position. + * available range of sufficient size. Find its position. */ - mask &= -mask; lo = bitpos(mask); - hi = lo + count; - if (hi > BLIST_BMAP_RADIX) { - /* - * An allocation within this leaf is impossible, so a successful - * allocation depends on the next leaf providing some of the blocks. - */ - if (blst_next_leaf_alloc(scan, blk, hi - BLIST_BMAP_RADIX) != 0) + /* + * Find how much space is available starting at that position. + */ + if (flip_hibits(mask) != 0) { + /* Count the 1 bits starting at position lo. */ + hi = bitpos(flip_hibits(mask)) + count1; + if (maxcount < hi - lo) + hi = lo + maxcount; + *count = hi - lo; + mask = bitrange(lo, *count); + } else if (maxcount <= BLIST_BMAP_RADIX - lo) { + /* All the blocks we can use are available here. */ + hi = lo + maxcount; + *count = maxcount; + mask = bitrange(lo, *count); + } else { + /* Check next leaf for some of the blocks we want or need. */ + count1 = *count - (BLIST_BMAP_RADIX - lo); + maxcount -= BLIST_BMAP_RADIX - lo; + hi = blst_next_leaf_alloc(scan, blk, count1, maxcount); + if (hi < count1) /* - * The hint cannot be updated, because the same - * allocation request could be satisfied later, by this - * leaf, if the state of the next leaf changes, and - * without any changes to this leaf. + * The next leaf cannot supply enough blocks to reach + * the minimum required allocation. The hint cannot be + * updated, because the same allocation request could + * be satisfied later, by this leaf, if the state of + * the next leaf changes, and without any changes to + * this leaf. */ return (SWAPBLK_NONE); + *count = BLIST_BMAP_RADIX - lo + hi; hi = BLIST_BMAP_RADIX; } - /* Set the bits of mask at position 'lo' and higher. */ - mask = -mask; if (hi == BLIST_BMAP_RADIX) { /* * Update bighint. There is no allocation bigger than range1 * available in this leaf after this allocation completes. */ scan->bm_bighint = range1; - } else { - /* Clear the bits of mask at position 'hi' and higher. */ - mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi); } /* Clear the allocated bits from this leaf. */ scan->bm_bitmap &= ~mask; @@ -761,15 +837,16 @@ blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) * and we have a few optimizations strewn in as well. 
*/ static daddr_t -blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix) +blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, + int maxcount, u_daddr_t radix) { daddr_t blk, i, r, skip; - u_daddr_t bit, mask; + u_daddr_t mask; bool scan_from_start; int digit; if (radix == BLIST_BMAP_RADIX) - return (blst_leaf_alloc(scan, cursor, count)); + return (blst_leaf_alloc(scan, cursor, count, maxcount)); blk = cursor & -radix; scan_from_start = (cursor == blk); radix /= BLIST_META_RADIX; @@ -796,23 +873,22 @@ blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix) * Examine the nonempty subtree associated with each bit set in mask. */ do { - bit = mask & -mask; - digit = bitpos(bit); + digit = bitpos(mask); i = 1 + digit * skip; - if (count <= scan[i].bm_bighint) { + if (*count <= scan[i].bm_bighint) { /* * The allocation might fit beginning in the i'th subtree. */ r = blst_meta_alloc(&scan[i], cursor + digit * radix, - count, radix); + count, maxcount, radix); if (r != SWAPBLK_NONE) { if (scan[i].bm_bitmap == 0) - scan->bm_bitmap ^= bit; + scan->bm_bitmap ^= bitrange(digit, 1); return (r); } } cursor = blk; - } while ((mask ^= bit) != 0); + } while ((mask ^= bitrange(digit, 1)) != 0); /* * We couldn't allocate count in this subtree. If the whole tree was @@ -820,7 +896,7 @@ blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix) */ if (scan_from_start && !(digit == BLIST_META_RADIX - 1 && scan[i].bm_bighint == BLIST_MAX_ALLOC)) - scan->bm_bighint = count - 1; + scan->bm_bighint = *count - 1; return (SWAPBLK_NONE); } @@ -841,8 +917,9 @@ blst_leaf_free(blmeta_t *scan, daddr_t blk, int count) * count n */ mask = bitrange(blk & BLIST_BMAP_MASK, count); - if (scan->bm_bitmap & mask) - panic("freeing free block"); + KASSERT((scan->bm_bitmap & mask) == 0, + ("freeing free block: %jx, size %d, mask %jx", + (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask)); scan->bm_bitmap |= mask; } @@ -1006,7 +1083,7 @@ static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab) { daddr_t skip; - u_daddr_t bit, mask; + u_daddr_t mask; int digit; if (radix == BLIST_BMAP_RADIX) { @@ -1038,11 +1115,10 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab) mask = scan->bm_bitmap; /* Examine the nonempty subtree associated with each bit set in mask */ do { - bit = mask & -mask; - digit = bitpos(bit); + digit = bitpos(mask); blst_radix_print(&scan[1 + digit * skip], blk + digit * radix, radix, tab); - } while ((mask ^= bit) != 0); + } while ((mask ^= bitrange(digit, 1)) != 0); tab -= 4; printf( @@ -1079,7 +1155,7 @@ main(int ac, char **av) for (;;) { char buf[1024]; long long da = 0; - long long count = 0; + int count = 0, maxcount = 0; printf("%lld/%lld/%lld> ", (long long)blist_avail(bl), (long long)size, (long long)bl->bl_radix); @@ -1088,7 +1164,7 @@ main(int ac, char **av) break; switch(buf[0]) { case 'r': - if (sscanf(buf + 1, "%lld", &count) == 1) { + if (sscanf(buf + 1, "%d", &count) == 1) { blist_resize(&bl, count, 1, M_WAITOK); } else { printf("?\n"); @@ -1104,22 +1180,23 @@ main(int ac, char **av) sbuf_delete(s); break; case 'a': - if (sscanf(buf + 1, "%lld", &count) == 1) { - daddr_t blk = blist_alloc(bl, count); - printf(" R=%08llx\n", (long long)blk); + if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) { + daddr_t blk = blist_alloc(bl, &count, maxcount); + printf(" R=%08llx, c=%08d\n", + (long long)blk, count); } else { printf("?\n"); } break; case 'f': - if (sscanf(buf + 1, "%llx 
%lld", &da, &count) == 2) { + if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { blist_free(bl, da, count); } else { printf("?\n"); } break; case 'l': - if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) { + if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { printf(" n=%jd\n", (intmax_t)blist_fill(bl, da, count)); } else { @@ -1131,31 +1208,24 @@ main(int ac, char **av) puts( "p -print\n" "s -stats\n" - "a %d -allocate\n" + "a %d %d -allocate\n" "f %x %d -free\n" "l %x %d -fill\n" "r %d -resize\n" - "h/? -help" + "h/? -help\n" + "q -quit" ); break; + case 'q': + break; default: printf("?\n"); break; } + if (buf[0] == 'q') + break; } - return(0); -} - -void -panic(const char *ctl, ...) -{ - va_list va; - - va_start(va, ctl); - vfprintf(stderr, ctl, va); - fprintf(stderr, "\n"); - va_end(va); - exit(1); + return (0); } #endif diff --git a/freebsd/sys/kern/subr_bus.c b/freebsd/sys/kern/subr_bus.c index a87c02a5..244f1af3 100644 --- a/freebsd/sys/kern/subr_bus.c +++ b/freebsd/sys/kern/subr_bus.c @@ -2472,13 +2472,31 @@ device_print_prettyname(device_t dev) int device_printf(device_t dev, const char * fmt, ...) { + char buf[128]; + struct sbuf sb; + const char *name; va_list ap; - int retval; + size_t retval; + + retval = 0; + + sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); + sbuf_set_drain(&sb, sbuf_printf_drain, &retval); + + name = device_get_name(dev); + + if (name == NULL) + sbuf_cat(&sb, "unknown: "); + else + sbuf_printf(&sb, "%s%d: ", name, device_get_unit(dev)); - retval = device_print_prettyname(dev); va_start(ap, fmt); - retval += vprintf(fmt, ap); + sbuf_vprintf(&sb, fmt, ap); va_end(ap); + + sbuf_finish(&sb); + sbuf_delete(&sb); + return (retval); } @@ -3050,6 +3068,10 @@ device_detach(device_t dev) PDEBUG(("%s", DEVICENAME(dev))); if (dev->state == DS_BUSY) return (EBUSY); + if (dev->state == DS_ATTACHING) { + device_printf(dev, "device in attaching state! Deferring detach.\n"); + return (EBUSY); + } if (dev->state != DS_ATTACHED) return (0); @@ -3914,6 +3936,95 @@ bus_generic_resume(device_t dev) return (0); } + +/** + * @brief Helper function for implementing BUS_RESET_POST + * + * Bus can use this function to implement common operations of + * re-attaching or resuming the children after the bus itself was + * reset, and after restoring bus-unique state of children. + * + * @param dev The bus + * #param flags DEVF_RESET_* + */ +int +bus_helper_reset_post(device_t dev, int flags) +{ + device_t child; + int error, error1; + + error = 0; + TAILQ_FOREACH(child, &dev->children,link) { + BUS_RESET_POST(dev, child); + error1 = (flags & DEVF_RESET_DETACH) != 0 ? + device_probe_and_attach(child) : + BUS_RESUME_CHILD(dev, child); + if (error == 0 && error1 != 0) + error = error1; + } + return (error); +} + +static void +bus_helper_reset_prepare_rollback(device_t dev, device_t child, int flags) +{ + + child = TAILQ_NEXT(child, link); + if (child == NULL) + return; + TAILQ_FOREACH_FROM(child, &dev->children,link) { + BUS_RESET_POST(dev, child); + if ((flags & DEVF_RESET_DETACH) != 0) + device_probe_and_attach(child); + else + BUS_RESUME_CHILD(dev, child); + } +} + +/** + * @brief Helper function for implementing BUS_RESET_PREPARE + * + * Bus can use this function to implement common operations of + * detaching or suspending the children before the bus itself is + * reset, and then save bus-unique state of children that must + * persists around reset. 
+ * + * @param dev The bus + * #param flags DEVF_RESET_* + */ +int +bus_helper_reset_prepare(device_t dev, int flags) +{ + device_t child; + int error; + + if (dev->state != DS_ATTACHED) + return (EBUSY); + + TAILQ_FOREACH_REVERSE(child, &dev->children, device_list, link) { + if ((flags & DEVF_RESET_DETACH) != 0) { + error = device_get_state(child) == DS_ATTACHED ? + device_detach(child) : 0; + } else { + error = BUS_SUSPEND_CHILD(dev, child); + } + if (error == 0) { + error = BUS_RESET_PREPARE(dev, child); + if (error != 0) { + if ((flags & DEVF_RESET_DETACH) != 0) + device_probe_and_attach(child); + else + BUS_RESUME_CHILD(dev, child); + } + } + if (error != 0) { + bus_helper_reset_prepare_rollback(dev, child, flags); + return (error); + } + } + return (0); +} + /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * @@ -5613,6 +5724,7 @@ devctl2_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, case DEV_CLEAR_DRIVER: case DEV_RESCAN: case DEV_DELETE: + case DEV_RESET: error = priv_check(td, PRIV_DRIVER); if (error == 0) error = find_device(req, &dev); @@ -5839,6 +5951,14 @@ devctl2_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, device_frozen = false; } break; + case DEV_RESET: + if ((req->dr_flags & ~(DEVF_RESET_DETACH)) != 0) { + error = EINVAL; + break; + } + error = BUS_RESET_CHILD(device_get_parent(dev), dev, + req->dr_flags); + break; } #endif /* __rtems__ */ mtx_unlock(&Giant); @@ -5864,8 +5984,9 @@ devctl2_init(void) */ static int obsolete_panic = 0; SYSCTL_INT(_debug, OID_AUTO, obsolete_panic, CTLFLAG_RWTUN, &obsolete_panic, 0, - "Bus debug level"); -/* 0 - don't panic, 1 - panic if already obsolete, 2 - panic if deprecated */ + "Panic when obsolete features are used (0 = never, 1 = if osbolete, " + "2 = if deprecated)"); + static void gone_panic(int major, int running, const char *msg) { @@ -5890,7 +6011,7 @@ _gone_in(int major, const char *msg) gone_panic(major, P_OSREL_MAJOR(__FreeBSD_version), msg); if (P_OSREL_MAJOR(__FreeBSD_version) >= major) printf("Obsolete code will removed soon: %s\n", msg); - else if (P_OSREL_MAJOR(__FreeBSD_version) + 1 == major) + else printf("Deprecated code (to be removed in FreeBSD %d): %s\n", major, msg); } @@ -5903,7 +6024,7 @@ _gone_in_dev(device_t dev, int major, const char *msg) if (P_OSREL_MAJOR(__FreeBSD_version) >= major) device_printf(dev, "Obsolete code will removed soon: %s\n", msg); - else if (P_OSREL_MAJOR(__FreeBSD_version) + 1 == major) + else device_printf(dev, "Deprecated code (to be removed in FreeBSD %d): %s\n", major, msg); diff --git a/freebsd/sys/kern/subr_eventhandler.c b/freebsd/sys/kern/subr_eventhandler.c index e07248bf..6d36653d 100644 --- a/freebsd/sys/kern/subr_eventhandler.c +++ b/freebsd/sys/kern/subr_eventhandler.c @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include diff --git a/freebsd/sys/kern/subr_gtaskqueue.c b/freebsd/sys/kern/subr_gtaskqueue.c index 3f80cd2c..af9b65d4 100644 --- a/freebsd/sys/kern/subr_gtaskqueue.c +++ b/freebsd/sys/kern/subr_gtaskqueue.c @@ -35,7 +35,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include @@ -69,6 +68,8 @@ struct gtaskqueue_busy { static struct gtask * const TB_DRAIN_WAITER = (struct gtask *)0x1; +typedef void (*gtaskqueue_enqueue_fn)(void *context); + struct gtaskqueue { STAILQ_HEAD(, gtask) tq_queue; gtaskqueue_enqueue_fn tq_enqueue; @@ -697,7 +698,7 @@ taskqgroup_find(struct taskqgroup *qgroup, void *uniq) } } if (idx == -1) - panic("taskqgroup_find: 
Failed to pick a qid."); + panic("%s: failed to pick a qid.", __func__); return (idx); } @@ -733,36 +734,36 @@ SYSINIT(tqg_record_smp_started, SI_SUB_SMP, SI_ORDER_FOURTH, void taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask, - void *uniq, int irq, const char *name) + void *uniq, device_t dev, struct resource *irq, const char *name) { #ifndef __rtems__ - cpuset_t mask; - int qid, error; + int cpu, qid, error; #else /* __rtems__ */ int qid; #endif /* __rtems__ */ gtask->gt_uniq = uniq; snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask"); +#ifndef __rtems__ + gtask->gt_dev = dev; gtask->gt_irq = irq; gtask->gt_cpu = -1; +#endif /* __rtems__ */ mtx_lock(&qgroup->tqg_lock); qid = taskqgroup_find(qgroup, uniq); qgroup->tqg_queue[qid].tgc_cnt++; LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list); gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; #ifndef __rtems__ - if (irq != -1 && tqg_smp_started) { - gtask->gt_cpu = qgroup->tqg_queue[qid].tgc_cpu; - CPU_ZERO(&mask); - CPU_SET(qgroup->tqg_queue[qid].tgc_cpu, &mask); + if (dev != NULL && irq != NULL && tqg_smp_started) { + cpu = qgroup->tqg_queue[qid].tgc_cpu; + gtask->gt_cpu = cpu; mtx_unlock(&qgroup->tqg_lock); - error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask); + error = bus_bind_intr(dev, irq, cpu); if (error) - printf("%s: setaffinity failed for %s: %d\n", __func__, gtask->gt_name, error); + printf("%s: binding interrupt failed for %s: %d\n", + __func__, gtask->gt_name, error); } else -#else /* __rtems__ */ - BSD_ASSERT(irq == -1); #endif /* __rtems__ */ mtx_unlock(&qgroup->tqg_lock); } @@ -771,7 +772,6 @@ static void taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask) { #ifndef __rtems__ - cpuset_t mask; int qid, cpu, error; #else /* __rtems__ */ int qid; @@ -781,24 +781,18 @@ taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask) qid = taskqgroup_find(qgroup, gtask->gt_uniq); #ifndef __rtems__ cpu = qgroup->tqg_queue[qid].tgc_cpu; - if (gtask->gt_irq != -1) { + if (gtask->gt_dev != NULL && gtask->gt_irq != NULL) { mtx_unlock(&qgroup->tqg_lock); - - CPU_ZERO(&mask); - CPU_SET(cpu, &mask); - error = intr_setaffinity(gtask->gt_irq, CPU_WHICH_IRQ, &mask); + error = bus_bind_intr(gtask->gt_dev, gtask->gt_irq, cpu); mtx_lock(&qgroup->tqg_lock); if (error) - printf("%s: %s setaffinity failed: %d\n", __func__, gtask->gt_name, error); + printf("%s: binding interrupt failed for %s: %d\n", + __func__, gtask->gt_name, error); } -#else /* __rtems__ */ - BSD_ASSERT(gtask->gt_irq == -1); #endif /* __rtems__ */ qgroup->tqg_queue[qid].tgc_cnt++; - - LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, - gt_list); + LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list); MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL); gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; mtx_unlock(&qgroup->tqg_lock); @@ -806,10 +800,9 @@ taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask) int taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask, - void *uniq, int cpu, int irq, const char *name) + void *uniq, int cpu, device_t dev, struct resource *irq, const char *name) { #ifndef __rtems__ - cpuset_t mask; int i, qid, error; #else /* __rtems__ */ int i, qid; @@ -818,8 +811,11 @@ taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask, qid = -1; gtask->gt_uniq = uniq; snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? 
name : "grouptask"); +#ifndef __rtems__ + gtask->gt_dev = dev; gtask->gt_irq = irq; gtask->gt_cpu = cpu; +#endif /* __rtems__ */ mtx_lock(&qgroup->tqg_lock); if (tqg_smp_started) { for (i = 0; i < qgroup->tqg_cnt; i++) @@ -843,30 +839,28 @@ taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask, mtx_unlock(&qgroup->tqg_lock); #ifndef __rtems__ - CPU_ZERO(&mask); - CPU_SET(cpu, &mask); - if (irq != -1 && tqg_smp_started) { - error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask); + if (dev != NULL && irq != NULL && tqg_smp_started) { + error = bus_bind_intr(dev, irq, cpu); if (error) - printf("%s: setaffinity failed: %d\n", __func__, error); + printf("%s: binding interrupt failed for %s: %d\n", + __func__, gtask->gt_name, error); } #else /* __rtems__ */ - BSD_ASSERT(irq == -1); + BSD_ASSERT(irq == NULL); #endif /* __rtems__ */ return (0); } +#ifndef __rtems__ static int taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtask) { -#ifndef __rtems__ - cpuset_t mask; - int i, qid, irq, cpu, error; -#else /* __rtems__ */ - int i, qid, irq, cpu; -#endif /* __rtems__ */ + device_t dev; + struct resource *irq; + int cpu, error, i, qid; qid = -1; + dev = gtask->gt_dev; irq = gtask->gt_irq; cpu = gtask->gt_cpu; MPASS(tqg_smp_started); @@ -887,20 +881,15 @@ taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtas gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; mtx_unlock(&qgroup->tqg_lock); -#ifndef __rtems__ - CPU_ZERO(&mask); - CPU_SET(cpu, &mask); - - if (irq != -1) { - error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask); + if (dev != NULL && irq != NULL) { + error = bus_bind_intr(dev, irq, cpu); if (error) - printf("%s: setaffinity failed: %d\n", __func__, error); + printf("%s: binding interrupt failed for %s: %d\n", + __func__, gtask->gt_name, error); } -#else /* __rtems__ */ - BSD_ASSERT(irq == -1); -#endif /* __rtems__ */ return (0); } +#endif /* __rtems__ */ void taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask) @@ -913,7 +902,7 @@ taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask) if (qgroup->tqg_queue[i].tgc_taskq == gtask->gt_taskqueue) break; if (i == qgroup->tqg_cnt) - panic("taskqgroup_detach: task %s not in group\n", gtask->gt_name); + panic("%s: task %s not in group", __func__, gtask->gt_name); qgroup->tqg_queue[i].tgc_cnt--; LIST_REMOVE(gtask, gt_list); mtx_unlock(&qgroup->tqg_lock); @@ -941,8 +930,7 @@ taskqgroup_binder(void *ctx) thread_unlock(curthread); if (error) - printf("%s: setaffinity failed: %d\n", __func__, - error); + printf("%s: binding curthread failed: %d\n", __func__, error); #else /* __rtems__ */ sc = rtems_task_set_affinity(RTEMS_SELF, sizeof(mask), &mask); if (sc != RTEMS_SUCCESSFUL) @@ -1053,10 +1041,14 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride) while ((gtask = LIST_FIRST(>ask_head))) { LIST_REMOVE(gtask, gt_list); +#ifndef __rtems__ if (gtask->gt_cpu == -1) +#endif /* __rtems__ */ taskqgroup_attach_deferred(qgroup, gtask); +#ifndef __rtems__ else if (taskqgroup_attach_cpu_deferred(qgroup, gtask)) taskqgroup_attach_deferred(qgroup, gtask); +#endif /* __rtems__ */ } #ifdef INVARIANTS @@ -1115,15 +1107,16 @@ taskqgroup_destroy(struct taskqgroup *qgroup) void taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn, - const char *name) + const char *name) { GROUPTASK_INIT(gtask, 0, fn, ctx); - taskqgroup_attach(qgroup_config, gtask, gtask, -1, name); + taskqgroup_attach(qgroup_config, gtask, gtask, NULL, 
NULL, name); } void taskqgroup_config_gtask_deinit(struct grouptask *gtask) { + taskqgroup_detach(qgroup_config, gtask); } diff --git a/freebsd/sys/kern/subr_kobj.c b/freebsd/sys/kern/subr_kobj.c index a6a888d5..3736f64c 100644 --- a/freebsd/sys/kern/subr_kobj.c +++ b/freebsd/sys/kern/subr_kobj.c @@ -127,35 +127,40 @@ kobj_class_compile_common(kobj_class_t cls, kobj_ops_t ops) cls->ops = ops; } -void -kobj_class_compile(kobj_class_t cls) +static int +kobj_class_compile1(kobj_class_t cls, int mflags) { kobj_ops_t ops; KOBJ_ASSERT(MA_NOTOWNED); - /* - * Allocate space for the compiled ops table. - */ - ops = malloc(sizeof(struct kobj_ops), M_KOBJ, M_NOWAIT); - if (!ops) - panic("%s: out of memory", __func__); + ops = malloc(sizeof(struct kobj_ops), M_KOBJ, mflags); + if (ops == NULL) + return (ENOMEM); - KOBJ_LOCK(); - /* * We may have lost a race for kobj_class_compile here - check * to make sure someone else hasn't already compiled this * class. */ + KOBJ_LOCK(); if (cls->ops) { KOBJ_UNLOCK(); free(ops, M_KOBJ); - return; + return (0); } - kobj_class_compile_common(cls, ops); KOBJ_UNLOCK(); + return (0); +} + +void +kobj_class_compile(kobj_class_t cls) +{ + int error; + + error = kobj_class_compile1(cls, M_WAITOK); + KASSERT(error == 0, ("kobj_class_compile1 returned %d", error)); } void @@ -256,24 +261,6 @@ kobj_class_free(kobj_class_t cls) free(ops, M_KOBJ); } -kobj_t -kobj_create(kobj_class_t cls, - struct malloc_type *mtype, - int mflags) -{ - kobj_t obj; - - /* - * Allocate and initialise the new object. - */ - obj = malloc(cls->size, mtype, mflags | M_ZERO); - if (!obj) - return NULL; - kobj_init(obj, cls); - - return obj; -} - static void kobj_init_common(kobj_t obj, kobj_class_t cls) { @@ -282,30 +269,52 @@ kobj_init_common(kobj_t obj, kobj_class_t cls) cls->refs++; } -void -kobj_init(kobj_t obj, kobj_class_t cls) +static int +kobj_init1(kobj_t obj, kobj_class_t cls, int mflags) { - KOBJ_ASSERT(MA_NOTOWNED); - retry: - KOBJ_LOCK(); + int error; - /* - * Consider compiling the class' method table. - */ - if (!cls->ops) { + KOBJ_LOCK(); + while (cls->ops == NULL) { /* * kobj_class_compile doesn't want the lock held * because of the call to malloc - we drop the lock * and re-try. */ KOBJ_UNLOCK(); - kobj_class_compile(cls); - goto retry; + error = kobj_class_compile1(cls, mflags); + if (error != 0) + return (error); + KOBJ_LOCK(); } - kobj_init_common(obj, cls); - KOBJ_UNLOCK(); + return (0); +} + +kobj_t +kobj_create(kobj_class_t cls, struct malloc_type *mtype, int mflags) +{ + kobj_t obj; + + obj = malloc(cls->size, mtype, mflags | M_ZERO); + if (obj == NULL) + return (NULL); + if (kobj_init1(obj, cls, mflags) != 0) { + free(obj, mtype); + return (NULL); + } + return (obj); +} + +void +kobj_init(kobj_t obj, kobj_class_t cls) +{ + int error; + + error = kobj_init1(obj, cls, M_NOWAIT); + if (error != 0) + panic("kobj_init1 failed: error %d", error); } void diff --git a/freebsd/sys/kern/subr_lock.c b/freebsd/sys/kern/subr_lock.c index c2587cd0..53d99743 100644 --- a/freebsd/sys/kern/subr_lock.c +++ b/freebsd/sys/kern/subr_lock.c @@ -4,7 +4,6 @@ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 John Baldwin - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -258,7 +257,9 @@ struct lock_prof_cpu { struct lock_prof_type lpc_types[2]; /* One for spin one for other. 
*/ }; -struct lock_prof_cpu *lp_cpu[MAXCPU]; +DPCPU_DEFINE_STATIC(struct lock_prof_cpu, lp); +#define LP_CPU_SELF (DPCPU_PTR(lp)) +#define LP_CPU(cpu) (DPCPU_ID_PTR((cpu), lp)) volatile int __read_mostly lock_prof_enable; static volatile int lock_prof_resetting; @@ -304,11 +305,9 @@ lock_prof_init(void *arg) { int cpu; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - lp_cpu[cpu] = malloc(sizeof(*lp_cpu[cpu]), M_DEVBUF, - M_WAITOK | M_ZERO); - lock_prof_init_type(&lp_cpu[cpu]->lpc_types[0]); - lock_prof_init_type(&lp_cpu[cpu]->lpc_types[1]); + CPU_FOREACH(cpu) { + lock_prof_init_type(&LP_CPU(cpu)->lpc_types[0]); + lock_prof_init_type(&LP_CPU(cpu)->lpc_types[1]); } } SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL); @@ -347,15 +346,15 @@ lock_prof_reset(void) * before we zero the structures. Some items may still be linked * into per-thread lists as well. */ - for (cpu = 0; cpu <= mp_maxid; cpu++) { - lpc = lp_cpu[cpu]; + CPU_FOREACH(cpu) { + lpc = LP_CPU(cpu); for (i = 0; i < LPROF_CACHE_SIZE; i++) { LIST_REMOVE(&lpc->lpc_types[0].lpt_objs[i], lpo_link); LIST_REMOVE(&lpc->lpc_types[1].lpt_objs[i], lpo_link); } } - for (cpu = 0; cpu <= mp_maxid; cpu++) { - lpc = lp_cpu[cpu]; + CPU_FOREACH(cpu) { + lpc = LP_CPU(cpu); bzero(lpc, sizeof(*lpc)); lock_prof_init_type(&lpc->lpc_types[0]); lock_prof_init_type(&lpc->lpc_types[1]); @@ -395,10 +394,8 @@ lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash, dst->class = match->class; dst->name = match->name; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (lp_cpu[cpu] == NULL) - continue; - type = &lp_cpu[cpu]->lpc_types[spin]; + CPU_FOREACH(cpu) { + type = &LP_CPU(cpu)->lpc_types[spin]; SLIST_FOREACH(l, &type->lpt_hash[hash], link) { if (l->ticks == t) continue; @@ -416,7 +413,6 @@ lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash, dst->cnt_contest_locking += l->cnt_contest_locking; } } - } static void @@ -455,11 +451,9 @@ dump_lock_prof_stats(SYSCTL_HANDLER_ARGS) lock_prof_enable = 0; quiesce_all_cpus("profstat", 0); t = ticks; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (lp_cpu[cpu] == NULL) - continue; - lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[0], sb, 0, t); - lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[1], sb, 1, t); + CPU_FOREACH(cpu) { + lock_prof_type_stats(&LP_CPU(cpu)->lpc_types[0], sb, 0, t); + lock_prof_type_stats(&LP_CPU(cpu)->lpc_types[1], sb, 1, t); } lock_prof_enable = enabled; @@ -525,7 +519,7 @@ lock_profile_lookup(struct lock_object *lo, int spin, const char *file, p = unknown; hash = (uintptr_t)lo->lo_name * 31 + (uintptr_t)p * 31 + line; hash &= LPROF_HASH_MASK; - type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; + type = &LP_CPU_SELF->lpc_types[spin]; head = &type->lpt_hash[hash]; SLIST_FOREACH(lp, head, link) { if (lp->line == line && lp->file == p && @@ -560,7 +554,7 @@ lock_profile_object_lookup(struct lock_object *lo, int spin, const char *file, if (l->lpo_obj == lo && l->lpo_file == file && l->lpo_line == line) return (l); - type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; + type = &LP_CPU_SELF->lpc_types[spin]; l = LIST_FIRST(&type->lpt_lpoalloc); if (l == NULL) { lock_prof_rejected++; @@ -696,7 +690,7 @@ lock_profile_release_lock(struct lock_object *lo) lp->cnt_cur += l->lpo_cnt; release: LIST_REMOVE(l, lpo_link); - type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; + type = &LP_CPU_SELF->lpc_types[spin]; LIST_INSERT_HEAD(&type->lpt_lpoalloc, l, lpo_link); out: critical_exit(); diff --git a/freebsd/sys/kern/subr_pcpu.c b/freebsd/sys/kern/subr_pcpu.c index 
0ab77996..a3a06c78 100644 --- a/freebsd/sys/kern/subr_pcpu.c +++ b/freebsd/sys/kern/subr_pcpu.c @@ -136,24 +136,20 @@ SYSINIT(dpcpu, SI_SUB_KLD, SI_ORDER_FIRST, dpcpu_startup, NULL); /* * UMA_PCPU_ZONE zones, that are available for all kernel * consumers. Right now 64 bit zone is used for counter(9) - * and pointer zone is used by flowtable. + * and int zone is used for mount point counters. */ +uma_zone_t pcpu_zone_int; uma_zone_t pcpu_zone_64; -uma_zone_t pcpu_zone_ptr; static void pcpu_zones_startup(void) { + pcpu_zone_int = uma_zcreate("int pcpu", sizeof(int), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); pcpu_zone_64 = uma_zcreate("64 pcpu", sizeof(uint64_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); - - if (sizeof(uint64_t) == sizeof(void *)) - pcpu_zone_ptr = pcpu_zone_64; - else - pcpu_zone_ptr = uma_zcreate("ptr pcpu", sizeof(void *), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); } SYSINIT(pcpu_zones, SI_SUB_VM, SI_ORDER_ANY, pcpu_zones_startup, NULL); diff --git a/freebsd/sys/kern/subr_prf.c b/freebsd/sys/kern/subr_prf.c index 2b45c13e..ed3c8498 100644 --- a/freebsd/sys/kern/subr_prf.c +++ b/freebsd/sys/kern/subr_prf.c @@ -70,6 +70,8 @@ __FBSDID("$FreeBSD$"); #include #endif /* __rtems__ */ #include +#else /* !_KERNEL */ +#include #endif #include #include @@ -1300,3 +1302,46 @@ sbuf_putbuf(struct sbuf *sb) printf("%s", sbuf_data(sb)); } #endif /* __rtems__ */ + +int +sbuf_printf_drain(void *arg, const char *data, int len) +{ +#ifndef __rtems__ + size_t *retvalptr; + int r; +#ifdef _KERNEL + char *dataptr; + char oldchr; + + /* + * This is allowed as an extra byte is always resvered for + * terminating NUL byte. Save and restore the byte because + * we might be flushing a record, and there may be valid + * data after the buffer. 
+ */ + oldchr = data[len]; + dataptr = __DECONST(char *, data); + dataptr[len] = '\0'; + + prf_putbuf(dataptr, TOLOG | TOCONS, -1); + r = len; + + dataptr[len] = oldchr; + +#else /* !_KERNEL */ + + r = printf("%.*s", len, data); + if (r < 0) + return (-errno); + +#endif + + retvalptr = arg; + if (retvalptr != NULL) + *retvalptr += r; + + return (r); +#else /* __rtems__ */ + return (printf("%.*s", len, data)); +#endif /* __rtems__ */ +} diff --git a/freebsd/sys/kern/subr_sbuf.c b/freebsd/sys/kern/subr_sbuf.c index b51ed52c..42e6f8f0 100644 --- a/freebsd/sys/kern/subr_sbuf.c +++ b/freebsd/sys/kern/subr_sbuf.c @@ -58,11 +58,11 @@ __FBSDID("$FreeBSD$"); #ifdef _KERNEL static MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers"); -#define SBMALLOC(size) malloc(size, M_SBUF, M_WAITOK|M_ZERO) +#define SBMALLOC(size, flags) malloc(size, M_SBUF, (flags) | M_ZERO) #define SBFREE(buf) free(buf, M_SBUF) #else /* _KERNEL */ #define KASSERT(e, m) -#define SBMALLOC(size) calloc(1, size) +#define SBMALLOC(size, flags) calloc(1, size) #define SBFREE(buf) free(buf) #endif /* _KERNEL */ @@ -72,6 +72,7 @@ static MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers"); #define SBUF_ISDYNAMIC(s) ((s)->s_flags & SBUF_DYNAMIC) #define SBUF_ISDYNSTRUCT(s) ((s)->s_flags & SBUF_DYNSTRUCT) #define SBUF_ISFINISHED(s) ((s)->s_flags & SBUF_FINISHED) +#define SBUF_ISDRAINATEOL(s) ((s)->s_flags & SBUF_DRAINATEOL) #define SBUF_HASROOM(s) ((s)->s_len < (s)->s_size - 1) #define SBUF_FREESPACE(s) ((s)->s_size - ((s)->s_len + 1)) #define SBUF_CANEXTEND(s) ((s)->s_flags & SBUF_AUTOEXTEND) @@ -79,6 +80,8 @@ static MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers"); #define SBUF_NULINCLUDED(s) ((s)->s_flags & SBUF_INCLUDENUL) #define SBUF_ISDRAINTOEOR(s) ((s)->s_flags & SBUF_DRAINTOEOR) #define SBUF_DODRAINTOEOR(s) (SBUF_ISSECTION(s) && SBUF_ISDRAINTOEOR(s)) +#define SBUF_MALLOCFLAG(s) \ + (((s)->s_flags & SBUF_NOWAIT) ? M_NOWAIT : M_WAITOK) /* * Set / clear flags @@ -173,7 +176,7 @@ sbuf_extend(struct sbuf *s, int addlen) if (!SBUF_CANEXTEND(s)) return (-1); newsize = sbuf_extendsize(s->s_size + addlen); - newbuf = SBMALLOC(newsize); + newbuf = SBMALLOC(newsize, SBUF_MALLOCFLAG(s)); if (newbuf == NULL) return (-1); memcpy(newbuf, s->s_buf, s->s_size); @@ -186,39 +189,6 @@ sbuf_extend(struct sbuf *s, int addlen) return (0); } -/* - * Initialize the internals of an sbuf. - * If buf is non-NULL, it points to a static or already-allocated string - * big enough to hold at least length characters. - */ -static struct sbuf * -sbuf_newbuf(struct sbuf *s, char *buf, int length, int flags) -{ - - memset(s, 0, sizeof(*s)); - s->s_flags = flags; - s->s_size = length; - s->s_buf = buf; - - if ((s->s_flags & SBUF_AUTOEXTEND) == 0) { - KASSERT(s->s_size >= SBUF_MINSIZE, - ("attempt to create an sbuf smaller than %d bytes", - SBUF_MINSIZE)); - } - - if (s->s_buf != NULL) - return (s); - - if ((flags & SBUF_AUTOEXTEND) != 0) - s->s_size = sbuf_extendsize(s->s_size); - - s->s_buf = SBMALLOC(s->s_size); - if (s->s_buf == NULL) - return (NULL); - SBUF_SETFLAG(s, SBUF_DYNAMIC); - return (s); -} - /* * Initialize an sbuf. 
* If buf is non-NULL, it points to a static or already-allocated string @@ -232,19 +202,56 @@ sbuf_new(struct sbuf *s, char *buf, int length, int flags) ("attempt to create an sbuf of negative length (%d)", length)); KASSERT((flags & ~SBUF_USRFLAGMSK) == 0, ("%s called with invalid flags", __func__)); + KASSERT((flags & SBUF_AUTOEXTEND) || length >= SBUF_MINSIZE, + ("sbuf buffer %d smaller than minimum %d bytes", length, + SBUF_MINSIZE)); flags &= SBUF_USRFLAGMSK; - if (s != NULL) - return (sbuf_newbuf(s, buf, length, flags)); - s = SBMALLOC(sizeof(*s)); - if (s == NULL) - return (NULL); - if (sbuf_newbuf(s, buf, length, flags) == NULL) { - SBFREE(s); - return (NULL); + /* + * Allocate 'DYNSTRUCT' sbuf from the heap, if NULL 's' was provided. + */ + if (s == NULL) { + s = SBMALLOC(sizeof(*s), + (flags & SBUF_NOWAIT) ? M_NOWAIT : M_WAITOK); + if (s == NULL) + goto out; + SBUF_SETFLAG(s, SBUF_DYNSTRUCT); + } else { + /* + * DYNSTRUCT SBMALLOC sbufs are allocated with M_ZERO, but + * user-provided sbuf objects must be initialized. + */ + memset(s, 0, sizeof(*s)); + } + + s->s_flags |= flags; + s->s_size = length; + s->s_buf = buf; + /* + * Never-written sbufs do not need \n termination. + */ + SBUF_SETFLAG(s, SBUF_DRAINATEOL); + + /* + * Allocate DYNAMIC, i.e., heap data buffer backing the sbuf, if no + * buffer was provided. + */ + if (s->s_buf == NULL) { + if (SBUF_CANEXTEND(s)) + s->s_size = sbuf_extendsize(s->s_size); + s->s_buf = SBMALLOC(s->s_size, SBUF_MALLOCFLAG(s)); + if (s->s_buf == NULL) + goto out; + SBUF_SETFLAG(s, SBUF_DYNAMIC); + } + +out: + if (s != NULL && s->s_buf == NULL) { + if (SBUF_ISDYNSTRUCT(s)) + SBFREE(s); + s = NULL; } - SBUF_SETFLAG(s, SBUF_DYNSTRUCT); return (s); } @@ -310,6 +317,8 @@ sbuf_clear(struct sbuf *s) assert_sbuf_integrity(s); /* don't care if it's finished or not */ + KASSERT(s->s_drain_func == NULL, + ("%s makes no sense on sbuf %p with drain", __func__, s)); SBUF_CLEARFLAG(s, SBUF_FINISHED); s->s_error = 0; @@ -343,6 +352,21 @@ sbuf_setpos(struct sbuf *s, ssize_t pos) return (0); } +/* + * Drain into a counter. Counts amount of data without producing output. + * Useful for cases like sysctl, where user may first request only size. + * This allows to avoid pointless allocation/freeing of large buffers. + */ +int +sbuf_count_drain(void *arg, const char *data __unused, int len) +{ + size_t *sizep; + + sizep = (size_t *)arg; + *sizep += len; + return (len); +} + /* * Set up a drain function and argument on an sbuf to flush data to * when the sbuf buffer overflows. @@ -369,6 +393,7 @@ sbuf_drain(struct sbuf *s) KASSERT(s->s_len > 0, ("Shouldn't drain empty sbuf %p", s)); KASSERT(s->s_error == 0, ("Called %s with error on %p", __func__, s)); + if (SBUF_DODRAINTOEOR(s) && s->s_rec_off == 0) return (s->s_error = EDEADLK); len = s->s_drain_func(s->s_drain_arg, s->s_buf, @@ -385,8 +410,18 @@ sbuf_drain(struct sbuf *s) * Fast path for the expected case where all the data was * drained. */ - if (s->s_len == 0) + if (s->s_len == 0) { + /* + * When the s_buf is entirely drained, we need to remember if + * the last character was a '\n' or not for + * sbuf_nl_terminate(). + */ + if (s->s_buf[len - 1] == '\n') + SBUF_SETFLAG(s, SBUF_DRAINATEOL); + else + SBUF_CLEARFLAG(s, SBUF_DRAINATEOL); return (0); + } /* * Move the remaining characters to the beginning of the * string. @@ -701,6 +736,38 @@ sbuf_putc(struct sbuf *s, int c) return (0); } +/* + * Append a trailing newline to a non-empty sbuf, if one is not already + * present. 
Handles sbufs with drain functions correctly. + */ +int +sbuf_nl_terminate(struct sbuf *s) +{ + + assert_sbuf_integrity(s); + assert_sbuf_state(s, 0); + + /* + * If the s_buf isn't empty, the last byte is simply s_buf[s_len - 1]. + * + * If the s_buf is empty because a drain function drained it, we + * remember if the last byte was a \n with the SBUF_DRAINATEOL flag in + * sbuf_drain(). + * + * In either case, we only append a \n if the previous character was + * something else. + */ + if (s->s_len == 0) { + if (!SBUF_ISDRAINATEOL(s)) + sbuf_put_byte(s, '\n'); + } else if (s->s_buf[s->s_len - 1] != '\n') + sbuf_put_byte(s, '\n'); + + if (s->s_error != 0) + return (-1); + return (0); +} + /* * Trim whitespace characters from end of an sbuf. */ diff --git a/freebsd/sys/kern/subr_sleepqueue.c b/freebsd/sys/kern/subr_sleepqueue.c index 57681cce..9665c02f 100644 --- a/freebsd/sys/kern/subr_sleepqueue.c +++ b/freebsd/sys/kern/subr_sleepqueue.c @@ -5,7 +5,6 @@ * * Copyright (c) 2004 John Baldwin * Copyright (c) 2015 embedded brains GmbH - * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -132,7 +131,7 @@ CTASSERT(powerof2(SC_TABLESIZE)); * c - sleep queue chain lock */ struct sleepqueue { - TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */ + struct threadqueue sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */ u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */ LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */ LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */ @@ -593,6 +592,19 @@ sleepq_catch_signals(void *wchan, int pri) } else { mtx_unlock(&ps->ps_mtx); } + + /* + * Do not go into sleep if this thread was the + * ptrace(2) attach leader. cursig() consumed + * SIGSTOP from PT_ATTACH, but we usually act + * on the signal by interrupting sleep, and + * should do that here as well. + */ + if ((td->td_dbgflags & TDB_FSTP) != 0) { + if (ret == 0) + ret = EINTR; + td->td_dbgflags &= ~TDB_FSTP; + } } /* * Lock the per-process spinlock prior to dropping the PROC_LOCK @@ -1127,13 +1139,15 @@ sleepq_init(void *mem, int size, int flags) } /* - * Find the highest priority thread sleeping on a wait channel and resume it. + * Find thread sleeping on a wait channel and resume it. */ int sleepq_signal(void *wchan, int flags, int pri, int queue) { + struct sleepqueue_chain *sc; struct sleepqueue *sq; #ifndef __rtems__ + struct threadqueue *head; struct thread *td, *besttd; #else /* __rtems__ */ struct thread *besttd; @@ -1150,16 +1164,33 @@ sleepq_signal(void *wchan, int flags, int pri, int queue) ("%s: mismatch between sleep/wakeup and cv_*", __func__)); #ifndef __rtems__ - /* - * Find the highest priority thread on the queue. If there is a - * tie, use the thread that first appears in the queue as it has - * been sleeping the longest since threads are always added to - * the tail of sleep queues. - */ - besttd = TAILQ_FIRST(&sq->sq_blocked[queue]); - TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) { - if (td->td_priority < besttd->td_priority) + head = &sq->sq_blocked[queue]; + if (flags & SLEEPQ_UNFAIR) { + /* + * Find the most recently sleeping thread, but try to + * skip threads still in process of context switch to + * avoid spinning on the thread lock. 
+ */ + sc = SC_LOOKUP(wchan); + besttd = TAILQ_LAST_FAST(head, thread, td_slpq); + while (besttd->td_lock != &sc->sc_lock) { + td = TAILQ_PREV_FAST(besttd, head, thread, td_slpq); + if (td == NULL) + break; besttd = td; + } + } else { + /* + * Find the highest priority thread on the queue. If there + * is a tie, use the thread that first appears in the queue + * as it has been sleeping the longest since threads are + * always added to the tail of sleep queues. + */ + besttd = td = TAILQ_FIRST(head); + while ((td = TAILQ_NEXT(td, td_slpq)) != NULL) { + if (td->td_priority < besttd->td_priority) + besttd = td; + } } #else /* __rtems__ */ besttd = TAILQ_FIRST(&sq->sq_blocked[queue]); diff --git a/freebsd/sys/kern/subr_taskqueue.c b/freebsd/sys/kern/subr_taskqueue.c index 39d9f939..67e62fc8 100644 --- a/freebsd/sys/kern/subr_taskqueue.c +++ b/freebsd/sys/kern/subr_taskqueue.c @@ -841,7 +841,7 @@ taskqueue_thread_enqueue(void *context) tqp = context; tq = *tqp; - wakeup_one(tq); + wakeup_any(tq); } TASKQUEUE_DEFINE(swi, taskqueue_swi_enqueue, NULL, diff --git a/freebsd/sys/kern/sys_generic.c b/freebsd/sys/kern/sys_generic.c index cc208d6e..1bc4fa6b 100644 --- a/freebsd/sys/kern/sys_generic.c +++ b/freebsd/sys/kern/sys_generic.c @@ -772,7 +772,11 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) fp = NULL; /* fhold() was not called yet */ goto out; } - fhold(fp); + if (!fhold(fp)) { + error = EBADF; + fp = NULL; + goto out; + } if (locked == LA_SLOCKED) { FILEDESC_SUNLOCK(fdp); locked = LA_UNLOCKED; diff --git a/freebsd/sys/kern/sys_pipe.c b/freebsd/sys/kern/sys_pipe.c index 050d63a4..d9b502f0 100755 --- a/freebsd/sys/kern/sys_pipe.c +++ b/freebsd/sys/kern/sys_pipe.c @@ -177,7 +177,6 @@ struct fileops pipeops = { }; #else /* __rtems__ */ #define PIPE_NODIRECT -#define PRIBIO (0) static int rtems_bsd_pipe_open(rtems_libio_t *iop, const char *path, int oflag, mode_t mode); @@ -433,9 +432,7 @@ void pipe_dtor(struct pipe *dpipe) { struct pipe *peer; - ino_t ino; - ino = dpipe->pipe_ino; peer = (dpipe->pipe_state & PIPE_NAMED) != 0 ? dpipe->pipe_peer : NULL; funsetown(&dpipe->pipe_sigio); pipeclose(dpipe); @@ -802,11 +799,9 @@ pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred, /* * Direct copy, bypassing a kernel buffer. 
*/ - } else if ((size = rpipe->pipe_map.cnt) && - (rpipe->pipe_state & PIPE_DIRECTW)) { + } else if ((size = rpipe->pipe_map.cnt) != 0) { if (size > uio->uio_resid) size = (u_int) uio->uio_resid; - PIPE_UNLOCK(rpipe); error = uiomove_fromphys(rpipe->pipe_map.ms, rpipe->pipe_map.pos, size, uio); @@ -817,7 +812,7 @@ pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred, rpipe->pipe_map.pos += size; rpipe->pipe_map.cnt -= size; if (rpipe->pipe_map.cnt == 0) { - rpipe->pipe_state &= ~(PIPE_DIRECTW|PIPE_WANTW); + rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } #endif @@ -984,32 +979,33 @@ pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio) u_int size; int i; - PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); - KASSERT(wpipe->pipe_state & PIPE_DIRECTW, - ("Clone attempt on non-direct write pipe!")); + PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, + ("%s: PIPE_DIRECTW set on %p", __func__, wpipe)); + KASSERT(wpipe->pipe_map.cnt == 0, + ("%s: pipe map for %p contains residual data", __func__, wpipe)); if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; else size = uio->uio_iov->iov_len; - if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, + wpipe->pipe_state |= PIPE_DIRECTW; + PIPE_UNLOCK(wpipe); + i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, - wpipe->pipe_map.ms, PIPENPAGES)) < 0) + wpipe->pipe_map.ms, PIPENPAGES); + PIPE_LOCK(wpipe); + if (i < 0) { + wpipe->pipe_state &= ~PIPE_DIRECTW; return (EFAULT); + } -/* - * set up the control block - */ wpipe->pipe_map.npages = i; wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_map.cnt = size; -/* - * and update the uio data - */ - uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; if (uio->uio_iov->iov_len == 0) @@ -1020,13 +1016,19 @@ pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio) } /* - * unmap and unwire the process buffer + * Unwire the process buffer. 
*/ static void pipe_destroy_write_buffer(struct pipe *wpipe) { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0, + ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe)); + KASSERT(wpipe->pipe_map.cnt == 0, + ("%s: pipe map for %p contains residual data", __func__, wpipe)); + + wpipe->pipe_state &= ~PIPE_DIRECTW; vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages); wpipe->pipe_map.npages = 0; } @@ -1045,13 +1047,16 @@ pipe_clone_write_buffer(struct pipe *wpipe) int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); + KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0, + ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe)); + size = wpipe->pipe_map.cnt; pos = wpipe->pipe_map.pos; + wpipe->pipe_map.cnt = 0; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; - wpipe->pipe_state &= ~PIPE_DIRECTW; PIPE_UNLOCK(wpipe); iov.iov_base = wpipe->pipe_buffer.buffer; @@ -1090,7 +1095,7 @@ retry: pipeunlock(wpipe); goto error1; } - while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); @@ -1105,7 +1110,6 @@ retry: else goto retry; } - wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; @@ -1122,20 +1126,15 @@ retry: goto retry; } - wpipe->pipe_state |= PIPE_DIRECTW; - - PIPE_UNLOCK(wpipe); error = pipe_build_write_buffer(wpipe, uio); - PIPE_LOCK(wpipe); if (error) { - wpipe->pipe_state &= ~PIPE_DIRECTW; pipeunlock(wpipe); goto error1; } - error = 0; - while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { + while (wpipe->pipe_map.cnt != 0) { if (wpipe->pipe_state & PIPE_EOF) { + wpipe->pipe_map.cnt = 0; pipe_destroy_write_buffer(wpipe); pipeselwakeup(wpipe); pipeunlock(wpipe); @@ -1152,20 +1151,19 @@ retry: error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); pipelock(wpipe, 0); + if (error != 0) + break; } if (wpipe->pipe_state & PIPE_EOF) error = EPIPE; - if (wpipe->pipe_state & PIPE_DIRECTW) { - /* - * this bit of trickery substitutes a kernel buffer for - * the process that might be going away. - */ + if (error == EINTR || error == ERESTART) pipe_clone_write_buffer(wpipe); - } else { + else pipe_destroy_write_buffer(wpipe); - } pipeunlock(wpipe); + KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, + ("pipe %p leaked PIPE_DIRECTW", wpipe)); return (error); error1: @@ -1290,7 +1288,7 @@ pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred, * pipe buffer. We break out if a signal occurs or the * reader goes away. 
*/ - if (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_map.cnt != 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); @@ -1586,7 +1584,7 @@ pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, PIPE_UNLOCK(mpipe); return (0); } - if (mpipe->pipe_state & PIPE_DIRECTW) + if (mpipe->pipe_map.cnt != 0) *(int *)data = mpipe->pipe_map.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; @@ -1663,8 +1661,7 @@ pipe_poll(struct file *fp, int events, struct ucred *active_cred, #else /* __rtems__ */ if (rtems_bsd_libio_flags_to_fflag(fp->f_io.flags) & FREAD && events & (POLLIN | POLLRDNORM)) #endif /* __rtems__ */ - if ((rpipe->pipe_state & PIPE_DIRECTW) || - (rpipe->pipe_buffer.cnt > 0)) + if (rpipe->pipe_map.cnt > 0 || rpipe->pipe_buffer.cnt > 0) revents |= events & (POLLIN | POLLRDNORM); #ifndef __rtems__ @@ -1674,7 +1671,7 @@ pipe_poll(struct file *fp, int events, struct ucred *active_cred, #endif /* __rtems__ */ if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF) || - (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && + ((wpipe->pipe_state & PIPE_DIRECTW) == 0 && ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF || wpipe->pipe_buffer.size == 0))) revents |= events & (POLLOUT | POLLWRNORM); @@ -1683,7 +1680,7 @@ pipe_poll(struct file *fp, int events, struct ucred *active_cred, (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND); #ifndef __rtems__ if (rpipe->pipe_state & PIPE_NAMED && fp->f_flag & FREAD && levents && - fp->f_seqcount == rpipe->pipe_wgen) + fp->f_pipegen == rpipe->pipe_wgen) #else /* __rtems__ */ if (rpipe->pipe_state & PIPE_NAMED && rtems_bsd_libio_flags_to_fflag(fp->f_io.flags) & FREAD && levents) #endif /* __rtems__ */ @@ -1792,7 +1789,7 @@ pipe_stat(struct pipe *pipe, struct stat *ub) #endif /* __rtems__ */ ub->st_mode = S_IFIFO; ub->st_blksize = PAGE_SIZE; - if (pipe->pipe_state & PIPE_DIRECTW) + if (pipe->pipe_map.cnt != 0) ub->st_size = pipe->pipe_map.cnt; else ub->st_size = pipe->pipe_buffer.cnt; @@ -2081,7 +2078,7 @@ filt_piperead(struct knote *kn, long hint) PIPE_LOCK_ASSERT(rpipe, MA_OWNED); kn->kn_data = rpipe->pipe_buffer.cnt; - if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) + if (kn->kn_data == 0) kn->kn_data = rpipe->pipe_map.cnt; if ((rpipe->pipe_state & PIPE_EOF) || @@ -2099,15 +2096,19 @@ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *wpipe; - + + /* + * If this end of the pipe is closed, the knote was removed from the + * knlist and the list lock (i.e., the pipe lock) is therefore not held. + */ wpipe = kn->kn_hook; - PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_data = 0; kn->kn_flags |= EV_EOF; return (1); } + PIPE_LOCK_ASSERT(wpipe, MA_OWNED); kn->kn_data = (wpipe->pipe_buffer.size > 0) ? 
(wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) : PIPE_BUF; if (wpipe->pipe_state & PIPE_DIRECTW) diff --git a/freebsd/sys/kern/tty.c b/freebsd/sys/kern/tty.c index 5d9c8a57..ee46a44f 100644 --- a/freebsd/sys/kern/tty.c +++ b/freebsd/sys/kern/tty.c @@ -95,7 +95,7 @@ static const char *dev_console_filename; FLUSHO|NOKERNINFO|NOFLSH) #define TTYSUP_CFLAG (CIGNORE|CSIZE|CSTOPB|CREAD|PARENB|PARODD|\ HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\ - CDSR_OFLOW|CCAR_OFLOW) + CDSR_OFLOW|CCAR_OFLOW|CNO_RTSDTR) #define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT) @@ -336,7 +336,8 @@ ttydev_open(struct cdev *dev, int oflags, int devtype __unused, if (TTY_CALLOUT(tp, dev) || dev == dev_console) tp->t_termios.c_cflag |= CLOCAL; - ttydevsw_modem(tp, SER_DTR|SER_RTS, 0); + if ((tp->t_termios.c_cflag & CNO_RTSDTR) == 0) + ttydevsw_modem(tp, SER_DTR|SER_RTS, 0); error = ttydevsw_open(tp); if (error != 0) @@ -1147,6 +1148,9 @@ tty_rel_free(struct tty *tp) return; } + /* Stop asynchronous I/O. */ + funsetown(&tp->t_sigio); + /* TTY can be deallocated. */ dev = tp->t_dev; tp->t_dev = NULL; diff --git a/freebsd/sys/kern/uipc_mbuf.c b/freebsd/sys/kern/uipc_mbuf.c index 185d14a0..2f1768da 100644 --- a/freebsd/sys/kern/uipc_mbuf.c +++ b/freebsd/sys/kern/uipc_mbuf.c @@ -51,7 +51,11 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include +#include +#include +#include SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, "struct mbuf *", "mbufinfo_t *", @@ -204,7 +208,7 @@ mb_dupcl(struct mbuf *n, struct mbuf *m) else bcopy(&m->m_ext, &n->m_ext, m_ext_copylen); n->m_flags |= M_EXT; - n->m_flags |= m->m_flags & M_RDONLY; + n->m_flags |= m->m_flags & (M_RDONLY | M_NOMAP); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { @@ -248,7 +252,8 @@ m_demote(struct mbuf *m0, int all, int flags) __func__, m, m0)); if (m->m_flags & M_PKTHDR) m_demote_pkthdr(m); - m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags); + m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | + M_NOMAP | flags); } } @@ -343,6 +348,9 @@ m_pkthdr_init(struct mbuf *m, int how) #endif m->m_data = m->m_pktdat; bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); +#ifdef NUMA + m->m_pkthdr.numa_domain = M_NODOM; +#endif #ifdef MAC /* If the label init fails, fail the alloc */ error = mac_mbuf_init(m, how); @@ -375,12 +383,17 @@ m_move_pkthdr(struct mbuf *to, struct mbuf *from) if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif - to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); + to->m_flags = (from->m_flags & M_COPYFLAGS) | + (to->m_flags & (M_EXT | M_NOMAP)); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; /* especially tags */ SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ from->m_flags &= ~M_PKTHDR; + if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) { + from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; + from->m_pkthdr.snd_tag = NULL; + } } /* @@ -409,10 +422,13 @@ m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how) if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif - to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); + to->m_flags = (from->m_flags & M_COPYFLAGS) | + (to->m_flags & (M_EXT | M_NOMAP)); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; + if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) + m_snd_tag_ref(from->m_pkthdr.snd_tag); SLIST_INIT(&to->m_pkthdr.tags); return (m_tag_copy_chain(to, from, how)); } @@ 
-572,6 +588,32 @@ nospace: return (NULL); } +#ifndef __rtems__ +static void +m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp) +{ + struct iovec iov; + struct uio uio; + int error; + + KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off)); + KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len)); + KASSERT(off < m->m_len, + ("m_copyfromunmapped: len exceeds mbuf length")); + iov.iov_base = cp; + iov.iov_len = len; + uio.uio_resid = len; + uio.uio_iov = &iov; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_rw = UIO_READ; + error = m_unmappedtouio(m, off, &uio, len); + KASSERT(error == 0, ("m_unmappedtouio failed: off %d, len %d", off, + len)); +} +#endif /* __rtems__ */ + /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. @@ -593,7 +635,12 @@ m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); - bcopy(mtod(m, caddr_t) + off, cp, count); +#ifndef __rtems__ + if ((m->m_flags & M_NOMAP) != 0) + m_copyfromunmapped(m, off, count, cp); + else +#endif /* __rtems__ */ + bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; @@ -688,6 +735,7 @@ m_cat(struct mbuf *m, struct mbuf *n) m = m->m_next; while (n) { if (!M_WRITABLE(m) || + (n->m_flags & M_NOMAP) != 0 || M_TRAILINGSPACE(m) < n->m_len) { /* just join the two chains */ m->m_next = n; @@ -805,6 +853,9 @@ m_pullup(struct mbuf *n, int len) int count; int space; + KASSERT((n->m_flags & M_NOMAP) == 0, + ("%s: unmapped mbuf %p", __func__, n)); + /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, @@ -923,7 +974,12 @@ m_split(struct mbuf *m0, int len0, int wait) return (NULL); n->m_next = m->m_next; m->m_next = NULL; - n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { + n->m_pkthdr.snd_tag = + m_snd_tag_ref(m0->m_pkthdr.snd_tag); + n->m_pkthdr.csum_flags |= CSUM_SND_TAG; + } else + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; return (n); @@ -931,7 +987,12 @@ m_split(struct mbuf *m0, int len0, int wait) n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); - n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { + n->m_pkthdr.snd_tag = + m_snd_tag_ref(m0->m_pkthdr.snd_tag); + n->m_pkthdr.csum_flags |= CSUM_SND_TAG; + } else + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; if (m->m_flags & M_EXT) @@ -1347,6 +1408,41 @@ nospace: return (NULL); } +/* + * Return the number of fragments an mbuf will use. This is usually + * used as a proxy for the number of scatter/gather elements needed by + * a DMA engine to access an mbuf. In general mapped mbufs are + * assumed to be backed by physically contiguous buffers that only + * need a single fragment. Unmapped mbufs, on the other hand, can + * span disjoint physical pages. + */ +static int +frags_per_mbuf(struct mbuf *m) +{ + struct mbuf_ext_pgs *ext_pgs; + int frags; + + if ((m->m_flags & M_NOMAP) == 0) + return (1); + + /* + * The header and trailer are counted as a single fragment + * each when present. + * + * XXX: This overestimates the number of fragments by assuming + * all the backing physical pages are disjoint. 
+ */ + ext_pgs = m->m_ext.ext_pgs; + frags = 0; + if (ext_pgs->hdr_len != 0) + frags++; + frags += ext_pgs->npgs; + if (ext_pgs->trail_len != 0) + frags++; + + return (frags); +} + /* * Defragment an mbuf chain, returning at most maxfrags separate * mbufs+clusters. If this is not possible NULL is returned and @@ -1367,7 +1463,7 @@ m_collapse(struct mbuf *m0, int how, int maxfrags) */ curfrags = 0; for (m = m0; m != NULL; m = m->m_next) - curfrags++; + curfrags += frags_per_mbuf(m); /* * First, try to collapse mbufs. Note that we always collapse * towards the front so we don't need to deal with moving the @@ -1382,12 +1478,13 @@ again: break; if (M_WRITABLE(m) && n->m_len < M_TRAILINGSPACE(m)) { - bcopy(mtod(n, void *), mtod(m, char *) + m->m_len, - n->m_len); + m_copydata(n, 0, n->m_len, + mtod(m, char *) + m->m_len); m->m_len += n->m_len; m->m_next = n->m_next; + curfrags -= frags_per_mbuf(n); m_free(n); - if (--curfrags <= maxfrags) + if (curfrags <= maxfrags) return m0; } else m = n; @@ -1404,15 +1501,18 @@ again: m = m_getcl(how, MT_DATA, 0); if (m == NULL) goto bad; - bcopy(mtod(n, void *), mtod(m, void *), n->m_len); - bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len, - n2->m_len); + m_copydata(n, 0, n->m_len, mtod(m, char *)); + m_copydata(n2, 0, n2->m_len, + mtod(m, char *) + n->m_len); m->m_len = n->m_len + n2->m_len; m->m_next = n2->m_next; *prev = m; + curfrags += 1; /* For the new cluster */ + curfrags -= frags_per_mbuf(n); + curfrags -= frags_per_mbuf(n2); m_free(n); m_free(n2); - if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */ + if (curfrags <= maxfrags) return m0; /* * Still not there, try the normal collapse @@ -1512,6 +1612,100 @@ nospace: #endif +#ifndef __rtems__ +/* + * Free pages from mbuf_ext_pgs, assuming they were allocated via + * vm_page_alloc() and aren't associated with any object. Complement + * to allocator from m_uiotombuf_nomap(). + */ +void +mb_free_mext_pgs(struct mbuf *m) +{ + struct mbuf_ext_pgs *ext_pgs; + vm_page_t pg; + + MBUF_EXT_PGS_ASSERT(m); + ext_pgs = m->m_ext.ext_pgs; + for (int i = 0; i < ext_pgs->npgs; i++) { + pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); + vm_page_unwire_noq(pg); + vm_page_free(pg); + } +} + +static struct mbuf * +m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags) +{ + struct mbuf *m, *mb, *prev; + struct mbuf_ext_pgs *pgs; + vm_page_t pg_array[MBUF_PEXT_MAX_PGS]; + int error, length, i, needed; + ssize_t total; + int pflags = malloc2vm_flags(how) | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | + VM_ALLOC_WIRED; + + /* + * len can be zero or an arbitrary large value bound by + * the total data supplied by the uio. 
+ */ + if (len > 0) + total = MIN(uio->uio_resid, len); + else + total = uio->uio_resid; + + if (maxseg == 0) + maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE; + + /* + * Allocate the pages + */ + m = NULL; + while (total > 0) { + mb = mb_alloc_ext_pgs(how, (flags & M_PKTHDR), + mb_free_mext_pgs); + if (mb == NULL) + goto failed; + if (m == NULL) + m = mb; + else + prev->m_next = mb; + prev = mb; + pgs = mb->m_ext.ext_pgs; + needed = length = MIN(maxseg, total); + for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) { +retry_page: + pg_array[i] = vm_page_alloc(NULL, 0, pflags); + if (pg_array[i] == NULL) { + if (how & M_NOWAIT) { + goto failed; + } else { + vm_wait(NULL); + goto retry_page; + } + } + pg_array[i]->flags &= ~PG_ZERO; + pgs->pa[i] = VM_PAGE_TO_PHYS(pg_array[i]); + pgs->npgs++; + } + pgs->last_pg_len = length - PAGE_SIZE * (pgs->npgs - 1); + MBUF_EXT_PGS_ASSERT_SANITY(pgs); + total -= length; + error = uiomove_fromphys(pg_array, 0, length, uio); + if (error != 0) + goto failed; + mb->m_len = length; + mb->m_ext.ext_size += PAGE_SIZE * pgs->npgs; + if (flags & M_PKTHDR) + m->m_pkthdr.len += length; + } + return (m); + +failed: + m_freem(m); + return (NULL); +} +#endif /* __rtems__ */ + /* * Copy the contents of uio into a properly sized mbuf chain. */ @@ -1523,6 +1717,11 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) ssize_t total; int progress = 0; +#ifndef __rtems__ + if (flags & M_NOMAP) + return (m_uiotombuf_nomap(uio, how, len, align, flags)); +#endif /* __rtems__ */ + /* * len can be zero or an arbitrary large value bound by * the total data supplied by the uio. @@ -1568,6 +1767,62 @@ m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) return (m); } +/* + * Copy data from an unmapped mbuf into a uio limited by len if set. + */ +int +m_unmappedtouio(const struct mbuf *m, int m_off, struct uio *uio, int len) +{ + struct mbuf_ext_pgs *ext_pgs; + vm_page_t pg; + int error, i, off, pglen, pgoff, seglen, segoff; + + MBUF_EXT_PGS_ASSERT(m); + ext_pgs = m->m_ext.ext_pgs; + error = 0; + + /* Skip over any data removed from the front. */ + off = mtod(m, vm_offset_t); + + off += m_off; + if (ext_pgs->hdr_len != 0) { + if (off >= ext_pgs->hdr_len) { + off -= ext_pgs->hdr_len; + } else { + seglen = ext_pgs->hdr_len - off; + segoff = off; + seglen = min(seglen, len); + off = 0; + len -= seglen; + error = uiomove(&ext_pgs->hdr[segoff], seglen, uio); + } + } + pgoff = ext_pgs->first_pg_off; + for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) { + pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff); + if (off >= pglen) { + off -= pglen; + pgoff = 0; + continue; + } + seglen = pglen - off; + segoff = pgoff + off; + off = 0; + seglen = min(seglen, len); + len -= seglen; + pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); + error = uiomove_fromphys(&pg, segoff, seglen, uio); + pgoff = 0; + }; + if (len != 0 && error == 0) { + KASSERT((off + len) <= ext_pgs->trail_len, + ("off + len > trail (%d + %d > %d, m_off = %d)", off, len, + ext_pgs->trail_len, m_off)); + error = uiomove(&ext_pgs->trail[off], len, uio); + } + return (error); +} + /* * Copy an mbuf chain into a uio limited by len if set. 
*/ @@ -1586,7 +1841,12 @@ m_mbuftouio(struct uio *uio, const struct mbuf *m, int len) for (; m != NULL; m = m->m_next) { length = min(m->m_len, total - progress); - error = uiomove(mtod(m, void *), length, uio); +#ifndef __rtems__ + if ((m->m_flags & M_NOMAP) != 0) + error = m_unmappedtouio(m, 0, uio, length); + else +#endif /* __rtems__ */ + error = uiomove(mtod(m, void *), length, uio); if (error) return (error); diff --git a/freebsd/sys/kern/uipc_mbuf2.c b/freebsd/sys/kern/uipc_mbuf2.c index 7dd2840c..6f98b0a2 100644 --- a/freebsd/sys/kern/uipc_mbuf2.c +++ b/freebsd/sys/kern/uipc_mbuf2.c @@ -218,7 +218,7 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) goto ok; } if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen - && writable) { + && writable && n->m_next->m_len >= tlen) { n->m_next->m_data -= hlen; n->m_next->m_len += hlen; bcopy(mtod(n, caddr_t) + off, mtod(n->m_next, caddr_t), hlen); diff --git a/freebsd/sys/kern/uipc_sockbuf.c b/freebsd/sys/kern/uipc_sockbuf.c index 0830206a..2305b333 100644 --- a/freebsd/sys/kern/uipc_sockbuf.c +++ b/freebsd/sys/kern/uipc_sockbuf.c @@ -36,11 +36,13 @@ #include __FBSDID("$FreeBSD$"); +#include #include #include #include /* for aio_swake proto */ #include +#include #include #include #include @@ -91,28 +93,135 @@ sbm_clrprotoflags(struct mbuf *m, int flags) } /* - * Mark ready "count" mbufs starting with "m". + * Compress M_NOTREADY mbufs after they have been readied by sbready(). + * + * sbcompress() skips M_NOTREADY mbufs since the data is not available to + * be copied at the time of sbcompress(). This function combines small + * mbufs similar to sbcompress() once mbufs are ready. 'm0' is the first + * mbuf sbready() marked ready, and 'end' is the first mbuf still not + * ready. + */ +static void +sbready_compress(struct sockbuf *sb, struct mbuf *m0, struct mbuf *end) +{ + struct mbuf *m, *n; + int ext_size; + + SOCKBUF_LOCK_ASSERT(sb); + + if ((sb->sb_flags & SB_NOCOALESCE) != 0) + return; + + for (m = m0; m != end; m = m->m_next) { + MPASS((m->m_flags & M_NOTREADY) == 0); + + /* Compress small unmapped mbufs into plain mbufs. */ + if ((m->m_flags & M_NOMAP) && m->m_len <= MLEN && + !mbuf_has_tls_session(m)) { + MPASS(m->m_flags & M_EXT); + ext_size = m->m_ext.ext_size; + if (mb_unmapped_compress(m) == 0) { + sb->sb_mbcnt -= ext_size; + sb->sb_ccnt -= 1; + } + } + + /* + * NB: In sbcompress(), 'n' is the last mbuf in the + * socket buffer and 'm' is the new mbuf being copied + * into the trailing space of 'n'. Here, the roles + * are reversed and 'n' is the next mbuf after 'm' + * that is being copied into the trailing space of + * 'm'. + */ + n = m->m_next; + while ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 && + M_WRITABLE(m) && + (m->m_flags & M_NOMAP) == 0 && + !mbuf_has_tls_session(n) && + !mbuf_has_tls_session(m) && + n->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ + n->m_len <= M_TRAILINGSPACE(m) && + m->m_type == n->m_type) { + KASSERT(sb->sb_lastrecord != n, + ("%s: merging start of record (%p) into previous mbuf (%p)", + __func__, n, m)); + m_copydata(n, 0, n->m_len, mtodo(m, m->m_len)); + m->m_len += n->m_len; + m->m_next = n->m_next; + m->m_flags |= n->m_flags & M_EOR; + if (sb->sb_mbtail == n) + sb->sb_mbtail = m; + + sb->sb_mbcnt -= MSIZE; + sb->sb_mcnt -= 1; + if (n->m_flags & M_EXT) { + sb->sb_mbcnt -= n->m_ext.ext_size; + sb->sb_ccnt -= 1; + } + m_free(n); + n = m->m_next; + } + } + SBLASTRECORDCHK(sb); + SBLASTMBUFCHK(sb); +} + +/* + * Mark ready "count" units of I/O starting with "m". 
Most mbufs + * count as a single unit of I/O except for EXT_PGS-backed mbufs which + * can be backed by multiple pages. */ int -sbready(struct sockbuf *sb, struct mbuf *m, int count) +sbready(struct sockbuf *sb, struct mbuf *m0, int count) { + struct mbuf *m; u_int blocker; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb)); + KASSERT(count > 0, ("%s: invalid count %d", __func__, count)); + m = m0; blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0; - for (int i = 0; i < count; i++, m = m->m_next) { + while (count > 0) { KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); +#ifndef __rtems__ + if ((m->m_flags & M_EXT) != 0 && + m->m_ext.ext_type == EXT_PGS) { + if (count < m->m_ext.ext_pgs->nrdy) { + m->m_ext.ext_pgs->nrdy -= count; + count = 0; + break; + } + count -= m->m_ext.ext_pgs->nrdy; + m->m_ext.ext_pgs->nrdy = 0; + } else +#endif /* __rtems__ */ + count--; + m->m_flags &= ~(M_NOTREADY | blocker); if (blocker) sb->sb_acc += m->m_len; + m = m->m_next; + } + + /* + * If the first mbuf is still not fully ready because only + * some of its backing pages were readied, no further progress + * can be made. + */ + if (m0 == m) { + MPASS(m->m_flags & M_NOTREADY); + return (EINPROGRESS); } - if (!blocker) + if (!blocker) { + sbready_compress(sb, m0, m); return (EINPROGRESS); + } /* This one was blocking all the queue. */ for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) { @@ -123,6 +232,7 @@ sbready(struct sockbuf *sb, struct mbuf *m, int count) } sb->sb_fnrdy = m; + sbready_compress(sb, m0, m); return (0); } @@ -571,6 +681,11 @@ sbdestroy(struct sockbuf *sb, struct socket *so) { sbrelease_internal(sb, so); +#ifdef KERN_TLS + if (sb->sb_tls_info != NULL) + ktls_free(sb->sb_tls_info); + sb->sb_tls_info = NULL; +#endif } /* @@ -734,6 +849,11 @@ sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags) SBLASTMBUFCHK(sb); +#ifdef KERN_TLS + if (sb->sb_tls_info != NULL) + ktls_seq(sb, m); +#endif + /* Remove all packet headers and mbuf tags to get a pure data chain. */ m_demote(m, 1, flags & PRUS_NOTREADY ? 
M_NOTREADY : 0); @@ -1036,12 +1156,13 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) M_WRITABLE(n) && ((sb->sb_flags & SB_NOCOALESCE) == 0) && !(m->m_flags & M_NOTREADY) && - !(n->m_flags & M_NOTREADY) && + !(n->m_flags & (M_NOTREADY | M_NOMAP)) && + !mbuf_has_tls_session(m) && + !mbuf_has_tls_session(n) && m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n) && n->m_type == m->m_type) { - bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, - (unsigned)m->m_len); + m_copydata(m, 0, m->m_len, mtodo(n, n->m_len)); n->m_len += m->m_len; sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) @@ -1052,6 +1173,10 @@ sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) m = m_free(m); continue; } + if (m->m_len <= MLEN && (m->m_flags & M_NOMAP) && + (m->m_flags & M_NOTREADY) == 0 && + !mbuf_has_tls_session(m)) + (void)mb_unmapped_compress(m); if (n) n->m_next = m; else diff --git a/freebsd/sys/kern/uipc_socket.c b/freebsd/sys/kern/uipc_socket.c index 380c97dd..c01535c4 100644 --- a/freebsd/sys/kern/uipc_socket.c +++ b/freebsd/sys/kern/uipc_socket.c @@ -109,6 +109,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -125,6 +126,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -143,6 +145,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include @@ -911,6 +914,8 @@ solisten_wakeup(struct socket *sol) } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); + if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) + pgsigio(&sol->so_sigio, SIGIO, 0); } /* @@ -1067,7 +1072,7 @@ sofree(struct socket *so) * * We used to do a lot of socket buffer and socket locking here, as * well as invoke sorflush() and perform wakeups. The direct call to - * dom_dispose() and sbrelease_internal() are an inlining of what was + * dom_dispose() and sbdestroy() are an inlining of what was * necessary from sorflush(). 
* * Notice that the socket buffer and kqueue state are torn down @@ -1154,9 +1159,9 @@ drop: so->so_state |= SS_NOFDREF; sorele(so); if (listening) { - struct socket *sp; + struct socket *sp, *tsp; - TAILQ_FOREACH(sp, &lqueue, so_list) { + TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) { SOCK_LOCK(sp); if (sp->so_count == 0) { SOCK_UNLOCK(sp); @@ -1197,7 +1202,6 @@ soabort(struct socket *so) KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); - KASSERT(so->so_qstate == SQ_NONE, ("soabort: !SQ_NONE")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) @@ -1468,7 +1472,15 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; - + int pru_flag; +#ifdef KERN_TLS + struct ktls_session *tls; + int tls_enq_cnt, tls_pruflag; + uint8_t tls_rtype; + + tls = NULL; + tls_rtype = TLS_RLTYPE_APP; +#endif if (uio != NULL) resid = uio->uio_resid; else @@ -1502,6 +1514,28 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, if (error) goto out; +#ifdef KERN_TLS + tls_pruflag = 0; + tls = ktls_hold(so->so_snd.sb_tls_info); + if (tls != NULL) { + if (tls->sw_encrypt != NULL) + tls_pruflag = PRUS_NOTREADY; + + if (control != NULL) { + struct cmsghdr *cm = mtod(control, struct cmsghdr *); + + if (clen >= sizeof(*cm) && + cm->cmsg_type == TLS_SET_RECORD_TYPE) { + tls_rtype = *((uint8_t *)CMSG_DATA(cm)); + clen = 0; + m_freem(control); + control = NULL; + atomic = 1; + } + } + } +#endif + restart: do { SOCKBUF_LOCK(&so->so_snd); @@ -1551,7 +1585,8 @@ restart: } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { - if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { + if ((so->so_state & SS_NBIO) || + (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; @@ -1578,10 +1613,27 @@ restart: * is a workaround to prevent protocol send * methods to panic. */ - top = m_uiotombuf(uio, M_WAITOK, space, - (atomic ? max_hdr : 0), - (atomic ? M_PKTHDR : 0) | - ((flags & MSG_EOR) ? M_EOR : 0)); +#ifdef KERN_TLS + if (tls != NULL) { + top = m_uiotombuf(uio, M_WAITOK, space, + tls->params.max_frame_len, + M_NOMAP | + ((flags & MSG_EOR) ? M_EOR : 0)); + if (top != NULL) { + error = ktls_frame(top, tls, + &tls_enq_cnt, tls_rtype); + if (error) { + m_freem(top); + goto release; + } + } + tls_rtype = TLS_RLTYPE_APP; + } else +#endif + top = m_uiotombuf(uio, M_WAITOK, space, + (atomic ? max_hdr : 0), + (atomic ? M_PKTHDR : 0) | + ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; @@ -1605,8 +1657,8 @@ restart: * this. */ VNET_SO_ASSERT(so); - error = (*so->so_proto->pr_usrreqs->pru_send)(so, - (flags & MSG_OOB) ? PRUS_OOB : + + pru_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use @@ -1618,13 +1670,37 @@ restart: PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || - (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, - top, addr, control, td); + (resid > 0 && space > 0) ? 
PRUS_MORETOCOME : 0; + +#ifdef KERN_TLS + pru_flag |= tls_pruflag; +#endif + + error = (*so->so_proto->pr_usrreqs->pru_send)(so, + pru_flag, top, addr, control, td); + if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } + +#ifdef KERN_TLS + if (tls != NULL && tls->sw_encrypt != NULL) { + /* + * Note that error is intentionally + * ignored. + * + * Like sendfile(), we rely on the + * completion routine (pru_ready()) + * to free the mbufs in the event that + * pru_send() encountered an error and + * did not append them to the sockbuf. + */ + soref(so); + ktls_enqueue(top, so, tls_enq_cnt); + } +#endif clen = 0; control = NULL; top = NULL; @@ -1636,6 +1712,10 @@ restart: release: sbunlock(&so->so_snd); out: +#ifdef KERN_TLS + if (tls != NULL) + ktls_free(tls); +#endif if (top != NULL) m_freem(top); if (control != NULL) @@ -2011,7 +2091,13 @@ dontblock: SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); - error = uiomove(mtod(m, char *) + moff, (int)len, uio); +#ifndef __rtems__ + if ((m->m_flags & M_NOMAP) != 0) + error = m_unmappedtouio(m, moff, uio, (int)len); + else +#endif /* __rtems__ */ + error = uiomove(mtod(m, char *) + moff, + (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* @@ -2225,7 +2311,7 @@ soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, /* Prevent other readers from entering the socket. */ error = sblock(sb, SBLOCKWAIT(flags)); if (error) - goto out; + return (error); SOCKBUF_LOCK(sb); /* Easy one, no space to copyout anything. */ @@ -2793,12 +2879,10 @@ sosetopt(struct socket *so, struct sockopt *sopt) CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { - if (so->so_proto->pr_ctloutput != NULL) { + if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); - CURVNET_RESTORE(); - return (error); - } - error = ENOPROTOOPT; + else + error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: @@ -2811,7 +2895,12 @@ sosetopt(struct socket *so, struct sockopt *sopt) error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; - + if (l.l_linger < 0 || + l.l_linger > USHRT_MAX || + l.l_linger > (INT_MAX / hz)) { + error = EDOM; + goto bad; + } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) @@ -4162,6 +4251,9 @@ void so_linger_set(struct socket *so, int val) { + KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), + ("%s: val %d out of range", __func__, val)); + so->so_linger = val; } diff --git a/freebsd/sys/kern/uipc_syscalls.c b/freebsd/sys/kern/uipc_syscalls.c index 529268a9..39e96abe 100644 --- a/freebsd/sys/kern/uipc_syscalls.c +++ b/freebsd/sys/kern/uipc_syscalls.c @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -451,7 +452,8 @@ accept1(td, s, uname, anamelen, flags) if (error == 0 && uname != NULL) { #ifdef COMPAT_OLDSOCK - if (flags & ACCEPT4_COMPAT) + if (SV_PROC_FLAG(td->td_proc, SV_AOUT) && + (flags & ACCEPT4_COMPAT) != 0) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif @@ -968,7 +970,8 @@ sendit(struct thread *td, int s, struct msghdr *mp, int flags) if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK - && mp->msg_flags != MSG_COMPAT + && (mp->msg_flags != MSG_COMPAT || + !SV_PROC_FLAG(td->td_proc, SV_AOUT)) #endif ) { error = EINVAL; @@ -979,7 +982,8 @@ sendit(struct thread *td, int s, struct msghdr *mp, int flags) if (error != 0) goto bad; #ifdef 
COMPAT_OLDSOCK - if (mp->msg_flags == MSG_COMPAT) { + if (mp->msg_flags == MSG_COMPAT && + SV_PROC_FLAG(td->td_proc, SV_AOUT)) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); @@ -1120,7 +1124,8 @@ sys_sendto(struct thread *td, struct sendto_args *uap) msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK - msg.msg_flags = 0; + if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) + msg.msg_flags = 0; #endif aiov.iov_base = __DECONST(void *, uap->buf); aiov.iov_len = uap->len; @@ -1239,7 +1244,8 @@ sys_sendmsg(struct thread *td, struct sendmsg_args *uap) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK - msg.msg_flags = 0; + if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) + msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); @@ -1356,7 +1362,8 @@ kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg, /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK - if (mp->msg_flags & MSG_COMPAT) + if ((mp->msg_flags & MSG_COMPAT) != 0 && + SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif @@ -1379,7 +1386,8 @@ kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg, * If we receive rights, trim the cmsghdr; anything else * is tossed. */ - if (control && mp->msg_flags & MSG_COMPAT) { + if (control && (mp->msg_flags & MSG_COMPAT) != 0 && + SV_PROC_FLAG(td->td_proc, SV_AOUT)) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != @@ -1438,7 +1446,8 @@ recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp) if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK - if (mp->msg_flags & MSG_COMPAT) + if ((mp->msg_flags & MSG_COMPAT) != 0 && + SV_PROC_FLAG(td->td_proc, SV_AOUT)) error = 0; /* old recvfrom didn't check */ #endif } @@ -1581,7 +1590,8 @@ sys_recvmsg(struct thread *td, struct recvmsg_args *uap) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK - msg.msg_flags &= ~MSG_COMPAT; + if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) + msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; @@ -1863,7 +1873,7 @@ getsockname1(struct thread *td, struct getsockname_args *uap, int compat) if (len != 0) { #ifdef COMPAT_OLDSOCK - if (compat) + if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); @@ -1978,7 +1988,7 @@ getpeername1(struct thread *td, struct getpeername_args *uap, int compat) if (len != 0) { #ifdef COMPAT_OLDSOCK - if (compat) + if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); @@ -2083,7 +2093,8 @@ sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type) if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK - if (type == MT_SONAME && buflen <= 112) + if (type == MT_SONAME && buflen <= 112 && + SV_CURPROC_FLAG(SV_AOUT)) buflen = MLEN; /* unix domain compat. 
hack */ else #endif @@ -2101,7 +2112,8 @@ sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type) sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN - if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + if (sa->sa_family == 0 && sa->sa_len < AF_MAX && + SV_CURPROC_FLAG(SV_AOUT)) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; @@ -2129,7 +2141,8 @@ getsockaddr(struct sockaddr **namp, const struct sockaddr *uaddr, size_t len) free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN - if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + if (sa->sa_family == 0 && sa->sa_len < AF_MAX && + SV_CURPROC_FLAG(SV_AOUT)) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; @@ -2180,8 +2193,10 @@ m_dispose_extcontrolm(struct mbuf *m) fd = *fds++; error = fget(td, fd, &cap_no_rights, &fp); - if (error == 0) + if (error == 0) { fdclose(td, fp, fd); + fdrop(fp, td); + } } } clen -= datalen; diff --git a/freebsd/sys/kern/uipc_usrreq.c b/freebsd/sys/kern/uipc_usrreq.c index 6b34dcb8..39f28b4b 100644 --- a/freebsd/sys/kern/uipc_usrreq.c +++ b/freebsd/sys/kern/uipc_usrreq.c @@ -1032,7 +1032,7 @@ uipc_listen(struct socket *so, int backlog, struct thread *td) SOCK_LOCK(so); error = solisten_proto_check(so); if (error == 0) { - cru2x(td->td_ucred, &unp->unp_peercred); + cru2xt(td, &unp->unp_peercred); solisten_proto(so, backlog); } SOCK_UNLOCK(so); @@ -1837,7 +1837,7 @@ void unp_copy_peercred(struct thread *td, struct unpcb *client_unp, struct unpcb *server_unp, struct unpcb *listen_unp) { - cru2x(td->td_ucred, &client_unp->unp_peercred); + cru2xt(td, &client_unp->unp_peercred); client_unp->unp_flags |= UNP_HAVEPC; memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred, @@ -2306,30 +2306,53 @@ unp_init(void) } #ifndef __rtems__ +static void +unp_internalize_cleanup_rights(struct mbuf *control) +{ + struct cmsghdr *cp; + struct mbuf *m; + void *data; + socklen_t datalen; + + for (m = control; m != NULL; m = m->m_next) { + cp = mtod(m, struct cmsghdr *); + if (cp->cmsg_level != SOL_SOCKET || + cp->cmsg_type != SCM_RIGHTS) + continue; + data = CMSG_DATA(cp); + datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data; + unp_freerights(data, datalen / sizeof(struct filedesc *)); + } +} + static int unp_internalize(struct mbuf **controlp, struct thread *td) { - struct mbuf *control = *controlp; - struct proc *p = td->td_proc; - struct filedesc *fdesc = p->p_fd; + struct mbuf *control, **initial_controlp; + struct proc *p; + struct filedesc *fdesc; struct bintime *bt; - struct cmsghdr *cm = mtod(control, struct cmsghdr *); + struct cmsghdr *cm; struct cmsgcred *cmcred; struct filedescent *fde, **fdep, *fdev; struct file *fp; struct timeval *tv; struct timespec *ts; - int i, *fdp; void *data; - socklen_t clen = control->m_len, datalen; - int error, oldfds; + socklen_t clen, datalen; + int i, j, error, *fdp, oldfds; u_int newlen; UNP_LINK_UNLOCK_ASSERT(); + p = td->td_proc; + fdesc = p->p_fd; error = 0; + control = *controlp; + clen = control->m_len; *controlp = NULL; - while (cm != NULL) { + initial_controlp = controlp; + for (cm = mtod(control, struct cmsghdr *); cm != NULL;) { if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len > clen || cm->cmsg_len < sizeof(*cm)) { error = EINVAL; @@ -2400,6 +2423,19 @@ unp_internalize(struct mbuf **controlp, struct thread *td) goto out; } fdp = data; + for (i = 0; i < oldfds; i++, fdp++) { + if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) { + fdp = data; + for (j = 0; j < i; j++, fdp++) { 
+ fdrop(fdesc->fd_ofiles[*fdp]. + fde_file, td); + } + FILEDESC_SUNLOCK(fdesc); + error = EBADF; + goto out; + } + } + fdp = data; fdep = (struct filedescent **) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS, @@ -2480,6 +2516,8 @@ unp_internalize(struct mbuf **controlp, struct thread *td) } out: + if (error != 0 && initial_controlp != NULL) + unp_internalize_cleanup_rights(*initial_controlp); m_freem(control); return (error); } @@ -2601,7 +2639,6 @@ unp_internalize_fp(struct file *fp) unp->unp_file = fp; unp->unp_msgcount++; } - fhold(fp); unp_rights++; UNP_LINK_WUNLOCK(); } @@ -2762,10 +2799,10 @@ unp_gc(__unused void *arg, int pending) if ((unp->unp_gcflag & UNPGC_DEAD) != 0) { f = unp->unp_file; if (unp->unp_msgcount == 0 || f == NULL || - f->f_count != unp->unp_msgcount) + f->f_count != unp->unp_msgcount || + !fhold(f)) continue; unref[total++] = f; - fhold(f); KASSERT(total <= unp_unreachable, ("unp_gc: incorrect unreachable count.")); } @@ -2942,8 +2979,8 @@ db_print_xucred(int indent, struct xucred *xu) int comma, i; db_print_indent(indent); - db_printf("cr_version: %u cr_uid: %u cr_ngroups: %d\n", - xu->cr_version, xu->cr_uid, xu->cr_ngroups); + db_printf("cr_version: %u cr_uid: %u cr_pid: %d cr_ngroups: %d\n", + xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups); db_print_indent(indent); db_printf("cr_groups: "); comma = 0; -- cgit v1.2.3
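One of the recurring themes in the uipc_mbuf.c and uipc_sockbuf.c hunks above is the new unmapped (M_NOMAP / EXT_PGS) mbuf handling, including the change to m_collapse() to count fragments per mbuf instead of mbufs, because a page-backed mbuf can span several disjoint physical pages. Below is a minimal, self-contained userland sketch of that fragment accounting, assuming simplified stand-in types (`struct ext_pages_model`, `struct mbuf_model`) that are not the kernel's real `struct mbuf` or `struct mbuf_ext_pgs`; it only mirrors the counting rule visible in the diff (one fragment for a mapped mbuf; header + pages + trailer for an unmapped one).

/*
 * Hypothetical userland model of the frags_per_mbuf() accounting from
 * the uipc_mbuf.c hunk above.  The struct names are stand-ins for
 * illustration only, not the kernel's real mbuf types.
 */
#include <stdbool.h>
#include <stdio.h>

struct ext_pages_model {
	int hdr_len;	/* bytes in the inline header buffer, 0 if none */
	int npgs;	/* number of backing pages */
	int trail_len;	/* bytes in the inline trailer buffer, 0 if none */
};

struct mbuf_model {
	bool unmapped;			/* stands in for (m_flags & M_NOMAP) */
	struct ext_pages_model pgs;	/* only meaningful when unmapped */
	struct mbuf_model *next;	/* stands in for m_next */
};

/*
 * Counting rule: a mapped mbuf is assumed to be one physically
 * contiguous fragment; an unmapped mbuf contributes its header,
 * each backing page, and its trailer, assuming every page is
 * physically disjoint (an overestimate, as the diff's comment notes).
 */
static int
frags_per_mbuf_model(const struct mbuf_model *m)
{
	int frags;

	if (!m->unmapped)
		return (1);
	frags = m->pgs.npgs;
	if (m->pgs.hdr_len != 0)
		frags++;
	if (m->pgs.trail_len != 0)
		frags++;
	return (frags);
}

int
main(void)
{
	struct mbuf_model pages = {
		.unmapped = true,
		.pgs = { .hdr_len = 16, .npgs = 3, .trail_len = 0 },
	};
	struct mbuf_model plain = { .unmapped = false, .next = NULL };
	int curfrags = 0;

	pages.next = &plain;
	/* Same loop shape as m_collapse(): sum fragments over the chain. */
	for (struct mbuf_model *m = &pages; m != NULL; m = m->next)
		curfrags += frags_per_mbuf_model(m);
	printf("estimated DMA fragments: %d\n", curfrags);	/* prints 5 */
	return (0);
}

The point of the per-fragment accounting, as the upstream comment explains, is that curfrags approximates the number of scatter/gather elements a DMA engine would need; counting whole mbufs would let m_collapse() report success while the chain still exceeds the driver's maxfrags limit once multi-page unmapped mbufs are involved.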