diff options
author | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2019-09-24 11:05:03 +0200 |
---|---|---|
committer | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2019-11-13 10:47:04 +0100 |
commit | a5ddb0ea69f21c16b7697a935d7a0c16bb3cffcf (patch) | |
tree | db091fb0f7d091804482156c9f3f55879ac93d5b /freebsd/sys/net | |
parent | test/syscalls01: Fix sporadic test failures (diff) | |
download | rtems-libbsd-a5ddb0ea69f21c16b7697a935d7a0c16bb3cffcf.tar.bz2 |
Update to FreeBSD head 2019-09-24
Git mirror commit 6b0307a0a5184339393f555d5d424190d8a8277a.
Diffstat (limited to 'freebsd/sys/net')
52 files changed, 4664 insertions, 3642 deletions
diff --git a/freebsd/sys/net/altq/altq_cbq.c b/freebsd/sys/net/altq/altq_cbq.c index 015e35bf..7c99f8a8 100644 --- a/freebsd/sys/net/altq/altq_cbq.c +++ b/freebsd/sys/net/altq/altq_cbq.c @@ -225,12 +225,11 @@ cbq_pfattach(struct pf_altq *a) } int -cbq_add_altq(struct pf_altq *a) +cbq_add_altq(struct ifnet *ifp, struct pf_altq *a) { cbq_state_t *cbqp; - struct ifnet *ifp; - if ((ifp = ifunit(a->ifname)) == NULL) + if (ifp == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); diff --git a/freebsd/sys/net/altq/altq_codel.c b/freebsd/sys/net/altq/altq_codel.c index 4a55cdbe..375fc382 100644 --- a/freebsd/sys/net/altq/altq_codel.c +++ b/freebsd/sys/net/altq/altq_codel.c @@ -91,13 +91,12 @@ codel_pfattach(struct pf_altq *a) } int -codel_add_altq(struct pf_altq *a) +codel_add_altq(struct ifnet *ifp, struct pf_altq *a) { struct codel_if *cif; - struct ifnet *ifp; struct codel_opts *opts; - if ((ifp = ifunit(a->ifname)) == NULL) + if (ifp == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); diff --git a/freebsd/sys/net/altq/altq_fairq.c b/freebsd/sys/net/altq/altq_fairq.c index a1bc3fdb..5b7646e2 100644 --- a/freebsd/sys/net/altq/altq_fairq.c +++ b/freebsd/sys/net/altq/altq_fairq.c @@ -150,12 +150,11 @@ fairq_pfattach(struct pf_altq *a) } int -fairq_add_altq(struct pf_altq *a) +fairq_add_altq(struct ifnet *ifp, struct pf_altq *a) { struct fairq_if *pif; - struct ifnet *ifp; - if ((ifp = ifunit(a->ifname)) == NULL) + if (ifp == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); diff --git a/freebsd/sys/net/altq/altq_hfsc.c b/freebsd/sys/net/altq/altq_hfsc.c index 202915a8..024055e3 100644 --- a/freebsd/sys/net/altq/altq_hfsc.c +++ b/freebsd/sys/net/altq/altq_hfsc.c @@ -161,12 +161,11 @@ hfsc_pfattach(struct pf_altq *a) } int -hfsc_add_altq(struct pf_altq *a) +hfsc_add_altq(struct ifnet *ifp, struct pf_altq *a) { struct hfsc_if *hif; - struct ifnet *ifp; - if ((ifp = ifunit(a->ifname)) == NULL) + if (ifp == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); @@ -508,6 +507,7 @@ hfsc_class_create(struct hfsc_if *hif, struct service_curve *rsc, goto err_ret; } } + cl->cl_slot = i; if (flags & HFCF_DEFAULTCLASS) hif->hif_defaultclass = cl; @@ -560,7 +560,7 @@ hfsc_class_create(struct hfsc_if *hif, struct service_curve *rsc, static int hfsc_class_destroy(struct hfsc_class *cl) { - int i, s; + int s; if (cl == NULL) return (0); @@ -591,12 +591,7 @@ hfsc_class_destroy(struct hfsc_class *cl) ASSERT(p != NULL); } - for (i = 0; i < HFSC_MAX_CLASSES; i++) - if (cl->cl_hif->hif_class_tbl[i] == cl) { - cl->cl_hif->hif_class_tbl[i] = NULL; - break; - } - + cl->cl_hif->hif_class_tbl[cl->cl_slot] = NULL; cl->cl_hif->hif_classes--; IFQ_UNLOCK(cl->cl_hif->hif_ifq); splx(s); diff --git a/freebsd/sys/net/altq/altq_hfsc.h b/freebsd/sys/net/altq/altq_hfsc.h index fa4aa811..c43c6671 100644 --- a/freebsd/sys/net/altq/altq_hfsc.h +++ b/freebsd/sys/net/altq/altq_hfsc.h @@ -214,6 +214,7 @@ struct runtime_sc { struct hfsc_class { u_int cl_id; /* class id (just for debug) */ + u_int cl_slot; /* slot in hif class table */ u_int32_t cl_handle; /* class handle */ struct hfsc_if *cl_hif; /* back pointer to struct hfsc_if */ int cl_flags; /* misc flags */ diff --git a/freebsd/sys/net/altq/altq_priq.c b/freebsd/sys/net/altq/altq_priq.c index 5e77aef2..0090d8fa 100644 --- a/freebsd/sys/net/altq/altq_priq.c +++ b/freebsd/sys/net/altq/altq_priq.c @@ -97,12 +97,11 @@ priq_pfattach(struct pf_altq *a) } int -priq_add_altq(struct pf_altq *a) +priq_add_altq(struct ifnet * ifp, struct pf_altq *a) { struct priq_if *pif; - struct ifnet *ifp; - if ((ifp = ifunit(a->ifname)) == NULL) + if (ifp == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); diff --git a/freebsd/sys/net/altq/altq_subr.c b/freebsd/sys/net/altq/altq_subr.c index 61aaec59..151bdf10 100644 --- a/freebsd/sys/net/altq/altq_subr.c +++ b/freebsd/sys/net/altq/altq_subr.c @@ -412,11 +412,11 @@ tbr_timeout(arg) { VNET_ITERATOR_DECL(vnet_iter); struct ifnet *ifp; - int active, s; + struct epoch_tracker et; + int active; active = 0; - s = splnet(); - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); @@ -433,8 +433,7 @@ tbr_timeout(arg) CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); - IFNET_RUNLOCK_NOSLEEP(); - splx(s); + NET_EPOCH_EXIT(et); if (active > 0) CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); else @@ -523,7 +522,7 @@ altq_pfdetach(struct pf_altq *a) * malloc with WAITOK, also it is not yet clear which lock to use. */ int -altq_add(struct pf_altq *a) +altq_add(struct ifnet *ifp, struct pf_altq *a) { int error = 0; @@ -538,27 +537,27 @@ altq_add(struct pf_altq *a) switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: - error = cbq_add_altq(a); + error = cbq_add_altq(ifp, a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: - error = priq_add_altq(a); + error = priq_add_altq(ifp, a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: - error = hfsc_add_altq(a); + error = hfsc_add_altq(ifp, a); break; #endif #ifdef ALTQ_FAIRQ case ALTQT_FAIRQ: - error = fairq_add_altq(a); + error = fairq_add_altq(ifp, a); break; #endif #ifdef ALTQ_CODEL case ALTQT_CODEL: - error = codel_add_altq(a); + error = codel_add_altq(ifp, a); break; #endif default: diff --git a/freebsd/sys/net/altq/altq_var.h b/freebsd/sys/net/altq/altq_var.h index 47326a03..f711e093 100644 --- a/freebsd/sys/net/altq/altq_var.h +++ b/freebsd/sys/net/altq/altq_var.h @@ -199,40 +199,40 @@ int tbr_set(struct ifaltq *, struct tb_profile *); int altq_pfattach(struct pf_altq *); int altq_pfdetach(struct pf_altq *); -int altq_add(struct pf_altq *); +int altq_add(struct ifnet *, struct pf_altq *); int altq_remove(struct pf_altq *); int altq_add_queue(struct pf_altq *); int altq_remove_queue(struct pf_altq *); int altq_getqstats(struct pf_altq *, void *, int *, int); int cbq_pfattach(struct pf_altq *); -int cbq_add_altq(struct pf_altq *); +int cbq_add_altq(struct ifnet *, struct pf_altq *); int cbq_remove_altq(struct pf_altq *); int cbq_add_queue(struct pf_altq *); int cbq_remove_queue(struct pf_altq *); int cbq_getqstats(struct pf_altq *, void *, int *, int); int codel_pfattach(struct pf_altq *); -int codel_add_altq(struct pf_altq *); +int codel_add_altq(struct ifnet *, struct pf_altq *); int codel_remove_altq(struct pf_altq *); int codel_getqstats(struct pf_altq *, void *, int *, int); int priq_pfattach(struct pf_altq *); -int priq_add_altq(struct pf_altq *); +int priq_add_altq(struct ifnet *, struct pf_altq *); int priq_remove_altq(struct pf_altq *); int priq_add_queue(struct pf_altq *); int priq_remove_queue(struct pf_altq *); int priq_getqstats(struct pf_altq *, void *, int *, int); int hfsc_pfattach(struct pf_altq *); -int hfsc_add_altq(struct pf_altq *); +int hfsc_add_altq(struct ifnet *, struct pf_altq *); int hfsc_remove_altq(struct pf_altq *); int hfsc_add_queue(struct pf_altq *); int hfsc_remove_queue(struct pf_altq *); int hfsc_getqstats(struct pf_altq *, void *, int *, int); int fairq_pfattach(struct pf_altq *); -int fairq_add_altq(struct pf_altq *); +int fairq_add_altq(struct ifnet *, struct pf_altq *); int fairq_remove_altq(struct pf_altq *); int fairq_add_queue(struct pf_altq *); int fairq_remove_queue(struct pf_altq *); diff --git a/freebsd/sys/net/bpf.c b/freebsd/sys/net/bpf.c index edee632b..101ac4e0 100644 --- a/freebsd/sys/net/bpf.c +++ b/freebsd/sys/net/bpf.c @@ -5,6 +5,7 @@ * * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. + * Copyright (c) 2019 Andrey V. Elsukov <ae@FreeBSD.org> * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed @@ -45,16 +46,16 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_ddb.h> #include <rtems/bsd/local/opt_netgraph.h> -#include <sys/types.h> #include <sys/param.h> -#include <sys/lock.h> -#include <sys/rwlock.h> -#include <sys/systm.h> #include <sys/conf.h> +#include <sys/eventhandler.h> #include <sys/fcntl.h> #include <sys/jail.h> +#include <sys/ktr.h> +#include <sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> +#include <sys/mutex.h> #include <sys/time.h> #include <sys/priv.h> #include <sys/proc.h> @@ -64,6 +65,7 @@ __FBSDID("$FreeBSD$"); #include <sys/ttycom.h> #include <sys/uio.h> #include <sys/sysent.h> +#include <sys/systm.h> #include <sys/event.h> #include <sys/file.h> @@ -99,14 +101,16 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> #ifdef __rtems__ #include <rtems/imfs.h> +#undef devfs_get_cdevpriv #define devfs_get_cdevpriv(x) 0 +#undef devtoname #define devtoname(x) "bpf" #endif /* __rtems__ */ MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); static struct bpf_if_ext dead_bpf_if = { - .bif_dlist = LIST_HEAD_INITIALIZER() + .bif_dlist = CK_LIST_HEAD_INITIALIZER() }; struct bpf_if { @@ -115,19 +119,22 @@ struct bpf_if { struct bpf_if_ext bif_ext; /* public members */ u_int bif_dlt; /* link layer type */ u_int bif_hdrlen; /* length of link header */ + struct bpfd_list bif_wlist; /* writer-only list */ struct ifnet *bif_ifp; /* corresponding interface */ - struct rwlock bif_lock; /* interface lock */ - LIST_HEAD(, bpf_d) bif_wlist; /* writer-only list */ - int bif_flags; /* Interface flags */ struct bpf_if **bif_bpf; /* Pointer to pointer to us */ + volatile u_int bif_refcnt; + struct epoch_context epoch_ctx; }; CTASSERT(offsetof(struct bpf_if, bif_ext) == 0); -#define BPFIF_RLOCK(bif) rw_rlock(&(bif)->bif_lock) -#define BPFIF_RUNLOCK(bif) rw_runlock(&(bif)->bif_lock) -#define BPFIF_WLOCK(bif) rw_wlock(&(bif)->bif_lock) -#define BPFIF_WUNLOCK(bif) rw_wunlock(&(bif)->bif_lock) +struct bpf_program_buffer { + struct epoch_context epoch_ctx; +#ifdef BPF_JITTER + bpf_jit_filter *func; +#endif + void *buffer[0]; +}; #if defined(DEV_BPF) || defined(NETGRAPH_BPF) @@ -187,18 +194,24 @@ struct bpf_dltlist32 { #define BPF_LOCK_ASSERT() sx_assert(&bpf_sx, SA_XLOCKED) /* * bpf_iflist is a list of BPF interface structures, each corresponding to a - * specific DLT. The same network interface might have several BPF interface + * specific DLT. The same network interface might have several BPF interface * structures registered by different layers in the stack (i.e., 802.11 * frames, ethernet frames, etc). */ -static LIST_HEAD(, bpf_if) bpf_iflist, bpf_freelist; +CK_LIST_HEAD(bpf_iflist, bpf_if); +static struct bpf_iflist bpf_iflist; static struct sx bpf_sx; /* bpf global lock */ static int bpf_bpfd_cnt; +static void bpfif_ref(struct bpf_if *); +static void bpfif_rele(struct bpf_if *); + +static void bpfd_ref(struct bpf_d *); +static void bpfd_rele(struct bpf_d *); static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); -static void bpf_detachd_locked(struct bpf_d *); -static void bpf_freed(struct bpf_d *); +static void bpf_detachd_locked(struct bpf_d *, bool); +static void bpfd_free(epoch_context_t); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_d *); static int bpf_setif(struct bpf_d *, struct ifreq *); @@ -261,37 +274,106 @@ static struct filterops bpfread_filtops = { .f_event = filt_bpfread, }; -eventhandler_tag bpf_ifdetach_cookie = NULL; - /* - * LOCKING MODEL USED BY BPF: + * LOCKING MODEL USED BY BPF + * * Locks: - * 1) global lock (BPF_LOCK). Mutex, used to protect interface addition/removal, - * some global counters and every bpf_if reference. - * 2) Interface lock. Rwlock, used to protect list of BPF descriptors and their filters. - * 3) Descriptor lock. Mutex, used to protect BPF buffers and various structure fields - * used by bpf_mtap code. + * 1) global lock (BPF_LOCK). Sx, used to protect some global counters, + * every bpf_iflist changes, serializes ioctl access to bpf descriptors. + * 2) Descriptor lock. Mutex, used to protect BPF buffers and various + * structure fields used by bpf_*tap* code. + * + * Lock order: global lock, then descriptor lock. * - * Lock order: + * There are several possible consumers: * - * Global lock, interface lock, descriptor lock + * 1. The kernel registers interface pointer with bpfattach(). + * Each call allocates new bpf_if structure, references ifnet pointer + * and links bpf_if into bpf_iflist chain. This is protected with global + * lock. * - * We have to acquire interface lock before descriptor main lock due to BPF_MTAP[2] - * working model. In many places (like bpf_detachd) we start with BPF descriptor - * (and we need to at least rlock it to get reliable interface pointer). This - * gives us potential LOR. As a result, we use global lock to protect from bpf_if - * change in every such place. + * 2. An userland application uses ioctl() call to bpf_d descriptor. + * All such call are serialized with global lock. BPF filters can be + * changed, but pointer to old filter will be freed using epoch_call(). + * Thus it should be safe for bpf_tap/bpf_mtap* code to do access to + * filter pointers, even if change will happen during bpf_tap execution. + * Destroying of bpf_d descriptor also is doing using epoch_call(). * - * Changing d->bd_bif is protected by 1) global lock, 2) interface lock and - * 3) descriptor main wlock. - * Reading bd_bif can be protected by any of these locks, typically global lock. + * 3. An userland application can write packets into bpf_d descriptor. + * There we need to be sure, that ifnet won't disappear during bpfwrite(). * - * Changing read/write BPF filter is protected by the same three locks, - * the same applies for reading. + * 4. The kernel invokes bpf_tap/bpf_mtap* functions. The access to + * bif_dlist is protected with net_epoch_preempt section. So, it should + * be safe to make access to bpf_d descriptor inside the section. * - * Sleeping in global lock is not allowed due to bpfdetach() using it. + * 5. The kernel invokes bpfdetach() on interface destroying. All lists + * are modified with global lock held and actual free() is done using + * epoch_call(). */ +static void +bpfif_free(epoch_context_t ctx) +{ + struct bpf_if *bp; + + bp = __containerof(ctx, struct bpf_if, epoch_ctx); + if_rele(bp->bif_ifp); + free(bp, M_BPF); +} + +static void +bpfif_ref(struct bpf_if *bp) +{ + + refcount_acquire(&bp->bif_refcnt); +} + +static void +bpfif_rele(struct bpf_if *bp) +{ + + if (!refcount_release(&bp->bif_refcnt)) + return; + epoch_call(net_epoch_preempt, &bp->epoch_ctx, bpfif_free); +} + +static void +bpfd_ref(struct bpf_d *d) +{ + + refcount_acquire(&d->bd_refcnt); +} + +static void +bpfd_rele(struct bpf_d *d) +{ + + if (!refcount_release(&d->bd_refcnt)) + return; + epoch_call(net_epoch_preempt, &d->epoch_ctx, bpfd_free); +} + +static struct bpf_program_buffer* +bpf_program_buffer_alloc(size_t size, int flags) +{ + + return (malloc(sizeof(struct bpf_program_buffer) + size, + M_BPF, flags)); +} + +static void +bpf_program_buffer_free(epoch_context_t ctx) +{ + struct bpf_program_buffer *ptr; + + ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx); +#ifdef BPF_JITTER + if (ptr->func != NULL) + bpf_destroy_jit_filter(ptr->func); +#endif + free(ptr, M_BPF); +} + /* * Wrapper functions for various buffering methods. If the set of buffer * modes expands, we will probably want to introduce a switch data structure @@ -673,7 +755,8 @@ bad: } /* - * Attach file to the bpf interface, i.e. make d listen on bp. + * Attach descriptor to the bpf interface, i.e. make d listen on bp, + * then reset its buffers and counters with reset_d(). */ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) @@ -689,7 +772,7 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) op_w = V_bpf_optimize_writers || d->bd_writer; if (d->bd_bif != NULL) - bpf_detachd_locked(d); + bpf_detachd_locked(d, false); /* * Point d at bp, and add d to the interface's list. * Since there are many applications using BPF for @@ -698,26 +781,27 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) * some filter is configured. */ - BPFIF_WLOCK(bp); BPFD_LOCK(d); - + /* + * Hold reference to bpif while descriptor uses this interface. + */ + bpfif_ref(bp); d->bd_bif = bp; - if (op_w != 0) { /* Add to writers-only list */ - LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next); + CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next); /* * We decrement bd_writer on every filter set operation. * First BIOCSETF is done by pcap_open_live() to set up - * snap length. After that appliation usually sets its own filter + * snap length. After that appliation usually sets its own + * filter. */ d->bd_writer = 2; } else - LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); + CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); + reset_d(d); BPFD_UNLOCK(d); - BPFIF_WUNLOCK(bp); - bpf_bpfd_cnt++; CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list", @@ -731,7 +815,8 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) * Check if we need to upgrade our descriptor @d from write-only mode. */ static int -bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) +bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, + int flen) { int is_snap, need_upgrade; @@ -751,7 +836,8 @@ bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) * we'd prefer to treat k=0 (deny ALL) case the same way: e.g. * do not consider upgrading immediately */ - if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K)) + if (cmd == BIOCSETF && flen == 1 && + fcode[0].code == (BPF_RET | BPF_K)) is_snap = 1; else is_snap = 0; @@ -789,88 +875,45 @@ bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) } /* - * Add d to the list of active bp filters. - * Requires bpf_attachd() to be called before. - */ -static void -bpf_upgraded(struct bpf_d *d) -{ - struct bpf_if *bp; - - BPF_LOCK_ASSERT(); - - bp = d->bd_bif; - - /* - * Filter can be set several times without specifying interface. - * Mark d as reader and exit. - */ - if (bp == NULL) { - BPFD_LOCK(d); - d->bd_writer = 0; - BPFD_UNLOCK(d); - return; - } - - BPFIF_WLOCK(bp); - BPFD_LOCK(d); - - /* Remove from writers-only list */ - LIST_REMOVE(d, bd_next); - LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); - /* Mark d as reader */ - d->bd_writer = 0; - - BPFD_UNLOCK(d); - BPFIF_WUNLOCK(bp); - - CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid); - - EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); -} - -/* * Detach a file from its interface. */ static void bpf_detachd(struct bpf_d *d) { BPF_LOCK(); - bpf_detachd_locked(d); + bpf_detachd_locked(d, false); BPF_UNLOCK(); } static void -bpf_detachd_locked(struct bpf_d *d) +bpf_detachd_locked(struct bpf_d *d, bool detached_ifp) { - int error; struct bpf_if *bp; struct ifnet *ifp; - - CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid); + int error; BPF_LOCK_ASSERT(); + CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid); /* Check if descriptor is attached */ if ((bp = d->bd_bif) == NULL) return; - BPFIF_WLOCK(bp); BPFD_LOCK(d); - + /* Remove d from the interface's descriptor list. */ + CK_LIST_REMOVE(d, bd_next); /* Save bd_writer value */ error = d->bd_writer; - - /* - * Remove d from the interface's descriptor list. - */ - LIST_REMOVE(d, bd_next); - ifp = bp->bif_ifp; d->bd_bif = NULL; + if (detached_ifp) { + /* + * Notify descriptor as it's detached, so that any + * sleepers wake up and get ENXIO. + */ + bpf_wakeup(d); + } BPFD_UNLOCK(d); - BPFIF_WUNLOCK(bp); - bpf_bpfd_cnt--; /* Call event handler iff d is attached */ @@ -879,9 +922,9 @@ bpf_detachd_locked(struct bpf_d *d) /* * Check if this descriptor had requested promiscuous mode. - * If so, turn it off. + * If so and ifnet is not detached, turn it off. */ - if (d->bd_promisc) { + if (d->bd_promisc && !detached_ifp) { d->bd_promisc = 0; CURVNET_SET(ifp->if_vnet); error = ifpromisc(ifp, 0); @@ -897,6 +940,7 @@ bpf_detachd_locked(struct bpf_d *d) "bpf_detach: ifpromisc failed (%d)\n", error); } } + bpfif_rele(bp); } /* @@ -921,8 +965,7 @@ bpf_dtor(void *data) seldrain(&d->bd_sel); knlist_destroy(&d->bd_sel.si_note); callout_drain(&d->bd_callout); - bpf_freed(d); - free(d, M_BPF); + bpfd_rele(d); } /* @@ -975,6 +1018,7 @@ bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; + d->bd_refcnt = 1; BPF_PID_REFRESH(d, td); #ifdef MAC mac_bpfdesc_init(d); @@ -1162,7 +1206,8 @@ bpf_timed_out(void *arg) BPFD_LOCK_ASSERT(d); - if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout)) + if (callout_pending(&d->bd_callout) || + !callout_active(&d->bd_callout)) return; if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; @@ -1192,49 +1237,73 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) bpfwrite(struct bpf_d *d, struct uio *uio, int ioflag) #endif /* __rtems__ */ { + struct route ro; + struct sockaddr dst; + struct epoch_tracker et; + struct bpf_if *bp; #ifndef __rtems__ struct bpf_d *d; #endif /* __rtems__ */ struct ifnet *ifp; struct mbuf *m, *mc; - struct sockaddr dst; - struct route ro; int error, hlen; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); + NET_EPOCH_ENTER(et); + BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); counter_u64_add(d->bd_wcount, 1); - /* XXX: locking required */ - if (d->bd_bif == NULL) { - counter_u64_add(d->bd_wdcount, 1); - return (ENXIO); + if ((bp = d->bd_bif) == NULL) { + error = ENXIO; + goto out_locked; } - ifp = d->bd_bif->bif_ifp; - + ifp = bp->bif_ifp; if ((ifp->if_flags & IFF_UP) == 0) { - counter_u64_add(d->bd_wdcount, 1); - return (ENETDOWN); + error = ENETDOWN; + goto out_locked; } - if (uio->uio_resid == 0) { - counter_u64_add(d->bd_wdcount, 1); - return (0); - } + if (uio->uio_resid == 0) + goto out_locked; bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; - /* XXX: bpf_movein() can sleep */ - error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, + + /* + * Take extra reference, unlock d and exit from epoch section, + * since bpf_movein() can sleep. + */ + bpfd_ref(d); + NET_EPOCH_EXIT(et); + BPFD_UNLOCK(d); + + error = bpf_movein(uio, (int)bp->bif_dlt, ifp, &m, &dst, &hlen, d); - if (error) { + + if (error != 0) { counter_u64_add(d->bd_wdcount, 1); + bpfd_rele(d); return (error); } + + BPFD_LOCK(d); + /* + * Check that descriptor is still attached to the interface. + * This can happen on bpfdetach(). To avoid access to detached + * ifnet, free mbuf and return ENXIO. + */ + if (d->bd_bif == NULL) { + counter_u64_add(d->bd_wdcount, 1); + BPFD_UNLOCK(d); + bpfd_rele(d); + m_freem(m); + return (ENXIO); + } counter_u64_add(d->bd_wfcount, 1); if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; @@ -1255,11 +1324,9 @@ bpfwrite(struct bpf_d *d, struct uio *uio, int ioflag) CURVNET_SET(ifp->if_vnet); #ifdef MAC - BPFD_LOCK(d); mac_bpfdesc_create_mbuf(d, m); if (mc != NULL) mac_bpfdesc_create_mbuf(d, mc); - BPFD_UNLOCK(d); #endif bzero(&ro, sizeof(ro)); @@ -1269,6 +1336,9 @@ bpfwrite(struct bpf_d *d, struct uio *uio, int ioflag) ro.ro_flags = RT_HAS_HEADER; } + /* Avoid possible recursion on BPFD_LOCK(). */ + NET_EPOCH_ENTER(et); + BPFD_UNLOCK(d); error = (*ifp->if_output)(ifp, m, &dst, &ro); if (error) counter_u64_add(d->bd_wdcount, 1); @@ -1279,8 +1349,15 @@ bpfwrite(struct bpf_d *d, struct uio *uio, int ioflag) else m_freem(mc); } + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); + bpfd_rele(d); + return (error); +out_locked: + counter_u64_add(d->bd_wdcount, 1); + NET_EPOCH_EXIT(et); + BPFD_UNLOCK(d); return (error); } @@ -1916,16 +1993,11 @@ bpfioctl(struct bpf_d *d, u_long cmd, caddr_t addr, int flags, } /* - * Set d's packet filter program to fp. If this file already has a filter, - * free it and replace it. Returns EINVAL for bogus requests. - * - * Note we need global lock here to serialize bpf_setf() and bpf_setif() calls - * since reading d->bd_bif can't be protected by d or interface lock due to - * lock order. - * - * Additionally, we have to acquire interface write lock due to bpf_mtap() uses - * interface read lock to read all filers. + * Set d's packet filter program to fp. If this file already has a filter, + * free it and replace it. Returns EINVAL for bogus requests. * + * Note we use global lock here to serialize bpf_setf() and bpf_setif() + * calls. */ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) @@ -1934,13 +2006,14 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) struct bpf_program fp_swab; struct bpf_program32 *fp32; #endif - struct bpf_insn *fcode, *old; + struct bpf_program_buffer *fcode; + struct bpf_insn *filter; #ifdef BPF_JITTER - bpf_jit_filter *jfunc, *ofunc; + bpf_jit_filter *jfunc; #endif size_t size; u_int flen; - int need_upgrade; + bool track_event; #ifdef COMPAT_FREEBSD32 switch (cmd) { @@ -1949,7 +2022,8 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) case BIOCSETFNR32: fp32 = (struct bpf_program32 *)fp; fp_swab.bf_len = fp32->bf_len; - fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns; + fp_swab.bf_insns = + (struct bpf_insn *)(uintptr_t)fp32->bf_insns; fp = &fp_swab; switch (cmd) { case BIOCSETF32: @@ -1963,12 +2037,10 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) } #endif - fcode = NULL; + filter = NULL; #ifdef BPF_JITTER - jfunc = ofunc = NULL; + jfunc = NULL; #endif - need_upgrade = 0; - /* * Check new filter validness before acquiring any locks. * Allocate memory for new filter, if needed. @@ -1978,10 +2050,11 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) return (EINVAL); size = flen * sizeof(*fp->bf_insns); if (size > 0) { - /* We're setting up new filter. Copy and check actual data. */ - fcode = malloc(size, M_BPF, M_WAITOK); - if (copyin(fp->bf_insns, fcode, size) != 0 || - !bpf_validate(fcode, flen)) { + /* We're setting up new filter. Copy and check actual data. */ + fcode = bpf_program_buffer_alloc(size, M_WAITOK); + filter = (struct bpf_insn *)fcode->buffer; + if (copyin(fp->bf_insns, filter, size) != 0 || + !bpf_validate(filter, flen)) { free(fcode, M_BPF); return (EINVAL); } @@ -1991,49 +2064,72 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) * Filter is copied inside fcode and is * perfectly valid. */ - jfunc = bpf_jitter(fcode, flen); + jfunc = bpf_jitter(filter, flen); } #endif } - BPF_LOCK(); + track_event = false; + fcode = NULL; - /* - * Set up new filter. - * Protect filter change by interface lock. - * Additionally, we are protected by global lock here. - */ - if (d->bd_bif != NULL) - BPFIF_WLOCK(d->bd_bif); + BPF_LOCK(); BPFD_LOCK(d); + /* Set up new filter. */ if (cmd == BIOCSETWF) { - old = d->bd_wfilter; - d->bd_wfilter = fcode; + if (d->bd_wfilter != NULL) { + fcode = __containerof((void *)d->bd_wfilter, + struct bpf_program_buffer, buffer); +#ifdef BPF_JITTER + fcode->func = NULL; +#endif + } + d->bd_wfilter = filter; } else { - old = d->bd_rfilter; - d->bd_rfilter = fcode; + if (d->bd_rfilter != NULL) { + fcode = __containerof((void *)d->bd_rfilter, + struct bpf_program_buffer, buffer); +#ifdef BPF_JITTER + fcode->func = d->bd_bfilter; +#endif + } + d->bd_rfilter = filter; #ifdef BPF_JITTER - ofunc = d->bd_bfilter; d->bd_bfilter = jfunc; #endif if (cmd == BIOCSETF) reset_d(d); - need_upgrade = bpf_check_upgrade(cmd, d, fcode, flen); + if (bpf_check_upgrade(cmd, d, filter, flen) != 0) { + /* + * Filter can be set several times without + * specifying interface. In this case just mark d + * as reader. + */ + d->bd_writer = 0; + if (d->bd_bif != NULL) { + /* + * Remove descriptor from writers-only list + * and add it to active readers list. + */ + CK_LIST_REMOVE(d, bd_next); + CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist, + d, bd_next); + CTR2(KTR_NET, + "%s: upgrade required by pid %d", + __func__, d->bd_pid); + track_event = true; + } + } } BPFD_UNLOCK(d); - if (d->bd_bif != NULL) - BPFIF_WUNLOCK(d->bd_bif); - if (old != NULL) - free(old, M_BPF); -#ifdef BPF_JITTER - if (ofunc != NULL) - bpf_destroy_jit_filter(ofunc); -#endif - /* Move d to active readers list. */ - if (need_upgrade != 0) - bpf_upgraded(d); + if (fcode != NULL) + epoch_call(net_epoch_preempt, &fcode->epoch_ctx, + bpf_program_buffer_free); + + if (track_event) + EVENTHANDLER_INVOKE(bpf_track, + d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1); BPF_UNLOCK(); return (0); @@ -2057,15 +2153,6 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr) return (ENXIO); bp = theywant->if_bpf; - - /* Check if interface is not being detached from BPF */ - BPFIF_RLOCK(bp); - if (bp->bif_flags & BPFIF_FLAG_DYING) { - BPFIF_RUNLOCK(bp); - return (ENXIO); - } - BPFIF_RUNLOCK(bp); - /* * At this point, we expect the buffer is already allocated. If not, * return an error. @@ -2084,9 +2171,11 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr) } if (bp != d->bd_bif) bpf_attachd(d, bp); - BPFD_LOCK(d); - reset_d(d); - BPFD_UNLOCK(d); + else { + BPFD_LOCK(d); + reset_d(d); + BPFD_UNLOCK(d); + } return (0); } @@ -2253,6 +2342,7 @@ bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m) void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { + struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER @@ -2262,24 +2352,14 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) int gottime; gottime = BPF_TSTAMP_NONE; - - BPFIF_RLOCK(bp); - - LIST_FOREACH(d, &bp->bif_dlist, bd_next) { - /* - * We are not using any locks for d here because: - * 1) any filter change is protected by interface - * write lock - * 2) destroying/detaching d is protected by interface - * write lock, too - */ - + NET_EPOCH_ENTER(et); + CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { counter_u64_add(d->bd_rcount, 1); /* - * NB: We dont call BPF_CHECK_DIRECTION() here since there is no - * way for the caller to indiciate to us whether this packet - * is inbound or outbound. In the bpf_mtap() routines, we use - * the interface pointers on the mbuf to figure it out. + * NB: We dont call BPF_CHECK_DIRECTION() here since there + * is no way for the caller to indiciate to us whether this + * packet is inbound or outbound. In the bpf_mtap() routines, + * we use the interface pointers on the mbuf to figure it out. */ #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; @@ -2293,10 +2373,10 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) * Filter matches. Let's to acquire write lock. */ BPFD_LOCK(d); - counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) - gottime = bpf_gettime(&bt, d->bd_tstamp, NULL); + gottime = bpf_gettime(&bt, d->bd_tstamp, + NULL); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif @@ -2305,7 +2385,7 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) BPFD_UNLOCK(d); } } - BPFIF_RUNLOCK(bp); + NET_EPOCH_EXIT(et); } #define BPF_CHECK_DIRECTION(d, r, i) \ @@ -2319,6 +2399,7 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { + struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER @@ -2328,7 +2409,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m) int gottime; /* Skip outgoing duplicate packets. */ - if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { + if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) { m->m_flags &= ~M_PROMISC; return; } @@ -2336,17 +2417,17 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m) pktlen = m_length(m, NULL); gottime = BPF_TSTAMP_NONE; - BPFIF_RLOCK(bp); - - LIST_FOREACH(d, &bp->bif_dlist, bd_next) { - if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) + NET_EPOCH_ENTER(et); + CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { + if (BPF_CHECK_DIRECTION(d, m_rcvif(m), bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; /* XXX We cannot handle multiple mbufs. */ if (bf != NULL && m->m_next == NULL) - slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen); + slen = (*(bf->func))(mtod(m, u_char *), pktlen, + pktlen); else #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); @@ -2364,7 +2445,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m) BPFD_UNLOCK(d); } } - BPFIF_RUNLOCK(bp); + NET_EPOCH_EXIT(et); } /* @@ -2374,6 +2455,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m) void bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) { + struct epoch_tracker et; struct bintime bt; struct mbuf mb; struct bpf_d *d; @@ -2392,6 +2474,7 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) * Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. */ + mb.m_flags = 0; mb.m_next = m; mb.m_data = data; mb.m_len = dlen; @@ -2399,9 +2482,8 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) gottime = BPF_TSTAMP_NONE; - BPFIF_RLOCK(bp); - - LIST_FOREACH(d, &bp->bif_dlist, bd_next) { + NET_EPOCH_ENTER(et); + CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); @@ -2420,11 +2502,10 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) BPFD_UNLOCK(d); } } - BPFIF_RUNLOCK(bp); + NET_EPOCH_EXIT(et); } #undef BPF_CHECK_DIRECTION - #undef BPF_TSTAMP_NONE #undef BPF_TSTAMP_FAST #undef BPF_TSTAMP_NORMAL @@ -2514,6 +2595,11 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, int tstype; BPFD_LOCK_ASSERT(d); + if (d->bd_bif == NULL) { + /* Descriptor was detached in concurrent thread */ + counter_u64_add(d->bd_dcount, 1); + return; + } /* * Detect whether user space has released a buffer back to us, and if @@ -2643,26 +2729,36 @@ copy: * Called on close. */ static void -bpf_freed(struct bpf_d *d) +bpfd_free(epoch_context_t ctx) { + struct bpf_d *d; + struct bpf_program_buffer *p; /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ + d = __containerof(ctx, struct bpf_d, epoch_ctx); bpf_free(d); if (d->bd_rfilter != NULL) { - free((caddr_t)d->bd_rfilter, M_BPF); + p = __containerof((void *)d->bd_rfilter, + struct bpf_program_buffer, buffer); #ifdef BPF_JITTER - if (d->bd_bfilter != NULL) - bpf_destroy_jit_filter(d->bd_bfilter); + p->func = d->bd_bfilter; #endif + bpf_program_buffer_free(&p->epoch_ctx); + } + if (d->bd_wfilter != NULL) { + p = __containerof((void *)d->bd_wfilter, + struct bpf_program_buffer, buffer); +#ifdef BPF_JITTER + p->func = NULL; +#endif + bpf_program_buffer_free(&p->epoch_ctx); } - if (d->bd_wfilter != NULL) - free((caddr_t)d->bd_wfilter, M_BPF); - mtx_destroy(&d->bd_lock); + mtx_destroy(&d->bd_lock); counter_u64_free(d->bd_rcount); counter_u64_free(d->bd_dcount); counter_u64_free(d->bd_fcount); @@ -2670,7 +2766,7 @@ bpf_freed(struct bpf_d *d) counter_u64_free(d->bd_wfcount); counter_u64_free(d->bd_wdcount); counter_u64_free(d->bd_zcopy); - + free(d, M_BPF); } /* @@ -2691,29 +2787,33 @@ bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) * headers are not yet supporrted). */ void -bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) +bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, + struct bpf_if **driverp) { struct bpf_if *bp; - bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO); - if (bp == NULL) - panic("bpfattach"); + KASSERT(*driverp == NULL, + ("bpfattach2: driverp already initialized")); - LIST_INIT(&bp->bif_dlist); - LIST_INIT(&bp->bif_wlist); + bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO); + + CK_LIST_INIT(&bp->bif_dlist); + CK_LIST_INIT(&bp->bif_wlist); bp->bif_ifp = ifp; bp->bif_dlt = dlt; - rw_init(&bp->bif_lock, "bpf interface lock"); - KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized")); + bp->bif_hdrlen = hdrlen; bp->bif_bpf = driverp; + bp->bif_refcnt = 1; *driverp = bp; - + /* + * Reference ifnet pointer, so it won't freed until + * we release it. + */ + if_ref(ifp); BPF_LOCK(); - LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); + CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); BPF_UNLOCK(); - bp->bif_hdrlen = hdrlen; - if (bootverbose && IS_DEFAULT_VNET(curvnet)) if_printf(ifp, "bpf attached\n"); } @@ -2752,98 +2852,32 @@ bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen) void bpfdetach(struct ifnet *ifp) { - struct bpf_if *bp, *bp_temp; - struct bpf_d *d; - int ndetached; - - ndetached = 0; + struct bpf_if *bp, *bp_temp; + struct bpf_d *d; BPF_LOCK(); /* Find all bpf_if struct's which reference ifp and detach them. */ - LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) { + CK_LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) { if (ifp != bp->bif_ifp) continue; - LIST_REMOVE(bp, bif_next); - /* Add to to-be-freed list */ - LIST_INSERT_HEAD(&bpf_freelist, bp, bif_next); - - ndetached++; - /* - * Delay freeing bp till interface is detached - * and all routes through this interface are removed. - * Mark bp as detached to restrict new consumers. - */ - BPFIF_WLOCK(bp); - bp->bif_flags |= BPFIF_FLAG_DYING; + CK_LIST_REMOVE(bp, bif_next); *bp->bif_bpf = (struct bpf_if *)&dead_bpf_if; - BPFIF_WUNLOCK(bp); - CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p", + CTR4(KTR_NET, + "%s: sheduling free for encap %d (%p) for if %p", __func__, bp->bif_dlt, bp, ifp); - /* Free common descriptors */ - while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) { - bpf_detachd_locked(d); - BPFD_LOCK(d); - bpf_wakeup(d); - BPFD_UNLOCK(d); + /* Detach common descriptors */ + while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) { + bpf_detachd_locked(d, true); } - /* Free writer-only descriptors */ - while ((d = LIST_FIRST(&bp->bif_wlist)) != NULL) { - bpf_detachd_locked(d); - BPFD_LOCK(d); - bpf_wakeup(d); - BPFD_UNLOCK(d); + /* Detach writer-only descriptors */ + while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) { + bpf_detachd_locked(d, true); } - } - BPF_UNLOCK(); - -#ifdef INVARIANTS - if (ndetached == 0) - printf("bpfdetach: %s was not attached\n", ifp->if_xname); -#endif -} - -/* - * Interface departure handler. - * Note departure event does not guarantee interface is going down. - * Interface renaming is currently done via departure/arrival event set. - * - * Departure handled is called after all routes pointing to - * given interface are removed and interface is in down state - * restricting any packets to be sent/received. We assume it is now safe - * to free data allocated by BPF. - */ -static void -bpf_ifdetach(void *arg __unused, struct ifnet *ifp) -{ - struct bpf_if *bp, *bp_temp; - int nmatched = 0; - - /* Ignore ifnet renaming. */ - if (ifp->if_flags & IFF_RENAMING) - return; - - BPF_LOCK(); - /* - * Find matching entries in free list. - * Nothing should be found if bpfdetach() was not called. - */ - LIST_FOREACH_SAFE(bp, &bpf_freelist, bif_next, bp_temp) { - if (ifp != bp->bif_ifp) - continue; - - CTR3(KTR_NET, "%s: freeing BPF instance %p for interface %p", - __func__, bp, ifp); - - LIST_REMOVE(bp, bif_next); - - rw_destroy(&bp->bif_lock); - free(bp, M_BPF); - - nmatched++; + bpfif_rele(bp); } BPF_UNLOCK(); } @@ -2862,9 +2896,8 @@ bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) BPF_LOCK_ASSERT(); ifp = d->bd_bif->bif_ifp; -again: n1 = 0; - LIST_FOREACH(bp, &bpf_iflist, bif_next) { + CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp) n1++; } @@ -2874,24 +2907,16 @@ again: } if (n1 > bfl->bfl_len) return (ENOMEM); - BPF_UNLOCK(); + lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK); n = 0; - BPF_LOCK(); - LIST_FOREACH(bp, &bpf_iflist, bif_next) { + CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; - if (n >= n1) { - free(lst, M_TEMP); - goto again; - } - lst[n] = bp->bif_dlt; - n++; + lst[n++] = bp->bif_dlt; } - BPF_UNLOCK(); error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n); free(lst, M_TEMP); - BPF_LOCK(); bfl->bfl_len = n; return (error); } @@ -2907,33 +2932,34 @@ bpf_setdlt(struct bpf_d *d, u_int dlt) struct bpf_if *bp; BPF_LOCK_ASSERT(); + MPASS(d->bd_bif != NULL); + /* + * It is safe to check bd_bif without BPFD_LOCK, it can not be + * changed while we hold global lock. + */ if (d->bd_bif->bif_dlt == dlt) return (0); - ifp = d->bd_bif->bif_ifp; - LIST_FOREACH(bp, &bpf_iflist, bif_next) { + ifp = d->bd_bif->bif_ifp; + CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } + if (bp == NULL) + return (EINVAL); - if (bp != NULL) { - opromisc = d->bd_promisc; - bpf_attachd(d, bp); - BPFD_LOCK(d); - reset_d(d); - BPFD_UNLOCK(d); - if (opromisc) { - error = ifpromisc(bp->bif_ifp, 1); - if (error) - if_printf(bp->bif_ifp, - "bpf_setdlt: ifpromisc failed (%d)\n", - error); - else - d->bd_promisc = 1; - } + opromisc = d->bd_promisc; + bpf_attachd(d, bp); + if (opromisc) { + error = ifpromisc(bp->bif_ifp, 1); + if (error) + if_printf(bp->bif_ifp, "%s: ifpromisc failed (%d)\n", + __func__, error); + else + d->bd_promisc = 1; } - return (bp == NULL ? EINVAL : 0); + return (0); } #ifdef __rtems__ static struct bpf_d * @@ -2973,7 +2999,7 @@ bpf_imfs_readv(rtems_libio_t *iop, const struct iovec *iov, int iovcnt, ssize_t struct bpf_d *d = bpf_imfs_get_context_by_iop(iop); struct thread *td = rtems_bsd_get_curthread_or_null(); struct uio uio = { - .uio_iov = iov, + .uio_iov = RTEMS_DECONST(struct iovec *, iov), .uio_iovcnt = iovcnt, .uio_offset = 0, .uio_resid = total, @@ -3014,7 +3040,7 @@ bpf_imfs_writev(rtems_libio_t *iop, const struct iovec *iov, int iovcnt, ssize_t struct bpf_d *d = bpf_imfs_get_context_by_iop(iop); struct thread *td = rtems_bsd_get_curthread_or_null(); struct uio uio = { - .uio_iov = iov, + .uio_iov = RTEMS_DECONST(struct iovec *, iov), .uio_iovcnt = iovcnt, .uio_offset = 0, .uio_resid = total, @@ -3042,7 +3068,7 @@ static ssize_t bpf_imfs_write(rtems_libio_t *iop, const void *buffer, size_t count) { struct iovec iov = { - .iov_base = buffer, + .iov_base = RTEMS_DECONST(void *, buffer), .iov_len = count }; @@ -3115,24 +3141,23 @@ bpf_drvinit(void *unused) #endif /* __rtems__ */ sx_init(&bpf_sx, "bpf global lock"); - LIST_INIT(&bpf_iflist); - LIST_INIT(&bpf_freelist); + CK_LIST_INIT(&bpf_iflist); #ifndef __rtems__ dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); /* For compatibility */ make_dev_alias(dev, "bpf0"); -#else /* __rtems__ */ - rv = IMFS_make_generic_node("/dev/bpf", mode, &bpf_imfs_control, NULL); - BSD_ASSERT(rv == 0); - rv = symlink("/dev/bpf", "/dev/bpf0"); - BSD_ASSERT(rv == 0); -#endif /* __rtems__ */ /* Register interface departure handler */ bpf_ifdetach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, bpf_ifdetach, NULL, EVENTHANDLER_PRI_ANY); +#else /* __rtems__ */ + rv = IMFS_make_generic_node("/dev/bpf", mode, &bpf_imfs_control, NULL); + BSD_ASSERT(rv == 0); + rv = symlink("/dev/bpf", "/dev/bpf0"); + BSD_ASSERT(rv == 0); +#endif /* __rtems__ */ } /* @@ -3147,19 +3172,19 @@ bpf_zero_counters(void) struct bpf_d *bd; BPF_LOCK(); - LIST_FOREACH(bp, &bpf_iflist, bif_next) { - BPFIF_RLOCK(bp); - LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { - BPFD_LOCK(bd); + /* + * We are protected by global lock here, interfaces and + * descriptors can not be deleted while we hold it. + */ + CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { + CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { counter_u64_zero(bd->bd_rcount); counter_u64_zero(bd->bd_dcount); counter_u64_zero(bd->bd_fcount); counter_u64_zero(bd->bd_wcount); counter_u64_zero(bd->bd_wfcount); counter_u64_zero(bd->bd_zcopy); - BPFD_UNLOCK(bd); } - BPFIF_RUNLOCK(bp); } BPF_UNLOCK(); } @@ -3171,10 +3196,9 @@ static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { + BPF_LOCK_ASSERT(); bzero(d, sizeof(*d)); - BPFD_LOCK_ASSERT(bd); d->bd_structsize = sizeof(*d); - /* XXX: reading should be protected by global lock */ d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; @@ -3251,22 +3275,16 @@ bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) return (ENOMEM); } index = 0; - LIST_FOREACH(bp, &bpf_iflist, bif_next) { - BPFIF_RLOCK(bp); + CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { /* Send writers-only first */ - LIST_FOREACH(bd, &bp->bif_wlist, bd_next) { + CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) { xbd = &xbdbuf[index++]; - BPFD_LOCK(bd); bpfstats_fill_xbpf(xbd, bd); - BPFD_UNLOCK(bd); } - LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { + CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; - BPFD_LOCK(bd); bpfstats_fill_xbpf(xbd, bd); - BPFD_UNLOCK(bd); } - BPFIF_RUNLOCK(bp); } BPF_UNLOCK(); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); @@ -3346,10 +3364,10 @@ bpf_show_bpf_if(struct bpf_if *bpf_if) /* bif_ext.bif_dlist */ BPF_DB_PRINTF("%#x", bif_dlt); BPF_DB_PRINTF("%u", bif_hdrlen); - BPF_DB_PRINTF("%p", bif_ifp); - /* bif_lock */ /* bif_wlist */ - BPF_DB_PRINTF("%#x", bif_flags); + BPF_DB_PRINTF("%p", bif_ifp); + BPF_DB_PRINTF("%p", bif_bpf); + BPF_DB_PRINTF("%u", bif_refcnt); } DB_SHOW_COMMAND(bpf_if, db_show_bpf_if) diff --git a/freebsd/sys/net/bpf.h b/freebsd/sys/net/bpf.h index d8eb7ff4..55b03b54 100644 --- a/freebsd/sys/net/bpf.h +++ b/freebsd/sys/net/bpf.h @@ -42,6 +42,10 @@ #ifndef _NET_BPF_H_ #define _NET_BPF_H_ +#include <sys/_eventhandler.h> +#include <sys/ck.h> +#include <net/dlt.h> + #if defined(__rtems__) && !defined(__FreeBSD__) #define __FreeBSD__ 1 #endif /* defined(__rtems__) && !defined(__FreeBSD__) */ @@ -236,9 +240,6 @@ struct bpf_zbuf_header { u_int _bzh_pad[5]; }; -/* Pull in data-link level type codes. */ -#include <net/dlt.h> - /* * The instruction encodings. * @@ -412,10 +413,11 @@ SYSCTL_DECL(_net_bpf); * bpf_peers_present() calls. */ struct bpf_if; +CK_LIST_HEAD(bpfd_list, bpf_d); struct bpf_if_ext { - LIST_ENTRY(bpf_if) bif_next; /* list of all interfaces */ - LIST_HEAD(, bpf_d) bif_dlist; /* descriptor list */ + CK_LIST_ENTRY(bpf_if) bif_next; /* list of all interfaces */ + struct bpfd_list bif_dlist; /* descriptor list */ }; void bpf_bufheld(struct bpf_d *d); @@ -439,7 +441,7 @@ bpf_peers_present(struct bpf_if *bpf) struct bpf_if_ext *ext; ext = (struct bpf_if_ext *)bpf; - if (!LIST_EMPTY(&ext->bif_dlist)) + if (!CK_LIST_EMPTY(&ext->bif_dlist)) return (1); return (0); } @@ -467,12 +469,10 @@ bpf_peers_present(struct bpf_if *bpf) */ #define BPF_MEMWORDS 16 -#ifdef _SYS_EVENTHANDLER_H_ /* BPF attach/detach events */ struct ifnet; typedef void (*bpf_track_fn)(void *, struct ifnet *, int /* dlt */, int /* 1 =>'s attach */); EVENTHANDLER_DECLARE(bpf_track, bpf_track_fn); -#endif /* _SYS_EVENTHANDLER_H_ */ #endif /* _NET_BPF_H_ */ diff --git a/freebsd/sys/net/bpf_buffer.c b/freebsd/sys/net/bpf_buffer.c index 7a182a61..daa9e267 100644 --- a/freebsd/sys/net/bpf_buffer.c +++ b/freebsd/sys/net/bpf_buffer.c @@ -71,8 +71,10 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_bpf.h> #include <sys/param.h> +#include <sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> +#include <sys/mutex.h> #include <sys/socket.h> #include <sys/uio.h> #include <sys/kernel.h> @@ -119,19 +121,10 @@ bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, { const struct mbuf *m; u_char *dst; - u_int count; m = (struct mbuf *)src; dst = (u_char *)buf + offset; - while (len > 0) { - if (m == NULL) - panic("bpf_mcopy"); - count = min(m->m_len, len); - bcopy(mtod(m, void *), dst, count); - m = m->m_next; - dst += count; - len -= count; - } + m_copydata(m, 0, len, dst); } /* diff --git a/freebsd/sys/net/bpfdesc.h b/freebsd/sys/net/bpfdesc.h index 2ce9204b..c28a74f9 100644 --- a/freebsd/sys/net/bpfdesc.h +++ b/freebsd/sys/net/bpfdesc.h @@ -43,9 +43,10 @@ #include <sys/callout.h> #include <sys/selinfo.h> -#include <sys/queue.h> +#include <sys/ck.h> #include <sys/conf.h> #include <sys/counter.h> +#include <sys/epoch.h> #include <net/if.h> /* @@ -53,7 +54,7 @@ */ struct zbuf; struct bpf_d { - LIST_ENTRY(bpf_d) bd_next; /* Linked list of descriptors */ + CK_LIST_ENTRY(bpf_d) bd_next; /* Linked list of descriptors */ /* * Buffer slots: two memory buffers store the incoming packets. * The model has three slots. Sbuf is always occupied. @@ -106,6 +107,9 @@ struct bpf_d { counter_u64_t bd_wdcount; /* number of packets dropped during a write */ counter_u64_t bd_zcopy; /* number of zero copy operations */ u_char bd_compat32; /* 32-bit stream on LP64 system */ + + volatile u_int bd_refcnt; + struct epoch_context epoch_ctx; }; /* Values for bd_state */ diff --git a/freebsd/sys/net/bridgestp.c b/freebsd/sys/net/bridgestp.c index 49e772b3..424f4d69 100644 --- a/freebsd/sys/net/bridgestp.c +++ b/freebsd/sys/net/bridgestp.c @@ -2024,6 +2024,7 @@ bstp_same_bridgeid(uint64_t id1, uint64_t id2) void bstp_reinit(struct bstp_state *bs) { + struct epoch_tracker et; struct bstp_port *bp; struct ifnet *ifp, *mif; u_char *e_addr; @@ -2044,7 +2045,7 @@ bstp_reinit(struct bstp_state *bs) * from is part of this bridge, so we can have more than one independent * bridges in the same STP domain. */ - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_type != IFT_ETHER) continue; /* Not Ethernet */ @@ -2064,7 +2065,7 @@ bstp_reinit(struct bstp_state *bs) continue; } } - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); if (mif == NULL) goto disablestp; @@ -2275,4 +2276,7 @@ bstp_destroy(struct bstp_port *bp) taskqueue_drain(taskqueue_swi, &bp->bp_statetask); taskqueue_drain(taskqueue_swi, &bp->bp_rtagetask); taskqueue_drain(taskqueue_swi, &bp->bp_mediatask); + + if (bp->bp_bs->bs_root_port == bp) + bstp_assign_roles(bp->bp_bs); } diff --git a/freebsd/sys/net/ethernet.h b/freebsd/sys/net/ethernet.h index fa75c1df..7ceb9b80 100644 --- a/freebsd/sys/net/ethernet.h +++ b/freebsd/sys/net/ethernet.h @@ -401,6 +401,8 @@ struct ether_vlan_header { #ifdef _KERNEL +#include <sys/_eventhandler.h> + struct ifnet; struct mbuf; struct route; @@ -422,12 +424,11 @@ void ether_vlan_mtap(struct bpf_if *, struct mbuf *, struct mbuf *ether_vlanencap(struct mbuf *, uint16_t); bool ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p, uint16_t vid, uint8_t pcp); +void ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr); -#ifdef _SYS_EVENTHANDLER_H_ /* new ethernet interface attached event */ typedef void (*ether_ifattach_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ether_ifattach_event, ether_ifattach_event_handler_t); -#endif #else /* _KERNEL */ diff --git a/freebsd/sys/net/ieee8023ad_lacp.c b/freebsd/sys/net/ieee8023ad_lacp.c index 9a70d6a1..46076a23 100644 --- a/freebsd/sys/net/ieee8023ad_lacp.c +++ b/freebsd/sys/net/ieee8023ad_lacp.c @@ -34,6 +34,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <rtems/bsd/local/opt_kern_tls.h> #include <rtems/bsd/local/opt_ratelimit.h> #include <sys/param.h> @@ -837,7 +838,9 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) struct lacp_softc *lsc = LACP_SOFTC(sc); struct lacp_portmap *pm; struct lacp_port *lp; + struct lacp_port **map; uint32_t hash; + int count; if (__predict_false(lsc->lsc_suppress_distributing)) { LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__)); @@ -850,13 +853,31 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) return (NULL); } +#ifdef NUMA + if ((sc->sc_opts & LAGG_OPT_USE_NUMA) && + pm->pm_num_dom > 1 && m->m_pkthdr.numa_domain < MAXMEMDOM) { + count = pm->pm_numa[m->m_pkthdr.numa_domain].count; + if (count > 0) { + map = pm->pm_numa[m->m_pkthdr.numa_domain].map; + } else { + /* No ports on this domain; use global hash. */ + map = pm->pm_map; + count = pm->pm_count; + } + } else +#endif + { + map = pm->pm_map; + count = pm->pm_count; + } if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) hash = m->m_pkthdr.flowid >> sc->flowid_shift; else hash = m_ether_tcpip_hash(sc->sc_flags, m, lsc->lsc_hashkey); - hash %= pm->pm_count; - lp = pm->pm_map[hash]; + + hash %= count; + lp = map[hash]; KASSERT((lp->lp_state & LACP_STATE_DISTRIBUTING) != 0, ("aggregated port is not distributing")); @@ -864,7 +885,7 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) return (lp->lp_lagg); } -#ifdef RATELIMIT +#if defined(RATELIMIT) || defined(KERN_TLS) struct lagg_port * lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t flowid) { @@ -1046,6 +1067,10 @@ lacp_update_portmap(struct lacp_softc *lsc) uint64_t speed; u_int newmap; int i; +#ifdef NUMA + int count; + uint8_t domain; +#endif newmap = lsc->lsc_activemap == 0 ? 1 : 0; p = &lsc->lsc_pmap[newmap]; @@ -1056,9 +1081,25 @@ lacp_update_portmap(struct lacp_softc *lsc) if (la != NULL && la->la_nports > 0) { p->pm_count = la->la_nports; i = 0; - TAILQ_FOREACH(lp, &la->la_ports, lp_dist_q) + TAILQ_FOREACH(lp, &la->la_ports, lp_dist_q) { p->pm_map[i++] = lp; +#ifdef NUMA + domain = lp->lp_ifp->if_numa_domain; + if (domain >= MAXMEMDOM) + continue; + count = p->pm_numa[domain].count; + p->pm_numa[domain].map[count] = lp; + p->pm_numa[domain].count++; +#endif + } KASSERT(i == p->pm_count, ("Invalid port count")); + +#ifdef NUMA + for (i = 0; i < MAXMEMDOM; i++) { + if (p->pm_numa[i].count != 0) + p->pm_num_dom++; + } +#endif speed = lacp_aggregator_bandwidth(la); } sc->sc_ifp->if_baudrate = speed; diff --git a/freebsd/sys/net/ieee8023ad_lacp.h b/freebsd/sys/net/ieee8023ad_lacp.h index 5ae48ceb..b6a0860f 100644 --- a/freebsd/sys/net/ieee8023ad_lacp.h +++ b/freebsd/sys/net/ieee8023ad_lacp.h @@ -197,8 +197,15 @@ enum lacp_mux_state { #define LACP_MAX_PORTS 32 +struct lacp_numa { + int count; + struct lacp_port *map[LACP_MAX_PORTS]; +}; + struct lacp_portmap { int pm_count; + int pm_num_dom; + struct lacp_numa pm_numa[MAXMEMDOM]; struct lacp_port *pm_map[LACP_MAX_PORTS]; }; @@ -286,7 +293,7 @@ struct lacp_softc { struct mbuf *lacp_input(struct lagg_port *, struct mbuf *); struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *); -#ifdef RATELIMIT +#if defined(RATELIMIT) || defined(KERN_TLS) struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t); #endif void lacp_attach(struct lagg_softc *); diff --git a/freebsd/sys/net/ieee_oui.h b/freebsd/sys/net/ieee_oui.h new file mode 100644 index 00000000..068328d8 --- /dev/null +++ b/freebsd/sys/net/ieee_oui.h @@ -0,0 +1,85 @@ +/* - + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 The FreeBSD Foundation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + * Author: George V. Neville-Neil + * + */ + +/* Organizationally Unique Identifier assigned by IEEE 14 Nov 2013 */ +#define OUI_FREEBSD_BASE 0x589cfc000000 +#define OUI_FREEBSD(nic) (OUI_FREEBSD_BASE | (nic)) + +/* + * OUIs are most often used to uniquely identify network interfaces + * and occupy the first 3 bytes of both destination and source MAC + * addresses. The following allocations exist so that various + * software systems associated with FreeBSD can have unique IDs in the + * absence of hardware. The use of OUIs for this purpose is not fully + * fleshed out but is now in common use in virtualization technology. + * + * Allocations from this range are expected to be made using COMMON + * SENSE by developers. Do NOT take a large range just because + * they're currently wide open. Take the smallest useful range for + * your system. We have (2^24 - 2) available addresses (see Reserved + * Values below) but that is far from infinite. + * + * In the event of a conflict arbitration of allocation in this file + * is subject to core@ approval. + * + * Applications are differentiated based on the high order bit(s) of + * the remaining three bytes. Our first allocation has all 0s, the + * next allocation has the highest bit set. Allocating in this way + * gives us 254 allocations of 64K addresses. Address blocks can be + * concatenated if necessary. + * + * Reserved Values: 0x000000 and 0xffffff are reserved and MUST NOT BE + * allocated for any reason. + */ + +/* Allocate 20 bits to bhyve */ +#define OUI_FREEBSD_BHYVE_LOW OUI_FREEBSD(0x000001) +#define OUI_FREEBSD_BHYVE_HIGH OUI_FREEBSD(0x0fffff) + +/* + * Allocate 16 bits for a pool to give to various interfaces that need a + * generated address, but don't quite need to slice off a whole section of + * the OUI (e.g. cloned interfaces, one-off NICs of various vendors). + * + * ether_gen_addr should be used to generate an address from this pool. + */ +#define OUI_FREEBSD_GENERATED_MASK 0x10ffff +#define OUI_FREEBSD_GENERATED_LOW OUI_FREEBSD(0x100000) +#define OUI_FREEBSD_GENERATED_HIGH OUI_FREEBSD(OUI_FREEBSD_GENERATED_MASK) + +/* Allocate 16 bits for emulated NVMe devices */ +#define OUI_FREEBSD_NVME_MASK 0x20ffff +#define OUI_FREEBSD_NVME_LOW OUI_FREEBSD(0x200000) +#define OUI_FREEBSD_NVME_HIGH OUI_FREEBSD(OUI_FREEBSD_NVME_MASK) diff --git a/freebsd/sys/net/if.c b/freebsd/sys/net/if.c index 9d233444..c1fd928e 100644 --- a/freebsd/sys/net/if.c +++ b/freebsd/sys/net/if.c @@ -38,9 +38,10 @@ #include <rtems/bsd/local/opt_inet.h> #include <sys/param.h> -#include <sys/types.h> #include <sys/conf.h> +#include <sys/eventhandler.h> #include <sys/malloc.h> +#include <sys/domainset.h> #include <sys/sbuf.h> #include <sys/bus.h> #include <sys/epoch.h> @@ -175,14 +176,14 @@ struct ifmediareq32 { #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #define _CASE_IOC_IFGROUPREQ_32(cmd) \ - case _IOC_NEWTYPE((cmd), struct ifgroupreq32): + _IOC_NEWTYPE((cmd), struct ifgroupreq32): case #else /* !COMPAT_FREEBSD32 */ #define _CASE_IOC_IFGROUPREQ_32(cmd) #endif /* !COMPAT_FREEBSD32 */ #define CASE_IOC_IFGROUPREQ(cmd) \ _CASE_IOC_IFGROUPREQ_32(cmd) \ - case (cmd) + (cmd) union ifreq_union { struct ifreq ifr; @@ -270,7 +271,6 @@ static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); -static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); @@ -358,16 +358,17 @@ ifnet_byindex(u_short idx) struct ifnet * ifnet_byindex_ref(u_short idx) { + struct epoch_tracker et; struct ifnet *ifp; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (NULL); } if_ref(ifp); - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (ifp); } @@ -431,14 +432,15 @@ ifnet_setbyindex(u_short idx, struct ifnet *ifp) struct ifaddr * ifaddr_byindex(u_short idx) { + struct epoch_tracker et; struct ifnet *ifp; struct ifaddr *ifa = NULL; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); ifp = ifnet_byindex_locked(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (ifa); } @@ -531,13 +533,23 @@ if_grow(void) * registered for the passed type. */ struct ifnet * -if_alloc(u_char type) +if_alloc_domain(u_char type, int numa_domain) { struct ifnet *ifp; u_short idx; void *old; - ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); +#ifndef __rtems__ + KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large")); + if (numa_domain == IF_NODOM) +#endif /* __rtems__ */ + ifp = malloc(sizeof(struct ifnet), M_IFNET, + M_WAITOK | M_ZERO); +#ifndef __rtems__ + else + ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET, + DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO); +#endif /* __rtems__ */ restart: IFNET_WLOCK(); idx = ifindex_alloc(&old); @@ -552,6 +564,9 @@ if_alloc(u_char type) ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; +#ifndef __rtems__ + ifp->if_numa_domain = numa_domain; +#endif /* __rtems__ */ #ifdef VIMAGE ifp->if_vnet = curvnet; #endif @@ -585,6 +600,22 @@ if_alloc(u_char type) return (ifp); } +struct ifnet * +if_alloc_dev(u_char type, device_t dev) +{ + int numa_domain; + + if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0) + return (if_alloc_domain(type, IF_NODOM)); + return (if_alloc_domain(type, numa_domain)); +} + +struct ifnet * +if_alloc(u_char type) +{ + + return (if_alloc_domain(type, IF_NODOM)); +} /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an @@ -613,7 +644,14 @@ if_free_internal(struct ifnet *ifp) free(ifp->if_description, M_IFDESCR); free(ifp->if_hw_addr, M_IFADDR); - free(ifp, M_IFNET); +#ifndef __rtems__ + if (ifp->if_numa_domain == IF_NODOM) +#endif /* __rtems__ */ + free(ifp, M_IFNET); +#ifndef __rtems__ + else + free_domain(ifp, M_IFNET); +#endif /* __rtems__ */ } static void @@ -840,7 +878,6 @@ if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; - ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; @@ -976,12 +1013,14 @@ if_purgeaddrs(struct ifnet *ifp) struct ifaddr *ifa; while (1) { - NET_EPOCH_ENTER(); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) break; } - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); if (ifa == NULL) break; @@ -1107,6 +1146,15 @@ if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) curvnet->vnet_ifcnt--; #endif epoch_wait_preempt(net_epoch_preempt); + + /* + * Ensure all pending EPOCH(9) callbacks have been executed. This + * fixes issues about late destruction of multicast options + * which lead to leave group calls, which in turn access the + * belonging ifnet structure: + */ + epoch_drain_callbacks(net_epoch_preempt); + /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. @@ -1618,38 +1666,39 @@ ifgr_groups_get(void *ifgrp) static int if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp) { + struct epoch_tracker et; int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; if (ifgr->ifgr_len == 0) { - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); /* XXX: wire */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (error); } len -= sizeof(ifgrq); ifgp++; } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); return (0); } @@ -1869,6 +1918,7 @@ static int ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, struct sockaddr *ia) { + struct epoch_tracker et; int error; struct rt_addrinfo info; struct sockaddr_dl null_sdl; @@ -1879,6 +1929,16 @@ ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; + if (cmd == RTM_ADD) { + /* explicitly specify (loopback) ifa */ + if (info.rti_ifp != NULL) { + NET_EPOCH_ENTER(et); + info.rti_ifa = ifaof_ifpforaddr(ifa->ifa_addr, info.rti_ifp); + if (info.rti_ifa != NULL) + ifa_ref(info.rti_ifa); + NET_EPOCH_EXIT(et); + } + } info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; @@ -1963,11 +2023,12 @@ done: int ifa_ifwithaddr_check(const struct sockaddr *addr) { + struct epoch_tracker et; int rc; - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); rc = (ifa_ifwithaddr(addr) != NULL); - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (rc); } @@ -2057,9 +2118,7 @@ ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) /* * Scan though each interface, looking for ones that have addresses - * in this address family and the requested fib. Maintain a reference - * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that - * kept it stable when we move onto the next interface. + * in this address family and the requested fib. */ CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) @@ -2188,38 +2247,6 @@ ifa_preferred(struct ifaddr *cur, struct ifaddr *next) ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } -#include <net/if_llatbl.h> - -/* - * Default action when installing a route with a Link Level gateway. - * Lookup an appropriate real ifa to point to. - * This should be moved to /sys/net/link.c eventually. - */ -static void -link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) -{ - struct ifaddr *ifa, *oifa; - struct sockaddr *dst; - struct ifnet *ifp; - - if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == NULL) || - ((ifp = ifa->ifa_ifp) == NULL) || ((dst = rt_key(rt)) == NULL)) - return; - NET_EPOCH_ENTER(); - ifa = ifaof_ifpforaddr(dst, ifp); - if (ifa) { - oifa = rt->rt_ifa; - if (oifa != ifa) { - ifa_free(oifa); - ifa_ref(ifa); - } - rt->rt_ifa = ifa; - if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) - ifa->ifa_rtrequest(cmd, rt, info); - } - NET_EPOCH_EXIT(); -} - struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { @@ -2418,9 +2445,10 @@ if_qflush(struct ifnet *ifp) struct ifnet * ifunit_ref(const char *name) { + struct epoch_tracker et; struct ifnet *ifp; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) @@ -2428,21 +2456,22 @@ ifunit_ref(const char *name) } if (ifp != NULL) if_ref(ifp); - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (ifp); } struct ifnet * ifunit(const char *name) { + struct epoch_tracker et; struct ifnet *ifp; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (ifp); } @@ -2706,6 +2735,8 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) if (strlen(new_name) == IFNAMSIZ-1) return (EINVAL); } + if (strcmp(new_name, ifp->if_xname) == 0) + break; if (ifunit(new_name) != NULL) return (EEXIST); @@ -2830,6 +2861,7 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) return (EINVAL); if (cmd == SIOCADDMULTI) { + struct epoch_tracker et; struct ifmultiaddr *ifma; /* @@ -2839,9 +2871,9 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) * lose a race while we check if the membership * already exists. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); ifma = if_findmulti(ifp, &ifr->ifr_addr); - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (ifma != NULL) error = EADDRINUSE; else @@ -2878,6 +2910,7 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: + case SIOCGIFDOWNREASON: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); @@ -2895,7 +2928,7 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) error = if_gethwaddr(ifp, ifr); break; - CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): + case CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); @@ -2904,12 +2937,12 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) return (error); break; - CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): + case CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): if ((error = if_getgroup((struct ifgroupreq *)data, ifp))) return (error); break; - CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): + case CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); @@ -3080,7 +3113,7 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) error = if_clone_list((struct if_clonereq *)data); goto out_noref; - CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): + case CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): error = if_getgroupmembers((struct ifgroupreq *)data); goto out_noref; @@ -3280,6 +3313,7 @@ again: IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { + struct epoch_tracker et; int addrs; /* @@ -3296,7 +3330,7 @@ again: } addrs = 0; - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; @@ -3324,7 +3358,7 @@ again: if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (addrs == 0) { sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); @@ -3631,15 +3665,16 @@ if_delmulti(struct ifnet *ifp, struct sockaddr *sa) struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS + struct epoch_tracker et; struct ifnet *oifp; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif @@ -3705,15 +3740,16 @@ if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags) if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { + struct epoch_tracker et; struct ifnet *oifp; - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); } #endif /* @@ -3837,10 +3873,11 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; + struct epoch_tracker et; int rc; rc = 0; - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); ifa = ifp->if_addr; if (ifa == NULL) { rc = EINVAL; @@ -3874,7 +3911,7 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) * to re-init it in order to reprogram its * address filter. */ - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; @@ -3890,7 +3927,7 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); out: - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (rc); } @@ -4305,6 +4342,8 @@ if_getsoftc(if_t ifp) void if_setrcvif(struct mbuf *m, if_t ifp) { + + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (struct ifnet *)ifp; } diff --git a/freebsd/sys/net/if_arp.h b/freebsd/sys/net/if_arp.h index 070dbafe..f4c3bec2 100644 --- a/freebsd/sys/net/if_arp.h +++ b/freebsd/sys/net/if_arp.h @@ -105,8 +105,9 @@ struct arpstat { uint64_t rxrequests; /* # of ARP requests received by this host. */ uint64_t rxreplies; /* # of ARP replies received by this host. */ uint64_t received; /* # of ARP packets received by this host. */ + uint64_t txerrors; /* # of ARP requests failed to send. */ - uint64_t arp_spares[4]; /* For either the upper or lower half. */ + uint64_t arp_spares[3]; /* For either the upper or lower half. */ /* Abnormal event and error counting: */ uint64_t dropped; /* # of packets dropped waiting for a reply. */ uint64_t timeouts; /* # of times with entries removed */ diff --git a/freebsd/sys/net/if_bridge.c b/freebsd/sys/net/if_bridge.c index aa56be48..18e0e7bf 100644 --- a/freebsd/sys/net/if_bridge.c +++ b/freebsd/sys/net/if_bridge.c @@ -228,7 +228,7 @@ struct bridge_softc { struct bstp_state sc_stp; /* STP state */ uint32_t sc_brtexceeded; /* # of cache drops */ struct ifnet *sc_ifaddr; /* member mac copied from */ - u_char sc_defaddr[6]; /* Default MAC address */ + struct ether_addr sc_defaddr; /* Default MAC address */ }; VNET_DEFINE_STATIC(struct mtx, bridge_list_mtx); @@ -237,7 +237,8 @@ static eventhandler_tag bridge_detach_cookie; int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD; -uma_zone_t bridge_rtnode_zone; +VNET_DEFINE_STATIC(uma_zone_t, bridge_rtnode_zone); +#define V_bridge_rtnode_zone VNET(bridge_rtnode_zone) static int bridge_clone_create(struct if_clone *, int, caddr_t); static void bridge_clone_destroy(struct ifnet *); @@ -529,6 +530,9 @@ static void vnet_bridge_init(const void *unused __unused) { + V_bridge_rtnode_zone = uma_zcreate("bridge_rtnode", + sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); BRIDGE_LIST_LOCK_INIT(); LIST_INIT(&V_bridge_list); V_bridge_cloner = if_clone_simple(bridge_name, @@ -544,6 +548,7 @@ vnet_bridge_uninit(const void *unused __unused) if_clone_detach(V_bridge_cloner); V_bridge_cloner = NULL; BRIDGE_LIST_LOCK_DESTROY(); + uma_zdestroy(V_bridge_rtnode_zone); } VNET_SYSUNINIT(vnet_bridge_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_bridge_uninit, NULL); @@ -554,9 +559,6 @@ bridge_modevent(module_t mod, int type, void *data) switch (type) { case MOD_LOAD: - bridge_rtnode_zone = uma_zcreate("bridge_rtnode", - sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL, - UMA_ALIGN_PTR, 0); bridge_dn_p = bridge_dummynet; bridge_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, bridge_ifdetach, NULL, @@ -565,7 +567,6 @@ bridge_modevent(module_t mod, int type, void *data) case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, bridge_detach_cookie); - uma_zdestroy(bridge_rtnode_zone); bridge_dn_p = NULL; break; default: @@ -672,16 +673,14 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) getcredhostid(curthread->td_ucred, &hostid); do { if (fb || hostid == 0) { - arc4rand(sc->sc_defaddr, ETHER_ADDR_LEN, 1); - sc->sc_defaddr[0] &= ~1;/* clear multicast bit */ - sc->sc_defaddr[0] |= 2; /* set the LAA bit */ + ether_gen_addr(ifp, &sc->sc_defaddr); } else { - sc->sc_defaddr[0] = 0x2; - sc->sc_defaddr[1] = (hostid >> 24) & 0xff; - sc->sc_defaddr[2] = (hostid >> 16) & 0xff; - sc->sc_defaddr[3] = (hostid >> 8 ) & 0xff; - sc->sc_defaddr[4] = hostid & 0xff; - sc->sc_defaddr[5] = ifp->if_dunit & 0xff; + sc->sc_defaddr.octet[0] = 0x2; + sc->sc_defaddr.octet[1] = (hostid >> 24) & 0xff; + sc->sc_defaddr.octet[2] = (hostid >> 16) & 0xff; + sc->sc_defaddr.octet[3] = (hostid >> 8 ) & 0xff; + sc->sc_defaddr.octet[4] = hostid & 0xff; + sc->sc_defaddr.octet[5] = ifp->if_dunit & 0xff; } fb = 1; @@ -689,7 +688,7 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) BRIDGE_LIST_LOCK(); LIST_FOREACH(sc2, &V_bridge_list, sc_list) { bifp = sc2->sc_ifp; - if (memcmp(sc->sc_defaddr, + if (memcmp(sc->sc_defaddr.octet, IF_LLADDR(bifp), ETHER_ADDR_LEN) == 0) { retry = 1; break; @@ -699,7 +698,7 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) } while (retry == 1); bstp_attach(&sc->sc_stp, &bridge_ops); - ether_ifattach(ifp, sc->sc_defaddr); + ether_ifattach(ifp, sc->sc_defaddr.octet); /* Now undo some of the damage... */ ifp->if_baudrate = 0; ifp->if_type = IFT_BRIDGE; @@ -734,6 +733,9 @@ bridge_clone_destroy(struct ifnet *ifp) bridge_delete_span(sc, bif); } + /* Tear down the routing table. */ + bridge_rtable_fini(sc); + BRIDGE_UNLOCK(sc); callout_drain(&sc->sc_brcallout); @@ -746,9 +748,6 @@ bridge_clone_destroy(struct ifnet *ifp) ether_ifdetach(ifp); if_free(ifp); - /* Tear down the routing table. */ - bridge_rtable_fini(sc); - BRIDGE_LOCK_DESTROY(sc); free(sc, M_DEVBUF); } @@ -927,7 +926,7 @@ bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set) { struct ifnet *ifp = bif->bif_ifp; struct ifreq ifr; - int error; + int error, mask, stuck; BRIDGE_UNLOCK_ASSERT(sc); @@ -940,10 +939,12 @@ bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set) if_printf(sc->sc_ifp, "error setting capabilities on %s: %d\n", ifp->if_xname, error); - if ((ifp->if_capenable & ~set) != 0) + mask = BRIDGE_IFCAPS_MASK | BRIDGE_IFCAPS_STRIP; + stuck = ifp->if_capenable & mask & ~set; + if (stuck != 0) if_printf(sc->sc_ifp, "can't disable some capabilities on %s: 0x%x\n", - ifp->if_xname, ifp->if_capenable & ~set); + ifp->if_xname, stuck); } } @@ -1018,7 +1019,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, */ if (V_bridge_inherit_mac && sc->sc_ifaddr == ifs) { if (LIST_EMPTY(&sc->sc_iflist)) { - bcopy(sc->sc_defaddr, + bcopy(&sc->sc_defaddr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = NULL; } else { @@ -1189,7 +1190,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) * the default randomly generated one. */ if (V_bridge_inherit_mac && LIST_EMPTY(&sc->sc_iflist) && - !memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr, ETHER_ADDR_LEN)) { + !memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr.octet, ETHER_ADDR_LEN)) { bcopy(IF_LLADDR(ifs), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = ifs; EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); @@ -1972,9 +1973,9 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp) return; } - if (PFIL_HOOKED(&V_inet_pfil_hook) + if (PFIL_HOOKED_OUT(V_inet_pfil_head) #ifdef INET6 - || PFIL_HOOKED(&V_inet6_pfil_hook) + || PFIL_HOOKED_OUT(V_inet6_pfil_head) #endif ) { if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0) @@ -2001,7 +2002,7 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, struct rtentry *rt) { struct ether_header *eh; - struct ifnet *dst_if; + struct ifnet *bifp, *dst_if; struct bridge_softc *sc; uint16_t vlan; @@ -2016,13 +2017,14 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, vlan = VLANTAGOF(m); BRIDGE_LOCK(sc); + bifp = sc->sc_ifp; /* * If bridge is down, but the original output interface is up, * go ahead and send out that interface. Otherwise, the packet * is dropped below. */ - if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + if ((bifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { dst_if = ifp; goto sendunicast; } @@ -2035,6 +2037,9 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, dst_if = NULL; else dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan); + /* Tap any traffic not passing back out the originating interface */ + if (dst_if != ifp) + ETHER_BPF_MTAP(bifp, m); if (dst_if == NULL) { struct bridge_iflist *bif; struct mbuf *mc; @@ -2072,7 +2077,7 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, } else { mc = m_copypacket(m, M_NOWAIT); if (mc == NULL) { - if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); + if_inc_counter(bifp, IFCOUNTER_OERRORS, 1); continue; } } @@ -2232,9 +2237,9 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, ETHER_BPF_MTAP(ifp, m); /* run the packet filter */ - if (PFIL_HOOKED(&V_inet_pfil_hook) + if (PFIL_HOOKED_IN(V_inet_pfil_head) #ifdef INET6 - || PFIL_HOOKED(&V_inet6_pfil_hook) + || PFIL_HOOKED_IN(V_inet6_pfil_head) #endif ) { BRIDGE_UNLOCK(sc); @@ -2272,9 +2277,9 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, BRIDGE_UNLOCK(sc); - if (PFIL_HOOKED(&V_inet_pfil_hook) + if (PFIL_HOOKED_OUT(V_inet_pfil_head) #ifdef INET6 - || PFIL_HOOKED(&V_inet6_pfil_hook) + || PFIL_HOOKED_OUT(V_inet6_pfil_head) #endif ) { if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0) @@ -2411,7 +2416,7 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) #ifdef INET6 # define OR_PFIL_HOOKED_INET6 \ - || PFIL_HOOKED(&V_inet6_pfil_hook) + || PFIL_HOOKED_IN(V_inet6_pfil_head) #else # define OR_PFIL_HOOKED_INET6 #endif @@ -2423,22 +2428,6 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) if (memcmp(IF_LLADDR((iface)), eh->ether_dhost, ETHER_ADDR_LEN) == 0 \ OR_CARP_CHECK_WE_ARE_DST((iface)) \ ) { \ - if ((iface)->if_type == IFT_BRIDGE) { \ - ETHER_BPF_MTAP(iface, m); \ - if_inc_counter(iface, IFCOUNTER_IPACKETS, 1); \ - if_inc_counter(iface, IFCOUNTER_IBYTES, m->m_pkthdr.len); \ - /* Filter on the physical interface. */ \ - if (V_pfil_local_phys && \ - (PFIL_HOOKED(&V_inet_pfil_hook) \ - OR_PFIL_HOOKED_INET6)) { \ - if (bridge_pfil(&m, NULL, ifp, \ - PFIL_IN) != 0 || m == NULL) { \ - BRIDGE_UNLOCK(sc); \ - return (NULL); \ - } \ - eh = mtod(m, struct ether_header *); \ - } \ - } \ if (bif->bif_flags & IFBIF_LEARNING) { \ error = bridge_rtupdate(sc, eh->ether_shost, \ vlan, bif, 0, IFBAF_DYNAMIC); \ @@ -2449,6 +2438,26 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) } \ } \ m->m_pkthdr.rcvif = iface; \ + if ((iface) == ifp) { \ + /* Skip bridge processing... src == dest */ \ + BRIDGE_UNLOCK(sc); \ + return (m); \ + } \ + /* It's passing over or to the bridge, locally. */ \ + ETHER_BPF_MTAP(bifp, m); \ + if_inc_counter(bifp, IFCOUNTER_IPACKETS, 1); \ + if_inc_counter(bifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); \ + /* Filter on the physical interface. */ \ + if (V_pfil_local_phys && (PFIL_HOOKED_IN(V_inet_pfil_head) \ + OR_PFIL_HOOKED_INET6)) { \ + if (bridge_pfil(&m, NULL, ifp, \ + PFIL_IN) != 0 || m == NULL) { \ + BRIDGE_UNLOCK(sc); \ + return (NULL); \ + } \ + } \ + if ((iface) != bifp) \ + ETHER_BPF_MTAP(iface, m); \ BRIDGE_UNLOCK(sc); \ return (m); \ } \ @@ -2519,9 +2528,9 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, } /* Filter on the bridge interface before broadcasting */ - if (runfilt && (PFIL_HOOKED(&V_inet_pfil_hook) + if (runfilt && (PFIL_HOOKED_OUT(V_inet_pfil_head) #ifdef INET6 - || PFIL_HOOKED(&V_inet6_pfil_hook) + || PFIL_HOOKED_OUT(V_inet6_pfil_head) #endif )) { if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0) @@ -2566,9 +2575,9 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, * pointer so we do not redundantly filter on the bridge for * each interface we broadcast on. */ - if (runfilt && (PFIL_HOOKED(&V_inet_pfil_hook) + if (runfilt && (PFIL_HOOKED_OUT(V_inet_pfil_head) #ifdef INET6 - || PFIL_HOOKED(&V_inet6_pfil_hook) + || PFIL_HOOKED_OUT(V_inet6_pfil_head) #endif )) { if (used == 0) { @@ -2671,7 +2680,7 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, * initialize the expiration time and Ethernet * address. */ - brt = uma_zalloc(bridge_rtnode_zone, M_NOWAIT | M_ZERO); + brt = uma_zalloc(V_bridge_rtnode_zone, M_NOWAIT | M_ZERO); if (brt == NULL) return (ENOMEM); @@ -2684,7 +2693,7 @@ bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, brt->brt_vlan = vlan; if ((error = bridge_rtnode_insert(sc, brt)) != 0) { - uma_zfree(bridge_rtnode_zone, brt); + uma_zfree(V_bridge_rtnode_zone, brt); return (error); } brt->brt_dst = bif; @@ -2768,11 +2777,14 @@ bridge_timer(void *arg) BRIDGE_LOCK_ASSERT(sc); + /* Destruction of rtnodes requires a proper vnet context */ + CURVNET_SET(sc->sc_ifp->if_vnet); bridge_rtage(sc); if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz, bridge_timer, sc); + CURVNET_RESTORE(); } /* @@ -3030,7 +3042,7 @@ bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt) LIST_REMOVE(brt, brt_list); sc->sc_brtcnt--; brt->brt_dst->bif_addrcnt--; - uma_zfree(bridge_rtnode_zone, brt); + uma_zfree(V_bridge_rtnode_zone, brt); } /* @@ -3044,6 +3056,7 @@ bridge_rtable_expire(struct ifnet *ifp, int age) struct bridge_softc *sc = ifp->if_bridge; struct bridge_rtnode *brt; + CURVNET_SET(ifp->if_vnet); BRIDGE_LOCK(sc); /* @@ -3062,6 +3075,7 @@ bridge_rtable_expire(struct ifnet *ifp, int age) } } BRIDGE_UNLOCK(sc); + CURVNET_RESTORE(); } /* @@ -3103,6 +3117,7 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) struct ip *ip; struct llc llc1; u_int16_t ether_type; + pfil_return_t rv; snap = 0; error = -1; /* Default error if not error == 0 */ @@ -3174,14 +3189,14 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) } /* Run the packet through pfil before stripping link headers */ - if (PFIL_HOOKED(&V_link_pfil_hook) && V_pfil_ipfw != 0 && - dir == PFIL_OUT && ifp != NULL) { - - error = pfil_run_hooks(&V_link_pfil_hook, mp, ifp, dir, 0, - NULL); - - if (*mp == NULL || error != 0) /* packet consumed by filter */ - return (error); + if (PFIL_HOOKED_OUT(V_link_pfil_head) && V_pfil_ipfw != 0 && + dir == PFIL_OUT && ifp != NULL) { + switch (pfil_run_hooks(V_link_pfil_head, mp, ifp, dir, NULL)) { + case PFIL_DROPPED: + return (EPERM); + case PFIL_CONSUMED: + return (0); + } } /* Strip off the Ethernet header and keep a copy. */ @@ -3219,6 +3234,7 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) /* * Run the packet through pfil */ + rv = PFIL_PASS; switch (ether_type) { case ETHERTYPE_IP: /* @@ -3228,25 +3244,19 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) * Keep the order: * in_if -> bridge_if -> out_if */ - if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL) - error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp, - dir, 0, NULL); - - if (*mp == NULL || error != 0) /* filter may consume */ + if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL && (rv = + pfil_run_hooks(V_inet_pfil_head, mp, bifp, dir, NULL)) != + PFIL_PASS) break; - if (V_pfil_member && ifp != NULL) - error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, - dir, 0, NULL); - - if (*mp == NULL || error != 0) /* filter may consume */ + if (V_pfil_member && ifp != NULL && (rv = + pfil_run_hooks(V_inet_pfil_head, mp, ifp, dir, NULL)) != + PFIL_PASS) break; - if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL) - error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp, - dir, 0, NULL); - - if (*mp == NULL || error != 0) /* filter may consume */ + if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL && (rv = + pfil_run_hooks(V_inet_pfil_head, mp, bifp, dir, NULL)) != + PFIL_PASS) break; /* check if we need to fragment the packet */ @@ -3282,35 +3292,33 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) break; #ifdef INET6 case ETHERTYPE_IPV6: - if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL) - error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp, - dir, 0, NULL); - - if (*mp == NULL || error != 0) /* filter may consume */ + if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL && (rv = + pfil_run_hooks(V_inet6_pfil_head, mp, bifp, dir, NULL)) != + PFIL_PASS) break; - if (V_pfil_member && ifp != NULL) - error = pfil_run_hooks(&V_inet6_pfil_hook, mp, ifp, - dir, 0, NULL); - - if (*mp == NULL || error != 0) /* filter may consume */ + if (V_pfil_member && ifp != NULL && (rv = + pfil_run_hooks(V_inet6_pfil_head, mp, ifp, dir, NULL)) != + PFIL_PASS) break; - if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL) - error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp, - dir, 0, NULL); + if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL && (rv = + pfil_run_hooks(V_inet6_pfil_head, mp, bifp, dir, NULL)) != + PFIL_PASS) + break; break; #endif + } + + switch (rv) { + case PFIL_CONSUMED: + return (0); + case PFIL_DROPPED: + return (EPERM); default: - error = 0; break; } - if (*mp == NULL) - return (error); - if (error != 0) - goto bad; - error = -1; /* diff --git a/freebsd/sys/net/if_clone.h b/freebsd/sys/net/if_clone.h index 5dceacf6..30d604f3 100644 --- a/freebsd/sys/net/if_clone.h +++ b/freebsd/sys/net/if_clone.h @@ -37,6 +37,8 @@ #ifdef _KERNEL +#include <sys/_eventhandler.h> + #define IFC_NOGROUP 0x1 struct if_clone; @@ -65,11 +67,9 @@ const char *ifc_name(struct if_clone *); void ifc_flags_set(struct if_clone *, int flags); int ifc_flags_get(struct if_clone *); -#ifdef _SYS_EVENTHANDLER_H_ /* Interface clone event. */ typedef void (*if_clone_event_handler_t)(void *, struct if_clone *); EVENTHANDLER_DECLARE(if_clone_event, if_clone_event_handler_t); -#endif /* The below interfaces used only by net/if.c. */ void vnet_if_clone_init(void); diff --git a/freebsd/sys/net/if_dead.c b/freebsd/sys/net/if_dead.c index 552be13f..ff73ceaf 100644 --- a/freebsd/sys/net/if_dead.c +++ b/freebsd/sys/net/if_dead.c @@ -128,6 +128,23 @@ ifdead_snd_tag_free(struct m_snd_tag *pmt) { } +static void +ifdead_ratelimit_query(struct ifnet *ifp __unused, + struct if_ratelimit_query_results *q) +{ + /* + * This guy does not support + * this interface. Not sure + * why we would specify a + * flag on the interface + * that says we do. + */ + q->rate_table = NULL; + q->flags = RT_NOSUPPORT; + q->max_flows = 0; + q->number_of_rates = 0; +} + void if_dead(struct ifnet *ifp) { @@ -144,4 +161,5 @@ if_dead(struct ifnet *ifp) ifp->if_snd_tag_modify = ifdead_snd_tag_modify; ifp->if_snd_tag_query = ifdead_snd_tag_query; ifp->if_snd_tag_free = ifdead_snd_tag_free; + ifp->if_ratelimit_query = ifdead_ratelimit_query; } diff --git a/freebsd/sys/net/if_enc.c b/freebsd/sys/net/if_enc.c index ebfbf5cb..9e7fcc53 100644 --- a/freebsd/sys/net/if_enc.c +++ b/freebsd/sys/net/if_enc.c @@ -287,24 +287,24 @@ enc_hhook(int32_t hhook_type, int32_t hhook_id, void *udata, void *ctx_data, switch (hhook_id) { #ifdef INET case AF_INET: - ph = &V_inet_pfil_hook; + ph = V_inet_pfil_head; break; #endif #ifdef INET6 case AF_INET6: - ph = &V_inet6_pfil_hook; + ph = V_inet6_pfil_head; break; #endif default: ph = NULL; } - if (ph == NULL || !PFIL_HOOKED(ph)) + if (ph == NULL || (pdir == PFIL_OUT && !PFIL_HOOKED_OUT(ph)) || + (pdir == PFIL_IN && !PFIL_HOOKED_IN(ph))) return (0); /* Make a packet looks like it was received on enc(4) */ rcvif = (*ctx->mp)->m_pkthdr.rcvif; (*ctx->mp)->m_pkthdr.rcvif = ifp; - if (pfil_run_hooks(ph, ctx->mp, ifp, pdir, 0, ctx->inp) != 0 || - *ctx->mp == NULL) { + if (pfil_run_hooks(ph, ctx->mp, ifp, pdir, ctx->inp) != PFIL_PASS) { *ctx->mp = NULL; /* consumed by filter */ return (EACCES); } diff --git a/freebsd/sys/net/if_ethersubr.c b/freebsd/sys/net/if_ethersubr.c index 96ed309a..6c5c2ccb 100644 --- a/freebsd/sys/net/if_ethersubr.c +++ b/freebsd/sys/net/if_ethersubr.c @@ -44,11 +44,13 @@ #include <sys/systm.h> #include <sys/bus.h> #include <sys/eventhandler.h> +#include <sys/jail.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/module.h> #include <sys/mbuf.h> +#include <sys/proc.h> #include <sys/priv.h> #include <sys/random.h> #include <sys/socket.h> @@ -56,6 +58,7 @@ #include <sys/sysctl.h> #include <sys/uuid.h> +#include <net/ieee_oui.h> #include <net/if.h> #include <net/if_var.h> #include <net/if_arp.h> @@ -87,12 +90,14 @@ #endif #include <security/mac/mac_framework.h> +#include <crypto/sha1.h> + #ifdef CTASSERT CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif -VNET_DEFINE(struct pfil_head, link_pfil_hook); /* Packet filter hooks */ +VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); @@ -459,7 +464,6 @@ ether_set_pcp(struct mbuf **mp, struct ifnet *ifp, uint8_t pcp) int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { - int error; uint8_t pcp; pcp = ifp->if_pcp; @@ -467,27 +471,27 @@ ether_output_frame(struct ifnet *ifp, struct mbuf *m) !ether_set_pcp(&m, ifp, pcp)) return (0); - if (PFIL_HOOKED(&V_link_pfil_hook)) { - error = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, - PFIL_OUT, 0, NULL); - if (error != 0) + if (PFIL_HOOKED_OUT(V_link_pfil_head)) + switch (pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_OUT, + NULL)) { + case PFIL_DROPPED: return (EACCES); - - if (m == NULL) + case PFIL_CONSUMED: return (0); - } + } #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ - /* Catch ETHERTYPE_IP, and ETHERTYPE_ARP if we are v6-only. */ - if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY) != 0) { + /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ + if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { struct ether_header *eh; eh = mtod(m, struct ether_header *); switch (ntohs(eh->ether_type)) { case ETHERTYPE_IP: case ETHERTYPE_ARP: + case ETHERTYPE_REVARP: m_freem(m); return (EAFNOSUPPORT); /* NOTREACHED */ @@ -538,6 +542,25 @@ ether_input_internal(struct ifnet *ifp, struct mbuf *m) etype = ntohs(eh->ether_type); random_harvest_queue_ether(m, sizeof(*m)); +#ifdef EXPERIMENTAL +#if defined(INET6) && defined(INET) + /* draft-ietf-6man-ipv6only-flag */ + /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ + if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { + + switch (etype) { + case ETHERTYPE_IP: + case ETHERTYPE_ARP: + case ETHERTYPE_REVARP: + m_freem(m); + return; + /* NOTREACHED */ + break; + }; + } +#endif +#endif + CURVNET_SET_QUIET(ifp->if_vnet); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { @@ -739,14 +762,14 @@ SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL); static void vnet_ether_init(__unused void *arg) { - int i; + struct pfil_head_args args; + + args.pa_version = PFIL_VERSION; + args.pa_flags = PFIL_IN | PFIL_OUT; + args.pa_type = PFIL_TYPE_ETHERNET; + args.pa_headname = PFIL_ETHER_NAME; + V_link_pfil_head = pfil_head_register(&args); - /* Initialize packet filter hooks. */ - V_link_pfil_hook.ph_type = PFIL_TYPE_AF; - V_link_pfil_hook.ph_af = AF_LINK; - if ((i = pfil_head_register(&V_link_pfil_hook)) != 0) - printf("%s: WARNING: unable to register pfil link hook, " - "error %d\n", __func__, i); #ifdef VIMAGE netisr_register_vnet(ðer_nh); #endif @@ -758,11 +781,8 @@ VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, static void vnet_ether_pfil_destroy(__unused void *arg) { - int i; - if ((i = pfil_head_unregister(&V_link_pfil_hook)) != 0) - printf("%s: WARNING: unable to unregister pfil link hook, " - "error %d\n", __func__, i); + pfil_head_unregister(V_link_pfil_head); } VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY, vnet_ether_pfil_destroy, NULL); @@ -798,6 +818,7 @@ ether_input(struct ifnet *ifp, struct mbuf *m) * We will rely on rcvif being set properly in the deferred context, * so assert it is correct here. */ + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p " "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp)); CURVNET_SET_QUIET(ifp->if_vnet); @@ -820,10 +841,8 @@ ether_demux(struct ifnet *ifp, struct mbuf *m) KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); /* Do not grab PROMISC frames in case we are re-entered. */ - if (PFIL_HOOKED(&V_link_pfil_hook) && !(m->m_flags & M_PROMISC)) { - i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_IN, 0, - NULL); - + if (PFIL_HOOKED_IN(V_link_pfil_head) && !(m->m_flags & M_PROMISC)) { + i = pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_IN, NULL); if (i != 0 || m == NULL) return; } @@ -1390,5 +1409,38 @@ ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p, return (true); } +/* + * Allocate an address from the FreeBSD Foundation OUI. This uses a + * cryptographic hash function on the containing jail's UUID and the interface + * name to attempt to provide a unique but stable address. Pseudo-interfaces + * which require a MAC address should use this function to allocate + * non-locally-administered addresses. + */ +void +ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr) +{ +#define ETHER_GEN_ADDR_BUFSIZ HOSTUUIDLEN + IFNAMSIZ + 2 + SHA1_CTX ctx; + char buf[ETHER_GEN_ADDR_BUFSIZ]; + char uuid[HOSTUUIDLEN + 1]; + uint64_t addr; + int i, sz; + char digest[SHA1_RESULTLEN]; + + getcredhostuuid(curthread->td_ucred, uuid, sizeof(uuid)); + sz = snprintf(buf, ETHER_GEN_ADDR_BUFSIZ, "%s-%s", uuid, ifp->if_xname); + SHA1Init(&ctx); + SHA1Update(&ctx, buf, sz); + SHA1Final(digest, &ctx); + + addr = ((digest[0] << 16) | (digest[1] << 8) | digest[2]) & + OUI_FREEBSD_GENERATED_MASK; + addr = OUI_FREEBSD(addr); + for (i = 0; i < ETHER_ADDR_LEN; ++i) { + hwaddr->octet[i] = addr >> ((ETHER_ADDR_LEN - i - 1) * 8) & + 0xFF; + } +} + DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(ether, 1); diff --git a/freebsd/sys/net/if_gre.c b/freebsd/sys/net/if_gre.c index 4fbc105e..5aeb8266 100644 --- a/freebsd/sys/net/if_gre.c +++ b/freebsd/sys/net/if_gre.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> +#include <rtems/bsd/local/opt_rss.h> #include <sys/param.h> #include <sys/kernel.h> @@ -51,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include <sys/priv.h> #include <sys/proc.h> #include <sys/socket.h> +#include <sys/socketvar.h> #include <sys/sockio.h> #include <sys/sx.h> #include <sys/sysctl.h> @@ -67,19 +69,27 @@ __FBSDID("$FreeBSD$"); #include <net/route.h> #include <netinet/in.h> +#include <netinet/in_pcb.h> #ifdef INET #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/ip_var.h> +#ifdef RSS +#include <netinet/in_rss.h> +#endif #endif #ifdef INET6 #include <netinet/ip6.h> #include <netinet6/in6_var.h> #include <netinet6/ip6_var.h> +#ifdef RSS +#include <netinet6/in6_rss.h> +#endif #endif #include <netinet/ip_encap.h> +#include <netinet/udp.h> #include <net/bpf.h> #include <net/if_gre.h> @@ -153,6 +163,7 @@ vnet_gre_uninit(const void *unused __unused) #ifdef INET6 in6_gre_uninit(); #endif + /* XXX: epoch_call drain */ } VNET_SYSUNINIT(vnet_gre_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gre_uninit, NULL); @@ -272,6 +283,7 @@ gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) break; case GRESKEY: case GRESOPTS: + case GRESPORT: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if ((error = copyin(ifr_data_get_ptr(ifr), &opt, @@ -287,23 +299,45 @@ gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) } if (sc->gre_options == opt) break; + } else if (cmd == GRESPORT) { + if (opt != 0 && (opt < V_ipport_hifirstauto || + opt > V_ipport_hilastauto)) { + error = EINVAL; + break; + } + if (sc->gre_port == opt) + break; + if ((sc->gre_options & GRE_UDPENCAP) == 0) { + /* + * UDP encapsulation is not enabled, thus + * there is no need to reattach softc. + */ + sc->gre_port = opt; + break; + } } switch (sc->gre_family) { #ifdef INET case AF_INET: - in_gre_setopts(sc, cmd, opt); + error = in_gre_setopts(sc, cmd, opt); break; #endif #ifdef INET6 case AF_INET6: - in6_gre_setopts(sc, cmd, opt); + error = in6_gre_setopts(sc, cmd, opt); break; #endif default: + /* + * Tunnel is not yet configured. + * We can just change any parameters. + */ if (cmd == GRESKEY) sc->gre_key = opt; - else + if (cmd == GRESOPTS) sc->gre_options = opt; + if (cmd == GRESPORT) + sc->gre_port = opt; break; } /* @@ -319,6 +353,10 @@ gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = copyout(&sc->gre_options, ifr_data_get_ptr(ifr), sizeof(sc->gre_options)); break; + case GREGPORT: + error = copyout(&sc->gre_port, ifr_data_get_ptr(ifr), + sizeof(sc->gre_port)); + break; default: error = EINVAL; break; @@ -343,6 +381,7 @@ end: static void gre_delete_tunnel(struct gre_softc *sc) { + struct gre_socket *gs; sx_assert(&gre_ioctl_sx, SA_XLOCKED); if (sc->gre_family != 0) { @@ -352,6 +391,16 @@ gre_delete_tunnel(struct gre_softc *sc) free(sc->gre_hdr, M_GRE); sc->gre_family = 0; } + /* + * If this Tunnel was the last one that could use UDP socket, + * we should unlink socket from hash table and close it. + */ + if ((gs = sc->gre_so) != NULL && CK_LIST_EMPTY(&gs->list)) { + CK_LIST_REMOVE(gs, chain); + soclose(gs->so); + epoch_call(net_epoch_preempt, &gs->epoch_ctx, gre_sofree); + sc->gre_so = NULL; + } GRE2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(GRE2IFP(sc), LINK_STATE_DOWN); } @@ -378,7 +427,38 @@ gre_hashdestroy(struct gre_list *hash) } void -gre_updatehdr(struct gre_softc *sc, struct grehdr *gh) +gre_sofree(epoch_context_t ctx) +{ + struct gre_socket *gs; + + gs = __containerof(ctx, struct gre_socket, epoch_ctx); + free(gs, M_GRE); +} + +static __inline uint16_t +gre_cksum_add(uint16_t sum, uint16_t a) +{ + uint16_t res; + + res = sum + a; + return (res + (res < a)); +} + +void +gre_update_udphdr(struct gre_softc *sc, struct udphdr *udp, uint16_t csum) +{ + + sx_assert(&gre_ioctl_sx, SA_XLOCKED); + MPASS(sc->gre_options & GRE_UDPENCAP); + + udp->uh_dport = htons(GRE_UDPPORT); + udp->uh_sport = htons(sc->gre_port); + udp->uh_sum = csum; + udp->uh_ulen = 0; +} + +void +gre_update_hdr(struct gre_softc *sc, struct grehdr *gh) { uint32_t *opts; uint16_t flags; @@ -545,6 +625,52 @@ gre_setseqn(struct grehdr *gh, uint32_t seq) *opts = htonl(seq); } +static uint32_t +gre_flowid(struct gre_softc *sc, struct mbuf *m, uint32_t af) +{ + uint32_t flowid; + + if ((sc->gre_options & GRE_UDPENCAP) == 0 || sc->gre_port != 0) + return (0); +#ifndef RSS + switch (af) { +#ifdef INET + case AF_INET: + flowid = mtod(m, struct ip *)->ip_src.s_addr ^ + mtod(m, struct ip *)->ip_dst.s_addr; + break; +#endif +#ifdef INET6 + case AF_INET6: + flowid = mtod(m, struct ip6_hdr *)->ip6_src.s6_addr32[3] ^ + mtod(m, struct ip6_hdr *)->ip6_dst.s6_addr32[3]; + break; +#endif + default: + flowid = 0; + } +#else /* RSS */ + switch (af) { +#ifdef INET + case AF_INET: + flowid = rss_hash_ip4_2tuple(mtod(m, struct ip *)->ip_src, + mtod(m, struct ip *)->ip_dst); + break; +#endif +#ifdef INET6 + case AF_INET6: + flowid = rss_hash_ip6_2tuple( + &mtod(m, struct ip6_hdr *)->ip6_src, + &mtod(m, struct ip6_hdr *)->ip6_dst); + break; +#endif + default: + flowid = 0; + } +#endif + return (flowid); +} + #define MTAG_GRE 1307983903 static int gre_transmit(struct ifnet *ifp, struct mbuf *m) @@ -552,7 +678,8 @@ gre_transmit(struct ifnet *ifp, struct mbuf *m) GRE_RLOCK_TRACKER; struct gre_softc *sc; struct grehdr *gh; - uint32_t af; + struct udphdr *uh; + uint32_t af, flowid; int error, len; uint16_t proto; @@ -579,6 +706,7 @@ gre_transmit(struct ifnet *ifp, struct mbuf *m) af = m->m_pkthdr.csum_data; BPF_MTAP2(ifp, &af, sizeof(af), m); m->m_flags &= ~(M_BCAST|M_MCAST); + flowid = gre_flowid(sc, m, af); M_SETFIB(m, sc->gre_fibnum); M_PREPEND(m, sc->gre_hlen, M_NOWAIT); if (m == NULL) { @@ -620,6 +748,19 @@ gre_transmit(struct ifnet *ifp, struct mbuf *m) error = ENETDOWN; goto drop; } + if (sc->gre_options & GRE_UDPENCAP) { + uh = (struct udphdr *)mtodo(m, len); + uh->uh_sport |= htons(V_ipport_hifirstauto) | + (flowid >> 16) | (flowid & 0xFFFF); + uh->uh_sport = htons(ntohs(uh->uh_sport) % + V_ipport_hilastauto); + uh->uh_ulen = htons(m->m_pkthdr.len - len); + uh->uh_sum = gre_cksum_add(uh->uh_sum, + htons(m->m_pkthdr.len - len + IPPROTO_UDP)); + m->m_pkthdr.csum_flags = sc->gre_csumflags; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + len += sizeof(struct udphdr); + } gh = (struct grehdr *)mtodo(m, len); gh->gre_proto = proto; if (sc->gre_options & GRE_ENABLE_SEQ) @@ -637,7 +778,7 @@ gre_transmit(struct ifnet *ifp, struct mbuf *m) #endif #ifdef INET6 case AF_INET6: - error = in6_gre_output(m, af, sc->gre_hlen); + error = in6_gre_output(m, af, sc->gre_hlen, flowid); break; #endif default: diff --git a/freebsd/sys/net/if_gre.h b/freebsd/sys/net/if_gre.h index 4b93321a..de3c5979 100644 --- a/freebsd/sys/net/if_gre.h +++ b/freebsd/sys/net/if_gre.h @@ -53,14 +53,35 @@ struct greip { struct ip gi_ip; struct grehdr gi_gre; } __packed; -#endif + +struct greudp { + struct ip gi_ip; + struct udphdr gi_udp; + struct grehdr gi_gre; +} __packed; +#endif /* INET */ #ifdef INET6 struct greip6 { struct ip6_hdr gi6_ip6; struct grehdr gi6_gre; } __packed; -#endif + +struct greudp6 { + struct ip6_hdr gi6_ip6; + struct udphdr gi6_udp; + struct grehdr gi6_gre; +} __packed; +#endif /* INET6 */ + +CK_LIST_HEAD(gre_list, gre_softc); +CK_LIST_HEAD(gre_sockets, gre_socket); +struct gre_socket { + struct socket *so; + struct gre_list list; + CK_LIST_ENTRY(gre_socket) chain; + struct epoch_context epoch_ctx; +}; struct gre_softc { struct ifnet *gre_ifp; @@ -69,22 +90,26 @@ struct gre_softc { uint32_t gre_oseq; uint32_t gre_key; uint32_t gre_options; + uint32_t gre_csumflags; + uint32_t gre_port; u_int gre_fibnum; u_int gre_hlen; /* header size */ union { void *hdr; #ifdef INET - struct greip *gihdr; + struct greip *iphdr; + struct greudp *udphdr; #endif #ifdef INET6 - struct greip6 *gi6hdr; + struct greip6 *ip6hdr; + struct greudp6 *udp6hdr; #endif } gre_uhdr; + struct gre_socket *gre_so; CK_LIST_ENTRY(gre_softc) chain; CK_LIST_ENTRY(gre_softc) srchash; }; -CK_LIST_HEAD(gre_list, gre_softc); MALLOC_DECLARE(M_GRE); #ifndef GRE_HASH_SIZE @@ -98,28 +123,35 @@ MALLOC_DECLARE(M_GRE); #define GRE_WAIT() epoch_wait_preempt(net_epoch_preempt) #define gre_hdr gre_uhdr.hdr -#define gre_gihdr gre_uhdr.gihdr -#define gre_gi6hdr gre_uhdr.gi6hdr -#define gre_oip gre_gihdr->gi_ip -#define gre_oip6 gre_gi6hdr->gi6_ip6 +#define gre_iphdr gre_uhdr.iphdr +#define gre_ip6hdr gre_uhdr.ip6hdr +#define gre_udphdr gre_uhdr.udphdr +#define gre_udp6hdr gre_uhdr.udp6hdr + +#define gre_oip gre_iphdr->gi_ip +#define gre_udp gre_udphdr->gi_udp +#define gre_oip6 gre_ip6hdr->gi6_ip6 +#define gre_udp6 gre_udp6hdr->gi6_udp struct gre_list *gre_hashinit(void); void gre_hashdestroy(struct gre_list *); int gre_input(struct mbuf *, int, int, void *); -void gre_updatehdr(struct gre_softc *, struct grehdr *); +void gre_update_hdr(struct gre_softc *, struct grehdr *); +void gre_update_udphdr(struct gre_softc *, struct udphdr *, uint16_t); +void gre_sofree(epoch_context_t); void in_gre_init(void); void in_gre_uninit(void); -void in_gre_setopts(struct gre_softc *, u_long, uint32_t); +int in_gre_setopts(struct gre_softc *, u_long, uint32_t); int in_gre_ioctl(struct gre_softc *, u_long, caddr_t); int in_gre_output(struct mbuf *, int, int); void in6_gre_init(void); void in6_gre_uninit(void); -void in6_gre_setopts(struct gre_softc *, u_long, uint32_t); +int in6_gre_setopts(struct gre_softc *, u_long, uint32_t); int in6_gre_ioctl(struct gre_softc *, u_long, caddr_t); -int in6_gre_output(struct mbuf *, int, int); +int in6_gre_output(struct mbuf *, int, int, uint32_t); /* * CISCO uses special type for GRE tunnel created as part of WCCP * connection, while in fact those packets are just IPv4 encapsulated @@ -139,9 +171,15 @@ int in6_gre_output(struct mbuf *, int, int); #define GRESKEY _IOW('i', 108, struct ifreq) #define GREGOPTS _IOWR('i', 109, struct ifreq) #define GRESOPTS _IOW('i', 110, struct ifreq) +#define GREGPORT _IOWR('i', 111, struct ifreq) +#define GRESPORT _IOW('i', 112, struct ifreq) + +/* GRE-in-UDP encapsulation destination port as defined in RFC8086 */ +#define GRE_UDPPORT 4754 #define GRE_ENABLE_CSUM 0x0001 #define GRE_ENABLE_SEQ 0x0002 -#define GRE_OPTMASK (GRE_ENABLE_CSUM|GRE_ENABLE_SEQ) +#define GRE_UDPENCAP 0x0004 +#define GRE_OPTMASK (GRE_ENABLE_CSUM|GRE_ENABLE_SEQ|GRE_UDPENCAP) #endif /* _NET_IF_GRE_H_ */ diff --git a/freebsd/sys/net/if_lagg.c b/freebsd/sys/net/if_lagg.c index 85099115..b82313eb 100644 --- a/freebsd/sys/net/if_lagg.c +++ b/freebsd/sys/net/if_lagg.c @@ -25,6 +25,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> +#include <rtems/bsd/local/opt_kern_tls.h> #include <rtems/bsd/local/opt_ratelimit.h> #include <sys/param.h> @@ -97,6 +98,11 @@ static struct { {0, NULL} }; +struct lagg_snd_tag { + struct m_snd_tag com; + struct m_snd_tag *tag; +}; + VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */ #define V_lagg_list VNET(lagg_list) VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx); @@ -113,6 +119,7 @@ static void lagg_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner); #define V_lagg_cloner VNET(lagg_cloner) static const char laggname[] = "lagg"; +static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface"); static void lagg_capabilities(struct lagg_softc *); static int lagg_port_create(struct lagg_softc *, struct ifnet *); @@ -131,10 +138,17 @@ static void lagg_port2req(struct lagg_port *, struct lagg_reqport *); static void lagg_init(void *); static void lagg_stop(struct lagg_softc *); static int lagg_ioctl(struct ifnet *, u_long, caddr_t); -#ifdef RATELIMIT +#if defined(KERN_TLS) || defined(RATELIMIT) static int lagg_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); +static int lagg_snd_tag_modify(struct m_snd_tag *, + union if_snd_tag_modify_params *); +static int lagg_snd_tag_query(struct m_snd_tag *, + union if_snd_tag_query_params *); +static void lagg_snd_tag_free(struct m_snd_tag *); +static void lagg_ratelimit_query(struct ifnet *, + struct if_ratelimit_query_results *); #endif static int lagg_setmulti(struct lagg_port *); static int lagg_clrmulti(struct lagg_port *); @@ -264,6 +278,13 @@ SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN, &VNET_NAME(def_use_flowid), 0, "Default setting for using flow id for load sharing"); +/* Default value for using numa */ +VNET_DEFINE_STATIC(int, def_use_numa) = 1; +#define V_def_use_numa VNET(def_use_numa) +SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_numa, CTLFLAG_RWTUN, + &VNET_NAME(def_use_numa), 0, + "Use numa to steer flows"); + /* Default value for flowid shift */ VNET_DEFINE_STATIC(int, def_flowid_shift) = 16; #define V_def_flowid_shift VNET(def_flowid_shift) @@ -480,10 +501,10 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) struct ifnet *ifp; static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ - sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); + sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { - free(sc, M_DEVBUF); + free(sc, M_LAGG); return (ENOSPC); } LAGG_SX_INIT(sc); @@ -491,6 +512,8 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) LAGG_XLOCK(sc); if (V_def_use_flowid) sc->sc_opts |= LAGG_OPT_USE_FLOWID; + if (V_def_use_numa) + sc->sc_opts |= LAGG_OPT_USE_NUMA; sc->flowid_shift = V_def_flowid_shift; /* Hash all layers by default */ @@ -514,12 +537,14 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) ifp->if_ioctl = lagg_ioctl; ifp->if_get_counter = lagg_get_counter; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; -#ifdef RATELIMIT +#if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = lagg_snd_tag_alloc; - ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS | IFCAP_TXRTLMT; -#else - ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; + ifp->if_snd_tag_modify = lagg_snd_tag_modify; + ifp->if_snd_tag_query = lagg_snd_tag_query; + ifp->if_snd_tag_free = lagg_snd_tag_free; + ifp->if_ratelimit_query = lagg_ratelimit_query; #endif + ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; /* * Attach as an ordinary ethernet device, children will be attached @@ -572,7 +597,7 @@ lagg_clone_destroy(struct ifnet *ifp) LAGG_LIST_UNLOCK(); LAGG_SX_DESTROY(sc); - free(sc, M_DEVBUF); + free(sc, M_LAGG); } static void @@ -686,7 +711,7 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) ifr.ifr_mtu = oldmtu; } - lp = malloc(sizeof(struct lagg_port), M_DEVBUF, M_WAITOK|M_ZERO); + lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK|M_ZERO); lp->lp_softc = sc; /* Check if port is a stacked lagg */ @@ -694,7 +719,7 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) { if (ifp == sc_ptr->sc_ifp) { LAGG_LIST_UNLOCK(); - free(lp, M_DEVBUF); + free(lp, M_LAGG); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); @@ -705,7 +730,7 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) if (lagg_port_checkstacking(sc_ptr) >= LAGG_MAX_STACKING) { LAGG_LIST_UNLOCK(); - free(lp, M_DEVBUF); + free(lp, M_LAGG); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); @@ -753,7 +778,6 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) * is predictable and `ifconfig laggN create ...` command * will lead to the same result each time. */ - LAGG_RLOCK(); CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) { if (tlp->lp_ifp->if_index < ifp->if_index && ( CK_SLIST_NEXT(tlp, lp_entries) == NULL || @@ -761,7 +785,6 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) ifp->if_index)) break; } - LAGG_RUNLOCK(); if (tlp != NULL) CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries); else @@ -816,7 +839,7 @@ lagg_port_destroy_cb(epoch_context_t ec) ifp = lp->lp_ifp; if_rele(ifp); - free(lp, M_DEVBUF); + free(lp, M_LAGG); } static int @@ -1250,6 +1273,8 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) switch (ro->ro_opts) { case LAGG_OPT_USE_FLOWID: case -LAGG_OPT_USE_FLOWID: + case LAGG_OPT_USE_NUMA: + case -LAGG_OPT_USE_NUMA: case LAGG_OPT_FLOWIDSHIFT: valid = 1; lacp = 0; @@ -1528,49 +1553,142 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) return (error); } -#ifdef RATELIMIT -static int -lagg_snd_tag_alloc(struct ifnet *ifp, - union if_snd_tag_alloc_params *params, - struct m_snd_tag **ppmt) +#if defined(KERN_TLS) || defined(RATELIMIT) +static inline struct lagg_snd_tag * +mst_to_lst(struct m_snd_tag *mst) { - struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + + return (__containerof(mst, struct lagg_snd_tag, com)); +} + +/* + * Look up the port used by a specific flow. This only works for lagg + * protocols with deterministic port mappings (e.g. not roundrobin). + * In addition protocols which use a hash to map flows to ports must + * be configured to use the mbuf flowid rather than hashing packet + * contents. + */ +static struct lagg_port * +lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype) +{ + struct lagg_softc *sc; struct lagg_port *lp; struct lagg_lb *lb; uint32_t p; + sc = ifp->if_softc; + switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: - lp = lagg_link_active(sc, sc->sc_primary); - break; + return (lagg_link_active(sc, sc->sc_primary)); case LAGG_PROTO_LOADBALANCE: if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || - params->hdr.flowtype == M_HASHTYPE_NONE) - return (EOPNOTSUPP); - p = params->hdr.flowid >> sc->flowid_shift; + flowtype == M_HASHTYPE_NONE) + return (NULL); + p = flowid >> sc->flowid_shift; p %= sc->sc_count; lb = (struct lagg_lb *)sc->sc_psc; lp = lb->lb_ports[p]; - lp = lagg_link_active(sc, lp); - break; + return (lagg_link_active(sc, lp)); case LAGG_PROTO_LACP: if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || - params->hdr.flowtype == M_HASHTYPE_NONE) - return (EOPNOTSUPP); - lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid); - break; + flowtype == M_HASHTYPE_NONE) + return (NULL); + return (lacp_select_tx_port_by_hash(sc, flowid)); default: - return (EOPNOTSUPP); + return (NULL); } - if (lp == NULL) +} + +static int +lagg_snd_tag_alloc(struct ifnet *ifp, + union if_snd_tag_alloc_params *params, + struct m_snd_tag **ppmt) +{ + struct lagg_snd_tag *lst; + struct lagg_softc *sc; + struct lagg_port *lp; + struct ifnet *lp_ifp; + int error; + + sc = ifp->if_softc; + + LAGG_RLOCK(); + lp = lookup_snd_tag_port(ifp, params->hdr.flowid, params->hdr.flowtype); + if (lp == NULL) { + LAGG_RUNLOCK(); return (EOPNOTSUPP); - ifp = lp->lp_ifp; - if (ifp == NULL || ifp->if_snd_tag_alloc == NULL || - (ifp->if_capenable & IFCAP_TXRTLMT) == 0) + } + if (lp->lp_ifp == NULL || lp->lp_ifp->if_snd_tag_alloc == NULL) { + LAGG_RUNLOCK(); return (EOPNOTSUPP); + } + lp_ifp = lp->lp_ifp; + if_ref(lp_ifp); + LAGG_RUNLOCK(); + + lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT); + if (lst == NULL) { + if_rele(lp_ifp); + return (ENOMEM); + } + + error = lp_ifp->if_snd_tag_alloc(lp_ifp, params, &lst->tag); + if_rele(lp_ifp); + if (error) { + free(lst, M_LAGG); + return (error); + } + + m_snd_tag_init(&lst->com, ifp); - /* forward allocation request */ - return (ifp->if_snd_tag_alloc(ifp, params, ppmt)); + *ppmt = &lst->com; + return (0); +} + +static int +lagg_snd_tag_modify(struct m_snd_tag *mst, + union if_snd_tag_modify_params *params) +{ + struct lagg_snd_tag *lst; + + lst = mst_to_lst(mst); + return (lst->tag->ifp->if_snd_tag_modify(lst->tag, params)); +} + +static int +lagg_snd_tag_query(struct m_snd_tag *mst, + union if_snd_tag_query_params *params) +{ + struct lagg_snd_tag *lst; + + lst = mst_to_lst(mst); + return (lst->tag->ifp->if_snd_tag_query(lst->tag, params)); +} + +static void +lagg_snd_tag_free(struct m_snd_tag *mst) +{ + struct lagg_snd_tag *lst; + + lst = mst_to_lst(mst); + m_snd_tag_rele(lst->tag); + free(lst, M_LAGG); +} + +static void +lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q) +{ + /* + * For lagg, we have an indirect + * interface. The caller needs to + * get a ratelimit tag on the actual + * interface the flow will go on. + */ + q->rate_table = NULL; + q->flags = RT_IS_INDIRECT; + q->max_flows = 0; + q->number_of_rates = 0; } #endif @@ -1588,7 +1706,7 @@ lagg_setmulti(struct lagg_port *lp) CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; - mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT); + mc = malloc(sizeof(struct lagg_mc), M_LAGG, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(scifp); return (ENOMEM); @@ -1619,7 +1737,7 @@ lagg_clrmulti(struct lagg_port *lp) SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries); if (mc->mc_ifma && lp->lp_detaching == 0) if_delmulti_ifma(mc->mc_ifma); - free(mc, M_DEVBUF); + free(mc, M_LAGG); } return (0); } @@ -1696,6 +1814,10 @@ lagg_transmit(struct ifnet *ifp, struct mbuf *m) struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error; +#if defined(KERN_TLS) || defined(RATELIMIT) + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) + MPASS(m->m_pkthdr.snd_tag->ifp == ifp); +#endif LAGG_RLOCK(); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { @@ -1848,12 +1970,20 @@ struct lagg_port * lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_port *lp_next, *rval = NULL; - struct epoch_tracker net_et; /* * Search a port which reports an active link state. */ +#ifdef INVARIANTS + /* + * This is called with either LAGG_RLOCK() held or + * LAGG_XLOCK(sc) held. + */ + if (!in_epoch(net_epoch_preempt)) + LAGG_XLOCK_ASSERT(sc); +#endif + if (lp == NULL) goto search; if (LAGG_PORTACTIVE(lp)) { @@ -1866,15 +1996,12 @@ lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) goto found; } - search: - epoch_enter_preempt(net_epoch_preempt, &net_et); +search: CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp_next)) { - epoch_exit_preempt(net_epoch_preempt, &net_et); return (lp_next); } } - epoch_exit_preempt(net_epoch_preempt, &net_et); found: return (rval); } @@ -1883,6 +2010,21 @@ int lagg_enqueue(struct ifnet *ifp, struct mbuf *m) { +#if defined(KERN_TLS) || defined(RATELIMIT) + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { + struct lagg_snd_tag *lst; + struct m_snd_tag *mst; + + mst = m->m_pkthdr.snd_tag; + lst = mst_to_lst(mst); + if (lst->tag->ifp != ifp) { + m_freem(m); + return (EAGAIN); + } + m->m_pkthdr.snd_tag = m_snd_tag_ref(lst->tag); + m_snd_tag_rele(mst); + } +#endif return (ifp->if_transmit)(ifp, m); } @@ -1956,7 +2098,7 @@ lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m) struct lagg_port *lp, *last = NULL; struct mbuf *m0; - LAGG_RLOCK(); + LAGG_RLOCK_ASSERT(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (!LAGG_PORTACTIVE(lp)) continue; @@ -1977,7 +2119,6 @@ lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m) } last = lp; } - LAGG_RUNLOCK(); if (last == NULL) { m_freem(m); @@ -2063,7 +2204,7 @@ lagg_lb_attach(struct lagg_softc *sc) struct lagg_lb *lb; LAGG_XLOCK_ASSERT(sc); - lb = malloc(sizeof(struct lagg_lb), M_DEVBUF, M_WAITOK | M_ZERO); + lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO); lb->lb_key = m_ether_tcpip_hash_init(); sc->sc_psc = lb; @@ -2078,7 +2219,7 @@ lagg_lb_detach(struct lagg_softc *sc) lb = (struct lagg_lb *)sc->sc_psc; if (lb != NULL) - free(lb, M_DEVBUF); + free(lb, M_LAGG); } static int @@ -2090,7 +2231,7 @@ lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) rv = 0; bzero(&lb->lb_ports, sizeof(lb->lb_ports)); - LAGG_RLOCK(); + LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (lp_next == lp) continue; @@ -2103,7 +2244,6 @@ lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) sc->sc_ifname, lp_next->lp_ifp->if_xname, i); lb->lb_ports[i++] = lp_next; } - LAGG_RUNLOCK(); return (rv); } diff --git a/freebsd/sys/net/if_lagg.h b/freebsd/sys/net/if_lagg.h index f1e2d8f4..2c566c0d 100644 --- a/freebsd/sys/net/if_lagg.h +++ b/freebsd/sys/net/if_lagg.h @@ -143,6 +143,7 @@ struct lagg_reqopts { #define LAGG_OPT_USE_FLOWID 0x01 /* enable use of flowid */ /* Pseudo flags which are used in ro_opts but not stored into sc_opts. */ #define LAGG_OPT_FLOWIDSHIFT 0x02 /* set flowid shift */ +#define LAGG_OPT_USE_NUMA 0x04 /* enable use of numa */ #define LAGG_OPT_FLOWIDSHIFT_MASK 0x1f /* flowid is uint32_t */ #define LAGG_OPT_LACP_STRICT 0x10 /* LACP strict mode */ #define LAGG_OPT_LACP_TXTEST 0x20 /* LACP debug: txtest */ @@ -158,8 +159,9 @@ struct lagg_reqopts { #define SIOCGLAGGOPTS _IOWR('i', 152, struct lagg_reqopts) #define SIOCSLAGGOPTS _IOW('i', 153, struct lagg_reqopts) -#define LAGG_OPT_BITS "\020\001USE_FLOWID\005LACP_STRICT" \ - "\006LACP_TXTEST\007LACP_RXTEST" +#define LAGG_OPT_BITS "\020\001USE_FLOWID\003USE_NUMA" \ + "\005LACP_STRICT\006LACP_TXTEST" \ + "\007LACP_RXTEST" #ifdef _KERNEL diff --git a/freebsd/sys/net/if_llatbl.c b/freebsd/sys/net/if_llatbl.c index b220d7aa..e79b9ba9 100644 --- a/freebsd/sys/net/if_llatbl.c +++ b/freebsd/sys/net/if_llatbl.c @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/eventhandler.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/syslog.h> @@ -92,6 +93,7 @@ static int htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, static int lltable_dump_af(struct lltable *llt, struct sysctl_req *wr) { + struct epoch_tracker et; int error; LLTABLE_LIST_LOCK_ASSERT(); @@ -100,10 +102,10 @@ lltable_dump_af(struct lltable *llt, struct sysctl_req *wr) return (0); error = 0; - IF_AFDATA_RLOCK(llt->llt_ifp); + NET_EPOCH_ENTER(et); error = lltable_foreach_lle(llt, (llt_foreach_cb_t *)llt->llt_dump_entry, wr); - IF_AFDATA_RUNLOCK(llt->llt_ifp); + NET_EPOCH_EXIT(et); return (error); } @@ -455,11 +457,12 @@ struct llentry * llentry_alloc(struct ifnet *ifp, struct lltable *lt, struct sockaddr_storage *dst) { + struct epoch_tracker et; struct llentry *la, *la_tmp; - IF_AFDATA_RLOCK(ifp); + NET_EPOCH_ENTER(et); la = lla_lookup(lt, LLE_EXCLUSIVE, (struct sockaddr *)dst); - IF_AFDATA_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (la != NULL) { LLE_ADDREF(la); diff --git a/freebsd/sys/net/if_llatbl.h b/freebsd/sys/net/if_llatbl.h index 74301284..7bf57bdb 100644 --- a/freebsd/sys/net/if_llatbl.h +++ b/freebsd/sys/net/if_llatbl.h @@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$"); #ifndef _NET_IF_LLATBL_H_ #define _NET_IF_LLATBL_H_ +#include <sys/_eventhandler.h> #include <sys/_rwlock.h> #include <netinet/in.h> #include <sys/epoch.h> @@ -267,7 +268,6 @@ llentry_mark_used(struct llentry *lle) int lla_rt_output(struct rt_msghdr *, struct rt_addrinfo *); -#include <sys/eventhandler.h> enum { LLENTRY_RESOLVED, LLENTRY_TIMEDOUT, diff --git a/freebsd/sys/net/if_spppsubr.c b/freebsd/sys/net/if_spppsubr.c index f5b78dec..9aec7cd1 100644 --- a/freebsd/sys/net/if_spppsubr.c +++ b/freebsd/sys/net/if_spppsubr.c @@ -1062,15 +1062,13 @@ sppp_detach(struct ifnet *ifp) KASSERT(mtx_initialized(&sp->mtx), ("sppp mutex is not initialized")); /* Stop keepalive handler. */ - if (!callout_drain(&sp->keepalive_callout)) - callout_stop(&sp->keepalive_callout); + callout_drain(&sp->keepalive_callout); for (i = 0; i < IDX_COUNT; i++) { - if (!callout_drain(&sp->ch[i])) - callout_stop(&sp->ch[i]); + callout_drain(&sp->ch[i]); } - if (!callout_drain(&sp->pap_my_to_ch)) - callout_stop(&sp->pap_my_to_ch); + callout_drain(&sp->pap_my_to_ch); + mtx_destroy(&sp->pp_cpq.ifq_mtx); mtx_destroy(&sp->pp_fastq.ifq_mtx); mtx_destroy(&sp->mtx); @@ -4339,16 +4337,12 @@ sppp_chap_tld(struct sppp *sp) static void sppp_chap_scr(struct sppp *sp) { - u_long *ch, seed; + u_long *ch; u_char clen; /* Compute random challenge. */ ch = (u_long *)sp->myauth.challenge; - read_random(&seed, sizeof seed); - ch[0] = seed ^ random(); - ch[1] = seed ^ random(); - ch[2] = seed ^ random(); - ch[3] = seed ^ random(); + arc4random_buf(ch, 4 * sizeof(*ch)); clen = AUTHKEYLEN; sp->confid[IDX_CHAP] = ++sp->pp_seq[IDX_CHAP]; @@ -4809,7 +4803,7 @@ sppp_keepalive(void *dummy) sppp_cisco_send (sp, CISCO_KEEPALIVE_REQ, ++sp->pp_seq[IDX_LCP], sp->pp_rseq[IDX_LCP]); else if (sp->pp_phase >= PHASE_AUTHENTICATE) { - long nmagic = htonl (sp->lcp.magic); + uint32_t nmagic = htonl(sp->lcp.magic); sp->lcp.echoid = ++sp->pp_seq[IDX_LCP]; sppp_cp_send (sp, PPP_LCP, ECHO_REQ, sp->lcp.echoid, 4, &nmagic); diff --git a/freebsd/sys/net/if_stf.c b/freebsd/sys/net/if_stf.c index 3ba9f8c0..7185fb8d 100644 --- a/freebsd/sys/net/if_stf.c +++ b/freebsd/sys/net/if_stf.c @@ -730,6 +730,7 @@ stf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) } ifp->if_flags |= IFF_UP; + ifp->if_drv_flags |= IFF_DRV_RUNNING; break; case SIOCADDMULTI: diff --git a/freebsd/sys/net/if_tap.c b/freebsd/sys/net/if_tap.c deleted file mode 100644 index dbf3e599..00000000 --- a/freebsd/sys/net/if_tap.c +++ /dev/null @@ -1,1133 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * BASED ON: - * ------------------------------------------------------------------------- - * - * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> - * Nottingham University 1987. - */ - -/* - * $FreeBSD$ - * $Id: if_tap.c,v 0.21 2000/07/23 21:46:02 max Exp $ - */ - -#include <rtems/bsd/local/opt_inet.h> - -#include <sys/param.h> -#include <sys/conf.h> -#include <sys/fcntl.h> -#include <sys/filio.h> -#include <sys/jail.h> -#include <sys/kernel.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <sys/poll.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/selinfo.h> -#include <sys/signalvar.h> -#include <sys/socket.h> -#include <sys/sockio.h> -#include <sys/sysctl.h> -#include <sys/systm.h> -#include <sys/ttycom.h> -#include <sys/uio.h> -#include <sys/queue.h> - -#include <net/bpf.h> -#include <net/ethernet.h> -#include <net/if.h> -#include <net/if_var.h> -#include <net/if_clone.h> -#include <net/if_dl.h> -#include <net/if_media.h> -#include <net/if_types.h> -#include <net/route.h> -#include <net/vnet.h> - -#include <netinet/in.h> - -#include <net/if_tapvar.h> -#include <net/if_tap.h> - -#define CDEV_NAME "tap" -#define TAPDEBUG if (tapdebug) printf - -static const char tapname[] = "tap"; -static const char vmnetname[] = "vmnet"; -#define TAPMAXUNIT 0x7fff -#define VMNET_DEV_MASK CLONE_FLAG0 - -/* module */ -static int tapmodevent(module_t, int, void *); - -/* device */ -static void tapclone(void *, struct ucred *, char *, int, - struct cdev **); -static void tapcreate(struct cdev *); - -/* network interface */ -static void tapifstart(struct ifnet *); -static int tapifioctl(struct ifnet *, u_long, caddr_t); -static void tapifinit(void *); - -static int tap_clone_create(struct if_clone *, int, caddr_t); -static void tap_clone_destroy(struct ifnet *); -static struct if_clone *tap_cloner; -static int vmnet_clone_create(struct if_clone *, int, caddr_t); -static void vmnet_clone_destroy(struct ifnet *); -static struct if_clone *vmnet_cloner; - -/* character device */ -static d_open_t tapopen; -static d_close_t tapclose; -static d_read_t tapread; -static d_write_t tapwrite; -static d_ioctl_t tapioctl; -static d_poll_t tappoll; -static d_kqfilter_t tapkqfilter; - -/* kqueue(2) */ -static int tapkqread(struct knote *, long); -static int tapkqwrite(struct knote *, long); -static void tapkqdetach(struct knote *); - -static struct filterops tap_read_filterops = { - .f_isfd = 1, - .f_attach = NULL, - .f_detach = tapkqdetach, - .f_event = tapkqread, -}; - -static struct filterops tap_write_filterops = { - .f_isfd = 1, - .f_attach = NULL, - .f_detach = tapkqdetach, - .f_event = tapkqwrite, -}; - -static struct cdevsw tap_cdevsw = { - .d_version = D_VERSION, - .d_flags = D_NEEDMINOR, - .d_open = tapopen, - .d_close = tapclose, - .d_read = tapread, - .d_write = tapwrite, - .d_ioctl = tapioctl, - .d_poll = tappoll, - .d_name = CDEV_NAME, - .d_kqfilter = tapkqfilter, -}; - -/* - * All global variables in if_tap.c are locked with tapmtx, with the - * exception of tapdebug, which is accessed unlocked; tapclones is - * static at runtime. - */ -static struct mtx tapmtx; -static int tapdebug = 0; /* debug flag */ -static int tapuopen = 0; /* allow user open() */ -static int tapuponopen = 0; /* IFF_UP on open() */ -static int tapdclone = 1; /* enable devfs cloning */ -static SLIST_HEAD(, tap_softc) taphead; /* first device */ -static struct clonedevs *tapclones; - -MALLOC_DECLARE(M_TAP); -MALLOC_DEFINE(M_TAP, CDEV_NAME, "Ethernet tunnel interface"); -SYSCTL_INT(_debug, OID_AUTO, if_tap_debug, CTLFLAG_RW, &tapdebug, 0, ""); - -SYSCTL_DECL(_net_link); -static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0, - "Ethernet tunnel software network interface"); -SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tapuopen, 0, - "Allow user to open /dev/tap (based on node permissions)"); -SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, - "Bring interface up when /dev/tap is opened"); -SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, - "Enable legacy devfs interface creation"); -SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tapdebug, 0, ""); - -DEV_MODULE(if_tap, tapmodevent, NULL); - -static int -tap_clone_create(struct if_clone *ifc, int unit, caddr_t params) -{ - struct cdev *dev; - int i; - - /* Find any existing device, or allocate new unit number. */ - i = clone_create(&tapclones, &tap_cdevsw, &unit, &dev, 0); - if (i) { - dev = make_dev(&tap_cdevsw, unit, UID_ROOT, GID_WHEEL, 0600, - "%s%d", tapname, unit); - } - - tapcreate(dev); - return (0); -} - -/* vmnet devices are tap devices in disguise */ -static int -vmnet_clone_create(struct if_clone *ifc, int unit, caddr_t params) -{ - struct cdev *dev; - int i; - - /* Find any existing device, or allocate new unit number. */ - i = clone_create(&tapclones, &tap_cdevsw, &unit, &dev, VMNET_DEV_MASK); - if (i) { - dev = make_dev(&tap_cdevsw, unit | VMNET_DEV_MASK, UID_ROOT, - GID_WHEEL, 0600, "%s%d", vmnetname, unit); - } - - tapcreate(dev); - return (0); -} - -static void -tap_destroy(struct tap_softc *tp) -{ - struct ifnet *ifp = tp->tap_ifp; - - CURVNET_SET(ifp->if_vnet); - destroy_dev(tp->tap_dev); - seldrain(&tp->tap_rsel); - knlist_clear(&tp->tap_rsel.si_note, 0); - knlist_destroy(&tp->tap_rsel.si_note); - ether_ifdetach(ifp); - if_free(ifp); - - mtx_destroy(&tp->tap_mtx); - free(tp, M_TAP); - CURVNET_RESTORE(); -} - -static void -tap_clone_destroy(struct ifnet *ifp) -{ - struct tap_softc *tp = ifp->if_softc; - - mtx_lock(&tapmtx); - SLIST_REMOVE(&taphead, tp, tap_softc, tap_next); - mtx_unlock(&tapmtx); - tap_destroy(tp); -} - -/* vmnet devices are tap devices in disguise */ -static void -vmnet_clone_destroy(struct ifnet *ifp) -{ - tap_clone_destroy(ifp); -} - -/* - * tapmodevent - * - * module event handler - */ -static int -tapmodevent(module_t mod, int type, void *data) -{ - static eventhandler_tag eh_tag = NULL; - struct tap_softc *tp = NULL; - struct ifnet *ifp = NULL; - - switch (type) { - case MOD_LOAD: - - /* intitialize device */ - - mtx_init(&tapmtx, "tapmtx", NULL, MTX_DEF); - SLIST_INIT(&taphead); - - clone_setup(&tapclones); - eh_tag = EVENTHANDLER_REGISTER(dev_clone, tapclone, 0, 1000); - if (eh_tag == NULL) { - clone_cleanup(&tapclones); - mtx_destroy(&tapmtx); - return (ENOMEM); - } - tap_cloner = if_clone_simple(tapname, tap_clone_create, - tap_clone_destroy, 0); - vmnet_cloner = if_clone_simple(vmnetname, vmnet_clone_create, - vmnet_clone_destroy, 0); - return (0); - - case MOD_UNLOAD: - /* - * The EBUSY algorithm here can't quite atomically - * guarantee that this is race-free since we have to - * release the tap mtx to deregister the clone handler. - */ - mtx_lock(&tapmtx); - SLIST_FOREACH(tp, &taphead, tap_next) { - mtx_lock(&tp->tap_mtx); - if (tp->tap_flags & TAP_OPEN) { - mtx_unlock(&tp->tap_mtx); - mtx_unlock(&tapmtx); - return (EBUSY); - } - mtx_unlock(&tp->tap_mtx); - } - mtx_unlock(&tapmtx); - - EVENTHANDLER_DEREGISTER(dev_clone, eh_tag); - if_clone_detach(tap_cloner); - if_clone_detach(vmnet_cloner); - drain_dev_clone_events(); - - mtx_lock(&tapmtx); - while ((tp = SLIST_FIRST(&taphead)) != NULL) { - SLIST_REMOVE_HEAD(&taphead, tap_next); - mtx_unlock(&tapmtx); - - ifp = tp->tap_ifp; - - TAPDEBUG("detaching %s\n", ifp->if_xname); - - tap_destroy(tp); - mtx_lock(&tapmtx); - } - mtx_unlock(&tapmtx); - clone_cleanup(&tapclones); - - mtx_destroy(&tapmtx); - - break; - - default: - return (EOPNOTSUPP); - } - - return (0); -} /* tapmodevent */ - - -/* - * DEVFS handler - * - * We need to support two kind of devices - tap and vmnet - */ -static void -tapclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) -{ - char devname[SPECNAMELEN + 1]; - int i, unit, append_unit; - int extra; - - if (*dev != NULL) - return; - - if (!tapdclone || - (!tapuopen && priv_check_cred(cred, PRIV_NET_IFCREATE) != 0)) - return; - - unit = 0; - append_unit = 0; - extra = 0; - - /* We're interested in only tap/vmnet devices. */ - if (strcmp(name, tapname) == 0) { - unit = -1; - } else if (strcmp(name, vmnetname) == 0) { - unit = -1; - extra = VMNET_DEV_MASK; - } else if (dev_stdclone(name, NULL, tapname, &unit) != 1) { - if (dev_stdclone(name, NULL, vmnetname, &unit) != 1) { - return; - } else { - extra = VMNET_DEV_MASK; - } - } - - if (unit == -1) - append_unit = 1; - - CURVNET_SET(CRED_TO_VNET(cred)); - /* find any existing device, or allocate new unit number */ - i = clone_create(&tapclones, &tap_cdevsw, &unit, dev, extra); - if (i) { - if (append_unit) { - /* - * We were passed 'tun' or 'tap', with no unit specified - * so we'll need to append it now. - */ - namelen = snprintf(devname, sizeof(devname), "%s%d", name, - unit); - name = devname; - } - - *dev = make_dev_credf(MAKEDEV_REF, &tap_cdevsw, unit | extra, - cred, UID_ROOT, GID_WHEEL, 0600, "%s", name); - } - - if_clone_create(name, namelen, NULL); - CURVNET_RESTORE(); -} /* tapclone */ - - -/* - * tapcreate - * - * to create interface - */ -static void -tapcreate(struct cdev *dev) -{ - struct ifnet *ifp = NULL; - struct tap_softc *tp = NULL; - unsigned short macaddr_hi; - uint32_t macaddr_mid; - int unit; - const char *name = NULL; - u_char eaddr[6]; - - /* allocate driver storage and create device */ - tp = malloc(sizeof(*tp), M_TAP, M_WAITOK | M_ZERO); - mtx_init(&tp->tap_mtx, "tap_mtx", NULL, MTX_DEF); - mtx_lock(&tapmtx); - SLIST_INSERT_HEAD(&taphead, tp, tap_next); - mtx_unlock(&tapmtx); - - unit = dev2unit(dev); - - /* select device: tap or vmnet */ - if (unit & VMNET_DEV_MASK) { - name = vmnetname; - tp->tap_flags |= TAP_VMNET; - } else - name = tapname; - - unit &= TAPMAXUNIT; - - TAPDEBUG("tapcreate(%s%d). minor = %#x\n", name, unit, dev2unit(dev)); - - /* generate fake MAC address: 00 bd xx xx xx unit_no */ - macaddr_hi = htons(0x00bd); - macaddr_mid = (uint32_t) ticks; - bcopy(&macaddr_hi, eaddr, sizeof(short)); - bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t)); - eaddr[5] = (u_char)unit; - - /* fill the rest and attach interface */ - ifp = tp->tap_ifp = if_alloc(IFT_ETHER); - if (ifp == NULL) - panic("%s%d: can not if_alloc()", name, unit); - ifp->if_softc = tp; - if_initname(ifp, name, unit); - ifp->if_init = tapifinit; - ifp->if_start = tapifstart; - ifp->if_ioctl = tapifioctl; - ifp->if_mtu = ETHERMTU; - ifp->if_flags = (IFF_BROADCAST|IFF_SIMPLEX|IFF_MULTICAST); - IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); - ifp->if_capabilities |= IFCAP_LINKSTATE; - ifp->if_capenable |= IFCAP_LINKSTATE; - - dev->si_drv1 = tp; - tp->tap_dev = dev; - - ether_ifattach(ifp, eaddr); - - mtx_lock(&tp->tap_mtx); - tp->tap_flags |= TAP_INITED; - mtx_unlock(&tp->tap_mtx); - - knlist_init_mtx(&tp->tap_rsel.si_note, &tp->tap_mtx); - - TAPDEBUG("interface %s is created. minor = %#x\n", - ifp->if_xname, dev2unit(dev)); -} /* tapcreate */ - - -/* - * tapopen - * - * to open tunnel. must be superuser - */ -static int -tapopen(struct cdev *dev, int flag, int mode, struct thread *td) -{ - struct tap_softc *tp = NULL; - struct ifnet *ifp = NULL; - int error; - - if (tapuopen == 0) { - error = priv_check(td, PRIV_NET_TAP); - if (error) - return (error); - } - - if ((dev2unit(dev) & CLONE_UNITMASK) > TAPMAXUNIT) - return (ENXIO); - - tp = dev->si_drv1; - - mtx_lock(&tp->tap_mtx); - if (tp->tap_flags & TAP_OPEN) { - mtx_unlock(&tp->tap_mtx); - return (EBUSY); - } - - bcopy(IF_LLADDR(tp->tap_ifp), tp->ether_addr, sizeof(tp->ether_addr)); -#ifndef __rtems__ - tp->tap_pid = td->td_proc->p_pid; -#else /* __rtems__ */ - tp->tap_pid = BSD_DEFAULT_PID; -#endif /* __rtems__ */ - tp->tap_flags |= TAP_OPEN; - ifp = tp->tap_ifp; - - ifp->if_drv_flags |= IFF_DRV_RUNNING; - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - if (tapuponopen) - ifp->if_flags |= IFF_UP; - if_link_state_change(ifp, LINK_STATE_UP); - mtx_unlock(&tp->tap_mtx); - - TAPDEBUG("%s is open. minor = %#x\n", ifp->if_xname, dev2unit(dev)); - - return (0); -} /* tapopen */ - - -/* - * tapclose - * - * close the device - mark i/f down & delete routing info - */ -static int -tapclose(struct cdev *dev, int foo, int bar, struct thread *td) -{ - struct ifaddr *ifa; - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - - /* junk all pending output */ - mtx_lock(&tp->tap_mtx); - CURVNET_SET(ifp->if_vnet); - IF_DRAIN(&ifp->if_snd); - - /* - * Do not bring the interface down, and do not anything with - * interface, if we are in VMnet mode. Just close the device. - */ - if (((tp->tap_flags & TAP_VMNET) == 0) && - (ifp->if_flags & (IFF_UP | IFF_LINK0)) == IFF_UP) { - mtx_unlock(&tp->tap_mtx); - if_down(ifp); - mtx_lock(&tp->tap_mtx); - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - mtx_unlock(&tp->tap_mtx); - CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - rtinit(ifa, (int)RTM_DELETE, 0); - } - if_purgeaddrs(ifp); - mtx_lock(&tp->tap_mtx); - } - } - - if_link_state_change(ifp, LINK_STATE_DOWN); - CURVNET_RESTORE(); - - funsetown(&tp->tap_sigio); - selwakeuppri(&tp->tap_rsel, PZERO+1); - KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); - - tp->tap_flags &= ~TAP_OPEN; - tp->tap_pid = 0; - mtx_unlock(&tp->tap_mtx); - - TAPDEBUG("%s is closed. minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - - return (0); -} /* tapclose */ - - -/* - * tapifinit - * - * network interface initialization function - */ -static void -tapifinit(void *xtp) -{ - struct tap_softc *tp = (struct tap_softc *)xtp; - struct ifnet *ifp = tp->tap_ifp; - - TAPDEBUG("initializing %s\n", ifp->if_xname); - - mtx_lock(&tp->tap_mtx); - ifp->if_drv_flags |= IFF_DRV_RUNNING; - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - mtx_unlock(&tp->tap_mtx); - - /* attempt to start output */ - tapifstart(ifp); -} /* tapifinit */ - - -/* - * tapifioctl - * - * Process an ioctl request on network interface - */ -static int -tapifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) -{ - struct tap_softc *tp = ifp->if_softc; - struct ifreq *ifr = (struct ifreq *)data; - struct ifstat *ifs = NULL; - struct ifmediareq *ifmr = NULL; - int dummy, error = 0; - - switch (cmd) { - case SIOCSIFFLAGS: /* XXX -- just like vmnet does */ - case SIOCADDMULTI: - case SIOCDELMULTI: - break; - - case SIOCGIFMEDIA: - ifmr = (struct ifmediareq *)data; - dummy = ifmr->ifm_count; - ifmr->ifm_count = 1; - ifmr->ifm_status = IFM_AVALID; - ifmr->ifm_active = IFM_ETHER; - if (tp->tap_flags & TAP_OPEN) - ifmr->ifm_status |= IFM_ACTIVE; - ifmr->ifm_current = ifmr->ifm_active; - if (dummy >= 1) { - int media = IFM_ETHER; - error = copyout(&media, ifmr->ifm_ulist, - sizeof(int)); - } - break; - - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; - break; - - case SIOCGIFSTATUS: - ifs = (struct ifstat *)data; - mtx_lock(&tp->tap_mtx); - if (tp->tap_pid != 0) - snprintf(ifs->ascii, sizeof(ifs->ascii), - "\tOpened by PID %d\n", tp->tap_pid); - else - ifs->ascii[0] = '\0'; - mtx_unlock(&tp->tap_mtx); - break; - - default: - error = ether_ioctl(ifp, cmd, data); - break; - } - - return (error); -} /* tapifioctl */ - - -/* - * tapifstart - * - * queue packets from higher level ready to put out - */ -static void -tapifstart(struct ifnet *ifp) -{ - struct tap_softc *tp = ifp->if_softc; - - TAPDEBUG("%s starting\n", ifp->if_xname); - - /* - * do not junk pending output if we are in VMnet mode. - * XXX: can this do any harm because of queue overflow? - */ - - mtx_lock(&tp->tap_mtx); - if (((tp->tap_flags & TAP_VMNET) == 0) && - ((tp->tap_flags & TAP_READY) != TAP_READY)) { - struct mbuf *m; - - /* Unlocked read. */ - TAPDEBUG("%s not ready, tap_flags = 0x%x\n", ifp->if_xname, - tp->tap_flags); - - for (;;) { - IF_DEQUEUE(&ifp->if_snd, m); - if (m != NULL) { - m_freem(m); - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - } else - break; - } - mtx_unlock(&tp->tap_mtx); - - return; - } - - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - - if (!IFQ_IS_EMPTY(&ifp->if_snd)) { - if (tp->tap_flags & TAP_RWAIT) { - tp->tap_flags &= ~TAP_RWAIT; - wakeup(tp); - } - - if ((tp->tap_flags & TAP_ASYNC) && (tp->tap_sigio != NULL)) { - mtx_unlock(&tp->tap_mtx); - pgsigio(&tp->tap_sigio, SIGIO, 0); - mtx_lock(&tp->tap_mtx); - } - - selwakeuppri(&tp->tap_rsel, PZERO+1); - KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); - if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ - } - - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - mtx_unlock(&tp->tap_mtx); -} /* tapifstart */ - - -/* - * tapioctl - * - * the cdevsw interface is now pretty minimal - */ -static int -tapioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) -{ - struct ifreq ifr; - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - struct tapinfo *tapp = NULL; - int f; - int error; -#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD4) - int ival; -#endif - - switch (cmd) { - case TAPSIFINFO: - tapp = (struct tapinfo *)data; - if (ifp->if_type != tapp->type) - return (EPROTOTYPE); - mtx_lock(&tp->tap_mtx); - if (ifp->if_mtu != tapp->mtu) { - strlcpy(ifr.ifr_name, if_name(ifp), IFNAMSIZ); - ifr.ifr_mtu = tapp->mtu; - CURVNET_SET(ifp->if_vnet); - error = ifhwioctl(SIOCSIFMTU, ifp, - (caddr_t)&ifr, td); - CURVNET_RESTORE(); - if (error) { - mtx_unlock(&tp->tap_mtx); - return (error); - } - } - ifp->if_baudrate = tapp->baudrate; - mtx_unlock(&tp->tap_mtx); - break; - - case TAPGIFINFO: - tapp = (struct tapinfo *)data; - mtx_lock(&tp->tap_mtx); - tapp->mtu = ifp->if_mtu; - tapp->type = ifp->if_type; - tapp->baudrate = ifp->if_baudrate; - mtx_unlock(&tp->tap_mtx); - break; - - case TAPSDEBUG: - tapdebug = *(int *)data; - break; - - case TAPGDEBUG: - *(int *)data = tapdebug; - break; - - case TAPGIFNAME: { - struct ifreq *ifr = (struct ifreq *) data; - - strlcpy(ifr->ifr_name, ifp->if_xname, IFNAMSIZ); - } break; - - case FIONBIO: - break; - - case FIOASYNC: - mtx_lock(&tp->tap_mtx); - if (*(int *)data) - tp->tap_flags |= TAP_ASYNC; - else - tp->tap_flags &= ~TAP_ASYNC; - mtx_unlock(&tp->tap_mtx); - break; - - case FIONREAD: - if (!IFQ_IS_EMPTY(&ifp->if_snd)) { - struct mbuf *mb; - - IFQ_LOCK(&ifp->if_snd); - IFQ_POLL_NOLOCK(&ifp->if_snd, mb); - for (*(int *)data = 0; mb != NULL; - mb = mb->m_next) - *(int *)data += mb->m_len; - IFQ_UNLOCK(&ifp->if_snd); - } else - *(int *)data = 0; - break; - - case FIOSETOWN: - return (fsetown(*(int *)data, &tp->tap_sigio)); - - case FIOGETOWN: - *(int *)data = fgetown(&tp->tap_sigio); - return (0); - - /* this is deprecated, FIOSETOWN should be used instead */ - case TIOCSPGRP: - return (fsetown(-(*(int *)data), &tp->tap_sigio)); - - /* this is deprecated, FIOGETOWN should be used instead */ - case TIOCGPGRP: - *(int *)data = -fgetown(&tp->tap_sigio); - return (0); - - /* VMware/VMnet port ioctl's */ - -#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ - defined(COMPAT_FREEBSD4) - case _IO('V', 0): - ival = IOCPARM_IVAL(data); - data = (caddr_t)&ival; - /* FALLTHROUGH */ -#endif - case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ - f = *(int *)data; - f &= 0x0fff; - f &= ~IFF_CANTCHANGE; - f |= IFF_UP; - - mtx_lock(&tp->tap_mtx); - ifp->if_flags = f | (ifp->if_flags & IFF_CANTCHANGE); - mtx_unlock(&tp->tap_mtx); - break; - - case SIOCGIFADDR: /* get MAC address of the remote side */ - mtx_lock(&tp->tap_mtx); - bcopy(tp->ether_addr, data, sizeof(tp->ether_addr)); - mtx_unlock(&tp->tap_mtx); - break; - - case SIOCSIFADDR: /* set MAC address of the remote side */ - mtx_lock(&tp->tap_mtx); - bcopy(data, tp->ether_addr, sizeof(tp->ether_addr)); - mtx_unlock(&tp->tap_mtx); - break; - - default: - return (ENOTTY); - } - return (0); -} /* tapioctl */ - - -/* - * tapread - * - * the cdevsw read interface - reads a packet at a time, or at - * least as much of a packet as can be read - */ -static int -tapread(struct cdev *dev, struct uio *uio, int flag) -{ - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - struct mbuf *m = NULL; - int error = 0, len; - - TAPDEBUG("%s reading, minor = %#x\n", ifp->if_xname, dev2unit(dev)); - - mtx_lock(&tp->tap_mtx); - if ((tp->tap_flags & TAP_READY) != TAP_READY) { - mtx_unlock(&tp->tap_mtx); - - /* Unlocked read. */ - TAPDEBUG("%s not ready. minor = %#x, tap_flags = 0x%x\n", - ifp->if_xname, dev2unit(dev), tp->tap_flags); - - return (EHOSTDOWN); - } - - tp->tap_flags &= ~TAP_RWAIT; - - /* sleep until we get a packet */ - do { - IF_DEQUEUE(&ifp->if_snd, m); - - if (m == NULL) { - if (flag & O_NONBLOCK) { - mtx_unlock(&tp->tap_mtx); - return (EWOULDBLOCK); - } - - tp->tap_flags |= TAP_RWAIT; - error = mtx_sleep(tp, &tp->tap_mtx, PCATCH | (PZERO + 1), - "taprd", 0); - if (error) { - mtx_unlock(&tp->tap_mtx); - return (error); - } - } - } while (m == NULL); - mtx_unlock(&tp->tap_mtx); - - /* feed packet to bpf */ - BPF_MTAP(ifp, m); - - /* xfer packet to user space */ - while ((m != NULL) && (uio->uio_resid > 0) && (error == 0)) { - len = min(uio->uio_resid, m->m_len); - if (len == 0) - break; - - error = uiomove(mtod(m, void *), len, uio); - m = m_free(m); - } - - if (m != NULL) { - TAPDEBUG("%s dropping mbuf, minor = %#x\n", ifp->if_xname, - dev2unit(dev)); - m_freem(m); - } - - return (error); -} /* tapread */ - - -/* - * tapwrite - * - * the cdevsw write interface - an atomic write is a packet - or else! - */ -static int -tapwrite(struct cdev *dev, struct uio *uio, int flag) -{ - struct ether_header *eh; - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - struct mbuf *m; - - TAPDEBUG("%s writing, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - - if (uio->uio_resid == 0) - return (0); - - if ((uio->uio_resid < 0) || (uio->uio_resid > TAPMRU)) { - TAPDEBUG("%s invalid packet len = %zd, minor = %#x\n", - ifp->if_xname, uio->uio_resid, dev2unit(dev)); - - return (EIO); - } - - if ((m = m_uiotombuf(uio, M_NOWAIT, 0, ETHER_ALIGN, - M_PKTHDR)) == NULL) { - if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); - return (ENOBUFS); - } - - m->m_pkthdr.rcvif = ifp; - - /* - * Only pass a unicast frame to ether_input(), if it would actually - * have been received by non-virtual hardware. - */ - if (m->m_len < sizeof(struct ether_header)) { - m_freem(m); - return (0); - } - eh = mtod(m, struct ether_header *); - - if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && - !ETHER_IS_MULTICAST(eh->ether_dhost) && - bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { - m_freem(m); - return (0); - } - - /* Pass packet up to parent. */ - CURVNET_SET(ifp->if_vnet); - (*ifp->if_input)(ifp, m); - CURVNET_RESTORE(); - if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); /* ibytes are counted in parent */ - - return (0); -} /* tapwrite */ - - -/* - * tappoll - * - * the poll interface, this is only useful on reads - * really. the write detect always returns true, write never blocks - * anyway, it either accepts the packet or drops it - */ -static int -tappoll(struct cdev *dev, int events, struct thread *td) -{ - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - int revents = 0; - - TAPDEBUG("%s polling, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - - if (events & (POLLIN | POLLRDNORM)) { - IFQ_LOCK(&ifp->if_snd); - if (!IFQ_IS_EMPTY(&ifp->if_snd)) { - TAPDEBUG("%s have data in queue. len = %d, " \ - "minor = %#x\n", ifp->if_xname, - ifp->if_snd.ifq_len, dev2unit(dev)); - - revents |= (events & (POLLIN | POLLRDNORM)); - } else { - TAPDEBUG("%s waiting for data, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - - selrecord(td, &tp->tap_rsel); - } - IFQ_UNLOCK(&ifp->if_snd); - } - - if (events & (POLLOUT | POLLWRNORM)) - revents |= (events & (POLLOUT | POLLWRNORM)); - - return (revents); -} /* tappoll */ - - -/* - * tap_kqfilter - * - * support for kevent() system call - */ -static int -tapkqfilter(struct cdev *dev, struct knote *kn) -{ - struct tap_softc *tp = dev->si_drv1; - struct ifnet *ifp = tp->tap_ifp; - - switch (kn->kn_filter) { - case EVFILT_READ: - TAPDEBUG("%s kqfilter: EVFILT_READ, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - kn->kn_fop = &tap_read_filterops; - break; - - case EVFILT_WRITE: - TAPDEBUG("%s kqfilter: EVFILT_WRITE, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - kn->kn_fop = &tap_write_filterops; - break; - - default: - TAPDEBUG("%s kqfilter: invalid filter, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - return (EINVAL); - /* NOT REACHED */ - } - - kn->kn_hook = tp; - knlist_add(&tp->tap_rsel.si_note, kn, 0); - - return (0); -} /* tapkqfilter */ - - -/* - * tap_kqread - * - * Return true if there is data in the interface queue - */ -static int -tapkqread(struct knote *kn, long hint) -{ - int ret; - struct tap_softc *tp = kn->kn_hook; - struct cdev *dev = tp->tap_dev; - struct ifnet *ifp = tp->tap_ifp; - - if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { - TAPDEBUG("%s have data in queue. len = %d, minor = %#x\n", - ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); - ret = 1; - } else { - TAPDEBUG("%s waiting for data, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - ret = 0; - } - - return (ret); -} /* tapkqread */ - - -/* - * tap_kqwrite - * - * Always can write. Return the MTU in kn->data - */ -static int -tapkqwrite(struct knote *kn, long hint) -{ - struct tap_softc *tp = kn->kn_hook; - struct ifnet *ifp = tp->tap_ifp; - - kn->kn_data = ifp->if_mtu; - - return (1); -} /* tapkqwrite */ - - -static void -tapkqdetach(struct knote *kn) -{ - struct tap_softc *tp = kn->kn_hook; - - knlist_remove(&tp->tap_rsel.si_note, kn, 0); -} /* tapkqdetach */ - diff --git a/freebsd/sys/net/if_tap.h b/freebsd/sys/net/if_tap.h index 34f44b38..9718cee4 100644 --- a/freebsd/sys/net/if_tap.h +++ b/freebsd/sys/net/if_tap.h @@ -40,24 +40,22 @@ #ifndef _NET_IF_TAP_H_ #define _NET_IF_TAP_H_ -/* refer to if_tapvar.h for the softc stuff */ +#include <net/if_tun.h> /* maximum receive packet size (hard limit) */ #define TAPMRU 16384 -struct tapinfo { - int baudrate; /* linespeed */ - short mtu; /* maximum transmission unit */ - u_char type; /* ethernet, tokenring, etc. */ - u_char dummy; /* place holder */ -}; +#define tapinfo tuninfo -/* ioctl's for get/set debug */ -#define TAPSDEBUG _IOW('t', 90, int) -#define TAPGDEBUG _IOR('t', 89, int) -#define TAPSIFINFO _IOW('t', 91, struct tapinfo) -#define TAPGIFINFO _IOR('t', 92, struct tapinfo) -#define TAPGIFNAME _IOR('t', 93, struct ifreq) +/* + * ioctl's for get/set debug; these are aliases of TUN* ioctls, see net/if_tun.h + * for details. + */ +#define TAPSDEBUG TUNSDEBUG +#define TAPGDEBUG TUNGDEBUG +#define TAPSIFINFO TUNSIFINFO +#define TAPGIFINFO TUNGIFINFO +#define TAPGIFNAME TUNGIFNAME /* VMware ioctl's */ #define VMIO_SIOCSIFFLAGS _IOWINT('V', 0) diff --git a/freebsd/sys/net/if_tapvar.h b/freebsd/sys/net/if_tapvar.h deleted file mode 100644 index f5cf9f3e..00000000 --- a/freebsd/sys/net/if_tapvar.h +++ /dev/null @@ -1,71 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * BASED ON: - * ------------------------------------------------------------------------- - * - * Copyright (c) 1998 Brian Somers <brian@Awfulhak.org> - * All rights reserved. - * - * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> - * Nottingham University 1987. - */ - -/* - * $FreeBSD$ - * $Id: if_tapvar.h,v 0.6 2000/07/11 02:16:08 max Exp $ - */ - -#ifndef _NET_IF_TAPVAR_H_ -#define _NET_IF_TAPVAR_H_ - -/* - * tap_mtx locks tap_flags, tap_pid. tap_next locked with global tapmtx. - * Other fields locked by owning subsystems. - */ -struct tap_softc { - struct ifnet *tap_ifp; - u_short tap_flags; /* misc flags */ -#define TAP_OPEN (1 << 0) -#define TAP_INITED (1 << 1) -#define TAP_RWAIT (1 << 2) -#define TAP_ASYNC (1 << 3) -#define TAP_READY (TAP_OPEN|TAP_INITED) -#define TAP_VMNET (1 << 4) - - u_int8_t ether_addr[ETHER_ADDR_LEN]; /* ether addr of the remote side */ - - pid_t tap_pid; /* PID of process to open */ - struct sigio *tap_sigio; /* information for async I/O */ - struct selinfo tap_rsel; /* read select */ - - SLIST_ENTRY(tap_softc) tap_next; /* next device in chain */ - struct cdev *tap_dev; - struct mtx tap_mtx; /* per-softc mutex */ -}; - -#endif /* !_NET_IF_TAPVAR_H_ */ diff --git a/freebsd/sys/net/if_tun.c b/freebsd/sys/net/if_tun.c deleted file mode 100644 index 44441773..00000000 --- a/freebsd/sys/net/if_tun.c +++ /dev/null @@ -1,1055 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ - -/*- - * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> - * Nottingham University 1987. - * - * This source may be freely distributed, however I would be interested - * in any changes that are made. - * - * This driver takes packets off the IP i/f and hands them up to a - * user process to have its wicked way with. This driver has it's - * roots in a similar driver written by Phil Cockcroft (formerly) at - * UCL. This driver is based much more on read/write/poll mode of - * operation though. - * - * $FreeBSD$ - */ - -#include <rtems/bsd/local/opt_inet.h> -#include <rtems/bsd/local/opt_inet6.h> - -#include <sys/param.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/systm.h> -#include <sys/jail.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <sys/socket.h> -#include <sys/fcntl.h> -#include <sys/filio.h> -#include <sys/sockio.h> -#include <sys/ttycom.h> -#include <sys/poll.h> -#include <sys/selinfo.h> -#include <sys/signalvar.h> -#include <sys/filedesc.h> -#include <sys/kernel.h> -#include <sys/sysctl.h> -#include <sys/conf.h> -#include <sys/uio.h> -#include <sys/malloc.h> -#include <sys/random.h> - -#include <net/if.h> -#include <net/if_var.h> -#include <net/if_clone.h> -#include <net/if_types.h> -#include <net/netisr.h> -#include <net/route.h> -#include <net/vnet.h> -#ifdef INET -#include <netinet/in.h> -#endif -#include <net/bpf.h> -#include <net/if_tun.h> - -#include <sys/queue.h> -#include <sys/condvar.h> - -#include <security/mac/mac_framework.h> - -/* - * tun_list is protected by global tunmtx. Other mutable fields are - * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is - * static for the duration of a tunnel interface. - */ -struct tun_softc { - TAILQ_ENTRY(tun_softc) tun_list; - struct cdev *tun_dev; - u_short tun_flags; /* misc flags */ -#define TUN_OPEN 0x0001 -#define TUN_INITED 0x0002 -#define TUN_RCOLL 0x0004 -#define TUN_IASET 0x0008 -#define TUN_DSTADDR 0x0010 -#define TUN_LMODE 0x0020 -#define TUN_RWAIT 0x0040 -#define TUN_ASYNC 0x0080 -#define TUN_IFHEAD 0x0100 - -#define TUN_READY (TUN_OPEN | TUN_INITED) - - /* - * XXXRW: tun_pid is used to exclusively lock /dev/tun. Is this - * actually needed? Can we just return EBUSY if already open? - * Problem is that this involved inherent races when a tun device - * is handed off from one process to another, as opposed to just - * being slightly stale informationally. - */ - pid_t tun_pid; /* owning pid */ - struct ifnet *tun_ifp; /* the interface */ - struct sigio *tun_sigio; /* information for async I/O */ - struct selinfo tun_rsel; /* read select */ - struct mtx tun_mtx; /* protect mutable softc fields */ - struct cv tun_cv; /* protect against ref'd dev destroy */ -}; -#define TUN2IFP(sc) ((sc)->tun_ifp) - -#define TUNDEBUG if (tundebug) if_printf - -/* - * All mutable global variables in if_tun are locked using tunmtx, with - * the exception of tundebug, which is used unlocked, and tunclones, - * which is static after setup. - */ -static struct mtx tunmtx; -static const char tunname[] = "tun"; -static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); -static int tundebug = 0; -static int tundclone = 1; -static struct clonedevs *tunclones; -static TAILQ_HEAD(,tun_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); -SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); - -SYSCTL_DECL(_net_link); -static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, - "IP tunnel software network interface."); -SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, - "Enable legacy devfs interface creation."); - -static void tunclone(void *arg, struct ucred *cred, char *name, - int namelen, struct cdev **dev); -static void tuncreate(const char *name, struct cdev *dev); -static int tunifioctl(struct ifnet *, u_long, caddr_t); -static void tuninit(struct ifnet *); -static int tunmodevent(module_t, int, void *); -static int tunoutput(struct ifnet *, struct mbuf *, - const struct sockaddr *, struct route *ro); -static void tunstart(struct ifnet *); - -static int tun_clone_create(struct if_clone *, int, caddr_t); -static void tun_clone_destroy(struct ifnet *); -static struct if_clone *tun_cloner; - -static d_open_t tunopen; -static d_close_t tunclose; -static d_read_t tunread; -static d_write_t tunwrite; -static d_ioctl_t tunioctl; -static d_poll_t tunpoll; -static d_kqfilter_t tunkqfilter; - -static int tunkqread(struct knote *, long); -static int tunkqwrite(struct knote *, long); -static void tunkqdetach(struct knote *); - -static struct filterops tun_read_filterops = { - .f_isfd = 1, - .f_attach = NULL, - .f_detach = tunkqdetach, - .f_event = tunkqread, -}; - -static struct filterops tun_write_filterops = { - .f_isfd = 1, - .f_attach = NULL, - .f_detach = tunkqdetach, - .f_event = tunkqwrite, -}; - -static struct cdevsw tun_cdevsw = { - .d_version = D_VERSION, - .d_flags = D_NEEDMINOR, - .d_open = tunopen, - .d_close = tunclose, - .d_read = tunread, - .d_write = tunwrite, - .d_ioctl = tunioctl, - .d_poll = tunpoll, - .d_kqfilter = tunkqfilter, - .d_name = tunname, -}; - -static int -tun_clone_create(struct if_clone *ifc, int unit, caddr_t params) -{ - struct cdev *dev; - int i; - - /* find any existing device, or allocate new unit number */ - i = clone_create(&tunclones, &tun_cdevsw, &unit, &dev, 0); - if (i) { - /* No preexisting struct cdev *, create one */ - dev = make_dev(&tun_cdevsw, unit, - UID_UUCP, GID_DIALER, 0600, "%s%d", tunname, unit); - } - tuncreate(tunname, dev); - - return (0); -} - -static void -tunclone(void *arg, struct ucred *cred, char *name, int namelen, - struct cdev **dev) -{ - char devname[SPECNAMELEN + 1]; - int u, i, append_unit; - - if (*dev != NULL) - return; - - /* - * If tun cloning is enabled, only the superuser can create an - * interface. - */ - if (!tundclone || priv_check_cred(cred, PRIV_NET_IFCREATE) != 0) - return; - - if (strcmp(name, tunname) == 0) { - u = -1; - } else if (dev_stdclone(name, NULL, tunname, &u) != 1) - return; /* Don't recognise the name */ - if (u != -1 && u > IF_MAXUNIT) - return; /* Unit number too high */ - - if (u == -1) - append_unit = 1; - else - append_unit = 0; - - CURVNET_SET(CRED_TO_VNET(cred)); - /* find any existing device, or allocate new unit number */ - i = clone_create(&tunclones, &tun_cdevsw, &u, dev, 0); - if (i) { - if (append_unit) { - namelen = snprintf(devname, sizeof(devname), "%s%d", - name, u); - name = devname; - } - /* No preexisting struct cdev *, create one */ - *dev = make_dev_credf(MAKEDEV_REF, &tun_cdevsw, u, cred, - UID_UUCP, GID_DIALER, 0600, "%s", name); - } - - if_clone_create(name, namelen, NULL); - CURVNET_RESTORE(); -} - -static void -tun_destroy(struct tun_softc *tp) -{ - struct cdev *dev; - - mtx_lock(&tp->tun_mtx); - if ((tp->tun_flags & TUN_OPEN) != 0) - cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); - else - mtx_unlock(&tp->tun_mtx); - - CURVNET_SET(TUN2IFP(tp)->if_vnet); - dev = tp->tun_dev; - bpfdetach(TUN2IFP(tp)); - if_detach(TUN2IFP(tp)); - if_free(TUN2IFP(tp)); - destroy_dev(dev); - seldrain(&tp->tun_rsel); - knlist_clear(&tp->tun_rsel.si_note, 0); - knlist_destroy(&tp->tun_rsel.si_note); - mtx_destroy(&tp->tun_mtx); - cv_destroy(&tp->tun_cv); - free(tp, M_TUN); - CURVNET_RESTORE(); -} - -static void -tun_clone_destroy(struct ifnet *ifp) -{ - struct tun_softc *tp = ifp->if_softc; - - mtx_lock(&tunmtx); - TAILQ_REMOVE(&tunhead, tp, tun_list); - mtx_unlock(&tunmtx); - tun_destroy(tp); -} - -static int -tunmodevent(module_t mod, int type, void *data) -{ - static eventhandler_tag tag; - struct tun_softc *tp; - - switch (type) { - case MOD_LOAD: - mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); - clone_setup(&tunclones); - tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); - if (tag == NULL) - return (ENOMEM); - tun_cloner = if_clone_simple(tunname, tun_clone_create, - tun_clone_destroy, 0); - break; - case MOD_UNLOAD: - if_clone_detach(tun_cloner); - EVENTHANDLER_DEREGISTER(dev_clone, tag); - drain_dev_clone_events(); - - mtx_lock(&tunmtx); - while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { - TAILQ_REMOVE(&tunhead, tp, tun_list); - mtx_unlock(&tunmtx); - tun_destroy(tp); - mtx_lock(&tunmtx); - } - mtx_unlock(&tunmtx); - clone_cleanup(&tunclones); - mtx_destroy(&tunmtx); - break; - default: - return EOPNOTSUPP; - } - return 0; -} - -static moduledata_t tun_mod = { - "if_tun", - tunmodevent, - 0 -}; - -DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); -MODULE_VERSION(if_tun, 1); - -static void -tunstart(struct ifnet *ifp) -{ - struct tun_softc *tp = ifp->if_softc; - struct mbuf *m; - - TUNDEBUG(ifp,"%s starting\n", ifp->if_xname); - if (ALTQ_IS_ENABLED(&ifp->if_snd)) { - IFQ_LOCK(&ifp->if_snd); - IFQ_POLL_NOLOCK(&ifp->if_snd, m); - if (m == NULL) { - IFQ_UNLOCK(&ifp->if_snd); - return; - } - IFQ_UNLOCK(&ifp->if_snd); - } - - mtx_lock(&tp->tun_mtx); - if (tp->tun_flags & TUN_RWAIT) { - tp->tun_flags &= ~TUN_RWAIT; - wakeup(tp); - } - selwakeuppri(&tp->tun_rsel, PZERO + 1); - KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); - if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { - mtx_unlock(&tp->tun_mtx); - pgsigio(&tp->tun_sigio, SIGIO, 0); - } else - mtx_unlock(&tp->tun_mtx); -} - -/* XXX: should return an error code so it can fail. */ -static void -tuncreate(const char *name, struct cdev *dev) -{ - struct tun_softc *sc; - struct ifnet *ifp; - - sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO); - mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF); - cv_init(&sc->tun_cv, "tun_condvar"); - sc->tun_flags = TUN_INITED; - sc->tun_dev = dev; - mtx_lock(&tunmtx); - TAILQ_INSERT_TAIL(&tunhead, sc, tun_list); - mtx_unlock(&tunmtx); - - ifp = sc->tun_ifp = if_alloc(IFT_PPP); - if (ifp == NULL) - panic("%s%d: failed to if_alloc() interface.\n", - name, dev2unit(dev)); - if_initname(ifp, name, dev2unit(dev)); - ifp->if_mtu = TUNMTU; - ifp->if_ioctl = tunifioctl; - ifp->if_output = tunoutput; - ifp->if_start = tunstart; - ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; - ifp->if_softc = sc; - IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); - ifp->if_snd.ifq_drv_maxlen = 0; - IFQ_SET_READY(&ifp->if_snd); - knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx); - ifp->if_capabilities |= IFCAP_LINKSTATE; - ifp->if_capenable |= IFCAP_LINKSTATE; - - if_attach(ifp); - bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); - dev->si_drv1 = sc; - TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); -} - -static int -tunopen(struct cdev *dev, int flag, int mode, struct thread *td) -{ - struct ifnet *ifp; - struct tun_softc *tp; - - /* - * XXXRW: Non-atomic test and set of dev->si_drv1 requires - * synchronization. - */ - tp = dev->si_drv1; - if (!tp) { - tuncreate(tunname, dev); - tp = dev->si_drv1; - } - - /* - * XXXRW: This use of tun_pid is subject to error due to the - * fact that a reference to the tunnel can live beyond the - * death of the process that created it. Can we replace this - * with a simple busy flag? - */ - mtx_lock(&tp->tun_mtx); -#ifndef __rtems__ - if (tp->tun_pid != 0 && tp->tun_pid != td->td_proc->p_pid) { -#else /* __rtems__ */ - if (tp->tun_pid != 0 && tp->tun_pid != BSD_DEFAULT_PID) { -#endif /* __rtems__ */ - mtx_unlock(&tp->tun_mtx); - return (EBUSY); - } -#ifndef __rtems__ - tp->tun_pid = td->td_proc->p_pid; -#else /* __rtems__ */ - tp->tun_pid = BSD_DEFAULT_PID; -#endif /* __rtems__ */ - - tp->tun_flags |= TUN_OPEN; - ifp = TUN2IFP(tp); - if_link_state_change(ifp, LINK_STATE_UP); - TUNDEBUG(ifp, "open\n"); - mtx_unlock(&tp->tun_mtx); - - return (0); -} - -/* - * tunclose - close the device - mark i/f down & delete - * routing info - */ -static int -tunclose(struct cdev *dev, int foo, int bar, struct thread *td) -{ - struct tun_softc *tp; - struct ifnet *ifp; - - tp = dev->si_drv1; - ifp = TUN2IFP(tp); - - mtx_lock(&tp->tun_mtx); - tp->tun_flags &= ~TUN_OPEN; - tp->tun_pid = 0; - - /* - * junk all pending output - */ - CURVNET_SET(ifp->if_vnet); - IFQ_PURGE(&ifp->if_snd); - - if (ifp->if_flags & IFF_UP) { - mtx_unlock(&tp->tun_mtx); - if_down(ifp); - mtx_lock(&tp->tun_mtx); - } - - /* Delete all addresses and routes which reference this interface. */ - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - struct ifaddr *ifa; - - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - mtx_unlock(&tp->tun_mtx); - CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - /* deal w/IPv4 PtP destination; unlocked read */ - if (ifa->ifa_addr->sa_family == AF_INET) { - rtinit(ifa, (int)RTM_DELETE, - tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0); - } else { - rtinit(ifa, (int)RTM_DELETE, 0); - } - } - if_purgeaddrs(ifp); - mtx_lock(&tp->tun_mtx); - } - if_link_state_change(ifp, LINK_STATE_DOWN); - CURVNET_RESTORE(); - - funsetown(&tp->tun_sigio); - selwakeuppri(&tp->tun_rsel, PZERO + 1); - KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); - TUNDEBUG (ifp, "closed\n"); - - cv_broadcast(&tp->tun_cv); - mtx_unlock(&tp->tun_mtx); - return (0); -} - -static void -tuninit(struct ifnet *ifp) -{ - struct tun_softc *tp = ifp->if_softc; -#ifdef INET - struct ifaddr *ifa; -#endif - - TUNDEBUG(ifp, "tuninit\n"); - - mtx_lock(&tp->tun_mtx); - ifp->if_flags |= IFF_UP; - ifp->if_drv_flags |= IFF_DRV_RUNNING; - getmicrotime(&ifp->if_lastchange); - -#ifdef INET - if_addr_rlock(ifp); - CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family == AF_INET) { - struct sockaddr_in *si; - - si = (struct sockaddr_in *)ifa->ifa_addr; - if (si->sin_addr.s_addr) - tp->tun_flags |= TUN_IASET; - - si = (struct sockaddr_in *)ifa->ifa_dstaddr; - if (si && si->sin_addr.s_addr) - tp->tun_flags |= TUN_DSTADDR; - } - } - if_addr_runlock(ifp); -#endif - mtx_unlock(&tp->tun_mtx); -} - -/* - * Process an ioctl request. - */ -static int -tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) -{ - struct ifreq *ifr = (struct ifreq *)data; - struct tun_softc *tp = ifp->if_softc; - struct ifstat *ifs; - int error = 0; - - switch(cmd) { - case SIOCGIFSTATUS: - ifs = (struct ifstat *)data; - mtx_lock(&tp->tun_mtx); - if (tp->tun_pid) - snprintf(ifs->ascii, sizeof(ifs->ascii), - "\tOpened by PID %d\n", tp->tun_pid); - else - ifs->ascii[0] = '\0'; - mtx_unlock(&tp->tun_mtx); - break; - case SIOCSIFADDR: - tuninit(ifp); - TUNDEBUG(ifp, "address set\n"); - break; - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; - TUNDEBUG(ifp, "mtu set\n"); - break; - case SIOCSIFFLAGS: - case SIOCADDMULTI: - case SIOCDELMULTI: - break; - default: - error = EINVAL; - } - return (error); -} - -/* - * tunoutput - queue packets from higher level ready to put out. - */ -static int -tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, - struct route *ro) -{ - struct tun_softc *tp = ifp->if_softc; - u_short cached_tun_flags; - int error; - u_int32_t af; - - TUNDEBUG (ifp, "tunoutput\n"); - -#ifdef MAC - error = mac_ifnet_check_transmit(ifp, m0); - if (error) { - m_freem(m0); - return (error); - } -#endif - - /* Could be unlocked read? */ - mtx_lock(&tp->tun_mtx); - cached_tun_flags = tp->tun_flags; - mtx_unlock(&tp->tun_mtx); - if ((cached_tun_flags & TUN_READY) != TUN_READY) { - TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); - m_freem (m0); - return (EHOSTDOWN); - } - - if ((ifp->if_flags & IFF_UP) != IFF_UP) { - m_freem (m0); - return (EHOSTDOWN); - } - - /* BPF writes need to be handled specially. */ - if (dst->sa_family == AF_UNSPEC) - bcopy(dst->sa_data, &af, sizeof(af)); - else - af = dst->sa_family; - - if (bpf_peers_present(ifp->if_bpf)) - bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); - - /* prepend sockaddr? this may abort if the mbuf allocation fails */ - if (cached_tun_flags & TUN_LMODE) { - /* allocate space for sockaddr */ - M_PREPEND(m0, dst->sa_len, M_NOWAIT); - - /* if allocation failed drop packet */ - if (m0 == NULL) { - if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - return (ENOBUFS); - } else { - bcopy(dst, m0->m_data, dst->sa_len); - } - } - - if (cached_tun_flags & TUN_IFHEAD) { - /* Prepend the address family */ - M_PREPEND(m0, 4, M_NOWAIT); - - /* if allocation failed drop packet */ - if (m0 == NULL) { - if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); - if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - return (ENOBUFS); - } else - *(u_int32_t *)m0->m_data = htonl(af); - } else { -#ifdef INET - if (af != AF_INET) -#endif - { - m_freem(m0); - return (EAFNOSUPPORT); - } - } - - error = (ifp->if_transmit)(ifp, m0); - if (error) - return (ENOBUFS); - if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); - return (0); -} - -/* - * the cdevsw interface is now pretty minimal. - */ -static int -tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, - struct thread *td) -{ - struct ifreq ifr; - struct tun_softc *tp = dev->si_drv1; - struct tuninfo *tunp; - int error; - - switch (cmd) { - case TUNSIFINFO: - tunp = (struct tuninfo *)data; - if (TUN2IFP(tp)->if_type != tunp->type) - return (EPROTOTYPE); - mtx_lock(&tp->tun_mtx); - if (TUN2IFP(tp)->if_mtu != tunp->mtu) { - strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); - ifr.ifr_mtu = tunp->mtu; - CURVNET_SET(TUN2IFP(tp)->if_vnet); - error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), - (caddr_t)&ifr, td); - CURVNET_RESTORE(); - if (error) { - mtx_unlock(&tp->tun_mtx); - return (error); - } - } - TUN2IFP(tp)->if_baudrate = tunp->baudrate; - mtx_unlock(&tp->tun_mtx); - break; - case TUNGIFINFO: - tunp = (struct tuninfo *)data; - mtx_lock(&tp->tun_mtx); - tunp->mtu = TUN2IFP(tp)->if_mtu; - tunp->type = TUN2IFP(tp)->if_type; - tunp->baudrate = TUN2IFP(tp)->if_baudrate; - mtx_unlock(&tp->tun_mtx); - break; - case TUNSDEBUG: - tundebug = *(int *)data; - break; - case TUNGDEBUG: - *(int *)data = tundebug; - break; - case TUNSLMODE: - mtx_lock(&tp->tun_mtx); - if (*(int *)data) { - tp->tun_flags |= TUN_LMODE; - tp->tun_flags &= ~TUN_IFHEAD; - } else - tp->tun_flags &= ~TUN_LMODE; - mtx_unlock(&tp->tun_mtx); - break; - case TUNSIFHEAD: - mtx_lock(&tp->tun_mtx); - if (*(int *)data) { - tp->tun_flags |= TUN_IFHEAD; - tp->tun_flags &= ~TUN_LMODE; - } else - tp->tun_flags &= ~TUN_IFHEAD; - mtx_unlock(&tp->tun_mtx); - break; - case TUNGIFHEAD: - mtx_lock(&tp->tun_mtx); - *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0; - mtx_unlock(&tp->tun_mtx); - break; - case TUNSIFMODE: - /* deny this if UP */ - if (TUN2IFP(tp)->if_flags & IFF_UP) - return(EBUSY); - - switch (*(int *)data & ~IFF_MULTICAST) { - case IFF_POINTOPOINT: - case IFF_BROADCAST: - mtx_lock(&tp->tun_mtx); - TUN2IFP(tp)->if_flags &= - ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); - TUN2IFP(tp)->if_flags |= *(int *)data; - mtx_unlock(&tp->tun_mtx); - break; - default: - return(EINVAL); - } - break; - case TUNSIFPID: - mtx_lock(&tp->tun_mtx); -#ifndef __rtems__ - tp->tun_pid = curthread->td_proc->p_pid; -#else /* __rtems__ */ - tp->tun_pid = BSD_DEFAULT_PID; -#endif /* __rtems__ */ - mtx_unlock(&tp->tun_mtx); - break; - case FIONBIO: - break; - case FIOASYNC: - mtx_lock(&tp->tun_mtx); - if (*(int *)data) - tp->tun_flags |= TUN_ASYNC; - else - tp->tun_flags &= ~TUN_ASYNC; - mtx_unlock(&tp->tun_mtx); - break; - case FIONREAD: - if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { - struct mbuf *mb; - IFQ_LOCK(&TUN2IFP(tp)->if_snd); - IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); - for (*(int *)data = 0; mb != NULL; mb = mb->m_next) - *(int *)data += mb->m_len; - IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); - } else - *(int *)data = 0; - break; - case FIOSETOWN: - return (fsetown(*(int *)data, &tp->tun_sigio)); - - case FIOGETOWN: - *(int *)data = fgetown(&tp->tun_sigio); - return (0); - - /* This is deprecated, FIOSETOWN should be used instead. */ - case TIOCSPGRP: - return (fsetown(-(*(int *)data), &tp->tun_sigio)); - - /* This is deprecated, FIOGETOWN should be used instead. */ - case TIOCGPGRP: - *(int *)data = -fgetown(&tp->tun_sigio); - return (0); - - default: - return (ENOTTY); - } - return (0); -} - -/* - * The cdevsw read interface - reads a packet at a time, or at - * least as much of a packet as can be read. - */ -static int -tunread(struct cdev *dev, struct uio *uio, int flag) -{ - struct tun_softc *tp = dev->si_drv1; - struct ifnet *ifp = TUN2IFP(tp); - struct mbuf *m; - int error=0, len; - - TUNDEBUG (ifp, "read\n"); - mtx_lock(&tp->tun_mtx); - if ((tp->tun_flags & TUN_READY) != TUN_READY) { - mtx_unlock(&tp->tun_mtx); - TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); - return (EHOSTDOWN); - } - - tp->tun_flags &= ~TUN_RWAIT; - - do { - IFQ_DEQUEUE(&ifp->if_snd, m); - if (m == NULL) { - if (flag & O_NONBLOCK) { - mtx_unlock(&tp->tun_mtx); - return (EWOULDBLOCK); - } - tp->tun_flags |= TUN_RWAIT; - error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), - "tunread", 0); - if (error != 0) { - mtx_unlock(&tp->tun_mtx); - return (error); - } - } - } while (m == NULL); - mtx_unlock(&tp->tun_mtx); - - while (m && uio->uio_resid > 0 && error == 0) { - len = min(uio->uio_resid, m->m_len); - if (len != 0) - error = uiomove(mtod(m, void *), len, uio); - m = m_free(m); - } - - if (m) { - TUNDEBUG(ifp, "Dropping mbuf\n"); - m_freem(m); - } - return (error); -} - -/* - * the cdevsw write interface - an atomic write is a packet - or else! - */ -static int -tunwrite(struct cdev *dev, struct uio *uio, int flag) -{ - struct tun_softc *tp = dev->si_drv1; - struct ifnet *ifp = TUN2IFP(tp); - struct mbuf *m; - uint32_t family, mru; - int isr; - - TUNDEBUG(ifp, "tunwrite\n"); - - if ((ifp->if_flags & IFF_UP) != IFF_UP) - /* ignore silently */ - return (0); - - if (uio->uio_resid == 0) - return (0); - - mru = TUNMRU; - if (tp->tun_flags & TUN_IFHEAD) - mru += sizeof(family); - if (uio->uio_resid < 0 || uio->uio_resid > mru) { - TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); - return (EIO); - } - - if ((m = m_uiotombuf(uio, M_NOWAIT, 0, 0, M_PKTHDR)) == NULL) { - if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); - return (ENOBUFS); - } - - m->m_pkthdr.rcvif = ifp; -#ifdef MAC - mac_ifnet_create_mbuf(ifp, m); -#endif - - /* Could be unlocked read? */ - mtx_lock(&tp->tun_mtx); - if (tp->tun_flags & TUN_IFHEAD) { - mtx_unlock(&tp->tun_mtx); - if (m->m_len < sizeof(family) && - (m = m_pullup(m, sizeof(family))) == NULL) - return (ENOBUFS); - family = ntohl(*mtod(m, u_int32_t *)); - m_adj(m, sizeof(family)); - } else { - mtx_unlock(&tp->tun_mtx); - family = AF_INET; - } - - BPF_MTAP2(ifp, &family, sizeof(family), m); - - switch (family) { -#ifdef INET - case AF_INET: - isr = NETISR_IP; - break; -#endif -#ifdef INET6 - case AF_INET6: - isr = NETISR_IPV6; - break; -#endif - default: - m_freem(m); - return (EAFNOSUPPORT); - } - random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); - if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); - if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); - CURVNET_SET(ifp->if_vnet); - M_SETFIB(m, ifp->if_fib); - netisr_dispatch(isr, m); - CURVNET_RESTORE(); - return (0); -} - -/* - * tunpoll - the poll interface, this is only useful on reads - * really. The write detect always returns true, write never blocks - * anyway, it either accepts the packet or drops it. - */ -static int -tunpoll(struct cdev *dev, int events, struct thread *td) -{ - struct tun_softc *tp = dev->si_drv1; - struct ifnet *ifp = TUN2IFP(tp); - int revents = 0; - struct mbuf *m; - - TUNDEBUG(ifp, "tunpoll\n"); - - if (events & (POLLIN | POLLRDNORM)) { - IFQ_LOCK(&ifp->if_snd); - IFQ_POLL_NOLOCK(&ifp->if_snd, m); - if (m != NULL) { - TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); - revents |= events & (POLLIN | POLLRDNORM); - } else { - TUNDEBUG(ifp, "tunpoll waiting\n"); - selrecord(td, &tp->tun_rsel); - } - IFQ_UNLOCK(&ifp->if_snd); - } - if (events & (POLLOUT | POLLWRNORM)) - revents |= events & (POLLOUT | POLLWRNORM); - - return (revents); -} - -/* - * tunkqfilter - support for the kevent() system call. - */ -static int -tunkqfilter(struct cdev *dev, struct knote *kn) -{ - struct tun_softc *tp = dev->si_drv1; - struct ifnet *ifp = TUN2IFP(tp); - - switch(kn->kn_filter) { - case EVFILT_READ: - TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - kn->kn_fop = &tun_read_filterops; - break; - - case EVFILT_WRITE: - TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - kn->kn_fop = &tun_write_filterops; - break; - - default: - TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", - ifp->if_xname, dev2unit(dev)); - return(EINVAL); - } - - kn->kn_hook = tp; - knlist_add(&tp->tun_rsel.si_note, kn, 0); - - return (0); -} - -/* - * Return true of there is data in the interface queue. - */ -static int -tunkqread(struct knote *kn, long hint) -{ - int ret; - struct tun_softc *tp = kn->kn_hook; - struct cdev *dev = tp->tun_dev; - struct ifnet *ifp = TUN2IFP(tp); - - if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { - TUNDEBUG(ifp, - "%s have data in the queue. Len = %d, minor = %#x\n", - ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); - ret = 1; - } else { - TUNDEBUG(ifp, - "%s waiting for data, minor = %#x\n", ifp->if_xname, - dev2unit(dev)); - ret = 0; - } - - return (ret); -} - -/* - * Always can write, always return MTU in kn->data. - */ -static int -tunkqwrite(struct knote *kn, long hint) -{ - struct tun_softc *tp = kn->kn_hook; - struct ifnet *ifp = TUN2IFP(tp); - - kn->kn_data = ifp->if_mtu; - - return (1); -} - -static void -tunkqdetach(struct knote *kn) -{ - struct tun_softc *tp = kn->kn_hook; - - knlist_remove(&tp->tun_rsel.si_note, kn, 0); -} diff --git a/freebsd/sys/net/if_tun.h b/freebsd/sys/net/if_tun.h index 1ea375f7..a44c87bd 100644 --- a/freebsd/sys/net/if_tun.h +++ b/freebsd/sys/net/if_tun.h @@ -40,6 +40,7 @@ struct tuninfo { #define TUNSIFINFO _IOW('t', 91, struct tuninfo) #define TUNGIFINFO _IOR('t', 92, struct tuninfo) #define TUNSLMODE _IOW('t', 93, int) +#define TUNGIFNAME _IOR('t', 93, struct ifreq) #define TUNSIFMODE _IOW('t', 94, int) #define TUNSIFPID _IO('t', 95) #define TUNSIFHEAD _IOW('t', 96, int) diff --git a/freebsd/sys/net/if_tuntap.c b/freebsd/sys/net/if_tuntap.c new file mode 100644 index 00000000..3516d82b --- /dev/null +++ b/freebsd/sys/net/if_tuntap.c @@ -0,0 +1,1734 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com> + * All rights reserved. + * Copyright (c) 2019 Kyle Evans <kevans@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * BASED ON: + * ------------------------------------------------------------------------- + * + * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk> + * Nottingham University 1987. + * + * This source may be freely distributed, however I would be interested + * in any changes that are made. + * + * This driver takes packets off the IP i/f and hands them up to a + * user process to have its wicked way with. This driver has it's + * roots in a similar driver written by Phil Cockcroft (formerly) at + * UCL. This driver is based much more on read/write/poll mode of + * operation though. + * + * $FreeBSD$ + */ + +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/jail.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/socket.h> +#include <sys/eventhandler.h> +#include <sys/fcntl.h> +#include <sys/filio.h> +#include <sys/sockio.h> +#include <sys/sx.h> +#include <sys/ttycom.h> +#include <sys/poll.h> +#include <sys/selinfo.h> +#include <sys/signalvar.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/malloc.h> +#include <sys/random.h> +#include <sys/ctype.h> + +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_clone.h> +#include <net/if_dl.h> +#include <net/if_media.h> +#include <net/if_types.h> +#include <net/netisr.h> +#include <net/route.h> +#include <net/vnet.h> +#ifdef INET +#include <netinet/in.h> +#endif +#include <net/bpf.h> +#include <net/if_tap.h> +#include <net/if_tun.h> + +#include <sys/queue.h> +#include <sys/condvar.h> +#include <security/mac/mac_framework.h> + +struct tuntap_driver; + +/* + * tun_list is protected by global tunmtx. Other mutable fields are + * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is + * static for the duration of a tunnel interface. + */ +struct tuntap_softc { + TAILQ_ENTRY(tuntap_softc) tun_list; + struct cdev *tun_dev; + u_short tun_flags; /* misc flags */ +#define TUN_OPEN 0x0001 +#define TUN_INITED 0x0002 +#define TUN_IASET 0x0008 +#define TUN_DSTADDR 0x0010 +#define TUN_LMODE 0x0020 +#define TUN_RWAIT 0x0040 +#define TUN_ASYNC 0x0080 +#define TUN_IFHEAD 0x0100 +#define TUN_DYING 0x0200 +#define TUN_L2 0x0400 +#define TUN_VMNET 0x0800 + +#define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET) +#define TUN_READY (TUN_OPEN | TUN_INITED) + +#ifndef __rtems__ + pid_t tun_pid; /* owning pid */ +#endif /* __rtems__ */ + struct ifnet *tun_ifp; /* the interface */ + struct sigio *tun_sigio; /* async I/O info */ + struct tuntap_driver *tun_drv; /* appropriate driver */ + struct selinfo tun_rsel; /* read select */ + struct mtx tun_mtx; /* softc field mutex */ + struct cv tun_cv; /* for ref'd dev destroy */ + struct ether_addr tun_ether; /* remote address */ +}; +#define TUN2IFP(sc) ((sc)->tun_ifp) + +#define TUNDEBUG if (tundebug) if_printf + +#define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx) +#define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx) + +#define TUN_VMIO_FLAG_MASK 0x0fff + +/* + * All mutable global variables in if_tun are locked using tunmtx, with + * the exception of tundebug, which is used unlocked, and the drivers' *clones, + * which are static after setup. + */ +static struct mtx tunmtx; +static eventhandler_tag tag; +static const char tunname[] = "tun"; +static const char tapname[] = "tap"; +static const char vmnetname[] = "vmnet"; +static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); +static int tundebug = 0; +static int tundclone = 1; +static int tap_allow_uopen = 0; /* allow user open() */ +static int tapuponopen = 0; /* IFF_UP on open() */ +static int tapdclone = 1; /* enable devfs cloning */ + +static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); +SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); + +static struct sx tun_ioctl_sx; +SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); + +SYSCTL_DECL(_net_link); +/* tun */ +static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, + "IP tunnel software network interface"); +SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, + "Enable legacy devfs interface creation"); + +/* tap */ +static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0, + "Ethernet tunnel software network interface"); +SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0, + "Allow user to open /dev/tap (based on node permissions)"); +SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, + "Bring interface up when /dev/tap is opened"); +SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, + "Enable legacy devfs interface creation"); +SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, ""); + +static int tuntap_name2info(const char *name, int *unit, int *flags); +static void tunclone(void *arg, struct ucred *cred, char *name, + int namelen, struct cdev **dev); +static void tuncreate(struct cdev *dev, struct tuntap_driver *); +static int tunifioctl(struct ifnet *, u_long, caddr_t); +static void tuninit(struct ifnet *); +static void tunifinit(void *xtp); +static int tuntapmodevent(module_t, int, void *); +static int tunoutput(struct ifnet *, struct mbuf *, + const struct sockaddr *, struct route *ro); +static void tunstart(struct ifnet *); +static void tunstart_l2(struct ifnet *); + +static int tun_clone_match(struct if_clone *ifc, const char *name); +static int tap_clone_match(struct if_clone *ifc, const char *name); +static int vmnet_clone_match(struct if_clone *ifc, const char *name); +static int tun_clone_create(struct if_clone *, char *, size_t, caddr_t); +static int tun_clone_destroy(struct if_clone *, struct ifnet *); + +static d_open_t tunopen; +static d_close_t tunclose; +static d_read_t tunread; +static d_write_t tunwrite; +static d_ioctl_t tunioctl; +static d_poll_t tunpoll; +static d_kqfilter_t tunkqfilter; + +static int tunkqread(struct knote *, long); +static int tunkqwrite(struct knote *, long); +static void tunkqdetach(struct knote *); + +static struct filterops tun_read_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = tunkqdetach, + .f_event = tunkqread, +}; + +static struct filterops tun_write_filterops = { + .f_isfd = 1, + .f_attach = NULL, + .f_detach = tunkqdetach, + .f_event = tunkqwrite, +}; + +static struct tuntap_driver { + struct cdevsw cdevsw; + int ident_flags; + struct unrhdr *unrhdr; + struct clonedevs *clones; + ifc_match_t *clone_match_fn; + ifc_create_t *clone_create_fn; + ifc_destroy_t *clone_destroy_fn; +} tuntap_drivers[] = { + { + .ident_flags = 0, + .cdevsw = { + .d_version = D_VERSION, + .d_flags = D_NEEDMINOR, + .d_open = tunopen, + .d_close = tunclose, + .d_read = tunread, + .d_write = tunwrite, + .d_ioctl = tunioctl, + .d_poll = tunpoll, + .d_kqfilter = tunkqfilter, + .d_name = tunname, + }, + .clone_match_fn = tun_clone_match, + .clone_create_fn = tun_clone_create, + .clone_destroy_fn = tun_clone_destroy, + }, + { + .ident_flags = TUN_L2, + .cdevsw = { + .d_version = D_VERSION, + .d_flags = D_NEEDMINOR, + .d_open = tunopen, + .d_close = tunclose, + .d_read = tunread, + .d_write = tunwrite, + .d_ioctl = tunioctl, + .d_poll = tunpoll, + .d_kqfilter = tunkqfilter, + .d_name = tapname, + }, + .clone_match_fn = tap_clone_match, + .clone_create_fn = tun_clone_create, + .clone_destroy_fn = tun_clone_destroy, + }, + { + .ident_flags = TUN_L2 | TUN_VMNET, + .cdevsw = { + .d_version = D_VERSION, + .d_flags = D_NEEDMINOR, + .d_open = tunopen, + .d_close = tunclose, + .d_read = tunread, + .d_write = tunwrite, + .d_ioctl = tunioctl, + .d_poll = tunpoll, + .d_kqfilter = tunkqfilter, + .d_name = vmnetname, + }, + .clone_match_fn = vmnet_clone_match, + .clone_create_fn = tun_clone_create, + .clone_destroy_fn = tun_clone_destroy, + }, +}; + +struct tuntap_driver_cloner { + SLIST_ENTRY(tuntap_driver_cloner) link; + struct tuntap_driver *drv; + struct if_clone *cloner; +}; + +VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) = + SLIST_HEAD_INITIALIZER(tuntap_driver_cloners); + +#define V_tuntap_driver_cloners VNET(tuntap_driver_cloners) + +/* + * Sets unit and/or flags given the device name. Must be called with correct + * vnet context. + */ +static int +tuntap_name2info(const char *name, int *outunit, int *outflags) +{ + struct tuntap_driver *drv; + struct tuntap_driver_cloner *drvc; + char *dname; + int flags, unit; + bool found; + + if (name == NULL) + return (EINVAL); + + /* + * Needed for dev_stdclone, but dev_stdclone will not modify, it just + * wants to be able to pass back a char * through the second param. We + * will always set that as NULL here, so we'll fake it. + */ + dname = __DECONST(char *, name); + found = false; + + KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), + ("tuntap_driver_cloners failed to initialize")); + SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { + KASSERT(drvc->drv != NULL, + ("tuntap_driver_cloners entry not properly initialized")); + drv = drvc->drv; + + if (strcmp(name, drv->cdevsw.d_name) == 0) { + found = true; + unit = -1; + flags = drv->ident_flags; + break; + } + + if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) { + found = true; + flags = drv->ident_flags; + break; + } + } + + if (!found) + return (ENXIO); + + if (outunit != NULL) + *outunit = unit; + if (outflags != NULL) + *outflags = flags; + return (0); +} + +/* + * Get driver information from a set of flags specified. Masks the identifying + * part of the flags and compares it against all of the available + * tuntap_drivers. Must be called with correct vnet context. + */ +static struct tuntap_driver * +tuntap_driver_from_flags(int tun_flags) +{ + struct tuntap_driver *drv; + struct tuntap_driver_cloner *drvc; + + KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), + ("tuntap_driver_cloners failed to initialize")); + SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { + KASSERT(drvc->drv != NULL, + ("tuntap_driver_cloners entry not properly initialized")); + drv = drvc->drv; + if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags) + return (drv); + } + + return (NULL); +} + + + +static int +tun_clone_match(struct if_clone *ifc, const char *name) +{ + int tunflags; + + if (tuntap_name2info(name, NULL, &tunflags) == 0) { + if ((tunflags & TUN_L2) == 0) + return (1); + } + + return (0); +} + +static int +tap_clone_match(struct if_clone *ifc, const char *name) +{ + int tunflags; + + if (tuntap_name2info(name, NULL, &tunflags) == 0) { + if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2) + return (1); + } + + return (0); +} + +static int +vmnet_clone_match(struct if_clone *ifc, const char *name) +{ + int tunflags; + + if (tuntap_name2info(name, NULL, &tunflags) == 0) { + if ((tunflags & TUN_VMNET) != 0) + return (1); + } + + return (0); +} + +static int +tun_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) +{ + struct tuntap_driver *drv; + struct cdev *dev; + int err, i, tunflags, unit; + + tunflags = 0; + /* The name here tells us exactly what we're creating */ + err = tuntap_name2info(name, &unit, &tunflags); + if (err != 0) + return (err); + + drv = tuntap_driver_from_flags(tunflags); + if (drv == NULL) + return (ENXIO); + + if (unit != -1) { + /* If this unit number is still available that's okay. */ + if (alloc_unr_specific(drv->unrhdr, unit) == -1) + return (EEXIST); + } else { + unit = alloc_unr(drv->unrhdr); + } + + snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit); + + /* find any existing device, or allocate new unit number */ + i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0); + if (i) { + /* No preexisting struct cdev *, create one */ + dev = make_dev(&drv->cdevsw, unit, UID_UUCP, GID_DIALER, 0600, + "%s%d", drv->cdevsw.d_name, unit); + } + + tuncreate(dev, drv); + + return (0); +} + +static void +tunclone(void *arg, struct ucred *cred, char *name, int namelen, + struct cdev **dev) +{ + char devname[SPECNAMELEN + 1]; + struct tuntap_driver *drv; + int append_unit, i, u, tunflags; + bool mayclone; + + if (*dev != NULL) + return; + + tunflags = 0; + CURVNET_SET(CRED_TO_VNET(cred)); + if (tuntap_name2info(name, &u, &tunflags) != 0) + goto out; /* Not recognized */ + + if (u != -1 && u > IF_MAXUNIT) + goto out; /* Unit number too high */ + + mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0; + if ((tunflags & TUN_L2) != 0) { + /* tap/vmnet allow user open with a sysctl */ + mayclone = (mayclone || tap_allow_uopen) && tapdclone; + } else { + mayclone = mayclone && tundclone; + } + + /* + * If tun cloning is enabled, only the superuser can create an + * interface. + */ + if (!mayclone) + goto out; + + if (u == -1) + append_unit = 1; + else + append_unit = 0; + + drv = tuntap_driver_from_flags(tunflags); + if (drv == NULL) + goto out; + + /* find any existing device, or allocate new unit number */ + i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0); + if (i) { + if (append_unit) { + namelen = snprintf(devname, sizeof(devname), "%s%d", + name, u); + name = devname; + } + /* No preexisting struct cdev *, create one */ + *dev = make_dev_credf(MAKEDEV_REF, &drv->cdevsw, u, cred, + UID_UUCP, GID_DIALER, 0600, "%s", name); + } + + if_clone_create(name, namelen, NULL); +out: + CURVNET_RESTORE(); +} + +static void +tun_destroy(struct tuntap_softc *tp) +{ + + TUN_LOCK(tp); + tp->tun_flags |= TUN_DYING; + if ((tp->tun_flags & TUN_OPEN) != 0) + cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); + else + TUN_UNLOCK(tp); + + CURVNET_SET(TUN2IFP(tp)->if_vnet); + + destroy_dev(tp->tun_dev); + seldrain(&tp->tun_rsel); + knlist_clear(&tp->tun_rsel.si_note, 0); + knlist_destroy(&tp->tun_rsel.si_note); + if ((tp->tun_flags & TUN_L2) != 0) { + ether_ifdetach(TUN2IFP(tp)); + } else { + bpfdetach(TUN2IFP(tp)); + if_detach(TUN2IFP(tp)); + } + sx_xlock(&tun_ioctl_sx); + TUN2IFP(tp)->if_softc = NULL; + sx_xunlock(&tun_ioctl_sx); + free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); + if_free(TUN2IFP(tp)); + mtx_destroy(&tp->tun_mtx); + cv_destroy(&tp->tun_cv); + free(tp, M_TUN); + CURVNET_RESTORE(); +} + +static int +tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp) +{ + struct tuntap_softc *tp = ifp->if_softc; + + mtx_lock(&tunmtx); + TAILQ_REMOVE(&tunhead, tp, tun_list); + mtx_unlock(&tunmtx); + tun_destroy(tp); + + return (0); +} + +static void +vnet_tun_init(const void *unused __unused) +{ + struct tuntap_driver *drv; + struct tuntap_driver_cloner *drvc; + int i; + + for (i = 0; i < nitems(tuntap_drivers); ++i) { + drv = &tuntap_drivers[i]; + drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO); + + drvc->drv = drv; + drvc->cloner = if_clone_advanced(drv->cdevsw.d_name, 0, + drv->clone_match_fn, drv->clone_create_fn, + drv->clone_destroy_fn); + SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link); + }; +} +VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_tun_init, NULL); + +#ifndef __rtems__ +static void +vnet_tun_uninit(const void *unused __unused) +{ + struct tuntap_driver_cloner *drvc; + + while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) { + drvc = SLIST_FIRST(&V_tuntap_driver_cloners); + SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link); + + if_clone_detach(drvc->cloner); + free(drvc, M_TUN); + } +} +VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_tun_uninit, NULL); + +static void +tun_uninit(const void *unused __unused) +{ + struct tuntap_driver *drv; + struct tuntap_softc *tp; + int i; + + EVENTHANDLER_DEREGISTER(dev_clone, tag); + drain_dev_clone_events(); + + mtx_lock(&tunmtx); + while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { + TAILQ_REMOVE(&tunhead, tp, tun_list); + mtx_unlock(&tunmtx); + tun_destroy(tp); + mtx_lock(&tunmtx); + } + mtx_unlock(&tunmtx); + for (i = 0; i < nitems(tuntap_drivers); ++i) { + drv = &tuntap_drivers[i]; + delete_unrhdr(drv->unrhdr); + clone_cleanup(&drv->clones); + } + mtx_destroy(&tunmtx); +} +SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); +#endif /* __rtems__ */ + +static int +tuntapmodevent(module_t mod, int type, void *data) +{ + struct tuntap_driver *drv; + int i; + + switch (type) { + case MOD_LOAD: + mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); + for (i = 0; i < nitems(tuntap_drivers); ++i) { + drv = &tuntap_drivers[i]; + clone_setup(&drv->clones); + drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); + } + tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); + if (tag == NULL) + return (ENOMEM); + break; + case MOD_UNLOAD: + /* See tun_uninit, so it's done after the vnet_sysuninit() */ + break; + default: + return EOPNOTSUPP; + } + return 0; +} + +static moduledata_t tuntap_mod = { + "if_tuntap", + tuntapmodevent, + 0 +}; + +DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(if_tuntap, 1); +MODULE_VERSION(if_tun, 1); +MODULE_VERSION(if_tap, 1); + +static void +tunstart(struct ifnet *ifp) +{ + struct tuntap_softc *tp = ifp->if_softc; + struct mbuf *m; + + TUNDEBUG(ifp, "starting\n"); + if (ALTQ_IS_ENABLED(&ifp->if_snd)) { + IFQ_LOCK(&ifp->if_snd); + IFQ_POLL_NOLOCK(&ifp->if_snd, m); + if (m == NULL) { + IFQ_UNLOCK(&ifp->if_snd); + return; + } + IFQ_UNLOCK(&ifp->if_snd); + } + + TUN_LOCK(tp); + if (tp->tun_flags & TUN_RWAIT) { + tp->tun_flags &= ~TUN_RWAIT; + wakeup(tp); + } + selwakeuppri(&tp->tun_rsel, PZERO + 1); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); + if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { + TUN_UNLOCK(tp); + pgsigio(&tp->tun_sigio, SIGIO, 0); + } else + TUN_UNLOCK(tp); +} + +/* + * tunstart_l2 + * + * queue packets from higher level ready to put out + */ +static void +tunstart_l2(struct ifnet *ifp) +{ + struct tuntap_softc *tp = ifp->if_softc; + + TUNDEBUG(ifp, "starting\n"); + + /* + * do not junk pending output if we are in VMnet mode. + * XXX: can this do any harm because of queue overflow? + */ + + TUN_LOCK(tp); + if (((tp->tun_flags & TUN_VMNET) == 0) && + ((tp->tun_flags & TUN_READY) != TUN_READY)) { + struct mbuf *m; + + /* Unlocked read. */ + TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags); + + for (;;) { + IF_DEQUEUE(&ifp->if_snd, m); + if (m != NULL) { + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + } else + break; + } + TUN_UNLOCK(tp); + + return; + } + + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + if (tp->tun_flags & TUN_RWAIT) { + tp->tun_flags &= ~TUN_RWAIT; + wakeup(tp); + } + + if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) { + TUN_UNLOCK(tp); + pgsigio(&tp->tun_sigio, SIGIO, 0); + TUN_LOCK(tp); + } + + selwakeuppri(&tp->tun_rsel, PZERO+1); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ + } + + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + TUN_UNLOCK(tp); +} /* tunstart_l2 */ + + +/* XXX: should return an error code so it can fail. */ +static void +tuncreate(struct cdev *dev, struct tuntap_driver *drv) +{ + struct tuntap_softc *sc; + struct ifnet *ifp; + struct ether_addr eaddr; + int iflags; + u_char type; + + sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO); + mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF); + cv_init(&sc->tun_cv, "tun_condvar"); + sc->tun_flags = drv->ident_flags; + sc->tun_dev = dev; + sc->tun_drv = drv; + mtx_lock(&tunmtx); + TAILQ_INSERT_TAIL(&tunhead, sc, tun_list); + mtx_unlock(&tunmtx); + + iflags = IFF_MULTICAST; + if ((sc->tun_flags & TUN_L2) != 0) { + type = IFT_ETHER; + iflags |= IFF_BROADCAST | IFF_SIMPLEX; + } else { + type = IFT_PPP; + iflags |= IFF_POINTOPOINT; + } + ifp = sc->tun_ifp = if_alloc(type); + if (ifp == NULL) + panic("%s%d: failed to if_alloc() interface.\n", + drv->cdevsw.d_name, dev2unit(dev)); + ifp->if_softc = sc; + if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev)); + ifp->if_ioctl = tunifioctl; + ifp->if_flags = iflags; + IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); + knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx); + ifp->if_capabilities |= IFCAP_LINKSTATE; + ifp->if_capenable |= IFCAP_LINKSTATE; + + if ((sc->tun_flags & TUN_L2) != 0) { + ifp->if_mtu = ETHERMTU; + ifp->if_init = tunifinit; + ifp->if_start = tunstart_l2; + + ether_gen_addr(ifp, &eaddr); + ether_ifattach(ifp, eaddr.octet); + } else { + ifp->if_mtu = TUNMTU; + ifp->if_start = tunstart; + ifp->if_output = tunoutput; + + ifp->if_snd.ifq_drv_maxlen = 0; + IFQ_SET_READY(&ifp->if_snd); + + if_attach(ifp); + bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); + } + dev->si_drv1 = sc; + + TUN_LOCK(sc); + sc->tun_flags |= TUN_INITED; + TUN_UNLOCK(sc); + + TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); +} + +static int +tunopen(struct cdev *dev, int flag, int mode, struct thread *td) +{ + struct ifnet *ifp; + struct tuntap_driver *drv; + struct tuntap_softc *tp; + int error, tunflags; + + tunflags = 0; + CURVNET_SET(TD_TO_VNET(td)); + error = tuntap_name2info(dev->si_name, NULL, &tunflags); + if (error != 0) { + CURVNET_RESTORE(); + return (error); /* Shouldn't happen */ + } + + if ((tunflags & TUN_L2) != 0) { + /* Restrict? */ + if (tap_allow_uopen == 0) { + error = priv_check(td, PRIV_NET_TAP); + if (error != 0) { + CURVNET_RESTORE(); + return (error); + } + } + } + + /* + * XXXRW: Non-atomic test and set of dev->si_drv1 requires + * synchronization. + */ + tp = dev->si_drv1; + if (!tp) { + drv = tuntap_driver_from_flags(tunflags); + if (drv == NULL) { + CURVNET_RESTORE(); + return (ENXIO); + } + tuncreate(dev, drv); + tp = dev->si_drv1; + } + + TUN_LOCK(tp); + if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { + TUN_UNLOCK(tp); + CURVNET_RESTORE(); + return (EBUSY); + } + + ifp = TUN2IFP(tp); + + if ((tp->tun_flags & TUN_L2) != 0) { + bcopy(IF_LLADDR(ifp), tp->tun_ether.octet, + sizeof(tp->tun_ether.octet)); + + ifp->if_drv_flags |= IFF_DRV_RUNNING; + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + + if (tapuponopen) + ifp->if_flags |= IFF_UP; + } + +#ifndef __rtems__ + tp->tun_pid = td->td_proc->p_pid; +#endif /* __rtems__ */ + tp->tun_flags |= TUN_OPEN; + + if_link_state_change(ifp, LINK_STATE_UP); + TUNDEBUG(ifp, "open\n"); + TUN_UNLOCK(tp); + CURVNET_RESTORE(); + return (0); +} + +/* + * tunclose - close the device - mark i/f down & delete + * routing info + */ +static int +tunclose(struct cdev *dev, int foo, int bar, struct thread *td) +{ + struct tuntap_softc *tp; + struct ifnet *ifp; + bool l2tun; + + tp = dev->si_drv1; + ifp = TUN2IFP(tp); + + TUN_LOCK(tp); +#ifndef __rtems__ + /* + * Simply close the device if this isn't the controlling process. This + * may happen if, for instance, the tunnel has been handed off to + * another process. The original controller should be able to close it + * without putting us into an inconsistent state. + */ + if (td->td_proc->p_pid != tp->tun_pid) { + TUN_UNLOCK(tp); + return (0); + } +#endif /* __rtems__ */ + + /* + * junk all pending output + */ + CURVNET_SET(ifp->if_vnet); + + l2tun = false; + if ((tp->tun_flags & TUN_L2) != 0) { + l2tun = true; + IF_DRAIN(&ifp->if_snd); + } else { + IFQ_PURGE(&ifp->if_snd); + } + + /* For vmnet, we won't do most of the address/route bits */ + if ((tp->tun_flags & TUN_VMNET) != 0 || + (l2tun && (ifp->if_flags & IFF_LINK0) != 0)) + goto out; + + if (ifp->if_flags & IFF_UP) { + TUN_UNLOCK(tp); + if_down(ifp); + TUN_LOCK(tp); + } + + /* Delete all addresses and routes which reference this interface. */ + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + struct ifaddr *ifa; + + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + TUN_UNLOCK(tp); + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + /* deal w/IPv4 PtP destination; unlocked read */ + if (!l2tun && ifa->ifa_addr->sa_family == AF_INET) { + rtinit(ifa, (int)RTM_DELETE, + tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0); + } else { + rtinit(ifa, (int)RTM_DELETE, 0); + } + } + if_purgeaddrs(ifp); + TUN_LOCK(tp); + } + +out: + if_link_state_change(ifp, LINK_STATE_DOWN); + CURVNET_RESTORE(); + + funsetown(&tp->tun_sigio); + selwakeuppri(&tp->tun_rsel, PZERO + 1); + KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); + TUNDEBUG (ifp, "closed\n"); + tp->tun_flags &= ~TUN_OPEN; +#ifndef __rtems__ + tp->tun_pid = 0; +#endif /* __rtems__ */ + + cv_broadcast(&tp->tun_cv); + TUN_UNLOCK(tp); + return (0); +} + +static void +tuninit(struct ifnet *ifp) +{ + struct tuntap_softc *tp = ifp->if_softc; +#ifdef INET + struct ifaddr *ifa; +#endif + + TUNDEBUG(ifp, "tuninit\n"); + + TUN_LOCK(tp); + ifp->if_drv_flags |= IFF_DRV_RUNNING; + if ((tp->tun_flags & TUN_L2) == 0) { + ifp->if_flags |= IFF_UP; + getmicrotime(&ifp->if_lastchange); +#ifdef INET + if_addr_rlock(ifp); + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family == AF_INET) { + struct sockaddr_in *si; + + si = (struct sockaddr_in *)ifa->ifa_addr; + if (si->sin_addr.s_addr) + tp->tun_flags |= TUN_IASET; + + si = (struct sockaddr_in *)ifa->ifa_dstaddr; + if (si && si->sin_addr.s_addr) + tp->tun_flags |= TUN_DSTADDR; + } + } + if_addr_runlock(ifp); +#endif + TUN_UNLOCK(tp); + } else { + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + TUN_UNLOCK(tp); + /* attempt to start output */ + tunstart_l2(ifp); + } + +} + +/* + * Used only for l2 tunnel. + */ +static void +tunifinit(void *xtp) +{ + struct tuntap_softc *tp; + + tp = (struct tuntap_softc *)xtp; + tuninit(tp->tun_ifp); +} + +/* + * Process an ioctl request. + */ +static int +tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct ifreq *ifr = (struct ifreq *)data; + struct tuntap_softc *tp; + struct ifstat *ifs; + struct ifmediareq *ifmr; + int dummy, error = 0; + bool l2tun; + + ifmr = NULL; + sx_xlock(&tun_ioctl_sx); + tp = ifp->if_softc; + if (tp == NULL) { + error = ENXIO; + goto bad; + } + l2tun = (tp->tun_flags & TUN_L2) != 0; + switch(cmd) { + case SIOCGIFSTATUS: + ifs = (struct ifstat *)data; + TUN_LOCK(tp); +#ifndef __rtems__ + if (tp->tun_pid) + snprintf(ifs->ascii, sizeof(ifs->ascii), + "\tOpened by PID %d\n", tp->tun_pid); + else +#endif /* __rtems__ */ + ifs->ascii[0] = '\0'; + TUN_UNLOCK(tp); + break; + case SIOCSIFADDR: + if (l2tun) + error = ether_ioctl(ifp, cmd, data); + else + tuninit(ifp); + if (error == 0) + TUNDEBUG(ifp, "address set\n"); + break; + case SIOCSIFMTU: + ifp->if_mtu = ifr->ifr_mtu; + TUNDEBUG(ifp, "mtu set\n"); + break; + case SIOCSIFFLAGS: + case SIOCADDMULTI: + case SIOCDELMULTI: + break; + case SIOCGIFMEDIA: + if (!l2tun) { + error = EINVAL; + break; + } + + ifmr = (struct ifmediareq *)data; + dummy = ifmr->ifm_count; + ifmr->ifm_count = 1; + ifmr->ifm_status = IFM_AVALID; + ifmr->ifm_active = IFM_ETHER; + if (tp->tun_flags & TUN_OPEN) + ifmr->ifm_status |= IFM_ACTIVE; + ifmr->ifm_current = ifmr->ifm_active; + if (dummy >= 1) { + int media = IFM_ETHER; + error = copyout(&media, ifmr->ifm_ulist, sizeof(int)); + } + break; + default: + if (l2tun) { + error = ether_ioctl(ifp, cmd, data); + } else { + error = EINVAL; + } + } +bad: + sx_xunlock(&tun_ioctl_sx); + return (error); +} + +/* + * tunoutput - queue packets from higher level ready to put out. + */ +static int +tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, + struct route *ro) +{ + struct tuntap_softc *tp = ifp->if_softc; + u_short cached_tun_flags; + int error; + u_int32_t af; + + TUNDEBUG (ifp, "tunoutput\n"); + +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m0); + if (error) { + m_freem(m0); + return (error); + } +#endif + + /* Could be unlocked read? */ + TUN_LOCK(tp); + cached_tun_flags = tp->tun_flags; + TUN_UNLOCK(tp); + if ((cached_tun_flags & TUN_READY) != TUN_READY) { + TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); + m_freem (m0); + return (EHOSTDOWN); + } + + if ((ifp->if_flags & IFF_UP) != IFF_UP) { + m_freem (m0); + return (EHOSTDOWN); + } + + /* BPF writes need to be handled specially. */ + if (dst->sa_family == AF_UNSPEC) + bcopy(dst->sa_data, &af, sizeof(af)); + else + af = dst->sa_family; + + if (bpf_peers_present(ifp->if_bpf)) + bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); + + /* prepend sockaddr? this may abort if the mbuf allocation fails */ + if (cached_tun_flags & TUN_LMODE) { + /* allocate space for sockaddr */ + M_PREPEND(m0, dst->sa_len, M_NOWAIT); + + /* if allocation failed drop packet */ + if (m0 == NULL) { + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (ENOBUFS); + } else { + bcopy(dst, m0->m_data, dst->sa_len); + } + } + + if (cached_tun_flags & TUN_IFHEAD) { + /* Prepend the address family */ + M_PREPEND(m0, 4, M_NOWAIT); + + /* if allocation failed drop packet */ + if (m0 == NULL) { + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (ENOBUFS); + } else + *(u_int32_t *)m0->m_data = htonl(af); + } else { +#ifdef INET + if (af != AF_INET) +#endif + { + m_freem(m0); + return (EAFNOSUPPORT); + } + } + + error = (ifp->if_transmit)(ifp, m0); + if (error) + return (ENOBUFS); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + return (0); +} + +/* + * the cdevsw interface is now pretty minimal. + */ +static int +tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, + struct thread *td) +{ + struct ifreq ifr, *ifrp; + struct tuntap_softc *tp = dev->si_drv1; + struct tuninfo *tunp; + int error, iflags; +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) + int ival; +#endif + bool l2tun; + + l2tun = (tp->tun_flags & TUN_L2) != 0; + if (l2tun) { + /* tap specific ioctls */ + switch(cmd) { + /* VMware/VMnet port ioctl's */ +#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ + defined(COMPAT_FREEBSD4) + case _IO('V', 0): + ival = IOCPARM_IVAL(data); + data = (caddr_t)&ival; + /* FALLTHROUGH */ +#endif + case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ + iflags = *(int *)data; + iflags &= TUN_VMIO_FLAG_MASK; + iflags &= ~IFF_CANTCHANGE; + iflags |= IFF_UP; + + TUN_LOCK(tp); + TUN2IFP(tp)->if_flags = iflags | + (TUN2IFP(tp)->if_flags & IFF_CANTCHANGE); + TUN_UNLOCK(tp); + + return (0); + case SIOCGIFADDR: /* get MAC address of the remote side */ + TUN_LOCK(tp); + bcopy(&tp->tun_ether.octet, data, + sizeof(tp->tun_ether.octet)); + TUN_UNLOCK(tp); + + return (0); + case SIOCSIFADDR: /* set MAC address of the remote side */ + TUN_LOCK(tp); + bcopy(data, &tp->tun_ether.octet, + sizeof(tp->tun_ether.octet)); + TUN_UNLOCK(tp); + + return (0); + } + + /* Fall through to the common ioctls if unhandled */ + } else { + switch (cmd) { + case TUNSLMODE: + TUN_LOCK(tp); + if (*(int *)data) { + tp->tun_flags |= TUN_LMODE; + tp->tun_flags &= ~TUN_IFHEAD; + } else + tp->tun_flags &= ~TUN_LMODE; + TUN_UNLOCK(tp); + + return (0); + case TUNSIFHEAD: + TUN_LOCK(tp); + if (*(int *)data) { + tp->tun_flags |= TUN_IFHEAD; + tp->tun_flags &= ~TUN_LMODE; + } else + tp->tun_flags &= ~TUN_IFHEAD; + TUN_UNLOCK(tp); + + return (0); + case TUNGIFHEAD: + TUN_LOCK(tp); + *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0; + TUN_UNLOCK(tp); + + return (0); + case TUNSIFMODE: + /* deny this if UP */ + if (TUN2IFP(tp)->if_flags & IFF_UP) + return (EBUSY); + + switch (*(int *)data & ~IFF_MULTICAST) { + case IFF_POINTOPOINT: + case IFF_BROADCAST: + TUN_LOCK(tp); + TUN2IFP(tp)->if_flags &= + ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); + TUN2IFP(tp)->if_flags |= *(int *)data; + TUN_UNLOCK(tp); + + break; + default: + return (EINVAL); + } + + return (0); + case TUNSIFPID: +#ifndef __rtems__ + TUN_LOCK(tp); + tp->tun_pid = curthread->td_proc->p_pid; + TUN_UNLOCK(tp); +#endif /* __rtems__ */ + + return (0); + } + /* Fall through to the common ioctls if unhandled */ + } + + switch (cmd) { + case TUNGIFNAME: + ifrp = (struct ifreq *)data; + strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ); + + return (0); + case TUNSIFINFO: + tunp = (struct tuninfo *)data; + if (TUN2IFP(tp)->if_type != tunp->type) + return (EPROTOTYPE); + TUN_LOCK(tp); + if (TUN2IFP(tp)->if_mtu != tunp->mtu) { + strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); + ifr.ifr_mtu = tunp->mtu; + CURVNET_SET(TUN2IFP(tp)->if_vnet); + error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), + (caddr_t)&ifr, td); + CURVNET_RESTORE(); + if (error) { + TUN_UNLOCK(tp); + return (error); + } + } + TUN2IFP(tp)->if_baudrate = tunp->baudrate; + TUN_UNLOCK(tp); + break; + case TUNGIFINFO: + tunp = (struct tuninfo *)data; + TUN_LOCK(tp); + tunp->mtu = TUN2IFP(tp)->if_mtu; + tunp->type = TUN2IFP(tp)->if_type; + tunp->baudrate = TUN2IFP(tp)->if_baudrate; + TUN_UNLOCK(tp); + break; + case TUNSDEBUG: + tundebug = *(int *)data; + break; + case TUNGDEBUG: + *(int *)data = tundebug; + break; + case FIONBIO: + break; + case FIOASYNC: + TUN_LOCK(tp); + if (*(int *)data) + tp->tun_flags |= TUN_ASYNC; + else + tp->tun_flags &= ~TUN_ASYNC; + TUN_UNLOCK(tp); + break; + case FIONREAD: + if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { + struct mbuf *mb; + IFQ_LOCK(&TUN2IFP(tp)->if_snd); + IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); + for (*(int *)data = 0; mb != NULL; mb = mb->m_next) + *(int *)data += mb->m_len; + IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); + } else + *(int *)data = 0; + break; + case FIOSETOWN: + return (fsetown(*(int *)data, &tp->tun_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(&tp->tun_sigio); + return (0); + + /* This is deprecated, FIOSETOWN should be used instead. */ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &tp->tun_sigio)); + + /* This is deprecated, FIOGETOWN should be used instead. */ + case TIOCGPGRP: + *(int *)data = -fgetown(&tp->tun_sigio); + return (0); + + default: + return (ENOTTY); + } + return (0); +} + +/* + * The cdevsw read interface - reads a packet at a time, or at + * least as much of a packet as can be read. + */ +static int +tunread(struct cdev *dev, struct uio *uio, int flag) +{ + struct tuntap_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + struct mbuf *m; + int error=0, len; + + TUNDEBUG (ifp, "read\n"); + TUN_LOCK(tp); + if ((tp->tun_flags & TUN_READY) != TUN_READY) { + TUN_UNLOCK(tp); + TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); + return (EHOSTDOWN); + } + + tp->tun_flags &= ~TUN_RWAIT; + + for (;;) { + IFQ_DEQUEUE(&ifp->if_snd, m); + if (m != NULL) + break; + if (flag & O_NONBLOCK) { + TUN_UNLOCK(tp); + return (EWOULDBLOCK); + } + tp->tun_flags |= TUN_RWAIT; + error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), + "tunread", 0); + if (error != 0) { + TUN_UNLOCK(tp); + return (error); + } + } + TUN_UNLOCK(tp); + + if ((tp->tun_flags & TUN_L2) != 0) + BPF_MTAP(ifp, m); + + while (m && uio->uio_resid > 0 && error == 0) { + len = min(uio->uio_resid, m->m_len); + if (len != 0) + error = uiomove(mtod(m, void *), len, uio); + m = m_free(m); + } + + if (m) { + TUNDEBUG(ifp, "Dropping mbuf\n"); + m_freem(m); + } + return (error); +} + +static int +tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m) +{ + struct ether_header *eh; + struct ifnet *ifp; + + ifp = TUN2IFP(tp); + + /* + * Only pass a unicast frame to ether_input(), if it would + * actually have been received by non-virtual hardware. + */ + if (m->m_len < sizeof(struct ether_header)) { + m_freem(m); + return (0); + } + + eh = mtod(m, struct ether_header *); + + if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && + !ETHER_IS_MULTICAST(eh->ether_dhost) && + bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { + m_freem(m); + return (0); + } + + /* Pass packet up to parent. */ + CURVNET_SET(ifp->if_vnet); + (*ifp->if_input)(ifp, m); + CURVNET_RESTORE(); + /* ibytes are counted in parent */ + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + return (0); +} + +static int +tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m) +{ + struct ifnet *ifp; + int family, isr; + + ifp = TUN2IFP(tp); + /* Could be unlocked read? */ + TUN_LOCK(tp); + if (tp->tun_flags & TUN_IFHEAD) { + TUN_UNLOCK(tp); + if (m->m_len < sizeof(family) && + (m = m_pullup(m, sizeof(family))) == NULL) + return (ENOBUFS); + family = ntohl(*mtod(m, u_int32_t *)); + m_adj(m, sizeof(family)); + } else { + TUN_UNLOCK(tp); + family = AF_INET; + } + + BPF_MTAP2(ifp, &family, sizeof(family), m); + + switch (family) { +#ifdef INET + case AF_INET: + isr = NETISR_IP; + break; +#endif +#ifdef INET6 + case AF_INET6: + isr = NETISR_IPV6; + break; +#endif + default: + m_freem(m); + return (EAFNOSUPPORT); + } + random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + CURVNET_SET(ifp->if_vnet); + M_SETFIB(m, ifp->if_fib); + netisr_dispatch(isr, m); + CURVNET_RESTORE(); + return (0); +} + +/* + * the cdevsw write interface - an atomic write is a packet - or else! + */ +static int +tunwrite(struct cdev *dev, struct uio *uio, int flag) +{ + struct tuntap_softc *tp; + struct ifnet *ifp; + struct mbuf *m; + uint32_t mru; + int align; + bool l2tun; + + tp = dev->si_drv1; + ifp = TUN2IFP(tp); + TUNDEBUG(ifp, "tunwrite\n"); + if ((ifp->if_flags & IFF_UP) != IFF_UP) + /* ignore silently */ + return (0); + + if (uio->uio_resid == 0) + return (0); + + l2tun = (tp->tun_flags & TUN_L2) != 0; + align = 0; + mru = l2tun ? TAPMRU : TUNMRU; + if (l2tun) + align = ETHER_ALIGN; + else if ((tp->tun_flags & TUN_IFHEAD) != 0) + mru += sizeof(uint32_t); /* family */ + if (uio->uio_resid < 0 || uio->uio_resid > mru) { + TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); + return (EIO); + } + + if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) { + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); + return (ENOBUFS); + } + + m->m_pkthdr.rcvif = ifp; +#ifdef MAC + mac_ifnet_create_mbuf(ifp, m); +#endif + + if (l2tun) + return (tunwrite_l2(tp, m)); + + return (tunwrite_l3(tp, m)); +} + +/* + * tunpoll - the poll interface, this is only useful on reads + * really. The write detect always returns true, write never blocks + * anyway, it either accepts the packet or drops it. + */ +static int +tunpoll(struct cdev *dev, int events, struct thread *td) +{ + struct tuntap_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + int revents = 0; + + TUNDEBUG(ifp, "tunpoll\n"); + + if (events & (POLLIN | POLLRDNORM)) { + IFQ_LOCK(&ifp->if_snd); + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); + revents |= events & (POLLIN | POLLRDNORM); + } else { + TUNDEBUG(ifp, "tunpoll waiting\n"); + selrecord(td, &tp->tun_rsel); + } + IFQ_UNLOCK(&ifp->if_snd); + } + revents |= events & (POLLOUT | POLLWRNORM); + + return (revents); +} + +/* + * tunkqfilter - support for the kevent() system call. + */ +static int +tunkqfilter(struct cdev *dev, struct knote *kn) +{ + struct tuntap_softc *tp = dev->si_drv1; + struct ifnet *ifp = TUN2IFP(tp); + + switch(kn->kn_filter) { + case EVFILT_READ: + TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + kn->kn_fop = &tun_read_filterops; + break; + + case EVFILT_WRITE: + TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + kn->kn_fop = &tun_write_filterops; + break; + + default: + TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", + ifp->if_xname, dev2unit(dev)); + return(EINVAL); + } + + kn->kn_hook = tp; + knlist_add(&tp->tun_rsel.si_note, kn, 0); + + return (0); +} + +/* + * Return true of there is data in the interface queue. + */ +static int +tunkqread(struct knote *kn, long hint) +{ + int ret; + struct tuntap_softc *tp = kn->kn_hook; + struct cdev *dev = tp->tun_dev; + struct ifnet *ifp = TUN2IFP(tp); + + if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { + TUNDEBUG(ifp, + "%s have data in the queue. Len = %d, minor = %#x\n", + ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); + ret = 1; + } else { + TUNDEBUG(ifp, + "%s waiting for data, minor = %#x\n", ifp->if_xname, + dev2unit(dev)); + ret = 0; + } + + return (ret); +} + +/* + * Always can write, always return MTU in kn->data. + */ +static int +tunkqwrite(struct knote *kn, long hint) +{ + struct tuntap_softc *tp = kn->kn_hook; + struct ifnet *ifp = TUN2IFP(tp); + + kn->kn_data = ifp->if_mtu; + + return (1); +} + +static void +tunkqdetach(struct knote *kn) +{ + struct tuntap_softc *tp = kn->kn_hook; + + knlist_remove(&tp->tun_rsel.si_note, kn, 0); +} diff --git a/freebsd/sys/net/if_var.h b/freebsd/sys/net/if_var.h index d23928e5..700296fa 100644 --- a/freebsd/sys/net/if_var.h +++ b/freebsd/sys/net/if_var.h @@ -73,6 +73,7 @@ struct netmap_adapter; struct netdump_methods; #ifdef _KERNEL +#include <sys/_eventhandler.h> #include <sys/mbuf.h> /* ifqueue only? */ #include <sys/buf_ring.h> #include <net/vnet.h> @@ -95,8 +96,9 @@ CK_STAILQ_HEAD(ifmultihead, ifmultiaddr); CK_STAILQ_HEAD(ifgrouphead, ifg_group); #ifdef _KERNEL -VNET_DECLARE(struct pfil_head, link_pfil_hook); /* packet filter hooks */ -#define V_link_pfil_hook VNET(link_pfil_hook) +VNET_DECLARE(struct pfil_head *, link_pfil_head); +#define V_link_pfil_head VNET(link_pfil_head) +#define PFIL_ETHER_NAME "ethernet" #define HHOOK_IPSEC_INET 0 #define HHOOK_IPSEC_INET6 1 @@ -193,11 +195,13 @@ struct if_encap_req { * m_snd_tag" comes from the network driver and it is free to allocate * as much additional space as it wants for its own use. */ +struct ktls_session; struct m_snd_tag; #define IF_SND_TAG_TYPE_RATE_LIMIT 0 #define IF_SND_TAG_TYPE_UNLIMITED 1 -#define IF_SND_TAG_TYPE_MAX 2 +#define IF_SND_TAG_TYPE_TLS 2 +#define IF_SND_TAG_TYPE_MAX 3 struct if_snd_tag_alloc_header { uint32_t type; /* send tag type, see IF_SND_TAG_XXX */ @@ -208,6 +212,14 @@ struct if_snd_tag_alloc_header { struct if_snd_tag_alloc_rate_limit { struct if_snd_tag_alloc_header hdr; uint64_t max_rate; /* in bytes/s */ + uint32_t flags; /* M_NOWAIT or M_WAITOK */ + uint32_t reserved; /* alignment */ +}; + +struct if_snd_tag_alloc_tls { + struct if_snd_tag_alloc_header hdr; + struct inpcb *inp; + const struct ktls_session *tls; }; struct if_snd_tag_rate_limit_params { @@ -215,13 +227,14 @@ struct if_snd_tag_rate_limit_params { uint32_t queue_level; /* 0 (empty) .. 65535 (full) */ #define IF_SND_QUEUE_LEVEL_MIN 0 #define IF_SND_QUEUE_LEVEL_MAX 65535 - uint32_t reserved; /* padding */ + uint32_t flags; /* M_NOWAIT or M_WAITOK */ }; union if_snd_tag_alloc_params { struct if_snd_tag_alloc_header hdr; struct if_snd_tag_alloc_rate_limit rate_limit; struct if_snd_tag_alloc_rate_limit unlimited; + struct if_snd_tag_alloc_tls tls; }; union if_snd_tag_modify_params { @@ -234,11 +247,37 @@ union if_snd_tag_query_params { struct if_snd_tag_rate_limit_params unlimited; }; +/* Query return flags */ +#define RT_NOSUPPORT 0x00000000 /* Not supported */ +#define RT_IS_INDIRECT 0x00000001 /* + * Interface like a lagg, select + * the actual interface for + * capabilities. + */ +#define RT_IS_SELECTABLE 0x00000002 /* + * No rate table, you select + * rates and the first + * number_of_rates are created. + */ +#define RT_IS_FIXED_TABLE 0x00000004 /* A fixed table is attached */ +#define RT_IS_UNUSABLE 0x00000008 /* It is not usable for this */ + +struct if_ratelimit_query_results { + const uint64_t *rate_table; /* Pointer to table if present */ + uint32_t flags; /* Flags indicating results */ + uint32_t max_flows; /* Max flows using, 0=unlimited */ + uint32_t number_of_rates; /* How many unique rates can be created */ + uint32_t min_segment_burst; /* The amount the adapter bursts at each send */ +}; + typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *); typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *); typedef void (if_snd_tag_free_t)(struct m_snd_tag *); +typedef void (if_ratelimit_query_t)(struct ifnet *, + struct if_ratelimit_query_results *); + /* * Structure defining a network interface. @@ -250,7 +289,9 @@ struct ifnet { CK_STAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if (CK_) */ /* protected by if_addr_lock */ u_char if_alloctype; /* if_type at time of allocation */ - +#ifndef __rtems__ + uint8_t if_numa_domain; /* NUMA domain of device */ +#endif /* __rtems__ */ /* Driver and protocol specific information that remains stable. */ void *if_softc; /* pointer to driver state */ void *if_llsoftc; /* link layer softc */ @@ -379,6 +420,7 @@ struct ifnet { if_snd_tag_modify_t *if_snd_tag_modify; if_snd_tag_query_t *if_snd_tag_query; if_snd_tag_free_t *if_snd_tag_free; + if_ratelimit_query_t *if_ratelimit_query; /* Ethernet PCP */ uint8_t if_pcp; @@ -416,24 +458,21 @@ struct rtems_ifinputreq { /* for compatibility with other BSDs */ #define if_name(ifp) ((ifp)->if_xname) +#define IF_NODOM 255 /* * Locks for address lists on the network interface. */ #define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_lock, "if_addr_lock", NULL, MTX_DEF) #define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_lock) -#define IF_ADDR_RLOCK(if) struct epoch_tracker if_addr_et; epoch_enter_preempt(net_epoch_preempt, &if_addr_et); -#define IF_ADDR_RUNLOCK(if) epoch_exit_preempt(net_epoch_preempt, &if_addr_et); #define IF_ADDR_WLOCK(if) mtx_lock(&(if)->if_addr_lock) #define IF_ADDR_WUNLOCK(if) mtx_unlock(&(if)->if_addr_lock) #define IF_ADDR_LOCK_ASSERT(if) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(if)->if_addr_lock)) #define IF_ADDR_WLOCK_ASSERT(if) mtx_assert(&(if)->if_addr_lock, MA_OWNED) -#define NET_EPOCH_ENTER() struct epoch_tracker nep_et; epoch_enter_preempt(net_epoch_preempt, &nep_et) -#define NET_EPOCH_ENTER_ET(et) epoch_enter_preempt(net_epoch_preempt, &(et)) -#define NET_EPOCH_EXIT() epoch_exit_preempt(net_epoch_preempt, &nep_et) -#define NET_EPOCH_EXIT_ET(et) epoch_exit_preempt(net_epoch_preempt, &(et)) -#define NET_EPOCH_WAIT() epoch_wait_preempt(net_epoch_preempt) - +#define NET_EPOCH_ENTER(et) epoch_enter_preempt(net_epoch_preempt, &(et)) +#define NET_EPOCH_EXIT(et) epoch_exit_preempt(net_epoch_preempt, &(et)) +#define NET_EPOCH_WAIT() epoch_wait_preempt(net_epoch_preempt) +#define NET_EPOCH_ASSERT() MPASS(in_epoch(net_epoch_preempt)) /* * Function variations on locking macros intended to be used by loadable @@ -446,7 +485,6 @@ void if_maddr_rlock(if_t ifp); /* if_multiaddrs */ void if_maddr_runlock(if_t ifp); /* if_multiaddrs */ #ifdef _KERNEL -#ifdef _SYS_EVENTHANDLER_H_ /* interface link layer address change event */ typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t); @@ -474,7 +512,6 @@ EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t); typedef void (*ifnet_event_fn)(void *, struct ifnet *ifp, int event); EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn); -#endif /* _SYS_EVENTHANDLER_H_ */ /* * interface groups @@ -513,16 +550,13 @@ EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t); mtx_init(&(ifp)->if_afdata_lock, "if_afdata", NULL, MTX_DEF) #define IF_AFDATA_WLOCK(ifp) mtx_lock(&(ifp)->if_afdata_lock) -#define IF_AFDATA_RLOCK(ifp) struct epoch_tracker if_afdata_et; epoch_enter_preempt(net_epoch_preempt, &if_afdata_et) #define IF_AFDATA_WUNLOCK(ifp) mtx_unlock(&(ifp)->if_afdata_lock) -#define IF_AFDATA_RUNLOCK(ifp) epoch_exit_preempt(net_epoch_preempt, &if_afdata_et) #define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp) #define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp) #define IF_AFDATA_TRYLOCK(ifp) mtx_trylock(&(ifp)->if_afdata_lock) #define IF_AFDATA_DESTROY(ifp) mtx_destroy(&(ifp)->if_afdata_lock) #define IF_AFDATA_LOCK_ASSERT(ifp) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ifp)->if_afdata_lock)) -#define IF_AFDATA_RLOCK_ASSERT(ifp) MPASS(in_epoch(net_epoch_preempt)); #define IF_AFDATA_WLOCK_ASSERT(ifp) mtx_assert(&(ifp)->if_afdata_lock, MA_OWNED) #define IF_AFDATA_UNLOCK_ASSERT(ifp) mtx_assert(&(ifp)->if_afdata_lock, MA_NOTOWNED) @@ -606,16 +640,13 @@ extern struct sx ifnet_sxlock; * write, but also whether it was acquired with sleep support or not. */ #define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED) -#define IFNET_RLOCK_NOSLEEP_ASSERT() MPASS(in_epoch(net_epoch_preempt)) #define IFNET_WLOCK_ASSERT() do { \ sx_assert(&ifnet_sxlock, SA_XLOCKED); \ rw_assert(&ifnet_rwlock, RA_WLOCKED); \ } while (0) #define IFNET_RLOCK() sx_slock(&ifnet_sxlock) -#define IFNET_RLOCK_NOSLEEP() struct epoch_tracker ifnet_rlock_et; epoch_enter_preempt(net_epoch_preempt, &ifnet_rlock_et) #define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock) -#define IFNET_RUNLOCK_NOSLEEP() epoch_exit_preempt(net_epoch_preempt, &ifnet_rlock_et) /* * Look up an ifnet given its index; the _ref variant also acquires a @@ -654,6 +685,8 @@ int if_delgroup(struct ifnet *, const char *); int if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **); int if_allmulti(struct ifnet *, int); struct ifnet* if_alloc(u_char); +struct ifnet* if_alloc_dev(u_char, device_t dev); +struct ifnet* if_alloc_domain(u_char, int numa_domain); void if_attach(struct ifnet *); void if_dead(struct ifnet *); int if_delmulti(struct ifnet *, struct sockaddr *); diff --git a/freebsd/sys/net/if_vlan.c b/freebsd/sys/net/if_vlan.c index 893bb2cf..10a8a3bf 100644 --- a/freebsd/sys/net/if_vlan.c +++ b/freebsd/sys/net/if_vlan.c @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_kern_tls.h> #include <rtems/bsd/local/opt_vlan.h> #include <rtems/bsd/local/opt_ratelimit.h> @@ -105,6 +106,20 @@ struct ifvlantrunk { int refcnt; }; +#if defined(KERN_TLS) || defined(RATELIMIT) +struct vlan_snd_tag { + struct m_snd_tag com; + struct m_snd_tag *tag; +}; + +static inline struct vlan_snd_tag * +mst_to_vst(struct m_snd_tag *mst) +{ + + return (__containerof(mst, struct vlan_snd_tag, com)); +} +#endif + /* * This macro provides a facility to iterate over every vlan on a trunk with * the assumption that none will be added/removed during iteration. @@ -158,7 +173,7 @@ struct vlan_mc_entry { struct epoch_context mc_epoch_ctx; }; -struct ifvlan { +struct ifvlan { struct ifvlantrunk *ifv_trunk; struct ifnet *ifv_ifp; #define TRUNK(ifv) ((ifv)->ifv_trunk) @@ -166,28 +181,19 @@ struct ifvlan { void *ifv_cookie; int ifv_pflags; /* special flags we have set on parent */ int ifv_capenable; - struct ifv_linkmib { - int ifvm_encaplen; /* encapsulation length */ - int ifvm_mtufudge; /* MTU fudged by this much */ - int ifvm_mintu; /* min transmission unit */ - uint16_t ifvm_proto; /* encapsulation ethertype */ - uint16_t ifvm_tag; /* tag to apply on packets leaving if */ - uint16_t ifvm_vid; /* VLAN ID */ - uint8_t ifvm_pcp; /* Priority Code Point (PCP). */ - } ifv_mib; + int ifv_encaplen; /* encapsulation length */ + int ifv_mtufudge; /* MTU fudged by this much */ + int ifv_mintu; /* min transmission unit */ + uint16_t ifv_proto; /* encapsulation ethertype */ + uint16_t ifv_tag; /* tag to apply on packets leaving if */ + uint16_t ifv_vid; /* VLAN ID */ + uint8_t ifv_pcp; /* Priority Code Point (PCP). */ struct task lladdr_task; CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; #ifndef VLAN_ARRAY CK_SLIST_ENTRY(ifvlan) ifv_list; #endif }; -#define ifv_proto ifv_mib.ifvm_proto -#define ifv_tag ifv_mib.ifvm_tag -#define ifv_vid ifv_mib.ifvm_vid -#define ifv_pcp ifv_mib.ifvm_pcp -#define ifv_encaplen ifv_mib.ifvm_encaplen -#define ifv_mtufudge ifv_mib.ifvm_mtufudge -#define ifv_mintu ifv_mib.ifvm_mintu /* Special flags we should propagate to parent. */ static struct { @@ -235,10 +241,6 @@ static struct sx _VLAN_SX_ID; #define VLAN_LOCKING_DESTROY() \ sx_destroy(&_VLAN_SX_ID) -#define VLAN_RLOCK() NET_EPOCH_ENTER(); -#define VLAN_RUNLOCK() NET_EPOCH_EXIT(); -#define VLAN_RLOCK_ASSERT() MPASS(in_epoch(net_epoch_preempt)) - #define VLAN_SLOCK() sx_slock(&_VLAN_SX_ID) #define VLAN_SUNLOCK() sx_sunlock(&_VLAN_SX_ID) #define VLAN_XLOCK() sx_xlock(&_VLAN_SX_ID) @@ -254,11 +256,8 @@ static struct sx _VLAN_SX_ID; */ #define TRUNK_LOCK_INIT(trunk) mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF) #define TRUNK_LOCK_DESTROY(trunk) mtx_destroy(&(trunk)->lock) -#define TRUNK_RLOCK(trunk) NET_EPOCH_ENTER() #define TRUNK_WLOCK(trunk) mtx_lock(&(trunk)->lock) -#define TRUNK_RUNLOCK(trunk) NET_EPOCH_EXIT(); #define TRUNK_WUNLOCK(trunk) mtx_unlock(&(trunk)->lock) -#define TRUNK_RLOCK_ASSERT(trunk) MPASS(in_epoch(net_epoch_preempt)) #define TRUNK_LOCK_ASSERT(trunk) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(trunk)->lock)) #define TRUNK_WLOCK_ASSERT(trunk) mtx_assert(&(trunk)->lock, MA_OWNED); @@ -282,9 +281,14 @@ static void trunk_destroy(struct ifvlantrunk *trunk); static void vlan_init(void *foo); static void vlan_input(struct ifnet *ifp, struct mbuf *m); static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); -#ifdef RATELIMIT +#if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); +static int vlan_snd_tag_modify(struct m_snd_tag *, + union if_snd_tag_modify_params *); +static int vlan_snd_tag_query(struct m_snd_tag *, + union if_snd_tag_query_params *); +static void vlan_snd_tag_free(struct m_snd_tag *); #endif static void vlan_qflush(struct ifnet *ifp); static int vlan_setflag(struct ifnet *ifp, int flag, int status, @@ -292,6 +296,8 @@ static int vlan_setflag(struct ifnet *ifp, int flag, int status, static int vlan_setflags(struct ifnet *ifp, int status); static int vlan_setmulti(struct ifnet *ifp); static int vlan_transmit(struct ifnet *ifp, struct mbuf *m); +static int vlan_output(struct ifnet *ifp, struct mbuf *m, + const struct sockaddr *dst, struct route *ro); static void vlan_unconfig(struct ifnet *ifp); static void vlan_unconfig_locked(struct ifnet *ifp, int departing); static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag); @@ -474,7 +480,7 @@ vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { struct ifvlan *ifv; - TRUNK_RLOCK_ASSERT(trunk); + NET_EPOCH_ASSERT(); CK_SLIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) if (ifv->ifv_vid == vid) @@ -619,16 +625,17 @@ vlan_setmulti(struct ifnet *ifp) static void vlan_iflladdr(void *arg __unused, struct ifnet *ifp) { + struct epoch_tracker et; struct ifvlan *ifv; struct ifnet *ifv_ifp; struct ifvlantrunk *trunk; struct sockaddr_dl *sdl; - /* Need the rmlock since this is run on taskqueue_swi. */ - VLAN_RLOCK(); + /* Need the epoch since this is run on taskqueue_swi. */ + NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return; } @@ -654,7 +661,7 @@ vlan_iflladdr(void *arg __unused, struct ifnet *ifp) taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task); } TRUNK_WUNLOCK(trunk); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); } /* @@ -700,17 +707,18 @@ vlan_ifdetach(void *arg __unused, struct ifnet *ifp) static struct ifnet * vlan_trunkdev(struct ifnet *ifp) { + struct epoch_tracker et; struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (NULL); - VLAN_RLOCK(); + NET_EPOCH_ENTER(et); ifv = ifp->if_softc; ifp = NULL; if (ifv->ifv_trunk) ifp = PARENT(ifv); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return (ifp); } @@ -782,20 +790,21 @@ vlan_setcookie(struct ifnet *ifp, void *cookie) static struct ifnet * vlan_devat(struct ifnet *ifp, uint16_t vid) { + struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; - VLAN_RLOCK(); + NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return (NULL); } ifp = NULL; ifv = vlan_gethash(trunk, vid); if (ifv) ifp = ifv->ifv_ifp; - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return (ifp); } @@ -1055,17 +1064,16 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = vlanname; ifp->if_dunit = unit; - /* NB: flags are not set here */ - ifp->if_linkmib = &ifv->ifv_mib; - ifp->if_linkmiblen = sizeof(ifv->ifv_mib); - /* NB: mtu is not set here */ ifp->if_init = vlan_init; ifp->if_transmit = vlan_transmit; ifp->if_qflush = vlan_qflush; ifp->if_ioctl = vlan_ioctl; -#ifdef RATELIMIT +#if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = vlan_snd_tag_alloc; + ifp->if_snd_tag_modify = vlan_snd_tag_modify; + ifp->if_snd_tag_query = vlan_snd_tag_query; + ifp->if_snd_tag_free = vlan_snd_tag_free; #endif ifp->if_flags = VLAN_IFFLAGS; ether_ifattach(ifp, eaddr); @@ -1135,15 +1143,16 @@ vlan_init(void *foo __unused) static int vlan_transmit(struct ifnet *ifp, struct mbuf *m) { + struct epoch_tracker et; struct ifvlan *ifv; struct ifnet *p; int error, len, mcast; - VLAN_RLOCK(); + NET_EPOCH_ENTER(et); ifv = ifp->if_softc; if (TRUNK(ifv) == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); m_freem(m); return (ENETDOWN); } @@ -1153,20 +1162,40 @@ vlan_transmit(struct ifnet *ifp, struct mbuf *m) BPF_MTAP(ifp, m); +#if defined(KERN_TLS) || defined(RATELIMIT) + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { + struct vlan_snd_tag *vst; + struct m_snd_tag *mst; + + MPASS(m->m_pkthdr.snd_tag->ifp == ifp); + mst = m->m_pkthdr.snd_tag; + vst = mst_to_vst(mst); + if (vst->tag->ifp != p) { + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + NET_EPOCH_EXIT(et); + m_freem(m); + return (EAGAIN); + } + + m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag); + m_snd_tag_rele(mst); + } +#endif + /* * Do not run parent's if_transmit() if the parent is not up, * or parent's driver will cause a system crash. */ if (!UP_AND_RUNNING(p)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); m_freem(m); return (ENETDOWN); } if (!ether_8021q_frame(&m, ifp, p, ifv->ifv_vid, ifv->ifv_pcp)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return (0); } @@ -1180,10 +1209,31 @@ vlan_transmit(struct ifnet *ifp, struct mbuf *m) if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return (error); } +static int +vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, + struct route *ro) +{ + struct epoch_tracker et; + struct ifvlan *ifv; + struct ifnet *p; + + NET_EPOCH_ENTER(et); + ifv = ifp->if_softc; + if (TRUNK(ifv) == NULL) { + NET_EPOCH_EXIT(et); + m_freem(m); + return (ENETDOWN); + } + p = PARENT(ifv); + NET_EPOCH_EXIT(et); + return p->if_output(ifp, m, dst, ro); +} + + /* * The ifp->if_qflush entry point for vlan(4) is a no-op. */ @@ -1195,15 +1245,16 @@ vlan_qflush(struct ifnet *ifp __unused) static void vlan_input(struct ifnet *ifp, struct mbuf *m) { + struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; struct m_tag *mtag; uint16_t vid, tag; - VLAN_RLOCK(); + NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); m_freem(m); return; } @@ -1226,7 +1277,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { if_printf(ifp, "cannot pullup VLAN header\n"); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return; } evl = mtod(m, struct ether_vlan_header *); @@ -1249,7 +1300,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) __func__, ifp->if_xname, ifp->if_type); #endif if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); m_freem(m); return; } @@ -1259,7 +1310,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) ifv = vlan_gethash(trunk, vid); if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; @@ -1279,7 +1330,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) sizeof(uint8_t), M_NOWAIT); if (mtag == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); m_freem(m); return; } @@ -1290,7 +1341,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) m->m_pkthdr.rcvif = ifv->ifv_ifp; if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); /* Pass it back through the parent's input routine. */ (*ifv->ifv_ifp->if_input)(ifv->ifv_ifp, m); @@ -1316,6 +1367,7 @@ vlan_lladdr_fn(void *arg, int pending __unused) static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) { + struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifnet *ifp; int error = 0; @@ -1397,7 +1449,6 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) */ ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge; ifp->if_baudrate = p->if_baudrate; - ifp->if_output = p->if_output; ifp->if_input = p->if_input; ifp->if_resolvemulti = p->if_resolvemulti; ifp->if_addrlen = p->if_addrlen; @@ -1405,6 +1456,12 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) ifp->if_pcp = ifv->ifv_pcp; /* + * We wrap the parent's if_output using vlan_output to ensure that it + * can't become stale. + */ + ifp->if_output = vlan_output; + + /* * Copy only a selected subset of flags from the parent. * Other flags are none of our business. */ @@ -1415,9 +1472,9 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) ifp->if_link_state = p->if_link_state; - TRUNK_RLOCK(TRUNK(ifv)); + NET_EPOCH_ENTER(et); vlan_capabilities(ifv); - TRUNK_RUNLOCK(TRUNK(ifv)); + NET_EPOCH_EXIT(et); /* * Set up our interface address to reflect the underlying @@ -1589,14 +1646,15 @@ vlan_setflags(struct ifnet *ifp, int status) static void vlan_link_state(struct ifnet *ifp) { + struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; /* Called from a taskqueue_swi task, so we cannot sleep. */ - VLAN_RLOCK(); + NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); return; } @@ -1607,7 +1665,7 @@ vlan_link_state(struct ifnet *ifp) trunk->parent->if_link_state); } TRUNK_WUNLOCK(trunk); - VLAN_RUNLOCK(); + NET_EPOCH_EXIT(et); } static void @@ -1620,7 +1678,7 @@ vlan_capabilities(struct ifvlan *ifv) u_long hwa = 0; VLAN_SXLOCK_ASSERT(); - TRUNK_RLOCK_ASSERT(TRUNK(ifv)); + NET_EPOCH_ASSERT(); p = PARENT(ifv); ifp = ifv->ifv_ifp; @@ -1704,6 +1762,30 @@ vlan_capabilities(struct ifvlan *ifv) ena |= (mena & IFCAP_TXRTLMT); #endif + /* + * If the parent interface supports unmapped mbufs, so does + * the VLAN interface. Note that this should be fine even for + * interfaces that don't support hardware tagging as headers + * are prepended in normal mbufs to unmapped mbufs holding + * payload data. + */ + cap |= (p->if_capabilities & IFCAP_NOMAP); + ena |= (mena & IFCAP_NOMAP); + + /* + * If the parent interface can offload encryption and segmentation + * of TLS records over TCP, propagate it's capability to the VLAN + * interface. + * + * All TLS drivers in the tree today can deal with VLANs. If + * this ever changes, then a new IFCAP_VLAN_TXTLS can be + * defined. + */ + if (p->if_capabilities & IFCAP_TXTLS) + cap |= p->if_capabilities & IFCAP_TXTLS; + if (p->if_capenable & IFCAP_TXTLS) + ena |= mena & IFCAP_TXTLS; + ifp->if_capabilities = cap; ifp->if_capenable = ena; ifp->if_hwassist = hwa; @@ -1712,6 +1794,7 @@ vlan_capabilities(struct ifvlan *ifv) static void vlan_trunk_capabilities(struct ifnet *ifp) { + struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; @@ -1721,11 +1804,11 @@ vlan_trunk_capabilities(struct ifnet *ifp) VLAN_SUNLOCK(); return; } - TRUNK_RLOCK(trunk); + NET_EPOCH_ENTER(et); VLAN_FOREACH(ifv, trunk) { vlan_capabilities(ifv); } - TRUNK_RUNLOCK(trunk); + NET_EPOCH_EXIT(et); VLAN_SUNLOCK(); } @@ -1917,9 +2000,11 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) ifv->ifv_capenable = ifr->ifr_reqcap; trunk = TRUNK(ifv); if (trunk != NULL) { - TRUNK_RLOCK(trunk); + struct epoch_tracker et; + + NET_EPOCH_ENTER(et); vlan_capabilities(ifv); - TRUNK_RUNLOCK(trunk); + NET_EPOCH_EXIT(et); } VLAN_SUNLOCK(); break; @@ -1932,18 +2017,77 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) return (error); } -#ifdef RATELIMIT +#if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { + struct epoch_tracker et; + struct vlan_snd_tag *vst; + struct ifvlan *ifv; + struct ifnet *parent; + int error; - /* get trunk device */ - ifp = vlan_trunkdev(ifp); - if (ifp == NULL || (ifp->if_capenable & IFCAP_TXRTLMT) == 0) + NET_EPOCH_ENTER(et); + ifv = ifp->if_softc; + if (ifv->ifv_trunk != NULL) + parent = PARENT(ifv); + else + parent = NULL; + if (parent == NULL || parent->if_snd_tag_alloc == NULL) { + NET_EPOCH_EXIT(et); return (EOPNOTSUPP); - /* forward allocation request */ - return (ifp->if_snd_tag_alloc(ifp, params, ppmt)); + } + if_ref(parent); + NET_EPOCH_EXIT(et); + + vst = malloc(sizeof(*vst), M_VLAN, M_NOWAIT); + if (vst == NULL) { + if_rele(parent); + return (ENOMEM); + } + + error = parent->if_snd_tag_alloc(parent, params, &vst->tag); + if_rele(parent); + if (error) { + free(vst, M_VLAN); + return (error); + } + + m_snd_tag_init(&vst->com, ifp); + + *ppmt = &vst->com; + return (0); +} + +static int +vlan_snd_tag_modify(struct m_snd_tag *mst, + union if_snd_tag_modify_params *params) +{ + struct vlan_snd_tag *vst; + + vst = mst_to_vst(mst); + return (vst->tag->ifp->if_snd_tag_modify(vst->tag, params)); +} + +static int +vlan_snd_tag_query(struct m_snd_tag *mst, + union if_snd_tag_query_params *params) +{ + struct vlan_snd_tag *vst; + + vst = mst_to_vst(mst); + return (vst->tag->ifp->if_snd_tag_query(vst->tag, params)); +} + +static void +vlan_snd_tag_free(struct m_snd_tag *mst) +{ + struct vlan_snd_tag *vst; + + vst = mst_to_vst(mst); + m_snd_tag_rele(vst->tag); + free(vst, M_VLAN); } #endif diff --git a/freebsd/sys/net/if_vlan_var.h b/freebsd/sys/net/if_vlan_var.h index 0b66ec0a..28b0fa73 100644 --- a/freebsd/sys/net/if_vlan_var.h +++ b/freebsd/sys/net/if_vlan_var.h @@ -150,13 +150,13 @@ extern int (*vlan_pcp_p)(struct ifnet *, uint16_t *); extern int (*vlan_setcookie_p)(struct ifnet *, void *); extern void *(*vlan_cookie_p)(struct ifnet *); -#ifdef _SYS_EVENTHANDLER_H_ +#include <sys/_eventhandler.h> + /* VLAN state change events */ typedef void (*vlan_config_fn)(void *, struct ifnet *, uint16_t); typedef void (*vlan_unconfig_fn)(void *, struct ifnet *, uint16_t); EVENTHANDLER_DECLARE(vlan_config, vlan_config_fn); EVENTHANDLER_DECLARE(vlan_unconfig, vlan_unconfig_fn); -#endif /* _SYS_EVENTHANDLER_H_ */ #endif /* _KERNEL */ diff --git a/freebsd/sys/net/iflib.h b/freebsd/sys/net/iflib.h index 8c2be41b..cda00c4c 100644 --- a/freebsd/sys/net/iflib.h +++ b/freebsd/sys/net/iflib.h @@ -69,6 +69,9 @@ typedef struct if_rxd_frag { uint16_t irf_len; } *if_rxd_frag_t; +/* bnxt supports 64 with hardware LRO enabled */ +#define IFLIB_MAX_RX_SEGS 64 + typedef struct if_rxd_info { /* set by iflib */ uint16_t iri_qsidx; /* qset index */ @@ -76,7 +79,7 @@ typedef struct if_rxd_info { /* XXX redundant with the new irf_len field */ uint16_t iri_len; /* packet length */ qidx_t iri_cidx; /* consumer index of cq */ - struct ifnet *iri_ifp; /* some drivers >1 interface per softc */ + if_t iri_ifp; /* driver may have >1 iface per softc */ /* updated by driver */ if_rxd_frag_t iri_frags; @@ -129,12 +132,12 @@ typedef struct if_pkt_info { uint8_t ipi_mflags; /* packet mbuf flags */ uint32_t ipi_tcp_seq; /* tcp seqno */ - uint32_t ipi_tcp_sum; /* tcp csum */ + uint32_t __spare0__; } *if_pkt_info_t; typedef struct if_irq { struct resource *ii_res; - int ii_rid; + int __spare0__; void *ii_tag; } *if_irq_t; @@ -163,7 +166,7 @@ typedef struct pci_vendor_info { uint32_t pvi_subdevice_id; uint32_t pvi_rev_id; uint32_t pvi_class_mask; - caddr_t pvi_name; + const char *pvi_name; } pci_vendor_info_t; #define PVID(vendor, devid, name) {vendor, devid, 0, 0, 0, 0, name} @@ -191,9 +194,8 @@ typedef struct if_softc_ctx { int isc_vectors; int isc_nrxqsets; int isc_ntxqsets; - uint8_t isc_min_tx_latency; /* disable doorbell update batching */ - uint8_t isc_rx_mvec_enable; /* generate mvecs on rx */ - uint32_t isc_txrx_budget_bytes_max; + uint16_t __spare0__; + uint32_t __spare1__; int isc_msix_bar; /* can be model specific - initialize in attach_pre */ int isc_tx_nsegments; /* can be model specific - initialize in attach_pre */ int isc_ntxd[8]; @@ -215,16 +217,23 @@ typedef struct if_softc_ctx { int isc_rss_table_mask; int isc_nrxqsets_max; int isc_ntxqsets_max; - uint32_t isc_tx_qdepth; + uint32_t __spare2__; iflib_intr_mode_t isc_intr; uint16_t isc_max_frame_size; /* set at init time by driver */ uint16_t isc_min_frame_size; /* set at init time by driver, only used if IFLIB_NEED_ETHER_PAD is set. */ uint32_t isc_pause_frames; /* set by driver for iflib_timer to detect */ - pci_vendor_info_t isc_vendor_info; /* set by iflib prior to attach_pre */ + uint32_t __spare3__; + uint32_t __spare4__; + uint32_t __spare5__; + uint32_t __spare6__; + uint32_t __spare7__; + uint32_t __spare8__; + caddr_t __spare9__; int isc_disable_msix; if_txrx_t isc_txrx; + struct ifmedia *isc_media; } *if_softc_ctx_t; /* @@ -244,8 +253,8 @@ struct if_shared_ctx { int isc_admin_intrcnt; /* # of admin/link interrupts */ /* fields necessary for probe */ - pci_vendor_info_t *isc_vendor_info; - char *isc_driver_version; + const pci_vendor_info_t *isc_vendor_info; + const char *isc_driver_version; /* optional function to transform the read values to match the table*/ void (*isc_parse_devinfo) (uint16_t *device_id, uint16_t *subvendor_id, uint16_t *subdevice_id, uint16_t *rev_id); @@ -260,7 +269,7 @@ struct if_shared_ctx { int isc_nfl __aligned(CACHE_LINE_SIZE); int isc_ntxqs; /* # of tx queues per tx qset - usually 1 */ int isc_nrxqs; /* # of rx queues per rx qset - intel 1, chelsio 2, broadcom 3 */ - int isc_rx_process_limit; + int __spare0__; int isc_tx_reclaim_thresh; int isc_flags; const char *isc_name; @@ -284,11 +293,6 @@ typedef enum { IFLIB_INTR_IOV, } iflib_intr_type_t; -#ifndef ETH_ADDR_LEN -#define ETH_ADDR_LEN 6 -#endif - - /* * Interface has a separate command queue for RX */ @@ -358,7 +362,10 @@ typedef enum { * Interface needs admin task to ignore interface up/down status */ #define IFLIB_ADMIN_ALWAYS_RUN 0x10000 - +/* + * Driver will pass the media + */ +#define IFLIB_DRIVER_MEDIA 0x20000 /* * field accessors @@ -378,6 +385,8 @@ void iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN]); void iflib_request_reset(if_ctx_t ctx); uint8_t iflib_in_detach(if_ctx_t ctx); +uint32_t iflib_get_rx_mbuf_sz(if_ctx_t ctx); + /* * If the driver can plug cleanly in to newbus use these */ @@ -388,6 +397,12 @@ int iflib_device_suspend(device_t); int iflib_device_resume(device_t); int iflib_device_shutdown(device_t); +/* + * Use this instead of iflib_device_probe if the driver should report + * BUS_PROBE_VENDOR instead of BUS_PROBE_DEFAULT. (For example, an out-of-tree + * driver based on iflib). + */ +int iflib_device_probe_vendor(device_t); int iflib_device_iov_init(device_t, uint16_t, const nvlist_t *); void iflib_device_iov_uninit(device_t); @@ -400,8 +415,6 @@ int iflib_device_iov_add_vf(device_t, uint16_t, const nvlist_t *); int iflib_device_register(device_t dev, void *softc, if_shared_ctx_t sctx, if_ctx_t *ctxp); int iflib_device_deregister(if_ctx_t); - - int iflib_irq_alloc(if_ctx_t, if_irq_t, int, driver_filter_t, void *filter_arg, driver_intr_t, void *arg, const char *name); int iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid, iflib_intr_type_t type, driver_filter_t *filter, @@ -410,33 +423,28 @@ void iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t t void iflib_irq_free(if_ctx_t ctx, if_irq_t irq); -void iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name); +void iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, + const char *name); void iflib_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn, const char *name); - void iflib_config_gtask_deinit(struct grouptask *gtask); - - void iflib_tx_intr_deferred(if_ctx_t ctx, int txqid); void iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid); void iflib_admin_intr_deferred(if_ctx_t ctx); void iflib_iov_intr_deferred(if_ctx_t ctx); - void iflib_link_state_change(if_ctx_t ctx, int linkstate, uint64_t baudrate); int iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags); +int iflib_dma_alloc_align(if_ctx_t ctx, int size, int align, iflib_dma_info_t dma, int mapflags); void iflib_dma_free(iflib_dma_info_t dma); - int iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count); void iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count); - struct sx *iflib_ctx_lock_get(if_ctx_t); -struct mtx *iflib_qset_lock_get(if_ctx_t, uint16_t); void iflib_led_create(if_ctx_t ctx); @@ -448,4 +456,5 @@ void iflib_add_int_delay_sysctl(if_ctx_t, const char *, const char *, */ if_pseudo_t iflib_clone_register(if_shared_ctx_t); void iflib_clone_deregister(if_pseudo_t); + #endif /* __IFLIB_H_ */ diff --git a/freebsd/sys/net/netisr.c b/freebsd/sys/net/netisr.c index a3da964b..0f7c4800 100644 --- a/freebsd/sys/net/netisr.c +++ b/freebsd/sys/net/netisr.c @@ -868,6 +868,7 @@ netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy, ("%s: invalid policy %u for %s", __func__, npp->np_policy, npp->np_name)); + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); ifp = m->m_pkthdr.rcvif; if (ifp != NULL) *cpuidp = nws_array[(ifp->if_index + source) % nws_count]; diff --git a/freebsd/sys/net/pfil.c b/freebsd/sys/net/pfil.c index 65af515f..1dea915d 100644 --- a/freebsd/sys/net/pfil.c +++ b/freebsd/sys/net/pfil.c @@ -6,6 +6,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * + * Copyright (c) 2019 Gleb Smirnoff <glebius@FreeBSD.org> * Copyright (c) 1996 Matthew R. Green * All rights reserved. * @@ -34,445 +35,650 @@ */ #include <sys/param.h> +#include <sys/conf.h> #include <sys/kernel.h> +#include <sys/epoch.h> #include <sys/errno.h> #include <sys/lock.h> #include <sys/malloc.h> -#include <sys/rmlock.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/systm.h> -#include <sys/condvar.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/proc.h> #include <sys/queue.h> +#include <sys/ucred.h> +#include <sys/jail.h> #include <net/if.h> #include <net/if_var.h> #include <net/pfil.h> -static struct mtx pfil_global_lock; - -MTX_SYSINIT(pfil_heads_lock, &pfil_global_lock, "pfil_head_list lock", - MTX_DEF); - -static struct packet_filter_hook *pfil_chain_get(int, struct pfil_head *); -static int pfil_chain_add(pfil_chain_t *, struct packet_filter_hook *, int); -static int pfil_chain_remove(pfil_chain_t *, void *, void *); -static int pfil_add_hook_priv(void *, void *, int, struct pfil_head *, bool); +static MALLOC_DEFINE(M_PFIL, "pfil", "pfil(9) packet filter hooks"); + +static int pfil_ioctl(struct cdev *, u_long, caddr_t, int, struct thread *); +static struct cdevsw pfil_cdevsw = { + .d_ioctl = pfil_ioctl, + .d_name = PFILDEV, + .d_version = D_VERSION, +}; +static struct cdev *pfil_dev; + +static struct mtx pfil_lock; +MTX_SYSINIT(pfil_mtxinit, &pfil_lock, "pfil(9) lock", MTX_DEF); +#define PFIL_LOCK() mtx_lock(&pfil_lock) +#define PFIL_UNLOCK() mtx_unlock(&pfil_lock) +#define PFIL_LOCK_ASSERT() mtx_assert(&pfil_lock, MA_OWNED) + +#define PFIL_EPOCH net_epoch_preempt +#define PFIL_EPOCH_ENTER(et) epoch_enter_preempt(net_epoch_preempt, &(et)) +#define PFIL_EPOCH_EXIT(et) epoch_exit_preempt(net_epoch_preempt, &(et)) + +struct pfil_hook { + pfil_func_t hook_func; + void *hook_ruleset; + int hook_flags; + int hook_links; + enum pfil_types hook_type; + const char *hook_modname; + const char *hook_rulname; + LIST_ENTRY(pfil_hook) hook_list; +}; + +struct pfil_link { + CK_STAILQ_ENTRY(pfil_link) link_chain; + pfil_func_t link_func; + void *link_ruleset; + int link_flags; + struct pfil_hook *link_hook; + struct epoch_context link_epoch_ctx; +}; + +typedef CK_STAILQ_HEAD(pfil_chain, pfil_link) pfil_chain_t; +struct pfil_head { + int head_nhooksin; + int head_nhooksout; + pfil_chain_t head_in; + pfil_chain_t head_out; + int head_flags; + enum pfil_types head_type; + LIST_ENTRY(pfil_head) head_list; + const char *head_name; +}; LIST_HEAD(pfilheadhead, pfil_head); -VNET_DEFINE(struct pfilheadhead, pfil_head_list); +VNET_DEFINE_STATIC(struct pfilheadhead, pfil_head_list) = + LIST_HEAD_INITIALIZER(pfil_head_list); #define V_pfil_head_list VNET(pfil_head_list) -VNET_DEFINE(struct rmlock, pfil_lock); - -#define PFIL_LOCK_INIT_REAL(l, t) \ - rm_init_flags(l, "PFil " t " rmlock", RM_RECURSE) -#define PFIL_LOCK_DESTROY_REAL(l) \ - rm_destroy(l) -#define PFIL_LOCK_INIT(p) do { \ - if ((p)->flags & PFIL_FLAG_PRIVATE_LOCK) { \ - PFIL_LOCK_INIT_REAL(&(p)->ph_lock, "private"); \ - (p)->ph_plock = &(p)->ph_lock; \ - } else \ - (p)->ph_plock = &V_pfil_lock; \ -} while (0) -#define PFIL_LOCK_DESTROY(p) do { \ - if ((p)->flags & PFIL_FLAG_PRIVATE_LOCK) \ - PFIL_LOCK_DESTROY_REAL((p)->ph_plock); \ -} while (0) - -#define PFIL_TRY_RLOCK(p, t) rm_try_rlock((p)->ph_plock, (t)) -#define PFIL_RLOCK(p, t) rm_rlock((p)->ph_plock, (t)) -#define PFIL_WLOCK(p) rm_wlock((p)->ph_plock) -#define PFIL_RUNLOCK(p, t) rm_runlock((p)->ph_plock, (t)) -#define PFIL_WUNLOCK(p) rm_wunlock((p)->ph_plock) -#define PFIL_WOWNED(p) rm_wowned((p)->ph_plock) - -#define PFIL_HEADLIST_LOCK() mtx_lock(&pfil_global_lock) -#define PFIL_HEADLIST_UNLOCK() mtx_unlock(&pfil_global_lock) -/* - * pfil_run_hooks() runs the specified packet filter hook chain. - */ +LIST_HEAD(pfilhookhead, pfil_hook); +VNET_DEFINE_STATIC(struct pfilhookhead, pfil_hook_list) = + LIST_HEAD_INITIALIZER(pfil_hook_list); +#define V_pfil_hook_list VNET(pfil_hook_list) + +static struct pfil_link *pfil_link_remove(pfil_chain_t *, pfil_hook_t ); +static void pfil_link_free(epoch_context_t); + int -pfil_run_hooks(struct pfil_head *ph, struct mbuf **mp, struct ifnet *ifp, - int dir, int flags, struct inpcb *inp) +pfil_realloc(pfil_packet_t *p, int flags, struct ifnet *ifp) { - struct rm_priotracker rmpt; - struct packet_filter_hook *pfh; - struct mbuf *m = *mp; - int rv = 0; - - PFIL_RLOCK(ph, &rmpt); - KASSERT(ph->ph_nhooks >= 0, ("Pfil hook count dropped < 0")); - for (pfh = pfil_chain_get(dir, ph); pfh != NULL; - pfh = TAILQ_NEXT(pfh, pfil_chain)) { - if (pfh->pfil_func_flags != NULL) { - rv = (*pfh->pfil_func_flags)(pfh->pfil_arg, &m, ifp, - dir, flags, inp); - if (rv != 0 || m == NULL) - break; - } - if (pfh->pfil_func != NULL) { - rv = (*pfh->pfil_func)(pfh->pfil_arg, &m, ifp, dir, - inp); - if (rv != 0 || m == NULL) - break; - } - } - PFIL_RUNLOCK(ph, &rmpt); - *mp = m; - return (rv); + struct mbuf *m; + + MPASS(flags & PFIL_MEMPTR); + + if ((m = m_devget(p->mem, PFIL_LENGTH(flags), 0, ifp, NULL)) == NULL) + return (ENOMEM); + *p = pfil_packet_align(*p); + *p->m = m; + + return (0); } -static struct packet_filter_hook * -pfil_chain_get(int dir, struct pfil_head *ph) +static __noinline int +pfil_fake_mbuf(pfil_func_t func, pfil_packet_t *p, struct ifnet *ifp, int flags, + void *ruleset, struct inpcb *inp) { + struct mbuf m, *mp; + pfil_return_t rv; + + (void)m_init(&m, M_NOWAIT, MT_DATA, M_NOFREE | M_PKTHDR); + m_extadd(&m, p->mem, PFIL_LENGTH(flags), NULL, NULL, NULL, 0, + EXT_RXRING); + m.m_len = m.m_pkthdr.len = PFIL_LENGTH(flags); + mp = &m; + flags &= ~(PFIL_MEMPTR | PFIL_LENMASK); + + rv = func(&mp, ifp, flags, ruleset, inp); + if (rv == PFIL_PASS && mp != &m) { + /* + * Firewalls that need pfil_fake_mbuf() most likely don't + * know they need return PFIL_REALLOCED. + */ + rv = PFIL_REALLOCED; + *p = pfil_packet_align(*p); + *p->m = mp; + } - if (dir == PFIL_IN) - return (TAILQ_FIRST(&ph->ph_in)); - else if (dir == PFIL_OUT) - return (TAILQ_FIRST(&ph->ph_out)); - else - return (NULL); + return (rv); } -#ifndef __rtems__ /* - * pfil_try_rlock() acquires rm reader lock for specified head - * if this is immediately possible. + * pfil_run_hooks() runs the specified packet filter hook chain. */ int -pfil_try_rlock(struct pfil_head *ph, struct rm_priotracker *tracker) +pfil_run_hooks(struct pfil_head *head, pfil_packet_t p, struct ifnet *ifp, + int flags, struct inpcb *inp) { - - return (PFIL_TRY_RLOCK(ph, tracker)); + struct epoch_tracker et; + pfil_chain_t *pch; + struct pfil_link *link; + pfil_return_t rv; + bool realloc = false; + + if (PFIL_DIR(flags) == PFIL_IN) + pch = &head->head_in; + else if (__predict_true(PFIL_DIR(flags) == PFIL_OUT)) + pch = &head->head_out; + else + panic("%s: bogus flags %d", __func__, flags); + + rv = PFIL_PASS; + PFIL_EPOCH_ENTER(et); + CK_STAILQ_FOREACH(link, pch, link_chain) { + if ((flags & PFIL_MEMPTR) && !(link->link_flags & PFIL_MEMPTR)) + rv = pfil_fake_mbuf(link->link_func, &p, ifp, flags, + link->link_ruleset, inp); + else + rv = (*link->link_func)(p, ifp, flags, + link->link_ruleset, inp); + if (rv == PFIL_DROPPED || rv == PFIL_CONSUMED) + break; + else if (rv == PFIL_REALLOCED) { + flags &= ~(PFIL_MEMPTR | PFIL_LENMASK); + realloc = true; + } + } + PFIL_EPOCH_EXIT(et); + if (realloc && rv == PFIL_PASS) + rv = PFIL_REALLOCED; + return (rv); } -#endif /* __rtems__ */ /* - * pfil_rlock() acquires rm reader lock for specified head. + * pfil_head_register() registers a pfil_head with the packet filter hook + * mechanism. */ -void -pfil_rlock(struct pfil_head *ph, struct rm_priotracker *tracker) +pfil_head_t +pfil_head_register(struct pfil_head_args *pa) { + struct pfil_head *head, *list; - PFIL_RLOCK(ph, tracker); -} + MPASS(pa->pa_version == PFIL_VERSION); -/* - * pfil_runlock() releases reader lock for specified head. - */ -void -pfil_runlock(struct pfil_head *ph, struct rm_priotracker *tracker) -{ + head = malloc(sizeof(struct pfil_head), M_PFIL, M_WAITOK); + + head->head_nhooksin = head->head_nhooksout = 0; + head->head_flags = pa->pa_flags; + head->head_type = pa->pa_type; + head->head_name = pa->pa_headname; + CK_STAILQ_INIT(&head->head_in); + CK_STAILQ_INIT(&head->head_out); + + PFIL_LOCK(); + LIST_FOREACH(list, &V_pfil_head_list, head_list) + if (strcmp(pa->pa_headname, list->head_name) == 0) { + printf("pfil: duplicate head \"%s\"\n", + pa->pa_headname); + } + LIST_INSERT_HEAD(&V_pfil_head_list, head, head_list); + PFIL_UNLOCK(); - PFIL_RUNLOCK(ph, tracker); + return (head); } /* - * pfil_wlock() acquires writer lock for specified head. + * pfil_head_unregister() removes a pfil_head from the packet filter hook + * mechanism. The producer of the hook promises that all outstanding + * invocations of the hook have completed before it unregisters the hook. */ void -pfil_wlock(struct pfil_head *ph) +pfil_head_unregister(pfil_head_t ph) { + struct pfil_link *link, *next; + + PFIL_LOCK(); + LIST_REMOVE(ph, head_list); - PFIL_WLOCK(ph); + CK_STAILQ_FOREACH_SAFE(link, &ph->head_in, link_chain, next) { + link->link_hook->hook_links--; + free(link, M_PFIL); + } + CK_STAILQ_FOREACH_SAFE(link, &ph->head_out, link_chain, next) { + link->link_hook->hook_links--; + free(link, M_PFIL); + } + PFIL_UNLOCK(); } -/* - * pfil_wunlock() releases writer lock for specified head. - */ -void -pfil_wunlock(struct pfil_head *ph) +pfil_hook_t +pfil_add_hook(struct pfil_hook_args *pa) { + struct pfil_hook *hook, *list; + + MPASS(pa->pa_version == PFIL_VERSION); + + hook = malloc(sizeof(struct pfil_hook), M_PFIL, M_WAITOK | M_ZERO); + hook->hook_func = pa->pa_func; + hook->hook_ruleset = pa->pa_ruleset; + hook->hook_flags = pa->pa_flags; + hook->hook_type = pa->pa_type; + hook->hook_modname = pa->pa_modname; + hook->hook_rulname = pa->pa_rulname; + + PFIL_LOCK(); + LIST_FOREACH(list, &V_pfil_hook_list, hook_list) + if (strcmp(pa->pa_modname, list->hook_modname) == 0 && + strcmp(pa->pa_rulname, list->hook_rulname) == 0) { + printf("pfil: duplicate hook \"%s:%s\"\n", + pa->pa_modname, pa->pa_rulname); + } + LIST_INSERT_HEAD(&V_pfil_hook_list, hook, hook_list); + PFIL_UNLOCK(); - PFIL_WUNLOCK(ph); + return (hook); } -/* - * pfil_wowned() returns a non-zero value if the current thread owns - * an exclusive lock. - */ -int -pfil_wowned(struct pfil_head *ph) +static int +pfil_unlink(struct pfil_link_args *pa, pfil_head_t head, pfil_hook_t hook) { + struct pfil_link *in, *out; + + PFIL_LOCK_ASSERT(); - return (PFIL_WOWNED(ph)); + if (pa->pa_flags & PFIL_IN) { + in = pfil_link_remove(&head->head_in, hook); + if (in != NULL) { + head->head_nhooksin--; + hook->hook_links--; + } + } else + in = NULL; + if (pa->pa_flags & PFIL_OUT) { + out = pfil_link_remove(&head->head_out, hook); + if (out != NULL) { + head->head_nhooksout--; + hook->hook_links--; + } + } else + out = NULL; + PFIL_UNLOCK(); + + if (in != NULL) + epoch_call(PFIL_EPOCH, &in->link_epoch_ctx, pfil_link_free); + if (out != NULL) + epoch_call(PFIL_EPOCH, &out->link_epoch_ctx, pfil_link_free); + + if (in == NULL && out == NULL) + return (ENOENT); + else + return (0); } -/* - * pfil_head_register() registers a pfil_head with the packet filter hook - * mechanism. - */ int -pfil_head_register(struct pfil_head *ph) +pfil_link(struct pfil_link_args *pa) { - struct pfil_head *lph; - - PFIL_HEADLIST_LOCK(); - LIST_FOREACH(lph, &V_pfil_head_list, ph_list) { - if (ph->ph_type == lph->ph_type && - ph->ph_un.phu_val == lph->ph_un.phu_val) { - PFIL_HEADLIST_UNLOCK(); - return (EEXIST); - } + struct pfil_link *in, *out, *link; + struct pfil_head *head; + struct pfil_hook *hook; + int error; + + MPASS(pa->pa_version == PFIL_VERSION); + + if ((pa->pa_flags & (PFIL_IN | PFIL_UNLINK)) == PFIL_IN) + in = malloc(sizeof(*in), M_PFIL, M_WAITOK | M_ZERO); + else + in = NULL; + if ((pa->pa_flags & (PFIL_OUT | PFIL_UNLINK)) == PFIL_OUT) + out = malloc(sizeof(*out), M_PFIL, M_WAITOK | M_ZERO); + else + out = NULL; + + PFIL_LOCK(); + if (pa->pa_flags & PFIL_HEADPTR) + head = pa->pa_head; + else + LIST_FOREACH(head, &V_pfil_head_list, head_list) + if (strcmp(pa->pa_headname, head->head_name) == 0) + break; + if (pa->pa_flags & PFIL_HOOKPTR) + hook = pa->pa_hook; + else + LIST_FOREACH(hook, &V_pfil_hook_list, hook_list) + if (strcmp(pa->pa_modname, hook->hook_modname) == 0 && + strcmp(pa->pa_rulname, hook->hook_rulname) == 0) + break; + if (head == NULL || hook == NULL) { + error = ENOENT; + goto fail; + } + + if (pa->pa_flags & PFIL_UNLINK) + return (pfil_unlink(pa, head, hook)); + + if (head->head_type != hook->hook_type || + ((hook->hook_flags & pa->pa_flags) & ~head->head_flags)) { + error = EINVAL; + goto fail; + } + + if (pa->pa_flags & PFIL_IN) + CK_STAILQ_FOREACH(link, &head->head_in, link_chain) + if (link->link_hook == hook) { + error = EEXIST; + goto fail; + } + if (pa->pa_flags & PFIL_OUT) + CK_STAILQ_FOREACH(link, &head->head_out, link_chain) + if (link->link_hook == hook) { + error = EEXIST; + goto fail; + } + + if (pa->pa_flags & PFIL_IN) { + in->link_hook = hook; + in->link_func = hook->hook_func; + in->link_flags = hook->hook_flags; + in->link_ruleset = hook->hook_ruleset; + if (pa->pa_flags & PFIL_APPEND) + CK_STAILQ_INSERT_TAIL(&head->head_in, in, link_chain); + else + CK_STAILQ_INSERT_HEAD(&head->head_in, in, link_chain); + hook->hook_links++; + head->head_nhooksin++; + } + if (pa->pa_flags & PFIL_OUT) { + out->link_hook = hook; + out->link_func = hook->hook_func; + out->link_flags = hook->hook_flags; + out->link_ruleset = hook->hook_ruleset; + if (pa->pa_flags & PFIL_APPEND) + CK_STAILQ_INSERT_HEAD(&head->head_out, out, link_chain); + else + CK_STAILQ_INSERT_TAIL(&head->head_out, out, link_chain); + hook->hook_links++; + head->head_nhooksout++; } - PFIL_LOCK_INIT(ph); - ph->ph_nhooks = 0; - TAILQ_INIT(&ph->ph_in); - TAILQ_INIT(&ph->ph_out); - LIST_INSERT_HEAD(&V_pfil_head_list, ph, ph_list); - PFIL_HEADLIST_UNLOCK(); + PFIL_UNLOCK(); + return (0); + +fail: + PFIL_UNLOCK(); + free(in, M_PFIL); + free(out, M_PFIL); + return (error); +} + +static void +pfil_link_free(epoch_context_t ctx) +{ + struct pfil_link *link; + + link = __containerof(ctx, struct pfil_link, link_epoch_ctx); + free(link, M_PFIL); } /* - * pfil_head_unregister() removes a pfil_head from the packet filter hook - * mechanism. The producer of the hook promises that all outstanding - * invocations of the hook have completed before it unregisters the hook. + * pfil_remove_hook removes a filter from all filtering points. */ -int -pfil_head_unregister(struct pfil_head *ph) +void +pfil_remove_hook(pfil_hook_t hook) { - struct packet_filter_hook *pfh, *pfnext; - - PFIL_HEADLIST_LOCK(); - LIST_REMOVE(ph, ph_list); - PFIL_HEADLIST_UNLOCK(); - TAILQ_FOREACH_SAFE(pfh, &ph->ph_in, pfil_chain, pfnext) - free(pfh, M_IFADDR); - TAILQ_FOREACH_SAFE(pfh, &ph->ph_out, pfil_chain, pfnext) - free(pfh, M_IFADDR); - PFIL_LOCK_DESTROY(ph); - return (0); + struct pfil_head *head; + struct pfil_link *in, *out; + + PFIL_LOCK(); + LIST_FOREACH(head, &V_pfil_head_list, head_list) { +retry: + in = pfil_link_remove(&head->head_in, hook); + if (in != NULL) { + head->head_nhooksin--; + hook->hook_links--; + epoch_call(PFIL_EPOCH, &in->link_epoch_ctx, + pfil_link_free); + } + out = pfil_link_remove(&head->head_out, hook); + if (out != NULL) { + head->head_nhooksout--; + hook->hook_links--; + epoch_call(PFIL_EPOCH, &out->link_epoch_ctx, + pfil_link_free); + } + if (in != NULL || out != NULL) + /* What if some stupid admin put same filter twice? */ + goto retry; + } + LIST_REMOVE(hook, hook_list); + PFIL_UNLOCK(); + MPASS(hook->hook_links == 0); + free(hook, M_PFIL); } /* - * pfil_head_get() returns the pfil_head for a given key/dlt. + * Internal: Remove a pfil hook from a hook chain. */ -struct pfil_head * -pfil_head_get(int type, u_long val) +static struct pfil_link * +pfil_link_remove(pfil_chain_t *chain, pfil_hook_t hook) { - struct pfil_head *ph; + struct pfil_link *link; - PFIL_HEADLIST_LOCK(); - LIST_FOREACH(ph, &V_pfil_head_list, ph_list) - if (ph->ph_type == type && ph->ph_un.phu_val == val) - break; - PFIL_HEADLIST_UNLOCK(); - return (ph); + PFIL_LOCK_ASSERT(); + + CK_STAILQ_FOREACH(link, chain, link_chain) + if (link->link_hook == hook) { + CK_STAILQ_REMOVE(chain, link, pfil_link, link_chain); + return (link); + } + + return (NULL); } -/* - * pfil_add_hook_flags() adds a function to the packet filter hook. the - * flags are: - * PFIL_IN call me on incoming packets - * PFIL_OUT call me on outgoing packets - * PFIL_ALL call me on all of the above - * PFIL_WAITOK OK to call malloc with M_WAITOK. - */ -int -pfil_add_hook_flags(pfil_func_flags_t func, void *arg, int flags, - struct pfil_head *ph) +static void +pfil_init(const void *unused __unused) { - return (pfil_add_hook_priv(func, arg, flags, ph, true)); + struct make_dev_args args; + int error; + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_WAITOK | MAKEDEV_CHECKNAME; + args.mda_devsw = &pfil_cdevsw; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_WHEEL; + args.mda_mode = 0600; + error = make_dev_s(&args, &pfil_dev, PFILDEV); + KASSERT(error == 0, ("%s: failed to create dev: %d", __func__, error)); } +/* + * Make sure the pfil bits are first before any possible subsystem which + * might piggyback on the SI_SUB_PROTO_PFIL. + */ +SYSINIT(pfil_init, SI_SUB_PROTO_PFIL, SI_ORDER_FIRST, pfil_init, NULL); /* - * pfil_add_hook() adds a function to the packet filter hook. the - * flags are: - * PFIL_IN call me on incoming packets - * PFIL_OUT call me on outgoing packets - * PFIL_ALL call me on all of the above - * PFIL_WAITOK OK to call malloc with M_WAITOK. + * User control interface. */ -int -pfil_add_hook(pfil_func_t func, void *arg, int flags, struct pfil_head *ph) +static int pfilioc_listheads(struct pfilioc_list *); +static int pfilioc_listhooks(struct pfilioc_list *); +static int pfilioc_link(struct pfilioc_link *); + +static int +pfil_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, + struct thread *td) { - return (pfil_add_hook_priv(func, arg, flags, ph, false)); + int error; + + CURVNET_SET(TD_TO_VNET(td)); + error = 0; + switch (cmd) { + case PFILIOC_LISTHEADS: + error = pfilioc_listheads((struct pfilioc_list *)addr); + break; + case PFILIOC_LISTHOOKS: + error = pfilioc_listhooks((struct pfilioc_list *)addr); + break; + case PFILIOC_LINK: + error = pfilioc_link((struct pfilioc_link *)addr); + break; + default: + error = EINVAL; + break; + } + CURVNET_RESTORE(); + return (error); } static int -pfil_add_hook_priv(void *func, void *arg, int flags, - struct pfil_head *ph, bool hasflags) +pfilioc_listheads(struct pfilioc_list *req) { - struct packet_filter_hook *pfh1 = NULL; - struct packet_filter_hook *pfh2 = NULL; - int err; - - if (flags & PFIL_IN) { - pfh1 = (struct packet_filter_hook *)malloc(sizeof(*pfh1), - M_IFADDR, (flags & PFIL_WAITOK) ? M_WAITOK : M_NOWAIT); - if (pfh1 == NULL) { - err = ENOMEM; - goto error; - } - } - if (flags & PFIL_OUT) { - pfh2 = (struct packet_filter_hook *)malloc(sizeof(*pfh1), - M_IFADDR, (flags & PFIL_WAITOK) ? M_WAITOK : M_NOWAIT); - if (pfh2 == NULL) { - err = ENOMEM; - goto error; - } + struct pfil_head *head; + struct pfil_link *link; + struct pfilioc_head *iohead; + struct pfilioc_hook *iohook; + u_int nheads, nhooks, hd, hk; + int error; + + PFIL_LOCK(); +restart: + nheads = nhooks = 0; + LIST_FOREACH(head, &V_pfil_head_list, head_list) { + nheads++; + nhooks += head->head_nhooksin + head->head_nhooksout; } - PFIL_WLOCK(ph); - if (flags & PFIL_IN) { - pfh1->pfil_func_flags = hasflags ? func : NULL; - pfh1->pfil_func = hasflags ? NULL : func; - pfh1->pfil_arg = arg; - err = pfil_chain_add(&ph->ph_in, pfh1, flags & ~PFIL_OUT); - if (err) - goto locked_error; - ph->ph_nhooks++; + PFIL_UNLOCK(); + + if (req->pio_nheads < nheads || req->pio_nhooks < nhooks) { + req->pio_nheads = nheads; + req->pio_nhooks = nhooks; + return (0); } - if (flags & PFIL_OUT) { - pfh2->pfil_func_flags = hasflags ? func : NULL; - pfh2->pfil_func = hasflags ? NULL : func; - pfh2->pfil_arg = arg; - err = pfil_chain_add(&ph->ph_out, pfh2, flags & ~PFIL_IN); - if (err) { - if (flags & PFIL_IN) - pfil_chain_remove(&ph->ph_in, func, arg); - goto locked_error; + + iohead = malloc(sizeof(*iohead) * nheads, M_TEMP, M_WAITOK); + iohook = malloc(sizeof(*iohook) * nhooks, M_TEMP, M_WAITOK); + + hd = hk = 0; + PFIL_LOCK(); + LIST_FOREACH(head, &V_pfil_head_list, head_list) { + if (hd + 1 > nheads || + hk + head->head_nhooksin + head->head_nhooksout > nhooks) { + /* Configuration changed during malloc(). */ + free(iohead, M_TEMP); + free(iohook, M_TEMP); + goto restart; + } + strlcpy(iohead[hd].pio_name, head->head_name, + sizeof(iohead[0].pio_name)); + iohead[hd].pio_nhooksin = head->head_nhooksin; + iohead[hd].pio_nhooksout = head->head_nhooksout; + iohead[hd].pio_type = head->head_type; + CK_STAILQ_FOREACH(link, &head->head_in, link_chain) { + strlcpy(iohook[hk].pio_module, + link->link_hook->hook_modname, + sizeof(iohook[0].pio_module)); + strlcpy(iohook[hk].pio_ruleset, + link->link_hook->hook_rulname, + sizeof(iohook[0].pio_ruleset)); + hk++; } - ph->ph_nhooks++; + CK_STAILQ_FOREACH(link, &head->head_out, link_chain) { + strlcpy(iohook[hk].pio_module, + link->link_hook->hook_modname, + sizeof(iohook[0].pio_module)); + strlcpy(iohook[hk].pio_ruleset, + link->link_hook->hook_rulname, + sizeof(iohook[0].pio_ruleset)); + hk++; + } + hd++; } - PFIL_WUNLOCK(ph); - return (0); -locked_error: - PFIL_WUNLOCK(ph); -error: - if (pfh1 != NULL) - free(pfh1, M_IFADDR); - if (pfh2 != NULL) - free(pfh2, M_IFADDR); - return (err); -} + PFIL_UNLOCK(); -/* - * pfil_remove_hook_flags removes a specific function from the packet filter hook - * chain. - */ -int -pfil_remove_hook_flags(pfil_func_flags_t func, void *arg, int flags, - struct pfil_head *ph) -{ - return (pfil_remove_hook((pfil_func_t)func, arg, flags, ph)); -} + error = copyout(iohead, req->pio_heads, + sizeof(*iohead) * min(hd, req->pio_nheads)); + if (error == 0) + error = copyout(iohook, req->pio_hooks, + sizeof(*iohook) * min(req->pio_nhooks, hk)); -/* - * pfil_remove_hook removes a specific function from the packet filter hook - * chain. - */ -int -pfil_remove_hook(pfil_func_t func, void *arg, int flags, struct pfil_head *ph) -{ - int err = 0; + req->pio_nheads = hd; + req->pio_nhooks = hk; - PFIL_WLOCK(ph); - if (flags & PFIL_IN) { - err = pfil_chain_remove(&ph->ph_in, func, arg); - if (err == 0) - ph->ph_nhooks--; - } - if ((err == 0) && (flags & PFIL_OUT)) { - err = pfil_chain_remove(&ph->ph_out, func, arg); - if (err == 0) - ph->ph_nhooks--; - } - PFIL_WUNLOCK(ph); - return (err); -} + free(iohead, M_TEMP); + free(iohook, M_TEMP); -/* - * Internal: Add a new pfil hook into a hook chain. - */ -static int -pfil_chain_add(pfil_chain_t *chain, struct packet_filter_hook *pfh1, int flags) -{ - struct packet_filter_hook *pfh; - - /* - * First make sure the hook is not already there. - */ - TAILQ_FOREACH(pfh, chain, pfil_chain) - if (((pfh->pfil_func != NULL && pfh->pfil_func == pfh1->pfil_func) || - (pfh->pfil_func_flags != NULL && - pfh->pfil_func_flags == pfh1->pfil_func_flags)) && - pfh->pfil_arg == pfh1->pfil_arg) - return (EEXIST); - - /* - * Insert the input list in reverse order of the output list so that - * the same path is followed in or out of the kernel. - */ - if (flags & PFIL_IN) - TAILQ_INSERT_HEAD(chain, pfh1, pfil_chain); - else - TAILQ_INSERT_TAIL(chain, pfh1, pfil_chain); - return (0); + return (error); } -/* - * Internal: Remove a pfil hook from a hook chain. - */ static int -pfil_chain_remove(pfil_chain_t *chain, void *func, void *arg) +pfilioc_listhooks(struct pfilioc_list *req) { - struct packet_filter_hook *pfh; - - TAILQ_FOREACH(pfh, chain, pfil_chain) - if ((pfh->pfil_func == func || pfh->pfil_func_flags == func) && - pfh->pfil_arg == arg) { - TAILQ_REMOVE(chain, pfh, pfil_chain); - free(pfh, M_IFADDR); - return (0); + struct pfil_hook *hook; + struct pfilioc_hook *iohook; + u_int nhooks, hk; + int error; + + PFIL_LOCK(); +restart: + nhooks = 0; + LIST_FOREACH(hook, &V_pfil_hook_list, hook_list) + nhooks++; + PFIL_UNLOCK(); + + if (req->pio_nhooks < nhooks) { + req->pio_nhooks = nhooks; + return (0); + } + + iohook = malloc(sizeof(*iohook) * nhooks, M_TEMP, M_WAITOK); + + hk = 0; + PFIL_LOCK(); + LIST_FOREACH(hook, &V_pfil_hook_list, hook_list) { + if (hk + 1 > nhooks) { + /* Configuration changed during malloc(). */ + free(iohook, M_TEMP); + goto restart; } - return (ENOENT); -} + strlcpy(iohook[hk].pio_module, hook->hook_modname, + sizeof(iohook[0].pio_module)); + strlcpy(iohook[hk].pio_ruleset, hook->hook_rulname, + sizeof(iohook[0].pio_ruleset)); + iohook[hk].pio_type = hook->hook_type; + iohook[hk].pio_flags = hook->hook_flags; + hk++; + } + PFIL_UNLOCK(); -/* - * Stuff that must be initialized for every instance (including the first of - * course). - */ -static void -vnet_pfil_init(const void *unused __unused) -{ + error = copyout(iohook, req->pio_hooks, + sizeof(*iohook) * min(req->pio_nhooks, hk)); + req->pio_nhooks = hk; + free(iohook, M_TEMP); - LIST_INIT(&V_pfil_head_list); - PFIL_LOCK_INIT_REAL(&V_pfil_lock, "shared"); + return (error); } -/* - * Called for the removal of each instance. - */ -static void -vnet_pfil_uninit(const void *unused __unused) +static int +pfilioc_link(struct pfilioc_link *req) { + struct pfil_link_args args; - KASSERT(LIST_EMPTY(&V_pfil_head_list), - ("%s: pfil_head_list %p not empty", __func__, &V_pfil_head_list)); - PFIL_LOCK_DESTROY_REAL(&V_pfil_lock); -} + if (req->pio_flags & ~(PFIL_IN | PFIL_OUT | PFIL_UNLINK | PFIL_APPEND)) + return (EINVAL); -/* - * Starting up. - * - * VNET_SYSINIT is called for each existing vnet and each new vnet. - * Make sure the pfil bits are first before any possible subsystem which - * might piggyback on the SI_SUB_PROTO_PFIL. - */ -VNET_SYSINIT(vnet_pfil_init, SI_SUB_PROTO_PFIL, SI_ORDER_FIRST, - vnet_pfil_init, NULL); - -/* - * Closing up shop. These are done in REVERSE ORDER. Not called on reboot. - * - * VNET_SYSUNINIT is called for each exiting vnet as it exits. - */ -VNET_SYSUNINIT(vnet_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_FIRST, - vnet_pfil_uninit, NULL); + args.pa_version = PFIL_VERSION; + args.pa_flags = req->pio_flags; + args.pa_headname = req->pio_name; + args.pa_modname = req->pio_module; + args.pa_rulname = req->pio_ruleset; + + return (pfil_link(&args)); +} diff --git a/freebsd/sys/net/pfil.h b/freebsd/sys/net/pfil.h index 8fdaf5a6..da045b30 100644 --- a/freebsd/sys/net/pfil.h +++ b/freebsd/sys/net/pfil.h @@ -4,6 +4,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * + * Copyright (c) 2019 Gleb Smirnoff <glebius@FreeBSD.org> * Copyright (c) 1996 Matthew R. Green * All rights reserved. * @@ -34,98 +35,180 @@ #ifndef _NET_PFIL_H_ #define _NET_PFIL_H_ -#include <sys/systm.h> -#include <sys/queue.h> -#include <sys/_lock.h> -#include <sys/_mutex.h> -#include <sys/lock.h> -#include <sys/rmlock.h> -#include <net/vnet.h> +#include <sys/ioccom.h> +enum pfil_types { + PFIL_TYPE_IP4, + PFIL_TYPE_IP6, + PFIL_TYPE_ETHERNET, +}; + +#define MAXPFILNAME 64 + +struct pfilioc_head { + char pio_name[MAXPFILNAME]; + int pio_nhooksin; + int pio_nhooksout; + enum pfil_types pio_type; +}; + +struct pfilioc_hook { + char pio_module[MAXPFILNAME]; + char pio_ruleset[MAXPFILNAME]; + int pio_flags; + enum pfil_types pio_type; +}; + +struct pfilioc_list { + u_int pio_nheads; + u_int pio_nhooks; + struct pfilioc_head *pio_heads; + struct pfilioc_hook *pio_hooks; +}; + +struct pfilioc_link { + char pio_name[MAXPFILNAME]; + char pio_module[MAXPFILNAME]; + char pio_ruleset[MAXPFILNAME]; + int pio_flags; +}; + +#define PFILDEV "pfil" +#define PFILIOC_LISTHEADS _IOWR('P', 1, struct pfilioc_list) +#define PFILIOC_LISTHOOKS _IOWR('P', 2, struct pfilioc_list) +#define PFILIOC_LINK _IOW('P', 3, struct pfilioc_link) + +#define PFIL_IN 0x00010000 +#define PFIL_OUT 0x00020000 +#define PFIL_FWD 0x00040000 +#define PFIL_DIR(f) ((f) & (PFIL_IN|PFIL_OUT)) +#define PFIL_MEMPTR 0x00080000 +#define PFIL_HEADPTR 0x00100000 +#define PFIL_HOOKPTR 0x00200000 +#define PFIL_APPEND 0x00400000 +#define PFIL_UNLINK 0x00800000 +#define PFIL_LENMASK 0x0000ffff +#define PFIL_LENGTH(f) ((f) & PFIL_LENMASK) + +#ifdef _KERNEL struct mbuf; struct ifnet; struct inpcb; -typedef int (*pfil_func_t)(void *, struct mbuf **, struct ifnet *, int, - struct inpcb *); -typedef int (*pfil_func_flags_t)(void *, struct mbuf **, struct ifnet *, - int, int, struct inpcb *); +typedef union { + struct mbuf **m; + void *mem; + uintptr_t __ui; +} pfil_packet_t __attribute__((__transparent_union__)); + +static inline pfil_packet_t +pfil_packet_align(pfil_packet_t p) +{ + + return ((pfil_packet_t ) (((uintptr_t)(p).mem + + (_Alignof(void *) - 1)) & - _Alignof(void *))); +} + +static inline struct mbuf * +pfil_mem2mbuf(void *v) +{ + + return (*(struct mbuf **) (((uintptr_t)(v) + + (_Alignof(void *) - 1)) & - _Alignof(void *))); +} + +typedef enum { + PFIL_PASS = 0, + PFIL_DROPPED, + PFIL_CONSUMED, + PFIL_REALLOCED, +} pfil_return_t; + +typedef pfil_return_t (*pfil_func_t)(pfil_packet_t, struct ifnet *, int, + void *, struct inpcb *); +/* + * A pfil head is created by a packet intercept point. + * + * A pfil hook is created by a packet filter. + * + * Hooks are chained on heads. Historically some hooking happens + * automatically, e.g. ipfw(4), pf(4) and ipfilter(4) would register + * theirselves on IPv4 and IPv6 input/output. + */ + +typedef struct pfil_hook * pfil_hook_t; +typedef struct pfil_head * pfil_head_t; /* - * The packet filter hooks are designed for anything to call them to - * possibly intercept the packet. Multiple filter hooks are chained - * together and after each other in the specified order. + * Give us a chance to modify pfil_xxx_args structures in future. */ -struct packet_filter_hook { - TAILQ_ENTRY(packet_filter_hook) pfil_chain; - pfil_func_t pfil_func; - pfil_func_flags_t pfil_func_flags; - void *pfil_arg; +#define PFIL_VERSION 1 + +/* Argument structure used by packet filters to register themselves. */ +struct pfil_hook_args { + int pa_version; + int pa_flags; + enum pfil_types pa_type; + pfil_func_t pa_func; + void *pa_ruleset; + const char *pa_modname; + const char *pa_rulname; }; -#define PFIL_IN 0x00000001 -#define PFIL_OUT 0x00000002 -#define PFIL_WAITOK 0x00000004 -#define PFIL_FWD 0x00000008 -#define PFIL_ALL (PFIL_IN|PFIL_OUT) +/* Public functions for pfil hook management by packet filters. */ +pfil_hook_t pfil_add_hook(struct pfil_hook_args *); +void pfil_remove_hook(pfil_hook_t); -typedef TAILQ_HEAD(pfil_chain, packet_filter_hook) pfil_chain_t; +/* Argument structure used by ioctl() and packet filters to set filters. */ +struct pfil_link_args { + int pa_version; + int pa_flags; + union { + const char *pa_headname; + pfil_head_t pa_head; + }; + union { + struct { + const char *pa_modname; + const char *pa_rulname; + }; + pfil_hook_t pa_hook; + }; +}; -#define PFIL_TYPE_AF 1 /* key is AF_* type */ -#define PFIL_TYPE_IFNET 2 /* key is ifnet pointer */ +/* Public function to configure filter chains. Used by ioctl() and filters. */ +int pfil_link(struct pfil_link_args *); -#define PFIL_FLAG_PRIVATE_LOCK 0x01 /* Personal lock instead of global */ +/* Argument structure used by inspection points to register themselves. */ +struct pfil_head_args { + int pa_version; + int pa_flags; + enum pfil_types pa_type; + const char *pa_headname; +}; +/* Public functions for pfil head management by inspection points. */ +pfil_head_t pfil_head_register(struct pfil_head_args *); +void pfil_head_unregister(pfil_head_t); + +/* Public functions to run the packet inspection by inspection points. */ +int pfil_run_hooks(struct pfil_head *, pfil_packet_t, struct ifnet *, int, + struct inpcb *inp); /* - * A pfil head is created by each protocol or packet intercept point. - * For packet is then run through the hook chain for inspection. + * Minimally exposed structure to avoid function call in case of absence + * of any filters by protocols and macros to do the check. */ -struct pfil_head { - pfil_chain_t ph_in; - pfil_chain_t ph_out; - int ph_type; - int ph_nhooks; -#if defined( __linux__ ) || defined( _WIN32 ) - rwlock_t ph_mtx; -#else - struct rmlock *ph_plock; /* Pointer to the used lock */ - struct rmlock ph_lock; /* Private lock storage */ - int flags; -#endif - union { - u_long phu_val; - void *phu_ptr; - } ph_un; -#define ph_af ph_un.phu_val -#define ph_ifnet ph_un.phu_ptr - LIST_ENTRY(pfil_head) ph_list; +struct _pfil_head { + int head_nhooksin; + int head_nhooksout; }; +#define PFIL_HOOKED_IN(p) (((struct _pfil_head *)(p))->head_nhooksin > 0) +#define PFIL_HOOKED_OUT(p) (((struct _pfil_head *)(p))->head_nhooksout > 0) -VNET_DECLARE(struct rmlock, pfil_lock); -#define V_pfil_lock VNET(pfil_lock) - -/* Public functions for pfil hook management by packet filters. */ -struct pfil_head *pfil_head_get(int, u_long); -int pfil_add_hook_flags(pfil_func_flags_t, void *, int, struct pfil_head *); -int pfil_add_hook(pfil_func_t, void *, int, struct pfil_head *); -int pfil_remove_hook_flags(pfil_func_flags_t, void *, int, struct pfil_head *); -int pfil_remove_hook(pfil_func_t, void *, int, struct pfil_head *); -#define PFIL_HOOKED(p) ((p)->ph_nhooks > 0) - -/* Public functions to run the packet inspection by protocols. */ -int pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *, int, - int, struct inpcb *inp); - -/* Public functions for pfil head management by protocols. */ -int pfil_head_register(struct pfil_head *); -int pfil_head_unregister(struct pfil_head *); - -/* Public pfil locking functions for self managed locks by packet filters. */ -int pfil_try_rlock(struct pfil_head *, struct rm_priotracker *); -void pfil_rlock(struct pfil_head *, struct rm_priotracker *); -void pfil_runlock(struct pfil_head *, struct rm_priotracker *); -void pfil_wlock(struct pfil_head *); -void pfil_wunlock(struct pfil_head *); -int pfil_wowned(struct pfil_head *ph); +/* + * Alloc mbuf to be used instead of memory pointer. + */ +int pfil_realloc(pfil_packet_t *, int, struct ifnet *); +#endif /* _KERNEL */ #endif /* _NET_PFIL_H_ */ diff --git a/freebsd/sys/net/pfvar.h b/freebsd/sys/net/pfvar.h index 2924c06d..bfa7e773 100644 --- a/freebsd/sys/net/pfvar.h +++ b/freebsd/sys/net/pfvar.h @@ -41,6 +41,7 @@ #include <sys/cpuset.h> #include <sys/malloc.h> #include <sys/refcount.h> +#include <sys/sysctl.h> #include <sys/lock.h> #include <sys/rmlock.h> #include <sys/tree.h> @@ -95,6 +96,9 @@ struct pf_addr_wrap { #ifdef _KERNEL +SYSCTL_DECL(_net_pf); +MALLOC_DECLARE(M_PFHASH); + struct pfi_dynaddr { TAILQ_ENTRY(pfi_dynaddr) entry; struct pf_addr pfid_addr4; @@ -1017,6 +1021,17 @@ struct pfr_tstats { int pfrts_cnt; int pfrts_refcnt[PFR_REFCNT_MAX]; }; + +struct pfr_ktstats { + struct pfr_table pfrts_t; + counter_u64_t pfrkts_packets[PFR_DIR_MAX][PFR_OP_TABLE_MAX]; + counter_u64_t pfrkts_bytes[PFR_DIR_MAX][PFR_OP_TABLE_MAX]; + counter_u64_t pfrkts_match; + counter_u64_t pfrkts_nomatch; + long pfrkts_tzero; + int pfrkts_cnt; + int pfrkts_refcnt[PFR_REFCNT_MAX]; +}; #define pfrts_name pfrts_t.pfrt_name #define pfrts_flags pfrts_t.pfrt_flags @@ -1030,8 +1045,9 @@ union sockaddr_union { #endif /* _SOCKADDR_UNION_DEFINED */ struct pfr_kcounters { - u_int64_t pfrkc_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; - u_int64_t pfrkc_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + counter_u64_t pfrkc_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + counter_u64_t pfrkc_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + long pfrkc_tzero; }; SLIST_HEAD(pfr_kentryworkq, pfr_kentry); @@ -1039,8 +1055,7 @@ struct pfr_kentry { struct radix_node pfrke_node[2]; union sockaddr_union pfrke_sa; SLIST_ENTRY(pfr_kentry) pfrke_workq; - struct pfr_kcounters *pfrke_counters; - long pfrke_tzero; + struct pfr_kcounters pfrke_counters; u_int8_t pfrke_af; u_int8_t pfrke_net; u_int8_t pfrke_not; @@ -1050,7 +1065,7 @@ struct pfr_kentry { SLIST_HEAD(pfr_ktableworkq, pfr_ktable); RB_HEAD(pfr_ktablehead, pfr_ktable); struct pfr_ktable { - struct pfr_tstats pfrkt_ts; + struct pfr_ktstats pfrkt_kts; RB_ENTRY(pfr_ktable) pfrkt_tree; SLIST_ENTRY(pfr_ktable) pfrkt_workq; struct radix_node_head *pfrkt_ip4; @@ -1061,18 +1076,18 @@ struct pfr_ktable { long pfrkt_larg; int pfrkt_nflags; }; -#define pfrkt_t pfrkt_ts.pfrts_t +#define pfrkt_t pfrkt_kts.pfrts_t #define pfrkt_name pfrkt_t.pfrt_name #define pfrkt_anchor pfrkt_t.pfrt_anchor #define pfrkt_ruleset pfrkt_t.pfrt_ruleset #define pfrkt_flags pfrkt_t.pfrt_flags -#define pfrkt_cnt pfrkt_ts.pfrts_cnt -#define pfrkt_refcnt pfrkt_ts.pfrts_refcnt -#define pfrkt_packets pfrkt_ts.pfrts_packets -#define pfrkt_bytes pfrkt_ts.pfrts_bytes -#define pfrkt_match pfrkt_ts.pfrts_match -#define pfrkt_nomatch pfrkt_ts.pfrts_nomatch -#define pfrkt_tzero pfrkt_ts.pfrts_tzero +#define pfrkt_cnt pfrkt_kts.pfrkts_cnt +#define pfrkt_refcnt pfrkt_kts.pfrkts_refcnt +#define pfrkt_packets pfrkt_kts.pfrkts_packets +#define pfrkt_bytes pfrkt_kts.pfrkts_bytes +#define pfrkt_match pfrkt_kts.pfrkts_match +#define pfrkt_nomatch pfrkt_kts.pfrkts_nomatch +#define pfrkt_tzero pfrkt_kts.pfrkts_tzero /* keep synced with pfi_kif, used in RB_FIND */ struct pfi_kif_cmp { @@ -1601,7 +1616,7 @@ VNET_DECLARE(uint64_t, pf_stateid[MAXCPU]); #define V_pf_stateid VNET(pf_stateid) TAILQ_HEAD(pf_altqqueue, pf_altq); -VNET_DECLARE(struct pf_altqqueue, pf_altqs[2]); +VNET_DECLARE(struct pf_altqqueue, pf_altqs[4]); #define V_pf_altqs VNET(pf_altqs) VNET_DECLARE(struct pf_palist, pf_pabuf); #define V_pf_pabuf VNET(pf_pabuf) @@ -1616,8 +1631,12 @@ VNET_DECLARE(u_int32_t, ticket_pabuf); #define V_ticket_pabuf VNET(ticket_pabuf) VNET_DECLARE(struct pf_altqqueue *, pf_altqs_active); #define V_pf_altqs_active VNET(pf_altqs_active) +VNET_DECLARE(struct pf_altqqueue *, pf_altq_ifs_active); +#define V_pf_altq_ifs_active VNET(pf_altq_ifs_active) VNET_DECLARE(struct pf_altqqueue *, pf_altqs_inactive); #define V_pf_altqs_inactive VNET(pf_altqs_inactive) +VNET_DECLARE(struct pf_altqqueue *, pf_altq_ifs_inactive); +#define V_pf_altq_ifs_inactive VNET(pf_altq_ifs_inactive) VNET_DECLARE(struct pf_rulequeue, pf_unlinked_rules); #define V_pf_unlinked_rules VNET(pf_unlinked_rules) diff --git a/freebsd/sys/net/route.c b/freebsd/sys/net/route.c index 3cd909c1..36f3bf41 100644 --- a/freebsd/sys/net/route.c +++ b/freebsd/sys/net/route.c @@ -625,11 +625,12 @@ rtredirect_fib(struct sockaddr *dst, int error = 0; short *stat = NULL; struct rt_addrinfo info; + struct epoch_tracker et; struct ifaddr *ifa; struct rib_head *rnh; ifa = NULL; - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) { error = EAFNOSUPPORT; @@ -724,7 +725,7 @@ done: if (rt) RTFREE_LOCKED(rt); out: - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); if (error) V_rtstat.rts_badredirect++; else if (stat != NULL) @@ -1307,11 +1308,14 @@ rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. + * + * Assume basic consistency checks are executed by callers: + * RTAX_DST exists, if RTF_GATEWAY is set, RTAX_GATEWAY exists as well. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { - struct ifaddr *ifa; + struct epoch_tracker et; int needref, error; /* @@ -1320,22 +1324,55 @@ rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) */ error = 0; needref = (info->rti_ifa == NULL); - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); + + /* If we have interface specified by the ifindex in the address, use it */ if (info->rti_ifp == NULL && ifpaddr != NULL && - ifpaddr->sa_family == AF_LINK && - (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) { - info->rti_ifp = ifa->ifa_ifp; + ifpaddr->sa_family == AF_LINK) { + const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)ifpaddr; + if (sdl->sdl_index != 0) + info->rti_ifp = ifnet_byindex_locked(sdl->sdl_index); } + /* + * If we have source address specified, try to find it + * TODO: avoid enumerating all ifas on all interfaces. + */ if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; - sa = ifaaddr != NULL ? ifaaddr : - (gateway != NULL ? gateway : dst); - if (sa != NULL && info->rti_ifp != NULL) + /* + * Most common use case for the userland-supplied routes. + * + * Choose sockaddr to select ifa. + * -- if ifp is set -- + * Order of preference: + * 1) IFA address + * 2) gateway address + * Note: for interface routes link-level gateway address + * is specified to indicate the interface index without + * specifying RTF_GATEWAY. In this case, ignore gateway + * Note: gateway AF may be different from dst AF. In this case, + * ignore gateway + * 3) final destination. + * 4) if all of these fails, try to get at least link-level ifa. + * -- else -- + * try to lookup gateway or dst in the routing table to get ifa + */ + if (info->rti_info[RTAX_IFA] != NULL) + sa = info->rti_info[RTAX_IFA]; + else if ((info->rti_flags & RTF_GATEWAY) != 0 && + gateway->sa_family == dst->sa_family) + sa = gateway; + else + sa = dst; + if (info->rti_ifp != NULL) { info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); - else if (dst != NULL && gateway != NULL) + /* Case 4 */ + if (info->rti_ifa == NULL && gateway != NULL) + info->rti_ifa = ifaof_ifpforaddr(gateway, info->rti_ifp); + } else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) @@ -1348,7 +1385,7 @@ rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) ifa_ref(info->rti_ifa); } else error = ENETUNREACH; - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); return (error); } @@ -1585,6 +1622,8 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, switch (req) { case RTM_DELETE: if (netmask) { + if (dst->sa_len > sizeof(mdst)) + return (EINVAL); rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } @@ -1990,7 +2029,7 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) char tempbuf[_SOCKADDR_TMPSIZE]; int didwork = 0; int a_failure = 0; - static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + struct sockaddr_dl *sdl = NULL; struct rib_head *rnh; if (flags & RTF_HOST) { @@ -2045,7 +2084,14 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); dst = (struct sockaddr *)tempbuf; } - } + } else if (cmd == RTM_ADD) { + sdl = (struct sockaddr_dl *)tempbuf; + bzero(sdl, sizeof(struct sockaddr_dl)); + sdl->sdl_family = AF_LINK; + sdl->sdl_len = sizeof(struct sockaddr_dl); + sdl->sdl_type = ifa->ifa_ifp->if_type; + sdl->sdl_index = ifa->ifa_ifp->if_index; + } /* * Now go through all the requested tables (fibs) and do the * requested action. Realistically, this will either be fib 0 @@ -2108,8 +2154,7 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) * doing this for compatibility reasons */ if (cmd == RTM_ADD) - info.rti_info[RTAX_GATEWAY] = - (struct sockaddr *)&null_sdl; + info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)sdl; else info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; @@ -2136,15 +2181,6 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) rt->rt_ifa = ifa; } #endif - /* - * doing this for compatibility reasons - */ - if (cmd == RTM_ADD) { - ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = - rt->rt_ifp->if_type; - ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = - rt->rt_ifp->if_index; - } RT_ADDREF(rt); RT_UNLOCK(rt); rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum); diff --git a/freebsd/sys/net/route.h b/freebsd/sys/net/route.h index c4333838..bdeb9869 100644 --- a/freebsd/sys/net/route.h +++ b/freebsd/sys/net/route.h @@ -210,6 +210,7 @@ struct rtentry { #define NHF_DEFAULT 0x0080 /* Default route */ #define NHF_BROADCAST 0x0100 /* RTF_BROADCAST */ #define NHF_GATEWAY 0x0200 /* RTF_GATEWAY */ +#define NHF_HOST 0x0400 /* RTF_HOST */ /* Nexthop request flags */ #define NHR_IFAIF 0x01 /* Return ifa_ifp interface */ diff --git a/freebsd/sys/net/route_var.h b/freebsd/sys/net/route_var.h index 9d0d1931..db3db4e3 100644 --- a/freebsd/sys/net/route_var.h +++ b/freebsd/sys/net/route_var.h @@ -67,6 +67,7 @@ fib_rte_to_nh_flags(int rt_flags) uint16_t res; res = (rt_flags & RTF_REJECT) ? NHF_REJECT : 0; + res |= (rt_flags & RTF_HOST) ? NHF_HOST : 0; res |= (rt_flags & RTF_BLACKHOLE) ? NHF_BLACKHOLE : 0; res |= (rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) ? NHF_REDIRECT : 0; res |= (rt_flags & RTF_BROADCAST) ? NHF_BROADCAST : 0; diff --git a/freebsd/sys/net/rtsock.c b/freebsd/sys/net/rtsock.c index e1b87095..6c457a9d 100644 --- a/freebsd/sys/net/rtsock.c +++ b/freebsd/sys/net/rtsock.c @@ -33,6 +33,7 @@ * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD$ */ +#include <rtems/bsd/local/opt_ddb.h> #include <rtems/bsd/local/opt_mpath.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> @@ -55,6 +56,11 @@ #include <sys/sysctl.h> #include <sys/systm.h> +#ifdef DDB +#include <ddb/ddb.h> +#include <ddb/db_lex.h> +#endif + #include <net/if.h> #include <net/if_var.h> #include <net/if_dl.h> @@ -448,6 +454,9 @@ static int rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred) { +#if defined(INET) || defined(INET6) + struct epoch_tracker et; +#endif /* First, see if the returned address is part of the jail. */ if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) { @@ -468,7 +477,7 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, * Try to find an address on the given outgoing interface * that belongs to the jail. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; @@ -480,7 +489,7 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, break; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (!found) { /* * As a last resort return the 'default' jail address. @@ -510,7 +519,7 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, * Try to find an address on the given outgoing interface * that belongs to the jail. */ - IF_ADDR_RLOCK(ifp); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; @@ -523,7 +532,7 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, break; } } - IF_ADDR_RUNLOCK(ifp); + NET_EPOCH_EXIT(et); if (!found) { /* * As a last resort return the 'default' jail address. @@ -627,6 +636,8 @@ route_output(struct mbuf *m, struct socket *so, ...) if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) senderr(EINVAL); + if (rtm->rtm_flags & RTF_RNH_LOCKED) + senderr(EINVAL); info.rti_flags = rtm->rtm_flags; if (info.rti_info[RTAX_DST] == NULL || info.rti_info[RTAX_DST]->sa_family >= AF_MAX || @@ -798,16 +809,17 @@ route_output(struct mbuf *m, struct socket *so, ...) if (rt->rt_ifp != NULL && rt->rt_ifp->if_type == IFT_PROPVIRTUAL) { + struct epoch_tracker et; struct ifaddr *ifa; - NET_EPOCH_ENTER(); + NET_EPOCH_ENTER(et); ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1, RT_ALL_FIBS); if (ifa != NULL) rt_maskedcopy(ifa->ifa_addr, &laddr, ifa->ifa_netmask); - NET_EPOCH_EXIT(); + NET_EPOCH_EXIT(et); } else rt_maskedcopy(rt->rt_ifa->ifa_addr, &laddr, @@ -1571,7 +1583,7 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) struct rt_addrinfo info; struct sockaddr_storage ss; - IFNET_RLOCK_NOSLEEP_ASSERT(); + NET_EPOCH_ASSERT(); if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) return 0; @@ -1765,7 +1777,7 @@ sysctl_iflist(int af, struct walkarg *w) bzero((caddr_t)&info, sizeof(info)); bzero(&ifd, sizeof(ifd)); - NET_EPOCH_ENTER_ET(et); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; @@ -1815,7 +1827,7 @@ sysctl_iflist(int af, struct walkarg *w) info.rti_info[RTAX_BRD] = NULL; } done: - NET_EPOCH_EXIT_ET(et); + NET_EPOCH_EXIT(et); return (error); } @@ -1823,6 +1835,7 @@ static int sysctl_ifmalist(int af, struct walkarg *w) { struct rt_addrinfo info; + struct epoch_tracker et; struct ifaddr *ifa; struct ifmultiaddr *ifma; struct ifnet *ifp; @@ -1831,13 +1844,12 @@ sysctl_ifmalist(int af, struct walkarg *w) error = 0; bzero((caddr_t)&info, sizeof(info)); - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (af && af != ifma->ifma_addr->sa_family) continue; @@ -1864,11 +1876,10 @@ sysctl_ifmalist(int af, struct walkarg *w) break; } } - IF_ADDR_RUNLOCK(ifp); if (error != 0) break; } - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); return (error); } @@ -1955,11 +1966,13 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) for (error = 0; error == 0 && i <= lim; i++) { rnh = rt_tables_get_rnh(fib, i); if (rnh != NULL) { + struct epoch_tracker et; + RIB_RLOCK(rnh); - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER(et); error = rnh->rnh_walktree(&rnh->head, sysctl_dumpentry, &w); - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT(et); RIB_RUNLOCK(rnh); } else if (af != 0) error = EAFNOSUPPORT; @@ -2008,3 +2021,408 @@ static struct domain routedomain = { }; VNET_DOMAIN_SET(route); + +#ifdef DDB +/* + * Unfortunately, RTF_ values are expressed as raw masks rather than powers of + * 2, so we cannot use them as nice C99 initializer indices below. + */ +static const char * const rtf_flag_strings[] = { + "UP", + "GATEWAY", + "HOST", + "REJECT", + "DYNAMIC", + "MODIFIED", + "DONE", + "UNUSED_0x80", + "UNUSED_0x100", + "XRESOLVE", + "LLDATA", + "STATIC", + "BLACKHOLE", + "UNUSED_0x2000", + "PROTO2", + "PROTO1", + "UNUSED_0x10000", + "UNUSED_0x20000", + "PROTO3", + "FIXEDMTU", + "PINNED", + "LOCAL", + "BROADCAST", + "MULTICAST", + /* Big gap. */ + [28] = "STICKY", + [30] = "RNH_LOCKED", + [31] = "GWFLAG_COMPAT", +}; + +static const char * __pure +rt_flag_name(unsigned idx) +{ + if (idx >= nitems(rtf_flag_strings)) + return ("INVALID_FLAG"); + if (rtf_flag_strings[idx] == NULL) + return ("UNKNOWN"); + return (rtf_flag_strings[idx]); +} + +static void +rt_dumpaddr_ddb(const char *name, const struct sockaddr *sa) +{ + char buf[INET6_ADDRSTRLEN], *res; + + res = NULL; + if (sa == NULL) + res = "NULL"; + else if (sa->sa_family == AF_INET) { + res = inet_ntop(AF_INET, + &((const struct sockaddr_in *)sa)->sin_addr, + buf, sizeof(buf)); + } else if (sa->sa_family == AF_INET6) { + res = inet_ntop(AF_INET6, + &((const struct sockaddr_in6 *)sa)->sin6_addr, + buf, sizeof(buf)); + } else if (sa->sa_family == AF_LINK) { + res = "on link"; + } + + if (res != NULL) { + db_printf("%s <%s> ", name, res); + return; + } + + db_printf("%s <af:%d> ", name, sa->sa_family); +} + +static int +rt_dumpentry_ddb(struct radix_node *rn, void *arg __unused) +{ + struct sockaddr_storage ss; + struct rtentry *rt; + int flags, idx; + + /* If RNTORT is important, put it in a header. */ + rt = (void *)rn; + + rt_dumpaddr_ddb("dst", rt_key(rt)); + rt_dumpaddr_ddb("gateway", rt->rt_gateway); + rt_dumpaddr_ddb("netmask", rtsock_fix_netmask(rt_key(rt), rt_mask(rt), + &ss)); + if (rt->rt_ifp != NULL && (rt->rt_ifp->if_flags & IFF_DYING) == 0) { + rt_dumpaddr_ddb("ifp", rt->rt_ifp->if_addr->ifa_addr); + rt_dumpaddr_ddb("ifa", rt->rt_ifa->ifa_addr); + } + + db_printf("flags "); + flags = rt->rt_flags; + if (flags == 0) + db_printf("none"); + + while ((idx = ffs(flags)) > 0) { + idx--; + + if (flags != rt->rt_flags) + db_printf(","); + db_printf("%s", rt_flag_name(idx)); + + flags &= ~(1ul << idx); + } + + db_printf("\n"); + return (0); +} + +DB_SHOW_COMMAND(routetable, db_show_routetable_cmd) +{ + struct rib_head *rnh; + int error, i, lim; + + if (have_addr) + i = lim = addr; + else { + i = 1; + lim = AF_MAX; + } + + for (; i <= lim; i++) { + rnh = rt_tables_get_rnh(0, i); + if (rnh == NULL) { + if (have_addr) { + db_printf("%s: AF %d not supported?\n", + __func__, i); + break; + } + continue; + } + + if (!have_addr && i > 1) + db_printf("\n"); + + db_printf("Route table for AF %d%s%s%s:\n", i, + (i == AF_INET || i == AF_INET6) ? " (" : "", + (i == AF_INET) ? "INET" : (i == AF_INET6) ? "INET6" : "", + (i == AF_INET || i == AF_INET6) ? ")" : ""); + + error = rnh->rnh_walktree(&rnh->head, rt_dumpentry_ddb, NULL); + if (error != 0) + db_printf("%s: walktree(%d): %d\n", __func__, i, + error); + } +} + +_DB_FUNC(_show, route, db_show_route_cmd, db_show_table, CS_OWN, NULL) +{ + char buf[INET6_ADDRSTRLEN], *bp; + const void *dst_addrp; + struct sockaddr *dstp; + struct rtentry *rt; + union { + struct sockaddr_in dest_sin; + struct sockaddr_in6 dest_sin6; + } u; + uint16_t hextets[8]; + unsigned i, tets; + int t, af, exp, tokflags; + + /* + * Undecoded address family. No double-colon expansion seen yet. + */ + af = -1; + exp = -1; + /* Assume INET6 to start; we can work back if guess was wrong. */ + tokflags = DRT_WSPACE | DRT_HEX | DRT_HEXADECIMAL; + + /* + * db_command has lexed 'show route' for us. + */ + t = db_read_token_flags(tokflags); + if (t == tWSPACE) + t = db_read_token_flags(tokflags); + + /* + * tEOL: Just 'show route' isn't a valid mode. + * tMINUS: It's either '-h' or some invalid option. Regardless, usage. + */ + if (t == tEOL || t == tMINUS) + goto usage; + + db_unread_token(t); + + tets = nitems(hextets); + + /* + * Each loop iteration, we expect to read one octet (v4) or hextet + * (v6), followed by an appropriate field separator ('.' or ':' or + * '::'). + * + * At the start of each loop, we're looking for a number (octet or + * hextet). + * + * INET6 addresses have a special case where they may begin with '::'. + */ + for (i = 0; i < tets; i++) { + t = db_read_token_flags(tokflags); + + if (t == tCOLONCOLON) { + /* INET6 with leading '::' or invalid. */ + if (i != 0) { + db_printf("Parse error: unexpected extra " + "colons.\n"); + goto exit; + } + + af = AF_INET6; + exp = i; + hextets[i] = 0; + continue; + } else if (t == tNUMBER) { + /* + * Lexer separates out '-' as tMINUS, but make the + * assumption explicit here. + */ + MPASS(db_tok_number >= 0); + + if (af == AF_INET && db_tok_number > UINT8_MAX) { + db_printf("Not a valid v4 octet: %ld\n", + (long)db_tok_number); + goto exit; + } + hextets[i] = db_tok_number; + } else if (t == tEOL) { + /* + * We can only detect the end of an IPv6 address in + * compact representation with EOL. + */ + if (af != AF_INET6 || exp < 0) { + db_printf("Parse failed. Got unexpected EOF " + "when the address is not a compact-" + "representation IPv6 address.\n"); + goto exit; + } + break; + } else { + db_printf("Parse failed. Unexpected token %d.\n", t); + goto exit; + } + + /* Next, look for a separator, if appropriate. */ + if (i == tets - 1) + continue; + + t = db_read_token_flags(tokflags); + if (af < 0) { + if (t == tCOLON) { + af = AF_INET6; + continue; + } + if (t == tCOLONCOLON) { + af = AF_INET6; + i++; + hextets[i] = 0; + exp = i; + continue; + } + if (t == tDOT) { + unsigned hn, dn; + + af = AF_INET; + /* Need to fixup the first parsed number. */ + if (hextets[0] > 0x255 || + (hextets[0] & 0xf0) > 0x90 || + (hextets[0] & 0xf) > 9) { + db_printf("Not a valid v4 octet: %x\n", + hextets[0]); + goto exit; + } + + hn = hextets[0]; + dn = (hn >> 8) * 100 + + ((hn >> 4) & 0xf) * 10 + + (hn & 0xf); + + hextets[0] = dn; + + /* Switch to decimal for remaining octets. */ + tokflags &= ~DRT_RADIX_MASK; + tokflags |= DRT_DECIMAL; + + tets = 4; + continue; + } + + db_printf("Parse error. Unexpected token %d.\n", t); + goto exit; + } else if (af == AF_INET) { + if (t == tDOT) + continue; + db_printf("Expected '.' (%d) between octets but got " + "(%d).\n", tDOT, t); + goto exit; + + } else if (af == AF_INET6) { + if (t == tCOLON) + continue; + if (t == tCOLONCOLON) { + if (exp < 0) { + i++; + hextets[i] = 0; + exp = i; + continue; + } + db_printf("Got bogus second '::' in v6 " + "address.\n"); + goto exit; + } + if (t == tEOL) { + /* + * Handle in the earlier part of the loop + * because we need to handle trailing :: too. + */ + db_unread_token(t); + continue; + } + + db_printf("Expected ':' (%d) or '::' (%d) between " + "hextets but got (%d).\n", tCOLON, tCOLONCOLON, t); + goto exit; + } + } + + /* Check for trailing garbage. */ + if (i == tets) { + t = db_read_token_flags(tokflags); + if (t != tEOL) { + db_printf("Got unexpected garbage after address " + "(%d).\n", t); + goto exit; + } + } + + /* + * Need to expand compact INET6 addresses. + * + * Technically '::' for a single ':0:' is MUST NOT but just in case, + * don't bother expanding that form (exp >= 0 && i == tets case). + */ + if (af == AF_INET6 && exp >= 0 && i < tets) { + if (exp + 1 < i) { + memmove(&hextets[exp + 1 + (nitems(hextets) - i)], + &hextets[exp + 1], + (i - (exp + 1)) * sizeof(hextets[0])); + } + memset(&hextets[exp + 1], 0, (nitems(hextets) - i) * + sizeof(hextets[0])); + } + + memset(&u, 0, sizeof(u)); + if (af == AF_INET) { + u.dest_sin.sin_family = AF_INET; + u.dest_sin.sin_len = sizeof(u.dest_sin); + u.dest_sin.sin_addr.s_addr = htonl( + ((uint32_t)hextets[0] << 24) | + ((uint32_t)hextets[1] << 16) | + ((uint32_t)hextets[2] << 8) | + (uint32_t)hextets[3]); + dstp = (void *)&u.dest_sin; + dst_addrp = &u.dest_sin.sin_addr; + } else if (af == AF_INET6) { + u.dest_sin6.sin6_family = AF_INET6; + u.dest_sin6.sin6_len = sizeof(u.dest_sin6); + for (i = 0; i < nitems(hextets); i++) + u.dest_sin6.sin6_addr.s6_addr16[i] = htons(hextets[i]); + dstp = (void *)&u.dest_sin6; + dst_addrp = &u.dest_sin6.sin6_addr; + } else { + MPASS(false); + /* UNREACHABLE */ + /* Appease Clang false positive: */ + dstp = NULL; + } + + bp = inet_ntop(af, dst_addrp, buf, sizeof(buf)); + if (bp != NULL) + db_printf("Looking up route to destination '%s'\n", bp); + + CURVNET_SET(vnet0); + rt = rtalloc1(dstp, 0, RTF_RNH_LOCKED); + CURVNET_RESTORE(); + + if (rt == NULL) { + db_printf("Could not get route for that server.\n"); + return; + } + + rt_dumpentry_ddb((void *)rt, NULL); + RTFREE_LOCKED(rt); + + return; +usage: + db_printf("Usage: 'show route <address>'\n" + " Currently accepts only dotted-decimal INET or colon-separated\n" + " hextet INET6 addresses.\n"); +exit: + db_skip_to_eol(); +} +#endif diff --git a/freebsd/sys/net/sff8472.h b/freebsd/sys/net/sff8472.h index d38fcfc0..9fa465a1 100644 --- a/freebsd/sys/net/sff8472.h +++ b/freebsd/sys/net/sff8472.h @@ -379,7 +379,7 @@ enum { /* * Table 3.2 Identifier values. - * Identifier constants has taken from SFF-8024 rev 4.2 table 4.1 + * Identifier constants has taken from SFF-8024 rev 4.6 table 4.1 * (as referenced by table 3.2 footer) * */ enum { @@ -396,10 +396,10 @@ enum { SFF_8024_ID_X2 = 0xA, /* X2 */ SFF_8024_ID_DWDM_SFP = 0xB, /* DWDM-SFP */ SFF_8024_ID_QSFP = 0xC, /* QSFP */ - SFF_8024_ID_QSFPPLUS = 0xD, /* QSFP+ */ + SFF_8024_ID_QSFPPLUS = 0xD, /* QSFP+ or later */ SFF_8024_ID_CXP = 0xE, /* CXP */ - SFF_8024_ID_HD4X = 0xF, /* Shielded Mini Multilane HD 4X */ - SFF_8024_ID_HD8X = 0x10, /* Shielded Mini Multilane HD 8X */ + SFF_8024_ID_HD4X = 0xF, /* Shielded Mini Multilane HD 4X */ + SFF_8024_ID_HD8X = 0x10, /* Shielded Mini Multilane HD 8X */ SFF_8024_ID_QSFP28 = 0x11, /* QSFP28 or later */ SFF_8024_ID_CXP2 = 0x12, /* CXP2 (aka CXP28) */ SFF_8024_ID_CDFP = 0x13, /* CDFP (Style 1/Style 2) */ @@ -408,34 +408,49 @@ enum { SFF_8024_ID_CDFP3 = 0x16, /* CDFP (Style3) */ SFF_8024_ID_MICROQSFP = 0x17, /* microQSFP */ SFF_8024_ID_QSFP_DD = 0x18, /* QSFP-DD 8X Pluggable Transceiver */ - SFF_8024_ID_LAST = SFF_8024_ID_QSFP_DD - }; - -static const char *sff_8024_id[SFF_8024_ID_LAST + 1] = {"Unknown", - "GBIC", - "SFF", - "SFP/SFP+/SFP28", - "XBI", - "Xenpak", - "XFP", - "XFF", - "XFP-E", - "XPAK", - "X2", - "DWDM-SFP/SFP+", - "QSFP", - "QSFP+", - "CXP", - "HD4X", - "HD8X", - "QSFP28", - "CXP2", - "CDFP", - "SMM4", - "SMM8", - "CDFP3", - "microQSFP", - "QSFP-DD"}; + SFF_8024_ID_OSFP8X = 0x19, /* OSFP 8X Pluggable Transceiver */ + SFF_8024_ID_SFP_DD = 0x1A, /* SFP-DD 2X Pluggable Transceiver */ + SFF_8024_ID_DSFP = 0x1B, /* DSFP Dual SFF Pluggable Transceiver */ + SFF_8024_ID_X4ML = 0x1C, /* x4 MiniLink/OcuLink */ + SFF_8024_ID_X8ML = 0x1D, /* x8 MiniLink */ + SFF_8024_ID_QSFP_CMIS = 0x1E, /* QSFP+ or later w/ Common Management + Interface Specification */ + SFF_8024_ID_LAST = SFF_8024_ID_QSFP_CMIS +}; + +static const char *sff_8024_id[SFF_8024_ID_LAST + 1] = { + "Unknown", + "GBIC", + "SFF", + "SFP/SFP+/SFP28", + "XBI", + "Xenpak", + "XFP", + "XFF", + "XFP-E", + "XPAK", + "X2", + "DWDM-SFP/SFP+", + "QSFP", + "QSFP+", + "CXP", + "HD4X", + "HD8X", + "QSFP28", + "CXP2", + "CDFP", + "SMM4", + "SMM8", + "CDFP3", + "microQSFP", + "QSFP-DD", + "QSFP8X", + "SFP-DD", + "DSFP", + "x4MiniLink/OcuLink", + "x8MiniLink", + "QSFP+(CIMS)" +}; /* Keep compatibility with old definitions */ #define SFF_8472_ID_UNKNOWN SFF_8024_ID_UNKNOWN diff --git a/freebsd/sys/net/vnet.h b/freebsd/sys/net/vnet.h index b4168750..a8c9887e 100644 --- a/freebsd/sys/net/vnet.h +++ b/freebsd/sys/net/vnet.h @@ -273,7 +273,8 @@ extern struct sx vnet_sxlock; /* struct _hack is to stop this from being used with static data */ #define VNET_DEFINE(t, n) \ struct _hack; t VNET_NAME(n) __section(VNET_SETNAME) __used -#if defined(KLD_MODULE) && (defined(__aarch64__) || defined(__riscv)) +#if defined(KLD_MODULE) && (defined(__aarch64__) || defined(__riscv) \ + || defined(__powerpc64__)) /* * As with DPCPU_DEFINE_STATIC we are unable to mark this data as static * in modules on some architectures. |