diff options
author | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2013-11-04 11:33:00 +0100 |
---|---|---|
committer | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2013-11-04 15:28:21 +0100 |
commit | af5333e0a02b2295304d4e029b15ee15a4fe2b3a (patch) | |
tree | c5c43680d374f58b487eeeaf18fb7ec6b84ba074 /freebsd/sys/net | |
parent | BUS_SPACE(9): Use simple memory model for ARM (diff) | |
download | rtems-libbsd-af5333e0a02b2295304d4e029b15ee15a4fe2b3a.tar.bz2 |
Update to FreeBSD 8.4
Diffstat (limited to 'freebsd/sys/net')
46 files changed, 2446 insertions, 1042 deletions
diff --git a/freebsd/sys/net/bpf.c b/freebsd/sys/net/bpf.c index d9223313..179d5f0a 100644 --- a/freebsd/sys/net/bpf.c +++ b/freebsd/sys/net/bpf.c @@ -45,6 +45,8 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/sys/types.h> #include <rtems/bsd/sys/param.h> +#include <rtems/bsd/sys/lock.h> +#include <sys/rwlock.h> #include <sys/systm.h> #include <sys/conf.h> #include <sys/fcntl.h> @@ -68,6 +70,7 @@ __FBSDID("$FreeBSD$"); #include <sys/socket.h> #include <net/if.h> +#define BPF_INTERNAL #include <net/bpf.h> #include <net/bpf_buffer.h> #ifdef BPF_JITTER @@ -141,6 +144,7 @@ static int bpf_bpfd_cnt; static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); +static void bpf_detachd_locked(struct bpf_d *); static void bpf_freed(struct bpf_d *); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_insn *); @@ -152,7 +156,7 @@ static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct timeval *); static void reset_d(struct bpf_d *); -static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); +static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static void filt_bpfdetach(struct knote *); @@ -170,6 +174,12 @@ SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW, SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW, bpf_stats_sysctl, "bpf statistics portal"); +static VNET_DEFINE(int, bpf_optimize_writers) = 0; +#define V_bpf_optimize_writers VNET(bpf_optimize_writers) +SYSCTL_VNET_INT(_net_bpf, OID_AUTO, optimize_writers, + CTLFLAG_RW, &VNET_NAME(bpf_optimize_writers), 0, + "Do not send packets until BPF program is set"); + static d_open_t bpfopen; static d_read_t bpfread; static d_write_t bpfwrite; @@ -191,6 +201,37 @@ static struct cdevsw bpf_cdevsw = { 
static struct filterops bpfread_filtops = { 1, NULL, filt_bpfdetach, filt_bpfread }; +eventhandler_tag bpf_ifdetach_cookie = NULL; + +/* + * LOCKING MODEL USED BY BPF: + * Locks: + * 1) global lock (BPF_LOCK). Mutex, used to protect interface addition/removal, + * some global counters and every bpf_if reference. + * 2) Interface lock. Rwlock, used to protect list of BPF descriptors and their filters. + * 3) Descriptor lock. Mutex, used to protect BPF buffers and various structure fields + * used by bpf_mtap code. + * + * Lock order: + * + * Global lock, interface lock, descriptor lock + * + * We have to acquire interface lock before descriptor main lock due to BPF_MTAP[2] + * working model. In many places (like bpf_detachd) we start with BPF descriptor + * (and we need to at least rlock it to get reliable interface pointer). This + * gives us potential LOR. As a result, we use global lock to protect from bpf_if + * change in every such place. + * + * Changing d->bd_bif is protected by 1) global lock, 2) interface lock and + * 3) descriptor main wlock. + * Reading bd_bif can be protected by any of these locks, typically global lock. + * + * Changing read/write BPF filter is protected by the same three locks, + * the same applies for reading. + * + * Sleeping in global lock is not allowed due to bpfdetach() using it. + */ + /* * Wrapper functions for various buffering methods. 
If the set of buffer * modes expands, we will probably want to introduce a switch data structure @@ -284,7 +325,6 @@ bpf_canfreebuf(struct bpf_d *d) static int bpf_canwritebuf(struct bpf_d *d) { - BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { @@ -563,17 +603,92 @@ bad: static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { + int op_w; + + BPF_LOCK_ASSERT(); + + /* + * Save sysctl value to protect from sysctl change + * between reads + */ + op_w = V_bpf_optimize_writers; + + if (d->bd_bif != NULL) + bpf_detachd_locked(d); /* - * Point d at bp, and add d to the interface's list of listeners. - * Finally, point the driver's bpf cookie at the interface so - * it will divert packets to bpf. + * Point d at bp, and add d to the interface's list. + * Since there are many applicaiotns using BPF for + * sending raw packets only (dhcpd, cdpd are good examples) + * we can delay adding d to the list of active listeners until + * some filter is configured. */ - BPFIF_LOCK(bp); + + BPFIF_WLOCK(bp); + BPFD_LOCK(d); + d->bd_bif = bp; - LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); + + if (op_w != 0) { + /* Add to writers-only list */ + LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next); + /* + * We decrement bd_writer on every filter set operation. + * First BIOCSETF is done by pcap_open_live() to set up + * snap length. After that appliation usually sets its own filter + */ + d->bd_writer = 2; + } else + LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); + + BPFD_UNLOCK(d); + BPFIF_WUNLOCK(bp); bpf_bpfd_cnt++; - BPFIF_UNLOCK(bp); + + CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list", + __func__, d->bd_pid, d->bd_writer ? "writer" : "active"); + + if (op_w == 0) + EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); +} + +/* + * Add d to the list of active bp filters. 
+ * Reuqires bpf_attachd() to be called before + */ +static void +bpf_upgraded(struct bpf_d *d) +{ + struct bpf_if *bp; + + BPF_LOCK_ASSERT(); + + bp = d->bd_bif; + + /* + * Filter can be set several times without specifying interface. + * Mark d as reader and exit. + */ + if (bp == NULL) { + BPFD_LOCK(d); + d->bd_writer = 0; + BPFD_UNLOCK(d); + return; + } + + BPFIF_WLOCK(bp); + BPFD_LOCK(d); + + /* Remove from writers-only list */ + LIST_REMOVE(d, bd_next); + LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); + /* Mark d as reader */ + d->bd_writer = 0; + + BPFD_UNLOCK(d); + BPFIF_WUNLOCK(bp); + + CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid); EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); } @@ -584,26 +699,47 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) static void bpf_detachd(struct bpf_d *d) { + BPF_LOCK(); + bpf_detachd_locked(d); + BPF_UNLOCK(); +} + +static void +bpf_detachd_locked(struct bpf_d *d) +{ int error; struct bpf_if *bp; struct ifnet *ifp; - bp = d->bd_bif; - BPFIF_LOCK(bp); + CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid); + + BPF_LOCK_ASSERT(); + + /* Check if descriptor is attached */ + if ((bp = d->bd_bif) == NULL) + return; + + BPFIF_WLOCK(bp); BPFD_LOCK(d); - ifp = d->bd_bif->bif_ifp; + + /* Save bd_writer value */ + error = d->bd_writer; /* * Remove d from the interface's descriptor list. */ LIST_REMOVE(d, bd_next); - bpf_bpfd_cnt--; + ifp = bp->bif_ifp; d->bd_bif = NULL; BPFD_UNLOCK(d); - BPFIF_UNLOCK(bp); + BPFIF_WUNLOCK(bp); + + bpf_bpfd_cnt--; - EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0); + /* Call event handler iff d is attached */ + if (error == 0) + EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0); /* * Check if this descriptor had requested promiscuous mode. 
@@ -642,14 +778,11 @@ bpf_dtor(void *data) d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); funsetown(&d->bd_sigio); - mtx_lock(&bpf_mtx); - if (d->bd_bif) - bpf_detachd(d); - mtx_unlock(&bpf_mtx); - selwakeuppri(&d->bd_sel, PRINET); + bpf_detachd(d); #ifdef MAC mac_bpfdesc_destroy(d); #endif /* MAC */ + seldrain(&d->bd_sel); knlist_destroy(&d->bd_sel.si_note); callout_drain(&d->bd_callout); bpf_freed(d); @@ -665,7 +798,7 @@ static int bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct bpf_d *d; - int error; + int error, size; d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO); error = devfs_set_cdevpriv(d, bpf_dtor); @@ -683,14 +816,18 @@ bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; - d->bd_pid = td->td_proc->p_pid; + BPF_PID_REFRESH(d, td); #ifdef MAC mac_bpfdesc_init(d); mac_bpfdesc_create(td->td_ucred, d); #endif - mtx_init(&d->bd_mtx, devtoname(dev), "bpf cdev lock", MTX_DEF); - callout_init_mtx(&d->bd_callout, &d->bd_mtx, 0); - knlist_init_mtx(&d->bd_sel.si_note, &d->bd_mtx); + mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF); + callout_init_mtx(&d->bd_callout, &d->bd_lock, 0); + knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock); + + /* Allocate default buffers */ + size = d->bd_bufsize; + bpf_buffer_ioctl_sblen(d, &size); return (0); } @@ -720,7 +857,7 @@ bpfread(struct cdev *dev, struct uio *uio, int ioflag) non_block = ((ioflag & O_NONBLOCK) != 0); BPFD_LOCK(d); - d->bd_pid = curthread->td_proc->p_pid; + BPF_PID_REFRESH_CUR(d); if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { BPFD_UNLOCK(d); return (EOPNOTSUPP); @@ -766,7 +903,7 @@ bpfread(struct cdev *dev, struct uio *uio, int ioflag) BPFD_UNLOCK(d); return (EWOULDBLOCK); } - error = msleep(d, &d->bd_mtx, PRINET|PCATCH, + error = msleep(d, &d->bd_lock, PRINET|PCATCH, "bpf", d->bd_rtout); if (error == EINTR || error == ERESTART) { BPFD_UNLOCK(d); @@ -883,8 +1020,9 @@ bpfwrite(struct 
cdev *dev, struct uio *uio, int ioflag) if (error != 0) return (error); - d->bd_pid = curthread->td_proc->p_pid; + BPF_PID_REFRESH_CUR(d); d->bd_wcount++; + /* XXX: locking required */ if (d->bd_bif == NULL) { d->bd_wdcount++; return (ENXIO); @@ -905,6 +1043,7 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; + /* XXX: bpf_movein() can sleep */ error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, &m, &dst, &hlen, d->bd_wfilter); if (error) { @@ -964,7 +1103,7 @@ static void reset_d(struct bpf_d *d) { - mtx_assert(&d->bd_mtx, MA_OWNED); + BPFD_LOCK_ASSERT(d); if ((d->bd_hbuf != NULL) && (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) { @@ -1030,7 +1169,7 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); - d->bd_pid = td->td_proc->p_pid; + BPF_PID_REFRESH(d, td); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; @@ -1081,7 +1220,9 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, case BIOCGDLTLIST32: case BIOCGRTIMEOUT32: case BIOCSRTIMEOUT32: + BPFD_LOCK(d); d->bd_compat32 = 1; + BPFD_UNLOCK(d); } #endif @@ -1126,7 +1267,9 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, * Get buffer len [for read()]. */ case BIOCGBLEN: + BPFD_LOCK(d); *(u_int *)addr = d->bd_bufsize; + BPFD_UNLOCK(d); break; /* @@ -1181,10 +1324,12 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, * Get current data link type. 
*/ case BIOCGDLT: + BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; + BPF_UNLOCK(); break; /* @@ -1199,6 +1344,7 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, list32 = (struct bpf_dltlist32 *)addr; dltlist.bfl_len = list32->bfl_len; dltlist.bfl_list = PTRIN(list32->bfl_list); + BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { @@ -1206,31 +1352,37 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, if (error == 0) list32->bfl_len = dltlist.bfl_len; } + BPF_UNLOCK(); break; } #endif case BIOCGDLTLIST: + BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, (struct bpf_dltlist *)addr); + BPF_UNLOCK(); break; /* * Set data link type. */ case BIOCSDLT: + BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); + BPF_UNLOCK(); break; /* * Get interface name. */ case BIOCGETIF: + BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { @@ -1240,13 +1392,16 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); } + BPF_UNLOCK(); break; /* * Set interface. */ case BIOCSETIF: + BPF_LOCK(); error = bpf_setif(d, (struct ifreq *)addr); + BPF_UNLOCK(); break; /* @@ -1329,7 +1484,9 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, * Set immediate mode. */ case BIOCIMMEDIATE: + BPFD_LOCK(d); d->bd_immediate = *(u_int *)addr; + BPFD_UNLOCK(d); break; case BIOCVERSION: @@ -1345,21 +1502,27 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, * Get "header already complete" flag */ case BIOCGHDRCMPLT: + BPFD_LOCK(d); *(u_int *)addr = d->bd_hdrcmplt; + BPFD_UNLOCK(d); break; /* * Set "header already complete" flag */ case BIOCSHDRCMPLT: + BPFD_LOCK(d); d->bd_hdrcmplt = *(u_int *)addr ? 
1 : 0; + BPFD_UNLOCK(d); break; /* * Get packet direction flag */ case BIOCGDIRECTION: + BPFD_LOCK(d); *(u_int *)addr = d->bd_direction; + BPFD_UNLOCK(d); break; /* @@ -1374,7 +1537,9 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: + BPFD_LOCK(d); d->bd_direction = direction; + BPFD_UNLOCK(d); break; default: error = EINVAL; @@ -1383,26 +1548,38 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, break; case BIOCFEEDBACK: + BPFD_LOCK(d); d->bd_feedback = *(u_int *)addr; + BPFD_UNLOCK(d); break; case BIOCLOCK: + BPFD_LOCK(d); d->bd_locked = 1; + BPFD_UNLOCK(d); break; case FIONBIO: /* Non-blocking I/O */ break; case FIOASYNC: /* Send signal on receive packets */ + BPFD_LOCK(d); d->bd_async = *(int *)addr; + BPFD_UNLOCK(d); break; case FIOSETOWN: + /* + * XXX: Add some sort of locking here? + * fsetown() can sleep. + */ error = fsetown(*(int *)addr, &d->bd_sigio); break; case FIOGETOWN: + BPFD_LOCK(d); *(int *)addr = fgetown(&d->bd_sigio); + BPFD_UNLOCK(d); break; /* This is deprecated, FIOSETOWN should be used instead. */ @@ -1423,16 +1600,23 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, if (sig >= NSIG) error = EINVAL; - else + else { + BPFD_LOCK(d); d->bd_sig = sig; + BPFD_UNLOCK(d); + } break; } case BIOCGRSIG: + BPFD_LOCK(d); *(u_int *)addr = d->bd_sig; + BPFD_UNLOCK(d); break; case BIOCGETBUFMODE: + BPFD_LOCK(d); *(u_int *)addr = d->bd_bufmode; + BPFD_UNLOCK(d); break; case BIOCSETBUFMODE: @@ -1487,95 +1671,130 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. + * + * Note we need global lock here to serialize bpf_setf() and bpf_setif() calls + * since reading d->bd_bif can't be protected by d or interface lock due to + * lock order. 
+ * + * Additionally, we have to acquire interface write lock due to bpf_mtap() uses + * interface read lock to read all filers. + * */ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) { +#ifdef COMPAT_FREEBSD32 + struct bpf_program fp_swab; + struct bpf_program32 *fp32; +#endif struct bpf_insn *fcode, *old; - u_int wfilter, flen, size; #ifdef BPF_JITTER - bpf_jit_filter *ofunc; + bpf_jit_filter *jfunc, *ofunc; #endif -#ifdef COMPAT_FREEBSD32 - struct bpf_program32 *fp32; - struct bpf_program fp_swab; + size_t size; + u_int flen; + int need_upgrade; - if (cmd == BIOCSETWF32 || cmd == BIOCSETF32 || cmd == BIOCSETFNR32) { +#ifdef COMPAT_FREEBSD32 + switch (cmd) { + case BIOCSETF32: + case BIOCSETWF32: + case BIOCSETFNR32: fp32 = (struct bpf_program32 *)fp; fp_swab.bf_len = fp32->bf_len; fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns; fp = &fp_swab; - if (cmd == BIOCSETWF32) + switch (cmd) { + case BIOCSETF32: + cmd = BIOCSETF; + break; + case BIOCSETWF32: cmd = BIOCSETWF; + break; + } + break; } #endif - if (cmd == BIOCSETWF) { - old = d->bd_wfilter; - wfilter = 1; -#ifdef BPF_JITTER - ofunc = NULL; -#endif - } else { - wfilter = 0; - old = d->bd_rfilter; + + fcode = NULL; #ifdef BPF_JITTER - ofunc = d->bd_bfilter; + jfunc = ofunc = NULL; #endif - } - if (fp->bf_insns == NULL) { - if (fp->bf_len != 0) + need_upgrade = 0; + + /* + * Check new filter validness before acquiring any locks. + * Allocate memory for new filter, if needed. + */ + flen = fp->bf_len; + if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0)) + return (EINVAL); + size = flen * sizeof(*fp->bf_insns); + if (size > 0) { + /* We're setting up new filter. Copy and check actual data. 
*/ + fcode = malloc(size, M_BPF, M_WAITOK); + if (copyin(fp->bf_insns, fcode, size) != 0 || + !bpf_validate(fcode, flen)) { + free(fcode, M_BPF); return (EINVAL); - BPFD_LOCK(d); - if (wfilter) - d->bd_wfilter = NULL; - else { - d->bd_rfilter = NULL; -#ifdef BPF_JITTER - d->bd_bfilter = NULL; -#endif - if (cmd == BIOCSETF) - reset_d(d); } - BPFD_UNLOCK(d); - if (old != NULL) - free((caddr_t)old, M_BPF); #ifdef BPF_JITTER - if (ofunc != NULL) - bpf_destroy_jit_filter(ofunc); + /* Filter is copied inside fcode and is perfectly valid. */ + jfunc = bpf_jitter(fcode, flen); #endif - return (0); } - flen = fp->bf_len; - if (flen > bpf_maxinsns) - return (EINVAL); - size = flen * sizeof(*fp->bf_insns); - fcode = (struct bpf_insn *)malloc(size, M_BPF, M_WAITOK); - if (copyin((caddr_t)fp->bf_insns, (caddr_t)fcode, size) == 0 && - bpf_validate(fcode, (int)flen)) { - BPFD_LOCK(d); - if (wfilter) - d->bd_wfilter = fcode; - else { - d->bd_rfilter = fcode; + BPF_LOCK(); + + /* + * Set up new filter. + * Protect filter change by interface lock. + * Additionally, we are protected by global lock here. + */ + if (d->bd_bif != NULL) + BPFIF_WLOCK(d->bd_bif); + BPFD_LOCK(d); + if (cmd == BIOCSETWF) { + old = d->bd_wfilter; + d->bd_wfilter = fcode; + } else { + old = d->bd_rfilter; + d->bd_rfilter = fcode; #ifdef BPF_JITTER - d->bd_bfilter = bpf_jitter(fcode, flen); + ofunc = d->bd_bfilter; + d->bd_bfilter = jfunc; #endif - if (cmd == BIOCSETF) - reset_d(d); + if (cmd == BIOCSETF) + reset_d(d); + + if (fcode != NULL) { + /* + * Do not require upgrade by first BIOCSETF + * (used to set snaplen) by pcap_open_live(). 
+ */ + if (d->bd_writer != 0 && --d->bd_writer == 0) + need_upgrade = 1; + CTR4(KTR_NET, "%s: filter function set by pid %d, " + "bd_writer counter %d, need_upgrade %d", + __func__, d->bd_pid, d->bd_writer, need_upgrade); } - BPFD_UNLOCK(d); - if (old != NULL) - free((caddr_t)old, M_BPF); + } + BPFD_UNLOCK(d); + if (d->bd_bif != NULL) + BPFIF_WUNLOCK(d->bd_bif); + if (old != NULL) + free(old, M_BPF); #ifdef BPF_JITTER - if (ofunc != NULL) - bpf_destroy_jit_filter(ofunc); + if (ofunc != NULL) + bpf_destroy_jit_filter(ofunc); #endif - return (0); - } - free((caddr_t)fcode, M_BPF); - return (EINVAL); + /* Move d to active readers list. */ + if (need_upgrade) + bpf_upgraded(d); + + BPF_UNLOCK(); + return (0); } /* @@ -1589,28 +1808,30 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr) struct bpf_if *bp; struct ifnet *theywant; + BPF_LOCK_ASSERT(); + theywant = ifunit(ifr->ifr_name); if (theywant == NULL || theywant->if_bpf == NULL) return (ENXIO); bp = theywant->if_bpf; + /* Check if interface is not being detached from BPF */ + BPFIF_RLOCK(bp); + if (bp->flags & BPFIF_FLAG_DYING) { + BPFIF_RUNLOCK(bp); + return (ENXIO); + } + BPFIF_RUNLOCK(bp); + /* * Behavior here depends on the buffering model. If we're using * kernel memory buffers, then we can allocate them here. If we're * using zero-copy, then the user process must have registered * buffers by the time we get here. If not, return an error. - * - * XXXRW: There are locking issues here with multi-threaded use: what - * if two threads try to set the interface at once? */ switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: - if (d->bd_sbuf == NULL) - bpf_buffer_alloc(d); - KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL")); - break; - case BPF_BUFMODE_ZBUF: if (d->bd_sbuf == NULL) return (EINVAL); @@ -1619,15 +1840,8 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr) default: panic("bpf_setif: bufmode %d", d->bd_bufmode); } - if (bp != d->bd_bif) { - if (d->bd_bif) - /* - * Detach if attached to something else. 
- */ - bpf_detachd(d); - + if (bp != d->bd_bif) bpf_attachd(d, bp); - } BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); @@ -1655,7 +1869,7 @@ bpfpoll(struct cdev *dev, int events, struct thread *td) */ revents = events & (POLLOUT | POLLWRNORM); BPFD_LOCK(d); - d->bd_pid = td->td_proc->p_pid; + BPF_PID_REFRESH(d, td); if (events & (POLLIN | POLLRDNORM)) { if (bpf_ready(d)) revents |= events & (POLLIN | POLLRDNORM); @@ -1690,7 +1904,7 @@ bpfkqfilter(struct cdev *dev, struct knote *kn) * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); - d->bd_pid = curthread->td_proc->p_pid; + BPF_PID_REFRESH_CUR(d); kn->kn_fop = &bpfread_filtops; kn->kn_hook = d; knlist_add(&d->bd_sel.si_note, kn, 1); @@ -1746,9 +1960,19 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) struct timeval tv; gottime = 0; - BPFIF_LOCK(bp); + + BPFIF_RLOCK(bp); + LIST_FOREACH(d, &bp->bif_dlist, bd_next) { - BPFD_LOCK(d); + /* + * We are not using any locks for d here because: + * 1) any filter change is protected by interface + * write lock + * 2) destroying/detaching d is protected by interface + * write lock, too + */ + + /* XXX: Do not protect counter for the sake of performance. */ ++d->bd_rcount; /* * NB: We dont call BPF_CHECK_DIRECTION() here since there is no @@ -1764,6 +1988,11 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) #endif slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen); if (slen != 0) { + /* + * Filter matches. Let's to acquire write lock. + */ + BPFD_LOCK(d); + d->bd_fcount++; if (!gottime) { microtime(&tv); @@ -1774,10 +2003,10 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) #endif catchpacket(d, pkt, pktlen, slen, bpf_append_bytes, &tv); + BPFD_UNLOCK(d); } - BPFD_UNLOCK(d); } - BPFIF_UNLOCK(bp); + BPFIF_RUNLOCK(bp); } #define BPF_CHECK_DIRECTION(d, r, i) \ @@ -1786,6 +2015,7 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) /* * Incoming linkage from device drivers, when packet is in an mbuf chain. 
+ * Locking model is explained in bpf_tap(). */ void bpf_mtap(struct bpf_if *bp, struct mbuf *m) @@ -1808,11 +2038,11 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m) pktlen = m_length(m, NULL); - BPFIF_LOCK(bp); + BPFIF_RLOCK(bp); + LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; - BPFD_LOCK(d); ++d->bd_rcount; #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; @@ -1823,6 +2053,8 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m) #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); if (slen != 0) { + BPFD_LOCK(d); + d->bd_fcount++; if (!gottime) { microtime(&tv); @@ -1833,10 +2065,10 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m) #endif catchpacket(d, (u_char *)m, pktlen, slen, bpf_append_mbuf, &tv); + BPFD_UNLOCK(d); } - BPFD_UNLOCK(d); } - BPFIF_UNLOCK(bp); + BPFIF_RUNLOCK(bp); } /* @@ -1871,14 +2103,17 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) mb.m_len = dlen; pktlen += dlen; - BPFIF_LOCK(bp); + + BPFIF_RLOCK(bp); + LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; - BPFD_LOCK(d); ++d->bd_rcount; slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0); if (slen != 0) { + BPFD_LOCK(d); + d->bd_fcount++; if (!gottime) { microtime(&tv); @@ -1889,10 +2124,10 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, bpf_append_mbuf, &tv); + BPFD_UNLOCK(d); } - BPFD_UNLOCK(d); } - BPFIF_UNLOCK(bp); + BPFIF_RUNLOCK(bp); } #undef BPF_CHECK_DIRECTION @@ -2042,7 +2277,7 @@ bpf_freed(struct bpf_d *d) } if (d->bd_wfilter != NULL) free((caddr_t)d->bd_wfilter, M_BPF); - mtx_destroy(&d->bd_mtx); + mtx_destroy(&d->bd_lock); } /* @@ -2072,15 +2307,16 @@ bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) panic("bpfattach"); LIST_INIT(&bp->bif_dlist); + LIST_INIT(&bp->bif_wlist); bp->bif_ifp = ifp; 
bp->bif_dlt = dlt; - mtx_init(&bp->bif_mtx, "bpf interface lock", NULL, MTX_DEF); + rw_init(&bp->bif_lock, "bpf interface lock"); KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized")); *driverp = bp; - mtx_lock(&bpf_mtx); + BPF_LOCK(); LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); - mtx_unlock(&bpf_mtx); + BPF_UNLOCK(); /* * Compute the length of the bpf header. This is not necessarily @@ -2095,42 +2331,95 @@ bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) } /* - * Detach bpf from an interface. This involves detaching each descriptor - * associated with the interface, and leaving bd_bif NULL. Notify each - * descriptor as it's detached so that any sleepers wake up and get - * ENXIO. + * Detach bpf from an interface. This involves detaching each descriptor + * associated with the interface. Notify each descriptor as it's detached + * so that any sleepers wake up and get ENXIO. */ void bpfdetach(struct ifnet *ifp) { struct bpf_if *bp; struct bpf_d *d; +#ifdef INVARIANTS + int ndetached; - /* Locate BPF interface information */ - mtx_lock(&bpf_mtx); - LIST_FOREACH(bp, &bpf_iflist, bif_next) { - if (ifp == bp->bif_ifp) - break; - } + ndetached = 0; +#endif + + BPF_LOCK(); + /* Find all bpf_if struct's which reference ifp and detach them. */ + do { + LIST_FOREACH(bp, &bpf_iflist, bif_next) { + if (ifp == bp->bif_ifp) + break; + } + if (bp != NULL) + LIST_REMOVE(bp, bif_next); + + if (bp != NULL) { +#ifdef INVARIANTS + ndetached++; +#endif + while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) { + bpf_detachd_locked(d); + BPFD_LOCK(d); + bpf_wakeup(d); + BPFD_UNLOCK(d); + } + /* Free writer-only descriptors */ + while ((d = LIST_FIRST(&bp->bif_wlist)) != NULL) { + bpf_detachd_locked(d); + BPFD_LOCK(d); + bpf_wakeup(d); + BPFD_UNLOCK(d); + } + + /* + * Delay freing bp till interface is detached + * and all routes through this interface are removed. + * Mark bp as detached to restrict new consumers. 
+ */ + BPFIF_WLOCK(bp); + bp->flags |= BPFIF_FLAG_DYING; + BPFIF_WUNLOCK(bp); + } + } while (bp != NULL); + BPF_UNLOCK(); - /* Interface wasn't attached */ - if ((bp == NULL) || (bp->bif_ifp == NULL)) { - mtx_unlock(&bpf_mtx); +#ifdef INVARIANTS + if (ndetached == 0) printf("bpfdetach: %s was not attached\n", ifp->if_xname); +#endif +} + +/* + * Interface departure handler. + * Note departure event does not guarantee interface is going down. + */ +static void +bpf_ifdetach(void *arg __unused, struct ifnet *ifp) +{ + struct bpf_if *bp; + + BPF_LOCK(); + if ((bp = ifp->if_bpf) == NULL) { + BPF_UNLOCK(); return; } - LIST_REMOVE(bp, bif_next); - mtx_unlock(&bpf_mtx); - - while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) { - bpf_detachd(d); - BPFD_LOCK(d); - bpf_wakeup(d); - BPFD_UNLOCK(d); + /* Check if bpfdetach() was called previously */ + if ((bp->flags & BPFIF_FLAG_DYING) == 0) { + BPF_UNLOCK(); + return; } - mtx_destroy(&bp->bif_mtx); + CTR3(KTR_NET, "%s: freing BPF instance %p for interface %p", + __func__, bp, ifp); + + ifp->if_bpf = NULL; + BPF_UNLOCK(); + + rw_destroy(&bp->bif_lock); free(bp, M_BPF); } @@ -2144,24 +2433,22 @@ bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) struct ifnet *ifp; struct bpf_if *bp; + BPF_LOCK_ASSERT(); + ifp = d->bd_bif->bif_ifp; n = 0; error = 0; - mtx_lock(&bpf_mtx); LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; if (bfl->bfl_list != NULL) { - if (n >= bfl->bfl_len) { - mtx_unlock(&bpf_mtx); + if (n >= bfl->bfl_len) return (ENOMEM); - } error = copyout(&bp->bif_dlt, bfl->bfl_list + n, sizeof(u_int)); } n++; } - mtx_unlock(&bpf_mtx); bfl->bfl_len = n; return (error); } @@ -2176,18 +2463,19 @@ bpf_setdlt(struct bpf_d *d, u_int dlt) struct ifnet *ifp; struct bpf_if *bp; + BPF_LOCK_ASSERT(); + if (d->bd_bif->bif_dlt == dlt) return (0); ifp = d->bd_bif->bif_ifp; - mtx_lock(&bpf_mtx); + LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } - 
mtx_unlock(&bpf_mtx); + if (bp != NULL) { opromisc = d->bd_promisc; - bpf_detachd(d); bpf_attachd(d, bp); BPFD_LOCK(d); reset_d(d); @@ -2216,6 +2504,11 @@ bpf_drvinit(void *unused) dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); /* For compatibility */ make_dev_alias(dev, "bpf0"); + + /* Register interface departure handler */ + bpf_ifdetach_cookie = EVENTHANDLER_REGISTER( + ifnet_departure_event, bpf_ifdetach, NULL, + EVENTHANDLER_PRI_ANY); } /* @@ -2229,9 +2522,9 @@ bpf_zero_counters(void) struct bpf_if *bp; struct bpf_d *bd; - mtx_lock(&bpf_mtx); + BPF_LOCK(); LIST_FOREACH(bp, &bpf_iflist, bif_next) { - BPFIF_LOCK(bp); + BPFIF_RLOCK(bp); LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { BPFD_LOCK(bd); bd->bd_rcount = 0; @@ -2242,11 +2535,14 @@ bpf_zero_counters(void) bd->bd_zcopy = 0; BPFD_UNLOCK(bd); } - BPFIF_UNLOCK(bp); + BPFIF_RUNLOCK(bp); } - mtx_unlock(&bpf_mtx); + BPF_UNLOCK(); } +/* + * Fill filter statistics + */ static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { @@ -2254,6 +2550,7 @@ bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) bzero(d, sizeof(*d)); BPFD_LOCK_ASSERT(bd); d->bd_structsize = sizeof(*d); + /* XXX: reading should be protected by global lock */ d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; @@ -2278,6 +2575,9 @@ bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) d->bd_bufmode = bd->bd_bufmode; } +/* + * Handle `netstat -B' stats request + */ static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) { @@ -2315,24 +2615,31 @@ bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) if (bpf_bpfd_cnt == 0) return (SYSCTL_OUT(req, 0, 0)); xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK); - mtx_lock(&bpf_mtx); + BPF_LOCK(); if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) { - mtx_unlock(&bpf_mtx); + BPF_UNLOCK(); free(xbdbuf, M_BPF); return (ENOMEM); } index = 0; LIST_FOREACH(bp, &bpf_iflist, bif_next) { - BPFIF_LOCK(bp); + BPFIF_RLOCK(bp); + /* Send writers-only first */ + 
LIST_FOREACH(bd, &bp->bif_wlist, bd_next) { + xbd = &xbdbuf[index++]; + BPFD_LOCK(bd); + bpfstats_fill_xbpf(xbd, bd); + BPFD_UNLOCK(bd); + } LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; BPFD_LOCK(bd); bpfstats_fill_xbpf(xbd, bd); BPFD_UNLOCK(bd); } - BPFIF_UNLOCK(bp); + BPFIF_RUNLOCK(bp); } - mtx_unlock(&bpf_mtx); + BPF_UNLOCK(); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); free(xbdbuf, M_BPF); return (error); diff --git a/freebsd/sys/net/bpf.h b/freebsd/sys/net/bpf.h index 726483a5..004815ad 100644 --- a/freebsd/sys/net/bpf.h +++ b/freebsd/sys/net/bpf.h @@ -917,14 +917,21 @@ SYSCTL_DECL(_net_bpf); /* * Descriptor associated with each attached hardware interface. + * FIXME: this structure is exposed to external callers to speed up + * bpf_peers_present() call. However we cover all fields not needed by + * this function via BPF_INTERNAL define */ struct bpf_if { LIST_ENTRY(bpf_if) bif_next; /* list of all interfaces */ LIST_HEAD(, bpf_d) bif_dlist; /* descriptor list */ +#ifdef BPF_INTERNAL u_int bif_dlt; /* link layer type */ u_int bif_hdrlen; /* length of header (with padding) */ struct ifnet *bif_ifp; /* corresponding interface */ - struct mtx bif_mtx; /* mutex for interface */ + struct rwlock bif_lock; /* interface lock */ + LIST_HEAD(, bpf_d) bif_wlist; /* writer-only list */ + int flags; /* Interface flags */ +#endif }; void bpf_bufheld(struct bpf_d *d); diff --git a/freebsd/sys/net/bpf_buffer.c b/freebsd/sys/net/bpf_buffer.c index 7ebfb0a8..382497f6 100644 --- a/freebsd/sys/net/bpf_buffer.c +++ b/freebsd/sys/net/bpf_buffer.c @@ -4,7 +4,7 @@ * Copyright (c) 2007 Seccuris Inc. * All rights reserved. * - * This sofware was developed by Robert N. M. Watson under contract to + * This software was developed by Robert N. M. Watson under contract to * Seccuris Inc. 
* * Redistribution and use in source and binary forms, with or without @@ -95,21 +95,6 @@ static int bpf_maxbufsize = BPF_MAXBUFSIZE; SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW, &bpf_maxbufsize, 0, "Default capture buffer in bytes"); -void -bpf_buffer_alloc(struct bpf_d *d) -{ - - KASSERT(d->bd_fbuf == NULL, ("bpf_buffer_alloc: bd_fbuf != NULL")); - KASSERT(d->bd_sbuf == NULL, ("bpf_buffer_alloc: bd_sbuf != NULL")); - KASSERT(d->bd_hbuf == NULL, ("bpf_buffer_alloc: bd_hbuf != NULL")); - - d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK); - d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK); - d->bd_hbuf = NULL; - d->bd_slen = 0; - d->bd_hlen = 0; -} - /* * Simple data copy to the current kernel buffer. */ @@ -185,18 +170,42 @@ int bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i) { u_int size; + caddr_t fbuf, sbuf; - BPFD_LOCK(d); - if (d->bd_bif != NULL) { - BPFD_UNLOCK(d); - return (EINVAL); - } size = *i; if (size > bpf_maxbufsize) *i = size = bpf_maxbufsize; else if (size < BPF_MINBUFSIZE) *i = size = BPF_MINBUFSIZE; + + /* Allocate buffers immediately */ + fbuf = (caddr_t)malloc(size, M_BPF, M_WAITOK); + sbuf = (caddr_t)malloc(size, M_BPF, M_WAITOK); + + BPFD_LOCK(d); + if (d->bd_bif != NULL) { + /* Interface already attached, unable to change buffers */ + BPFD_UNLOCK(d); + free(fbuf, M_BPF); + free(sbuf, M_BPF); + return (EINVAL); + } + + /* Free old buffers if set */ + if (d->bd_fbuf != NULL) + free(d->bd_fbuf, M_BPF); + if (d->bd_sbuf != NULL) + free(d->bd_sbuf, M_BPF); + + /* Fill in new data */ d->bd_bufsize = size; + d->bd_fbuf = fbuf; + d->bd_sbuf = sbuf; + + d->bd_hbuf = NULL; + d->bd_slen = 0; + d->bd_hlen = 0; + BPFD_UNLOCK(d); return (0); } diff --git a/freebsd/sys/net/bpf_buffer.h b/freebsd/sys/net/bpf_buffer.h index 82d0310b..c1dc1f3a 100644 --- a/freebsd/sys/net/bpf_buffer.h +++ b/freebsd/sys/net/bpf_buffer.h @@ -2,7 +2,7 @@ * Copyright (c) 2007 Seccuris Inc. * All rights reserved. 
* - * This sofware was developed by Robert N. M. Watson under contract to + * This software was developed by Robert N. M. Watson under contract to * Seccuris Inc. * * Redistribution and use in source and binary forms, with or without @@ -36,7 +36,6 @@ #error "no user-serviceable parts inside" #endif -void bpf_buffer_alloc(struct bpf_d *d); void bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len); void bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, diff --git a/freebsd/sys/net/bpf_zerocopy.h b/freebsd/sys/net/bpf_zerocopy.h index c541a15d..a5709b86 100644 --- a/freebsd/sys/net/bpf_zerocopy.h +++ b/freebsd/sys/net/bpf_zerocopy.h @@ -2,7 +2,7 @@ * Copyright (c) 2007 Seccuris Inc. * All rights reserved. * - * This sofware was developed by Robert N. M. Watson under contract to + * This software was developed by Robert N. M. Watson under contract to * Seccuris Inc. * * Redistribution and use in source and binary forms, with or without diff --git a/freebsd/sys/net/bpfdesc.h b/freebsd/sys/net/bpfdesc.h index 03cb20dd..c3265ce1 100644 --- a/freebsd/sys/net/bpfdesc.h +++ b/freebsd/sys/net/bpfdesc.h @@ -79,6 +79,7 @@ struct bpf_d { u_char bd_promisc; /* true if listening promiscuously */ u_char bd_state; /* idle, waiting, or timed out */ u_char bd_immediate; /* true to return on packet arrival */ + u_char bd_writer; /* non-zero if d is writer-only */ int bd_hdrcmplt; /* false to fill in src lladdr automatically */ int bd_direction; /* select packet direction */ int bd_feedback; /* true to feed back sent packets */ @@ -86,7 +87,7 @@ struct bpf_d { int bd_sig; /* signal to send upon packet reception */ struct sigio * bd_sigio; /* information for async I/O */ struct selinfo bd_sel; /* bsd select info */ - struct mtx bd_mtx; /* mutex for this descriptor */ + struct mtx bd_lock; /* per-descriptor lock */ struct callout bd_callout; /* for BPF timeouts with select */ struct label *bd_label; /* MAC label for descriptor */ 
u_int64_t bd_fcount; /* number of packets which matched filter */ @@ -105,10 +106,16 @@ struct bpf_d { #define BPF_WAITING 1 /* waiting for read timeout in select */ #define BPF_TIMED_OUT 2 /* read timeout has expired in select */ -#define BPFD_LOCK(bd) mtx_lock(&(bd)->bd_mtx) -#define BPFD_UNLOCK(bd) mtx_unlock(&(bd)->bd_mtx) -#define BPFD_LOCK_ASSERT(bd) mtx_assert(&(bd)->bd_mtx, MA_OWNED) +#define BPFD_LOCK(bd) mtx_lock(&(bd)->bd_lock) +#define BPFD_UNLOCK(bd) mtx_unlock(&(bd)->bd_lock) +#define BPFD_LOCK_ASSERT(bd) mtx_assert(&(bd)->bd_lock, MA_OWNED) +#define BPF_PID_REFRESH(bd, td) (bd)->bd_pid = (td)->td_proc->p_pid +#define BPF_PID_REFRESH_CUR(bd) (bd)->bd_pid = curthread->td_proc->p_pid + +#define BPF_LOCK() mtx_lock(&bpf_mtx) +#define BPF_UNLOCK() mtx_unlock(&bpf_mtx) +#define BPF_LOCK_ASSERT() mtx_assert(&bpf_mtx, MA_OWNED) /* * External representation of the bpf descriptor */ @@ -143,7 +150,11 @@ struct xbpf_d { u_int64_t bd_spare[4]; }; -#define BPFIF_LOCK(bif) mtx_lock(&(bif)->bif_mtx) -#define BPFIF_UNLOCK(bif) mtx_unlock(&(bif)->bif_mtx) +#define BPFIF_RLOCK(bif) rw_rlock(&(bif)->bif_lock) +#define BPFIF_RUNLOCK(bif) rw_runlock(&(bif)->bif_lock) +#define BPFIF_WLOCK(bif) rw_wlock(&(bif)->bif_lock) +#define BPFIF_WUNLOCK(bif) rw_wunlock(&(bif)->bif_lock) + +#define BPFIF_FLAG_DYING 1 /* Reject new bpf consumers */ #endif diff --git a/freebsd/sys/net/bridgestp.c b/freebsd/sys/net/bridgestp.c index cc7f4e6f..1b2ef7cf 100644 --- a/freebsd/sys/net/bridgestp.c +++ b/freebsd/sys/net/bridgestp.c @@ -129,14 +129,14 @@ static int bstp_rerooted(struct bstp_state *, struct bstp_port *); static uint32_t bstp_calc_path_cost(struct bstp_port *); static void bstp_notify_state(void *, int); static void bstp_notify_rtage(void *, int); -static void bstp_ifupdstatus(struct bstp_state *, struct bstp_port *); +static void bstp_ifupdstatus(void *, int); static void bstp_enable_port(struct bstp_state *, struct bstp_port *); static void bstp_disable_port(struct bstp_state 
*, struct bstp_port *); static void bstp_tick(void *); static void bstp_timer_start(struct bstp_timer *, uint16_t); static void bstp_timer_stop(struct bstp_timer *); static void bstp_timer_latch(struct bstp_timer *); -static int bstp_timer_expired(struct bstp_timer *); +static int bstp_timer_dectest(struct bstp_timer *); static void bstp_hello_timer_expiry(struct bstp_state *, struct bstp_port *); static void bstp_message_age_expiry(struct bstp_state *, @@ -448,7 +448,7 @@ bstp_pdu_flags(struct bstp_port *bp) return (flags); } -struct mbuf * +void bstp_input(struct bstp_port *bp, struct ifnet *ifp, struct mbuf *m) { struct bstp_state *bs = bp->bp_bs; @@ -458,7 +458,7 @@ bstp_input(struct bstp_port *bp, struct ifnet *ifp, struct mbuf *m) if (bp->bp_active == 0) { m_freem(m); - return (NULL); + return; } BSTP_LOCK(bs); @@ -523,7 +523,6 @@ out: BSTP_UNLOCK(bs); if (m) m_freem(m); - return (NULL); } static void @@ -1680,7 +1679,7 @@ bstp_set_autoptp(struct bstp_port *bp, int set) if (set) { bp->bp_flags |= BSTP_PORT_AUTOPTP; if (bp->bp_role != BSTP_ROLE_DISABLED) - bstp_ifupdstatus(bs, bp); + taskqueue_enqueue(taskqueue_swi, &bp->bp_mediatask); } else bp->bp_flags &= ~BSTP_PORT_AUTOPTP; BSTP_UNLOCK(bs); @@ -1770,85 +1769,93 @@ bstp_notify_rtage(void *arg, int pending) } void -bstp_linkstate(struct ifnet *ifp, int state) +bstp_linkstate(struct bstp_port *bp) { - struct bstp_state *bs; - struct bstp_port *bp; + struct bstp_state *bs = bp->bp_bs; - /* search for the stp port */ - mtx_lock(&bstp_list_mtx); - LIST_FOREACH(bs, &bstp_list, bs_list) { - BSTP_LOCK(bs); - LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { - if (bp->bp_ifp == ifp) { - bstp_ifupdstatus(bs, bp); - bstp_update_state(bs, bp); - /* it only exists once so return */ - BSTP_UNLOCK(bs); - mtx_unlock(&bstp_list_mtx); - return; - } - } - BSTP_UNLOCK(bs); - } - mtx_unlock(&bstp_list_mtx); + if (!bp->bp_active) + return; + + bstp_ifupdstatus(bp, 0); + BSTP_LOCK(bs); + bstp_update_state(bs, bp); + BSTP_UNLOCK(bs); } 
static void -bstp_ifupdstatus(struct bstp_state *bs, struct bstp_port *bp) +bstp_ifupdstatus(void *arg, int pending) { + struct bstp_port *bp = (struct bstp_port *)arg; + struct bstp_state *bs = bp->bp_bs; struct ifnet *ifp = bp->bp_ifp; struct ifmediareq ifmr; - int error = 0; + int error, changed; - BSTP_LOCK_ASSERT(bs); + if (!bp->bp_active) + return; bzero((char *)&ifmr, sizeof(ifmr)); error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (caddr_t)&ifmr); + BSTP_LOCK(bs); + changed = 0; if ((error == 0) && (ifp->if_flags & IFF_UP)) { if (ifmr.ifm_status & IFM_ACTIVE) { /* A full-duplex link is assumed to be point to point */ if (bp->bp_flags & BSTP_PORT_AUTOPTP) { - bp->bp_ptp_link = - ifmr.ifm_active & IFM_FDX ? 1 : 0; + int fdx; + + fdx = ifmr.ifm_active & IFM_FDX ? 1 : 0; + if (bp->bp_ptp_link ^ fdx) { + bp->bp_ptp_link = fdx; + changed = 1; + } } /* Calc the cost if the link was down previously */ if (bp->bp_flags & BSTP_PORT_PNDCOST) { - bp->bp_path_cost = bstp_calc_path_cost(bp); + uint32_t cost; + + cost = bstp_calc_path_cost(bp); + if (bp->bp_path_cost != cost) { + bp->bp_path_cost = cost; + changed = 1; + } bp->bp_flags &= ~BSTP_PORT_PNDCOST; } - if (bp->bp_role == BSTP_ROLE_DISABLED) + if (bp->bp_role == BSTP_ROLE_DISABLED) { bstp_enable_port(bs, bp); + changed = 1; + } } else { if (bp->bp_role != BSTP_ROLE_DISABLED) { bstp_disable_port(bs, bp); + changed = 1; if ((bp->bp_flags & BSTP_PORT_ADMEDGE) && bp->bp_protover == BSTP_PROTO_RSTP) bp->bp_operedge = 1; } } - return; - } - - if (bp->bp_infois != BSTP_INFO_DISABLED) + } else if (bp->bp_infois != BSTP_INFO_DISABLED) { bstp_disable_port(bs, bp); + changed = 1; + } + if (changed) + bstp_assign_roles(bs); + BSTP_UNLOCK(bs); } static void bstp_enable_port(struct bstp_state *bs, struct bstp_port *bp) { bp->bp_infois = BSTP_INFO_AGED; - bstp_assign_roles(bs); } static void bstp_disable_port(struct bstp_state *bs, struct bstp_port *bp) { bp->bp_infois = BSTP_INFO_DISABLED; - bstp_assign_roles(bs); } static void @@ 
-1862,30 +1869,34 @@ bstp_tick(void *arg) if (bs->bs_running == 0) return; - /* slow timer to catch missed link events */ - if (bstp_timer_expired(&bs->bs_link_timer)) { - LIST_FOREACH(bp, &bs->bs_bplist, bp_next) - bstp_ifupdstatus(bs, bp); + CURVNET_SET(bs->bs_vnet); + + /* poll link events on interfaces that do not support linkstate */ + if (bstp_timer_dectest(&bs->bs_link_timer)) { + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + if (!(bp->bp_ifp->if_capabilities & IFCAP_LINKSTATE)) + taskqueue_enqueue(taskqueue_swi, &bp->bp_mediatask); + } bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER); } LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { /* no events need to happen for these */ - bstp_timer_expired(&bp->bp_tc_timer); - bstp_timer_expired(&bp->bp_recent_root_timer); - bstp_timer_expired(&bp->bp_forward_delay_timer); - bstp_timer_expired(&bp->bp_recent_backup_timer); + bstp_timer_dectest(&bp->bp_tc_timer); + bstp_timer_dectest(&bp->bp_recent_root_timer); + bstp_timer_dectest(&bp->bp_forward_delay_timer); + bstp_timer_dectest(&bp->bp_recent_backup_timer); - if (bstp_timer_expired(&bp->bp_hello_timer)) + if (bstp_timer_dectest(&bp->bp_hello_timer)) bstp_hello_timer_expiry(bs, bp); - if (bstp_timer_expired(&bp->bp_message_age_timer)) + if (bstp_timer_dectest(&bp->bp_message_age_timer)) bstp_message_age_expiry(bs, bp); - if (bstp_timer_expired(&bp->bp_migrate_delay_timer)) + if (bstp_timer_dectest(&bp->bp_migrate_delay_timer)) bstp_migrate_delay_expiry(bs, bp); - if (bstp_timer_expired(&bp->bp_edge_delay_timer)) + if (bstp_timer_dectest(&bp->bp_edge_delay_timer)) bstp_edge_delay_expiry(bs, bp); /* update the various state machines for the port */ @@ -1895,6 +1906,8 @@ bstp_tick(void *arg) bp->bp_txcount--; } + CURVNET_RESTORE(); + callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs); } @@ -1922,7 +1935,7 @@ bstp_timer_latch(struct bstp_timer *t) } static int -bstp_timer_expired(struct bstp_timer *t) +bstp_timer_dectest(struct bstp_timer *t) { if (t->active == 0 || 
t->latched) return (0); @@ -2010,24 +2023,33 @@ bstp_reinit(struct bstp_state *bs) struct bstp_port *bp; struct ifnet *ifp, *mif; u_char *e_addr; + void *bridgeptr; static const u_char llzero[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ BSTP_LOCK_ASSERT(bs); + if (LIST_EMPTY(&bs->bs_bplist)) + goto disablestp; + mif = NULL; + bridgeptr = LIST_FIRST(&bs->bs_bplist)->bp_ifp->if_bridge; + KASSERT(bridgeptr != NULL, ("Invalid bridge pointer")); /* * Search through the Ethernet adapters and find the one with the - * lowest value. The adapter which we take the MAC address from does - * not need to be part of the bridge, it just needs to be a unique - * value. + * lowest value. Make sure the adapter which we take the MAC address + * from is part of this bridge, so we can have more than one independent + * bridges in the same STP domain. */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_type != IFT_ETHER) - continue; + continue; /* Not Ethernet */ + + if (ifp->if_bridge != bridgeptr) + continue; /* Not part of our bridge */ if (bstp_addr_cmp(IF_LLADDR(ifp), llzero) == 0) - continue; + continue; /* No mac address set */ if (mif == NULL) { mif = ifp; @@ -2039,21 +2061,8 @@ bstp_reinit(struct bstp_state *bs) } } IFNET_RUNLOCK_NOSLEEP(); - - if (LIST_EMPTY(&bs->bs_bplist) || mif == NULL) { - /* Set the bridge and root id (lower bits) to zero */ - bs->bs_bridge_pv.pv_dbridge_id = - ((uint64_t)bs->bs_bridge_priority) << 48; - bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id; - bs->bs_root_pv = bs->bs_bridge_pv; - /* Disable any remaining ports, they will have no MAC address */ - LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { - bp->bp_infois = BSTP_INFO_DISABLED; - bstp_set_port_role(bp, BSTP_ROLE_DISABLED); - } - callout_stop(&bs->bs_bstpcallout); - return; - } + if (mif == NULL) + goto disablestp; e_addr = IF_LLADDR(mif); bs->bs_bridge_pv.pv_dbridge_id = @@ -2076,11 +2085,25 @@ bstp_reinit(struct bstp_state *bs) LIST_FOREACH(bp, &bs->bs_bplist, 
bp_next) { bp->bp_port_id = (bp->bp_priority << 8) | (bp->bp_ifp->if_index & 0xfff); - bstp_ifupdstatus(bs, bp); + taskqueue_enqueue(taskqueue_swi, &bp->bp_mediatask); } bstp_assign_roles(bs); bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER); + return; + +disablestp: + /* Set the bridge and root id (lower bits) to zero */ + bs->bs_bridge_pv.pv_dbridge_id = + ((uint64_t)bs->bs_bridge_priority) << 48; + bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id; + bs->bs_root_pv = bs->bs_bridge_pv; + /* Disable any remaining ports, they will have no MAC address */ + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + bp->bp_infois = BSTP_INFO_DISABLED; + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + } + callout_stop(&bs->bs_bstpcallout); } static int @@ -2090,10 +2113,8 @@ bstp_modevent(module_t mod, int type, void *data) case MOD_LOAD: mtx_init(&bstp_list_mtx, "bridgestp list", NULL, MTX_DEF); LIST_INIT(&bstp_list); - bstp_linkstate_p = bstp_linkstate; break; case MOD_UNLOAD: - bstp_linkstate_p = NULL; mtx_destroy(&bstp_list_mtx); break; default: @@ -2128,6 +2149,7 @@ bstp_attach(struct bstp_state *bs, struct bstp_cb_ops *cb) bs->bs_protover = BSTP_PROTO_RSTP; bs->bs_state_cb = cb->bcb_state; bs->bs_rtage_cb = cb->bcb_rtage; + bs->bs_vnet = curvnet; getmicrotime(&bs->bs_last_tc_time); @@ -2184,6 +2206,7 @@ bstp_create(struct bstp_state *bs, struct bstp_port *bp, struct ifnet *ifp) bp->bp_priority = BSTP_DEFAULT_PORT_PRIORITY; TASK_INIT(&bp->bp_statetask, 0, bstp_notify_state, bp); TASK_INIT(&bp->bp_rtagetask, 0, bstp_notify_rtage, bp); + TASK_INIT(&bp->bp_mediatask, 0, bstp_ifupdstatus, bp); /* Init state */ bp->bp_infois = BSTP_INFO_DISABLED; @@ -2247,4 +2270,5 @@ bstp_destroy(struct bstp_port *bp) KASSERT(bp->bp_active == 0, ("port is still attached")); taskqueue_drain(taskqueue_swi, &bp->bp_statetask); taskqueue_drain(taskqueue_swi, &bp->bp_rtagetask); + taskqueue_drain(taskqueue_swi, &bp->bp_mediatask); } diff --git a/freebsd/sys/net/bridgestp.h 
b/freebsd/sys/net/bridgestp.h index 74086fce..cbb8d53c 100644 --- a/freebsd/sys/net/bridgestp.h +++ b/freebsd/sys/net/bridgestp.h @@ -326,6 +326,7 @@ struct bstp_port { uint8_t bp_txcount; struct task bp_statetask; struct task bp_rtagetask; + struct task bp_mediatask; }; /* @@ -358,6 +359,7 @@ struct bstp_state { LIST_HEAD(, bstp_port) bs_bplist; bstp_state_cb_t bs_state_cb; bstp_rtage_cb_t bs_rtage_cb; + struct vnet *bs_vnet; }; #define BSTP_LOCK_INIT(_bs) mtx_init(&(_bs)->bs_mtx, "bstp", NULL, MTX_DEF) @@ -368,8 +370,6 @@ struct bstp_state { extern const uint8_t bstp_etheraddr[]; -extern void (*bstp_linkstate_p)(struct ifnet *ifp, int state); - void bstp_attach(struct bstp_state *, struct bstp_cb_ops *); void bstp_detach(struct bstp_state *); void bstp_init(struct bstp_state *); @@ -378,7 +378,7 @@ int bstp_create(struct bstp_state *, struct bstp_port *, struct ifnet *); int bstp_enable(struct bstp_port *); void bstp_disable(struct bstp_port *); void bstp_destroy(struct bstp_port *); -void bstp_linkstate(struct ifnet *, int); +void bstp_linkstate(struct bstp_port *); int bstp_set_htime(struct bstp_state *, int); int bstp_set_fdelay(struct bstp_state *, int); int bstp_set_maxage(struct bstp_state *, int); @@ -391,6 +391,6 @@ int bstp_set_edge(struct bstp_port *, int); int bstp_set_autoedge(struct bstp_port *, int); int bstp_set_ptp(struct bstp_port *, int); int bstp_set_autoptp(struct bstp_port *, int); -struct mbuf *bstp_input(struct bstp_port *, struct ifnet *, struct mbuf *); +void bstp_input(struct bstp_port *, struct ifnet *, struct mbuf *); #endif /* _KERNEL */ diff --git a/freebsd/sys/net/ieee8023ad_lacp.c b/freebsd/sys/net/ieee8023ad_lacp.c index 6e06ffe5..1b4418a2 100644 --- a/freebsd/sys/net/ieee8023ad_lacp.c +++ b/freebsd/sys/net/ieee8023ad_lacp.c @@ -814,10 +814,10 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) return (NULL); } - if (m->m_flags & M_FLOWID) + if (sc->use_flowid && (m->m_flags & M_FLOWID)) hash = m->m_pkthdr.flowid; else - 
hash = lagg_hashmbuf(m, lsc->lsc_hashkey); + hash = lagg_hashmbuf(sc, m, lsc->lsc_hashkey); hash %= pm->pm_count; lp = pm->pm_map[hash]; diff --git a/freebsd/sys/net/if.c b/freebsd/sys/net/if.c index 918f8c4e..5dffd06d 100644 --- a/freebsd/sys/net/if.c +++ b/freebsd/sys/net/if.c @@ -60,6 +60,8 @@ #include <sys/taskqueue.h> #include <sys/domain.h> #include <sys/jail.h> +#include <sys/priv.h> + #include <machine/stdarg.h> #include <vm/uma.h> @@ -104,7 +106,7 @@ SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); TUNABLE_INT("net.link.ifqmaxlen", &ifqmaxlen); -SYSCTL_UINT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, +SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ @@ -126,7 +128,7 @@ MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); -void (*bstp_linkstate_p)(struct ifnet *ifp, int state); +void (*bridge_linkstate_p)(struct ifnet *ifp); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ @@ -277,6 +279,7 @@ ifindex_alloc_locked(u_short *idxp) IFNET_WLOCK_ASSERT(); +retry: /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. @@ -289,10 +292,12 @@ ifindex_alloc_locked(u_short *idxp) /* Catch if_index overflow. 
*/ if (idx < 1) return (ENOSPC); + if (idx >= V_if_indexlim) { + if_grow(); + goto retry; + } if (idx > V_if_index) V_if_index = idx; - if (V_if_index >= V_if_indexlim) - if_grow(); *idxp = idx; return (0); } @@ -362,10 +367,12 @@ vnet_if_init(const void *unused __unused) TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); + IFNET_WLOCK(); if_grow(); /* create initial table */ + IFNET_WUNLOCK(); vnet_if_clone_init(); } -VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_init, +VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); /* ARGSUSED*/ @@ -376,7 +383,7 @@ if_init(void *dummy __unused) IFNET_LOCK_INIT(); if_clone_init(); } -SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_SECOND, if_init, NULL); +SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_FIRST, if_init, NULL); #ifdef VIMAGE @@ -384,8 +391,10 @@ static void vnet_if_uninit(const void *unused __unused) { - VNET_ASSERT(TAILQ_EMPTY(&V_ifnet)); - VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head)); + VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " + "not empty", __func__, __LINE__, &V_ifnet)); + VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " + "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } @@ -396,16 +405,25 @@ VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, static void if_grow(void) { + int oldlim; u_int n; struct ifindex_entry *e; - V_if_indexlim <<= 1; - n = V_if_indexlim * sizeof(*e); + IFNET_WLOCK_ASSERT(); + oldlim = V_if_indexlim; + IFNET_WUNLOCK(); + n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); + IFNET_WLOCK(); + if (V_if_indexlim != oldlim) { + free(e, M_IFNET); + return; + } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); free((caddr_t)V_ifindex_table, M_IFNET); } + V_if_indexlim <<= 1; V_ifindex_table = e; } @@ -472,8 +490,8 @@ if_alloc(u_char type) } /* - * Do the actual work of freeing a struct ifnet, associated index, 
and layer - * 2 common structure. This call is made when the last reference to an + * Do the actual work of freeing a struct ifnet, and layer 2 common + * structure. This call is made when the last reference to an * interface is released. */ static void @@ -483,13 +501,6 @@ if_free_internal(struct ifnet *ifp) KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); - IFNET_WLOCK(); - KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), - ("%s: freeing unallocated ifnet", ifp->if_xname)); - - ifindex_free_locked(ifp->if_index); - IFNET_WUNLOCK(); - if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); @@ -520,6 +531,14 @@ if_free_type(struct ifnet *ifp, u_char type) ifp->if_alloctype)); ifp->if_flags |= IFF_DYING; /* XXX: Locking */ + + IFNET_WLOCK(); + KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), + ("%s: freeing unallocated ifnet", ifp->if_xname)); + + ifindex_free_locked(ifp->if_index); + IFNET_WUNLOCK(); + if (!refcount_release(&ifp->if_refcount)) return; if_free_internal(ifp); @@ -818,10 +837,10 @@ if_purgemaddrs(struct ifnet *ifp) struct ifmultiaddr *ifma; struct ifmultiaddr *next; - IF_ADDR_LOCK(ifp); + IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 1); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); } /* @@ -1165,10 +1184,10 @@ if_addgroup(struct ifnet *ifp, const char *groupname) ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; - IF_ADDR_LOCK(ifp); + IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); @@ -1195,9 +1214,9 @@ if_delgroup(struct ifnet *ifp, const char *groupname) return (ENOENT); } - IF_ADDR_LOCK(ifp); + IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, 
&ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) @@ -1238,9 +1257,9 @@ if_delgroups(struct ifnet *ifp) strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); - IF_ADDR_LOCK(ifp); + IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) @@ -1282,33 +1301,33 @@ if_getgroup(struct ifgroupreq *data, struct ifnet *ifp) struct ifgroupreq *ifgr = data; if (ifgr->ifgr_len == 0) { - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; /* XXX: wire */ - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); return (error); } len -= sizeof(ifgrq); ifgp++; } - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); return (0); } @@ -1415,28 +1434,28 @@ void if_addr_rlock(struct ifnet *ifp) { - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); } void if_addr_runlock(struct ifnet *ifp) { - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); } void if_maddr_rlock(struct ifnet *ifp) { - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); } void if_maddr_runlock(struct ifnet *ifp) { - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); } /* @@ -1548,14 +1567,14 @@ ifa_ifwithaddr_internal(struct sockaddr *addr, int getref) IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) 
continue; if (sa_equal(addr, ifa->ifa_addr)) { if (getref) ifa_ref(ifa); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); goto done; } /* IP6 doesn't have broadcast */ @@ -1565,11 +1584,11 @@ ifa_ifwithaddr_internal(struct sockaddr *addr, int getref) sa_equal(ifa->ifa_broadaddr, addr)) { if (getref) ifa_ref(ifa); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); goto done; } } - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: @@ -1603,7 +1622,7 @@ ifa_ifwithbroadaddr(struct sockaddr *addr) IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; @@ -1612,11 +1631,11 @@ ifa_ifwithbroadaddr(struct sockaddr *addr) ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { ifa_ref(ifa); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); goto done; } } - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: @@ -1638,18 +1657,18 @@ ifa_ifwithdstaddr(struct sockaddr *addr) TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); goto done; } } - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: @@ -1683,12 +1702,12 @@ ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp) /* * Scan though each interface, looking for ones that have addresses * in this address family. Maintain a reference on ifa_maybe once - * we find one, as we release the IF_ADDR_LOCK() that kept it stable + * we find one, as we release the IF_ADDR_RLOCK() that kept it stable * when we move onto the next interface. 
*/ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { char *cp, *cp2, *cp3; @@ -1707,7 +1726,7 @@ next: continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); goto done; } } else { @@ -1718,7 +1737,7 @@ next: continue; if (ifa->ifa_claim_addr) { if ((*ifa->ifa_claim_addr)(ifa, addr)) { ifa_ref(ifa); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); goto done; } continue; @@ -1758,7 +1777,7 @@ next: continue; } } } - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); } ifa = ifa_maybe; ifa_maybe = NULL; @@ -1784,7 +1803,7 @@ ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp) if (af >= AF_MAX) return (NULL); - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; @@ -1816,7 +1835,7 @@ ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp) done: if (ifa != NULL) ifa_ref(ifa); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); return (ifa); } @@ -1936,14 +1955,10 @@ do_link_state_change(void *arg, int pending) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); - if (ifp->if_bridge) { - KASSERT(bstp_linkstate_p != NULL,("if_bridge bstp not loaded!")); - (*bstp_linkstate_p)(ifp, link_state); - } - if (ifp->if_lagg) { - KASSERT(lagg_linkstate_p != NULL,("if_lagg not loaded!")); + if (ifp->if_bridge) + (*bridge_linkstate_p)(ifp); + if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); - } if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, @@ -2180,6 +2195,20 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) free(odescrbuf, M_IFDESCR); break; + case SIOCGIFFIB: + ifr->ifr_fib = ifp->if_fib; + break; + + case SIOCSIFFIB: + error = priv_check(td, PRIV_NET_SETIFFIB); + if (error) + return (error); + if (ifr->ifr_fib >= rt_numfibs) 
+ return (EINVAL); + + ifp->if_fib = ifr->ifr_fib; + break; + case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) @@ -2379,9 +2408,9 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) * lose a race while we check if the membership * already exists. */ - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); ifma = if_findmulti(ifp, &ifr->ifr_addr); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); if (ifma != NULL) error = EADDRINUSE; else @@ -2492,10 +2521,13 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) int error; int oif_flags; + CURVNET_SET(so->so_vnet); switch (cmd) { case SIOCGIFCONF: case OSIOCGIFCONF: - return (ifconf(cmd, data)); + error = ifconf(cmd, data); + CURVNET_RESTORE(); + return (error); #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: @@ -2507,7 +2539,11 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); - return (ifconf(SIOCGIFCONF, (void *)&ifc)); + error = ifconf(SIOCGIFCONF, (void *)&ifc); + CURVNET_RESTORE(); + if (error == 0) + ifc32->ifc_len = ifc.ifc_len; + return (error); } #endif } @@ -2517,49 +2553,74 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); - if (error) - return (error); - return (if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid)); + if (error == 0) + error = if_vmove_reclaim(td, ifr->ifr_name, + ifr->ifr_jid); + CURVNET_RESTORE(); + return (error); #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); - if (error) - return (error); - return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), - cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL)); + if (error == 0) + error = if_clone_create(ifr->ifr_name, + sizeof(ifr->ifr_name), + cmd == SIOCIFCREATE2 ? 
ifr->ifr_data : NULL); + CURVNET_RESTORE(); + return (error); case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); - if (error) - return (error); - return if_clone_destroy(ifr->ifr_name); + if (error == 0) + error = if_clone_destroy(ifr->ifr_name); + CURVNET_RESTORE(); + return (error); case SIOCIFGCLONERS: - return (if_clone_list((struct if_clonereq *)data)); + error = if_clone_list((struct if_clonereq *)data); + CURVNET_RESTORE(); + return (error); case SIOCGIFGMEMB: - return (if_getgroupmembers((struct ifgroupreq *)data)); + error = if_getgroupmembers((struct ifgroupreq *)data); + CURVNET_RESTORE(); + return (error); } ifp = ifunit_ref(ifr->ifr_name); - if (ifp == NULL) + if (ifp == NULL) { + CURVNET_RESTORE(); return (ENXIO); + } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) { if_rele(ifp); + CURVNET_RESTORE(); return (error); } oif_flags = ifp->if_flags; if (so->so_proto == NULL) { if_rele(ifp); + CURVNET_RESTORE(); return (EOPNOTSUPP); } + + /* + * Pass the request on to the socket control method, and if the + * latter returns EOPNOTSUPP, directly to the interface. + * + * Make an exception for the legacy SIOCSIF* requests. Drivers + * trust SIOCSIFADDR et al to come from an already privileged + * layer, and do not perform any credentials checks or input + * validation. 
+ */ #ifndef COMPAT_43 error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); - if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL) + if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && + cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && + cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); #else { @@ -2603,7 +2664,9 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && - ifp->if_ioctl != NULL) + ifp->if_ioctl != NULL && + cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && + cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); switch (ocmd) { @@ -2627,6 +2690,7 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) #endif } if_rele(ifp); + CURVNET_RESTORE(); return (error); } @@ -2776,7 +2840,7 @@ again: } addrs = 0; - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; @@ -2808,7 +2872,7 @@ again: if (!sbuf_overflowed(sb)) valid_len = sbuf_len(sb); } - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); if (addrs == 0) { bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); sbuf_bcat(sb, &ifr, sizeof(ifr)); @@ -2966,13 +3030,13 @@ if_addmulti(struct ifnet *ifp, struct sockaddr *sa, * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ - IF_ADDR_LOCK(ifp); + IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; - IF_ADDR_UNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); return (0); } @@ -3038,7 +3102,7 @@ if_addmulti(struct ifnet *ifp, struct sockaddr *sa, * pointer is still valid. 
*/ rt_newmaddrmsg(RTM_NEWMADDR, ifma); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the @@ -3058,7 +3122,7 @@ free_llsa_out: free(llsa, M_IFMADDR); unlock_out: - IF_ADDR_UNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); return (error); } @@ -3092,12 +3156,12 @@ if_delmulti(struct ifnet *ifp, struct sockaddr *sa) if (ifp == NULL) return (ENOENT); - IF_ADDR_LOCK(ifp); + IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); @@ -3119,10 +3183,10 @@ if_delallmulti(struct ifnet *ifp) struct ifmultiaddr *ifma; struct ifmultiaddr *next; - IF_ADDR_LOCK(ifp); + IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); } /* @@ -3159,7 +3223,7 @@ if_delmulti_ifma(struct ifmultiaddr *ifma) * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) - IF_ADDR_LOCK(ifp); + IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, 0); @@ -3169,7 +3233,7 @@ if_delmulti_ifma(struct ifmultiaddr *ifma) * Release the address lock. * If the group was left: update the hardware hash filter. 
*/ - IF_ADDR_UNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } @@ -3191,7 +3255,7 @@ if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); - IF_ADDR_LOCK_ASSERT(ifp); + IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; @@ -3264,14 +3328,14 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) struct ifaddr *ifa; struct ifreq ifr; - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; if (ifa == NULL) { - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); return (EINVAL); } ifa_ref(ifa); - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { ifa_free(ifa); diff --git a/freebsd/sys/net/if.h b/freebsd/sys/net/if.h index 1a6423f6..25d43ac3 100644 --- a/freebsd/sys/net/if.h +++ b/freebsd/sys/net/if.h @@ -145,7 +145,7 @@ struct if_data { #define IFF_LINK2 0x4000 /* per link layer defined bit */ #define IFF_ALTPHYS IFF_LINK2 /* use alternate physical connection */ #define IFF_MULTICAST 0x8000 /* (i) supports multicast */ -/* 0x10000 */ +#define IFF_CANTCONFIG 0x10000 /* (i) unconfigurable using ioctl(2) */ #define IFF_PPROMISC 0x20000 /* (n) user-requested promisc mode */ #define IFF_MONITOR 0x40000 /* (n) user-requested monitor mode */ #define IFF_STATICARP 0x80000 /* (n) static ARP */ @@ -165,7 +165,7 @@ struct if_data { #define IFF_CANTCHANGE \ (IFF_BROADCAST|IFF_POINTOPOINT|IFF_DRV_RUNNING|IFF_DRV_OACTIVE|\ IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_SMART|IFF_PROMISC|\ - IFF_DYING) + IFF_DYING|IFF_CANTCONFIG) /* * Values for if_link_state. 
@@ -220,6 +220,7 @@ struct if_data { #define IFCAP_POLLING_NOCOUNT 0x20000 /* polling ticks cannot be fragmented */ #define IFCAP_VLAN_HWTSO 0x40000 /* can do IFCAP_TSO on VLANs */ #define IFCAP_LINKSTATE 0x80000 /* the runtime link state is dynamic */ +#define IFCAP_NETMAP 0x100000 /* netmap mode supported/enabled */ #define IFCAP_HWCSUM (IFCAP_RXCSUM | IFCAP_TXCSUM) #define IFCAP_TSO (IFCAP_TSO4 | IFCAP_TSO6) @@ -232,6 +233,7 @@ struct if_data { /* * Message format for use in obtaining information about interfaces * from getkerninfo and the routing socket + * For the new, extensible interface see struct if_msghdrl below. */ struct if_msghdr { u_short ifm_msglen; /* to skip over non-understood messages */ @@ -244,8 +246,34 @@ struct if_msghdr { }; /* + * The 'l' version shall be used by new interfaces, like NET_RT_IFLISTL. It is + * extensible after ifm_data_off or within ifm_data. Both the if_msghdr and + * if_data now have a member field detailing the struct length in addition to + * the routing message length. Macros are provided to find the start of + * ifm_data and the start of the socket address strucutres immediately following + * struct if_msghdrl given a pointer to struct if_msghdrl. + */ +#define IF_MSGHDRL_IFM_DATA(_l) \ + (struct if_data *)((char *)(_l) + (_l)->ifm_data_off) +#define IF_MSGHDRL_RTA(_l) \ + (void *)((uintptr_t)(_l) + (_l)->ifm_len) +struct if_msghdrl { + u_short ifm_msglen; /* to skip over non-understood messages */ + u_char ifm_version; /* future binary compatibility */ + u_char ifm_type; /* message type */ + int ifm_addrs; /* like rtm_addrs */ + int ifm_flags; /* value of if_flags */ + u_short ifm_index; /* index for associated ifp */ + u_short _ifm_spare1; /* spare space to grow if_index, see if_var.h */ + u_short ifm_len; /* length of if_msghdrl incl. 
if_data */ + u_short ifm_data_off; /* offset of if_data from beginning */ + struct if_data ifm_data;/* statistics and other data about if */ +}; + +/* * Message format for use in obtaining information about interface addresses * from getkerninfo and the routing socket + * For the new, extensible interface see struct ifa_msghdrl below. */ struct ifa_msghdr { u_short ifam_msglen; /* to skip over non-understood messages */ @@ -258,6 +286,33 @@ struct ifa_msghdr { }; /* + * The 'l' version shall be used by new interfaces, like NET_RT_IFLISTL. It is + * extensible after ifam_metric or within ifam_data. Both the ifa_msghdrl and + * if_data now have a member field detailing the struct length in addition to + * the routing message length. Macros are provided to find the start of + * ifm_data and the start of the socket address strucutres immediately following + * struct ifa_msghdrl given a pointer to struct ifa_msghdrl. + */ +#define IFA_MSGHDRL_IFAM_DATA(_l) \ + (struct if_data *)((char *)(_l) + (_l)->ifam_data_off) +#define IFA_MSGHDRL_RTA(_l) \ + (void *)((uintptr_t)(_l) + (_l)->ifam_len) +struct ifa_msghdrl { + u_short ifam_msglen; /* to skip over non-understood messages */ + u_char ifam_version; /* future binary compatibility */ + u_char ifam_type; /* message type */ + int ifam_addrs; /* like rtm_addrs */ + int ifam_flags; /* value of ifa_flags */ + u_short ifam_index; /* index for associated ifp */ + u_short _ifam_spare1; /* spare space to grow if_index, see if_var.h */ + u_short ifam_len; /* length of ifa_msghdrl incl. 
if_data */ + u_short ifam_data_off; /* offset of if_data from beginning */ + int ifam_metric; /* value of ifa_metric */ + struct if_data ifam_data;/* statistics and other data about if or + * address */ +}; + +/* * Message format for use in obtaining information about multicast addresses * from the routing socket */ @@ -315,6 +370,7 @@ struct ifreq { int ifru_media; caddr_t ifru_data; int ifru_cap[2]; + u_int ifru_fib; } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ @@ -331,6 +387,7 @@ struct ifreq { #define ifr_reqcap ifr_ifru.ifru_cap[0] /* requested capabilities */ #define ifr_curcap ifr_ifru.ifru_cap[1] /* current capabilities */ #define ifr_index ifr_ifru.ifru_index /* interface index */ +#define ifr_fib ifr_ifru.ifru_fib /* interface fib */ }; #define _SIZEOF_ADDR_IFREQ(ifr) \ diff --git a/freebsd/sys/net/if_arcsubr.c b/freebsd/sys/net/if_arcsubr.c index dc75b445..e9422068 100644 --- a/freebsd/sys/net/if_arcsubr.c +++ b/freebsd/sys/net/if_arcsubr.c @@ -610,6 +610,7 @@ arc_input(struct ifnet *ifp, struct mbuf *m) m_freem(m); return; } + M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); } diff --git a/freebsd/sys/net/if_atmsubr.c b/freebsd/sys/net/if_atmsubr.c index 747bc936..e3ce4ea0 100644 --- a/freebsd/sys/net/if_atmsubr.c +++ b/freebsd/sys/net/if_atmsubr.c @@ -334,6 +334,7 @@ atm_input(struct ifnet *ifp, struct atm_pseudohdr *ah, struct mbuf *m, return; } } + M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); } diff --git a/freebsd/sys/net/if_bridge.c b/freebsd/sys/net/if_bridge.c index 5c15a78f..52146381 100644 --- a/freebsd/sys/net/if_bridge.c +++ b/freebsd/sys/net/if_bridge.c @@ -87,6 +87,7 @@ __FBSDID("$FreeBSD$"); #include <sys/malloc.h> #include <sys/protosw.h> #include <sys/systm.h> +#include <sys/jail.h> #include <rtems/bsd/sys/time.h> #include <sys/socket.h> /* for net/if.h */ #include <sys/sockio.h> @@ -145,10 +146,10 @@ __FBSDID("$FreeBSD$"); #define 
BRIDGE_RTHASH_MASK (BRIDGE_RTHASH_SIZE - 1) /* - * Maximum number of addresses to cache. + * Default maximum number of addresses to cache. */ #ifndef BRIDGE_RTABLE_MAX -#define BRIDGE_RTABLE_MAX 100 +#define BRIDGE_RTABLE_MAX 2000 #endif /* @@ -334,6 +335,10 @@ static int bridge_ip6_checkbasic(struct mbuf **mp); #endif /* INET6 */ static int bridge_fragment(struct ifnet *, struct mbuf *, struct ether_header *, int, struct llc *); +static void bridge_linkstate(struct ifnet *ifp); +static void bridge_linkcheck(struct bridge_softc *sc); + +extern void (*bridge_linkstate_p)(struct ifnet *ifp); /* The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */ #define VLANTAGOF(_m) \ @@ -356,19 +361,26 @@ static int pfil_local_phys = 0; /* run pfil hooks on the physical interface for locally destined packets */ static int log_stp = 0; /* log STP state changes */ static int bridge_inherit_mac = 0; /* share MAC with first bridge member */ +TUNABLE_INT("net.link.bridge.pfil_onlyip", &pfil_onlyip); SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, CTLFLAG_RW, &pfil_onlyip, 0, "Only pass IP packets when pfil is enabled"); +TUNABLE_INT("net.link.bridge.ipfw_arp", &pfil_ipfw_arp); SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, CTLFLAG_RW, &pfil_ipfw_arp, 0, "Filter ARP packets through IPFW layer2"); +TUNABLE_INT("net.link.bridge.pfil_bridge", &pfil_bridge); SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, CTLFLAG_RW, &pfil_bridge, 0, "Packet filter on the bridge interface"); +TUNABLE_INT("net.link.bridge.pfil_member", &pfil_member); SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, CTLFLAG_RW, &pfil_member, 0, "Packet filter on the member interface"); +TUNABLE_INT("net.link.bridge.pfil_local_phys", &pfil_local_phys); SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, CTLFLAG_RW, &pfil_local_phys, 0, "Packet filter on the physical interface for locally destined packets"); +TUNABLE_INT("net.link.bridge.log_stp", &log_stp); SYSCTL_INT(_net_link_bridge, OID_AUTO, 
log_stp, CTLFLAG_RW, &log_stp, 0, "Log STP state changes"); +TUNABLE_INT("net.link.bridge.inherit_mac", &bridge_inherit_mac); SYSCTL_INT(_net_link_bridge, OID_AUTO, inherit_mac, CTLFLAG_RW, &bridge_inherit_mac, 0, "Inherit MAC address from the first bridge member"); @@ -490,6 +502,7 @@ bridge_modevent(module_t mod, int type, void *data) bridge_input_p = bridge_input; bridge_output_p = bridge_output; bridge_dn_p = bridge_dummynet; + bridge_linkstate_p = bridge_linkstate; bridge_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, bridge_ifdetach, NULL, EVENTHANDLER_PRI_ANY); @@ -502,6 +515,7 @@ bridge_modevent(module_t mod, int type, void *data) bridge_input_p = NULL; bridge_output_p = NULL; bridge_dn_p = NULL; + bridge_linkstate_p = NULL; mtx_destroy(&bridge_list_mtx); break; default: @@ -562,7 +576,8 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct bridge_softc *sc, *sc2; struct ifnet *bifp, *ifp; - int retry; + int fb, retry; + unsigned long hostid; sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); @@ -595,17 +610,30 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) IFQ_SET_READY(&ifp->if_snd); /* - * Generate a random ethernet address with a locally administered - * address. + * Generate an ethernet address with a locally administered address. * * Since we are using random ethernet addresses for the bridge, it is * possible that we might have address collisions, so make sure that * this hardware address isn't already in use on another bridge. + * The first try uses the hostid and falls back to arc4rand(). 
*/ + fb = 0; + getcredhostid(curthread->td_ucred, &hostid); for (retry = 1; retry != 0;) { - arc4rand(sc->sc_defaddr, ETHER_ADDR_LEN, 1); - sc->sc_defaddr[0] &= ~1; /* clear multicast bit */ - sc->sc_defaddr[0] |= 2; /* set the LAA bit */ + if (fb || hostid == 0) { + arc4rand(sc->sc_defaddr, ETHER_ADDR_LEN, 1); + sc->sc_defaddr[0] &= ~1;/* clear multicast bit */ + sc->sc_defaddr[0] |= 2; /* set the LAA bit */ + } else { + sc->sc_defaddr[0] = 0x2; + sc->sc_defaddr[1] = (hostid >> 24) & 0xff; + sc->sc_defaddr[2] = (hostid >> 16) & 0xff; + sc->sc_defaddr[3] = (hostid >> 8 ) & 0xff; + sc->sc_defaddr[4] = hostid & 0xff; + sc->sc_defaddr[5] = ifp->if_dunit & 0xff; + } + + fb = 1; retry = 0; mtx_lock(&bridge_list_mtx); LIST_FOREACH(sc2, &bridge_list, sc_list) { @@ -939,6 +967,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } + bridge_linkcheck(sc); bridge_mutecaps(sc); /* recalcuate now this interface is removed */ bridge_rtdelete(sc, ifs, IFBF_FLUSHALL); KASSERT(bif->bif_addrcnt == 0, @@ -1066,17 +1095,16 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) /* Set interface capabilities to the intersection set of all members */ bridge_mutecaps(sc); + bridge_linkcheck(sc); + /* Place the interface into promiscuous mode */ switch (ifs->if_type) { - case IFT_ETHER: - case IFT_L2VLAN: - /* - * Place the interface into promiscuous mode. - */ - BRIDGE_UNLOCK(sc); - error = ifpromisc(ifs, 1); - BRIDGE_LOCK(sc); - break; + case IFT_ETHER: + case IFT_L2VLAN: + BRIDGE_UNLOCK(sc); + error = ifpromisc(ifs, 1); + BRIDGE_LOCK(sc); + break; } if (error) bridge_delete_member(sc, bif, 0); @@ -2195,11 +2223,9 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) /* Tap off 802.1D packets; they do not get forwarded. 
*/ if (memcmp(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN) == 0) { - m = bstp_input(&bif->bif_stp, ifp, m); - if (m == NULL) { - BRIDGE_UNLOCK(sc); - return (NULL); - } + bstp_input(&bif->bif_stp, ifp, m); /* consumes mbuf */ + BRIDGE_UNLOCK(sc); + return (NULL); } if ((bif->bif_flags & IFBIF_STP) && @@ -3456,3 +3482,46 @@ out: m_freem(m); return (error); } + +static void +bridge_linkstate(struct ifnet *ifp) +{ + struct bridge_softc *sc = ifp->if_bridge; + struct bridge_iflist *bif; + + BRIDGE_LOCK(sc); + bif = bridge_lookup_member_if(sc, ifp); + if (bif == NULL) { + BRIDGE_UNLOCK(sc); + return; + } + bridge_linkcheck(sc); + BRIDGE_UNLOCK(sc); + + bstp_linkstate(&bif->bif_stp); +} + +static void +bridge_linkcheck(struct bridge_softc *sc) +{ + struct bridge_iflist *bif; + int new_link, hasls; + + BRIDGE_LOCK_ASSERT(sc); + new_link = LINK_STATE_DOWN; + hasls = 0; + /* Our link is considered up if at least one of our ports is active */ + LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if (bif->bif_ifp->if_capabilities & IFCAP_LINKSTATE) + hasls++; + if (bif->bif_ifp->if_link_state == LINK_STATE_UP) { + new_link = LINK_STATE_UP; + break; + } + } + if (!LIST_EMPTY(&sc->sc_iflist) && !hasls) { + /* If no interfaces support link-state then we default to up */ + new_link = LINK_STATE_UP; + } + if_link_state_change(sc->sc_ifp, new_link); +} diff --git a/freebsd/sys/net/if_epair.c b/freebsd/sys/net/if_epair.c index cd7a6c79..fafc0259 100644 --- a/freebsd/sys/net/if_epair.c +++ b/freebsd/sys/net/if_epair.c @@ -68,6 +68,7 @@ __FBSDID("$FreeBSD$"); #include <net/ethernet.h> #include <net/if.h> #include <net/if_clone.h> +#include <net/if_media.h> #include <net/if_var.h> #include <net/if_types.h> #include <net/netisr.h> @@ -94,6 +95,8 @@ static struct mbuf *epair_nh_m2cpuid(struct mbuf *, uintptr_t, u_int *); static void epair_nh_drainedcpu(u_int); static void epair_start_locked(struct ifnet *); +static int epair_media_change(struct ifnet *); +static void 
epair_media_status(struct ifnet *, struct ifmediareq *); static int epair_clone_match(struct if_clone *, const char *); static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t); @@ -129,6 +132,7 @@ SYSCTL_PROC(_net_link_epair, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, struct epair_softc { struct ifnet *ifp; /* This ifp. */ struct ifnet *oifp; /* other ifp of pair. */ + struct ifmedia media; /* Media config (fake). */ u_int refcount; /* # of mbufs in flight. */ u_int cpuid; /* CPU ID assigned upon creation. */ void (*if_qflush)(struct ifnet *); @@ -191,10 +195,7 @@ epair_dpcpu_init(void) struct eid_list *s; u_int cpuid; - for (cpuid = 0; cpuid <= mp_maxid; cpuid++) { - if (CPU_ABSENT(cpuid)) - continue; - + CPU_FOREACH(cpuid) { epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); /* Initialize per-cpu lock. */ @@ -219,10 +220,7 @@ epair_dpcpu_detach(void) struct epair_dpcpu *epair_dpcpu; u_int cpuid; - for (cpuid = 0; cpuid <= mp_maxid; cpuid++) { - if (CPU_ABSENT(cpuid)) - continue; - + CPU_FOREACH(cpuid) { epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); /* Destroy per-cpu lock. */ @@ -332,10 +330,7 @@ epair_remove_ifp_from_draining(struct ifnet *ifp) struct epair_ifp_drain *elm, *tvar; u_int cpuid; - for (cpuid = 0; cpuid <= mp_maxid; cpuid++) { - if (CPU_ABSENT(cpuid)) - continue; - + CPU_FOREACH(cpuid) { epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu); EPAIR_LOCK(epair_dpcpu); STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list, @@ -622,8 +617,25 @@ epair_qflush(struct ifnet *ifp) } static int +epair_media_change(struct ifnet *ifp __unused) +{ + + /* Do nothing. 
*/ + return (0); +} + +static void +epair_media_status(struct ifnet *ifp __unused, struct ifmediareq *imr) +{ + + imr->ifm_status = IFM_AVALID | IFM_ACTIVE; + imr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX; +} + +static int epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { + struct epair_softc *sc; struct ifreq *ifr; int error; @@ -635,6 +647,12 @@ epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = 0; break; + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + sc = ifp->if_softc; + error = ifmedia_ioctl(ifp, ifr, &sc->media, cmd); + break; + case SIOCSIFMTU: /* We basically allow all kinds of MTUs. */ ifp->if_mtu = ifr->ifr_mtu; @@ -794,6 +812,8 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) ifp->if_dname = ifc->ifc_name; ifp->if_dunit = unit; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_capabilities = IFCAP_VLAN_MTU; + ifp->if_capenable = IFCAP_VLAN_MTU; ifp->if_start = epair_start; ifp->if_ioctl = epair_ioctl; ifp->if_init = epair_init; @@ -818,6 +838,8 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) ifp->if_dname = ifc->ifc_name; ifp->if_dunit = unit; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_capabilities = IFCAP_VLAN_MTU; + ifp->if_capenable = IFCAP_VLAN_MTU; ifp->if_start = epair_start; ifp->if_ioctl = epair_ioctl; ifp->if_init = epair_init; @@ -840,6 +862,14 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) strlcpy(name, sca->ifp->if_xname, len); DPRINTF("name='%s/%db' created sca=%p scb=%p\n", name, unit, sca, scb); + /* Initialise pseudo media types. 
*/ + ifmedia_init(&sca->media, 0, epair_media_change, epair_media_status); + ifmedia_add(&sca->media, IFM_ETHER | IFM_10G_T, 0, NULL); + ifmedia_set(&sca->media, IFM_ETHER | IFM_10G_T); + ifmedia_init(&scb->media, 0, epair_media_change, epair_media_status); + ifmedia_add(&scb->media, IFM_ETHER | IFM_10G_T, 0, NULL); + ifmedia_set(&scb->media, IFM_ETHER | IFM_10G_T); + /* Tell the world, that we are ready to rock. */ sca->ifp->if_drv_flags |= IFF_DRV_RUNNING; scb->ifp->if_drv_flags |= IFF_DRV_RUNNING; @@ -876,37 +906,41 @@ epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) if_link_state_change(oifp, LINK_STATE_DOWN); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; oifp->if_drv_flags &= ~IFF_DRV_RUNNING; + + /* + * Get rid of our second half. As the other of the two + * interfaces may reside in a different vnet, we need to + * switch before freeing them. + */ + CURVNET_SET_QUIET(oifp->if_vnet); ether_ifdetach(oifp); - ether_ifdetach(ifp); /* * Wait for all packets to be dispatched to if_input. - * The numbers can only go down as the interfaces are + * The numbers can only go down as the interface is * detached so there is no need to use atomics. */ - DPRINTF("sca refcnt=%u scb refcnt=%u\n", sca->refcount, scb->refcount); - EPAIR_REFCOUNT_ASSERT(sca->refcount == 1 && scb->refcount == 1, - ("%s: ifp=%p sca->refcount!=1: %d || ifp=%p scb->refcount!=1: %d", - __func__, ifp, sca->refcount, oifp, scb->refcount)); - - /* - * Get rid of our second half. - */ + DPRINTF("scb refcnt=%u\n", scb->refcount); + EPAIR_REFCOUNT_ASSERT(scb->refcount == 1, + ("%s: ifp=%p scb->refcount!=1: %d", __func__, oifp, scb->refcount)); oifp->if_softc = NULL; error = if_clone_destroyif(ifc, oifp); if (error) panic("%s: if_clone_destroyif() for our 2nd iface failed: %d", __func__, error); + if_free(oifp); + ifmedia_removeall(&scb->media); + free(scb, M_EPAIR); + CURVNET_RESTORE(); + ether_ifdetach(ifp); /* - * Finish cleaning up. Free them and release the unit. 
- * As the other of the two interfaces my reside in a different vnet, - * we need to switch before freeing them. + * Wait for all packets to be dispatched to if_input. */ - CURVNET_SET_QUIET(oifp->if_vnet); - if_free(oifp); - CURVNET_RESTORE(); + DPRINTF("sca refcnt=%u\n", sca->refcount); + EPAIR_REFCOUNT_ASSERT(sca->refcount == 1, + ("%s: ifp=%p sca->refcount!=1: %d", __func__, ifp, sca->refcount)); if_free(ifp); - free(scb, M_EPAIR); + ifmedia_removeall(&sca->media); free(sca, M_EPAIR); ifc_free_unit(ifc, unit); diff --git a/freebsd/sys/net/if_ethersubr.c b/freebsd/sys/net/if_ethersubr.c index 02a5d002..b7c48731 100644 --- a/freebsd/sys/net/if_ethersubr.c +++ b/freebsd/sys/net/if_ethersubr.c @@ -662,8 +662,10 @@ ether_input(struct ifnet *ifp, struct mbuf *m) m = (*lagg_input_p)(ifp, m); if (m != NULL) ifp = m->m_pkthdr.rcvif; - else + else { + CURVNET_RESTORE(); return; + } } /* @@ -682,6 +684,7 @@ ether_input(struct ifnet *ifp, struct mbuf *m) #endif ifp->if_ierrors++; m_freem(m); + CURVNET_RESTORE(); return; } @@ -694,6 +697,8 @@ ether_input(struct ifnet *ifp, struct mbuf *m) m_adj(m, ETHER_VLAN_ENCAP_LEN); } + M_SETFIB(m, ifp->if_fib); + /* Allow ng_ether(4) to claim this frame. 
*/ if (IFP2AC(ifp)->ac_netgraph != NULL) { KASSERT(ng_ether_input_p != NULL, diff --git a/freebsd/sys/net/if_faith.c b/freebsd/sys/net/if_faith.c index d99e16ea..58de362a 100644 --- a/freebsd/sys/net/if_faith.c +++ b/freebsd/sys/net/if_faith.c @@ -340,7 +340,7 @@ faithprefix(in6) sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = *in6; - rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL); + rt = in6_rtalloc1((struct sockaddr *)&sin6, 0, 0UL, RT_DEFAULT_FIB); if (rt && rt->rt_ifp && rt->rt_ifp->if_type == IFT_FAITH && (rt->rt_ifp->if_flags & IFF_UP) != 0) ret = 1; diff --git a/freebsd/sys/net/if_fddisubr.c b/freebsd/sys/net/if_fddisubr.c index ba4db83f..154fe2fc 100644 --- a/freebsd/sys/net/if_fddisubr.c +++ b/freebsd/sys/net/if_fddisubr.c @@ -552,6 +552,7 @@ fddi_input(ifp, m) ifp->if_noproto++; goto dropanyway; } + M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); return; diff --git a/freebsd/sys/net/if_fwsubr.c b/freebsd/sys/net/if_fwsubr.c index a9931419..df90d48d 100644 --- a/freebsd/sys/net/if_fwsubr.c +++ b/freebsd/sys/net/if_fwsubr.c @@ -629,6 +629,7 @@ firewire_input(struct ifnet *ifp, struct mbuf *m, uint16_t src) return; } + M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); } diff --git a/freebsd/sys/net/if_gif.c b/freebsd/sys/net/if_gif.c index d9144419..1a8e4c8d 100644 --- a/freebsd/sys/net/if_gif.c +++ b/freebsd/sys/net/if_gif.c @@ -37,6 +37,7 @@ #include <rtems/bsd/sys/param.h> #include <sys/systm.h> +#include <sys/jail.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/mbuf.h> @@ -493,7 +494,7 @@ gif_input(m, af, ifp) struct ifnet *ifp; { int isr, n; - struct gif_softc *sc = ifp->if_softc; + struct gif_softc *sc; struct etherip_header *eip; struct ether_header *eh; struct ifnet *oldifp; @@ -503,7 +504,7 @@ gif_input(m, af, ifp) m_freem(m); return; } - + sc = ifp->if_softc; m->m_pkthdr.rcvif = ifp; #ifdef MAC @@ -614,6 +615,7 @@ gif_input(m, af, ifp) ifp->if_ipackets++; ifp->if_ibytes += 
m->m_pkthdr.len; + M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); } @@ -823,6 +825,12 @@ gif_ioctl(ifp, cmd, data) } if (src->sa_len > size) return EINVAL; + error = prison_if(curthread->td_ucred, src); + if (error != 0) + return (error); + error = prison_if(curthread->td_ucred, dst); + if (error != 0) + return (error); bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); #ifdef INET6 if (dst->sa_family == AF_INET6) { diff --git a/freebsd/sys/net/if_gre.c b/freebsd/sys/net/if_gre.c index a75e52a4..21f39eb2 100644 --- a/freebsd/sys/net/if_gre.c +++ b/freebsd/sys/net/if_gre.c @@ -55,7 +55,9 @@ #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/sys/param.h> +#include <sys/jail.h> #include <sys/kernel.h> +#include <sys/libkern.h> #include <sys/malloc.h> #include <sys/module.h> #include <sys/mbuf.h> @@ -99,6 +101,14 @@ #define GRENAME "gre" +#define MTAG_COOKIE_GRE 1307983903 +#define MTAG_GRE_NESTING 1 +struct mtag_gre_nesting { + uint16_t count; + uint16_t max; + struct ifnet *ifp[]; +}; + /* * gre_mtx protects all global variables in if_gre.c. * XXX: gre_softc data not protected yet. @@ -204,7 +214,6 @@ gre_clone_create(ifc, unit, params) sc->g_proto = IPPROTO_GRE; GRE2IFP(sc)->if_flags |= IFF_LINK0; sc->encap = NULL; - sc->called = 0; #ifndef __rtems__ sc->gre_fibnum = curthread->td_proc->p_fibnum; #else /* __rtems__ */ @@ -252,23 +261,77 @@ gre_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct gre_softc *sc = ifp->if_softc; struct greip *gh; struct ip *ip; + struct m_tag *mtag; + struct mtag_gre_nesting *gt; + size_t len; u_short gre_ip_id = 0; uint8_t gre_ip_tos = 0; u_int16_t etype = 0; struct mobile_h mob_h; u_int32_t af; - int extra = 0; + int extra = 0, max; /* - * gre may cause infinite recursion calls when misconfigured. - * We'll prevent this by introducing upper limit. + * gre may cause infinite recursion calls when misconfigured. High + * nesting level may cause stack exhaustion. 
We'll prevent this by + * detecting loops and by introducing upper limit. */ - if (++(sc->called) > max_gre_nesting) { - printf("%s: gre_output: recursively called too many " - "times(%d)\n", if_name(GRE2IFP(sc)), sc->called); - m_freem(m); - error = EIO; /* is there better errno? */ - goto end; + mtag = m_tag_locate(m, MTAG_COOKIE_GRE, MTAG_GRE_NESTING, NULL); + if (mtag != NULL) { + struct ifnet **ifp2; + + gt = (struct mtag_gre_nesting *)(mtag + 1); + gt->count++; + if (gt->count > min(gt->max,max_gre_nesting)) { + printf("%s: hit maximum recursion limit %u on %s\n", + __func__, gt->count - 1, ifp->if_xname); + m_freem(m); + error = EIO; /* is there better errno? */ + goto end; + } + + ifp2 = gt->ifp; + for (max = gt->count - 1; max > 0; max--) { + if (*ifp2 == ifp) + break; + ifp2++; + } + if (*ifp2 == ifp) { + printf("%s: detected loop with nexting %u on %s\n", + __func__, gt->count-1, ifp->if_xname); + m_freem(m); + error = EIO; /* is there better errno? */ + goto end; + } + *ifp2 = ifp; + + } else { + /* + * Given that people should NOT increase max_gre_nesting beyond + * their real needs, we allocate once per packet rather than + * allocating an mtag once per passing through gre. + * + * Note: the sysctl does not actually check for saneness, so we + * limit the maximum numbers of possible recursions here. + */ + max = imin(max_gre_nesting, 256); + /* If someone sets the sysctl <= 0, we want at least 1. 
*/ + max = imax(max, 1); + len = sizeof(struct mtag_gre_nesting) + + max * sizeof(struct ifnet *); + mtag = m_tag_alloc(MTAG_COOKIE_GRE, MTAG_GRE_NESTING, len, + M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + error = ENOMEM; + goto end; + } + gt = (struct mtag_gre_nesting *)(mtag + 1); + bzero(gt, len); + gt->count = 1; + gt->max = max; + *gt->ifp = ifp; + m_tag_prepend(m, mtag); } if (!((ifp->if_flags & IFF_UP) && @@ -456,7 +519,6 @@ gre_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, error = ip_output(m, NULL, &sc->route, IP_FORWARDING, (struct ip_moptions *)NULL, (struct inpcb *)NULL); end: - sc->called = 0; if (error) ifp->if_oerrors++; return (error); @@ -649,6 +711,9 @@ gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_src.s_addr; sa = sintosa(&si); + error = prison_if(curthread->td_ucred, sa); + if (error != 0) + break; ifr->ifr_addr = *sa; break; case GREGADDRD: @@ -657,6 +722,9 @@ gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_dst.s_addr; sa = sintosa(&si); + error = prison_if(curthread->td_ucred, sa); + if (error != 0) + break; ifr->ifr_addr = *sa; break; case SIOCSIFPHYADDR: @@ -720,8 +788,14 @@ gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) si.sin_family = AF_INET; si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_src.s_addr; + error = prison_if(curthread->td_ucred, (struct sockaddr *)&si); + if (error != 0) + break; memcpy(&lifr->addr, &si, sizeof(si)); si.sin_addr.s_addr = sc->g_dst.s_addr; + error = prison_if(curthread->td_ucred, (struct sockaddr *)&si); + if (error != 0) + break; memcpy(&lifr->dstaddr, &si, sizeof(si)); break; case SIOCGIFPSRCADDR: @@ -736,6 +810,9 @@ gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) si.sin_family = AF_INET; si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_src.s_addr; + error = 
prison_if(curthread->td_ucred, (struct sockaddr *)&si); + if (error != 0) + break; bcopy(&si, &ifr->ifr_addr, sizeof(ifr->ifr_addr)); break; case SIOCGIFPDSTADDR: @@ -750,6 +827,9 @@ gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) si.sin_family = AF_INET; si.sin_len = sizeof(struct sockaddr_in); si.sin_addr.s_addr = sc->g_dst.s_addr; + error = prison_if(curthread->td_ucred, (struct sockaddr *)&si); + if (error != 0) + break; bcopy(&si, &ifr->ifr_addr, sizeof(ifr->ifr_addr)); break; case GRESKEY: diff --git a/freebsd/sys/net/if_gre.h b/freebsd/sys/net/if_gre.h index 186d4cc6..13b882c8 100644 --- a/freebsd/sys/net/if_gre.h +++ b/freebsd/sys/net/if_gre.h @@ -68,8 +68,6 @@ struct gre_softc { const struct encaptab *encap; /* encapsulation cookie */ - int called; /* infinite recursion preventer */ - uint32_t key; /* key included in outgoing GRE packets */ /* zero means none */ diff --git a/freebsd/sys/net/if_iso88025subr.c b/freebsd/sys/net/if_iso88025subr.c index 6a39956e..b52853a2 100644 --- a/freebsd/sys/net/if_iso88025subr.c +++ b/freebsd/sys/net/if_iso88025subr.c @@ -682,6 +682,7 @@ iso88025_input(ifp, m) break; } + M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); return; diff --git a/freebsd/sys/net/if_lagg.c b/freebsd/sys/net/if_lagg.c index a1c90cdf..5d5064a4 100644 --- a/freebsd/sys/net/if_lagg.c +++ b/freebsd/sys/net/if_lagg.c @@ -169,6 +169,11 @@ static int lagg_failover_rx_all = 0; /* Allow input on any failover links */ SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW, &lagg_failover_rx_all, 0, "Accept input from any interface in a failover lagg"); +static int def_use_flowid = 1; /* Default value for using M_FLOWID */ +TUNABLE_INT("net.link.lagg.default_use_flowid", &def_use_flowid); +SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RW, + &def_use_flowid, 0, + "Default setting for using flow id for load sharing"); static int lagg_modevent(module_t mod, int type, void *data) @@ -206,6 +211,7 @@ static moduledata_t 
lagg_mod = { }; DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(if_lagg, 1); #if __FreeBSD_version >= 800000 /* @@ -258,6 +264,8 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) struct ifnet *ifp; int i, error = 0; static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ + struct sysctl_oid *oid; + char num[14]; /* sufficient for 32 bits */ sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); @@ -266,6 +274,17 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) return (ENOSPC); } + sysctl_ctx_init(&sc->ctx); + snprintf(num, sizeof(num), "%u", unit); + sc->use_flowid = def_use_flowid; + oid = SYSCTL_ADD_NODE(&sc->ctx, &SYSCTL_NODE_CHILDREN(_net_link, lagg), + OID_AUTO, num, CTLFLAG_RD, NULL, ""); + SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, + "use_flowid", CTLTYPE_INT|CTLFLAG_RW, &sc->use_flowid, sc->use_flowid, + "Use flow id for load sharing"); + /* Hash all layers by default */ + sc->sc_flags = LAGG_F_HASHL2|LAGG_F_HASHL3|LAGG_F_HASHL4; + sc->sc_proto = LAGG_PROTO_NONE; for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) { if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) { @@ -345,6 +364,7 @@ lagg_clone_destroy(struct ifnet *ifp) LAGG_WUNLOCK(sc); + sysctl_ctx_free(&sc->ctx); ifmedia_removeall(&sc->sc_media); ether_ifdetach(ifp); if_free_type(ifp, IFT_ETHER); @@ -738,28 +758,18 @@ fallback: return (EINVAL); } +/* + * For direct output to child ports. + */ static int lagg_port_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct route *ro) { struct lagg_port *lp = ifp->if_lagg; - struct ether_header *eh; - short type = 0; switch (dst->sa_family) { case pseudo_AF_HDRCMPLT: case AF_UNSPEC: - eh = (struct ether_header *)dst->sa_data; - type = eh->ether_type; - break; - } - - /* - * Only allow ethernet types required to initiate or maintain the link, - * aggregated frames take a different path. 
- */ - switch (ntohs(type)) { - case ETHERTYPE_PAE: /* EAPOL PAE/802.1x */ return ((*lp->lp_output)(ifp, m, dst, ro)); } @@ -776,6 +786,9 @@ lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp) if ((lp = ifp->if_lagg) == NULL) return; + /* If the ifnet is just being renamed, don't do anything. */ + if (ifp->if_flags & IFF_RENAMING) + return; sc = lp->lp_softc; @@ -871,6 +884,7 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_reqall *ra = (struct lagg_reqall *)data; struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; + struct lagg_reqflags *rf = (struct lagg_reqflags *)data; struct ifreq *ifr = (struct ifreq *)data; struct lagg_port *lp; struct ifnet *tpif; @@ -923,11 +937,11 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = EPROTONOSUPPORT; break; } + LAGG_WLOCK(sc); if (sc->sc_proto != LAGG_PROTO_NONE) { - LAGG_WLOCK(sc); - error = sc->sc_detach(sc); - /* Reset protocol and pointers */ + /* Reset protocol first in case detach unlocks */ sc->sc_proto = LAGG_PROTO_NONE; + error = sc->sc_detach(sc); sc->sc_detach = NULL; sc->sc_start = NULL; sc->sc_input = NULL; @@ -939,10 +953,14 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) sc->sc_lladdr = NULL; sc->sc_req = NULL; sc->sc_portreq = NULL; - LAGG_WUNLOCK(sc); + } else if (sc->sc_input != NULL) { + /* Still detaching */ + error = EBUSY; } - if (error != 0) + if (error != 0) { + LAGG_WUNLOCK(sc); break; + } for (int i = 0; i < (sizeof(lagg_protos) / sizeof(lagg_protos[0])); i++) { if (lagg_protos[i].ti_proto == ra->ra_proto) { @@ -950,7 +968,6 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) printf("%s: using proto %u\n", sc->sc_ifname, lagg_protos[i].ti_proto); - LAGG_WLOCK(sc); sc->sc_proto = lagg_protos[i].ti_proto; if (sc->sc_proto != LAGG_PROTO_NONE) error = lagg_protos[i].ti_attach(sc); @@ -958,8 +975,25 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) return (error); } 
} + LAGG_WUNLOCK(sc); error = EPROTONOSUPPORT; break; + case SIOCGLAGGFLAGS: + rf->rf_flags = sc->sc_flags; + break; + case SIOCSLAGGHASH: + error = priv_check(td, PRIV_NET_LAGG); + if (error) + break; + if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) { + error = EINVAL; + break; + } + LAGG_WLOCK(sc); + sc->sc_flags &= ~LAGG_F_HASHMASK; + sc->sc_flags |= rf->rf_flags & LAGG_F_HASHMASK; + LAGG_WUNLOCK(sc); + break; case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || (tpif = ifunit(rp->rp_portname)) == NULL) { @@ -1215,14 +1249,15 @@ lagg_input(struct ifnet *ifp, struct mbuf *m) struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; + LAGG_RLOCK(sc); if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (lp->lp_flags & LAGG_PORT_DISABLED) || sc->sc_proto == LAGG_PROTO_NONE) { + LAGG_RUNLOCK(sc); m_freem(m); return (NULL); } - LAGG_RLOCK(sc); ETHER_BPF_MTAP(scifp, m); m = (*sc->sc_input)(sc, lp, m); @@ -1388,42 +1423,55 @@ lagg_gethdr(struct mbuf *m, u_int off, u_int len, void *buf) } uint32_t -lagg_hashmbuf(struct mbuf *m, uint32_t key) +lagg_hashmbuf(struct lagg_softc *sc, struct mbuf *m, uint32_t key) { uint16_t etype; - uint32_t p = 0; + uint32_t p = key; int off; struct ether_header *eh; - struct ether_vlan_header vlanbuf; const struct ether_vlan_header *vlan; #ifdef INET const struct ip *ip; - struct ip ipbuf; + const uint32_t *ports; + int iphlen; #endif #ifdef INET6 const struct ip6_hdr *ip6; - struct ip6_hdr ip6buf; uint32_t flow; #endif + union { +#ifdef INET + struct ip ip; +#endif +#ifdef INET6 + struct ip6_hdr ip6; +#endif + struct ether_vlan_header vlan; + uint32_t port; + } buf; + off = sizeof(*eh); if (m->m_len < off) goto out; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); - p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, key); - p = hash32_buf(&eh->ether_dhost, ETHER_ADDR_LEN, p); + if (sc->sc_flags & LAGG_F_HASHL2) { + p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, p); + p = hash32_buf(&eh->ether_dhost, 
ETHER_ADDR_LEN, p); + } /* Special handling for encapsulating VLAN frames */ - if (m->m_flags & M_VLANTAG) { + if ((m->m_flags & M_VLANTAG) && (sc->sc_flags & LAGG_F_HASHL2)) { p = hash32_buf(&m->m_pkthdr.ether_vtag, sizeof(m->m_pkthdr.ether_vtag), p); } else if (etype == ETHERTYPE_VLAN) { - vlan = lagg_gethdr(m, off, sizeof(*vlan), &vlanbuf); + vlan = lagg_gethdr(m, off, sizeof(*vlan), &buf); if (vlan == NULL) goto out; - p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p); + if (sc->sc_flags & LAGG_F_HASHL2) + p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p); etype = ntohs(vlan->evl_proto); off += sizeof(*vlan) - sizeof(*eh); } @@ -1431,17 +1479,37 @@ lagg_hashmbuf(struct mbuf *m, uint32_t key) switch (etype) { #ifdef INET case ETHERTYPE_IP: - ip = lagg_gethdr(m, off, sizeof(*ip), &ipbuf); + ip = lagg_gethdr(m, off, sizeof(*ip), &buf); if (ip == NULL) goto out; - p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p); - p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p); + if (sc->sc_flags & LAGG_F_HASHL3) { + p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p); + p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p); + } + if (!(sc->sc_flags & LAGG_F_HASHL4)) + break; + switch (ip->ip_p) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + iphlen = ip->ip_hl << 2; + if (iphlen < sizeof(*ip)) + break; + off += iphlen; + ports = lagg_gethdr(m, off, sizeof(*ports), &buf); + if (ports == NULL) + break; + p = hash32_buf(ports, sizeof(*ports), p); + break; + } break; #endif #ifdef INET6 case ETHERTYPE_IPV6: - ip6 = lagg_gethdr(m, off, sizeof(*ip6), &ip6buf); + if (!(sc->sc_flags & LAGG_F_HASHL3)) + break; + ip6 = lagg_gethdr(m, off, sizeof(*ip6), &buf); if (ip6 == NULL) goto out; @@ -1668,10 +1736,10 @@ lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) struct lagg_port *lp = NULL; uint32_t p = 0; - if (m->m_flags & M_FLOWID) + if (sc->use_flowid && (m->m_flags & M_FLOWID)) p = m->m_pkthdr.flowid; else - p = lagg_hashmbuf(m, 
lb->lb_key); + p = lagg_hashmbuf(sc, m, lb->lb_key); p %= sc->sc_count; lp = lb->lb_ports[p]; @@ -1788,7 +1856,7 @@ lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) etype = ntohs(eh->ether_type); /* Tap off LACP control messages */ - if (etype == ETHERTYPE_SLOW) { + if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) { m = lacp_input(lp, m); if (m == NULL) return (NULL); diff --git a/freebsd/sys/net/if_lagg.h b/freebsd/sys/net/if_lagg.h index 0034c617..27ab46f2 100644 --- a/freebsd/sys/net/if_lagg.h +++ b/freebsd/sys/net/if_lagg.h @@ -21,6 +21,8 @@ #ifndef _NET_LAGG_H #define _NET_LAGG_H +#include <sys/sysctl.h> + /* * Global definitions */ @@ -29,6 +31,12 @@ #define LAGG_MAX_NAMESIZE 32 /* name of a protocol */ #define LAGG_MAX_STACKING 4 /* maximum number of stacked laggs */ +/* Lagg flags */ +#define LAGG_F_HASHL2 0x00000001 /* hash layer 2 */ +#define LAGG_F_HASHL3 0x00000002 /* hash layer 3 */ +#define LAGG_F_HASHL4 0x00000004 /* hash layer 4 */ +#define LAGG_F_HASHMASK 0x00000007 + /* Port flags */ #define LAGG_PORT_SLAVE 0x00000000 /* normal enslaved port */ #define LAGG_PORT_MASTER 0x00000001 /* primary port */ @@ -120,6 +128,14 @@ struct lagg_reqall { #define SIOCGLAGG _IOWR('i', 143, struct lagg_reqall) #define SIOCSLAGG _IOW('i', 144, struct lagg_reqall) +struct lagg_reqflags { + char rf_ifname[IFNAMSIZ]; /* name of the lagg */ + uint32_t rf_flags; /* lagg protocol */ +}; + +#define SIOCGLAGGFLAGS _IOWR('i', 145, struct lagg_reqflags) +#define SIOCSLAGGHASH _IOW('i', 146, struct lagg_reqflags) + #ifdef _KERNEL /* * Internal kernel part @@ -177,6 +193,7 @@ struct lagg_softc { struct ifmedia sc_media; /* media config */ caddr_t sc_psc; /* protocol data */ uint32_t sc_seq; /* sequence counter */ + uint32_t sc_flags; SLIST_HEAD(__tplhd, lagg_port) sc_ports; /* list of interfaces */ SLIST_ENTRY(lagg_softc) sc_entries; @@ -202,6 +219,8 @@ struct lagg_softc { eventhandler_tag vlan_attach; eventhandler_tag vlan_detach; 
#endif + struct sysctl_ctx_list ctx; /* sysctl variables */ + int use_flowid; /* use M_FLOWID */ }; struct lagg_port { @@ -240,7 +259,7 @@ extern struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); extern void (*lagg_linkstate_p)(struct ifnet *, int ); int lagg_enqueue(struct ifnet *, struct mbuf *); -uint32_t lagg_hashmbuf(struct mbuf *, uint32_t); +uint32_t lagg_hashmbuf(struct lagg_softc *, struct mbuf *, uint32_t); #endif /* _KERNEL */ diff --git a/freebsd/sys/net/if_llatbl.c b/freebsd/sys/net/if_llatbl.c index 3ffcc21a..80888559 100644 --- a/freebsd/sys/net/if_llatbl.c +++ b/freebsd/sys/net/if_llatbl.c @@ -102,18 +102,35 @@ done: * This function is called by the timer functions * such as arptimer() and nd6_llinfo_timer(), and * the caller does the locking. + * + * Returns the number of held packets, if any, that were dropped. */ -void +size_t llentry_free(struct llentry *lle) { - + size_t pkts_dropped; + struct mbuf *next; + + pkts_dropped = 0; LLE_WLOCK_ASSERT(lle); LIST_REMOVE(lle, lle_next); - if (lle->la_hold != NULL) + while ((lle->la_numheld > 0) && (lle->la_hold != NULL)) { + next = lle->la_hold->m_nextpkt; m_freem(lle->la_hold); + lle->la_hold = next; + lle->la_numheld--; + pkts_dropped++; + } + + KASSERT(lle->la_numheld == 0, + ("%s: la_numheld %d > 0, pkts_droped %zd", __func__, + lle->la_numheld, pkts_dropped)); + lle->la_flags &= ~LLE_VALID; LLE_FREE_LOCKED(lle); + + return (pkts_dropped); } /* @@ -214,7 +231,8 @@ lltable_drain(int af) #endif void -lltable_prefix_free(int af, struct sockaddr *prefix, struct sockaddr *mask) +lltable_prefix_free(int af, struct sockaddr *prefix, struct sockaddr *mask, + u_int flags) { struct lltable *llt; @@ -223,7 +241,7 @@ lltable_prefix_free(int af, struct sockaddr *prefix, struct sockaddr *mask) if (llt->llt_af != af) continue; - llt->llt_prefix_free(llt, prefix, mask); + llt->llt_prefix_free(llt, prefix, mask, flags); } LLTABLE_RUNLOCK(); } @@ -414,6 +432,7 @@ llatbl_lle_show(struct llentry_sa *la) 
db_printf(" lle_tbl=%p\n", lle->lle_tbl); db_printf(" lle_head=%p\n", lle->lle_head); db_printf(" la_hold=%p\n", lle->la_hold); + db_printf(" la_numheld=%d\n", lle->la_numheld); db_printf(" la_expire=%ju\n", (uintmax_t)lle->la_expire); db_printf(" la_flags=0x%04x\n", lle->la_flags); db_printf(" la_asked=%u\n", lle->la_asked); diff --git a/freebsd/sys/net/if_llatbl.h b/freebsd/sys/net/if_llatbl.h index a4d02ab0..8b15e5c8 100644 --- a/freebsd/sys/net/if_llatbl.h +++ b/freebsd/sys/net/if_llatbl.h @@ -58,6 +58,7 @@ struct llentry { struct lltable *lle_tbl; struct llentries *lle_head; struct mbuf *la_hold; + int la_numheld; /* # of packets currently held */ time_t la_expire; uint16_t la_flags; uint16_t la_asked; @@ -115,19 +116,12 @@ struct llentry { LLE_WUNLOCK(lle); \ } \ /* guard against invalid refs */ \ - lle = 0; \ + lle = NULL; \ } while (0) #define LLE_FREE(lle) do { \ LLE_WLOCK(lle); \ - if ((lle)->lle_refcnt <= 1) \ - (lle)->lle_tbl->llt_free((lle)->lle_tbl, (lle));\ - else { \ - (lle)->lle_refcnt--; \ - LLE_WUNLOCK(lle); \ - } \ - /* guard against invalid refs */ \ - lle = NULL; \ + LLE_FREE_LOCKED(lle); \ } while (0) @@ -152,15 +146,13 @@ struct lltable { int llt_af; struct ifnet *llt_ifp; - struct llentry * (*llt_new)(const struct sockaddr *, u_int); void (*llt_free)(struct lltable *, struct llentry *); void (*llt_prefix_free)(struct lltable *, const struct sockaddr *prefix, - const struct sockaddr *mask); + const struct sockaddr *mask, + u_int flags); struct llentry * (*llt_lookup)(struct lltable *, u_int flags, const struct sockaddr *l3addr); - int (*llt_rtcheck)(struct ifnet *, u_int flags, - const struct sockaddr *); int (*llt_dump)(struct lltable *, struct sysctl_req *); }; @@ -185,13 +177,13 @@ MALLOC_DECLARE(M_LLTABLE); struct lltable *lltable_init(struct ifnet *, int); void lltable_free(struct lltable *); void lltable_prefix_free(int, struct sockaddr *, - struct sockaddr *); + struct sockaddr *, u_int); #if 0 void lltable_drain(int); #endif int 
lltable_sysctl_dumparp(int, struct sysctl_req *); -void llentry_free(struct llentry *); +size_t llentry_free(struct llentry *); int llentry_update(struct llentry **, struct lltable *, struct sockaddr_storage *, struct ifnet *); diff --git a/freebsd/sys/net/if_media.c b/freebsd/sys/net/if_media.c index 46b57b42..3bc6122c 100644 --- a/freebsd/sys/net/if_media.c +++ b/freebsd/sys/net/if_media.c @@ -237,7 +237,7 @@ ifmedia_ioctl(ifp, ifr, ifm, cmd) /* * If no change, we're done. * XXX Automedia may invole software intervention. - * Keep going in case the the connected media changed. + * Keep going in case the connected media changed. * Similarly, if best match changed (kernel debugger?). */ if ((IFM_SUBTYPE(newmedia) != IFM_AUTO) && diff --git a/freebsd/sys/net/if_media.h b/freebsd/sys/net/if_media.h index 337ad685..2c833228 100644 --- a/freebsd/sys/net/if_media.h +++ b/freebsd/sys/net/if_media.h @@ -36,7 +36,7 @@ */ #ifndef _NET_IF_MEDIA_H_ -#define _NET_IF_MEDIA_H_ +#define _NET_IF_MEDIA_H_ /* * Prototypes and definitions for BSD/OS-compatible network interface @@ -144,13 +144,12 @@ uint64_t ifmedia_baudrate(int); #define IFM_10G_LR 18 /* 10GBase-LR 1310nm Single-mode */ #define IFM_10G_SR 19 /* 10GBase-SR 850nm Multi-mode */ #define IFM_10G_CX4 20 /* 10GBase CX4 copper */ -#define IFM_2500_SX 21 /* 2500BaseSX - multi-mode fiber */ -#define IFM_10G_TWINAX 22 /* 10GBase Twinax copper */ -#define IFM_10G_TWINAX_LONG 23 /* 10GBase Twinax Long copper */ -#define IFM_10G_LRM 24 /* 10GBase-LRM 850nm Multi-mode */ -#define IFM_UNKNOWN 25 /* media types not defined yet */ -#define IFM_10G_T 26 /* 10GBase-T - RJ45 */ - +#define IFM_2500_SX 21 /* 2500BaseSX - multi-mode fiber */ +#define IFM_10G_TWINAX 22 /* 10GBase Twinax copper */ +#define IFM_10G_TWINAX_LONG 23 /* 10GBase Twinax Long copper */ +#define IFM_10G_LRM 24 /* 10GBase-LRM 850nm Multi-mode */ +#define IFM_UNKNOWN 25 /* media types not defined yet */ +#define IFM_10G_T 26 /* 10GBase-T - RJ45 */ /* note 31 is the 
max! */ @@ -232,20 +231,20 @@ uint64_t ifmedia_baudrate(int); /* * ATM */ -#define IFM_ATM 0x000000a0 -#define IFM_ATM_UNKNOWN 3 -#define IFM_ATM_UTP_25 4 -#define IFM_ATM_TAXI_100 5 -#define IFM_ATM_TAXI_140 6 -#define IFM_ATM_MM_155 7 -#define IFM_ATM_SM_155 8 -#define IFM_ATM_UTP_155 9 -#define IFM_ATM_MM_622 10 -#define IFM_ATM_SM_622 11 +#define IFM_ATM 0x000000a0 +#define IFM_ATM_UNKNOWN 3 +#define IFM_ATM_UTP_25 4 +#define IFM_ATM_TAXI_100 5 +#define IFM_ATM_TAXI_140 6 +#define IFM_ATM_MM_155 7 +#define IFM_ATM_SM_155 8 +#define IFM_ATM_UTP_155 9 +#define IFM_ATM_MM_622 10 +#define IFM_ATM_SM_622 11 #define IFM_ATM_VIRTUAL 12 -#define IFM_ATM_SDH 0x00000100 /* SDH instead of SONET */ -#define IFM_ATM_NOSCRAMB 0x00000200 /* no scrambling */ -#define IFM_ATM_UNASSIGNED 0x00000400 /* unassigned cells */ +#define IFM_ATM_SDH 0x00000100 /* SDH instead of SONET */ +#define IFM_ATM_NOSCRAMB 0x00000200 /* no scrambling */ +#define IFM_ATM_UNASSIGNED 0x00000400 /* unassigned cells */ /* * CARP Common Address Redundancy Protocol @@ -295,22 +294,22 @@ uint64_t ifmedia_baudrate(int); #define IFM_STATUS_VALID IFM_AVALID /* List of "status valid" bits, for ifconfig(8). */ -#define IFM_STATUS_VALID_LIST { \ - IFM_AVALID, \ - 0 \ +#define IFM_STATUS_VALID_LIST { \ + IFM_AVALID, \ + 0 \ } /* * Macros to extract various bits of information from the media word. 
*/ -#define IFM_TYPE(x) ((x) & IFM_NMASK) -#define IFM_SUBTYPE(x) ((x) & IFM_TMASK) -#define IFM_TYPE_OPTIONS(x) ((x) & IFM_OMASK) -#define IFM_INST(x) (((x) & IFM_IMASK) >> IFM_ISHIFT) -#define IFM_OPTIONS(x) ((x) & (IFM_OMASK|IFM_GMASK)) -#define IFM_MODE(x) ((x) & IFM_MMASK) +#define IFM_TYPE(x) ((x) & IFM_NMASK) +#define IFM_SUBTYPE(x) ((x) & IFM_TMASK) +#define IFM_TYPE_OPTIONS(x) ((x) & IFM_OMASK) +#define IFM_INST(x) (((x) & IFM_IMASK) >> IFM_ISHIFT) +#define IFM_OPTIONS(x) ((x) & (IFM_OMASK | IFM_GMASK)) +#define IFM_MODE(x) ((x) & IFM_MMASK) -#define IFM_INST_MAX IFM_INST(IFM_IMASK) +#define IFM_INST_MAX IFM_INST(IFM_IMASK) /* * Macro to create a media word. @@ -371,6 +370,7 @@ struct ifmedia_description { } #define IFM_SUBTYPE_ETHERNET_ALIASES { \ + { IFM_10_T, "10baseT" }, \ { IFM_10_T, "UTP" }, \ { IFM_10_T, "10UTP" }, \ { IFM_10_2, "BNC" }, \ @@ -390,6 +390,23 @@ struct ifmedia_description { { IFM_1000_T, "1000TX" }, \ { IFM_1000_T, "1000T" }, \ { IFM_2500_SX, "2500SX" }, \ + \ + /* \ + * Shorthands for common media+option combinations as announced \ + * by miibus(4) \ + */ \ + { IFM_10_T | IFM_FDX, "10baseT-FDX" }, \ + { IFM_10_T | IFM_FDX | IFM_FLOW, "10baseT-FDX-flow" }, \ + { IFM_100_TX | IFM_FDX, "100baseTX-FDX" }, \ + { IFM_100_TX | IFM_FDX | IFM_FLOW, "100baseTX-FDX-flow" }, \ + { IFM_1000_T | IFM_FDX, "1000baseT-FDX" }, \ + { IFM_1000_T | IFM_FDX | IFM_FLOW, "1000baseT-FDX-flow" }, \ + { IFM_1000_T | IFM_FDX | IFM_FLOW | IFM_ETH_MASTER, \ + "1000baseT-FDX-flow-master" }, \ + { IFM_1000_T | IFM_FDX | IFM_ETH_MASTER, \ + "1000baseT-FDX-master" }, \ + { IFM_1000_T | IFM_ETH_MASTER, "1000baseT-master" }, \ + \ { 0, NULL }, \ } @@ -539,7 +556,7 @@ struct ifmedia_description { { 0, NULL }, \ } -# define IFM_SUBTYPE_ATM_DESCRIPTIONS { \ +#define IFM_SUBTYPE_ATM_DESCRIPTIONS { \ { IFM_ATM_UNKNOWN, "Unknown" }, \ { IFM_ATM_UTP_25, "UTP/25.6MBit" }, \ { IFM_ATM_TAXI_100, "Taxi/100MBit" }, \ @@ -553,7 +570,7 @@ struct ifmedia_description { { 0, NULL }, \ 
} -# define IFM_SUBTYPE_ATM_ALIASES { \ +#define IFM_SUBTYPE_ATM_ALIASES { \ { IFM_ATM_UNKNOWN, "UNKNOWN" }, \ { IFM_ATM_UTP_25, "UTP-25" }, \ { IFM_ATM_TAXI_100, "TAXI-100" }, \ @@ -574,7 +591,6 @@ struct ifmedia_description { { 0, NULL }, \ } - #define IFM_SUBTYPE_SHARED_DESCRIPTIONS { \ { IFM_AUTO, "autoselect" }, \ { IFM_MANUAL, "manual" }, \ @@ -584,6 +600,13 @@ struct ifmedia_description { #define IFM_SUBTYPE_SHARED_ALIASES { \ { IFM_AUTO, "auto" }, \ + \ + /* \ + * Shorthands for common media+option combinations as announced \ + * by miibus(4) \ + */ \ + { IFM_AUTO | IFM_FLOW, "auto-flow" }, \ + \ { 0, NULL }, \ } @@ -598,6 +621,15 @@ struct ifmedia_description { { 0, NULL }, \ } +#define IFM_SHARED_OPTION_ALIASES { \ + { IFM_FDX, "fdx" }, \ + { IFM_HDX, "hdx" }, \ + { IFM_FLOW, "flow" }, \ + { IFM_LOOP, "loop" }, \ + { IFM_LOOP, "loopback" }, \ + { 0, NULL }, \ +} + /* * Baudrate descriptions for the various media types. */ @@ -606,7 +638,7 @@ struct ifmedia_baudrate { uint64_t ifmb_baudrate; /* corresponding baudrate */ }; -#define IFM_BAUDRATE_DESCRIPTIONS { \ +#define IFM_BAUDRATE_DESCRIPTIONS { \ { IFM_ETHER | IFM_10_T, IF_Mbps(10) }, \ { IFM_ETHER | IFM_10_2, IF_Mbps(10) }, \ { IFM_ETHER | IFM_10_5, IF_Mbps(10) }, \ @@ -670,10 +702,10 @@ struct ifmedia_status_description { const char *ifms_string[2]; }; -#define IFM_STATUS_DESC(ifms, bit) \ +#define IFM_STATUS_DESC(ifms, bit) \ (ifms)->ifms_string[((ifms)->ifms_bit & (bit)) ? 1 : 0] -#define IFM_STATUS_DESCRIPTIONS { \ +#define IFM_STATUS_DESCRIPTIONS { \ { IFM_ETHER, IFM_AVALID, IFM_ACTIVE, \ { "no carrier", "active" } }, \ { IFM_FDDI, IFM_AVALID, IFM_ACTIVE, \ diff --git a/freebsd/sys/net/if_spppfr.c b/freebsd/sys/net/if_spppfr.c index be080a7d..f25bad7b 100644 --- a/freebsd/sys/net/if_spppfr.c +++ b/freebsd/sys/net/if_spppfr.c @@ -282,6 +282,8 @@ drop: ++ifp->if_ierrors; if (! (ifp->if_flags & IFF_UP)) goto drop; + M_SETFIB(m, ifp->if_fib); + /* Check queue. 
*/ if (netisr_queue(isr, m)) { /* (0) on success. */ if (debug) diff --git a/freebsd/sys/net/if_spppsubr.c b/freebsd/sys/net/if_spppsubr.c index d5f3487a..01743f47 100644 --- a/freebsd/sys/net/if_spppsubr.c +++ b/freebsd/sys/net/if_spppsubr.c @@ -739,6 +739,7 @@ sppp_input(struct ifnet *ifp, struct mbuf *m) goto drop; SPPP_UNLOCK(sp); + M_SETFIB(m, ifp->if_fib); /* Check queue. */ if (netisr_queue(isr, m)) { /* (0) on success. */ if (debug) diff --git a/freebsd/sys/net/if_stf.c b/freebsd/sys/net/if_stf.c index 79466119..a808548c 100644 --- a/freebsd/sys/net/if_stf.c +++ b/freebsd/sys/net/if_stf.c @@ -787,6 +787,7 @@ in_stf_input(m, off) */ ifp->if_ipackets++; ifp->if_ibytes += m->m_pkthdr.len; + M_SETFIB(m, ifp->if_fib); netisr_dispatch(NETISR_IPV6, m); } diff --git a/freebsd/sys/net/if_tap.c b/freebsd/sys/net/if_tap.c index cd775369..6e6b6a64 100644 --- a/freebsd/sys/net/if_tap.c +++ b/freebsd/sys/net/if_tap.c @@ -44,6 +44,7 @@ #include <sys/conf.h> #include <sys/fcntl.h> #include <sys/filio.h> +#include <sys/jail.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/mbuf.h> @@ -66,8 +67,9 @@ #include <net/if.h> #include <net/if_clone.h> #include <net/if_dl.h> -#include <net/route.h> #include <net/if_types.h> +#include <net/route.h> +#include <net/vnet.h> #include <netinet/in.h> @@ -216,6 +218,8 @@ tap_destroy(struct tap_softc *tp) KASSERT(!(tp->tap_flags & TAP_OPEN), ("%s flags is out of sync", ifp->if_xname)); + CURVNET_SET(ifp->if_vnet); + seldrain(&tp->tap_rsel); knlist_destroy(&tp->tap_rsel.si_note); destroy_dev(tp->tap_dev); ether_ifdetach(ifp); @@ -223,6 +227,7 @@ tap_destroy(struct tap_softc *tp) mtx_destroy(&tp->tap_mtx); free(tp, M_TAP); + CURVNET_RESTORE(); } static void @@ -364,6 +369,7 @@ tapclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **d if (unit == -1) append_unit = 1; + CURVNET_SET(CRED_TO_VNET(cred)); /* find any existing device, or allocate new unit number */ i = clone_create(&tapclones, &tap_cdevsw, 
&unit, dev, extra); if (i) { @@ -382,6 +388,7 @@ tapclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **d } if_clone_create(name, namelen, NULL); + CURVNET_RESTORE(); } /* tapclone */ @@ -526,6 +533,7 @@ tapclose(struct cdev *dev, int foo, int bar, struct thread *td) /* junk all pending output */ mtx_lock(&tp->tap_mtx); + CURVNET_SET(ifp->if_vnet); IF_DRAIN(&ifp->if_snd); /* @@ -549,6 +557,8 @@ tapclose(struct cdev *dev, int foo, int bar, struct thread *td) } if_link_state_change(ifp, LINK_STATE_DOWN); + CURVNET_RESTORE(); + funsetown(&tp->tap_sigio); selwakeuppri(&tp->tap_rsel, PZERO+1); KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); @@ -950,7 +960,9 @@ tapwrite(struct cdev *dev, struct uio *uio, int flag) } /* Pass packet up to parent. */ + CURVNET_SET(ifp->if_vnet); (*ifp->if_input)(ifp, m); + CURVNET_RESTORE(); ifp->if_ipackets ++; /* ibytes are counted in parent */ return (0); diff --git a/freebsd/sys/net/if_tun.c b/freebsd/sys/net/if_tun.c index b6fa0e5a..444113f4 100644 --- a/freebsd/sys/net/if_tun.c +++ b/freebsd/sys/net/if_tun.c @@ -128,7 +128,7 @@ static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev); static void tuncreate(const char *name, struct cdev *dev); static int tunifioctl(struct ifnet *, u_long, caddr_t); -static int tuninit(struct ifnet *); +static void tuninit(struct ifnet *); static int tunmodevent(module_t, int, void *); static int tunoutput(struct ifnet *, struct mbuf *, struct sockaddr *, struct route *ro); @@ -230,8 +230,8 @@ tunclone(void *arg, struct ucred *cred, char *name, int namelen, i = clone_create(&tunclones, &tun_cdevsw, &u, dev, 0); if (i) { if (append_unit) { - namelen = snprintf(devname, sizeof(devname), "%s%d", name, - u); + namelen = snprintf(devname, sizeof(devname), "%s%d", + name, u); name = devname; } /* No preexisting struct cdev *, create one */ @@ -261,6 +261,7 @@ tun_destroy(struct tun_softc *tp) if_detach(TUN2IFP(tp)); if_free(TUN2IFP(tp)); destroy_dev(dev); 
+ seldrain(&tp->tun_rsel); knlist_destroy(&tp->tun_rsel.si_note); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); @@ -504,14 +505,13 @@ tunclose(struct cdev *dev, int foo, int bar, struct thread *td) return (0); } -static int +static void tuninit(struct ifnet *ifp) { struct tun_softc *tp = ifp->if_softc; #ifdef INET struct ifaddr *ifa; #endif - int error = 0; TUNDEBUG(ifp, "tuninit\n"); @@ -538,7 +538,6 @@ tuninit(struct ifnet *ifp) if_addr_runlock(ifp); #endif mtx_unlock(&tp->tun_mtx); - return (error); } /* @@ -562,12 +561,12 @@ tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) mtx_unlock(&tp->tun_mtx); break; case SIOCSIFADDR: - error = tuninit(ifp); - TUNDEBUG(ifp, "address set, error=%d\n", error); + tuninit(ifp); + TUNDEBUG(ifp, "address set\n"); break; case SIOCSIFDSTADDR: - error = tuninit(ifp); - TUNDEBUG(ifp, "destination address set, error=%d\n", error); + tuninit(ifp); + TUNDEBUG(ifp, "destination address set\n"); break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; @@ -587,11 +586,8 @@ tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) * tunoutput - queue packets from higher level ready to put out. */ static int -tunoutput( - struct ifnet *ifp, - struct mbuf *m0, - struct sockaddr *dst, - struct route *ro) +tunoutput(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, + struct route *ro) { struct tun_softc *tp = ifp->if_softc; u_short cached_tun_flags; @@ -671,10 +667,8 @@ tunoutput( } error = (ifp->if_transmit)(ifp, m0); - if (error) { - ifp->if_collisions++; + if (error) return (ENOBUFS); - } ifp->if_opackets++; return (0); } @@ -683,7 +677,8 @@ tunoutput( * the cdevsw interface is now pretty minimal. 
*/ static int -tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) +tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, + struct thread *td) { int error; struct tun_softc *tp = dev->si_drv1; @@ -875,7 +870,6 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag) struct tun_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; - int error = 0; uint32_t family; int isr; @@ -895,7 +889,7 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag) if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, 0, M_PKTHDR)) == NULL) { ifp->if_ierrors++; - return (error); + return (ENOBUFS); } m->m_pkthdr.rcvif = ifp; @@ -950,6 +944,7 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag) ifp->if_ibytes += m->m_pkthdr.len; ifp->if_ipackets++; CURVNET_SET(ifp->if_vnet); + M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); CURVNET_RESTORE(); return (0); diff --git a/freebsd/sys/net/if_var.h b/freebsd/sys/net/if_var.h index 172ebe0e..c5c489fb 100644 --- a/freebsd/sys/net/if_var.h +++ b/freebsd/sys/net/if_var.h @@ -197,17 +197,18 @@ struct ifnet { /* protected by if_addr_mtx */ void *if_pf_kif; void *if_lagg; /* lagg glue */ - u_char if_alloctype; /* if_type at time of allocation */ + u_char if_alloctype; /* if_type at time of allocation */ /* * Spare fields are added so that we can modify sensitive data * structures without changing the kernel binary interface, and must * be used with care where binary compatibility is required. 
*/ - char if_cspare[3]; + char if_cspare[3]; char *if_description; /* interface description */ - void *if_pspare[7]; - int if_ispare[4]; + void *if_pspare[7]; /* 1 netmap, 6 TBD */ + int if_ispare[3]; + u_int if_fib; /* interface FIB */ }; typedef void if_init_f_t(void *); @@ -249,9 +250,15 @@ typedef void if_init_f_t(void *); #define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_mtx, \ "if_addr_mtx", NULL, MTX_DEF) #define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_mtx) -#define IF_ADDR_LOCK(if) mtx_lock(&(if)->if_addr_mtx) -#define IF_ADDR_UNLOCK(if) mtx_unlock(&(if)->if_addr_mtx) +#define IF_ADDR_WLOCK(if) mtx_lock(&(if)->if_addr_mtx) +#define IF_ADDR_WUNLOCK(if) mtx_unlock(&(if)->if_addr_mtx) +#define IF_ADDR_RLOCK(if) mtx_lock(&(if)->if_addr_mtx) +#define IF_ADDR_RUNLOCK(if) mtx_unlock(&(if)->if_addr_mtx) #define IF_ADDR_LOCK_ASSERT(if) mtx_assert(&(if)->if_addr_mtx, MA_OWNED) +#define IF_ADDR_WLOCK_ASSERT(if) mtx_assert(&(if)->if_addr_mtx, MA_OWNED) +/* XXX: Compat. */ +#define IF_ADDR_LOCK(if) IF_ADDR_WLOCK(if) +#define IF_ADDR_UNLOCK(if) IF_ADDR_WUNLOCK(if) /* * Function variations on locking macros intended to be used by loadable diff --git a/freebsd/sys/net/if_vlan.c b/freebsd/sys/net/if_vlan.c index 576243d9..81c151a5 100644 --- a/freebsd/sys/net/if_vlan.c +++ b/freebsd/sys/net/if_vlan.c @@ -36,9 +36,8 @@ * we need to pretend to be enough of an Ethernet implementation * to make arp work. The way we do this is by telling everyone * that we are an Ethernet, and then catch the packets that - * ether_output() left on our output queue when it calls - * if_start(), rewrite them for use by the real outgoing interface, - * and ask it to send them. + * ether_output() sends to us via if_transmit(), rewrite them for + * use by the real outgoing interface, and ask it to send them. 
*/ #include <sys/cdefs.h> @@ -181,16 +180,17 @@ static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, #endif static void trunk_destroy(struct ifvlantrunk *trunk); -static void vlan_start(struct ifnet *ifp); static void vlan_init(void *foo); static void vlan_input(struct ifnet *ifp, struct mbuf *m); static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); +static void vlan_qflush(struct ifnet *ifp); static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)); static int vlan_setflags(struct ifnet *ifp, int status); static int vlan_setmulti(struct ifnet *ifp); +static int vlan_transmit(struct ifnet *ifp, struct mbuf *m); static void vlan_unconfig(struct ifnet *ifp); -static void vlan_unconfig_locked(struct ifnet *ifp); +static void vlan_unconfig_locked(struct ifnet *ifp, int departing); static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag); static void vlan_link_state(struct ifnet *ifp, int link); static void vlan_capabilities(struct ifvlan *ifv); @@ -545,7 +545,7 @@ vlan_ifdetach(void *arg __unused, struct ifnet *ifp) #ifdef VLAN_ARRAY for (i = 0; i < VLAN_ARRAY_SIZE; i++) if ((ifv = ifp->if_vlantrunk->vlans[i])) { - vlan_unconfig_locked(ifv->ifv_ifp); + vlan_unconfig_locked(ifv->ifv_ifp, 1); if (ifp->if_vlantrunk == NULL) break; } @@ -553,7 +553,7 @@ vlan_ifdetach(void *arg __unused, struct ifnet *ifp) restart: for (i = 0; i < (1 << ifp->if_vlantrunk->hwidth); i++) if ((ifv = LIST_FIRST(&ifp->if_vlantrunk->hash[i]))) { - vlan_unconfig_locked(ifv->ifv_ifp); + vlan_unconfig_locked(ifv->ifv_ifp, 1); if (ifp->if_vlantrunk) goto restart; /* trunk->hwidth can change */ else @@ -809,9 +809,9 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) /* NB: mtu is not set here */ ifp->if_init = vlan_init; - ifp->if_start = vlan_start; + ifp->if_transmit = vlan_transmit; + ifp->if_qflush = vlan_qflush; ifp->if_ioctl = vlan_ioctl; - ifp->if_snd.ifq_maxlen = 
ifqmaxlen; ifp->if_flags = VLAN_IFFLAGS; ether_ifattach(ifp, eaddr); /* Now undo some of the damage... */ @@ -823,7 +823,7 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) error = vlan_config(ifv, p, tag); if (error != 0) { /* - * Since we've partialy failed, we need to back + * Since we've partially failed, we need to back * out all the way, otherwise userland could get * confused. Thus, we destroy the interface. */ @@ -867,99 +867,99 @@ vlan_init(void *foo __unused) } /* - * The if_start method for vlan(4) interface. It doesn't - * raises the IFF_DRV_OACTIVE flag, since it is called - * only from IFQ_HANDOFF() macro in ether_output_frame(). - * If the interface queue is full, and vlan_start() is - * not called, the queue would never get emptied and - * interface would stall forever. + * The if_transmit method for vlan(4) interface. */ -static void -vlan_start(struct ifnet *ifp) +static int +vlan_transmit(struct ifnet *ifp, struct mbuf *m) { struct ifvlan *ifv; struct ifnet *p; - struct mbuf *m; - int error; + int error, len, mcast; ifv = ifp->if_softc; p = PARENT(ifv); + len = m->m_pkthdr.len; + mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; - for (;;) { - IF_DEQUEUE(&ifp->if_snd, m); - if (m == NULL) - break; - BPF_MTAP(ifp, m); + BPF_MTAP(ifp, m); - /* - * Do not run parent's if_start() if the parent is not up, - * or parent's driver will cause a system crash. - */ - if (!UP_AND_RUNNING(p)) { - m_freem(m); - ifp->if_collisions++; - continue; - } + /* + * Do not run parent's if_transmit() if the parent is not up, + * or parent's driver will cause a system crash. + */ + if (!UP_AND_RUNNING(p)) { + m_freem(m); + ifp->if_oerrors++; + return (0); + } - /* - * Pad the frame to the minimum size allowed if told to. - * This option is in accord with IEEE Std 802.1Q, 2003 Ed., - * paragraph C.4.4.3.b. 
It can help to work around buggy - * bridges that violate paragraph C.4.4.3.a from the same - * document, i.e., fail to pad short frames after untagging. - * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but - * untagging it will produce a 62-byte frame, which is a runt - * and requires padding. There are VLAN-enabled network - * devices that just discard such runts instead or mishandle - * them somehow. - */ - if (soft_pad) { - static char pad[8]; /* just zeros */ - int n; - - for (n = ETHERMIN + ETHER_HDR_LEN - m->m_pkthdr.len; - n > 0; n -= sizeof(pad)) - if (!m_append(m, min(n, sizeof(pad)), pad)) - break; - - if (n > 0) { - if_printf(ifp, "cannot pad short frame\n"); - ifp->if_oerrors++; - m_freem(m); - continue; - } - } + /* + * Pad the frame to the minimum size allowed if told to. + * This option is in accord with IEEE Std 802.1Q, 2003 Ed., + * paragraph C.4.4.3.b. It can help to work around buggy + * bridges that violate paragraph C.4.4.3.a from the same + * document, i.e., fail to pad short frames after untagging. + * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but + * untagging it will produce a 62-byte frame, which is a runt + * and requires padding. There are VLAN-enabled network + * devices that just discard such runts instead or mishandle + * them somehow. + */ + if (soft_pad) { + static char pad[8]; /* just zeros */ + int n; - /* - * If underlying interface can do VLAN tag insertion itself, - * just pass the packet along. However, we need some way to - * tell the interface where the packet came from so that it - * knows how to find the VLAN tag to use, so we attach a - * packet tag that holds it. 
- */ - if (p->if_capenable & IFCAP_VLAN_HWTAGGING) { - m->m_pkthdr.ether_vtag = ifv->ifv_tag; - m->m_flags |= M_VLANTAG; - } else { - m = ether_vlanencap(m, ifv->ifv_tag); - if (m == NULL) { - if_printf(ifp, - "unable to prepend VLAN header\n"); - ifp->if_oerrors++; - continue; - } + for (n = ETHERMIN + ETHER_HDR_LEN - m->m_pkthdr.len; + n > 0; n -= sizeof(pad)) + if (!m_append(m, min(n, sizeof(pad)), pad)) + break; + + if (n > 0) { + if_printf(ifp, "cannot pad short frame\n"); + ifp->if_oerrors++; + m_freem(m); + return (0); } + } - /* - * Send it, precisely as ether_output() would have. - * We are already running at splimp. - */ - error = (p->if_transmit)(p, m); - if (!error) - ifp->if_opackets++; - else + /* + * If underlying interface can do VLAN tag insertion itself, + * just pass the packet along. However, we need some way to + * tell the interface where the packet came from so that it + * knows how to find the VLAN tag to use, so we attach a + * packet tag that holds it. + */ + if (p->if_capenable & IFCAP_VLAN_HWTAGGING) { + m->m_pkthdr.ether_vtag = ifv->ifv_tag; + m->m_flags |= M_VLANTAG; + } else { + m = ether_vlanencap(m, ifv->ifv_tag); + if (m == NULL) { + if_printf(ifp, "unable to prepend VLAN header\n"); ifp->if_oerrors++; + return (0); + } } + + /* + * Send it, precisely as ether_output() would have. + */ + error = (p->if_transmit)(p, m); + if (!error) { + ifp->if_opackets++; + ifp->if_omcasts += mcast; + ifp->if_obytes += len; + } else + ifp->if_oerrors++; + return (error); +} + +/* + * The ifp->if_qflush entry point for vlan(4) is a no-op. 
+ */ +static void +vlan_qflush(struct ifnet *ifp __unused) +{ } static void @@ -1165,17 +1165,18 @@ vlan_unconfig(struct ifnet *ifp) { VLAN_LOCK(); - vlan_unconfig_locked(ifp); + vlan_unconfig_locked(ifp, 0); VLAN_UNLOCK(); } static void -vlan_unconfig_locked(struct ifnet *ifp) +vlan_unconfig_locked(struct ifnet *ifp, int departing) { struct ifvlantrunk *trunk; struct vlan_mc_entry *mc; struct ifvlan *ifv; struct ifnet *parent; + int error; VLAN_LOCK_ASSERT(); @@ -1206,13 +1207,21 @@ vlan_unconfig_locked(struct ifnet *ifp) ETHER_ADDR_LEN); /* - * This may fail if the parent interface is - * being detached. Regardless, we should do a - * best effort to free this interface as much - * as possible as all callers expect vlan - * destruction to succeed. + * If the parent interface is being detached, + * all its multicast addresses have already + * been removed. Warn about errors if + * if_delmulti() does fail, but don't abort as + * all callers expect vlan destruction to + * succeed. */ - (void)if_delmulti(parent, (struct sockaddr *)&sdl); + if (!departing) { + error = if_delmulti(parent, + (struct sockaddr *)&sdl); + if (error) + if_printf(ifp, + "Failed to delete multicast address from parent: %d\n", + error); + } SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); free(mc, M_VLAN); } diff --git a/freebsd/sys/net/netisr.c b/freebsd/sys/net/netisr.c index 465b0b29..6ba71233 100644 --- a/freebsd/sys/net/netisr.c +++ b/freebsd/sys/net/netisr.c @@ -2,8 +2,12 @@ /*- * Copyright (c) 2007-2009 Robert N. M. Watson + * Copyright (c) 2010 Juniper Networks, Inc. * All rights reserved. * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -34,13 +38,13 @@ __FBSDID("$FreeBSD$"); * dispatched) and asynchronous (deferred dispatch) processing of packets by * registered protocol handlers. Callers pass a protocol identifier and * packet to netisr, along with a direct dispatch hint, and work will either - * be immediately processed with the registered handler, or passed to a - * kernel software interrupt (SWI) thread for deferred dispatch. Callers - * will generally select one or the other based on: + * be immediately processed by the registered handler, or passed to a + * software interrupt (SWI) thread for deferred dispatch. Callers will + * generally select one or the other based on: * - * - Might directly dispatching a netisr handler lead to code reentrance or + * - Whether directly dispatching a netisr handler could lead to code reentrance or * lock recursion, such as entering the socket code from the socket code. - * - Might directly dispatching a netisr handler lead to recursive + * - Whether directly dispatching a netisr handler could lead to recursive * processing, such as when decapsulating several wrapped layers of tunnel * information (IPSEC within IPSEC within ...). * @@ -56,9 +60,9 @@ __FBSDID("$FreeBSD$"); * more than one flow. * * netisr supports several policy variations, represented by the - * NETISR_POLICY_* constants, allowing protocols to play a varying role in + * NETISR_POLICY_* constants, allowing protocols to play various roles in * identifying flows, assigning work to CPUs, etc. These are described in - * detail in netisr.h. + * netisr.h.
*/ #include <rtems/bsd/local/opt_ddb.h> @@ -85,9 +89,11 @@ __FBSDID("$FreeBSD$"); #include <ddb/ddb.h> #endif +#define _WANT_NETISR_INTERNAL /* Enable definitions from netisr_internal.h */ #include <net/if.h> #include <net/if_var.h> #include <net/netisr.h> +#include <net/netisr_internal.h> #include <net/vnet.h> /*- @@ -97,13 +103,13 @@ __FBSDID("$FreeBSD$"); * * The following data structures and fields are protected by this lock: * - * - The np array, including all fields of struct netisr_proto. + * - The netisr_proto array, including all fields of struct netisr_proto. * - The nws array, including all fields of struct netisr_worker. * - The nws_array array. * * Note: the NETISR_LOCKING define controls whether read locks are acquired * in packet processing paths requiring netisr registration stability. This - * is disabled by default as it can lead to a measurable performance + * is disabled by default as it can lead to measurable performance * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and * because netisr registration and unregistration is extremely rare at * runtime. If it becomes more common, this decision should be revisited. @@ -158,111 +164,58 @@ SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RW, */ static int netisr_maxthreads = -1; /* Max number of threads. */ TUNABLE_INT("net.isr.maxthreads", &netisr_maxthreads); -SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RD, +SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN, &netisr_maxthreads, 0, "Use at most this many CPUs for netisr processing"); static int netisr_bindthreads = 0; /* Bind threads to CPUs. 
*/ TUNABLE_INT("net.isr.bindthreads", &netisr_bindthreads); -SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RD, +SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN, &netisr_bindthreads, 0, "Bind netisr threads to CPUs."); /* - * Limit per-workstream queues to at most net.isr.maxqlimit, both for initial - * configuration and later modification using netisr_setqlimit(). + * Limit per-workstream mbuf queue limits to at most net.isr.maxqlimit, + * both for initial configuration and later modification using + * netisr_setqlimit(). */ #define NETISR_DEFAULT_MAXQLIMIT 10240 static u_int netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT; TUNABLE_INT("net.isr.maxqlimit", &netisr_maxqlimit); -SYSCTL_INT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RD, +SYSCTL_UINT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN, &netisr_maxqlimit, 0, "Maximum netisr per-protocol, per-CPU queue depth."); /* - * The default per-workstream queue limit for protocols that don't initialize - * the nh_qlimit field of their struct netisr_handler. If this is set above - * netisr_maxqlimit, we truncate it to the maximum during boot. + * The default per-workstream mbuf queue limit for protocols that don't + * initialize the nh_qlimit field of their struct netisr_handler. If this is + * set above netisr_maxqlimit, we truncate it to the maximum during boot. */ #define NETISR_DEFAULT_DEFAULTQLIMIT 256 static u_int netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT; TUNABLE_INT("net.isr.defaultqlimit", &netisr_defaultqlimit); -SYSCTL_INT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RD, +SYSCTL_UINT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RDTUN, &netisr_defaultqlimit, 0, "Default netisr per-protocol, per-CPU queue limit if not set by protocol"); /* - * Each protocol is described by a struct netisr_proto, which holds all - * global per-protocol information. This data structure is set up by - * netisr_register(), and derived from the public struct netisr_handler.
- */ -struct netisr_proto { - const char *np_name; /* Character string protocol name. */ - netisr_handler_t *np_handler; /* Protocol handler. */ - netisr_m2flow_t *np_m2flow; /* Query flow for untagged packet. */ - netisr_m2cpuid_t *np_m2cpuid; /* Query CPU to process packet on. */ - netisr_drainedcpu_t *np_drainedcpu; /* Callback when drained a queue. */ - u_int np_qlimit; /* Maximum per-CPU queue depth. */ - u_int np_policy; /* Work placement policy. */ -}; - -#define NETISR_MAXPROT 16 /* Compile-time limit. */ - -/* - * The np array describes all registered protocols, indexed by protocol - * number. + * Store and export the compile-time constant NETISR_MAXPROT limit on the + * number of protocols that can register with netisr at a time. This is + * required for crashdump analysis, as it sizes netisr_proto[]. */ -static struct netisr_proto np[NETISR_MAXPROT]; - -/* - * Protocol-specific work for each workstream is described by struct - * netisr_work. Each work descriptor consists of an mbuf queue and - * statistics. - */ -struct netisr_work { - /* - * Packet queue, linked by m_nextpkt. - */ - struct mbuf *nw_head; - struct mbuf *nw_tail; - u_int nw_len; - u_int nw_qlimit; - u_int nw_watermark; - - /* - * Statistics -- written unlocked, but mostly from curcpu. - */ - u_int64_t nw_dispatched; /* Number of direct dispatches. */ - u_int64_t nw_hybrid_dispatched; /* "" hybrid dispatches. */ - u_int64_t nw_qdrops; /* "" drops. */ - u_int64_t nw_queued; /* "" enqueues. */ - u_int64_t nw_handled; /* "" handled in worker. */ -}; +static u_int netisr_maxprot = NETISR_MAXPROT; +SYSCTL_UINT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD, + &netisr_maxprot, 0, + "Compile-time limit on the number of protocols supported by netisr."); /* - * Workstreams hold a set of ordered work across each protocol, and are - * described by netisr_workstream. Each workstream is associated with a - * worker thread, which in turn is pinned to a CPU. 
Work associated with a - * workstream can be processd in other threads during direct dispatch; - * concurrent processing is prevented by the NWS_RUNNING flag, which - * indicates that a thread is already processing the work queue. + * The netisr_proto array describes all registered protocols, indexed by + * protocol number. See netisr_internal.h for more details. */ -struct netisr_workstream { - struct intr_event *nws_intr_event; /* Handler for stream. */ - void *nws_swi_cookie; /* swi(9) cookie for stream. */ - struct mtx nws_mtx; /* Synchronize work. */ - u_int nws_cpu; /* CPU pinning. */ - u_int nws_flags; /* Wakeup flags. */ - u_int nws_pendingbits; /* Scheduled protocols. */ - - /* - * Each protocol has per-workstream data. - */ - struct netisr_work nws_work[NETISR_MAXPROT]; -} __aligned(CACHE_LINE_SIZE); +static struct netisr_proto netisr_proto[NETISR_MAXPROT]; #ifndef __rtems__ /* - * Per-CPU workstream data. + * Per-CPU workstream data. See netisr_internal.h for more details. */ DPCPU_DEFINE(struct netisr_workstream, nws); @@ -278,20 +231,13 @@ static u_int nws_array[MAXCPU]; * CPUs once fully started. */ static u_int nws_count; -SYSCTL_INT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD, +SYSCTL_UINT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD, &nws_count, 0, "Number of extant netisr threads."); #else /* __rtems__ */ static struct netisr_workstream rtems_bsd_nws; #endif /* __rtems__ */ /* - * Per-workstream flags. - */ -#define NWS_RUNNING 0x00000001 /* Currently running in a thread. */ -#define NWS_DISPATCHING 0x00000002 /* Currently being direct-dispatched. */ -#define NWS_SCHEDULED 0x00000004 /* Signal issued. */ - -/* * Synchronization for each workstream: a mutex protects all mutable fields * in each stream, including per-protocol state (mbuf queues). The SWI is * woken up if asynchronous dispatch is required. @@ -324,7 +270,7 @@ netisr_get_cpuid(u_int cpunumber) } /* - * The default implementation of -> CPU ID mapping. 
+ * The default implementation of flow -> CPU ID mapping. * * Non-static so that protocols can use it to map their own work to specific * CPUs in a manner consistent to netisr for affinity purposes. @@ -381,36 +327,34 @@ netisr_register(const struct netisr_handler *nhp) * Test that no existing registration exists for this protocol. */ NETISR_WLOCK(); - KASSERT(np[proto].np_name == NULL, + KASSERT(netisr_proto[proto].np_name == NULL, ("%s(%u, %s): name present", __func__, proto, name)); - KASSERT(np[proto].np_handler == NULL, + KASSERT(netisr_proto[proto].np_handler == NULL, ("%s(%u, %s): handler present", __func__, proto, name)); - np[proto].np_name = name; - np[proto].np_handler = nhp->nh_handler; - np[proto].np_m2flow = nhp->nh_m2flow; - np[proto].np_m2cpuid = nhp->nh_m2cpuid; - np[proto].np_drainedcpu = nhp->nh_drainedcpu; + netisr_proto[proto].np_name = name; + netisr_proto[proto].np_handler = nhp->nh_handler; + netisr_proto[proto].np_m2flow = nhp->nh_m2flow; + netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid; + netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu; if (nhp->nh_qlimit == 0) - np[proto].np_qlimit = netisr_defaultqlimit; + netisr_proto[proto].np_qlimit = netisr_defaultqlimit; else if (nhp->nh_qlimit > netisr_maxqlimit) { printf("%s: %s requested queue limit %u capped to " "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit, netisr_maxqlimit); - np[proto].np_qlimit = netisr_maxqlimit; + netisr_proto[proto].np_qlimit = netisr_maxqlimit; } else - np[proto].np_qlimit = nhp->nh_qlimit; - np[proto].np_policy = nhp->nh_policy; - for (i = 0; i <= mp_maxid; i++) { - if (CPU_ABSENT(i)) - continue; + netisr_proto[proto].np_qlimit = nhp->nh_qlimit; + netisr_proto[proto].np_policy = nhp->nh_policy; + CPU_FOREACH(i) { #ifndef __rtems__ npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; #else /* __rtems__ */ npwp = &rtems_bsd_nws.nws_work[proto]; #endif /* __rtems__ */ bzero(npwp, sizeof(*npwp)); - npwp->nw_qlimit = np[proto].np_qlimit; + npwp->nw_qlimit = 
netisr_proto[proto].np_qlimit; } NETISR_WUNLOCK(); } @@ -435,13 +379,11 @@ netisr_clearqdrops(const struct netisr_handler *nhp) ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); - KASSERT(np[proto].np_handler != NULL, + KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); - for (i = 0; i <= mp_maxid; i++) { - if (CPU_ABSENT(i)) - continue; + CPU_FOREACH(i) { #ifndef __rtems__ npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; #else /* __rtems__ */ @@ -453,7 +395,7 @@ netisr_clearqdrops(const struct netisr_handler *nhp) } /* - * Query the current drop counters across all workstreams for a protocol. + * Query current drop counters across all workstreams for a protocol. */ void netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp) @@ -474,13 +416,11 @@ netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp) ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_RLOCK(&tracker); - KASSERT(np[proto].np_handler != NULL, + KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); - for (i = 0; i <= mp_maxid; i++) { - if (CPU_ABSENT(i)) - continue; + CPU_FOREACH(i) { #ifndef __rtems__ npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; #else /* __rtems__ */ @@ -492,7 +432,7 @@ netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp) } /* - * Query the current queue limit for per-workstream queues for a protocol. + * Query current per-workstream queue limit for a protocol. 
*/ void netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp) @@ -511,10 +451,10 @@ netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp) ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_RLOCK(&tracker); - KASSERT(np[proto].np_handler != NULL, + KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); - *qlimitp = np[proto].np_qlimit; + *qlimitp = netisr_proto[proto].np_qlimit; NETISR_RUNLOCK(&tracker); } @@ -543,14 +483,12 @@ netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit) ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); - KASSERT(np[proto].np_handler != NULL, + KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); - np[proto].np_qlimit = qlimit; - for (i = 0; i <= mp_maxid; i++) { - if (CPU_ABSENT(i)) - continue; + netisr_proto[proto].np_qlimit = qlimit; + CPU_FOREACH(i) { #ifndef __rtems__ npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; #else /* __rtems__ */ @@ -608,19 +546,17 @@ netisr_unregister(const struct netisr_handler *nhp) ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); - KASSERT(np[proto].np_handler != NULL, + KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); - np[proto].np_name = NULL; - np[proto].np_handler = NULL; - np[proto].np_m2flow = NULL; - np[proto].np_m2cpuid = NULL; - np[proto].np_qlimit = 0; - np[proto].np_policy = 0; - for (i = 0; i <= mp_maxid; i++) { - if (CPU_ABSENT(i)) - continue; + netisr_proto[proto].np_name = NULL; + netisr_proto[proto].np_handler = NULL; + netisr_proto[proto].np_m2flow = NULL; + netisr_proto[proto].np_m2cpuid = NULL; + netisr_proto[proto].np_qlimit = 0; + netisr_proto[proto].np_policy = 0; + CPU_FOREACH(i) { #ifndef __rtems__ npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; #else /* __rtems__ */ @@ -744,22 +680,23 
@@ netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto) if (local_npw.nw_head == NULL) local_npw.nw_tail = NULL; local_npw.nw_len--; - VNET_ASSERT(m->m_pkthdr.rcvif != NULL); + VNET_ASSERT(m->m_pkthdr.rcvif != NULL, + ("%s:%d rcvif == NULL: m=%p", __func__, __LINE__, m)); CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); - np[proto].np_handler(m); + netisr_proto[proto].np_handler(m); CURVNET_RESTORE(); } KASSERT(local_npw.nw_len == 0, ("%s(%u): len %u", __func__, proto, local_npw.nw_len)); - if (np[proto].np_drainedcpu) - np[proto].np_drainedcpu(nwsp->nws_cpu); + if (netisr_proto[proto].np_drainedcpu) + netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu); NWS_LOCK(nwsp); npwp->nw_handled += handled; return (handled); } /* - * SWI handler for netisr -- processes prackets in a set of workstreams that + * SWI handler for netisr -- processes packets in a set of workstreams that * it owns, woken up by calls to NWS_SIGNAL(). If this workstream is already * being direct dispatched, go back to sleep and wait for the dispatching * thread to wake us up again. @@ -827,6 +764,11 @@ netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto, npwp->nw_len++; if (npwp->nw_len > npwp->nw_watermark) npwp->nw_watermark = npwp->nw_len; + + /* + * We must set the bit regardless of NWS_RUNNING, so that + * swi_net() keeps calling netisr_process_workstream_proto(). 
+ */ nwsp->nws_pendingbits |= (1 << proto); if (!(nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) { @@ -887,10 +829,10 @@ netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m) #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif - KASSERT(np[proto].np_handler != NULL, + KASSERT(netisr_proto[proto].np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); - m = netisr_select_cpuid(&np[proto], source, m, &cpuid); + m = netisr_select_cpuid(&netisr_proto[proto], source, m, &cpuid); if (m != NULL) { KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); @@ -911,7 +853,7 @@ netisr_queue(u_int proto, struct mbuf *m) } /* - * Dispatch a packet for netisr processing, direct dispatch permitted by + * Dispatch a packet for netisr processing; direct dispatch is permitted by * calling context. */ int @@ -936,7 +878,7 @@ netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif - KASSERT(np[proto].np_handler != NULL, + KASSERT(netisr_proto[proto].np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); /* @@ -951,7 +893,7 @@ netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) npwp = &nwsp->nws_work[proto]; npwp->nw_dispatched++; npwp->nw_handled++; - np[proto].np_handler(m); + netisr_proto[proto].np_handler(m); error = 0; goto out_unlock; } @@ -961,7 +903,7 @@ netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) * dispatch if we're on the right CPU and the netisr worker isn't * already running. 
*/ - m = netisr_select_cpuid(&np[proto], source, m, &cpuid); + m = netisr_select_cpuid(&netisr_proto[proto], source, m, &cpuid); if (m == NULL) { error = ENOBUFS; goto out_unlock; @@ -1000,7 +942,7 @@ netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) */ nwsp->nws_flags |= NWS_DISPATCHING; NWS_UNLOCK(nwsp); - np[proto].np_handler(m); + netisr_proto[proto].np_handler(m); NWS_LOCK(nwsp); nwsp->nws_flags &= ~NWS_DISPATCHING; npwp->nw_handled++; @@ -1171,6 +1113,166 @@ netisr_start(void *arg) SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL); #endif /* __rtems__ */ +/* + * Sysctl monitoring for netisr: query a list of registered protocols. + */ +static int +sysctl_netisr_proto(SYSCTL_HANDLER_ARGS) +{ + struct rm_priotracker tracker; + struct sysctl_netisr_proto *snpp, *snp_array; + struct netisr_proto *npp; + u_int counter, proto; + int error; + + if (req->newptr != NULL) + return (EINVAL); + snp_array = malloc(sizeof(*snp_array) * NETISR_MAXPROT, M_TEMP, + M_ZERO | M_WAITOK); + counter = 0; + NETISR_RLOCK(&tracker); + for (proto = 0; proto < NETISR_MAXPROT; proto++) { + npp = &netisr_proto[proto]; + if (npp->np_name == NULL) + continue; + snpp = &snp_array[counter]; + snpp->snp_version = sizeof(*snpp); + strlcpy(snpp->snp_name, npp->np_name, NETISR_NAMEMAXLEN); + snpp->snp_proto = proto; + snpp->snp_qlimit = npp->np_qlimit; + snpp->snp_policy = npp->np_policy; + if (npp->np_m2flow != NULL) + snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW; + if (npp->np_m2cpuid != NULL) + snpp->snp_flags |= NETISR_SNP_FLAGS_M2CPUID; + if (npp->np_drainedcpu != NULL) + snpp->snp_flags |= NETISR_SNP_FLAGS_DRAINEDCPU; + counter++; + } + NETISR_RUNLOCK(&tracker); + KASSERT(counter <= NETISR_MAXPROT, + ("sysctl_netisr_proto: counter too big (%d)", counter)); + error = SYSCTL_OUT(req, snp_array, sizeof(*snp_array) * counter); + free(snp_array, M_TEMP); + return (error); +} + +SYSCTL_PROC(_net_isr, OID_AUTO, proto, + CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 
0, 0, sysctl_netisr_proto, + "S,sysctl_netisr_proto", + "Return list of protocols registered with netisr"); + +/* + * Sysctl monitoring for netisr: query a list of workstreams. + */ +static int +sysctl_netisr_workstream(SYSCTL_HANDLER_ARGS) +{ + struct rm_priotracker tracker; + struct sysctl_netisr_workstream *snwsp, *snws_array; + struct netisr_workstream *nwsp; + u_int counter, cpuid; + int error; + + if (req->newptr != NULL) + return (EINVAL); + snws_array = malloc(sizeof(*snws_array) * MAXCPU, M_TEMP, + M_ZERO | M_WAITOK); + counter = 0; + NETISR_RLOCK(&tracker); + CPU_FOREACH(cpuid) { + nwsp = DPCPU_ID_PTR(cpuid, nws); + if (nwsp->nws_intr_event == NULL) + continue; + NWS_LOCK(nwsp); + snwsp = &snws_array[counter]; + snwsp->snws_version = sizeof(*snwsp); + + /* + * For now, we equate workstream IDs and CPU IDs in the + * kernel, but expose them independently to userspace in case + * that assumption changes in the future. + */ + snwsp->snws_wsid = cpuid; + snwsp->snws_cpu = cpuid; + if (nwsp->nws_intr_event != NULL) + snwsp->snws_flags |= NETISR_SNWS_FLAGS_INTR; + NWS_UNLOCK(nwsp); + counter++; + } + NETISR_RUNLOCK(&tracker); + KASSERT(counter <= MAXCPU, + ("sysctl_netisr_workstream: counter too big (%d)", counter)); + error = SYSCTL_OUT(req, snws_array, sizeof(*snws_array) * counter); + free(snws_array, M_TEMP); + return (error); +} + +SYSCTL_PROC(_net_isr, OID_AUTO, workstream, + CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_workstream, + "S,sysctl_netisr_workstream", + "Return list of workstreams implemented by netisr"); + +/* + * Sysctl monitoring for netisr: query per-protocol data across all + * workstreams. 
+ */ +static int +sysctl_netisr_work(SYSCTL_HANDLER_ARGS) +{ + struct rm_priotracker tracker; + struct sysctl_netisr_work *snwp, *snw_array; + struct netisr_workstream *nwsp; + struct netisr_proto *npp; + struct netisr_work *nwp; + u_int counter, cpuid, proto; + int error; + + if (req->newptr != NULL) + return (EINVAL); + snw_array = malloc(sizeof(*snw_array) * MAXCPU * NETISR_MAXPROT, + M_TEMP, M_ZERO | M_WAITOK); + counter = 0; + NETISR_RLOCK(&tracker); + CPU_FOREACH(cpuid) { + nwsp = DPCPU_ID_PTR(cpuid, nws); + if (nwsp->nws_intr_event == NULL) + continue; + NWS_LOCK(nwsp); + for (proto = 0; proto < NETISR_MAXPROT; proto++) { + npp = &netisr_proto[proto]; + if (npp->np_name == NULL) + continue; + nwp = &nwsp->nws_work[proto]; + snwp = &snw_array[counter]; + snwp->snw_version = sizeof(*snwp); + snwp->snw_wsid = cpuid; /* See comment above. */ + snwp->snw_proto = proto; + snwp->snw_len = nwp->nw_len; + snwp->snw_watermark = nwp->nw_watermark; + snwp->snw_dispatched = nwp->nw_dispatched; + snwp->snw_hybrid_dispatched = + nwp->nw_hybrid_dispatched; + snwp->snw_qdrops = nwp->nw_qdrops; + snwp->snw_queued = nwp->nw_queued; + snwp->snw_handled = nwp->nw_handled; + counter++; + } + NWS_UNLOCK(nwsp); + } + KASSERT(counter <= MAXCPU * NETISR_MAXPROT, + ("sysctl_netisr_work: counter too big (%d)", counter)); + NETISR_RUNLOCK(&tracker); + error = SYSCTL_OUT(req, snw_array, sizeof(*snw_array) * counter); + free(snw_array, M_TEMP); + return (error); +} + +SYSCTL_PROC(_net_isr, OID_AUTO, work, + CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_work, + "S,sysctl_netisr_work", + "Return list of per-workstream, per-protocol work in netisr"); + #ifdef DDB DB_SHOW_COMMAND(netisr, db_show_netisr) { @@ -1181,15 +1283,13 @@ DB_SHOW_COMMAND(netisr, db_show_netisr) db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto", "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue"); - for (cpuid = 0; cpuid <= mp_maxid; cpuid++) { - if (CPU_ABSENT(cpuid)) - continue; + 
CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; first = 1; for (proto = 0; proto < NETISR_MAXPROT; proto++) { - if (np[proto].np_handler == NULL) + if (netisr_proto[proto].np_handler == NULL) continue; nwp = &nwsp->nws_work[proto]; if (first) { @@ -1199,7 +1299,7 @@ DB_SHOW_COMMAND(netisr, db_show_netisr) db_printf("%3s ", ""); db_printf( "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n", - np[proto].np_name, nwp->nw_len, + netisr_proto[proto].np_name, nwp->nw_len, nwp->nw_watermark, nwp->nw_qlimit, nwp->nw_dispatched, nwp->nw_hybrid_dispatched, nwp->nw_qdrops, nwp->nw_queued); diff --git a/freebsd/sys/net/netisr.h b/freebsd/sys/net/netisr.h index 72e7f17f..cd692f6d 100644 --- a/freebsd/sys/net/netisr.h +++ b/freebsd/sys/net/netisr.h @@ -1,7 +1,11 @@ /*- * Copyright (c) 2007-2009 Robert N. M. Watson + * Copyright (c) 2010 Juniper Networks, Inc. * All rights reserved. * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -28,7 +32,6 @@ #ifndef _NET_NETISR_H_ #define _NET_NETISR_H_ -#ifdef _KERNEL /* * The netisr (network interrupt service routine) provides a deferred @@ -39,6 +42,13 @@ * Historically, this was implemented by the BSD software ISR facility; it is * now implemented via a software ithread (SWI). */ + +/* + * Protocol numbers, which are encoded in monitoring applications and kernel + * modules. Internally, these are used in bit shift operations so must have + * a value 0 < proto < 32; we currently further limit at compile-time to 16 + * for array-sizing purposes. 
+ */ #define NETISR_IP 1 #define NETISR_IGMP 2 /* IGMPv3 output queue */ #define NETISR_ROUTE 3 /* routing socket */ @@ -52,6 +62,78 @@ #define NETISR_NATM 11 #define NETISR_EPAIR 12 /* if_epair(4) */ +/* + * Protocol ordering and affinity policy constants. See the detailed + * discussion of policies later in the file. + */ +#define NETISR_POLICY_SOURCE 1 /* Maintain source ordering. */ +#define NETISR_POLICY_FLOW 2 /* Maintain flow ordering. */ +#define NETISR_POLICY_CPU 3 /* Protocol determines CPU placement. */ + +/* + * Monitoring data structures, exported by sysctl(2). + * + * Three sysctls are defined. First, a per-protocol structure exported by + * net.isr.proto. + */ +#define NETISR_NAMEMAXLEN 32 +struct sysctl_netisr_proto { + u_int snp_version; /* Length of struct. */ + char snp_name[NETISR_NAMEMAXLEN]; /* nh_name */ + u_int snp_proto; /* nh_proto */ + u_int snp_qlimit; /* nh_qlimit */ + u_int snp_policy; /* nh_policy */ + u_int snp_flags; /* Various flags. */ + u_int _snp_ispare[7]; +}; + +/* + * Flags for sysctl_netisr_proto.snp_flags. + */ +#define NETISR_SNP_FLAGS_M2FLOW 0x00000001 /* nh_m2flow */ +#define NETISR_SNP_FLAGS_M2CPUID 0x00000002 /* nh_m2cpuid */ +#define NETISR_SNP_FLAGS_DRAINEDCPU 0x00000004 /* nh_drainedcpu */ + +/* + * Next, a structure per-workstream, with per-protocol data, exported as + * net.isr.workstream. + */ +struct sysctl_netisr_workstream { + u_int snws_version; /* Length of struct. */ + u_int snws_flags; /* Various flags. */ + u_int snws_wsid; /* Workstream ID. */ + u_int snws_cpu; /* nws_cpu */ + u_int _snws_ispare[12]; +}; + +/* + * Flags for sysctl_netisr_workstream.snws_flags + */ +#define NETISR_SNWS_FLAGS_INTR 0x00000001 /* nws_intr_event */ + +/* + * Finally, a per-workstream-per-protocol structure, exported as + * net.isr.work. + */ +struct sysctl_netisr_work { + u_int snw_version; /* Length of struct. */ + u_int snw_wsid; /* Workstream ID. */ + u_int snw_proto; /* Protocol number. 
*/ + u_int snw_len; /* nw_len */ + u_int snw_watermark; /* nw_watermark */ + u_int _snw_ispare[3]; + + uint64_t snw_dispatched; /* nw_dispatched */ + uint64_t snw_hybrid_dispatched; /* nw_hybrid_dispatched */ + uint64_t snw_qdrops; /* nw_qdrops */ + uint64_t snw_queued; /* nw_queued */ + uint64_t snw_handled; /* nw_handled */ + + uint64_t _snw_llspare[7]; +}; + +#ifdef _KERNEL + /*- * Protocols express ordering constraints and affinity preferences by * implementing one or neither of nh_m2flow and nh_m2cpuid, which are used by @@ -91,10 +173,6 @@ typedef struct mbuf *netisr_m2cpuid_t(struct mbuf *m, uintptr_t source, typedef struct mbuf *netisr_m2flow_t(struct mbuf *m, uintptr_t source); typedef void netisr_drainedcpu_t(u_int cpuid); -#define NETISR_POLICY_SOURCE 1 /* Maintain source ordering. */ -#define NETISR_POLICY_FLOW 2 /* Maintain flow ordering. */ -#define NETISR_POLICY_CPU 3 /* Protocol determines CPU placement. */ - /* * Data structure describing a protocol handler. */ diff --git a/freebsd/sys/net/netisr_internal.h b/freebsd/sys/net/netisr_internal.h new file mode 100644 index 00000000..40afaf16 --- /dev/null +++ b/freebsd/sys/net/netisr_internal.h @@ -0,0 +1,127 @@ +/*- + * Copyright (c) 2007-2009 Robert N. M. Watson + * Copyright (c) 2010 Juniper Networks, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NET_NETISR_INTERNAL_H_ +#define _NET_NETISR_INTERNAL_H_ + +#ifndef _WANT_NETISR_INTERNAL +#error "no user-serviceable parts inside" +#endif + +/* + * These definitions are private to the netisr implementation, but provided + * here for use by post-mortem crashdump analysis tools. They should not be + * used in any other context as they can and will change. Public definitions + * may be found in netisr.h. + */ + +#ifndef _KERNEL +typedef void *netisr_handler_t; +typedef void *netisr_m2flow_t; +typedef void *netisr_m2cpuid_t; +typedef void *netisr_drainedcpu_t; +#endif + +/* + * Each protocol is described by a struct netisr_proto, which holds all + * global per-protocol information. This data structure is set up by + * netisr_register(), and derived from the public struct netisr_handler. + */ +struct netisr_proto { + const char *np_name; /* Character string protocol name. */ + netisr_handler_t *np_handler; /* Protocol handler. */ + netisr_m2flow_t *np_m2flow; /* Query flow for untagged packet. */ + netisr_m2cpuid_t *np_m2cpuid; /* Query CPU to process packet on. */ + netisr_drainedcpu_t *np_drainedcpu; /* Callback when drained a queue. 
*/ + u_int np_qlimit; /* Maximum per-CPU queue depth. */ + u_int np_policy; /* Work placement policy. */ +}; + +#define NETISR_MAXPROT 16 /* Compile-time limit. */ + +/* + * Protocol-specific work for each workstream is described by struct + * netisr_work. Each work descriptor consists of an mbuf queue and + * statistics. + */ +struct netisr_work { + /* + * Packet queue, linked by m_nextpkt. + */ + struct mbuf *nw_head; + struct mbuf *nw_tail; + u_int nw_len; + u_int nw_qlimit; + u_int nw_watermark; + + /* + * Statistics -- written unlocked, but mostly from curcpu. + */ + u_int64_t nw_dispatched; /* Number of direct dispatches. */ + u_int64_t nw_hybrid_dispatched; /* "" hybrid dispatches. */ + u_int64_t nw_qdrops; /* "" drops. */ + u_int64_t nw_queued; /* "" enqueues. */ + u_int64_t nw_handled; /* "" handled in worker. */ +}; + +/* + * Workstreams hold a queue of ordered work across each protocol, and are + * described by netisr_workstream. Each workstream is associated with a + * worker thread, which in turn is pinned to a CPU. Work associated with a + * workstream can be processd in other threads during direct dispatch; + * concurrent processing is prevented by the NWS_RUNNING flag, which + * indicates that a thread is already processing the work queue. It is + * important to prevent a directly dispatched packet from "skipping ahead" of + * work already in the workstream queue. + */ +struct netisr_workstream { + struct intr_event *nws_intr_event; /* Handler for stream. */ + void *nws_swi_cookie; /* swi(9) cookie for stream. */ + struct mtx nws_mtx; /* Synchronize work. */ + u_int nws_cpu; /* CPU pinning. */ + u_int nws_flags; /* Wakeup flags. */ + u_int nws_pendingbits; /* Scheduled protocols. */ + + /* + * Each protocol has per-workstream data. + */ + struct netisr_work nws_work[NETISR_MAXPROT]; +} __aligned(CACHE_LINE_SIZE); + +/* + * Per-workstream flags. + */ +#define NWS_RUNNING 0x00000001 /* Currently running in a thread. 
*/ +#define NWS_DISPATCHING 0x00000002 /* Currently being direct-dispatched. */ +#define NWS_SCHEDULED 0x00000004 /* Signal issued. */ + +#endif /* !_NET_NETISR_INTERNAL_H_ */ diff --git a/freebsd/sys/net/radix_mpath.c b/freebsd/sys/net/radix_mpath.c index bb7b6fd4..6a3e3ef7 100644 --- a/freebsd/sys/net/radix_mpath.c +++ b/freebsd/sys/net/radix_mpath.c @@ -102,10 +102,7 @@ rt_mpath_matchgate(struct rtentry *rt, struct sockaddr *gate) { struct radix_node *rn; - if (!rn_mpath_next((struct radix_node *)rt)) - return rt; - - if (!gate) + if (!gate || !rt->rt_gateway) return NULL; /* beyond here, we use rn as the master copy */ diff --git a/freebsd/sys/net/raw_cb.h b/freebsd/sys/net/raw_cb.h index 35b546c5..1b347e02 100644 --- a/freebsd/sys/net/raw_cb.h +++ b/freebsd/sys/net/raw_cb.h @@ -70,9 +70,14 @@ pr_init_t raw_init; * Library routines for raw socket usrreq functions; will always be wrapped * so that protocol-specific functions can be handled. */ +typedef int (*raw_input_cb_fn)(struct mbuf *, struct sockproto *, + struct sockaddr *, struct rawcb *); + int raw_attach(struct socket *, int); void raw_detach(struct rawcb *); void raw_input(struct mbuf *, struct sockproto *, struct sockaddr *); +void raw_input_ext(struct mbuf *, struct sockproto *, struct sockaddr *, + raw_input_cb_fn); /* * Generic pr_usrreqs entries for raw socket protocols, usually wrapped so diff --git a/freebsd/sys/net/raw_usrreq.c b/freebsd/sys/net/raw_usrreq.c index 0723799f..0d7973e9 100644 --- a/freebsd/sys/net/raw_usrreq.c +++ b/freebsd/sys/net/raw_usrreq.c @@ -73,6 +73,14 @@ raw_init(void) void raw_input(struct mbuf *m0, struct sockproto *proto, struct sockaddr *src) { + + return (raw_input_ext(m0, proto, src, NULL)); +} + +void +raw_input_ext(struct mbuf *m0, struct sockproto *proto, struct sockaddr *src, + raw_input_cb_fn cb) +{ struct rawcb *rp; struct mbuf *m = m0; struct socket *last; @@ -85,6 +93,8 @@ raw_input(struct mbuf *m0, struct sockproto *proto, struct sockaddr *src) if 
(rp->rcb_proto.sp_protocol && rp->rcb_proto.sp_protocol != proto->sp_protocol) continue; + if (cb != NULL && (*cb)(m, proto, src, rp) != 0) + continue; if (last) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); diff --git a/freebsd/sys/net/route.c b/freebsd/sys/net/route.c index 5827cc00..3821c208 100644 --- a/freebsd/sys/net/route.c +++ b/freebsd/sys/net/route.c @@ -37,6 +37,7 @@ ***********************************************************************/ #include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_route.h> #include <rtems/bsd/local/opt_mrouting.h> #include <rtems/bsd/local/opt_mpath.h> @@ -69,12 +70,34 @@ #include <vm/uma.h> +/* We use 4 bits in the mbuf flags, thus we are limited to 16 FIBS. */ +#define RT_MAXFIBS 16 + +/* Kernel config default option. */ +#ifdef ROUTETABLES +#if ROUTETABLES <= 0 +#error "ROUTETABLES defined too low" +#endif +#if ROUTETABLES > RT_MAXFIBS +#error "ROUTETABLES defined too big" +#endif +#define RT_NUMFIBS ROUTETABLES +#endif /* ROUTETABLES */ +/* Initialize to default if not otherwise set. */ +#ifndef RT_NUMFIBS +#define RT_NUMFIBS 1 +#endif + u_int rt_numfibs = RT_NUMFIBS; -SYSCTL_INT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, ""); +SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, ""); /* * Allow the boot code to allow LESS than RT_MAXFIBS to be used. * We can't do more because storage is statically allocated for now. - * (for compatibility reasons.. this will change). + * (for compatibility reasons.. this will change. When this changes, code should + * be refactored to protocol independent parts and protocol dependent parts, + * probably hanging of domain(9) specific storage to not need the full + * fib * af RNH allocation etc. but allow tuning the number of tables per + * address family). 
*/ TUNABLE_INT("net.fibs", &rt_numfibs); @@ -84,9 +107,12 @@ TUNABLE_INT("net.fibs", &rt_numfibs); * changes for the FIB of the caller when adding a new set of addresses * to an interface. XXX this is a shotgun aproach to a problem that needs * a more fine grained solution.. that will come. + * XXX also has the problems getting the FIB from curthread which will not + * always work given the fib can be overridden and prefixes can be added + * from the network stack context. */ u_int rt_add_addr_allfibs = 1; -SYSCTL_INT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW, +SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW, &rt_add_addr_allfibs, 0, ""); TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs); @@ -118,12 +144,6 @@ VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */ #define V_rtzone VNET(rtzone) -#if 0 -/* default fib for tunnels to use */ -u_int tunnel_fib = 0; -SYSCTL_INT(_net, OID_AUTO, tunnelfib, CTLFLAG_RD, &tunnel_fib, 0, ""); -#endif - #ifndef __rtems__ /* * handler for net.my_fibnum @@ -206,27 +226,23 @@ vnet_route_init(const void *unused __unused) V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); for (dom = domains; dom; dom = dom->dom_next) { - if (dom->dom_rtattach) { - for (table = 0; table < rt_numfibs; table++) { - if ( (fam = dom->dom_family) == AF_INET || - table == 0) { - /* for now only AF_INET has > 1 table */ - /* XXX MRT - * rtattach will be also called - * from vfs_export.c but the - * offset will be 0 - * (only for AF_INET and AF_INET6 - * which don't need it anyhow) - */ - rnh = rt_tables_get_rnh_ptr(table, fam); - if (rnh == NULL) - panic("%s: rnh NULL", __func__); - dom->dom_rtattach((void **)rnh, - dom->dom_rtoffset); - } else { - break; - } - } + if (dom->dom_rtattach == NULL) + continue; + + for (table = 0; table < rt_numfibs; table++) { + fam = dom->dom_family; + if (table != 0 && fam != 
AF_INET6 && fam != AF_INET) + break; + + /* + * XXX MRT rtattach will be also called from + * vfs_export.c but the offset will be 0 (only for + * AF_INET and AF_INET6 which don't need it anyhow). + */ + rnh = rt_tables_get_rnh_ptr(table, fam); + if (rnh == NULL) + panic("%s: rnh NULL", __func__); + dom->dom_rtattach((void **)rnh, dom->dom_rtoffset); } } } @@ -243,20 +259,19 @@ vnet_route_uninit(const void *unused __unused) struct radix_node_head **rnh; for (dom = domains; dom; dom = dom->dom_next) { - if (dom->dom_rtdetach) { - for (table = 0; table < rt_numfibs; table++) { - if ( (fam = dom->dom_family) == AF_INET || - table == 0) { - /* For now only AF_INET has > 1 tbl. */ - rnh = rt_tables_get_rnh_ptr(table, fam); - if (rnh == NULL) - panic("%s: rnh NULL", __func__); - dom->dom_rtdetach((void **)rnh, - dom->dom_rtoffset); - } else { - break; - } - } + if (dom->dom_rtdetach == NULL) + continue; + + for (table = 0; table < rt_numfibs; table++) { + fam = dom->dom_family; + + if (table != 0 && fam != AF_INET6 && fam != AF_INET) + break; + + rnh = rt_tables_get_rnh_ptr(table, fam); + if (rnh == NULL) + panic("%s: rnh NULL", __func__); + dom->dom_rtdetach((void **)rnh, dom->dom_rtoffset); } } } @@ -286,7 +301,8 @@ setfib(struct thread *td, struct setfib_args *uap) void rtalloc(struct route *ro) { - rtalloc_ign_fib(ro, 0UL, 0); + + rtalloc_ign_fib(ro, 0UL, RT_DEFAULT_FIB); } void @@ -306,7 +322,7 @@ rtalloc_ign(struct route *ro, u_long ignore) RTFREE(rt); ro->ro_rt = NULL; } - ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, 0); + ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, RT_DEFAULT_FIB); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } @@ -336,7 +352,8 @@ rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { - return (rtalloc1_fib(dst, report, ignflags, 0)); + + return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB)); } struct rtentry * @@ -344,7 +361,6 @@ rtalloc1_fib(struct 
sockaddr *dst, int report, u_long ignflags, u_int fibnum) { struct radix_node_head *rnh; - struct rtentry *rt; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; @@ -352,17 +368,23 @@ rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, int needlock; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); - if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */ - fibnum = 0; + switch (dst->sa_family) { + case AF_INET6: + case AF_INET: + /* We support multiple FIBs. */ + break; + default: + fibnum = RT_DEFAULT_FIB; + break; + } rnh = rt_tables_get_rnh(fibnum, dst->sa_family); newrt = NULL; + if (rnh == NULL) + goto miss; + /* * Look up the address in the table for that Address Family */ - if (rnh == NULL) { - V_rtstat.rts_unreach++; - goto miss; - } needlock = !(ignflags & RTF_RNH_LOCKED); if (needlock) RADIX_NODE_HEAD_RLOCK(rnh); @@ -372,7 +394,7 @@ rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, #endif rn = rnh->rnh_matchaddr(dst, rnh); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { - newrt = rt = RNTORT(rn); + newrt = RNTORT(rn); RT_LOCK(newrt); RT_ADDREF(newrt); if (needlock) @@ -387,8 +409,9 @@ rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, * Which basically means * "caint get there frm here" */ - V_rtstat.rts_unreach++; miss: + V_rtstat.rts_unreach++; + if (report) { /* * If required, report the failure to the supervising @@ -397,8 +420,8 @@ miss: */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; - rt_missmsg(msgtype, &info, 0, err); - } + rt_missmsg_fib(msgtype, &info, 0, err, fibnum); + } done: if (newrt) RT_LOCK_ASSERT(newrt); @@ -499,7 +522,8 @@ rtredirect(struct sockaddr *dst, int flags, struct sockaddr *src) { - rtredirect_fib(dst, gateway, netmask, flags, src, 0); + + rtredirect_fib(dst, gateway, netmask, flags, src, RT_DEFAULT_FIB); } void @@ -545,7 +569,7 @@ rtredirect_fib(struct sockaddr *dst, goto done; /* * Create a new entry if we just got back a wildcard 
entry - * or the the lookup failed. This is necessary for hosts + * or the lookup failed. This is necessary for hosts * which use routing redirects generated by smart gateways * to dynamically build the routing tables. */ @@ -622,7 +646,7 @@ out: info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; - rt_missmsg(RTM_REDIRECT, &info, flags, error); + rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum); if (ifa != NULL) ifa_free(ifa); } @@ -630,7 +654,8 @@ out: int rtioctl(u_long req, caddr_t data) { - return (rtioctl_fib(req, data, 0)); + + return (rtioctl_fib(req, data, RT_DEFAULT_FIB)); } /* @@ -660,7 +685,8 @@ rtioctl_fib(u_long req, caddr_t data, u_int fibnum) struct ifaddr * ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway) { - return (ifa_ifwithroute_fib(flags, dst, gateway, 0)); + + return (ifa_ifwithroute_fib(flags, dst, gateway, RT_DEFAULT_FIB)); } struct ifaddr * @@ -745,7 +771,9 @@ rtrequest(int req, int flags, struct rtentry **ret_nrt) { - return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, 0)); + + return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, + RT_DEFAULT_FIB)); } int @@ -784,7 +812,8 @@ rtrequest_fib(int req, int rt_getifa(struct rt_addrinfo *info) { - return (rt_getifa_fib(info, 0)); + + return (rt_getifa_fib(info, RT_DEFAULT_FIB)); } /* @@ -1038,11 +1067,20 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, register struct radix_node_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; + struct sockaddr_storage mdst; #define senderr(x) { error = x ; goto bad; } KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); - if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */ - fibnum = 0; + switch (dst->sa_family) { + case AF_INET6: + case AF_INET: + /* We support multiple FIBs. 
*/ + break; + default: + fibnum = RT_DEFAULT_FIB; + break; + } + /* * Find the correct routing tree to use for this Address Family */ @@ -1064,6 +1102,10 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, switch (req) { case RTM_DELETE: + if (netmask) { + rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); + dst = (struct sockaddr *)&mdst; + } #ifdef RADIX_MPATH if (rn_mpath_capable(rnh)) { error = rn_mpath_update(req, info, rnh, ret_nrt); @@ -1144,8 +1186,7 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* - * Add the gateway. Possibly re-malloc-ing the storage for it - * + * Add the gateway. Possibly re-malloc-ing the storage for it. */ RT_LOCK(rt); if ((error = rt_setgate(rt, dst, gateway)) != 0) { @@ -1194,11 +1235,17 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, #ifdef FLOWTABLE rt0 = NULL; - /* XXX - * "flow-table" only support IPv4 at the moment. - */ + /* "flow-table" only supports IPv6 and IPv4 at the moment. 
*/ + switch (dst->sa_family) { +#ifdef notyet +#ifdef INET6 + case AF_INET6: +#endif +#endif #ifdef INET - if (dst->sa_family == AF_INET) { + case AF_INET: +#endif +#if defined(INET6) || defined(INET) rn = rnh->rnh_matchaddr(dst, rnh); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { struct sockaddr *mask; @@ -1237,9 +1284,9 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, } } } +#endif/* INET6 || INET */ } -#endif -#endif +#endif /* FLOWTABLE */ /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes); @@ -1261,9 +1308,20 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, } #ifdef FLOWTABLE else if (rt0 != NULL) { + switch (dst->sa_family) { +#ifdef notyet +#ifdef INET6 + case AF_INET6: + flowtable_route_flush(V_ip6_ft, rt0); + break; +#endif +#endif #ifdef INET - flowtable_route_flush(V_ip_ft, rt0); + case AF_INET: + flowtable_route_flush(V_ip_ft, rt0); + break; #endif + } RTFREE(rt0); } #endif @@ -1395,8 +1453,17 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) dst = ifa->ifa_addr; netmask = ifa->ifa_netmask; } - if ( dst->sa_family != AF_INET) - fibnum = 0; + if (dst->sa_len == 0) + return(EINVAL); + switch (dst->sa_family) { + case AF_INET6: + case AF_INET: + /* We support multiple FIBs. */ + break; + default: + fibnum = RT_DEFAULT_FIB; + break; + } if (fibnum == -1) { if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) { #ifndef __rtems__ @@ -1413,8 +1480,6 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) startfib = fibnum; endfib = fibnum; } - if (dst->sa_len == 0) - return(EINVAL); /* * If it's a delete, check that if it exists, @@ -1438,9 +1503,7 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) * Now go through all the requested tables (fibs) and do the * requested action. 
Realistically, this will either be fib 0 * for protocols that don't do multiple tables or all the - * tables for those that do. XXX For this version only AF_INET. - * When that changes code should be refactored to protocol - * independent parts and protocol dependent parts. + * tables for those that do. */ for ( fibnum = startfib; fibnum <= endfib; fibnum++) { if (cmd == RTM_DELETE) { @@ -1494,7 +1557,7 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) */ bzero((caddr_t)&info, sizeof(info)); info.rti_ifa = ifa; - info.rti_flags = flags | ifa->ifa_flags; + info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF); info.rti_info[RTAX_DST] = dst; /* * doing this for compatibility reasons @@ -1514,10 +1577,10 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) #ifdef RADIX_MPATH /* * in case address alias finds the first address - * e.g. ifconfig bge0 192.103.54.246/24 - * e.g. ifconfig bge0 192.103.54.247/24 - * the address set in the route is 192.103.54.246 - * so we need to replace it with 192.103.54.247 + * e.g. ifconfig bge0 192.0.2.246/24 + * e.g. ifconfig bge0 192.0.2.247/24 + * the address set in the route is 192.0.2.246 + * so we need to replace it with 192.0.2.247 */ if (memcmp(rt->rt_ifa->ifa_addr, ifa->ifa_addr, ifa->ifa_addr->sa_len)) { @@ -1538,7 +1601,7 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) } RT_ADDREF(rt); RT_UNLOCK(rt); - rt_newaddrmsg(cmd, ifa, error, rt); + rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum); RT_LOCK(rt); RT_REMREF(rt); if (cmd == RTM_DELETE) { @@ -1580,12 +1643,14 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) return (error); } +#ifndef BURN_BRIDGES /* special one for inet internal use. may not use. 
*/ int rtinit_fib(struct ifaddr *ifa, int cmd, int flags) { return (rtinit1(ifa, cmd, flags, -1)); } +#endif /* * Set up a routing table entry, normally @@ -1595,7 +1660,7 @@ int rtinit(struct ifaddr *ifa, int cmd, int flags) { struct sockaddr *dst; - int fib = 0; + int fib = RT_DEFAULT_FIB; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; @@ -1603,7 +1668,12 @@ rtinit(struct ifaddr *ifa, int cmd, int flags) dst = ifa->ifa_addr; } - if (dst->sa_family == AF_INET) + switch (dst->sa_family) { + case AF_INET6: + case AF_INET: + /* We do support multiple FIBs. */ fib = -1; + break; + } return (rtinit1(ifa, cmd, flags, fib)); } diff --git a/freebsd/sys/net/route.h b/freebsd/sys/net/route.h index 4014b3f6..b26ac441 100644 --- a/freebsd/sys/net/route.h +++ b/freebsd/sys/net/route.h @@ -86,30 +86,8 @@ struct rt_metrics { #define RTM_RTTUNIT 1000000 /* units for rtt, rttvar, as units per sec */ #define RTTTOPRHZ(r) ((r) / (RTM_RTTUNIT / PR_SLOWHZ)) -/* MRT compile-time constants */ -#ifdef _KERNEL - #ifndef ROUTETABLES - #define RT_NUMFIBS 1 - #define RT_MAXFIBS 1 - #else - /* while we use 4 bits in the mbuf flags, we are limited to 16 */ - #define RT_MAXFIBS 16 - #if ROUTETABLES > RT_MAXFIBS - #define RT_NUMFIBS RT_MAXFIBS - #error "ROUTETABLES defined too big" - #else - #if ROUTETABLES == 0 - #define RT_NUMFIBS 1 - #else - #define RT_NUMFIBS ROUTETABLES - #endif - #endif - #endif -#endif - +#define RT_DEFAULT_FIB 0 /* Explicitly mark fib=0 restricted cases */ extern u_int rt_numfibs; /* number fo usable routing tables */ -extern u_int tunnel_fib; /* tunnels use these */ -extern u_int fwd_fib; /* packets being forwarded use these routes */ /* * XXX kernel function pointer `rt_output' is visible to applications. 
*/ @@ -325,7 +303,6 @@ struct rt_addrinfo { #define RT_LOCK_INIT(_rt) \ mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK) #define RT_LOCK(_rt) mtx_lock(&(_rt)->rt_mtx) -#define RT_TRYLOCK(_rt) mtx_trylock(&(_rt)->rt_mtx) #define RT_UNLOCK(_rt) mtx_unlock(&(_rt)->rt_mtx) #define RT_LOCK_DESTROY(_rt) mtx_destroy(&(_rt)->rt_mtx) #define RT_LOCK_ASSERT(_rt) mtx_assert(&(_rt)->rt_mtx, MA_OWNED) @@ -360,22 +337,6 @@ struct rt_addrinfo { RTFREE_LOCKED(_rt); \ } while (0) -#define RT_TEMP_UNLOCK(_rt) do { \ - RT_ADDREF(_rt); \ - RT_UNLOCK(_rt); \ -} while (0) - -#define RT_RELOCK(_rt) do { \ - RT_LOCK(_rt); \ - if ((_rt)->rt_refcnt <= 1) { \ - rtfree(_rt); \ - _rt = 0; /* signal that it went away */ \ - } else { \ - RT_REMREF(_rt); \ - /* note that _rt is still valid */ \ - } \ -} while (0) - struct radix_node_head *rt_tables_get_rnh(int, int); struct ifmultiaddr; @@ -384,7 +345,9 @@ void rt_ieee80211msg(struct ifnet *, int, void *, size_t); void rt_ifannouncemsg(struct ifnet *, int); void rt_ifmsg(struct ifnet *); void rt_missmsg(int, struct rt_addrinfo *, int, int); +void rt_missmsg_fib(int, struct rt_addrinfo *, int, int, int); void rt_newaddrmsg(int, struct ifaddr *, int, struct rtentry *); +void rt_newaddrmsg_fib(int, struct ifaddr *, int, struct rtentry *, int); void rt_newmaddrmsg(int, struct ifmultiaddr *); int rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *); void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); @@ -418,8 +381,10 @@ void rtredirect(struct sockaddr *, struct sockaddr *, int rtrequest(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **); +#ifndef BURN_BRIDGES /* defaults to "all" FIBs */ int rtinit_fib(struct ifaddr *, int, int); +#endif /* XXX MRT NEW VERSIONS THAT USE FIBs * For now the protocol indepedent versions are the same as the AF_INET ones diff --git a/freebsd/sys/net/rtsock.c b/freebsd/sys/net/rtsock.c index bfdecf87..beca84da 100644 --- 
a/freebsd/sys/net/rtsock.c +++ b/freebsd/sys/net/rtsock.c @@ -116,7 +116,34 @@ struct if_msghdr32 { uint16_t ifm_index; struct if_data32 ifm_data; }; -#endif + +struct if_msghdrl32 { + uint16_t ifm_msglen; + uint8_t ifm_version; + uint8_t ifm_type; + int32_t ifm_addrs; + int32_t ifm_flags; + uint16_t ifm_index; + uint16_t _ifm_spare1; + uint16_t ifm_len; + uint16_t ifm_data_off; + struct if_data32 ifm_data; +}; + +struct ifa_msghdrl32 { + uint16_t ifam_msglen; + uint8_t ifam_version; + uint8_t ifam_type; + int32_t ifam_addrs; + int32_t ifam_flags; + uint16_t ifam_index; + uint16_t _ifam_spare1; + uint16_t ifam_len; + uint16_t ifam_data_off; + int32_t ifam_metric; + struct if_data32 ifam_data; +}; +#endif /* COMPAT_FREEBSD32 */ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); @@ -124,6 +151,13 @@ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; +/* + * Used by rtsock/raw_input callback code to decide whether to filter the update + * notification to a socket bound to a particular FIB. 
+ */ +#define RTS_FILTER_FIB M_PROTO8 +#define RTS_ALLFIBS -1 + static struct { int ip_count; /* attached w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ @@ -161,7 +195,7 @@ static void rt_setmetrics(u_long which, const struct rt_metrics *in, struct rt_metrics_lite *out); static void rt_getmetrics(const struct rt_metrics_lite *in, struct rt_metrics *out); -static void rt_dispatch(struct mbuf *, const struct sockaddr *); +static void rt_dispatch(struct mbuf *, sa_family_t); static struct netisr_handler rtsock_nh = { .nh_name = "rtsock", @@ -200,6 +234,31 @@ rts_init(void) } SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0); +static int +raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src, + struct rawcb *rp) +{ + int fibnum; + + KASSERT(m != NULL, ("%s: m is NULL", __func__)); + KASSERT(proto != NULL, ("%s: proto is NULL", __func__)); + KASSERT(rp != NULL, ("%s: rp is NULL", __func__)); + + /* No filtering requested. */ + if ((m->m_flags & RTS_FILTER_FIB) == 0) + return (0); + + /* Check if it is a rts and the fib matches the one of the socket. */ + fibnum = M_GETFIB(m); + if (proto->sp_family != PF_ROUTE || + rp->rcb_socket == NULL || + rp->rcb_socket->so_fibnum == fibnum) + return (0); + + /* Filtering requested and no match, the socket shall be skipped. */ + return (1); +} + static void rts_input(struct mbuf *m) { @@ -216,7 +275,7 @@ rts_input(struct mbuf *m) } else route_proto.sp_protocol = 0; - raw_input(m, &route_proto, &route_src); + raw_input_ext(m, &route_proto, &route_src, raw_input_rts_cb); } /* @@ -428,7 +487,7 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, * Try to find an address on the given outgoing interface * that belongs to the jail. 
*/ - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; @@ -440,7 +499,7 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, break; } } - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. @@ -470,7 +529,7 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, * Try to find an address on the given outgoing interface * that belongs to the jail. */ - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; @@ -483,7 +542,7 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, break; } } - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); if (!found) { /* * As a last resort return the 'default' jail address. @@ -521,6 +580,7 @@ route_output(struct mbuf *m, struct socket *so) int len, error = 0; struct ifnet *ifp = NULL; union sockaddr_union saun; + sa_family_t saf = AF_UNSPEC; #define senderr(e) { error = e; goto flush;} if (m == NULL || ((m->m_len < sizeof(long)) && @@ -561,6 +621,7 @@ route_output(struct mbuf *m, struct socket *so) (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX)) senderr(EINVAL); + saf = info.rti_info[RTAX_DST]->sa_family; /* * Verify that the caller has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. 
@@ -898,6 +959,8 @@ flush: Free(rtm); } if (m) { + M_SETFIB(m, so->so_fibnum); + m->m_flags |= RTS_FILTER_FIB; if (rp) { /* * XXX insure we don't get a copy by @@ -905,10 +968,10 @@ flush: */ unsigned short family = rp->rcb_proto.sp_family; rp->rcb_proto.sp_family = 0; - rt_dispatch(m, info.rti_info[RTAX_DST]); + rt_dispatch(m, saf); rp->rcb_proto.sp_family = family; } else - rt_dispatch(m, info.rti_info[RTAX_DST]); + rt_dispatch(m, saf); } } return (error); @@ -984,6 +1047,9 @@ rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) return (0); } +/* + * Used by the routing socket. + */ static struct mbuf * rt_msg1(int type, struct rt_addrinfo *rtinfo) { @@ -1051,6 +1117,9 @@ rt_msg1(int type, struct rt_addrinfo *rtinfo) return (m); } +/* + * Used by the sysctl code and routing socket. + */ static int rt_msg2(int type, struct rt_addrinfo *rtinfo, caddr_t cp, struct walkarg *w) { @@ -1064,17 +1133,31 @@ again: case RTM_DELADDR: case RTM_NEWADDR: - len = sizeof(struct ifa_msghdr); + if (w != NULL && w->w_op == NET_RT_IFLISTL) { +#ifdef COMPAT_FREEBSD32 + if (w->w_req->flags & SCTL_MASK32) + len = sizeof(struct ifa_msghdrl32); + else +#endif + len = sizeof(struct ifa_msghdrl); + } else + len = sizeof(struct ifa_msghdr); break; case RTM_IFINFO: #ifdef COMPAT_FREEBSD32 if (w != NULL && w->w_req->flags & SCTL_MASK32) { - len = sizeof(struct if_msghdr32); + if (w->w_op == NET_RT_IFLISTL) + len = sizeof(struct if_msghdrl32); + else + len = sizeof(struct if_msghdr32); break; } #endif - len = sizeof(struct if_msghdr); + if (w != NULL && w->w_op == NET_RT_IFLISTL) + len = sizeof(struct if_msghdrl); + else + len = sizeof(struct if_msghdr); break; case RTM_NEWMADDR: @@ -1137,7 +1220,8 @@ again: * destination. 
*/ void -rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) +rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error, + int fibnum) { struct rt_msghdr *rtm; struct mbuf *m; @@ -1148,11 +1232,26 @@ rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) m = rt_msg1(type, rtinfo); if (m == NULL) return; + + if (fibnum != RTS_ALLFIBS) { + KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out " + "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs)); + M_SETFIB(m, fibnum); + m->m_flags |= RTS_FILTER_FIB; + } + rtm = mtod(m, struct rt_msghdr *); rtm->rtm_flags = RTF_DONE | flags; rtm->rtm_errno = error; rtm->rtm_addrs = rtinfo->rti_addrs; - rt_dispatch(m, sa); + rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); +} + +void +rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) +{ + + rt_missmsg_fib(type, rtinfo, flags, error, RTS_ALLFIBS); } /* @@ -1177,7 +1276,7 @@ rt_ifmsg(struct ifnet *ifp) ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_data = ifp->if_data; ifm->ifm_addrs = 0; - rt_dispatch(m, NULL); + rt_dispatch(m, AF_UNSPEC); } /* @@ -1189,7 +1288,8 @@ rt_ifmsg(struct ifnet *ifp) * copies of it. */ void -rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) +rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt, + int fibnum) { struct rt_addrinfo info; struct sockaddr *sa = NULL; @@ -1247,10 +1347,24 @@ rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) rtm->rtm_errno = error; rtm->rtm_addrs = info.rti_addrs; } - rt_dispatch(m, sa); + if (fibnum != RTS_ALLFIBS) { + KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: " + "fibnum out of range 0 <= %d < %d", __func__, + fibnum, rt_numfibs)); + M_SETFIB(m, fibnum); + m->m_flags |= RTS_FILTER_FIB; + } + rt_dispatch(m, sa ? 
sa->sa_family : AF_UNSPEC); } } +void +rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) +{ + + rt_newaddrmsg_fib(cmd, ifa, error, rt, RTS_ALLFIBS); +} + /* * This is the analogue to the rt_newaddrmsg which performs the same * function but for multicast group memberhips. This is easier since @@ -1283,7 +1397,7 @@ rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) __func__)); ifmam->ifmam_index = ifp->if_index; ifmam->ifmam_addrs = info.rti_addrs; - rt_dispatch(m, ifma->ifma_addr); + rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC); } static struct mbuf * @@ -1343,7 +1457,7 @@ rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len) if (m->m_flags & M_PKTHDR) m->m_pkthdr.len += data_len; mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len; - rt_dispatch(m, NULL); + rt_dispatch(m, AF_UNSPEC); } } @@ -1359,11 +1473,11 @@ rt_ifannouncemsg(struct ifnet *ifp, int what) m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info); if (m != NULL) - rt_dispatch(m, NULL); + rt_dispatch(m, AF_UNSPEC); } static void -rt_dispatch(struct mbuf *m, const struct sockaddr *sa) +rt_dispatch(struct mbuf *m, sa_family_t saf) { struct m_tag *tag; @@ -1372,14 +1486,14 @@ rt_dispatch(struct mbuf *m, const struct sockaddr *sa) * use when injecting the mbuf into the routing socket buffer from * the netisr. 
*/ - if (sa != NULL) { + if (saf != AF_UNSPEC) { tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short), M_NOWAIT); if (tag == NULL) { m_freem(m); return; } - *(unsigned short *)(tag + 1) = sa->sa_family; + *(unsigned short *)(tag + 1) = saf; m_tag_prepend(m, tag); } #ifdef VIMAGE @@ -1473,6 +1587,127 @@ copy_ifdata32(struct if_data *src, struct if_data32 *dst) #endif static int +sysctl_iflist_ifml(struct ifnet *ifp, struct rt_addrinfo *info, + struct walkarg *w, int len) +{ + struct if_msghdrl *ifm; + +#ifdef COMPAT_FREEBSD32 + if (w->w_req->flags & SCTL_MASK32) { + struct if_msghdrl32 *ifm32; + + ifm32 = (struct if_msghdrl32 *)w->w_tmem; + ifm32->ifm_addrs = info->rti_addrs; + ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; + ifm32->ifm_index = ifp->if_index; + ifm32->_ifm_spare1 = 0; + ifm32->ifm_len = sizeof(*ifm32); + ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data); + + copy_ifdata32(&ifp->if_data, &ifm32->ifm_data); + + return (SYSCTL_OUT(w->w_req, (caddr_t)ifm32, len)); + } +#endif + ifm = (struct if_msghdrl *)w->w_tmem; + ifm->ifm_addrs = info->rti_addrs; + ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; + ifm->ifm_index = ifp->if_index; + ifm->_ifm_spare1 = 0; + ifm->ifm_len = sizeof(*ifm); + ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data); + + ifm->ifm_data = ifp->if_data; + + return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); +} + +static int +sysctl_iflist_ifm(struct ifnet *ifp, struct rt_addrinfo *info, + struct walkarg *w, int len) +{ + struct if_msghdr *ifm; + +#ifdef COMPAT_FREEBSD32 + if (w->w_req->flags & SCTL_MASK32) { + struct if_msghdr32 *ifm32; + + ifm32 = (struct if_msghdr32 *)w->w_tmem; + ifm32->ifm_addrs = info->rti_addrs; + ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; + ifm32->ifm_index = ifp->if_index; + + copy_ifdata32(&ifp->if_data, &ifm32->ifm_data); + + return (SYSCTL_OUT(w->w_req, (caddr_t)ifm32, len)); + } +#endif + ifm = (struct if_msghdr *)w->w_tmem; + ifm->ifm_addrs = 
info->rti_addrs; + ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; + ifm->ifm_index = ifp->if_index; + + ifm->ifm_data = ifp->if_data; + + return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); +} + +static int +sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info, + struct walkarg *w, int len) +{ + struct ifa_msghdrl *ifam; + +#ifdef COMPAT_FREEBSD32 + if (w->w_req->flags & SCTL_MASK32) { + struct ifa_msghdrl32 *ifam32; + + ifam32 = (struct ifa_msghdrl32 *)w->w_tmem; + ifam32->ifam_addrs = info->rti_addrs; + ifam32->ifam_flags = ifa->ifa_flags; + ifam32->ifam_index = ifa->ifa_ifp->if_index; + ifam32->_ifam_spare1 = 0; + ifam32->ifam_len = sizeof(*ifam32); + ifam32->ifam_data_off = + offsetof(struct ifa_msghdrl32, ifam_data); + ifam32->ifam_metric = ifa->ifa_metric; + + copy_ifdata32(&ifa->ifa_ifp->if_data, &ifam32->ifam_data); + + return (SYSCTL_OUT(w->w_req, (caddr_t)ifam32, len)); + } +#endif + + ifam = (struct ifa_msghdrl *)w->w_tmem; + ifam->ifam_addrs = info->rti_addrs; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_index = ifa->ifa_ifp->if_index; + ifam->_ifam_spare1 = 0; + ifam->ifam_len = sizeof(*ifam); + ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data); + ifam->ifam_metric = ifa->ifa_metric; + + ifam->ifam_data = ifa->if_data; + + return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); +} + +static int +sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info, + struct walkarg *w, int len) +{ + struct ifa_msghdr *ifam; + + ifam = (struct ifa_msghdr *)w->w_tmem; + ifam->ifam_addrs = info->rti_addrs; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_index = ifa->ifa_ifp->if_index; + ifam->ifam_metric = ifa->ifa_metric; + + return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); +} + +static int sysctl_iflist(int af, struct walkarg *w) { struct ifnet *ifp; @@ -1485,38 +1720,16 @@ sysctl_iflist(int af, struct walkarg *w) TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; - IF_ADDR_LOCK(ifp); + 
IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa->ifa_addr; len = rt_msg2(RTM_IFINFO, &info, NULL, w); info.rti_info[RTAX_IFP] = NULL; if (w->w_req && w->w_tmem) { - struct if_msghdr *ifm; - -#ifdef COMPAT_FREEBSD32 - if (w->w_req->flags & SCTL_MASK32) { - struct if_msghdr32 *ifm32; - - ifm32 = (struct if_msghdr32 *)w->w_tmem; - ifm32->ifm_index = ifp->if_index; - ifm32->ifm_flags = ifp->if_flags | - ifp->if_drv_flags; - copy_ifdata32(&ifp->if_data, &ifm32->ifm_data); - ifm32->ifm_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req, (caddr_t)ifm32, - len); - goto sysctl_out; - } -#endif - ifm = (struct if_msghdr *)w->w_tmem; - ifm->ifm_index = ifp->if_index; - ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; - ifm->ifm_data = ifp->if_data; - ifm->ifm_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req, (caddr_t)ifm, len); -#ifdef COMPAT_FREEBSD32 - sysctl_out: -#endif + if (w->w_op == NET_RT_IFLISTL) + error = sysctl_iflist_ifml(ifp, &info, w, len); + else + error = sysctl_iflist_ifm(ifp, &info, w, len); if (error) goto done; } @@ -1531,25 +1744,23 @@ sysctl_iflist(int af, struct walkarg *w) info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; len = rt_msg2(RTM_NEWADDR, &info, NULL, w); if (w->w_req && w->w_tmem) { - struct ifa_msghdr *ifam; - - ifam = (struct ifa_msghdr *)w->w_tmem; - ifam->ifam_index = ifa->ifa_ifp->if_index; - ifam->ifam_flags = ifa->ifa_flags; - ifam->ifam_metric = ifa->ifa_metric; - ifam->ifam_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req, w->w_tmem, len); + if (w->w_op == NET_RT_IFLISTL) + error = sysctl_iflist_ifaml(ifa, &info, + w, len); + else + error = sysctl_iflist_ifam(ifa, &info, + w, len); if (error) goto done; } } - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] = info.rti_info[RTAX_BRD] = NULL; } done: if (ifp != NULL) - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); IFNET_RUNLOCK(); return (error); } @@ -1570,7 +1781,7 @@ sysctl_ifmalist(int af, struct 
walkarg *w) continue; ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL; - IF_ADDR_LOCK(ifp); + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (af && af != ifma->ifma_addr->sa_family) continue; @@ -1591,12 +1802,12 @@ sysctl_ifmalist(int af, struct walkarg *w) ifmam->ifmam_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, w->w_tmem, len); if (error) { - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); goto done; } } } - IF_ADDR_UNLOCK(ifp); + IF_ADDR_RUNLOCK(ifp); } done: IFNET_RUNLOCK(); @@ -1662,16 +1873,17 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) rnh = rt_tables_get_rnh(BSD_DEFAULT_FIB, i); #endif /* __rtems__ */ if (rnh != NULL) { - RADIX_NODE_HEAD_LOCK(rnh); + RADIX_NODE_HEAD_RLOCK(rnh); error = rnh->rnh_walktree(rnh, sysctl_dumpentry, &w); - RADIX_NODE_HEAD_UNLOCK(rnh); + RADIX_NODE_HEAD_RUNLOCK(rnh); } else if (af != 0) error = EAFNOSUPPORT; } break; case NET_RT_IFLIST: + case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); break; diff --git a/freebsd/sys/net/vnet.h b/freebsd/sys/net/vnet.h index d3c426a4..8ef1c00d 100644 --- a/freebsd/sys/net/vnet.h +++ b/freebsd/sys/net/vnet.h @@ -116,22 +116,27 @@ void vnet_destroy(struct vnet *vnet); * Various macros -- get and set the current network stack, but also * assertions. 
*/ +#if defined(INVARIANTS) || defined(VNET_DEBUG) +#define VNET_ASSERT(exp, msg) do { \ + if (!(exp)) \ + panic msg; \ +} while (0) +#else +#define VNET_ASSERT(exp, msg) do { \ +} while (0) +#endif + #ifdef VNET_DEBUG void vnet_log_recursion(struct vnet *, const char *, int); -#define VNET_ASSERT(condition) \ - if (!(condition)) { \ - printf("VNET_ASSERT @ %s:%d %s():\n", \ - __FILE__, __LINE__, __FUNCTION__); \ - panic(#condition); \ - } - #define CURVNET_SET_QUIET(arg) \ - VNET_ASSERT((arg)->vnet_magic_n == VNET_MAGIC_N); \ + VNET_ASSERT((arg) != NULL && (arg)->vnet_magic_n == VNET_MAGIC_N, \ + ("CURVNET_SET at %s:%d %s() curvnet=%p vnet=%p", \ + __FILE__, __LINE__, __func__, curvnet, (arg))); \ struct vnet *saved_vnet = curvnet; \ const char *saved_vnet_lpush = curthread->td_vnet_lpush; \ curvnet = arg; \ - curthread->td_vnet_lpush = __FUNCTION__; + curthread->td_vnet_lpush = __func__; #define CURVNET_SET_VERBOSE(arg) \ CURVNET_SET_QUIET(arg) \ @@ -141,21 +146,31 @@ void vnet_log_recursion(struct vnet *, const char *, int); #define CURVNET_SET(arg) CURVNET_SET_VERBOSE(arg) #define CURVNET_RESTORE() \ - VNET_ASSERT(saved_vnet == NULL || \ - saved_vnet->vnet_magic_n == VNET_MAGIC_N); \ + VNET_ASSERT(curvnet != NULL && (saved_vnet == NULL || \ + saved_vnet->vnet_magic_n == VNET_MAGIC_N), \ + ("CURVNET_RESTORE at %s:%d %s() curvnet=%p saved_vnet=%p", \ + __FILE__, __LINE__, __func__, curvnet, saved_vnet)); \ curvnet = saved_vnet; \ curthread->td_vnet_lpush = saved_vnet_lpush; #else /* !VNET_DEBUG */ -#define VNET_ASSERT(condition) -#define CURVNET_SET(arg) \ +#define CURVNET_SET_QUIET(arg) \ + VNET_ASSERT((arg) != NULL && (arg)->vnet_magic_n == VNET_MAGIC_N, \ + ("CURVNET_SET at %s:%d %s() curvnet=%p vnet=%p", \ + __FILE__, __LINE__, __func__, curvnet, (arg))); \ struct vnet *saved_vnet = curvnet; \ curvnet = arg; -#define CURVNET_SET_VERBOSE(arg) CURVNET_SET(arg) -#define CURVNET_SET_QUIET(arg) CURVNET_SET(arg) +#define CURVNET_SET_VERBOSE(arg) \ + 
CURVNET_SET_QUIET(arg) + +#define CURVNET_SET(arg) CURVNET_SET_VERBOSE(arg) #define CURVNET_RESTORE() \ + VNET_ASSERT(curvnet != NULL && (saved_vnet == NULL || \ + saved_vnet->vnet_magic_n == VNET_MAGIC_N), \ + ("CURVNET_RESTORE at %s:%d %s() curvnet=%p saved_vnet=%p", \ + __FILE__, __LINE__, __func__, curvnet, saved_vnet)); \ curvnet = saved_vnet; #endif /* VNET_DEBUG */ @@ -191,15 +206,6 @@ extern struct sx vnet_sxlock; * Virtual network stack memory allocator, which allows global variables to * be automatically instantiated for each network stack instance. */ -__asm__( -#if defined(__arm__) - ".section " VNET_SETNAME ", \"aw\", %progbits\n" -#else - ".section " VNET_SETNAME ", \"aw\", @progbits\n" -#endif - "\t.p2align " __XSTRING(CACHE_LINE_SHIFT) "\n" - "\t.previous"); - #define VNET_NAME(n) vnet_entry_##n #define VNET_DECLARE(t, n) extern t VNET_NAME(n) #define VNET_DEFINE(t, n) t VNET_NAME(n) __section(VNET_SETNAME) __used @@ -357,7 +363,7 @@ do { \ */ #define curvnet NULL -#define VNET_ASSERT(condition) +#define VNET_ASSERT(exp, msg) #define CURVNET_SET(arg) #define CURVNET_SET_QUIET(arg) #define CURVNET_RESTORE() |