diff options
author | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2016-10-07 15:10:20 +0200 |
---|---|---|
committer | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2017-01-10 09:53:31 +0100 |
commit | c40e45b75eb76d79a05c7fa85c1fa9b5c728a12f (patch) | |
tree | ad4f2519067709f00ab98b3c591186c26dc3a21f /freebsd/sys/net | |
parent | userspace-header-gen.py: Simplify program ports (diff) | |
download | rtems-libbsd-c40e45b75eb76d79a05c7fa85c1fa9b5c728a12f.tar.bz2 |
Update to FreeBSD head 2016-08-23
Git mirror commit 9fe7c416e6abb28b1398fd3e5687099846800cfd.
Diffstat (limited to 'freebsd/sys/net')
104 files changed, 29683 insertions, 8933 deletions
diff --git a/freebsd/sys/net/altq/altq.h b/freebsd/sys/net/altq/altq.h new file mode 100644 index 00000000..5d7eab8a --- /dev/null +++ b/freebsd/sys/net/altq/altq.h @@ -0,0 +1,206 @@ +/*- + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $KAME: altq.h,v 1.10 2003/07/10 12:07:47 kjc Exp $ + * $FreeBSD$ + */ +#ifndef _ALTQ_ALTQ_H_ +#define _ALTQ_ALTQ_H_ + +#if 0 +/* + * allow altq-3 (altqd(8) and /dev/altq) to coexist with the new pf-based altq. + * altq3 is mainly for research experiments. pf-based altq is for daily use. 
+ */ +#define ALTQ3_COMPAT /* for compatibility with altq-3 */ +#define ALTQ3_CLFIER_COMPAT /* for compatibility with altq-3 classifier */ +#endif + +#ifdef ALTQ3_COMPAT +#include <rtems/bsd/sys/param.h> +#include <sys/ioccom.h> +#include <sys/queue.h> +#include <netinet/in.h> + +#ifndef IFNAMSIZ +#define IFNAMSIZ 16 +#endif +#endif /* ALTQ3_COMPAT */ + +/* altq discipline type */ +#define ALTQT_NONE 0 /* reserved */ +#define ALTQT_CBQ 1 /* cbq */ +#define ALTQT_WFQ 2 /* wfq */ +#define ALTQT_AFMAP 3 /* afmap */ +#define ALTQT_FIFOQ 4 /* fifoq */ +#define ALTQT_RED 5 /* red */ +#define ALTQT_RIO 6 /* rio */ +#define ALTQT_LOCALQ 7 /* local use */ +#define ALTQT_HFSC 8 /* hfsc */ +#define ALTQT_CDNR 9 /* traffic conditioner */ +#define ALTQT_BLUE 10 /* blue */ +#define ALTQT_PRIQ 11 /* priority queue */ +#define ALTQT_JOBS 12 /* JoBS */ +#define ALTQT_FAIRQ 13 /* fairq */ +#define ALTQT_CODEL 14 /* CoDel */ +#define ALTQT_MAX 15 /* should be max discipline type + 1 */ + +#ifdef ALTQ3_COMPAT +struct altqreq { + char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */ + u_long arg; /* request-specific argument */ +}; +#endif + +/* simple token backet meter profile */ +struct tb_profile { + u_int rate; /* rate in bit-per-sec */ + u_int depth; /* depth in bytes */ +}; + +#ifdef ALTQ3_COMPAT +struct tbrreq { + char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */ + struct tb_profile tb_prof; /* token bucket profile */ +}; + +#ifdef ALTQ3_CLFIER_COMPAT +/* + * common network flow info structure + */ +struct flowinfo { + u_char fi_len; /* total length */ + u_char fi_family; /* address family */ + u_int8_t fi_data[46]; /* actually longer; address family + specific flow info. */ +}; + +/* + * flow info structure for internet protocol family. 
+ * (currently this is the only protocol family supported) + */ +struct flowinfo_in { + u_char fi_len; /* sizeof(struct flowinfo_in) */ + u_char fi_family; /* AF_INET */ + u_int8_t fi_proto; /* IPPROTO_XXX */ + u_int8_t fi_tos; /* type-of-service */ + struct in_addr fi_dst; /* dest address */ + struct in_addr fi_src; /* src address */ + u_int16_t fi_dport; /* dest port */ + u_int16_t fi_sport; /* src port */ + u_int32_t fi_gpi; /* generalized port id for ipsec */ + u_int8_t _pad[28]; /* make the size equal to + flowinfo_in6 */ +}; + +#ifdef SIN6_LEN +struct flowinfo_in6 { + u_char fi6_len; /* sizeof(struct flowinfo_in6) */ + u_char fi6_family; /* AF_INET6 */ + u_int8_t fi6_proto; /* IPPROTO_XXX */ + u_int8_t fi6_tclass; /* traffic class */ + u_int32_t fi6_flowlabel; /* ipv6 flowlabel */ + u_int16_t fi6_dport; /* dest port */ + u_int16_t fi6_sport; /* src port */ + u_int32_t fi6_gpi; /* generalized port id */ + struct in6_addr fi6_dst; /* dest address */ + struct in6_addr fi6_src; /* src address */ +}; +#endif /* INET6 */ + +/* + * flow filters for AF_INET and AF_INET6 + */ +struct flow_filter { + int ff_ruleno; + struct flowinfo_in ff_flow; + struct { + struct in_addr mask_dst; + struct in_addr mask_src; + u_int8_t mask_tos; + u_int8_t _pad[3]; + } ff_mask; + u_int8_t _pad2[24]; /* make the size equal to flow_filter6 */ +}; + +#ifdef SIN6_LEN +struct flow_filter6 { + int ff_ruleno; + struct flowinfo_in6 ff_flow6; + struct { + struct in6_addr mask6_dst; + struct in6_addr mask6_src; + u_int8_t mask6_tclass; + u_int8_t _pad[3]; + } ff_mask6; +}; +#endif /* INET6 */ +#endif /* ALTQ3_CLFIER_COMPAT */ +#endif /* ALTQ3_COMPAT */ + +/* + * generic packet counter + */ +struct pktcntr { + u_int64_t packets; + u_int64_t bytes; +}; + +#define PKTCNTR_ADD(cntr, len) \ + do { (cntr)->packets++; (cntr)->bytes += len; } while (/*CONSTCOND*/ 0) + +#ifdef ALTQ3_COMPAT +/* + * altq related ioctls + */ +#define ALTQGTYPE _IOWR('q', 0, struct altqreq) /* get queue type */ +#if 0 +/* + 
* these ioctls are currently discipline-specific but could be shared + * in the future. + */ +#define ALTQATTACH _IOW('q', 1, struct altqreq) /* attach discipline */ +#define ALTQDETACH _IOW('q', 2, struct altqreq) /* detach discipline */ +#define ALTQENABLE _IOW('q', 3, struct altqreq) /* enable discipline */ +#define ALTQDISABLE _IOW('q', 4, struct altqreq) /* disable discipline*/ +#define ALTQCLEAR _IOW('q', 5, struct altqreq) /* (re)initialize */ +#define ALTQCONFIG _IOWR('q', 6, struct altqreq) /* set config params */ +#define ALTQADDCLASS _IOWR('q', 7, struct altqreq) /* add a class */ +#define ALTQMODCLASS _IOWR('q', 8, struct altqreq) /* modify a class */ +#define ALTQDELCLASS _IOWR('q', 9, struct altqreq) /* delete a class */ +#define ALTQADDFILTER _IOWR('q', 10, struct altqreq) /* add a filter */ +#define ALTQDELFILTER _IOWR('q', 11, struct altqreq) /* delete a filter */ +#define ALTQGETSTATS _IOWR('q', 12, struct altqreq) /* get statistics */ +#define ALTQGETCNTR _IOWR('q', 13, struct altqreq) /* get a pkt counter */ +#endif /* 0 */ +#define ALTQTBRSET _IOW('q', 14, struct tbrreq) /* set tb regulator */ +#define ALTQTBRGET _IOWR('q', 15, struct tbrreq) /* get tb regulator */ +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL +#include <net/altq/altq_var.h> +#endif + +#endif /* _ALTQ_ALTQ_H_ */ diff --git a/freebsd/sys/net/altq/altq_cbq.c b/freebsd/sys/net/altq/altq_cbq.c new file mode 100644 index 00000000..b8593fd6 --- /dev/null +++ b/freebsd/sys/net/altq/altq_cbq.c @@ -0,0 +1,1171 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. + * + * $KAME: altq_cbq.c,v 1.19 2003/09/17 14:23:25 kjc Exp $ + * $FreeBSD$ + */ + +#include <rtems/bsd/local/opt_altq.h> +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> +#ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */ + +#include <rtems/bsd/sys/param.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <rtems/bsd/sys/errno.h> +#include <sys/time.h> +#ifdef ALTQ3_COMPAT +#include <sys/uio.h> +#include <sys/kernel.h> +#endif + +#include <net/if.h> +#include <net/if_var.h> +#include <netinet/in.h> + +#include <netpfil/pf/pf.h> +#include <netpfil/pf/pf_altq.h> +#include <netpfil/pf/pf_mtag.h> +#include <net/altq/altq.h> +#include <net/altq/altq_cbq.h> +#ifdef ALTQ3_COMPAT +#include <net/altq/altq_conf.h> +#endif + +#ifdef ALTQ3_COMPAT +/* + * Local Data structures. + */ +static cbq_state_t *cbq_list = NULL; +#endif + +/* + * Forward Declarations. 
+ */ +static int cbq_class_destroy(cbq_state_t *, struct rm_class *); +static struct rm_class *clh_to_clp(cbq_state_t *, u_int32_t); +static int cbq_clear_interface(cbq_state_t *); +static int cbq_request(struct ifaltq *, int, void *); +static int cbq_enqueue(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); +static struct mbuf *cbq_dequeue(struct ifaltq *, int); +static void cbqrestart(struct ifaltq *); +static void get_class_stats(class_stats_t *, struct rm_class *); +static void cbq_purge(cbq_state_t *); +#ifdef ALTQ3_COMPAT +static int cbq_add_class(struct cbq_add_class *); +static int cbq_delete_class(struct cbq_delete_class *); +static int cbq_modify_class(struct cbq_modify_class *); +static int cbq_class_create(cbq_state_t *, struct cbq_add_class *, + struct rm_class *, struct rm_class *); +static int cbq_clear_hierarchy(struct cbq_interface *); +static int cbq_set_enable(struct cbq_interface *, int); +static int cbq_ifattach(struct cbq_interface *); +static int cbq_ifdetach(struct cbq_interface *); +static int cbq_getstats(struct cbq_getstats *); + +static int cbq_add_filter(struct cbq_add_filter *); +static int cbq_delete_filter(struct cbq_delete_filter *); +#endif /* ALTQ3_COMPAT */ + +/* + * int + * cbq_class_destroy(cbq_mod_state_t *, struct rm_class *) - This + * function destroys a given traffic class. Before destroying + * the class, all traffic for that class is released. 
+ */ +static int +cbq_class_destroy(cbq_state_t *cbqp, struct rm_class *cl) +{ + int i; + + /* delete the class */ + rmc_delete_class(&cbqp->ifnp, cl); + + /* + * free the class handle + */ + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == cl) + cbqp->cbq_class_tbl[i] = NULL; + + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; +#ifdef ALTQ3_COMPAT + if (cl == cbqp->ifnp.ctl_) + cbqp->ifnp.ctl_ = NULL; +#endif + return (0); +} + +/* convert class handle to class pointer */ +static struct rm_class * +clh_to_clp(cbq_state_t *cbqp, u_int32_t chandle) +{ + int i; + struct rm_class *cl; + + if (chandle == 0) + return (NULL); + /* + * first, try optimistically the slot matching the lower bits of + * the handle. if it fails, do the linear table search. + */ + i = chandle % CBQ_MAX_CLASSES; + if ((cl = cbqp->cbq_class_tbl[i]) != NULL && + cl->stats_.handle == chandle) + return (cl); + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if ((cl = cbqp->cbq_class_tbl[i]) != NULL && + cl->stats_.handle == chandle) + return (cl); + return (NULL); +} + +static int +cbq_clear_interface(cbq_state_t *cbqp) +{ + int again, i; + struct rm_class *cl; + +#ifdef ALTQ3_CLFIER_COMPAT + /* free the filters for this interface */ + acc_discard_filters(&cbqp->cbq_classifier, NULL, 1); +#endif + + /* clear out the classes now */ + do { + again = 0; + for (i = 0; i < CBQ_MAX_CLASSES; i++) { + if ((cl = cbqp->cbq_class_tbl[i]) != NULL) { + if (is_a_parent_class(cl)) + again++; + else { + cbq_class_destroy(cbqp, cl); + cbqp->cbq_class_tbl[i] = NULL; + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; +#ifdef ALTQ3_COMPAT + if (cl == cbqp->ifnp.ctl_) + cbqp->ifnp.ctl_ = NULL; +#endif + } + } + } + } while (again); + + return (0); +} + +static int +cbq_request(struct ifaltq *ifq, int req, void *arg) +{ + cbq_state_t *cbqp = (cbq_state_t 
*)ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + switch (req) { + case ALTRQ_PURGE: + cbq_purge(cbqp); + break; + } + return (0); +} + +/* copy the stats info in rm_class to class_states_t */ +static void +get_class_stats(class_stats_t *statsp, struct rm_class *cl) +{ + statsp->xmit_cnt = cl->stats_.xmit_cnt; + statsp->drop_cnt = cl->stats_.drop_cnt; + statsp->over = cl->stats_.over; + statsp->borrows = cl->stats_.borrows; + statsp->overactions = cl->stats_.overactions; + statsp->delays = cl->stats_.delays; + + statsp->depth = cl->depth_; + statsp->priority = cl->pri_; + statsp->maxidle = cl->maxidle_; + statsp->minidle = cl->minidle_; + statsp->offtime = cl->offtime_; + statsp->qmax = qlimit(cl->q_); + statsp->ns_per_byte = cl->ns_per_byte_; + statsp->wrr_allot = cl->w_allotment_; + statsp->qcnt = qlen(cl->q_); + statsp->avgidle = cl->avgidle_; + + statsp->qtype = qtype(cl->q_); +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + red_getstats(cl->red_, &statsp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + rio_getstats((rio_t *)cl->red_, &statsp->red[0]); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->q_)) + codel_getstats(cl->codel_, &statsp->codel); +#endif +} + +int +cbq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); + s = splnet(); + error = altq_attach(&ifp->if_snd, ALTQT_CBQ, a->altq_disc, + cbq_enqueue, cbq_dequeue, cbq_request, NULL, NULL); + splx(s); + return (error); +} + +int +cbq_add_altq(struct pf_altq *a) +{ + cbq_state_t *cbqp; + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENODEV); + + /* allocate and initialize cbq_state_t */ + cbqp = malloc(sizeof(cbq_state_t), M_DEVBUF, M_NOWAIT | M_ZERO); + if (cbqp == NULL) + return (ENOMEM); + CALLOUT_INIT(&cbqp->cbq_callout); + cbqp->cbq_qlen = 0; + cbqp->ifnp.ifq_ = &ifp->if_snd; /* keep the ifq */ + + /* keep the 
state in pf_altq */ + a->altq_disc = cbqp; + + return (0); +} + +int +cbq_remove_altq(struct pf_altq *a) +{ + cbq_state_t *cbqp; + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + cbq_clear_interface(cbqp); + + if (cbqp->ifnp.default_) + cbq_class_destroy(cbqp, cbqp->ifnp.default_); + if (cbqp->ifnp.root_) + cbq_class_destroy(cbqp, cbqp->ifnp.root_); + + /* deallocate cbq_state_t */ + free(cbqp, M_DEVBUF); + + return (0); +} + +int +cbq_add_queue(struct pf_altq *a) +{ + struct rm_class *borrow, *parent; + cbq_state_t *cbqp; + struct rm_class *cl; + struct cbq_opts *opts; + int i; + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + if (a->qid == 0) + return (EINVAL); + + /* + * find a free slot in the class table. if the slot matching + * the lower bits of qid is free, use this slot. otherwise, + * use the first free slot. + */ + i = a->qid % CBQ_MAX_CLASSES; + if (cbqp->cbq_class_tbl[i] != NULL) { + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == NULL) + break; + if (i == CBQ_MAX_CLASSES) + return (EINVAL); + } + + opts = &a->pq_u.cbq_opts; + /* check parameters */ + if (a->priority >= CBQ_MAXPRI) + return (EINVAL); + + /* Get pointers to parent and borrow classes. */ + parent = clh_to_clp(cbqp, a->parent_qid); + if (opts->flags & CBQCLF_BORROW) + borrow = parent; + else + borrow = NULL; + + /* + * A class must borrow from it's parent or it can not + * borrow at all. Hence, borrow can be null. 
+ */ + if (parent == NULL && (opts->flags & CBQCLF_ROOTCLASS) == 0) { + printf("cbq_add_queue: no parent class!\n"); + return (EINVAL); + } + + if ((borrow != parent) && (borrow != NULL)) { + printf("cbq_add_class: borrow class != parent\n"); + return (EINVAL); + } + + /* + * check parameters + */ + switch (opts->flags & CBQCLF_CLASSMASK) { + case CBQCLF_ROOTCLASS: + if (parent != NULL) + return (EINVAL); + if (cbqp->ifnp.root_) + return (EINVAL); + break; + case CBQCLF_DEFCLASS: + if (cbqp->ifnp.default_) + return (EINVAL); + break; + case 0: + if (a->qid == 0) + return (EINVAL); + break; + default: + /* more than two flags bits set */ + return (EINVAL); + } + + /* + * create a class. if this is a root class, initialize the + * interface. + */ + if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_ROOTCLASS) { + rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, opts->ns_per_byte, + cbqrestart, a->qlimit, RM_MAXQUEUED, + opts->maxidle, opts->minidle, opts->offtime, + opts->flags); + cl = cbqp->ifnp.root_; + } else { + cl = rmc_newclass(a->priority, + &cbqp->ifnp, opts->ns_per_byte, + rmc_delay_action, a->qlimit, parent, borrow, + opts->maxidle, opts->minidle, opts->offtime, + opts->pktsize, opts->flags); + } + if (cl == NULL) + return (ENOMEM); + + /* return handle to user space. */ + cl->stats_.handle = a->qid; + cl->stats_.depth = cl->depth_; + + /* save the allocated class */ + cbqp->cbq_class_tbl[i] = cl; + + if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_DEFCLASS) + cbqp->ifnp.default_ = cl; + + return (0); +} + +int +cbq_remove_queue(struct pf_altq *a) +{ + struct rm_class *cl; + cbq_state_t *cbqp; + int i; + + if ((cbqp = a->altq_disc) == NULL) + return (EINVAL); + + if ((cl = clh_to_clp(cbqp, a->qid)) == NULL) + return (EINVAL); + + /* if we are a parent class, then return an error. 
*/ + if (is_a_parent_class(cl)) + return (EINVAL); + + /* delete the class */ + rmc_delete_class(&cbqp->ifnp, cl); + + /* + * free the class handle + */ + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == cl) { + cbqp->cbq_class_tbl[i] = NULL; + if (cl == cbqp->ifnp.root_) + cbqp->ifnp.root_ = NULL; + if (cl == cbqp->ifnp.default_) + cbqp->ifnp.default_ = NULL; + break; + } + + return (0); +} + +int +cbq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + cbq_state_t *cbqp; + struct rm_class *cl; + class_stats_t stats; + int error = 0; + + if ((cbqp = altq_lookup(a->ifname, ALTQT_CBQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(cbqp, a->qid)) == NULL) + return (EINVAL); + + if (*nbytes < sizeof(stats)) + return (EINVAL); + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = sizeof(stats); + return (0); +} + +/* + * int + * cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pattr) + * - Queue data packets. + * + * cbq_enqueue is set to ifp->if_altqenqueue and called by an upper + * layer (e.g. ether_output). cbq_enqueue queues the given packet + * to the cbq, then invokes the driver's start routine. + * + * Assumptions: called in splimp + * Returns: 0 if the queueing is successful. + * ENOBUFS if a packet dropping occurred as a result of + * the queueing. 
+ */ + +static int +cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + struct rm_class *cl; + struct pf_mtag *t; + int len; + + IFQ_LOCK_ASSERT(ifq); + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ + printf("altq: packet for %s does not have pkthdr\n", + ifq->altq_ifp->if_xname); + m_freem(m); + return (ENOBUFS); + } + cl = NULL; + if ((t = pf_find_mtag(m)) != NULL) + cl = clh_to_clp(cbqp, t->qid); +#ifdef ALTQ3_COMPAT + else if ((ifq->altq_flags & ALTQF_CLASSIFY) && pktattr != NULL) + cl = pktattr->pattr_class; +#endif + if (cl == NULL) { + cl = cbqp->ifnp.default_; + if (cl == NULL) { + m_freem(m); + return (ENOBUFS); + } + } +#ifdef ALTQ3_COMPAT + if (pktattr != NULL) + cl->pktattr_ = pktattr; /* save proto hdr used by ECN */ + else +#endif + cl->pktattr_ = NULL; + len = m_pktlen(m); + if (rmc_queue_packet(cl, m) != 0) { + /* drop occurred. some mbuf was freed in rmc_queue_packet. */ + PKTCNTR_ADD(&cl->stats_.drop_cnt, len); + return (ENOBUFS); + } + + /* successfully queued. */ + ++cbqp->cbq_qlen; + IFQ_INC_LEN(ifq); + return (0); +} + +static struct mbuf * +cbq_dequeue(struct ifaltq *ifq, int op) +{ + cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; + struct mbuf *m; + + IFQ_LOCK_ASSERT(ifq); + + m = rmc_dequeue_next(&cbqp->ifnp, op); + + if (m && op == ALTDQ_REMOVE) { + --cbqp->cbq_qlen; /* decrement # of packets in cbq */ + IFQ_DEC_LEN(ifq); + + /* Update the class. */ + rmc_update_class_util(&cbqp->ifnp); + } + return (m); +} + +/* + * void + * cbqrestart(queue_t *) - Restart sending of data. + * called from rmc_restart in splimp via timeout after waking up + * a suspended class. 
+ * Returns: NONE + */ + +static void +cbqrestart(struct ifaltq *ifq) +{ + cbq_state_t *cbqp; + struct ifnet *ifp; + + IFQ_LOCK_ASSERT(ifq); + + if (!ALTQ_IS_ENABLED(ifq)) + /* cbq must have been detached */ + return; + + if ((cbqp = (cbq_state_t *)ifq->altq_disc) == NULL) + /* should not happen */ + return; + + ifp = ifq->altq_ifp; + if (ifp->if_start && + cbqp->cbq_qlen > 0 && (ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) { + IFQ_UNLOCK(ifq); + (*ifp->if_start)(ifp); + IFQ_LOCK(ifq); + } +} + +static void cbq_purge(cbq_state_t *cbqp) +{ + struct rm_class *cl; + int i; + + for (i = 0; i < CBQ_MAX_CLASSES; i++) + if ((cl = cbqp->cbq_class_tbl[i]) != NULL) + rmc_dropall(cl); + if (ALTQ_IS_ENABLED(cbqp->ifnp.ifq_)) + cbqp->ifnp.ifq_->ifq_len = 0; +} +#ifdef ALTQ3_COMPAT + +static int +cbq_add_class(acp) + struct cbq_add_class *acp; +{ + char *ifacename; + struct rm_class *borrow, *parent; + cbq_state_t *cbqp; + + ifacename = acp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + /* check parameters */ + if (acp->cbq_class.priority >= CBQ_MAXPRI || + acp->cbq_class.maxq > CBQ_MAXQSIZE) + return (EINVAL); + + /* Get pointers to parent and borrow classes. */ + parent = clh_to_clp(cbqp, acp->cbq_class.parent_class_handle); + borrow = clh_to_clp(cbqp, acp->cbq_class.borrow_class_handle); + + /* + * A class must borrow from it's parent or it can not + * borrow at all. Hence, borrow can be null. 
+ */ + if (parent == NULL && (acp->cbq_class.flags & CBQCLF_ROOTCLASS) == 0) { + printf("cbq_add_class: no parent class!\n"); + return (EINVAL); + } + + if ((borrow != parent) && (borrow != NULL)) { + printf("cbq_add_class: borrow class != parent\n"); + return (EINVAL); + } + + return cbq_class_create(cbqp, acp, parent, borrow); +} + +static int +cbq_delete_class(dcp) + struct cbq_delete_class *dcp; +{ + char *ifacename; + struct rm_class *cl; + cbq_state_t *cbqp; + + ifacename = dcp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(cbqp, dcp->cbq_class_handle)) == NULL) + return (EINVAL); + + /* if we are a parent class, then return an error. */ + if (is_a_parent_class(cl)) + return (EINVAL); + + /* if a filter has a reference to this class delete the filter */ + acc_discard_filters(&cbqp->cbq_classifier, cl, 0); + + return cbq_class_destroy(cbqp, cl); +} + +static int +cbq_modify_class(acp) + struct cbq_modify_class *acp; +{ + char *ifacename; + struct rm_class *cl; + cbq_state_t *cbqp; + + ifacename = acp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + /* Get pointer to this class */ + if ((cl = clh_to_clp(cbqp, acp->cbq_class_handle)) == NULL) + return (EINVAL); + + if (rmc_modclass(cl, acp->cbq_class.nano_sec_per_byte, + acp->cbq_class.maxq, acp->cbq_class.maxidle, + acp->cbq_class.minidle, acp->cbq_class.offtime, + acp->cbq_class.pktsize) < 0) + return (EINVAL); + return (0); +} + +/* + * struct rm_class * + * cbq_class_create(cbq_mod_state_t *cbqp, struct cbq_add_class *acp, + * struct rm_class *parent, struct rm_class *borrow) + * + * This function create a new traffic class in the CBQ class hierarchy of + * given parameters. The class that created is either the root, default, + * or a new dynamic class. If CBQ is not initilaized, the root class + * will be created. 
+ */ +static int +cbq_class_create(cbqp, acp, parent, borrow) + cbq_state_t *cbqp; + struct cbq_add_class *acp; + struct rm_class *parent, *borrow; +{ + struct rm_class *cl; + cbq_class_spec_t *spec = &acp->cbq_class; + u_int32_t chandle; + int i; + + /* + * allocate class handle + */ + for (i = 1; i < CBQ_MAX_CLASSES; i++) + if (cbqp->cbq_class_tbl[i] == NULL) + break; + if (i == CBQ_MAX_CLASSES) + return (EINVAL); + chandle = i; /* use the slot number as class handle */ + + /* + * create a class. if this is a root class, initialize the + * interface. + */ + if ((spec->flags & CBQCLF_CLASSMASK) == CBQCLF_ROOTCLASS) { + rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, spec->nano_sec_per_byte, + cbqrestart, spec->maxq, RM_MAXQUEUED, + spec->maxidle, spec->minidle, spec->offtime, + spec->flags); + cl = cbqp->ifnp.root_; + } else { + cl = rmc_newclass(spec->priority, + &cbqp->ifnp, spec->nano_sec_per_byte, + rmc_delay_action, spec->maxq, parent, borrow, + spec->maxidle, spec->minidle, spec->offtime, + spec->pktsize, spec->flags); + } + if (cl == NULL) + return (ENOMEM); + + /* return handle to user space. */ + acp->cbq_class_handle = chandle; + + cl->stats_.handle = chandle; + cl->stats_.depth = cl->depth_; + + /* save the allocated class */ + cbqp->cbq_class_tbl[i] = cl; + + if ((spec->flags & CBQCLF_CLASSMASK) == CBQCLF_DEFCLASS) + cbqp->ifnp.default_ = cl; + if ((spec->flags & CBQCLF_CLASSMASK) == CBQCLF_CTLCLASS) + cbqp->ifnp.ctl_ = cl; + + return (0); +} + +static int +cbq_add_filter(afp) + struct cbq_add_filter *afp; +{ + char *ifacename; + cbq_state_t *cbqp; + struct rm_class *cl; + + ifacename = afp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + /* Get the pointer to class. 
*/ + if ((cl = clh_to_clp(cbqp, afp->cbq_class_handle)) == NULL) + return (EINVAL); + + return acc_add_filter(&cbqp->cbq_classifier, &afp->cbq_filter, + cl, &afp->cbq_filter_handle); +} + +static int +cbq_delete_filter(dfp) + struct cbq_delete_filter *dfp; +{ + char *ifacename; + cbq_state_t *cbqp; + + ifacename = dfp->cbq_iface.cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + return acc_delete_filter(&cbqp->cbq_classifier, + dfp->cbq_filter_handle); +} + +/* + * cbq_clear_hierarchy deletes all classes and their filters on the + * given interface. + */ +static int +cbq_clear_hierarchy(ifacep) + struct cbq_interface *ifacep; +{ + char *ifacename; + cbq_state_t *cbqp; + + ifacename = ifacep->cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + return cbq_clear_interface(cbqp); +} + +/* + * static int + * cbq_set_enable(struct cbq_enable *ep) - this function processed the + * ioctl request to enable class based queueing. It searches the list + * of interfaces for the specified interface and then enables CBQ on + * that interface. + * + * Returns: 0, for no error. + * EBADF, for specified inteface not found. 
+ */ + +static int +cbq_set_enable(ep, enable) + struct cbq_interface *ep; + int enable; +{ + int error = 0; + cbq_state_t *cbqp; + char *ifacename; + + ifacename = ep->cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + switch (enable) { + case ENABLE: + if (cbqp->ifnp.root_ == NULL || cbqp->ifnp.default_ == NULL || + cbqp->ifnp.ctl_ == NULL) { + if (cbqp->ifnp.root_ == NULL) + printf("No Root Class for %s\n", ifacename); + if (cbqp->ifnp.default_ == NULL) + printf("No Default Class for %s\n", ifacename); + if (cbqp->ifnp.ctl_ == NULL) + printf("No Control Class for %s\n", ifacename); + error = EINVAL; + } else if ((error = altq_enable(cbqp->ifnp.ifq_)) == 0) { + cbqp->cbq_qlen = 0; + } + break; + + case DISABLE: + error = altq_disable(cbqp->ifnp.ifq_); + break; + } + return (error); +} + +static int +cbq_getstats(gsp) + struct cbq_getstats *gsp; +{ + char *ifacename; + int i, n, nclasses; + cbq_state_t *cbqp; + struct rm_class *cl; + class_stats_t stats, *usp; + int error = 0; + + ifacename = gsp->iface.cbq_ifacename; + nclasses = gsp->nclasses; + usp = gsp->stats; + + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + if (nclasses <= 0) + return (EINVAL); + + for (n = 0, i = 0; n < nclasses && i < CBQ_MAX_CLASSES; n++, i++) { + while ((cl = cbqp->cbq_class_tbl[i]) == NULL) + if (++i >= CBQ_MAX_CLASSES) + goto out; + + get_class_stats(&stats, cl); + stats.handle = cl->stats_.handle; + + if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, + sizeof(stats))) != 0) + return (error); + } + + out: + gsp->nclasses = n; + return (error); +} + +static int +cbq_ifattach(ifacep) + struct cbq_interface *ifacep; +{ + int error = 0; + char *ifacename; + cbq_state_t *new_cbqp; + struct ifnet *ifp; + + ifacename = ifacep->cbq_ifacename; + if ((ifp = ifunit(ifacename)) == NULL) + return (ENXIO); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENXIO); + + /* allocate and initialize cbq_state_t */ + new_cbqp = 
malloc(sizeof(cbq_state_t), M_DEVBUF, M_WAITOK); + if (new_cbqp == NULL) + return (ENOMEM); + bzero(new_cbqp, sizeof(cbq_state_t)); + CALLOUT_INIT(&new_cbqp->cbq_callout); + + new_cbqp->cbq_qlen = 0; + new_cbqp->ifnp.ifq_ = &ifp->if_snd; /* keep the ifq */ + + /* + * set CBQ to this ifnet structure. + */ + error = altq_attach(&ifp->if_snd, ALTQT_CBQ, new_cbqp, + cbq_enqueue, cbq_dequeue, cbq_request, + &new_cbqp->cbq_classifier, acc_classify); + if (error) { + free(new_cbqp, M_DEVBUF); + return (error); + } + + /* prepend to the list of cbq_state_t's. */ + new_cbqp->cbq_next = cbq_list; + cbq_list = new_cbqp; + + return (0); +} + +static int +cbq_ifdetach(ifacep) + struct cbq_interface *ifacep; +{ + char *ifacename; + cbq_state_t *cbqp; + + ifacename = ifacep->cbq_ifacename; + if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) + return (EBADF); + + (void)cbq_set_enable(ifacep, DISABLE); + + cbq_clear_interface(cbqp); + + /* remove CBQ from the ifnet structure. */ + (void)altq_detach(cbqp->ifnp.ifq_); + + /* remove from the list of cbq_state_t's. 
*/ + if (cbq_list == cbqp) + cbq_list = cbqp->cbq_next; + else { + cbq_state_t *cp; + + for (cp = cbq_list; cp != NULL; cp = cp->cbq_next) + if (cp->cbq_next == cbqp) { + cp->cbq_next = cbqp->cbq_next; + break; + } + ASSERT(cp != NULL); + } + + /* deallocate cbq_state_t */ + free(cbqp, M_DEVBUF); + + return (0); +} + +/* + * cbq device interface + */ + +altqdev_decl(cbq); + +int +cbqopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + return (0); +} + +int +cbqclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct ifnet *ifp; + struct cbq_interface iface; + int err, error = 0; + + while (cbq_list) { + ifp = cbq_list->ifnp.ifq_->altq_ifp; + sprintf(iface.cbq_ifacename, "%s", ifp->if_xname); + err = cbq_ifdetach(&iface); + if (err != 0 && error == 0) + error = err; + } + + return (error); +} + +int +cbqioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + int error = 0; + + /* check cmd for superuser only */ + switch (cmd) { + case CBQ_GETSTATS: + /* currently only command that an ordinary user can call */ + break; + default: +#if (__FreeBSD_version > 700000) + error = priv_check(p, PRIV_ALTQ_MANAGE); +#elsif (__FreeBSD_version > 400000) + error = suser(p); +#else + error = suser(p->p_ucred, &p->p_acflag); +#endif + if (error) + return (error); + break; + } + + switch (cmd) { + + case CBQ_ENABLE: + error = cbq_set_enable((struct cbq_interface *)addr, ENABLE); + break; + + case CBQ_DISABLE: + error = cbq_set_enable((struct cbq_interface *)addr, DISABLE); + break; + + case CBQ_ADD_FILTER: + error = cbq_add_filter((struct cbq_add_filter *)addr); + break; + + case CBQ_DEL_FILTER: + error = cbq_delete_filter((struct cbq_delete_filter *)addr); + break; + + 
case CBQ_ADD_CLASS: + error = cbq_add_class((struct cbq_add_class *)addr); + break; + + case CBQ_DEL_CLASS: + error = cbq_delete_class((struct cbq_delete_class *)addr); + break; + + case CBQ_MODIFY_CLASS: + error = cbq_modify_class((struct cbq_modify_class *)addr); + break; + + case CBQ_CLEAR_HIERARCHY: + error = cbq_clear_hierarchy((struct cbq_interface *)addr); + break; + + case CBQ_IF_ATTACH: + error = cbq_ifattach((struct cbq_interface *)addr); + break; + + case CBQ_IF_DETACH: + error = cbq_ifdetach((struct cbq_interface *)addr); + break; + + case CBQ_GETSTATS: + error = cbq_getstats((struct cbq_getstats *)addr); + break; + + default: + error = EINVAL; + break; + } + + return error; +} + +#if 0 +/* for debug */ +static void cbq_class_dump(int); + +static void cbq_class_dump(i) + int i; +{ + struct rm_class *cl; + rm_class_stats_t *s; + struct _class_queue_ *q; + + if (cbq_list == NULL) { + printf("cbq_class_dump: no cbq_state found\n"); + return; + } + cl = cbq_list->cbq_class_tbl[i]; + + printf("class %d cl=%p\n", i, cl); + if (cl != NULL) { + s = &cl->stats_; + q = cl->q_; + + printf("pri=%d, depth=%d, maxrate=%d, allotment=%d\n", + cl->pri_, cl->depth_, cl->maxrate_, cl->allotment_); + printf("w_allotment=%d, bytes_alloc=%d, avgidle=%d, maxidle=%d\n", + cl->w_allotment_, cl->bytes_alloc_, cl->avgidle_, + cl->maxidle_); + printf("minidle=%d, offtime=%d, sleeping=%d, leaf=%d\n", + cl->minidle_, cl->offtime_, cl->sleeping_, cl->leaf_); + printf("handle=%d, depth=%d, packets=%d, bytes=%d\n", + s->handle, s->depth, + (int)s->xmit_cnt.packets, (int)s->xmit_cnt.bytes); + printf("over=%d\n, borrows=%d, drops=%d, overactions=%d, delays=%d\n", + s->over, s->borrows, (int)s->drop_cnt.packets, + s->overactions, s->delays); + printf("tail=%p, head=%p, qlen=%d, qlim=%d, qthresh=%d,qtype=%d\n", + q->tail_, q->head_, q->qlen_, q->qlim_, + q->qthresh_, q->qtype_); + } +} +#endif /* 0 */ + +#ifdef KLD_MODULE + +static struct altqsw cbq_sw = + {"cbq", cbqopen, cbqclose, 
cbqioctl}; + +ALTQ_MODULE(altq_cbq, ALTQT_CBQ, &cbq_sw); +MODULE_DEPEND(altq_cbq, altq_red, 1, 1, 1); +MODULE_DEPEND(altq_cbq, altq_rio, 1, 1, 1); + +#endif /* KLD_MODULE */ +#endif /* ALTQ3_COMPAT */ + +#endif /* ALTQ_CBQ */ diff --git a/freebsd/sys/net/altq/altq_cbq.h b/freebsd/sys/net/altq/altq_cbq.h new file mode 100644 index 00000000..51e7cf9a --- /dev/null +++ b/freebsd/sys/net/altq/altq_cbq.h @@ -0,0 +1,225 @@ +/*- + * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. 
+ * + * $KAME: altq_cbq.h,v 1.12 2003/10/03 05:05:15 kjc Exp $ + * $FreeBSD$ + */ + +#ifndef _ALTQ_ALTQ_CBQ_H_ +#define _ALTQ_ALTQ_CBQ_H_ + +#include <net/altq/altq.h> +#include <net/altq/altq_rmclass.h> +#include <net/altq/altq_codel.h> +#include <net/altq/altq_red.h> +#include <net/altq/altq_rio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define NULL_CLASS_HANDLE 0 + +/* class flags should be same as class flags in rm_class.h */ +#define CBQCLF_RED 0x0001 /* use RED */ +#define CBQCLF_ECN 0x0002 /* use RED/ECN */ +#define CBQCLF_RIO 0x0004 /* use RIO */ +#define CBQCLF_FLOWVALVE 0x0008 /* use flowvalve (aka penalty-box) */ +#define CBQCLF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define CBQCLF_BORROW 0x0020 /* borrow from parent */ +#define CBQCLF_CODEL 0x0040 /* use CoDel */ + +/* class flags only for root class */ +#define CBQCLF_WRR 0x0100 /* weighted-round robin */ +#define CBQCLF_EFFICIENT 0x0200 /* work-conserving */ + +/* class flags for special classes */ +#define CBQCLF_ROOTCLASS 0x1000 /* root class */ +#define CBQCLF_DEFCLASS 0x2000 /* default class */ +#ifdef ALTQ3_COMPAT +#define CBQCLF_CTLCLASS 0x4000 /* control class */ +#endif +#define CBQCLF_CLASSMASK 0xf000 /* class mask */ + +#define CBQ_MAXQSIZE 200 +#define CBQ_MAXPRI RM_MAXPRIO + +typedef struct _cbq_class_stats_ { + u_int32_t handle; + u_int depth; + + struct pktcntr xmit_cnt; /* packets sent in this class */ + struct pktcntr drop_cnt; /* dropped packets */ + u_int over; /* # times went over limit */ + u_int borrows; /* # times tried to borrow */ + u_int overactions; /* # times invoked overlimit action */ + u_int delays; /* # times invoked delay actions */ + + /* other static class parameters useful for debugging */ + int priority; + int maxidle; + int minidle; + int offtime; + int qmax; + int ns_per_byte; + int wrr_allot; + + int qcnt; /* # packets in queue */ + int avgidle; + + /* codel, red and rio related info */ + int qtype; + struct redstats red[3]; + struct codel_stats 
codel; +} class_stats_t; + +#ifdef ALTQ3_COMPAT +/* + * Define structures associated with IOCTLS for cbq. + */ + +/* + * Define the CBQ interface structure. This must be included in all + * IOCTL's such that the CBQ driver may find the appropriate CBQ module + * associated with the network interface to be affected. + */ +struct cbq_interface { + char cbq_ifacename[IFNAMSIZ]; +}; + +typedef struct cbq_class_spec { + u_int priority; + u_int nano_sec_per_byte; + u_int maxq; + u_int maxidle; + int minidle; + u_int offtime; + u_int32_t parent_class_handle; + u_int32_t borrow_class_handle; + + u_int pktsize; + int flags; +} cbq_class_spec_t; + +struct cbq_add_class { + struct cbq_interface cbq_iface; + + cbq_class_spec_t cbq_class; + u_int32_t cbq_class_handle; +}; + +struct cbq_delete_class { + struct cbq_interface cbq_iface; + u_int32_t cbq_class_handle; +}; + +struct cbq_modify_class { + struct cbq_interface cbq_iface; + + cbq_class_spec_t cbq_class; + u_int32_t cbq_class_handle; +}; + +struct cbq_add_filter { + struct cbq_interface cbq_iface; + u_int32_t cbq_class_handle; + struct flow_filter cbq_filter; + + u_long cbq_filter_handle; +}; + +struct cbq_delete_filter { + struct cbq_interface cbq_iface; + u_long cbq_filter_handle; +}; + +/* number of classes are returned in nclasses field */ +struct cbq_getstats { + struct cbq_interface iface; + int nclasses; + class_stats_t *stats; +}; + +/* + * Define IOCTLs for CBQ. 
+ */ +#define CBQ_IF_ATTACH _IOW('Q', 1, struct cbq_interface) +#define CBQ_IF_DETACH _IOW('Q', 2, struct cbq_interface) +#define CBQ_ENABLE _IOW('Q', 3, struct cbq_interface) +#define CBQ_DISABLE _IOW('Q', 4, struct cbq_interface) +#define CBQ_CLEAR_HIERARCHY _IOW('Q', 5, struct cbq_interface) +#define CBQ_ADD_CLASS _IOWR('Q', 7, struct cbq_add_class) +#define CBQ_DEL_CLASS _IOW('Q', 8, struct cbq_delete_class) +#define CBQ_MODIFY_CLASS _IOWR('Q', 9, struct cbq_modify_class) +#define CBQ_ADD_FILTER _IOWR('Q', 10, struct cbq_add_filter) +#define CBQ_DEL_FILTER _IOW('Q', 11, struct cbq_delete_filter) +#define CBQ_GETSTATS _IOWR('Q', 12, struct cbq_getstats) +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL +/* + * Define macros only good for kernel drivers and modules. + */ +#define CBQ_WATCHDOG (hz / 20) +#define CBQ_TIMEOUT 10 +#define CBQ_LS_TIMEOUT (20 * hz / 1000) + +#define CBQ_MAX_CLASSES 256 + +#ifdef ALTQ3_COMPAT +#define CBQ_MAX_FILTERS 256 + +#define DISABLE 0x00 +#define ENABLE 0x01 +#endif /* ALTQ3_COMPAT */ + +/* + * Define State structures. + */ +typedef struct cbqstate { +#ifdef ALTQ3_COMPAT + struct cbqstate *cbq_next; +#endif + int cbq_qlen; /* # of packets in cbq */ + struct rm_class *cbq_class_tbl[CBQ_MAX_CLASSES]; + + struct rm_ifdat ifnp; + struct callout cbq_callout; /* for timeouts */ +#ifdef ALTQ3_CLFIER_COMPAT + struct acc_classifier cbq_classifier; +#endif +} cbq_state_t; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* !_ALTQ_ALTQ_CBQ_H_ */ diff --git a/freebsd/sys/net/altq/altq_cdnr.c b/freebsd/sys/net/altq/altq_cdnr.c new file mode 100644 index 00000000..f456ce83 --- /dev/null +++ b/freebsd/sys/net/altq/altq_cdnr.c @@ -0,0 +1,1384 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (C) 1999-2002 + * Sony Computer Science Laboratories Inc. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $KAME: altq_cdnr.c,v 1.15 2005/04/13 03:44:24 suz Exp $ + * $FreeBSD$ + */ + +#include <rtems/bsd/local/opt_altq.h> +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <rtems/bsd/sys/errno.h> +#include <sys/kernel.h> +#include <sys/queue.h> + +#include <net/if.h> +#include <net/if_types.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif + +#include <net/altq/if_altq.h> +#include <net/altq/altq.h> +#ifdef ALTQ3_COMPAT +#include <net/altq/altq_conf.h> +#endif +#include <net/altq/altq_cdnr.h> + +#ifdef ALTQ3_COMPAT +/* + * diffserv traffic conditioning module + */ + +int altq_cdnr_enabled = 0; + +/* traffic conditioner is enabled by ALTQ_CDNR option in opt_altq.h */ +#ifdef ALTQ_CDNR + +/* cdnr_list keeps all cdnr's allocated. 
*/ +static LIST_HEAD(, top_cdnr) tcb_list; + +static int altq_cdnr_input(struct mbuf *, int); +static struct top_cdnr *tcb_lookup(char *ifname); +static struct cdnr_block *cdnr_handle2cb(u_long); +static u_long cdnr_cb2handle(struct cdnr_block *); +static void *cdnr_cballoc(struct top_cdnr *, int, + struct tc_action *(*)(struct cdnr_block *, struct cdnr_pktinfo *)); +static void cdnr_cbdestroy(void *); +static int tca_verify_action(struct tc_action *); +static void tca_import_action(struct tc_action *, struct tc_action *); +static void tca_invalidate_action(struct tc_action *); + +static int generic_element_destroy(struct cdnr_block *); +static struct top_cdnr *top_create(struct ifaltq *); +static int top_destroy(struct top_cdnr *); +static struct cdnr_block *element_create(struct top_cdnr *, struct tc_action *); +static int element_destroy(struct cdnr_block *); +static void tb_import_profile(struct tbe *, struct tb_profile *); +static struct tbmeter *tbm_create(struct top_cdnr *, struct tb_profile *, + struct tc_action *, struct tc_action *); +static int tbm_destroy(struct tbmeter *); +static struct tc_action *tbm_input(struct cdnr_block *, struct cdnr_pktinfo *); +static struct trtcm *trtcm_create(struct top_cdnr *, + struct tb_profile *, struct tb_profile *, + struct tc_action *, struct tc_action *, struct tc_action *, + int); +static int trtcm_destroy(struct trtcm *); +static struct tc_action *trtcm_input(struct cdnr_block *, struct cdnr_pktinfo *); +static struct tswtcm *tswtcm_create(struct top_cdnr *, + u_int32_t, u_int32_t, u_int32_t, + struct tc_action *, struct tc_action *, struct tc_action *); +static int tswtcm_destroy(struct tswtcm *); +static struct tc_action *tswtcm_input(struct cdnr_block *, struct cdnr_pktinfo *); + +static int cdnrcmd_if_attach(char *); +static int cdnrcmd_if_detach(char *); +static int cdnrcmd_add_element(struct cdnr_add_element *); +static int cdnrcmd_delete_element(struct cdnr_delete_element *); +static int 
cdnrcmd_add_filter(struct cdnr_add_filter *); +static int cdnrcmd_delete_filter(struct cdnr_delete_filter *); +static int cdnrcmd_add_tbm(struct cdnr_add_tbmeter *); +static int cdnrcmd_modify_tbm(struct cdnr_modify_tbmeter *); +static int cdnrcmd_tbm_stats(struct cdnr_tbmeter_stats *); +static int cdnrcmd_add_trtcm(struct cdnr_add_trtcm *); +static int cdnrcmd_modify_trtcm(struct cdnr_modify_trtcm *); +static int cdnrcmd_tcm_stats(struct cdnr_tcm_stats *); +static int cdnrcmd_add_tswtcm(struct cdnr_add_tswtcm *); +static int cdnrcmd_modify_tswtcm(struct cdnr_modify_tswtcm *); +static int cdnrcmd_get_stats(struct cdnr_get_stats *); + +altqdev_decl(cdnr); + +/* + * top level input function called from ip_input. + * should be called before converting header fields to host-byte-order. + */ +int +altq_cdnr_input(m, af) + struct mbuf *m; + int af; /* address family */ +{ + struct ifnet *ifp; + struct ip *ip; + struct top_cdnr *top; + struct tc_action *tca; + struct cdnr_block *cb; + struct cdnr_pktinfo pktinfo; + + ifp = m->m_pkthdr.rcvif; + if (!ALTQ_IS_CNDTNING(&ifp->if_snd)) + /* traffic conditioner is not enabled on this interface */ + return (1); + + top = ifp->if_snd.altq_cdnr; + + ip = mtod(m, struct ip *); +#ifdef INET6 + if (af == AF_INET6) { + u_int32_t flowlabel; + + flowlabel = ((struct ip6_hdr *)ip)->ip6_flow; + pktinfo.pkt_dscp = (ntohl(flowlabel) >> 20) & DSCP_MASK; + } else +#endif + pktinfo.pkt_dscp = ip->ip_tos & DSCP_MASK; + pktinfo.pkt_len = m_pktlen(m); + + tca = NULL; + + cb = acc_classify(&top->tc_classifier, m, af); + if (cb != NULL) + tca = &cb->cb_action; + + if (tca == NULL) + tca = &top->tc_block.cb_action; + + while (1) { + PKTCNTR_ADD(&top->tc_cnts[tca->tca_code], pktinfo.pkt_len); + + switch (tca->tca_code) { + case TCACODE_PASS: + return (1); + case TCACODE_DROP: + m_freem(m); + return (0); + case TCACODE_RETURN: + return (0); + case TCACODE_MARK: +#ifdef INET6 + if (af == AF_INET6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; + 
u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + flowlabel = (tca->tca_dscp << 20) | + (flowlabel & ~(DSCP_MASK << 20)); + ip6->ip6_flow = htonl(flowlabel); + } else +#endif + ip->ip_tos = tca->tca_dscp | + (ip->ip_tos & DSCP_CUMASK); + return (1); + case TCACODE_NEXT: + cb = tca->tca_next; + tca = (*cb->cb_input)(cb, &pktinfo); + break; + case TCACODE_NONE: + default: + return (1); + } + } +} + +static struct top_cdnr * +tcb_lookup(ifname) + char *ifname; +{ + struct top_cdnr *top; + struct ifnet *ifp; + + if ((ifp = ifunit(ifname)) != NULL) + LIST_FOREACH(top, &tcb_list, tc_next) + if (top->tc_ifq->altq_ifp == ifp) + return (top); + return (NULL); +} + +static struct cdnr_block * +cdnr_handle2cb(handle) + u_long handle; +{ + struct cdnr_block *cb; + + cb = (struct cdnr_block *)handle; + if (handle != ALIGN(cb)) + return (NULL); + + if (cb == NULL || cb->cb_handle != handle) + return (NULL); + return (cb); +} + +static u_long +cdnr_cb2handle(cb) + struct cdnr_block *cb; +{ + return (cb->cb_handle); +} + +static void * +cdnr_cballoc(top, type, input_func) + struct top_cdnr *top; + int type; + struct tc_action *(*input_func)(struct cdnr_block *, + struct cdnr_pktinfo *); +{ + struct cdnr_block *cb; + int size; + + switch (type) { + case TCETYPE_TOP: + size = sizeof(struct top_cdnr); + break; + case TCETYPE_ELEMENT: + size = sizeof(struct cdnr_block); + break; + case TCETYPE_TBMETER: + size = sizeof(struct tbmeter); + break; + case TCETYPE_TRTCM: + size = sizeof(struct trtcm); + break; + case TCETYPE_TSWTCM: + size = sizeof(struct tswtcm); + break; + default: + return (NULL); + } + + cb = malloc(size, M_DEVBUF, M_WAITOK); + if (cb == NULL) + return (NULL); + bzero(cb, size); + + cb->cb_len = size; + cb->cb_type = type; + cb->cb_ref = 0; + cb->cb_handle = (u_long)cb; + if (top == NULL) + cb->cb_top = (struct top_cdnr *)cb; + else + cb->cb_top = top; + + if (input_func != NULL) { + /* + * if this cdnr has an action function, + * make tc_action to call 
itself. + */ + cb->cb_action.tca_code = TCACODE_NEXT; + cb->cb_action.tca_next = cb; + cb->cb_input = input_func; + } else + cb->cb_action.tca_code = TCACODE_NONE; + + /* if this isn't top, register the element to the top level cdnr */ + if (top != NULL) + LIST_INSERT_HEAD(&top->tc_elements, cb, cb_next); + + return ((void *)cb); +} + +static void +cdnr_cbdestroy(cblock) + void *cblock; +{ + struct cdnr_block *cb = cblock; + + /* delete filters belonging to this cdnr */ + acc_discard_filters(&cb->cb_top->tc_classifier, cb, 0); + + /* remove from the top level cdnr */ + if (cb->cb_top != cblock) + LIST_REMOVE(cb, cb_next); + + free(cb, M_DEVBUF); +} + +/* + * conditioner common destroy routine + */ +static int +generic_element_destroy(cb) + struct cdnr_block *cb; +{ + int error = 0; + + switch (cb->cb_type) { + case TCETYPE_TOP: + error = top_destroy((struct top_cdnr *)cb); + break; + case TCETYPE_ELEMENT: + error = element_destroy(cb); + break; + case TCETYPE_TBMETER: + error = tbm_destroy((struct tbmeter *)cb); + break; + case TCETYPE_TRTCM: + error = trtcm_destroy((struct trtcm *)cb); + break; + case TCETYPE_TSWTCM: + error = tswtcm_destroy((struct tswtcm *)cb); + break; + default: + error = EINVAL; + } + return (error); +} + +static int +tca_verify_action(utca) + struct tc_action *utca; +{ + switch (utca->tca_code) { + case TCACODE_PASS: + case TCACODE_DROP: + case TCACODE_MARK: + /* these are ok */ + break; + + case TCACODE_HANDLE: + /* verify handle value */ + if (cdnr_handle2cb(utca->tca_handle) == NULL) + return (-1); + break; + + case TCACODE_NONE: + case TCACODE_RETURN: + case TCACODE_NEXT: + default: + /* should not be passed from a user */ + return (-1); + } + return (0); +} + +static void +tca_import_action(ktca, utca) + struct tc_action *ktca, *utca; +{ + struct cdnr_block *cb; + + *ktca = *utca; + if (ktca->tca_code == TCACODE_HANDLE) { + cb = cdnr_handle2cb(ktca->tca_handle); + if (cb == NULL) { + ktca->tca_code = TCACODE_NONE; + return; + } + 
ktca->tca_code = TCACODE_NEXT; + ktca->tca_next = cb; + cb->cb_ref++; + } else if (ktca->tca_code == TCACODE_MARK) { + ktca->tca_dscp &= DSCP_MASK; + } + return; +} + +static void +tca_invalidate_action(tca) + struct tc_action *tca; +{ + struct cdnr_block *cb; + + if (tca->tca_code == TCACODE_NEXT) { + cb = tca->tca_next; + if (cb == NULL) + return; + cb->cb_ref--; + } + tca->tca_code = TCACODE_NONE; +} + +/* + * top level traffic conditioner + */ +static struct top_cdnr * +top_create(ifq) + struct ifaltq *ifq; +{ + struct top_cdnr *top; + + if ((top = cdnr_cballoc(NULL, TCETYPE_TOP, NULL)) == NULL) + return (NULL); + + top->tc_ifq = ifq; + /* set default action for the top level conditioner */ + top->tc_block.cb_action.tca_code = TCACODE_PASS; + + LIST_INSERT_HEAD(&tcb_list, top, tc_next); + + ifq->altq_cdnr = top; + + return (top); +} + +static int +top_destroy(top) + struct top_cdnr *top; +{ + struct cdnr_block *cb; + + if (ALTQ_IS_CNDTNING(top->tc_ifq)) + ALTQ_CLEAR_CNDTNING(top->tc_ifq); + top->tc_ifq->altq_cdnr = NULL; + + /* + * destroy all the conditioner elements belonging to this interface + */ + while ((cb = LIST_FIRST(&top->tc_elements)) != NULL) { + while (cb != NULL && cb->cb_ref > 0) + cb = LIST_NEXT(cb, cb_next); + if (cb != NULL) + generic_element_destroy(cb); + } + + LIST_REMOVE(top, tc_next); + + cdnr_cbdestroy(top); + + /* if there is no active conditioner, remove the input hook */ + if (altq_input != NULL) { + LIST_FOREACH(top, &tcb_list, tc_next) + if (ALTQ_IS_CNDTNING(top->tc_ifq)) + break; + if (top == NULL) + altq_input = NULL; + } + + return (0); +} + +/* + * simple tc elements without input function (e.g., dropper and makers). 
+ */ +static struct cdnr_block * +element_create(top, action) + struct top_cdnr *top; + struct tc_action *action; +{ + struct cdnr_block *cb; + + if (tca_verify_action(action) < 0) + return (NULL); + + if ((cb = cdnr_cballoc(top, TCETYPE_ELEMENT, NULL)) == NULL) + return (NULL); + + tca_import_action(&cb->cb_action, action); + + return (cb); +} + +static int +element_destroy(cb) + struct cdnr_block *cb; +{ + if (cb->cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&cb->cb_action); + + cdnr_cbdestroy(cb); + return (0); +} + +/* + * internal representation of token bucket parameters + * rate: byte_per_unittime << 32 + * (((bits_per_sec) / 8) << 32) / machclk_freq + * depth: byte << 32 + * + */ +#define TB_SHIFT 32 +#define TB_SCALE(x) ((u_int64_t)(x) << TB_SHIFT) +#define TB_UNSCALE(x) ((x) >> TB_SHIFT) + +static void +tb_import_profile(tb, profile) + struct tbe *tb; + struct tb_profile *profile; +{ + tb->rate = TB_SCALE(profile->rate / 8) / machclk_freq; + tb->depth = TB_SCALE(profile->depth); + if (tb->rate > 0) + tb->filluptime = tb->depth / tb->rate; + else + tb->filluptime = 0xffffffffffffffffLL; + tb->token = tb->depth; + tb->last = read_machclk(); +} + +/* + * simple token bucket meter + */ +static struct tbmeter * +tbm_create(top, profile, in_action, out_action) + struct top_cdnr *top; + struct tb_profile *profile; + struct tc_action *in_action, *out_action; +{ + struct tbmeter *tbm = NULL; + + if (tca_verify_action(in_action) < 0 + || tca_verify_action(out_action) < 0) + return (NULL); + + if ((tbm = cdnr_cballoc(top, TCETYPE_TBMETER, + tbm_input)) == NULL) + return (NULL); + + tb_import_profile(&tbm->tb, profile); + + tca_import_action(&tbm->in_action, in_action); + tca_import_action(&tbm->out_action, out_action); + + return (tbm); +} + +static int +tbm_destroy(tbm) + struct tbmeter *tbm; +{ + if (tbm->cdnrblk.cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&tbm->in_action); + tca_invalidate_action(&tbm->out_action); + + cdnr_cbdestroy(tbm); 
+ return (0); +} + +static struct tc_action * +tbm_input(cb, pktinfo) + struct cdnr_block *cb; + struct cdnr_pktinfo *pktinfo; +{ + struct tbmeter *tbm = (struct tbmeter *)cb; + u_int64_t len; + u_int64_t interval, now; + + len = TB_SCALE(pktinfo->pkt_len); + + if (tbm->tb.token < len) { + now = read_machclk(); + interval = now - tbm->tb.last; + if (interval >= tbm->tb.filluptime) + tbm->tb.token = tbm->tb.depth; + else { + tbm->tb.token += interval * tbm->tb.rate; + if (tbm->tb.token > tbm->tb.depth) + tbm->tb.token = tbm->tb.depth; + } + tbm->tb.last = now; + } + + if (tbm->tb.token < len) { + PKTCNTR_ADD(&tbm->out_cnt, pktinfo->pkt_len); + return (&tbm->out_action); + } + + tbm->tb.token -= len; + PKTCNTR_ADD(&tbm->in_cnt, pktinfo->pkt_len); + return (&tbm->in_action); +} + +/* + * two rate three color marker + * as described in draft-heinanen-diffserv-trtcm-01.txt + */ +static struct trtcm * +trtcm_create(top, cmtd_profile, peak_profile, + green_action, yellow_action, red_action, coloraware) + struct top_cdnr *top; + struct tb_profile *cmtd_profile, *peak_profile; + struct tc_action *green_action, *yellow_action, *red_action; + int coloraware; +{ + struct trtcm *tcm = NULL; + + if (tca_verify_action(green_action) < 0 + || tca_verify_action(yellow_action) < 0 + || tca_verify_action(red_action) < 0) + return (NULL); + + if ((tcm = cdnr_cballoc(top, TCETYPE_TRTCM, + trtcm_input)) == NULL) + return (NULL); + + tb_import_profile(&tcm->cmtd_tb, cmtd_profile); + tb_import_profile(&tcm->peak_tb, peak_profile); + + tca_import_action(&tcm->green_action, green_action); + tca_import_action(&tcm->yellow_action, yellow_action); + tca_import_action(&tcm->red_action, red_action); + + /* set dscps to use */ + if (tcm->green_action.tca_code == TCACODE_MARK) + tcm->green_dscp = tcm->green_action.tca_dscp & DSCP_MASK; + else + tcm->green_dscp = DSCP_AF11; + if (tcm->yellow_action.tca_code == TCACODE_MARK) + tcm->yellow_dscp = tcm->yellow_action.tca_dscp & DSCP_MASK; + else + 
tcm->yellow_dscp = DSCP_AF12; + if (tcm->red_action.tca_code == TCACODE_MARK) + tcm->red_dscp = tcm->red_action.tca_dscp & DSCP_MASK; + else + tcm->red_dscp = DSCP_AF13; + + tcm->coloraware = coloraware; + + return (tcm); +} + +static int +trtcm_destroy(tcm) + struct trtcm *tcm; +{ + if (tcm->cdnrblk.cb_ref > 0) + return (EBUSY); + + tca_invalidate_action(&tcm->green_action); + tca_invalidate_action(&tcm->yellow_action); + tca_invalidate_action(&tcm->red_action); + + cdnr_cbdestroy(tcm); + return (0); +} + +static struct tc_action * +trtcm_input(cb, pktinfo) + struct cdnr_block *cb; + struct cdnr_pktinfo *pktinfo; +{ + struct trtcm *tcm = (struct trtcm *)cb; + u_int64_t len; + u_int64_t interval, now; + u_int8_t color; + + len = TB_SCALE(pktinfo->pkt_len); + if (tcm->coloraware) { + color = pktinfo->pkt_dscp; + if (color != tcm->yellow_dscp && color != tcm->red_dscp) + color = tcm->green_dscp; + } else { + /* if color-blind, precolor it as green */ + color = tcm->green_dscp; + } + + now = read_machclk(); + if (tcm->cmtd_tb.token < len) { + interval = now - tcm->cmtd_tb.last; + if (interval >= tcm->cmtd_tb.filluptime) + tcm->cmtd_tb.token = tcm->cmtd_tb.depth; + else { + tcm->cmtd_tb.token += interval * tcm->cmtd_tb.rate; + if (tcm->cmtd_tb.token > tcm->cmtd_tb.depth) + tcm->cmtd_tb.token = tcm->cmtd_tb.depth; + } + tcm->cmtd_tb.last = now; + } + if (tcm->peak_tb.token < len) { + interval = now - tcm->peak_tb.last; + if (interval >= tcm->peak_tb.filluptime) + tcm->peak_tb.token = tcm->peak_tb.depth; + else { + tcm->peak_tb.token += interval * tcm->peak_tb.rate; + if (tcm->peak_tb.token > tcm->peak_tb.depth) + tcm->peak_tb.token = tcm->peak_tb.depth; + } + tcm->peak_tb.last = now; + } + + if (color == tcm->red_dscp || tcm->peak_tb.token < len) { + pktinfo->pkt_dscp = tcm->red_dscp; + PKTCNTR_ADD(&tcm->red_cnt, pktinfo->pkt_len); + return (&tcm->red_action); + } + + if (color == tcm->yellow_dscp || tcm->cmtd_tb.token < len) { + pktinfo->pkt_dscp = tcm->yellow_dscp; + 
tcm->peak_tb.token -= len; + PKTCNTR_ADD(&tcm->yellow_cnt, pktinfo->pkt_len); + return (&tcm->yellow_action); + } + + pktinfo->pkt_dscp = tcm->green_dscp; + tcm->cmtd_tb.token -= len; + tcm->peak_tb.token -= len; + PKTCNTR_ADD(&tcm->green_cnt, pktinfo->pkt_len); + return (&tcm->green_action); +} + +/* + * time sliding window three color marker + * as described in draft-fang-diffserv-tc-tswtcm-00.txt + */ +static struct tswtcm * +tswtcm_create(top, cmtd_rate, peak_rate, avg_interval, + green_action, yellow_action, red_action) + struct top_cdnr *top; + u_int32_t cmtd_rate, peak_rate, avg_interval; + struct tc_action *green_action, *yellow_action, *red_action; +{ + struct tswtcm *tsw; + + if (tca_verify_action(green_action) < 0 + || tca_verify_action(yellow_action) < 0 + || tca_verify_action(red_action) < 0) + return (NULL); + + if ((tsw = cdnr_cballoc(top, TCETYPE_TSWTCM, + tswtcm_input)) == NULL) + return (NULL); + + tca_import_action(&tsw->green_action, green_action); + tca_import_action(&tsw->yellow_action, yellow_action); + tca_import_action(&tsw->red_action, red_action); + + /* set dscps to use */ + if (tsw->green_action.tca_code == TCACODE_MARK) + tsw->green_dscp = tsw->green_action.tca_dscp & DSCP_MASK; + else + tsw->green_dscp = DSCP_AF11; + if (tsw->yellow_action.tca_code == TCACODE_MARK) + tsw->yellow_dscp = tsw->yellow_action.tca_dscp & DSCP_MASK; + else + tsw->yellow_dscp = DSCP_AF12; + if (tsw->red_action.tca_code == TCACODE_MARK) + tsw->red_dscp = tsw->red_action.tca_dscp & DSCP_MASK; + else + tsw->red_dscp = DSCP_AF13; + + /* convert rates from bits/sec to bytes/sec */ + tsw->cmtd_rate = cmtd_rate / 8; + tsw->peak_rate = peak_rate / 8; + tsw->avg_rate = 0; + + /* timewin is converted from msec to machine clock unit */ + tsw->timewin = (u_int64_t)machclk_freq * avg_interval / 1000; + + return (tsw); +} + +static int +tswtcm_destroy(tsw) + struct tswtcm *tsw; +{ + if (tsw->cdnrblk.cb_ref > 0) + return (EBUSY); + + 
tca_invalidate_action(&tsw->green_action); + tca_invalidate_action(&tsw->yellow_action); + tca_invalidate_action(&tsw->red_action); + + cdnr_cbdestroy(tsw); + return (0); +} + +static struct tc_action * +tswtcm_input(cb, pktinfo) + struct cdnr_block *cb; + struct cdnr_pktinfo *pktinfo; +{ + struct tswtcm *tsw = (struct tswtcm *)cb; + int len; + u_int32_t avg_rate; + u_int64_t interval, now, tmp; + + /* + * rate estimator + */ + len = pktinfo->pkt_len; + now = read_machclk(); + + interval = now - tsw->t_front; + /* + * calculate average rate: + * avg = (avg * timewin + pkt_len)/(timewin + interval) + * pkt_len needs to be multiplied by machclk_freq in order to + * get (bytes/sec). + * note: when avg_rate (bytes/sec) and timewin (machclk unit) are + * less than 32 bits, the following 64-bit operation has enough + * precision. + */ + tmp = ((u_int64_t)tsw->avg_rate * tsw->timewin + + (u_int64_t)len * machclk_freq) / (tsw->timewin + interval); + tsw->avg_rate = avg_rate = (u_int32_t)tmp; + tsw->t_front = now; + + /* + * marker + */ + if (avg_rate > tsw->cmtd_rate) { + u_int32_t randval = arc4random() % avg_rate; + + if (avg_rate > tsw->peak_rate) { + if (randval < avg_rate - tsw->peak_rate) { + /* mark red */ + pktinfo->pkt_dscp = tsw->red_dscp; + PKTCNTR_ADD(&tsw->red_cnt, len); + return (&tsw->red_action); + } else if (randval < avg_rate - tsw->cmtd_rate) + goto mark_yellow; + } else { + /* peak_rate >= avg_rate > cmtd_rate */ + if (randval < avg_rate - tsw->cmtd_rate) { + mark_yellow: + pktinfo->pkt_dscp = tsw->yellow_dscp; + PKTCNTR_ADD(&tsw->yellow_cnt, len); + return (&tsw->yellow_action); + } + } + } + + /* mark green */ + pktinfo->pkt_dscp = tsw->green_dscp; + PKTCNTR_ADD(&tsw->green_cnt, len); + return (&tsw->green_action); +} + +/* + * ioctl requests + */ +static int +cdnrcmd_if_attach(ifname) + char *ifname; +{ + struct ifnet *ifp; + struct top_cdnr *top; + + if ((ifp = ifunit(ifname)) == NULL) + return (EBADF); + + if (ifp->if_snd.altq_cdnr != NULL) + 
return (EBUSY); + + if ((top = top_create(&ifp->if_snd)) == NULL) + return (ENOMEM); + return (0); +} + +static int +cdnrcmd_if_detach(ifname) + char *ifname; +{ + struct top_cdnr *top; + + if ((top = tcb_lookup(ifname)) == NULL) + return (EBADF); + + return top_destroy(top); +} + +static int +cdnrcmd_add_element(ap) + struct cdnr_add_element *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + cb = element_create(top, &ap->action); + if (cb == NULL) + return (EINVAL); + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(cb); + return (0); +} + +static int +cdnrcmd_delete_element(ap) + struct cdnr_delete_element *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + if (cb->cb_type != TCETYPE_ELEMENT) + return generic_element_destroy(cb); + + return element_destroy(cb); +} + +static int +cdnrcmd_add_filter(ap) + struct cdnr_add_filter *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + return acc_add_filter(&top->tc_classifier, &ap->filter, + cb, &ap->filter_handle); +} + +static int +cdnrcmd_delete_filter(ap) + struct cdnr_delete_filter *ap; +{ + struct top_cdnr *top; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + return acc_delete_filter(&top->tc_classifier, ap->filter_handle); +} + +static int +cdnrcmd_add_tbm(ap) + struct cdnr_add_tbmeter *ap; +{ + struct top_cdnr *top; + struct tbmeter *tbm; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + tbm = tbm_create(top, &ap->profile, &ap->in_action, &ap->out_action); + if (tbm == NULL) + return (EINVAL); + /* return a class handle to 
the user */ + ap->cdnr_handle = cdnr_cb2handle(&tbm->cdnrblk); + return (0); +} + +static int +cdnrcmd_modify_tbm(ap) + struct cdnr_modify_tbmeter *ap; +{ + struct tbmeter *tbm; + + if ((tbm = (struct tbmeter *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + tb_import_profile(&tbm->tb, &ap->profile); + + return (0); +} + +static int +cdnrcmd_tbm_stats(ap) + struct cdnr_tbmeter_stats *ap; +{ + struct tbmeter *tbm; + + if ((tbm = (struct tbmeter *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + ap->in_cnt = tbm->in_cnt; + ap->out_cnt = tbm->out_cnt; + + return (0); +} + +static int +cdnrcmd_add_trtcm(ap) + struct cdnr_add_trtcm *ap; +{ + struct top_cdnr *top; + struct trtcm *tcm; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + tcm = trtcm_create(top, &ap->cmtd_profile, &ap->peak_profile, + &ap->green_action, &ap->yellow_action, + &ap->red_action, ap->coloraware); + if (tcm == NULL) + return (EINVAL); + + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(&tcm->cdnrblk); + return (0); +} + +static int +cdnrcmd_modify_trtcm(ap) + struct cdnr_modify_trtcm *ap; +{ + struct trtcm *tcm; + + if ((tcm = (struct trtcm *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + tb_import_profile(&tcm->cmtd_tb, &ap->cmtd_profile); + tb_import_profile(&tcm->peak_tb, &ap->peak_profile); + + return (0); +} + +static int +cdnrcmd_tcm_stats(ap) + struct cdnr_tcm_stats *ap; +{ + struct cdnr_block *cb; + + if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + if (cb->cb_type == TCETYPE_TRTCM) { + struct trtcm *tcm = (struct trtcm *)cb; + + ap->green_cnt = tcm->green_cnt; + ap->yellow_cnt = tcm->yellow_cnt; + ap->red_cnt = tcm->red_cnt; + } else if (cb->cb_type == TCETYPE_TSWTCM) { + struct tswtcm *tsw = (struct tswtcm *)cb; + + ap->green_cnt = tsw->green_cnt; + ap->yellow_cnt = tsw->yellow_cnt; + ap->red_cnt = tsw->red_cnt; + } else + return (EINVAL); + + return (0); 
+} + +static int +cdnrcmd_add_tswtcm(ap) + struct cdnr_add_tswtcm *ap; +{ + struct top_cdnr *top; + struct tswtcm *tsw; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + if (ap->cmtd_rate > ap->peak_rate) + return (EINVAL); + + tsw = tswtcm_create(top, ap->cmtd_rate, ap->peak_rate, + ap->avg_interval, &ap->green_action, + &ap->yellow_action, &ap->red_action); + if (tsw == NULL) + return (EINVAL); + + /* return a class handle to the user */ + ap->cdnr_handle = cdnr_cb2handle(&tsw->cdnrblk); + return (0); +} + +static int +cdnrcmd_modify_tswtcm(ap) + struct cdnr_modify_tswtcm *ap; +{ + struct tswtcm *tsw; + + if ((tsw = (struct tswtcm *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) + return (EINVAL); + + if (ap->cmtd_rate > ap->peak_rate) + return (EINVAL); + + /* convert rates from bits/sec to bytes/sec */ + tsw->cmtd_rate = ap->cmtd_rate / 8; + tsw->peak_rate = ap->peak_rate / 8; + tsw->avg_rate = 0; + + /* timewin is converted from msec to machine clock unit */ + tsw->timewin = (u_int64_t)machclk_freq * ap->avg_interval / 1000; + + return (0); +} + +static int +cdnrcmd_get_stats(ap) + struct cdnr_get_stats *ap; +{ + struct top_cdnr *top; + struct cdnr_block *cb; + struct tbmeter *tbm; + struct trtcm *tcm; + struct tswtcm *tsw; + struct tce_stats tce, *usp; + int error, n, nskip, nelements; + + if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) + return (EBADF); + + /* copy action stats */ + bcopy(top->tc_cnts, ap->cnts, sizeof(ap->cnts)); + + /* stats for each element */ + nelements = ap->nelements; + usp = ap->tce_stats; + if (nelements <= 0 || usp == NULL) + return (0); + + nskip = ap->nskip; + n = 0; + LIST_FOREACH(cb, &top->tc_elements, cb_next) { + if (nskip > 0) { + nskip--; + continue; + } + + bzero(&tce, sizeof(tce)); + tce.tce_handle = cb->cb_handle; + tce.tce_type = cb->cb_type; + switch (cb->cb_type) { + case TCETYPE_TBMETER: + tbm = (struct tbmeter *)cb; + tce.tce_cnts[0] = tbm->in_cnt; + tce.tce_cnts[1] = tbm->out_cnt; + 
break; + case TCETYPE_TRTCM: + tcm = (struct trtcm *)cb; + tce.tce_cnts[0] = tcm->green_cnt; + tce.tce_cnts[1] = tcm->yellow_cnt; + tce.tce_cnts[2] = tcm->red_cnt; + break; + case TCETYPE_TSWTCM: + tsw = (struct tswtcm *)cb; + tce.tce_cnts[0] = tsw->green_cnt; + tce.tce_cnts[1] = tsw->yellow_cnt; + tce.tce_cnts[2] = tsw->red_cnt; + break; + default: + continue; + } + + if ((error = copyout((caddr_t)&tce, (caddr_t)usp++, + sizeof(tce))) != 0) + return (error); + + if (++n == nelements) + break; + } + ap->nelements = n; + + return (0); +} + +/* + * conditioner device interface + */ +int +cdnropen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + if (machclk_freq == 0) + init_machclk(); + + if (machclk_freq == 0) { + printf("cdnr: no cpu clock available!\n"); + return (ENXIO); + } + + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +cdnrclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct top_cdnr *top; + int err, error = 0; + + while ((top = LIST_FIRST(&tcb_list)) != NULL) { + /* destroy all */ + err = top_destroy(top); + if (err != 0 && error == 0) + error = err; + } + altq_input = NULL; + + return (error); +} + +int +cdnrioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct top_cdnr *top; + struct cdnr_interface *ifacep; + int s, error = 0; + + /* check super-user privilege */ + switch (cmd) { + case CDNR_GETSTATS: + break; + default: +#if (__FreeBSD_version > 700000) + if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) +#elif (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) +#endif + return (error); + break; + } + + s = splnet(); 
+ switch (cmd) { + + case CDNR_IF_ATTACH: + ifacep = (struct cdnr_interface *)addr; + error = cdnrcmd_if_attach(ifacep->cdnr_ifname); + break; + + case CDNR_IF_DETACH: + ifacep = (struct cdnr_interface *)addr; + error = cdnrcmd_if_detach(ifacep->cdnr_ifname); + break; + + case CDNR_ENABLE: + case CDNR_DISABLE: + ifacep = (struct cdnr_interface *)addr; + if ((top = tcb_lookup(ifacep->cdnr_ifname)) == NULL) { + error = EBADF; + break; + } + + switch (cmd) { + + case CDNR_ENABLE: + ALTQ_SET_CNDTNING(top->tc_ifq); + if (altq_input == NULL) + altq_input = altq_cdnr_input; + break; + + case CDNR_DISABLE: + ALTQ_CLEAR_CNDTNING(top->tc_ifq); + LIST_FOREACH(top, &tcb_list, tc_next) + if (ALTQ_IS_CNDTNING(top->tc_ifq)) + break; + if (top == NULL) + altq_input = NULL; + break; + } + break; + + case CDNR_ADD_ELEM: + error = cdnrcmd_add_element((struct cdnr_add_element *)addr); + break; + + case CDNR_DEL_ELEM: + error = cdnrcmd_delete_element((struct cdnr_delete_element *)addr); + break; + + case CDNR_ADD_TBM: + error = cdnrcmd_add_tbm((struct cdnr_add_tbmeter *)addr); + break; + + case CDNR_MOD_TBM: + error = cdnrcmd_modify_tbm((struct cdnr_modify_tbmeter *)addr); + break; + + case CDNR_TBM_STATS: + error = cdnrcmd_tbm_stats((struct cdnr_tbmeter_stats *)addr); + break; + + case CDNR_ADD_TCM: + error = cdnrcmd_add_trtcm((struct cdnr_add_trtcm *)addr); + break; + + case CDNR_MOD_TCM: + error = cdnrcmd_modify_trtcm((struct cdnr_modify_trtcm *)addr); + break; + + case CDNR_TCM_STATS: + error = cdnrcmd_tcm_stats((struct cdnr_tcm_stats *)addr); + break; + + case CDNR_ADD_FILTER: + error = cdnrcmd_add_filter((struct cdnr_add_filter *)addr); + break; + + case CDNR_DEL_FILTER: + error = cdnrcmd_delete_filter((struct cdnr_delete_filter *)addr); + break; + + case CDNR_GETSTATS: + error = cdnrcmd_get_stats((struct cdnr_get_stats *)addr); + break; + + case CDNR_ADD_TSW: + error = cdnrcmd_add_tswtcm((struct cdnr_add_tswtcm *)addr); + break; + + case CDNR_MOD_TSW: + error = 
cdnrcmd_modify_tswtcm((struct cdnr_modify_tswtcm *)addr); + break; + + default: + error = EINVAL; + break; + } + splx(s); + + return error; +} + +#ifdef KLD_MODULE + +static struct altqsw cdnr_sw = + {"cdnr", cdnropen, cdnrclose, cdnrioctl}; + +ALTQ_MODULE(altq_cdnr, ALTQT_CDNR, &cdnr_sw); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ3_COMPAT */ +#endif /* ALTQ_CDNR */ diff --git a/freebsd/sys/net/altq/altq_cdnr.h b/freebsd/sys/net/altq/altq_cdnr.h new file mode 100644 index 00000000..06fa9c98 --- /dev/null +++ b/freebsd/sys/net/altq/altq_cdnr.h @@ -0,0 +1,336 @@ +/*- + * Copyright (C) 1999-2002 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $KAME: altq_cdnr.h,v 1.9 2003/07/10 12:07:48 kjc Exp $ + * $FreeBSD$ + */ + +#ifndef _ALTQ_ALTQ_CDNR_H_ +#define _ALTQ_ALTQ_CDNR_H_ + +#include <net/altq/altq.h> + +/* + * traffic conditioner element types + */ +#define TCETYPE_NONE 0 +#define TCETYPE_TOP 1 /* top level conditioner */ +#define TCETYPE_ELEMENT 2 /* a simple tc element */ +#define TCETYPE_TBMETER 3 /* token bucket meter */ +#define TCETYPE_TRTCM 4 /* (two-rate) three color marker */ +#define TCETYPE_TSWTCM 5 /* time sliding window 3-color marker */ + +/* + * traffic conditioner action + */ +struct cdnr_block; + +struct tc_action { + int tca_code; /* e.g., TCACODE_PASS */ + /* tca_code dependent variable */ + union { + u_long un_value; /* template */ + u_int8_t un_dscp; /* diffserv code point */ + u_long un_handle; /* tc action handle */ + struct cdnr_block *un_next; /* next tc element block */ + } tca_un; +}; +#define tca_value tca_un.un_value +#define tca_dscp tca_un.un_dscp +#define tca_handle tca_un.un_handle +#define tca_next tca_un.un_next + +#define TCACODE_NONE 0 /* action is not set */ +#define TCACODE_PASS 1 /* pass this packet */ +#define TCACODE_DROP 2 /* discard this packet */ +#define TCACODE_RETURN 3 /* do not process this packet */ +#define TCACODE_MARK 4 /* mark dscp */ +#define TCACODE_HANDLE 5 /* take action specified by handle */ +#define TCACODE_NEXT 6 /* take action in the next tc element */ +#define TCACODE_MAX 6 + +#define CDNR_NULL_HANDLE 0 + +struct cdnr_interface { + char cdnr_ifname[IFNAMSIZ]; /* interface name (e.g., fxp0) */ +}; + +/* simple element operations */ +struct cdnr_add_element { + struct cdnr_interface iface; + struct tc_action action; + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_delete_element { + struct cdnr_interface iface; + u_long cdnr_handle; +}; + +/* token-bucket meter operations */ +struct cdnr_add_tbmeter { + struct cdnr_interface iface; + struct tb_profile profile; + struct tc_action in_action; + struct tc_action out_action; + 
+ u_long cdnr_handle; /* return value */ +}; + +struct cdnr_modify_tbmeter { + struct cdnr_interface iface; + u_long cdnr_handle; + struct tb_profile profile; +}; + +struct cdnr_tbmeter_stats { + struct cdnr_interface iface; + u_long cdnr_handle; + struct pktcntr in_cnt; + struct pktcntr out_cnt; +}; + +/* two-rate three-color marker operations */ +struct cdnr_add_trtcm { + struct cdnr_interface iface; + struct tb_profile cmtd_profile; /* profile for committed tb */ + struct tb_profile peak_profile; /* profile for peak tb */ + struct tc_action green_action; /* action for green packets */ + struct tc_action yellow_action; /* action for yellow packets */ + struct tc_action red_action; /* action for red packets */ + int coloraware; /* color-aware/color-blind */ + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_modify_trtcm { + struct cdnr_interface iface; + u_long cdnr_handle; + struct tb_profile cmtd_profile; /* profile for committed tb */ + struct tb_profile peak_profile; /* profile for peak tb */ + int coloraware; /* color-aware/color-blind */ +}; + +struct cdnr_tcm_stats { + struct cdnr_interface iface; + u_long cdnr_handle; + struct pktcntr green_cnt; + struct pktcntr yellow_cnt; + struct pktcntr red_cnt; +}; + +/* time sliding window three-color marker operations */ +struct cdnr_add_tswtcm { + struct cdnr_interface iface; + u_int32_t cmtd_rate; /* committed rate (bits/sec) */ + u_int32_t peak_rate; /* peak rate (bits/sec) */ + u_int32_t avg_interval; /* averaging interval (msec) */ + struct tc_action green_action; /* action for green packets */ + struct tc_action yellow_action; /* action for yellow packets */ + struct tc_action red_action; /* action for red packets */ + + u_long cdnr_handle; /* return value */ +}; + +struct cdnr_modify_tswtcm { + struct cdnr_interface iface; + u_long cdnr_handle; + u_int32_t cmtd_rate; /* committed rate (bits/sec) */ + u_int32_t peak_rate; /* peak rate (bits/sec) */ + u_int32_t avg_interval; /* averaging interval 
(msec) */ +}; + +struct cdnr_add_filter { + struct cdnr_interface iface; + u_long cdnr_handle; +#ifdef ALTQ3_CLFIER_COMPAT + struct flow_filter filter; +#endif + u_long filter_handle; /* return value */ +}; + +struct cdnr_delete_filter { + struct cdnr_interface iface; + u_long filter_handle; +}; + +struct tce_stats { + u_long tce_handle; /* tc element handle */ + int tce_type; /* e.g., TCETYPE_ELEMENT */ + struct pktcntr tce_cnts[3]; /* tcm returns 3 counters */ +}; + +struct cdnr_get_stats { + struct cdnr_interface iface; + struct pktcntr cnts[TCACODE_MAX+1]; + + /* element stats */ + int nskip; /* skip # of elements */ + int nelements; /* # of element stats (WR) */ + struct tce_stats *tce_stats; /* pointer to stats array */ +}; + +#define CDNR_IF_ATTACH _IOW('Q', 1, struct cdnr_interface) +#define CDNR_IF_DETACH _IOW('Q', 2, struct cdnr_interface) +#define CDNR_ENABLE _IOW('Q', 3, struct cdnr_interface) +#define CDNR_DISABLE _IOW('Q', 4, struct cdnr_interface) +#define CDNR_ADD_FILTER _IOWR('Q', 10, struct cdnr_add_filter) +#define CDNR_DEL_FILTER _IOW('Q', 11, struct cdnr_delete_filter) +#define CDNR_GETSTATS _IOWR('Q', 12, struct cdnr_get_stats) +#define CDNR_ADD_ELEM _IOWR('Q', 30, struct cdnr_add_element) +#define CDNR_DEL_ELEM _IOW('Q', 31, struct cdnr_delete_element) +#define CDNR_ADD_TBM _IOWR('Q', 32, struct cdnr_add_tbmeter) +#define CDNR_MOD_TBM _IOW('Q', 33, struct cdnr_modify_tbmeter) +#define CDNR_TBM_STATS _IOWR('Q', 34, struct cdnr_tbmeter_stats) +#define CDNR_ADD_TCM _IOWR('Q', 35, struct cdnr_add_trtcm) +#define CDNR_MOD_TCM _IOWR('Q', 36, struct cdnr_modify_trtcm) +#define CDNR_TCM_STATS _IOWR('Q', 37, struct cdnr_tcm_stats) +#define CDNR_ADD_TSW _IOWR('Q', 38, struct cdnr_add_tswtcm) +#define CDNR_MOD_TSW _IOWR('Q', 39, struct cdnr_modify_tswtcm) + +#ifndef DSCP_EF +/* diffserv code points */ +#define DSCP_MASK 0xfc +#define DSCP_CUMASK 0x03 +#define DSCP_EF 0xb8 +#define DSCP_AF11 0x28 +#define DSCP_AF12 0x30 +#define DSCP_AF13 0x38 +#define 
DSCP_AF21 0x48 +#define DSCP_AF22 0x50 +#define DSCP_AF23 0x58 +#define DSCP_AF31 0x68 +#define DSCP_AF32 0x70 +#define DSCP_AF33 0x78 +#define DSCP_AF41 0x88 +#define DSCP_AF42 0x90 +#define DSCP_AF43 0x98 +#define AF_CLASSMASK 0xe0 +#define AF_DROPPRECMASK 0x18 +#endif + +#ifdef _KERNEL + +/* + * packet information passed to the input function of tc elements + */ +struct cdnr_pktinfo { + int pkt_len; /* packet length */ + u_int8_t pkt_dscp; /* diffserv code point */ +}; + +/* + * traffic conditioner control block common to all types of tc elements + */ +struct cdnr_block { + LIST_ENTRY(cdnr_block) cb_next; + int cb_len; /* size of this tc element */ + int cb_type; /* cdnr block type */ + int cb_ref; /* reference count of this element */ + u_long cb_handle; /* handle of this tc element */ + struct top_cdnr *cb_top; /* back pointer to top */ + struct tc_action cb_action; /* top level action for this tcb */ + struct tc_action *(*cb_input)(struct cdnr_block *, + struct cdnr_pktinfo *); +}; + +/* + * top level traffic conditioner structure for an interface + */ +struct top_cdnr { + struct cdnr_block tc_block; + + LIST_ENTRY(top_cdnr) tc_next; + struct ifaltq *tc_ifq; + + LIST_HEAD(, cdnr_block) tc_elements; +#ifdef ALTQ3_CLFIER_COMPAT + struct acc_classifier tc_classifier; +#endif + struct pktcntr tc_cnts[TCACODE_MAX+1]; +}; + +/* token bucket element */ +struct tbe { + u_int64_t rate; + u_int64_t depth; + + u_int64_t token; + u_int64_t filluptime; + u_int64_t last; +}; + +/* token bucket meter structure */ +struct tbmeter { + struct cdnr_block cdnrblk; /* conditioner block */ + struct tbe tb; /* token bucket */ + struct tc_action in_action; /* actions for IN/OUT */ + struct tc_action out_action; /* actions for IN/OUT */ + struct pktcntr in_cnt; /* statistics for IN/OUT */ + struct pktcntr out_cnt; /* statistics for IN/OUT */ +}; + +/* two-rate three-color marker structure */ +struct trtcm { + struct cdnr_block cdnrblk; /* conditioner block */ + struct tbe cmtd_tb; /* 
committed tb profile */ + struct tbe peak_tb; /* peak tb profile */ + struct tc_action green_action; + struct tc_action yellow_action; + struct tc_action red_action; + int coloraware; + u_int8_t green_dscp; + u_int8_t yellow_dscp; + u_int8_t red_dscp; + struct pktcntr green_cnt; + struct pktcntr yellow_cnt; + struct pktcntr red_cnt; +}; + +/* time sliding window three-color marker structure */ +struct tswtcm { + struct cdnr_block cdnrblk; /* conditioner block */ + + u_int32_t avg_rate; /* average rate (bytes/sec) */ + u_int64_t t_front; /* timestamp of last update */ + + u_int64_t timewin; /* average interval */ + u_int32_t cmtd_rate; /* committed target rate */ + u_int32_t peak_rate; /* peak target rate */ + struct tc_action green_action; + struct tc_action yellow_action; + struct tc_action red_action; + u_int8_t green_dscp; + u_int8_t yellow_dscp; + u_int8_t red_dscp; + struct pktcntr green_cnt; + struct pktcntr yellow_cnt; + struct pktcntr red_cnt; +}; + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_CDNR_H_ */ diff --git a/freebsd/sys/net/altq/altq_classq.h b/freebsd/sys/net/altq/altq_classq.h new file mode 100644 index 00000000..dc465a0b --- /dev/null +++ b/freebsd/sys/net/altq/altq_classq.h @@ -0,0 +1,213 @@ +/*- + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $KAME: altq_classq.h,v 1.6 2003/01/07 07:33:38 kjc Exp $ + * $FreeBSD$ + */ +/* + * class queue definitions extracted from rm_class.h. + */ +#ifndef _ALTQ_ALTQ_CLASSQ_H_ +#define _ALTQ_ALTQ_CLASSQ_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Packet Queue types: RED or DROPHEAD. + */ +#define Q_DROPHEAD 0x00 +#define Q_RED 0x01 +#define Q_RIO 0x02 +#define Q_DROPTAIL 0x03 +#define Q_CODEL 0x04 + +#ifdef _KERNEL + +/* + * Packet Queue structures and macros to manipulate them. 
+ */ +struct _class_queue_ { + struct mbuf *tail_; /* Tail of packet queue */ + int qlen_; /* Queue length (in number of packets) */ + int qlim_; /* Queue limit (in number of packets*) */ + int qsize_; /* Queue size (in number of bytes*) */ + int qtype_; /* Queue type */ +}; + +typedef struct _class_queue_ class_queue_t; + +#define qtype(q) (q)->qtype_ /* Get queue type */ +#define qlimit(q) (q)->qlim_ /* Max packets to be queued */ +#define qlen(q) (q)->qlen_ /* Current queue length. */ +#define qsize(q) (q)->qsize_ /* Current queue size. */ +#define qtail(q) (q)->tail_ /* Tail of the queue */ +#define qhead(q) ((q)->tail_ ? (q)->tail_->m_nextpkt : NULL) + +#define qempty(q) ((q)->qlen_ == 0) /* Is the queue empty?? */ +#define q_is_codel(q) ((q)->qtype_ == Q_CODEL) /* Is the queue a codel queue */ +#define q_is_red(q) ((q)->qtype_ == Q_RED) /* Is the queue a red queue */ +#define q_is_rio(q) ((q)->qtype_ == Q_RIO) /* Is the queue a rio queue */ +#define q_is_red_or_rio(q) ((q)->qtype_ == Q_RED || (q)->qtype_ == Q_RIO) + +#if !defined(__GNUC__) || defined(ALTQ_DEBUG) + +extern void _addq(class_queue_t *, struct mbuf *); +extern struct mbuf *_getq(class_queue_t *); +extern struct mbuf *_getq_tail(class_queue_t *); +extern struct mbuf *_getq_random(class_queue_t *); +extern void _removeq(class_queue_t *, struct mbuf *); +extern void _flushq(class_queue_t *); + +#else /* __GNUC__ && !ALTQ_DEBUG */ +/* + * inlined versions + */ +static __inline void +_addq(class_queue_t *q, struct mbuf *m) +{ + struct mbuf *m0; + + if ((m0 = qtail(q)) != NULL) + m->m_nextpkt = m0->m_nextpkt; + else + m0 = m; + m0->m_nextpkt = m; + qtail(q) = m; + qlen(q)++; + qsize(q) += m_pktlen(m); +} + +static __inline struct mbuf * +_getq(class_queue_t *q) +{ + struct mbuf *m, *m0; + + if ((m = qtail(q)) == NULL) + return (NULL); + if ((m0 = m->m_nextpkt) != m) + m->m_nextpkt = m0->m_nextpkt; + else + qtail(q) = NULL; + qlen(q)--; + qsize(q) -= m_pktlen(m0); + m0->m_nextpkt = NULL; + return (m0); 
+} + +/* drop a packet at the tail of the queue */ +static __inline struct mbuf * +_getq_tail(class_queue_t *q) +{ + struct mbuf *m, *m0, *prev; + + if ((m = m0 = qtail(q)) == NULL) + return NULL; + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else + qtail(q) = prev; + qlen(q)--; + m->m_nextpkt = NULL; + return (m); +} + +/* randomly select a packet in the queue */ +static __inline struct mbuf * +_getq_random(class_queue_t *q) +{ + struct mbuf *m; + int i, n; + + if ((m = qtail(q)) == NULL) + return NULL; + if (m->m_nextpkt == m) + qtail(q) = NULL; + else { + struct mbuf *prev = NULL; + + n = random() % qlen(q) + 1; + for (i = 0; i < n; i++) { + prev = m; + m = m->m_nextpkt; + } + prev->m_nextpkt = m->m_nextpkt; + if (m == qtail(q)) + qtail(q) = prev; + } + qlen(q)--; + m->m_nextpkt = NULL; + return (m); +} + +static __inline void +_removeq(class_queue_t *q, struct mbuf *m) +{ + struct mbuf *m0, *prev; + + m0 = qtail(q); + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else if (qtail(q) == m) + qtail(q) = prev; + qlen(q)--; +} + +static __inline void +_flushq(class_queue_t *q) +{ + struct mbuf *m; + + while ((m = _getq(q)) != NULL) + m_freem(m); +} + +#endif /* __GNUC__ && !ALTQ_DEBUG */ + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_CLASSQ_H_ */ diff --git a/freebsd/sys/net/altq/altq_codel.c b/freebsd/sys/net/altq/altq_codel.c new file mode 100644 index 00000000..438120f5 --- /dev/null +++ b/freebsd/sys/net/altq/altq_codel.c @@ -0,0 +1,479 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/* + * CoDel - The Controlled-Delay Active Queue Management algorithm + * + * Copyright (C) 2013 Ermal Luçi <eri@FreeBSD.org> + * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com> + * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net> + * Copyright (C) 2012 
Michael D. Taht <dave.taht@bufferbloat.net> + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * + * $FreeBSD$ + */ +#include <rtems/bsd/local/opt_altq.h> +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + +#ifdef ALTQ_CODEL /* CoDel is enabled by ALTQ_CODEL option in opt_altq.h */ + +#include <rtems/bsd/sys/param.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/systm.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <netinet/in.h> + +#include <netpfil/pf/pf.h> +#include <netpfil/pf/pf_altq.h> +#include <net/altq/if_altq.h> +#include <net/altq/altq.h> +#include <net/altq/altq_codel.h> + +static int codel_should_drop(struct codel *, class_queue_t *, + struct mbuf *, u_int64_t); +static void codel_Newton_step(struct codel_vars *); +static u_int64_t codel_control_law(u_int64_t t, u_int64_t, u_int32_t); + +#define codel_time_after(a, b) ((int64_t)(a) - (int64_t)(b) > 0) +#define codel_time_after_eq(a, b) ((int64_t)(a) - (int64_t)(b) >= 0) +#define codel_time_before(a, b) ((int64_t)(a) - (int64_t)(b) < 0) +#define codel_time_before_eq(a, b) ((int64_t)(a) - (int64_t)(b) <= 0) + +static int codel_request(struct ifaltq *, int, void *); + +static int codel_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +static struct mbuf *codel_dequeue(struct ifaltq *, int); + +int +codel_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); + + return (altq_attach(&ifp->if_snd, ALTQT_CODEL, a->altq_disc, + codel_enqueue, codel_dequeue, codel_request, NULL, NULL)); +} + +int +codel_add_altq(struct pf_altq *a) +{ + struct codel_if *cif; + struct ifnet *ifp; + struct codel_opts *opts; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENODEV); + + opts = &a->pq_u.codel_opts; + + cif = malloc(sizeof(struct codel_if), M_DEVBUF, M_NOWAIT | M_ZERO); + if (cif == NULL) + return (ENOMEM); + cif->cif_bandwidth = a->ifbandwidth; + cif->cif_ifq = 
&ifp->if_snd; + + cif->cl_q = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO); + if (cif->cl_q == NULL) { + free(cif, M_DEVBUF); + return (ENOMEM); + } + + if (a->qlimit == 0) + a->qlimit = 50; /* use default. */ + qlimit(cif->cl_q) = a->qlimit; + qtype(cif->cl_q) = Q_CODEL; + qlen(cif->cl_q) = 0; + qsize(cif->cl_q) = 0; + + if (opts->target == 0) + opts->target = 5; + if (opts->interval == 0) + opts->interval = 100; + cif->codel.params.target = machclk_freq * opts->target / 1000; + cif->codel.params.interval = machclk_freq * opts->interval / 1000; + cif->codel.params.ecn = opts->ecn; + cif->codel.stats.maxpacket = 256; + + cif->cl_stats.qlength = qlen(cif->cl_q); + cif->cl_stats.qlimit = qlimit(cif->cl_q); + + /* keep the state in pf_altq */ + a->altq_disc = cif; + + return (0); +} + +int +codel_remove_altq(struct pf_altq *a) +{ + struct codel_if *cif; + + if ((cif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + if (cif->cl_q) + free(cif->cl_q, M_DEVBUF); + free(cif, M_DEVBUF); + + return (0); +} + +int +codel_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct codel_if *cif; + struct codel_ifstats stats; + int error = 0; + + if ((cif = altq_lookup(a->ifname, ALTQT_CODEL)) == NULL) + return (EBADF); + + if (*nbytes < sizeof(stats)) + return (EINVAL); + + stats = cif->cl_stats; + stats.stats = cif->codel.stats; + + if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = sizeof(stats); + + return (0); +} + +static int +codel_request(struct ifaltq *ifq, int req, void *arg) +{ + struct codel_if *cif = (struct codel_if *)ifq->altq_disc; + struct mbuf *m; + + IFQ_LOCK_ASSERT(ifq); + + switch (req) { + case ALTRQ_PURGE: + if (!ALTQ_IS_ENABLED(cif->cif_ifq)) + break; + + if (qempty(cif->cl_q)) + break; + + while ((m = _getq(cif->cl_q)) != NULL) { + PKTCNTR_ADD(&cif->cl_stats.cl_dropcnt, m_pktlen(m)); + m_freem(m); + IFQ_DEC_LEN(cif->cif_ifq); + } + cif->cif_ifq->ifq_len = 0; + break; + 
} + + return (0); +} + +static int +codel_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + + struct codel_if *cif = (struct codel_if *) ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ + printf("altq: packet for %s does not have pkthdr\n", + ifq->altq_ifp->if_xname); + /* record the drop length before freeing: m is invalid after m_freem() */ + PKTCNTR_ADD(&cif->cl_stats.cl_dropcnt, m_pktlen(m)); + m_freem(m); + return (ENOBUFS); + } + + if (codel_addq(&cif->codel, cif->cl_q, m)) { + PKTCNTR_ADD(&cif->cl_stats.cl_dropcnt, m_pktlen(m)); + return (ENOBUFS); + } + IFQ_INC_LEN(ifq); + + return (0); +} + +static struct mbuf * +codel_dequeue(struct ifaltq *ifq, int op) +{ + struct codel_if *cif = (struct codel_if *)ifq->altq_disc; + struct mbuf *m; + + IFQ_LOCK_ASSERT(ifq); + + if (IFQ_IS_EMPTY(ifq)) + return (NULL); + + if (op == ALTDQ_POLL) + return (qhead(cif->cl_q)); + + + m = codel_getq(&cif->codel, cif->cl_q); + if (m != NULL) { + IFQ_DEC_LEN(ifq); + PKTCNTR_ADD(&cif->cl_stats.cl_xmitcnt, m_pktlen(m)); + return (m); + } + + return (NULL); +} + +struct codel * +codel_alloc(int target, int interval, int ecn) +{ + struct codel *c; + + c = malloc(sizeof(*c), M_DEVBUF, M_NOWAIT | M_ZERO); + if (c != NULL) { + c->params.target = machclk_freq * target / 1000; + c->params.interval = machclk_freq * interval / 1000; + c->params.ecn = ecn; + c->stats.maxpacket = 256; + } + + return (c); +} + +void +codel_destroy(struct codel *c) +{ + + free(c, M_DEVBUF); +} + +#define MTAG_CODEL 1438031249 +int +codel_addq(struct codel *c, class_queue_t *q, struct mbuf *m) +{ + struct m_tag *mtag; + uint64_t *enqueue_time; + + if (qlen(q) < qlimit(q)) { + mtag = m_tag_locate(m, MTAG_CODEL, 0, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_CODEL, 0, sizeof(uint64_t), + M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + return (-1); + } + enqueue_time = (uint64_t *)(mtag + 1); + *enqueue_time = read_machclk(); + m_tag_prepend(m, mtag); + 
_addq(q, m); + return (0); + } + c->drop_overlimit++; + m_freem(m); + + return (-1); +} + +static int +codel_should_drop(struct codel *c, class_queue_t *q, struct mbuf *m, + u_int64_t now) +{ + struct m_tag *mtag; + uint64_t *enqueue_time; + + if (m == NULL) { + c->vars.first_above_time = 0; + return (0); + } + + mtag = m_tag_locate(m, MTAG_CODEL, 0, NULL); + if (mtag == NULL) { + /* Only one warning per second. */ + if (ppsratecheck(&c->last_log, &c->last_pps, 1)) + printf("%s: could not found the packet mtag!\n", + __func__); + c->vars.first_above_time = 0; + return (0); + } + enqueue_time = (uint64_t *)(mtag + 1); + c->vars.ldelay = now - *enqueue_time; + c->stats.maxpacket = MAX(c->stats.maxpacket, m_pktlen(m)); + + if (codel_time_before(c->vars.ldelay, c->params.target) || + qsize(q) <= c->stats.maxpacket) { + /* went below - stay below for at least interval */ + c->vars.first_above_time = 0; + return (0); + } + if (c->vars.first_above_time == 0) { + /* just went above from below. If we stay above + * for at least interval we'll say it's ok to drop + */ + c->vars.first_above_time = now + c->params.interval; + return (0); + } + if (codel_time_after(now, c->vars.first_above_time)) + return (1); + + return (0); +} + +/* + * Run a Newton method step: + * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) + * + * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 + */ +static void +codel_Newton_step(struct codel_vars *vars) +{ + uint32_t invsqrt, invsqrt2; + uint64_t val; + +/* sizeof_in_bits(rec_inv_sqrt) */ +#define REC_INV_SQRT_BITS (8 * sizeof(u_int16_t)) +/* needed shift to get a Q0.32 number from rec_inv_sqrt */ +#define REC_INV_SQRT_SHIFT (32 - REC_INV_SQRT_BITS) + + invsqrt = ((u_int32_t)vars->rec_inv_sqrt) << REC_INV_SQRT_SHIFT; + invsqrt2 = ((u_int64_t)invsqrt * invsqrt) >> 32; + val = (3LL << 32) - ((u_int64_t)vars->count * invsqrt2); + val >>= 2; /* avoid overflow in following multiply */ + val = (val * invsqrt) >> (32 - 2 + 1); 
+ + vars->rec_inv_sqrt = val >> REC_INV_SQRT_SHIFT; +} + +static u_int64_t +codel_control_law(u_int64_t t, u_int64_t interval, u_int32_t rec_inv_sqrt) +{ + + return (t + (u_int32_t)(((u_int64_t)interval * + (rec_inv_sqrt << REC_INV_SQRT_SHIFT)) >> 32)); +} + +struct mbuf * +codel_getq(struct codel *c, class_queue_t *q) +{ + struct mbuf *m; + u_int64_t now; + int drop; + + if ((m = _getq(q)) == NULL) { + c->vars.dropping = 0; + return (m); + } + + now = read_machclk(); + drop = codel_should_drop(c, q, m, now); + if (c->vars.dropping) { + if (!drop) { + /* sojourn time below target - leave dropping state */ + c->vars.dropping = 0; + } else if (codel_time_after_eq(now, c->vars.drop_next)) { + /* It's time for the next drop. Drop the current + * packet and dequeue the next. The dequeue might + * take us out of dropping state. + * If not, schedule the next drop. + * A large backlog might result in drop rates so high + * that the next drop should happen now, + * hence the while loop. + */ + while (c->vars.dropping && + codel_time_after_eq(now, c->vars.drop_next)) { + c->vars.count++; /* don't care of possible wrap + * since there is no more + * divide */ + codel_Newton_step(&c->vars); + /* TODO ECN */ + PKTCNTR_ADD(&c->stats.drop_cnt, m_pktlen(m)); + m_freem(m); + m = _getq(q); + if (!codel_should_drop(c, q, m, now)) + /* leave dropping state */ + c->vars.dropping = 0; + else + /* and schedule the next drop */ + c->vars.drop_next = + codel_control_law(c->vars.drop_next, + c->params.interval, + c->vars.rec_inv_sqrt); + } + } + } else if (drop) { + /* TODO ECN */ + PKTCNTR_ADD(&c->stats.drop_cnt, m_pktlen(m)); + m_freem(m); + + m = _getq(q); + drop = codel_should_drop(c, q, m, now); + + c->vars.dropping = 1; + /* if min went above target close to when we last went below it + * assume that the drop rate that controlled the queue on the + * last cycle is a good starting point to control it now. 
+ */ + if (codel_time_before(now - c->vars.drop_next, + 16 * c->params.interval)) { + c->vars.count = (c->vars.count - c->vars.lastcount) | 1; + /* we dont care if rec_inv_sqrt approximation + * is not very precise : + * Next Newton steps will correct it quadratically. + */ + codel_Newton_step(&c->vars); + } else { + c->vars.count = 1; + c->vars.rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT; + } + c->vars.lastcount = c->vars.count; + c->vars.drop_next = codel_control_law(now, c->params.interval, + c->vars.rec_inv_sqrt); + } + + return (m); +} + +void +codel_getstats(struct codel *c, struct codel_stats *s) +{ + *s = c->stats; +} + +#endif /* ALTQ_CODEL */ diff --git a/freebsd/sys/net/altq/altq_codel.h b/freebsd/sys/net/altq/altq_codel.h new file mode 100644 index 00000000..8d7178b4 --- /dev/null +++ b/freebsd/sys/net/altq/altq_codel.h @@ -0,0 +1,129 @@ +/* + * CoDel - The Controlled-Delay Active Queue Management algorithm + * + * Copyright (C) 2013 Ermal Luçi <eri@FreeBSD.org> + * Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com> + * Copyright (C) 2011-2012 Van Jacobson <van@pollere.net> + * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> + * Copyright (C) 2012 Eric Dumazet <edumazet@google.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _ALTQ_ALTQ_CODEL_H_ +#define _ALTQ_ALTQ_CODEL_H_ + +struct codel_stats { + u_int32_t maxpacket; + struct pktcntr drop_cnt; + u_int marked_packets; +}; + +struct codel_ifstats { + u_int qlength; + u_int qlimit; + struct codel_stats stats; + struct pktcntr cl_xmitcnt; /* transmitted packet counter */ + struct pktcntr cl_dropcnt; /* dropped packet counter */ +}; + +#ifdef _KERNEL +#include <net/altq/altq_classq.h> + +/** + * struct codel_params - contains codel parameters + * <at> target: target queue size (in time units) + * <at> interval: width of moving time window + * <at> ecn: is Explicit Congestion Notification enabled + */ +struct codel_params { + u_int64_t target; + u_int64_t interval; + int ecn; +}; + +/** + * struct codel_vars - contains codel variables + * <at> count: how many drops we've done since the last time we + * entered dropping state + * <at> lastcount: count at entry to dropping state + * <at> dropping: set to true if in dropping state + * <at> rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 + * <at> first_above_time: when we went (or will go) continuously above + * target for interval + * <at> drop_next: time to drop next packet, or when we dropped last + * <at> ldelay: sojourn time of last dequeued packet + */ +struct codel_vars { + u_int32_t count; + u_int32_t lastcount; + int dropping; + u_int16_t rec_inv_sqrt; + u_int64_t first_above_time; + u_int64_t drop_next; + u_int64_t ldelay; +}; + +struct codel { + int last_pps; + struct codel_params params; + struct codel_vars vars; + struct codel_stats stats; + struct timeval last_log; + u_int32_t drop_overlimit; +}; + +/* + * codel interface state + */ +struct codel_if { + struct codel_if *cif_next; /* interface state list */ + struct ifaltq *cif_ifq; /* backpointer to ifaltq */ + u_int cif_bandwidth; /* link bandwidth in bps */ + + class_queue_t *cl_q; /* class queue structure */ + struct codel codel; + + /* statistics */ + struct codel_ifstats cl_stats; +}; + 
+struct codel *codel_alloc(int, int, int); +void codel_destroy(struct codel *); +int codel_addq(struct codel *, class_queue_t *, struct mbuf *); +struct mbuf *codel_getq(struct codel *, class_queue_t *); +void codel_getstats(struct codel *, struct codel_stats *); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_CODEL_H_ */ diff --git a/freebsd/sys/net/altq/altq_fairq.c b/freebsd/sys/net/altq/altq_fairq.c new file mode 100644 index 00000000..efb58d3f --- /dev/null +++ b/freebsd/sys/net/altq/altq_fairq.c @@ -0,0 +1,911 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/* + * Copyright (c) 2008 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon <dillon@backplane.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/net/altq/altq_fairq.c,v 1.1 2008/04/06 18:58:15 dillon Exp $ + * $FreeBSD$ + */ +/* + * Matt: I gutted altq_priq.c and used it as a skeleton on which to build + * fairq. The fairq algorithm is completely different then priq, of course, + * but because I used priq's skeleton I believe I should include priq's + * copyright. + * + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * FAIRQ - take traffic classified by keep state (hashed into + * mbuf->m_pkthdr.altq_state_hash) and bucketize it. Fairly extract + * the first packet from each bucket in a round-robin fashion. + * + * TODO - better overall qlimit support (right now it is per-bucket). + * - NOTE: red etc is per bucket, not overall. + * - better service curve support. + * + * EXAMPLE: + * + * altq on em0 fairq bandwidth 650Kb queue { std, bulk } + * queue std priority 3 bandwidth 400Kb \ + * fairq (buckets 64, default, hogs 1Kb) qlimit 50 + * queue bulk priority 2 bandwidth 100Kb \ + * fairq (buckets 64, hogs 1Kb) qlimit 50 + * + * pass out on em0 from any to any keep state queue std + * pass out on em0 inet proto tcp ..... port ... 
keep state queue bulk + */ +#include <rtems/bsd/local/opt_altq.h> +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + +#ifdef ALTQ_FAIRQ /* fairq is enabled in the kernel conf */ + +#include <rtems/bsd/sys/param.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <rtems/bsd/sys/errno.h> +#include <sys/kernel.h> +#include <sys/queue.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <netinet/in.h> + +#include <netpfil/pf/pf.h> +#include <netpfil/pf/pf_altq.h> +#include <netpfil/pf/pf_mtag.h> +#include <net/altq/altq.h> +#include <net/altq/altq_fairq.h> + +/* + * function prototypes + */ +static int fairq_clear_interface(struct fairq_if *); +static int fairq_request(struct ifaltq *, int, void *); +static void fairq_purge(struct fairq_if *); +static struct fairq_class *fairq_class_create(struct fairq_if *, int, int, u_int, struct fairq_opts *, int); +static int fairq_class_destroy(struct fairq_class *); +static int fairq_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +static struct mbuf *fairq_dequeue(struct ifaltq *, int); + +static int fairq_addq(struct fairq_class *, struct mbuf *, u_int32_t); +static struct mbuf *fairq_getq(struct fairq_class *, uint64_t); +static struct mbuf *fairq_pollq(struct fairq_class *, uint64_t, int *); +static fairq_bucket_t *fairq_selectq(struct fairq_class *, int); +static void fairq_purgeq(struct fairq_class *); + +static void get_class_stats(struct fairq_classstats *, struct fairq_class *); +static struct fairq_class *clh_to_clp(struct fairq_if *, uint32_t); + +int +fairq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int error; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); + + error = altq_attach(&ifp->if_snd, ALTQT_FAIRQ, a->altq_disc, + fairq_enqueue, fairq_dequeue, fairq_request, NULL, NULL); + + return (error); +} + +int 
+fairq_add_altq(struct pf_altq *a) +{ + struct fairq_if *pif; + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENODEV); + + + pif = malloc(sizeof(struct fairq_if), + M_DEVBUF, M_WAITOK | M_ZERO); + pif->pif_bandwidth = a->ifbandwidth; + pif->pif_maxpri = -1; + pif->pif_ifq = &ifp->if_snd; + + /* keep the state in pf_altq */ + a->altq_disc = pif; + + return (0); +} + +int +fairq_remove_altq(struct pf_altq *a) +{ + struct fairq_if *pif; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + fairq_clear_interface(pif); + + free(pif, M_DEVBUF); + return (0); +} + +int +fairq_add_queue(struct pf_altq *a) +{ + struct fairq_if *pif; + struct fairq_class *cl; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + + /* check parameters */ + if (a->priority >= FAIRQ_MAXPRI) + return (EINVAL); + if (a->qid == 0) + return (EINVAL); + if (pif->pif_classes[a->priority] != NULL) + return (EBUSY); + if (clh_to_clp(pif, a->qid) != NULL) + return (EBUSY); + + cl = fairq_class_create(pif, a->priority, a->qlimit, a->bandwidth, + &a->pq_u.fairq_opts, a->qid); + if (cl == NULL) + return (ENOMEM); + + return (0); +} + +int +fairq_remove_queue(struct pf_altq *a) +{ + struct fairq_if *pif; + struct fairq_class *cl; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + + if ((cl = clh_to_clp(pif, a->qid)) == NULL) + return (EINVAL); + + return (fairq_class_destroy(cl)); +} + +int +fairq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct fairq_if *pif; + struct fairq_class *cl; + struct fairq_classstats stats; + int error = 0; + + if ((pif = altq_lookup(a->ifname, ALTQT_FAIRQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, a->qid)) == NULL) + return (EINVAL); + + if (*nbytes < sizeof(stats)) + return (EINVAL); + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = 
sizeof(stats); + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes. + */ +static int +fairq_clear_interface(struct fairq_if *pif) +{ + struct fairq_class *cl; + int pri; + + /* clear out the classes */ + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + if ((cl = pif->pif_classes[pri]) != NULL) + fairq_class_destroy(cl); + } + + return (0); +} + +static int +fairq_request(struct ifaltq *ifq, int req, void *arg) +{ + struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + switch (req) { + case ALTRQ_PURGE: + fairq_purge(pif); + break; + } + return (0); +} + +/* discard all the queued packets on the interface */ +static void +fairq_purge(struct fairq_if *pif) +{ + struct fairq_class *cl; + int pri; + + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + if ((cl = pif->pif_classes[pri]) != NULL && cl->cl_head) + fairq_purgeq(cl); + } + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + pif->pif_ifq->ifq_len = 0; +} + +static struct fairq_class * +fairq_class_create(struct fairq_if *pif, int pri, int qlimit, + u_int bandwidth, struct fairq_opts *opts, int qid) +{ + struct fairq_class *cl; + int flags = opts->flags; + u_int nbuckets = opts->nbuckets; + int i; + +#ifndef ALTQ_RED + if (flags & FARF_RED) { +#ifdef ALTQ_DEBUG + printf("fairq_class_create: RED not configured for FAIRQ!\n"); +#endif + return (NULL); + } +#endif +#ifndef ALTQ_CODEL + if (flags & FARF_CODEL) { +#ifdef ALTQ_DEBUG + printf("fairq_class_create: CODEL not configured for FAIRQ!\n"); +#endif + return (NULL); + } +#endif + if (nbuckets == 0) + nbuckets = 256; + if (nbuckets > FAIRQ_MAX_BUCKETS) + nbuckets = FAIRQ_MAX_BUCKETS; + /* enforce power-of-2 size */ + while ((nbuckets ^ (nbuckets - 1)) != ((nbuckets << 1) - 1)) + ++nbuckets; + + if ((cl = pif->pif_classes[pri]) != NULL) { + /* modify the class instead of creating a new one */ + IFQ_LOCK(cl->cl_pif->pif_ifq); + if (cl->cl_head) + fairq_purgeq(cl); + 
IFQ_UNLOCK(cl->cl_pif->pif_ifq); +#ifdef ALTQ_RIO + if (cl->cl_qtype == Q_RIO) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (cl->cl_qtype == Q_RED) + red_destroy(cl->cl_red); +#endif +#ifdef ALTQ_CODEL + if (cl->cl_qtype == Q_CODEL) + codel_destroy(cl->cl_codel); +#endif + } else { + cl = malloc(sizeof(struct fairq_class), + M_DEVBUF, M_WAITOK | M_ZERO); + cl->cl_nbuckets = nbuckets; + cl->cl_nbucket_mask = nbuckets - 1; + + cl->cl_buckets = malloc( + sizeof(struct fairq_bucket) * cl->cl_nbuckets, + M_DEVBUF, M_WAITOK | M_ZERO); + cl->cl_head = NULL; + } + + pif->pif_classes[pri] = cl; + if (flags & FARF_DEFAULTCLASS) + pif->pif_default = cl; + if (qlimit == 0) + qlimit = 50; /* use default */ + cl->cl_qlimit = qlimit; + for (i = 0; i < cl->cl_nbuckets; ++i) { + qlimit(&cl->cl_buckets[i].queue) = qlimit; + } + cl->cl_bandwidth = bandwidth / 8; + cl->cl_qtype = Q_DROPTAIL; + cl->cl_flags = flags & FARF_USERFLAGS; + cl->cl_pri = pri; + if (pri > pif->pif_maxpri) + pif->pif_maxpri = pri; + cl->cl_pif = pif; + cl->cl_handle = qid; + cl->cl_hogs_m1 = opts->hogs_m1 / 8; + cl->cl_lssc_m1 = opts->lssc_m1 / 8; /* NOT YET USED */ + +#ifdef ALTQ_RED + if (flags & (FARF_RED|FARF_RIO)) { + int red_flags, red_pkttime; + + red_flags = 0; + if (flags & FARF_ECN) + red_flags |= REDF_ECN; +#ifdef ALTQ_RIO + if (flags & FARF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + if (pif->pif_bandwidth < 8) + red_pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu + * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8); +#ifdef ALTQ_RIO + if (flags & FARF_RIO) { + cl->cl_red = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + cl->cl_qtype = Q_RIO; + } else +#endif + if (flags & FARF_RED) { + cl->cl_red = red_alloc(0, 0, + cl->cl_qlimit * 10/100, + cl->cl_qlimit * 30/100, + red_flags, red_pkttime); + if (cl->cl_red != NULL) + cl->cl_qtype = Q_RED; + } + } +#endif /* ALTQ_RED */ +#ifdef 
ALTQ_CODEL + if (flags & FARF_CODEL) { + cl->cl_codel = codel_alloc(5, 100, 0); + if (cl->cl_codel != NULL) + cl->cl_qtype = Q_CODEL; + } +#endif + + return (cl); +} + +static int +fairq_class_destroy(struct fairq_class *cl) +{ + struct fairq_if *pif; + int pri; + + IFQ_LOCK(cl->cl_pif->pif_ifq); + + if (cl->cl_head) + fairq_purgeq(cl); + + pif = cl->cl_pif; + pif->pif_classes[cl->cl_pri] = NULL; + if (pif->pif_poll_cache == cl) + pif->pif_poll_cache = NULL; + if (pif->pif_maxpri == cl->cl_pri) { + for (pri = cl->cl_pri; pri >= 0; pri--) + if (pif->pif_classes[pri] != NULL) { + pif->pif_maxpri = pri; + break; + } + if (pri < 0) + pif->pif_maxpri = -1; + } + IFQ_UNLOCK(cl->cl_pif->pif_ifq); + + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (cl->cl_qtype == Q_RIO) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (cl->cl_qtype == Q_RED) + red_destroy(cl->cl_red); +#endif +#ifdef ALTQ_CODEL + if (cl->cl_qtype == Q_CODEL) + codel_destroy(cl->cl_codel); +#endif + } + free(cl->cl_buckets, M_DEVBUF); + free(cl, M_DEVBUF); + + return (0); +} + +/* + * fairq_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. 
+ */ +static int +fairq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc; + struct fairq_class *cl = NULL; /* Make compiler happy */ + struct pf_mtag *t; + u_int32_t qid_hash = 0; + int len; + + IFQ_LOCK_ASSERT(ifq); + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ + printf("altq: packet for %s does not have pkthdr\n", + ifq->altq_ifp->if_xname); + m_freem(m); + return (ENOBUFS); + } + + if ((t = pf_find_mtag(m)) != NULL) { + cl = clh_to_clp(pif, t->qid); + qid_hash = t->qid_hash; + } + if (cl == NULL) { + cl = pif->pif_default; + if (cl == NULL) { + m_freem(m); + return (ENOBUFS); + } + } + cl->cl_flags |= FARF_HAS_PACKETS; + cl->cl_pktattr = NULL; + len = m_pktlen(m); + if (fairq_addq(cl, m, qid_hash) != 0) { + /* drop occurred. mbuf was freed in fairq_addq. */ + PKTCNTR_ADD(&cl->cl_dropcnt, len); + return (ENOBUFS); + } + IFQ_INC_LEN(ifq); + + return (0); +} + +/* + * fairq_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. 
+ */ +static struct mbuf * +fairq_dequeue(struct ifaltq *ifq, int op) +{ + struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc; + struct fairq_class *cl; + struct fairq_class *best_cl; + struct mbuf *best_m; + struct mbuf *m = NULL; + uint64_t cur_time = read_machclk(); + int pri; + int hit_limit; + + IFQ_LOCK_ASSERT(ifq); + + if (IFQ_IS_EMPTY(ifq)) { + return (NULL); + } + + if (pif->pif_poll_cache && op == ALTDQ_REMOVE) { + best_cl = pif->pif_poll_cache; + m = fairq_getq(best_cl, cur_time); + pif->pif_poll_cache = NULL; + if (m) { + IFQ_DEC_LEN(ifq); + PKTCNTR_ADD(&best_cl->cl_xmitcnt, m_pktlen(m)); + return (m); + } + } else { + best_cl = NULL; + best_m = NULL; + + for (pri = pif->pif_maxpri; pri >= 0; pri--) { + if ((cl = pif->pif_classes[pri]) == NULL) + continue; + if ((cl->cl_flags & FARF_HAS_PACKETS) == 0) + continue; + m = fairq_pollq(cl, cur_time, &hit_limit); + if (m == NULL) { + cl->cl_flags &= ~FARF_HAS_PACKETS; + continue; + } + + /* + * Only override the best choice if we are under + * the BW limit. + */ + if (hit_limit == 0 || best_cl == NULL) { + best_cl = cl; + best_m = m; + } + + /* + * Remember the highest priority mbuf in case we + * do not find any lower priority mbufs. + */ + if (hit_limit) + continue; + break; + } + if (op == ALTDQ_POLL) { + pif->pif_poll_cache = best_cl; + m = best_m; + } else if (best_cl) { + m = fairq_getq(best_cl, cur_time); + if (m != NULL) { + IFQ_DEC_LEN(ifq); + PKTCNTR_ADD(&best_cl->cl_xmitcnt, m_pktlen(m)); + } + } + return (m); + } + return (NULL); +} + +static int +fairq_addq(struct fairq_class *cl, struct mbuf *m, u_int32_t bucketid) +{ + fairq_bucket_t *b; + u_int hindex; + uint64_t bw; + + /* + * If the packet doesn't have any keep state put it on the end of + * our queue. XXX this can result in out of order delivery. 
+ */ + if (bucketid == 0) { + if (cl->cl_head) + b = cl->cl_head->prev; + else + b = &cl->cl_buckets[0]; + } else { + hindex = bucketid & cl->cl_nbucket_mask; + b = &cl->cl_buckets[hindex]; + } + + /* + * Add the bucket to the end of the circular list of active buckets. + * + * As a special case we add the bucket to the beginning of the list + * instead of the end if it was not previously on the list and if + * its traffic is less then the hog level. + */ + if (b->in_use == 0) { + b->in_use = 1; + if (cl->cl_head == NULL) { + cl->cl_head = b; + b->next = b; + b->prev = b; + } else { + b->next = cl->cl_head; + b->prev = cl->cl_head->prev; + b->prev->next = b; + b->next->prev = b; + + if (b->bw_delta && cl->cl_hogs_m1) { + bw = b->bw_bytes * machclk_freq / b->bw_delta; + if (bw < cl->cl_hogs_m1) + cl->cl_head = b; + } + } + } + +#ifdef ALTQ_RIO + if (cl->cl_qtype == Q_RIO) + return rio_addq((rio_t *)cl->cl_red, &b->queue, m, cl->cl_pktattr); +#endif +#ifdef ALTQ_RED + if (cl->cl_qtype == Q_RED) + return red_addq(cl->cl_red, &b->queue, m, cl->cl_pktattr); +#endif +#ifdef ALTQ_CODEL + if (cl->cl_qtype == Q_CODEL) + return codel_addq(cl->cl_codel, &b->queue, m); +#endif + if (qlen(&b->queue) >= qlimit(&b->queue)) { + m_freem(m); + return (-1); + } + + if (cl->cl_flags & FARF_CLEARDSCP) + write_dsfield(m, cl->cl_pktattr, 0); + + _addq(&b->queue, m); + + return (0); +} + +static struct mbuf * +fairq_getq(struct fairq_class *cl, uint64_t cur_time) +{ + fairq_bucket_t *b; + struct mbuf *m; + + b = fairq_selectq(cl, 0); + if (b == NULL) + m = NULL; +#ifdef ALTQ_RIO + else if (cl->cl_qtype == Q_RIO) + m = rio_getq((rio_t *)cl->cl_red, &b->queue); +#endif +#ifdef ALTQ_RED + else if (cl->cl_qtype == Q_RED) + m = red_getq(cl->cl_red, &b->queue); +#endif +#ifdef ALTQ_CODEL + else if (cl->cl_qtype == Q_CODEL) + m = codel_getq(cl->cl_codel, &b->queue); +#endif + else + m = _getq(&b->queue); + + /* + * Calculate the BW change + */ + if (m != NULL) { + uint64_t delta; + + /* + * 
Per-class bandwidth calculation + */ + delta = (cur_time - cl->cl_last_time); + if (delta > machclk_freq * 8) + delta = machclk_freq * 8; + cl->cl_bw_delta += delta; + cl->cl_bw_bytes += m->m_pkthdr.len; + cl->cl_last_time = cur_time; + cl->cl_bw_delta -= cl->cl_bw_delta >> 3; + cl->cl_bw_bytes -= cl->cl_bw_bytes >> 3; + + /* + * Per-bucket bandwidth calculation + */ + delta = (cur_time - b->last_time); + if (delta > machclk_freq * 8) + delta = machclk_freq * 8; + b->bw_delta += delta; + b->bw_bytes += m->m_pkthdr.len; + b->last_time = cur_time; + b->bw_delta -= b->bw_delta >> 3; + b->bw_bytes -= b->bw_bytes >> 3; + } + return(m); +} + +/* + * Figure out what the next packet would be if there were no limits. If + * this class hits its bandwidth limit *hit_limit is set to no-zero, otherwise + * it is set to 0. A non-NULL mbuf is returned either way. + */ +static struct mbuf * +fairq_pollq(struct fairq_class *cl, uint64_t cur_time, int *hit_limit) +{ + fairq_bucket_t *b; + struct mbuf *m; + uint64_t delta; + uint64_t bw; + + *hit_limit = 0; + b = fairq_selectq(cl, 1); + if (b == NULL) + return(NULL); + m = qhead(&b->queue); + + /* + * Did this packet exceed the class bandwidth? Calculate the + * bandwidth component of the packet. + * + * - Calculate bytes per second + */ + delta = cur_time - cl->cl_last_time; + if (delta > machclk_freq * 8) + delta = machclk_freq * 8; + cl->cl_bw_delta += delta; + cl->cl_last_time = cur_time; + if (cl->cl_bw_delta) { + bw = cl->cl_bw_bytes * machclk_freq / cl->cl_bw_delta; + + if (bw > cl->cl_bandwidth) + *hit_limit = 1; +#ifdef ALTQ_DEBUG + printf("BW %6ju relative to %6u %d queue %p\n", + (uintmax_t)bw, cl->cl_bandwidth, *hit_limit, b); +#endif + } + return(m); +} + +/* + * Locate the next queue we want to pull a packet out of. This code + * is also responsible for removing empty buckets from the circular list. 
+ */ +static +fairq_bucket_t * +fairq_selectq(struct fairq_class *cl, int ispoll) +{ + fairq_bucket_t *b; + uint64_t bw; + + if (ispoll == 0 && cl->cl_polled) { + b = cl->cl_polled; + cl->cl_polled = NULL; + return(b); + } + + while ((b = cl->cl_head) != NULL) { + /* + * Remove empty queues from consideration + */ + if (qempty(&b->queue)) { + b->in_use = 0; + cl->cl_head = b->next; + if (cl->cl_head == b) { + cl->cl_head = NULL; + } else { + b->next->prev = b->prev; + b->prev->next = b->next; + } + continue; + } + + /* + * Advance the round robin. Queues with bandwidths less + * then the hog bandwidth are allowed to burst. + */ + if (cl->cl_hogs_m1 == 0) { + cl->cl_head = b->next; + } else if (b->bw_delta) { + bw = b->bw_bytes * machclk_freq / b->bw_delta; + if (bw >= cl->cl_hogs_m1) { + cl->cl_head = b->next; + } + /* + * XXX TODO - + */ + } + + /* + * Return bucket b. + */ + break; + } + if (ispoll) + cl->cl_polled = b; + return(b); +} + +static void +fairq_purgeq(struct fairq_class *cl) +{ + fairq_bucket_t *b; + struct mbuf *m; + + while ((b = fairq_selectq(cl, 0)) != NULL) { + while ((m = _getq(&b->queue)) != NULL) { + PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(m)); + m_freem(m); + } + ASSERT(qlen(&b->queue) == 0); + } +} + +static void +get_class_stats(struct fairq_classstats *sp, struct fairq_class *cl) +{ + fairq_bucket_t *b; + + sp->class_handle = cl->cl_handle; + sp->qlimit = cl->cl_qlimit; + sp->xmit_cnt = cl->cl_xmitcnt; + sp->drop_cnt = cl->cl_dropcnt; + sp->qtype = cl->cl_qtype; + sp->qlength = 0; + + if (cl->cl_head) { + b = cl->cl_head; + do { + sp->qlength += qlen(&b->queue); + b = b->next; + } while (b != cl->cl_head); + } + +#ifdef ALTQ_RED + if (cl->cl_qtype == Q_RED) + red_getstats(cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_RIO + if (cl->cl_qtype == Q_RIO) + rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_CODEL + if (cl->cl_qtype == Q_CODEL) + codel_getstats(cl->cl_codel, &sp->codel); +#endif +} + +/* convert a class handle 
to the corresponding class pointer */ +static struct fairq_class * +clh_to_clp(struct fairq_if *pif, uint32_t chandle) +{ + struct fairq_class *cl; + int idx; + + if (chandle == 0) + return (NULL); + + for (idx = pif->pif_maxpri; idx >= 0; idx--) + if ((cl = pif->pif_classes[idx]) != NULL && + cl->cl_handle == chandle) + return (cl); + + return (NULL); +} + +#endif /* ALTQ_FAIRQ */ diff --git a/freebsd/sys/net/altq/altq_fairq.h b/freebsd/sys/net/altq/altq_fairq.h new file mode 100644 index 00000000..1a4b97dd --- /dev/null +++ b/freebsd/sys/net/altq/altq_fairq.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2008 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon <dillon@backplane.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/net/altq/altq_fairq.h,v 1.1 2008/04/06 18:58:15 dillon Exp $ + * $FreeBSD$ + */ + +#ifndef _ALTQ_ALTQ_FAIRQ_H_ +#define _ALTQ_ALTQ_FAIRQ_H_ + +#include <net/altq/altq.h> +#include <net/altq/altq_classq.h> +#include <net/altq/altq_codel.h> +#include <net/altq/altq_red.h> +#include <net/altq/altq_rio.h> +#include <net/altq/altq_rmclass.h> + +#define FAIRQ_MAX_BUCKETS 2048 /* maximum number of sorting buckets */ +#define FAIRQ_MAXPRI RM_MAXPRIO +#define FAIRQ_BITMAP_WIDTH (sizeof(fairq_bitmap_t)*8) +#define FAIRQ_BITMAP_MASK (FAIRQ_BITMAP_WIDTH - 1) + +/* fairq class flags */ +#define FARF_RED 0x0001 /* use RED */ +#define FARF_ECN 0x0002 /* use RED/ECN */ +#define FARF_RIO 0x0004 /* use RIO */ +#define FARF_CODEL 0x0008 /* use CoDel */ +#define FARF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define FARF_DEFAULTCLASS 0x1000 /* default class */ + +#define FARF_HAS_PACKETS 0x2000 /* might have queued packets */ + +#define FARF_USERFLAGS (FARF_RED|FARF_ECN|FARF_RIO|FARF_CLEARDSCP| \ + FARF_DEFAULTCLASS) + +/* special class handles */ +#define FAIRQ_NULLCLASS_HANDLE 0 + +typedef u_int fairq_bitmap_t; + +struct fairq_classstats { + uint32_t class_handle; + + u_int qlength; + u_int qlimit; + struct pktcntr xmit_cnt; /* transmitted packet counter */ + struct pktcntr drop_cnt; /* dropped packet counter */ + + /* codel, red and rio related info */ + int qtype; + struct redstats red[3]; /* rio has 3 red 
stats */
	struct codel_stats	codel;
};

#ifdef _KERNEL

/*
 * One hash bucket: a node of the class's circular round-robin ring
 * holding the actual packet queue plus bandwidth bookkeeping used by
 * fairq_selectq() to detect hog flows.
 */
typedef struct fairq_bucket {
	struct fairq_bucket *next;	/* circular list */
	struct fairq_bucket *prev;	/* circular list */
	class_queue_t	queue;		/* the actual queue */
	uint64_t	bw_bytes;	/* statistics used to calculate bw */
	uint64_t	bw_delta;	/* statistics used to calculate bw */
	uint64_t	last_time;
	int		in_use;
} fairq_bucket_t;

/* Per-priority FAIRQ class: a set of buckets plus AQM and rate state. */
struct fairq_class {
	uint32_t	cl_handle;	/* class handle */
	u_int		cl_nbuckets;	/* (power of 2) */
	u_int		cl_nbucket_mask; /* bucket mask */
	fairq_bucket_t	*cl_buckets;
	fairq_bucket_t	*cl_head;	/* head of circular bucket list */
	fairq_bucket_t	*cl_polled;	/* bucket pinned by the last poll */
	union {
		struct red	*cl_red;	/* RED state */
		struct codel	*cl_codel;	/* CoDel state */
	} cl_aqm;
#define	cl_red	cl_aqm.cl_red
#define	cl_codel cl_aqm.cl_codel
	u_int		cl_hogs_m1;
	u_int		cl_lssc_m1;
	u_int		cl_bandwidth;
	uint64_t	cl_bw_bytes;
	uint64_t	cl_bw_delta;
	uint64_t	cl_last_time;
	int		cl_qtype;	/* rollup */
	int		cl_qlimit;
	int		cl_pri;		/* priority */
	int		cl_flags;	/* class flags */
	struct fairq_if	*cl_pif;	/* back pointer to pif */
	struct altq_pktattr *cl_pktattr; /* saved header used by ECN */

	/* round robin index */

	/* statistics */
	struct pktcntr  cl_xmitcnt;	/* transmitted packet counter */
	struct pktcntr  cl_dropcnt;	/* dropped packet counter */
};

/*
 * fairq interface state
 */
struct fairq_if {
	struct fairq_if		*pif_next;	/* interface state list */
	struct ifaltq		*pif_ifq;	/* backpointer to ifaltq */
	u_int			pif_bandwidth;	/* link bandwidth in bps */
	int			pif_maxpri;	/* max priority in use */
	struct fairq_class	*pif_poll_cache;/* cached poll */
	struct fairq_class	*pif_default;	/* default class */
	struct fairq_class	*pif_classes[FAIRQ_MAXPRI]; /* classes */
};

#endif /* _KERNEL */

#endif /* _ALTQ_ALTQ_FAIRQ_H_ */
00000000..f7a18296 --- /dev/null +++ b/freebsd/sys/net/altq/altq_hfsc.c @@ -0,0 +1,2240 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + * + * $KAME: altq_hfsc.c,v 1.24 2003/12/05 05:40:46 kjc Exp $ + * $FreeBSD$ + */ +/* + * H-FSC is described in Proceedings of SIGCOMM'97, + * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, + * Real-Time and Priority Service" + * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. 
+ * + * Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing. + * when a class has an upperlimit, the fit-time is computed from the + * upperlimit service curve. the link-sharing scheduler does not schedule + * a class whose fit-time exceeds the current time. + */ + +#include <rtems/bsd/local/opt_altq.h> +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + +#ifdef ALTQ_HFSC /* hfsc is enabled by ALTQ_HFSC option in opt_altq.h */ + +#include <rtems/bsd/sys/param.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/systm.h> +#include <rtems/bsd/sys/errno.h> +#include <sys/queue.h> +#if 1 /* ALTQ3_COMPAT */ +#include <sys/sockio.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#endif /* ALTQ3_COMPAT */ + +#include <net/if.h> +#include <net/if_var.h> +#include <netinet/in.h> + +#include <netpfil/pf/pf.h> +#include <netpfil/pf/pf_altq.h> +#include <netpfil/pf/pf_mtag.h> +#include <net/altq/altq.h> +#include <net/altq/altq_hfsc.h> +#ifdef ALTQ3_COMPAT +#include <net/altq/altq_conf.h> +#endif + +/* + * function prototypes + */ +static int hfsc_clear_interface(struct hfsc_if *); +static int hfsc_request(struct ifaltq *, int, void *); +static void hfsc_purge(struct hfsc_if *); +static struct hfsc_class *hfsc_class_create(struct hfsc_if *, + struct service_curve *, struct service_curve *, struct service_curve *, + struct hfsc_class *, int, int, int); +static int hfsc_class_destroy(struct hfsc_class *); +static struct hfsc_class *hfsc_nextclass(struct hfsc_class *); +static int hfsc_enqueue(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); +static struct mbuf *hfsc_dequeue(struct ifaltq *, int); + +static int hfsc_addq(struct hfsc_class *, struct mbuf *); +static struct mbuf *hfsc_getq(struct hfsc_class *); +static struct mbuf *hfsc_pollq(struct hfsc_class *); +static void hfsc_purgeq(struct hfsc_class *); + +static void update_cfmin(struct hfsc_class *); +static void 
set_active(struct hfsc_class *, int); +static void set_passive(struct hfsc_class *); + +static void init_ed(struct hfsc_class *, int); +static void update_ed(struct hfsc_class *, int); +static void update_d(struct hfsc_class *, int); +static void init_vf(struct hfsc_class *, int); +static void update_vf(struct hfsc_class *, int, u_int64_t); +static void ellist_insert(struct hfsc_class *); +static void ellist_remove(struct hfsc_class *); +static void ellist_update(struct hfsc_class *); +struct hfsc_class *hfsc_get_mindl(struct hfsc_if *, u_int64_t); +static void actlist_insert(struct hfsc_class *); +static void actlist_remove(struct hfsc_class *); +static void actlist_update(struct hfsc_class *); + +static struct hfsc_class *actlist_firstfit(struct hfsc_class *, + u_int64_t); + +static __inline u_int64_t seg_x2y(u_int64_t, u_int64_t); +static __inline u_int64_t seg_y2x(u_int64_t, u_int64_t); +static __inline u_int64_t m2sm(u_int); +static __inline u_int64_t m2ism(u_int); +static __inline u_int64_t d2dx(u_int); +static u_int sm2m(u_int64_t); +static u_int dx2d(u_int64_t); + +static void sc2isc(struct service_curve *, struct internal_sc *); +static void rtsc_init(struct runtime_sc *, struct internal_sc *, + u_int64_t, u_int64_t); +static u_int64_t rtsc_y2x(struct runtime_sc *, u_int64_t); +static u_int64_t rtsc_x2y(struct runtime_sc *, u_int64_t); +static void rtsc_min(struct runtime_sc *, struct internal_sc *, + u_int64_t, u_int64_t); + +static void get_class_stats(struct hfsc_classstats *, + struct hfsc_class *); +static struct hfsc_class *clh_to_clp(struct hfsc_if *, u_int32_t); + + +#ifdef ALTQ3_COMPAT +static struct hfsc_if *hfsc_attach(struct ifaltq *, u_int); +static int hfsc_detach(struct hfsc_if *); +static int hfsc_class_modify(struct hfsc_class *, struct service_curve *, + struct service_curve *, struct service_curve *); + +static int hfsccmd_if_attach(struct hfsc_attach *); +static int hfsccmd_if_detach(struct hfsc_interface *); +static int 
hfsccmd_add_class(struct hfsc_add_class *);
static int hfsccmd_delete_class(struct hfsc_delete_class *);
static int hfsccmd_modify_class(struct hfsc_modify_class *);
static int hfsccmd_add_filter(struct hfsc_add_filter *);
static int hfsccmd_delete_filter(struct hfsc_delete_filter *);
static int hfsccmd_class_stats(struct hfsc_class_stats *);

altqdev_decl(hfsc);
#endif /* ALTQ3_COMPAT */

/*
 * macros
 */
#define	is_a_parent_class(cl)	((cl)->cl_children != NULL)

#define	HT_INFINITY	0xffffffffffffffffLL	/* infinite time value */

#ifdef ALTQ3_COMPAT
/* hif_list keeps all hfsc_if's allocated. */
static struct hfsc_if *hif_list = NULL;
#endif /* ALTQ3_COMPAT */

/*
 * Attach the HFSC discipline callbacks to the send queue of the
 * interface named in the pf_altq request.  Returns 0 or an errno.
 */
int
hfsc_pfattach(struct pf_altq *a)
{
	struct ifnet *ifp;
	int s, error;

	if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL)
		return (EINVAL);
	s = splnet();
	error = altq_attach(&ifp->if_snd, ALTQT_HFSC, a->altq_disc,
	    hfsc_enqueue, hfsc_dequeue, hfsc_request, NULL, NULL);
	splx(s);
	return (error);
}

/* Allocate per-interface HFSC state and stash it in the pf_altq record. */
int
hfsc_add_altq(struct pf_altq *a)
{
	struct hfsc_if *hif;
	struct ifnet *ifp;

	if ((ifp = ifunit(a->ifname)) == NULL)
		return (EINVAL);
	if (!ALTQ_IS_READY(&ifp->if_snd))
		return (ENODEV);

	hif = malloc(sizeof(struct hfsc_if), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (hif == NULL)
		return (ENOMEM);

	TAILQ_INIT(&hif->hif_eligible);
	hif->hif_ifq = &ifp->if_snd;

	/* keep the state in pf_altq */
	a->altq_disc = hif;

	return (0);
}

/* Tear down the per-interface state created by hfsc_add_altq(). */
int
hfsc_remove_altq(struct pf_altq *a)
{
	struct hfsc_if *hif;

	if ((hif = a->altq_disc) == NULL)
		return (EINVAL);
	a->altq_disc = NULL;

	(void)hfsc_clear_interface(hif);
	(void)hfsc_class_destroy(hif->hif_rootclass);

	free(hif, M_DEVBUF);

	return (0);
}

/*
 * Create one HFSC class from the realtime/link-sharing/upper-limit
 * service curves carried in the request.  Returns 0 or an errno.
 */
int
hfsc_add_queue(struct pf_altq *a)
{
	struct hfsc_if *hif;
	struct hfsc_class *cl, *parent;
	struct hfsc_opts *opts;
	struct service_curve rtsc, lssc, ulsc;

	if ((hif = a->altq_disc) == NULL)
		return (EINVAL);

	opts = &a->pq_u.hfsc_opts;

	/* only the root class may have a null parent handle */
	if (a->parent_qid == HFSC_NULLCLASS_HANDLE &&
	    hif->hif_rootclass == NULL)
		parent = NULL;
	else if ((parent = clh_to_clp(hif, a->parent_qid)) == NULL)
		return (EINVAL);

	if (a->qid == 0)
		return (EINVAL);

	/* reject duplicate handles */
	if (clh_to_clp(hif, a->qid) != NULL)
		return (EBUSY);

	rtsc.m1 = opts->rtsc_m1;
	rtsc.d = opts->rtsc_d;
	rtsc.m2 = opts->rtsc_m2;
	lssc.m1 = opts->lssc_m1;
	lssc.d = opts->lssc_d;
	lssc.m2 = opts->lssc_m2;
	ulsc.m1 = opts->ulsc_m1;
	ulsc.d = opts->ulsc_d;
	ulsc.m2 = opts->ulsc_m2;

	cl = hfsc_class_create(hif, &rtsc, &lssc, &ulsc,
	    parent, a->qlimit, opts->flags, a->qid);
	if (cl == NULL)
		return (ENOMEM);

	return (0);
}

/* Destroy the class identified by a->qid. */
int
hfsc_remove_queue(struct pf_altq *a)
{
	struct hfsc_if *hif;
	struct hfsc_class *cl;

	if ((hif = a->altq_disc) == NULL)
		return (EINVAL);

	if ((cl = clh_to_clp(hif, a->qid)) == NULL)
		return (EINVAL);

	return (hfsc_class_destroy(cl));
}

/* Copy one class's statistics out to the userland buffer ubuf. */
int
hfsc_getqstats(struct pf_altq *a, void *ubuf, int *nbytes)
{
	struct hfsc_if *hif;
	struct hfsc_class *cl;
	struct hfsc_classstats stats;
	int error = 0;

	if ((hif = altq_lookup(a->ifname, ALTQT_HFSC)) == NULL)
		return (EBADF);

	if ((cl = clh_to_clp(hif, a->qid)) == NULL)
		return (EINVAL);

	if (*nbytes < sizeof(stats))
		return (EINVAL);

	get_class_stats(&stats, cl);

	if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0)
		return (error);
	*nbytes = sizeof(stats);
	return (0);
}

/*
 * bring the interface back to the initial state by discarding
 * all the filters and classes except the root class.
 */
static int
hfsc_clear_interface(struct hfsc_if *hif)
{
	struct hfsc_class	*cl;

#ifdef ALTQ3_COMPAT
	/* free the filters for this interface */
	acc_discard_filters(&hif->hif_classifier, NULL, 1);
#endif

	/* clear out the classes */
	while (hif->hif_rootclass != NULL &&
	    (cl = hif->hif_rootclass->cl_children) != NULL) {
		/*
		 * remove the first leaf class found in the hierarchy
		 * then start over
		 */
		for (; cl != NULL; cl = hfsc_nextclass(cl)) {
			if (!is_a_parent_class(cl)) {
				(void)hfsc_class_destroy(cl);
				break;
			}
		}
	}

	return (0);
}

/* Discipline request handler; only ALTRQ_PURGE is acted upon. */
static int
hfsc_request(struct ifaltq *ifq, int req, void *arg)
{
	struct hfsc_if	*hif = (struct hfsc_if *)ifq->altq_disc;

	IFQ_LOCK_ASSERT(ifq);

	switch (req) {
	case ALTRQ_PURGE:
		hfsc_purge(hif);
		break;
	}
	return (0);
}

/* discard all the queued packets on the interface */
static void
hfsc_purge(struct hfsc_if *hif)
{
	struct hfsc_class *cl;

	for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl))
		if (!qempty(cl->cl_q))
			hfsc_purgeq(cl);
	if (ALTQ_IS_ENABLED(hif->hif_ifq))
		hif->hif_ifq->ifq_len = 0;
}

/*
 * Allocate and link a new class with the given realtime (rsc),
 * link-sharing (fsc) and upper-limit (usc) service curves.
 * Returns the new class, or NULL on failure; partially constructed
 * state is torn down at err_ret.
 */
struct hfsc_class *
hfsc_class_create(struct hfsc_if *hif, struct service_curve *rsc,
    struct service_curve *fsc, struct service_curve *usc,
    struct hfsc_class *parent, int qlimit, int flags, int qid)
{
	struct hfsc_class *cl, *p;
	int i, s;

	if (hif->hif_classes >= HFSC_MAX_CLASSES)
		return (NULL);

#ifndef ALTQ_RED
	if (flags & HFCF_RED) {
#ifdef ALTQ_DEBUG
		printf("hfsc_class_create: RED not configured for HFSC!\n");
#endif
		return (NULL);
	}
#endif
#ifndef ALTQ_CODEL
	if (flags & HFCF_CODEL) {
#ifdef ALTQ_DEBUG
		printf("hfsc_class_create: CODEL not configured for HFSC!\n");
#endif
		return (NULL);
	}
#endif

	cl = malloc(sizeof(struct hfsc_class), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl == NULL)
		return (NULL);

	cl->cl_q = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl->cl_q == NULL)
		goto err_ret;

	TAILQ_INIT(&cl->cl_actc);

	if (qlimit == 0)
		qlimit = 50;	/* use default */
	qlimit(cl->cl_q) = qlimit;
	qtype(cl->cl_q) = Q_DROPTAIL;
	qlen(cl->cl_q) = 0;
	qsize(cl->cl_q) = 0;
	cl->cl_flags = flags;
#ifdef ALTQ_RED
	if (flags & (HFCF_RED|HFCF_RIO)) {
		int	red_flags, red_pkttime;
		u_int	m2;

		/* size RED by the steepest long-term slope of the curves */
		m2 = 0;
		if (rsc != NULL && rsc->m2 > m2)
			m2 = rsc->m2;
		if (fsc != NULL && fsc->m2 > m2)
			m2 = fsc->m2;
		if (usc != NULL && usc->m2 > m2)
			m2 = usc->m2;

		red_flags = 0;
		if (flags & HFCF_ECN)
			red_flags |= REDF_ECN;
#ifdef ALTQ_RIO
		if (flags & HFCF_CLEARDSCP)
			red_flags |= RIOF_CLEARDSCP;
#endif
		if (m2 < 8)
			red_pkttime = 1000 * 1000 * 1000; /* 1 sec */
		else
			red_pkttime = (int64_t)hif->hif_ifq->altq_ifp->if_mtu
				* 1000 * 1000 * 1000 / (m2 / 8);
		if (flags & HFCF_RED) {
			cl->cl_red = red_alloc(0, 0,
			    qlimit(cl->cl_q) * 10/100,
			    qlimit(cl->cl_q) * 30/100,
			    red_flags, red_pkttime);
			if (cl->cl_red != NULL)
				qtype(cl->cl_q) = Q_RED;
		}
#ifdef ALTQ_RIO
		else {
			cl->cl_red = (red_t *)rio_alloc(0, NULL,
			    red_flags, red_pkttime);
			if (cl->cl_red != NULL)
				qtype(cl->cl_q) = Q_RIO;
		}
#endif
	}
#endif /* ALTQ_RED */
#ifdef ALTQ_CODEL
	if (flags & HFCF_CODEL) {
		cl->cl_codel = codel_alloc(5, 100, 0);
		if (cl->cl_codel != NULL)
			qtype(cl->cl_q) = Q_CODEL;
	}
#endif

	/* a curve with m1 == m2 == 0 means "not configured" */
	if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0)) {
		cl->cl_rsc = malloc(sizeof(struct internal_sc),
		    M_DEVBUF, M_NOWAIT);
		if (cl->cl_rsc == NULL)
			goto err_ret;
		sc2isc(rsc, cl->cl_rsc);
		rtsc_init(&cl->cl_deadline, cl->cl_rsc, 0, 0);
		rtsc_init(&cl->cl_eligible, cl->cl_rsc, 0, 0);
	}
	if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0)) {
		cl->cl_fsc = malloc(sizeof(struct internal_sc),
		    M_DEVBUF, M_NOWAIT);
		if (cl->cl_fsc == NULL)
			goto err_ret;
		sc2isc(fsc, cl->cl_fsc);
		rtsc_init(&cl->cl_virtual, cl->cl_fsc, 0, 0);
	}
	if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0)) {
		cl->cl_usc = malloc(sizeof(struct internal_sc),
		    M_DEVBUF, M_NOWAIT);
		if (cl->cl_usc == NULL)
			goto err_ret;
		sc2isc(usc, cl->cl_usc);
		rtsc_init(&cl->cl_ulimit, cl->cl_usc, 0, 0);
	}

	cl->cl_id = hif->hif_classid++;
	cl->cl_handle = qid;
	cl->cl_hif = hif;
	cl->cl_parent = parent;

	s = splnet();
	IFQ_LOCK(hif->hif_ifq);
	hif->hif_classes++;

	/*
	 * find a free slot in the class table.  if the slot matching
	 * the lower bits of qid is free, use this slot.  otherwise,
	 * use the first free slot.
	 */
	i = qid % HFSC_MAX_CLASSES;
	if (hif->hif_class_tbl[i] == NULL)
		hif->hif_class_tbl[i] = cl;
	else {
		for (i = 0; i < HFSC_MAX_CLASSES; i++)
			if (hif->hif_class_tbl[i] == NULL) {
				hif->hif_class_tbl[i] = cl;
				break;
			}
		if (i == HFSC_MAX_CLASSES) {
			IFQ_UNLOCK(hif->hif_ifq);
			splx(s);
			goto err_ret;
		}
	}

	if (flags & HFCF_DEFAULTCLASS)
		hif->hif_defaultclass = cl;

	if (parent == NULL) {
		/* this is root class */
		hif->hif_rootclass = cl;
	} else {
		/* add this class to the children list of the parent */
		if ((p = parent->cl_children) == NULL)
			parent->cl_children = cl;
		else {
			while (p->cl_siblings != NULL)
				p = p->cl_siblings;
			p->cl_siblings = cl;
		}
	}
	IFQ_UNLOCK(hif->hif_ifq);
	splx(s);

	return (cl);

 err_ret:
	if (cl->cl_red != NULL) {
#ifdef ALTQ_RIO
		if (q_is_rio(cl->cl_q))
			rio_destroy((rio_t *)cl->cl_red);
#endif
#ifdef ALTQ_RED
		if (q_is_red(cl->cl_q))
			red_destroy(cl->cl_red);
#endif
#ifdef ALTQ_CODEL
		if (q_is_codel(cl->cl_q))
			codel_destroy(cl->cl_codel);
#endif
	}
	if (cl->cl_fsc != NULL)
		free(cl->cl_fsc, M_DEVBUF);
	if (cl->cl_rsc != NULL)
		free(cl->cl_rsc, M_DEVBUF);
	if (cl->cl_usc != NULL)
		free(cl->cl_usc, M_DEVBUF);
	if (cl->cl_q != NULL)
		free(cl->cl_q, M_DEVBUF);
	free(cl, M_DEVBUF);
	return (NULL);
}

/*
 * Unlink a leaf class from the tree and release all of its resources.
 * Returns EBUSY if the class still has children.
 */
static int
hfsc_class_destroy(struct hfsc_class *cl)
{
	int i, s;

	if (cl == NULL)
		return (0);

	if (is_a_parent_class(cl))
		return (EBUSY);

	s = splnet();
	IFQ_LOCK(cl->cl_hif->hif_ifq);

#ifdef ALTQ3_COMPAT
	/* delete filters referencing to this class */
	acc_discard_filters(&cl->cl_hif->hif_classifier, cl, 0);
#endif /* ALTQ3_COMPAT */

	if (!qempty(cl->cl_q))
		hfsc_purgeq(cl);

	if (cl->cl_parent == NULL) {
		/* this is root class */
	} else {
		/* unlink cl from its parent's children list */
		struct hfsc_class *p = cl->cl_parent->cl_children;

		if (p == cl)
			cl->cl_parent->cl_children = cl->cl_siblings;
		else do {
			if (p->cl_siblings == cl) {
				p->cl_siblings = cl->cl_siblings;
				break;
			}
		} while ((p = p->cl_siblings) != NULL);
		ASSERT(p != NULL);
	}

	for (i = 0; i < HFSC_MAX_CLASSES; i++)
		if (cl->cl_hif->hif_class_tbl[i] == cl) {
			cl->cl_hif->hif_class_tbl[i] = NULL;
			break;
		}

	cl->cl_hif->hif_classes--;
	IFQ_UNLOCK(cl->cl_hif->hif_ifq);
	splx(s);

	if (cl->cl_red != NULL) {
#ifdef ALTQ_RIO
		if (q_is_rio(cl->cl_q))
			rio_destroy((rio_t *)cl->cl_red);
#endif
#ifdef ALTQ_RED
		if (q_is_red(cl->cl_q))
			red_destroy(cl->cl_red);
#endif
#ifdef ALTQ_CODEL
		if (q_is_codel(cl->cl_q))
			codel_destroy(cl->cl_codel);
#endif
	}

	IFQ_LOCK(cl->cl_hif->hif_ifq);
	if (cl == cl->cl_hif->hif_rootclass)
		cl->cl_hif->hif_rootclass = NULL;
	if (cl == cl->cl_hif->hif_defaultclass)
		cl->cl_hif->hif_defaultclass = NULL;
	IFQ_UNLOCK(cl->cl_hif->hif_ifq);

	if (cl->cl_usc != NULL)
		free(cl->cl_usc, M_DEVBUF);
	if (cl->cl_fsc != NULL)
		free(cl->cl_fsc, M_DEVBUF);
	if (cl->cl_rsc != NULL)
		free(cl->cl_rsc, M_DEVBUF);
	free(cl->cl_q, M_DEVBUF);
	free(cl, M_DEVBUF);

	return (0);
}

/*
 * hfsc_nextclass returns the next class in the tree.
 * usage:
 *	for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl))
 *		do_something;
 */
static struct hfsc_class *
hfsc_nextclass(struct hfsc_class *cl)
{
	/* pre-order walk: descend to a child, else a sibling, else climb */
	if (cl->cl_children != NULL)
		cl = cl->cl_children;
	else if (cl->cl_siblings != NULL)
		cl = cl->cl_siblings;
	else {
		while ((cl = cl->cl_parent) != NULL)
			if (cl->cl_siblings) {
				cl = cl->cl_siblings;
				break;
			}
	}

	return (cl);
}

/*
 * hfsc_enqueue is an enqueue function to be registered to
 * (*altq_enqueue) in struct ifaltq.
 */
static int
hfsc_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
{
	struct hfsc_if	*hif = (struct hfsc_if *)ifq->altq_disc;
	struct hfsc_class *cl;
	struct pf_mtag *t;
	int len;

	IFQ_LOCK_ASSERT(ifq);

	/* grab class set by classifier */
	if ((m->m_flags & M_PKTHDR) == 0) {
		/* should not happen */
		printf("altq: packet for %s does not have pkthdr\n",
		    ifq->altq_ifp->if_xname);
		m_freem(m);
		return (ENOBUFS);
	}
	cl = NULL;
	if ((t = pf_find_mtag(m)) != NULL)
		cl = clh_to_clp(hif, t->qid);
#ifdef ALTQ3_COMPAT
	else if ((ifq->altq_flags & ALTQF_CLASSIFY) && pktattr != NULL)
		cl = pktattr->pattr_class;
#endif
	/* unclassified or non-leaf traffic falls back to the default class */
	if (cl == NULL || is_a_parent_class(cl)) {
		cl = hif->hif_defaultclass;
		if (cl == NULL) {
			m_freem(m);
			return (ENOBUFS);
		}
	}
#ifdef ALTQ3_COMPAT
	if (pktattr != NULL)
		cl->cl_pktattr = pktattr;  /* save proto hdr used by ECN */
	else
#endif
		cl->cl_pktattr = NULL;
	len = m_pktlen(m);
	if (hfsc_addq(cl, m) != 0) {
		/* drop occurred.  mbuf was freed in hfsc_addq. */
		PKTCNTR_ADD(&cl->cl_stats.drop_cnt, len);
		return (ENOBUFS);
	}
	IFQ_INC_LEN(ifq);
	cl->cl_hif->hif_packets++;

	/* successfully queued. */
	if (qlen(cl->cl_q) == 1)
		set_active(cl, m_pktlen(m));

	return (0);
}

/*
 * hfsc_dequeue is a dequeue function to be registered to
 * (*altq_dequeue) in struct ifaltq.
 *
 * note: ALTDQ_POLL returns the next packet without removing the packet
 *	from the queue.  ALTDQ_REMOVE is a normal dequeue operation.
 *	ALTDQ_REMOVE must return the same packet if called immediately
 *	after ALTDQ_POLL.
 */
static struct mbuf *
hfsc_dequeue(struct ifaltq *ifq, int op)
{
	struct hfsc_if	*hif = (struct hfsc_if *)ifq->altq_disc;
	struct hfsc_class *cl;
	struct mbuf *m;
	int len, next_len;
	int realtime = 0;
	u_int64_t cur_time;

	IFQ_LOCK_ASSERT(ifq);

	if (hif->hif_packets == 0)
		/* no packet in the tree */
		return (NULL);

	cur_time = read_machclk();

	if (op == ALTDQ_REMOVE && hif->hif_pollcache != NULL) {
		/* honor the class selected by the preceding poll */
		cl = hif->hif_pollcache;
		hif->hif_pollcache = NULL;
		/* check if the class was scheduled by real-time criteria */
		if (cl->cl_rsc != NULL)
			realtime = (cl->cl_e <= cur_time);
	} else {
		/*
		 * if there are eligible classes, use real-time criteria.
		 * find the class with the minimum deadline among
		 * the eligible classes.
		 */
		if ((cl = hfsc_get_mindl(hif, cur_time))
		    != NULL) {
			realtime = 1;
		} else {
#ifdef ALTQ_DEBUG
			int fits = 0;
#endif
			/*
			 * use link-sharing criteria
			 * get the class with the minimum vt in the hierarchy
			 */
			cl = hif->hif_rootclass;
			while (is_a_parent_class(cl)) {

				cl = actlist_firstfit(cl, cur_time);
				if (cl == NULL) {
#ifdef ALTQ_DEBUG
					if (fits > 0)
						printf("%d fit but none found\n",fits);
#endif
					return (NULL);
				}
				/*
				 * update parent's cl_cvtmin.
				 * don't update if the new vt is smaller.
				 */
				if (cl->cl_parent->cl_cvtmin < cl->cl_vt)
					cl->cl_parent->cl_cvtmin = cl->cl_vt;
#ifdef ALTQ_DEBUG
				fits++;
#endif
			}
		}

		if (op == ALTDQ_POLL) {
			hif->hif_pollcache = cl;
			m = hfsc_pollq(cl);
			return (m);
		}
	}

	m = hfsc_getq(cl);
	if (m == NULL)
		panic("hfsc_dequeue:");
	len = m_pktlen(m);
	cl->cl_hif->hif_packets--;
	IFQ_DEC_LEN(ifq);
	PKTCNTR_ADD(&cl->cl_stats.xmit_cnt, len);

	update_vf(cl, len, cur_time);
	/* cl_cumul only advances for service received under realtime */
	if (realtime)
		cl->cl_cumul += len;

	if (!qempty(cl->cl_q)) {
		if (cl->cl_rsc != NULL) {
			/* update ed */
			next_len = m_pktlen(qhead(cl->cl_q));

			if (realtime)
				update_ed(cl, next_len);
			else
				update_d(cl, next_len);
		}
	} else {
		/* the class becomes passive */
		set_passive(cl);
	}

	return (m);
}

/* Enqueue m on cl's queue via the active AQM (RED/RIO/CoDel/tail-drop). */
static int
hfsc_addq(struct hfsc_class *cl, struct mbuf *m)
{

#ifdef ALTQ_RIO
	if (q_is_rio(cl->cl_q))
		return rio_addq((rio_t *)cl->cl_red, cl->cl_q,
		    m, cl->cl_pktattr);
#endif
#ifdef ALTQ_RED
	if (q_is_red(cl->cl_q))
		return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr);
#endif
#ifdef ALTQ_CODEL
	if (q_is_codel(cl->cl_q))
		return codel_addq(cl->cl_codel, cl->cl_q, m);
#endif
	if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) {
		/* tail drop */
		m_freem(m);
		return (-1);
	}

	if (cl->cl_flags & HFCF_CLEARDSCP)
		write_dsfield(m, cl->cl_pktattr, 0);

	_addq(cl->cl_q, m);

	return (0);
}

/* Remove and return the head packet of cl's queue via the active AQM. */
static struct mbuf *
hfsc_getq(struct hfsc_class *cl)
{
#ifdef ALTQ_RIO
	if (q_is_rio(cl->cl_q))
		return rio_getq((rio_t *)cl->cl_red, cl->cl_q);
#endif
#ifdef ALTQ_RED
	if (q_is_red(cl->cl_q))
		return red_getq(cl->cl_red, cl->cl_q);
#endif
#ifdef ALTQ_CODEL
	if (q_is_codel(cl->cl_q))
		return codel_getq(cl->cl_codel, cl->cl_q);
#endif
	return _getq(cl->cl_q);
}

/* Peek at the head packet without removing it. */
static struct mbuf *
hfsc_pollq(struct hfsc_class *cl)
{
	return qhead(cl->cl_q);
}

/* Drop every packet queued on cl, then make the class passive. */
static void
hfsc_purgeq(struct hfsc_class *cl)
{
	struct mbuf *m;

	if (qempty(cl->cl_q))
		return;

	while ((m = _getq(cl->cl_q)) != NULL) {
		PKTCNTR_ADD(&cl->cl_stats.drop_cnt, m_pktlen(m));
		m_freem(m);
		cl->cl_hif->hif_packets--;
		IFQ_DEC_LEN(cl->cl_hif->hif_ifq);
	}
	ASSERT(qlen(cl->cl_q) == 0);

	update_vf(cl, 0, 0);	/* remove cl from the actlist */
	set_passive(cl);
}

/* The class got its first packet: enter the eligible/active machinery. */
static void
set_active(struct hfsc_class *cl, int len)
{
	if (cl->cl_rsc != NULL)
		init_ed(cl, len);
	if (cl->cl_fsc != NULL)
		init_vf(cl, len);

	cl->cl_stats.period++;
}

/* The class drained its queue: leave the eligible list. */
static void
set_passive(struct hfsc_class *cl)
{
	if (cl->cl_rsc != NULL)
		ellist_remove(cl);

	/*
	 * actlist is now handled in update_vf() so that update_vf(cl, 0, 0)
	 * needs to be called explicitly to remove a class from actlist
	 */
}

/* Initialize eligible (e) and deadline (d) times for a newly active class. */
static void
init_ed(struct hfsc_class *cl, int next_len)
{
	u_int64_t cur_time;

	cur_time = read_machclk();

	/* update the deadline curve */
	rtsc_min(&cl->cl_deadline, cl->cl_rsc, cur_time, cl->cl_cumul);

	/*
	 * update the eligible curve.
	 * for concave, it is equal to the deadline curve.
	 * for convex, it is a linear curve with slope m2.
	 */
	cl->cl_eligible = cl->cl_deadline;
	if (cl->cl_rsc->sm1 <= cl->cl_rsc->sm2) {
		cl->cl_eligible.dx = 0;
		cl->cl_eligible.dy = 0;
	}

	/* compute e and d */
	cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);

	ellist_insert(cl);
}

/* Recompute e and d after a realtime dequeue and re-sort the eligible list. */
static void
update_ed(struct hfsc_class *cl, int next_len)
{
	cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);

	ellist_update(cl);
}

/* Recompute only the deadline after a link-sharing dequeue. */
static void
update_d(struct hfsc_class *cl, int next_len)
{
	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
}

/*
 * Initialize the virtual time (vt) and fit time (f) of cl and of every
 * ancestor that becomes active because of it.
 */
static void
init_vf(struct hfsc_class *cl, int len)
{
	struct hfsc_class *max_cl, *p;
	u_int64_t vt, f, cur_time;
	int go_active;

	cur_time = 0;
	go_active = 1;
	for ( ; cl->cl_parent != NULL; cl = cl->cl_parent) {

		/* only propagate activation while each level was idle */
		if (go_active && cl->cl_nactive++ == 0)
			go_active = 1;
		else
			go_active = 0;

		if (go_active) {
			max_cl = TAILQ_LAST(&cl->cl_parent->cl_actc, acthead);
			if (max_cl != NULL) {
				/*
				 * set vt to the average of the min and max
				 * classes.  if the parent's period didn't
				 * change, don't decrease vt of the class.
				 */
				vt = max_cl->cl_vt;
				if (cl->cl_parent->cl_cvtmin != 0)
					vt = (cl->cl_parent->cl_cvtmin + vt)/2;

				if (cl->cl_parent->cl_vtperiod !=
				    cl->cl_parentperiod || vt > cl->cl_vt)
					cl->cl_vt = vt;
			} else {
				/*
				 * first child for a new parent backlog period.
				 * add parent's cvtmax to vtoff of children
				 * to make a new vt (vtoff + vt) larger than
				 * the vt in the last period for all children.
				 */
				vt = cl->cl_parent->cl_cvtmax;
				for (p = cl->cl_parent->cl_children; p != NULL;
				     p = p->cl_siblings)
					p->cl_vtoff += vt;
				cl->cl_vt = 0;
				cl->cl_parent->cl_cvtmax = 0;
				cl->cl_parent->cl_cvtmin = 0;
			}
			cl->cl_initvt = cl->cl_vt;

			/* update the virtual curve */
			vt = cl->cl_vt + cl->cl_vtoff;
			rtsc_min(&cl->cl_virtual, cl->cl_fsc, vt, cl->cl_total);
			if (cl->cl_virtual.x == vt) {
				cl->cl_virtual.x -= cl->cl_vtoff;
				cl->cl_vtoff = 0;
			}
			cl->cl_vtadj = 0;

			cl->cl_vtperiod++;  /* increment vt period */
			cl->cl_parentperiod = cl->cl_parent->cl_vtperiod;
			if (cl->cl_parent->cl_nactive == 0)
				cl->cl_parentperiod++;
			cl->cl_f = 0;

			actlist_insert(cl);

			if (cl->cl_usc != NULL) {
				/* class has upper limit curve */
				if (cur_time == 0)
					cur_time = read_machclk();

				/* update the ulimit curve */
				rtsc_min(&cl->cl_ulimit, cl->cl_usc, cur_time,
				    cl->cl_total);
				/* compute myf */
				cl->cl_myf = rtsc_y2x(&cl->cl_ulimit,
				    cl->cl_total);
				cl->cl_myfadj = 0;
			}
		}

		/* cl_f is max(cl_myf, cl_cfmin) */
		if (cl->cl_myf > cl->cl_cfmin)
			f = cl->cl_myf;
		else
			f = cl->cl_cfmin;
		if (f != cl->cl_f) {
			cl->cl_f = f;
			update_cfmin(cl->cl_parent);
		}
	}
}

/*
 * Charge len bytes of service to cl and all its ancestors, updating
 * their virtual and fit times; deactivates levels that drained.
 */
static void
update_vf(struct hfsc_class *cl, int len, u_int64_t cur_time)
{
	u_int64_t f, myf_bound, delta;
	int go_passive;

	go_passive = qempty(cl->cl_q);

	for (; cl->cl_parent != NULL; cl = cl->cl_parent) {

		cl->cl_total += len;

		if (cl->cl_fsc == NULL || cl->cl_nactive == 0)
			continue;

		if (go_passive && --cl->cl_nactive == 0)
			go_passive = 1;
		else
			go_passive = 0;

		if (go_passive) {
			/* no more active child, going passive */

			/* update cvtmax of the parent class */
			if (cl->cl_vt > cl->cl_parent->cl_cvtmax)
				cl->cl_parent->cl_cvtmax = cl->cl_vt;

			/* remove this class from the vt list */
			actlist_remove(cl);

			update_cfmin(cl->cl_parent);

			continue;
		}

		/*
		 * update vt and f
		 */
		cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
		    - cl->cl_vtoff + cl->cl_vtadj;

		/*
		 * if vt of the class is smaller than cvtmin,
		 * the class was skipped in the past due to non-fit.
		 * if so, we need to adjust vtadj.
		 */
		if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
			cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
			cl->cl_vt = cl->cl_parent->cl_cvtmin;
		}

		/* update the vt list */
		actlist_update(cl);

		if (cl->cl_usc != NULL) {
			cl->cl_myf = cl->cl_myfadj +
			    rtsc_y2x(&cl->cl_ulimit, cl->cl_total);

			/*
			 * if myf lags behind by more than one clock tick
			 * from the current time, adjust myfadj to prevent
			 * a rate-limited class from going greedy.
			 * in a steady state under rate-limiting, myf
			 * fluctuates within one clock tick.
			 */
			myf_bound = cur_time - machclk_per_tick;
			if (cl->cl_myf < myf_bound) {
				delta = cur_time - cl->cl_myf;
				cl->cl_myfadj += delta;
				cl->cl_myf += delta;
			}
		}

		/* cl_f is max(cl_myf, cl_cfmin) */
		if (cl->cl_myf > cl->cl_cfmin)
			f = cl->cl_myf;
		else
			f = cl->cl_cfmin;
		if (f != cl->cl_f) {
			cl->cl_f = f;
			update_cfmin(cl->cl_parent);
		}
	}
}

/* Recompute the parent's cached minimum fit-time over its active children. */
static void
update_cfmin(struct hfsc_class *cl)
{
	struct hfsc_class *p;
	u_int64_t cfmin;

	if (TAILQ_EMPTY(&cl->cl_actc)) {
		cl->cl_cfmin = 0;
		return;
	}
	cfmin = HT_INFINITY;
	TAILQ_FOREACH(p, &cl->cl_actc, cl_actlist) {
		/* f == 0 means "fits now"; the minimum cannot be lower */
		if (p->cl_f == 0) {
			cl->cl_cfmin = 0;
			return;
		}
		if (p->cl_f < cfmin)
			cfmin = p->cl_f;
	}
	cl->cl_cfmin = cfmin;
}

/*
 * TAILQ based ellist and actlist implementation
 * (ion wanted to make a calendar queue based implementation)
 */
/*
 * eligible list holds backlogged classes being sorted by their eligible times.
 * there is one eligible list per interface.
+ */ + +static void +ellist_insert(struct hfsc_class *cl) +{ + struct hfsc_if *hif = cl->cl_hif; + struct hfsc_class *p; + + /* check the last entry first */ + if ((p = TAILQ_LAST(&hif->hif_eligible, elighead)) == NULL || + p->cl_e <= cl->cl_e) { + TAILQ_INSERT_TAIL(&hif->hif_eligible, cl, cl_ellist); + return; + } + + TAILQ_FOREACH(p, &hif->hif_eligible, cl_ellist) { + if (cl->cl_e < p->cl_e) { + TAILQ_INSERT_BEFORE(p, cl, cl_ellist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +static void +ellist_remove(struct hfsc_class *cl) +{ + struct hfsc_if *hif = cl->cl_hif; + + TAILQ_REMOVE(&hif->hif_eligible, cl, cl_ellist); +} + +static void +ellist_update(struct hfsc_class *cl) +{ + struct hfsc_if *hif = cl->cl_hif; + struct hfsc_class *p, *last; + + /* + * the eligible time of a class increases monotonically. + * if the next entry has a larger eligible time, nothing to do. + */ + p = TAILQ_NEXT(cl, cl_ellist); + if (p == NULL || cl->cl_e <= p->cl_e) + return; + + /* check the last entry */ + last = TAILQ_LAST(&hif->hif_eligible, elighead); + ASSERT(last != NULL); + if (last->cl_e <= cl->cl_e) { + TAILQ_REMOVE(&hif->hif_eligible, cl, cl_ellist); + TAILQ_INSERT_TAIL(&hif->hif_eligible, cl, cl_ellist); + return; + } + + /* + * the new position must be between the next entry + * and the last entry + */ + while ((p = TAILQ_NEXT(p, cl_ellist)) != NULL) { + if (cl->cl_e < p->cl_e) { + TAILQ_REMOVE(&hif->hif_eligible, cl, cl_ellist); + TAILQ_INSERT_BEFORE(p, cl, cl_ellist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +/* find the class with the minimum deadline among the eligible classes */ +struct hfsc_class * +hfsc_get_mindl(struct hfsc_if *hif, u_int64_t cur_time) +{ + struct hfsc_class *p, *cl = NULL; + + TAILQ_FOREACH(p, &hif->hif_eligible, cl_ellist) { + if (p->cl_e > cur_time) + break; + if (cl == NULL || p->cl_d < cl->cl_d) + cl = p; + } + return (cl); +} + +/* + * active children list holds backlogged child classes being sorted 
+ * by their virtual time. + * each intermediate class has one active children list. + */ + +static void +actlist_insert(struct hfsc_class *cl) +{ + struct hfsc_class *p; + + /* check the last entry first */ + if ((p = TAILQ_LAST(&cl->cl_parent->cl_actc, acthead)) == NULL + || p->cl_vt <= cl->cl_vt) { + TAILQ_INSERT_TAIL(&cl->cl_parent->cl_actc, cl, cl_actlist); + return; + } + + TAILQ_FOREACH(p, &cl->cl_parent->cl_actc, cl_actlist) { + if (cl->cl_vt < p->cl_vt) { + TAILQ_INSERT_BEFORE(p, cl, cl_actlist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +static void +actlist_remove(struct hfsc_class *cl) +{ + TAILQ_REMOVE(&cl->cl_parent->cl_actc, cl, cl_actlist); +} + +static void +actlist_update(struct hfsc_class *cl) +{ + struct hfsc_class *p, *last; + + /* + * the virtual time of a class increases monotonically during its + * backlogged period. + * if the next entry has a larger virtual time, nothing to do. + */ + p = TAILQ_NEXT(cl, cl_actlist); + if (p == NULL || cl->cl_vt < p->cl_vt) + return; + + /* check the last entry */ + last = TAILQ_LAST(&cl->cl_parent->cl_actc, acthead); + ASSERT(last != NULL); + if (last->cl_vt <= cl->cl_vt) { + TAILQ_REMOVE(&cl->cl_parent->cl_actc, cl, cl_actlist); + TAILQ_INSERT_TAIL(&cl->cl_parent->cl_actc, cl, cl_actlist); + return; + } + + /* + * the new position must be between the next entry + * and the last entry + */ + while ((p = TAILQ_NEXT(p, cl_actlist)) != NULL) { + if (cl->cl_vt < p->cl_vt) { + TAILQ_REMOVE(&cl->cl_parent->cl_actc, cl, cl_actlist); + TAILQ_INSERT_BEFORE(p, cl, cl_actlist); + return; + } + } + ASSERT(0); /* should not reach here */ +} + +static struct hfsc_class * +actlist_firstfit(struct hfsc_class *cl, u_int64_t cur_time) +{ + struct hfsc_class *p; + + TAILQ_FOREACH(p, &cl->cl_actc, cl_actlist) { + if (p->cl_f <= cur_time) + return (p); + } + return (NULL); +} + +/* + * service curve support functions + * + * external service curve parameters + * m: bits/sec + * d: msec + * internal 
service curve parameters + * sm: (bytes/tsc_interval) << SM_SHIFT + * ism: (tsc_count/byte) << ISM_SHIFT + * dx: tsc_count + * + * SM_SHIFT and ISM_SHIFT are scaled in order to keep effective digits. + * we should be able to handle 100K-1Gbps linkspeed with 200Hz-1GHz CPU + * speed. SM_SHIFT and ISM_SHIFT are selected to have at least 3 effective + * digits in decimal using the following table. + * + * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps + * ----------+------------------------------------------------------- + * bytes/nsec 12.5e-6 125e-6 1250e-6 12500e-6 125000e-6 + * sm(500MHz) 25.0e-6 250e-6 2500e-6 25000e-6 250000e-6 + * sm(200MHz) 62.5e-6 625e-6 6250e-6 62500e-6 625000e-6 + * + * nsec/byte 80000 8000 800 80 8 + * ism(500MHz) 40000 4000 400 40 4 + * ism(200MHz) 16000 1600 160 16 1.6 + */ +#define SM_SHIFT 24 +#define ISM_SHIFT 10 + +#define SM_MASK ((1LL << SM_SHIFT) - 1) +#define ISM_MASK ((1LL << ISM_SHIFT) - 1) + +static __inline u_int64_t +seg_x2y(u_int64_t x, u_int64_t sm) +{ + u_int64_t y; + + /* + * compute + * y = x * sm >> SM_SHIFT + * but divide it for the upper and lower bits to avoid overflow + */ + y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT); + return (y); +} + +static __inline u_int64_t +seg_y2x(u_int64_t y, u_int64_t ism) +{ + u_int64_t x; + + if (y == 0) + x = 0; + else if (ism == HT_INFINITY) + x = HT_INFINITY; + else { + x = (y >> ISM_SHIFT) * ism + + (((y & ISM_MASK) * ism) >> ISM_SHIFT); + } + return (x); +} + +static __inline u_int64_t +m2sm(u_int m) +{ + u_int64_t sm; + + sm = ((u_int64_t)m << SM_SHIFT) / 8 / machclk_freq; + return (sm); +} + +static __inline u_int64_t +m2ism(u_int m) +{ + u_int64_t ism; + + if (m == 0) + ism = HT_INFINITY; + else + ism = ((u_int64_t)machclk_freq << ISM_SHIFT) * 8 / m; + return (ism); +} + +static __inline u_int64_t +d2dx(u_int d) +{ + u_int64_t dx; + + dx = ((u_int64_t)d * machclk_freq) / 1000; + return (dx); +} + +static u_int +sm2m(u_int64_t sm) +{ + u_int64_t m; + + m = (sm * 8 
* machclk_freq) >> SM_SHIFT; + return ((u_int)m); +} + +static u_int +dx2d(u_int64_t dx) +{ + u_int64_t d; + + d = dx * 1000 / machclk_freq; + return ((u_int)d); +} + +static void +sc2isc(struct service_curve *sc, struct internal_sc *isc) +{ + isc->sm1 = m2sm(sc->m1); + isc->ism1 = m2ism(sc->m1); + isc->dx = d2dx(sc->d); + isc->dy = seg_x2y(isc->dx, isc->sm1); + isc->sm2 = m2sm(sc->m2); + isc->ism2 = m2ism(sc->m2); +} + +/* + * initialize the runtime service curve with the given internal + * service curve starting at (x, y). + */ +static void +rtsc_init(struct runtime_sc *rtsc, struct internal_sc * isc, u_int64_t x, + u_int64_t y) +{ + rtsc->x = x; + rtsc->y = y; + rtsc->sm1 = isc->sm1; + rtsc->ism1 = isc->ism1; + rtsc->dx = isc->dx; + rtsc->dy = isc->dy; + rtsc->sm2 = isc->sm2; + rtsc->ism2 = isc->ism2; +} + +/* + * calculate the y-projection of the runtime service curve by the + * given x-projection value + */ +static u_int64_t +rtsc_y2x(struct runtime_sc *rtsc, u_int64_t y) +{ + u_int64_t x; + + if (y < rtsc->y) + x = rtsc->x; + else if (y <= rtsc->y + rtsc->dy) { + /* x belongs to the 1st segment */ + if (rtsc->dy == 0) + x = rtsc->x + rtsc->dx; + else + x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1); + } else { + /* x belongs to the 2nd segment */ + x = rtsc->x + rtsc->dx + + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2); + } + return (x); +} + +static u_int64_t +rtsc_x2y(struct runtime_sc *rtsc, u_int64_t x) +{ + u_int64_t y; + + if (x <= rtsc->x) + y = rtsc->y; + else if (x <= rtsc->x + rtsc->dx) + /* y belongs to the 1st segment */ + y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1); + else + /* y belongs to the 2nd segment */ + y = rtsc->y + rtsc->dy + + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2); + return (y); +} + +/* + * update the runtime service curve by taking the minimum of the current + * runtime service curve and the service curve starting at (x, y). 
+ */ +static void +rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u_int64_t x, + u_int64_t y) +{ + u_int64_t y1, y2, dx, dy; + + if (isc->sm1 <= isc->sm2) { + /* service curve is convex */ + y1 = rtsc_x2y(rtsc, x); + if (y1 < y) + /* the current rtsc is smaller */ + return; + rtsc->x = x; + rtsc->y = y; + return; + } + + /* + * service curve is concave + * compute the two y values of the current rtsc + * y1: at x + * y2: at (x + dx) + */ + y1 = rtsc_x2y(rtsc, x); + if (y1 <= y) { + /* rtsc is below isc, no change to rtsc */ + return; + } + + y2 = rtsc_x2y(rtsc, x + isc->dx); + if (y2 >= y + isc->dy) { + /* rtsc is above isc, replace rtsc by isc */ + rtsc->x = x; + rtsc->y = y; + rtsc->dx = isc->dx; + rtsc->dy = isc->dy; + return; + } + + /* + * the two curves intersect + * compute the offsets (dx, dy) using the reverse + * function of seg_x2y() + * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y) + */ + dx = ((y1 - y) << SM_SHIFT) / (isc->sm1 - isc->sm2); + /* + * check if (x, y1) belongs to the 1st segment of rtsc. + * if so, add the offset. 
+ */ + if (rtsc->x + rtsc->dx > x) + dx += rtsc->x + rtsc->dx - x; + dy = seg_x2y(dx, isc->sm1); + + rtsc->x = x; + rtsc->y = y; + rtsc->dx = dx; + rtsc->dy = dy; + return; +} + +static void +get_class_stats(struct hfsc_classstats *sp, struct hfsc_class *cl) +{ + sp->class_id = cl->cl_id; + sp->class_handle = cl->cl_handle; + + if (cl->cl_rsc != NULL) { + sp->rsc.m1 = sm2m(cl->cl_rsc->sm1); + sp->rsc.d = dx2d(cl->cl_rsc->dx); + sp->rsc.m2 = sm2m(cl->cl_rsc->sm2); + } else { + sp->rsc.m1 = 0; + sp->rsc.d = 0; + sp->rsc.m2 = 0; + } + if (cl->cl_fsc != NULL) { + sp->fsc.m1 = sm2m(cl->cl_fsc->sm1); + sp->fsc.d = dx2d(cl->cl_fsc->dx); + sp->fsc.m2 = sm2m(cl->cl_fsc->sm2); + } else { + sp->fsc.m1 = 0; + sp->fsc.d = 0; + sp->fsc.m2 = 0; + } + if (cl->cl_usc != NULL) { + sp->usc.m1 = sm2m(cl->cl_usc->sm1); + sp->usc.d = dx2d(cl->cl_usc->dx); + sp->usc.m2 = sm2m(cl->cl_usc->sm2); + } else { + sp->usc.m1 = 0; + sp->usc.d = 0; + sp->usc.m2 = 0; + } + + sp->total = cl->cl_total; + sp->cumul = cl->cl_cumul; + + sp->d = cl->cl_d; + sp->e = cl->cl_e; + sp->vt = cl->cl_vt; + sp->f = cl->cl_f; + + sp->initvt = cl->cl_initvt; + sp->vtperiod = cl->cl_vtperiod; + sp->parentperiod = cl->cl_parentperiod; + sp->nactive = cl->cl_nactive; + sp->vtoff = cl->cl_vtoff; + sp->cvtmax = cl->cl_cvtmax; + sp->myf = cl->cl_myf; + sp->cfmin = cl->cl_cfmin; + sp->cvtmin = cl->cl_cvtmin; + sp->myfadj = cl->cl_myfadj; + sp->vtadj = cl->cl_vtadj; + + sp->cur_time = read_machclk(); + sp->machclk_freq = machclk_freq; + + sp->qlength = qlen(cl->cl_q); + sp->qlimit = qlimit(cl->cl_q); + sp->xmit_cnt = cl->cl_stats.xmit_cnt; + sp->drop_cnt = cl->cl_stats.drop_cnt; + sp->period = cl->cl_stats.period; + + sp->qtype = qtype(cl->cl_q); +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->cl_q)) + codel_getstats(cl->cl_codel, 
&sp->codel); +#endif +} + +/* convert a class handle to the corresponding class pointer */ +static struct hfsc_class * +clh_to_clp(struct hfsc_if *hif, u_int32_t chandle) +{ + int i; + struct hfsc_class *cl; + + if (chandle == 0) + return (NULL); + /* + * first, try optimistically the slot matching the lower bits of + * the handle. if it fails, do the linear table search. + */ + i = chandle % HFSC_MAX_CLASSES; + if ((cl = hif->hif_class_tbl[i]) != NULL && cl->cl_handle == chandle) + return (cl); + for (i = 0; i < HFSC_MAX_CLASSES; i++) + if ((cl = hif->hif_class_tbl[i]) != NULL && + cl->cl_handle == chandle) + return (cl); + return (NULL); +} + +#ifdef ALTQ3_COMPAT +static struct hfsc_if * +hfsc_attach(ifq, bandwidth) + struct ifaltq *ifq; + u_int bandwidth; +{ + struct hfsc_if *hif; + + hif = malloc(sizeof(struct hfsc_if), M_DEVBUF, M_WAITOK); + if (hif == NULL) + return (NULL); + bzero(hif, sizeof(struct hfsc_if)); + + hif->hif_eligible = ellist_alloc(); + if (hif->hif_eligible == NULL) { + free(hif, M_DEVBUF); + return NULL; + } + + hif->hif_ifq = ifq; + + /* add this state to the hfsc list */ + hif->hif_next = hif_list; + hif_list = hif; + + return (hif); +} + +static int +hfsc_detach(hif) + struct hfsc_if *hif; +{ + (void)hfsc_clear_interface(hif); + (void)hfsc_class_destroy(hif->hif_rootclass); + + /* remove this interface from the hif list */ + if (hif_list == hif) + hif_list = hif->hif_next; + else { + struct hfsc_if *h; + + for (h = hif_list; h != NULL; h = h->hif_next) + if (h->hif_next == hif) { + h->hif_next = hif->hif_next; + break; + } + ASSERT(h != NULL); + } + + ellist_destroy(hif->hif_eligible); + + free(hif, M_DEVBUF); + + return (0); +} + +static int +hfsc_class_modify(cl, rsc, fsc, usc) + struct hfsc_class *cl; + struct service_curve *rsc, *fsc, *usc; +{ + struct internal_sc *rsc_tmp, *fsc_tmp, *usc_tmp; + u_int64_t cur_time; + int s; + + rsc_tmp = fsc_tmp = usc_tmp = NULL; + if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0) && + cl->cl_rsc == 
NULL) { + rsc_tmp = malloc(sizeof(struct internal_sc), + M_DEVBUF, M_WAITOK); + if (rsc_tmp == NULL) + return (ENOMEM); + } + if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0) && + cl->cl_fsc == NULL) { + fsc_tmp = malloc(sizeof(struct internal_sc), + M_DEVBUF, M_WAITOK); + if (fsc_tmp == NULL) { + free(rsc_tmp); + return (ENOMEM); + } + } + if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0) && + cl->cl_usc == NULL) { + usc_tmp = malloc(sizeof(struct internal_sc), + M_DEVBUF, M_WAITOK); + if (usc_tmp == NULL) { + free(rsc_tmp); + free(fsc_tmp); + return (ENOMEM); + } + } + + cur_time = read_machclk(); + s = splnet(); + IFQ_LOCK(cl->cl_hif->hif_ifq); + + if (rsc != NULL) { + if (rsc->m1 == 0 && rsc->m2 == 0) { + if (cl->cl_rsc != NULL) { + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + free(cl->cl_rsc, M_DEVBUF); + cl->cl_rsc = NULL; + } + } else { + if (cl->cl_rsc == NULL) + cl->cl_rsc = rsc_tmp; + sc2isc(rsc, cl->cl_rsc); + rtsc_init(&cl->cl_deadline, cl->cl_rsc, cur_time, + cl->cl_cumul); + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc->sm1 <= cl->cl_rsc->sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + } + } + + if (fsc != NULL) { + if (fsc->m1 == 0 && fsc->m2 == 0) { + if (cl->cl_fsc != NULL) { + if (!qempty(cl->cl_q)) + hfsc_purgeq(cl); + free(cl->cl_fsc, M_DEVBUF); + cl->cl_fsc = NULL; + } + } else { + if (cl->cl_fsc == NULL) + cl->cl_fsc = fsc_tmp; + sc2isc(fsc, cl->cl_fsc); + rtsc_init(&cl->cl_virtual, cl->cl_fsc, cl->cl_vt, + cl->cl_total); + } + } + + if (usc != NULL) { + if (usc->m1 == 0 && usc->m2 == 0) { + if (cl->cl_usc != NULL) { + free(cl->cl_usc, M_DEVBUF); + cl->cl_usc = NULL; + cl->cl_myf = 0; + } + } else { + if (cl->cl_usc == NULL) + cl->cl_usc = usc_tmp; + sc2isc(usc, cl->cl_usc); + rtsc_init(&cl->cl_ulimit, cl->cl_usc, cur_time, + cl->cl_total); + } + } + + if (!qempty(cl->cl_q)) { + if (cl->cl_rsc != NULL) + update_ed(cl, m_pktlen(qhead(cl->cl_q))); + if (cl->cl_fsc != NULL) + update_vf(cl, 0, cur_time); + /* is this 
enough? */ + } + + IFQ_UNLOCK(cl->cl_hif->hif_ifq); + splx(s); + + return (0); +} + +/* + * hfsc device interface + */ +int +hfscopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + if (machclk_freq == 0) + init_machclk(); + + if (machclk_freq == 0) { + printf("hfsc: no cpu clock available!\n"); + return (ENXIO); + } + + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +hfscclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct hfsc_if *hif; + int err, error = 0; + + while ((hif = hif_list) != NULL) { + /* destroy all */ + if (ALTQ_IS_ENABLED(hif->hif_ifq)) + altq_disable(hif->hif_ifq); + + err = altq_detach(hif->hif_ifq); + if (err == 0) + err = hfsc_detach(hif); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +hfscioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct hfsc_if *hif; + struct hfsc_interface *ifacep; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case HFSC_GETSTATS: + break; + default: +#if (__FreeBSD_version > 700000) + if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) + return (error); +#elsif (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + + case HFSC_IF_ATTACH: + error = hfsccmd_if_attach((struct hfsc_attach *)addr); + break; + + case HFSC_IF_DETACH: + error = hfsccmd_if_detach((struct hfsc_interface *)addr); + break; + + case HFSC_ENABLE: + case HFSC_DISABLE: + case HFSC_CLEAR_HIERARCHY: + ifacep = (struct hfsc_interface *)addr; + if ((hif = altq_lookup(ifacep->hfsc_ifname, + 
ALTQT_HFSC)) == NULL) { + error = EBADF; + break; + } + + switch (cmd) { + + case HFSC_ENABLE: + if (hif->hif_defaultclass == NULL) { +#ifdef ALTQ_DEBUG + printf("hfsc: no default class\n"); +#endif + error = EINVAL; + break; + } + error = altq_enable(hif->hif_ifq); + break; + + case HFSC_DISABLE: + error = altq_disable(hif->hif_ifq); + break; + + case HFSC_CLEAR_HIERARCHY: + hfsc_clear_interface(hif); + break; + } + break; + + case HFSC_ADD_CLASS: + error = hfsccmd_add_class((struct hfsc_add_class *)addr); + break; + + case HFSC_DEL_CLASS: + error = hfsccmd_delete_class((struct hfsc_delete_class *)addr); + break; + + case HFSC_MOD_CLASS: + error = hfsccmd_modify_class((struct hfsc_modify_class *)addr); + break; + + case HFSC_ADD_FILTER: + error = hfsccmd_add_filter((struct hfsc_add_filter *)addr); + break; + + case HFSC_DEL_FILTER: + error = hfsccmd_delete_filter((struct hfsc_delete_filter *)addr); + break; + + case HFSC_GETSTATS: + error = hfsccmd_class_stats((struct hfsc_class_stats *)addr); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int +hfsccmd_if_attach(ap) + struct hfsc_attach *ap; +{ + struct hfsc_if *hif; + struct ifnet *ifp; + int error; + + if ((ifp = ifunit(ap->iface.hfsc_ifname)) == NULL) + return (ENXIO); + + if ((hif = hfsc_attach(&ifp->if_snd, ap->bandwidth)) == NULL) + return (ENOMEM); + + /* + * set HFSC to this ifnet structure. 
+ */ + if ((error = altq_attach(&ifp->if_snd, ALTQT_HFSC, hif, + hfsc_enqueue, hfsc_dequeue, hfsc_request, + &hif->hif_classifier, acc_classify)) != 0) + (void)hfsc_detach(hif); + + return (error); +} + +static int +hfsccmd_if_detach(ap) + struct hfsc_interface *ap; +{ + struct hfsc_if *hif; + int error; + + if ((hif = altq_lookup(ap->hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if (ALTQ_IS_ENABLED(hif->hif_ifq)) + altq_disable(hif->hif_ifq); + + if ((error = altq_detach(hif->hif_ifq))) + return (error); + + return hfsc_detach(hif); +} + +static int +hfsccmd_add_class(ap) + struct hfsc_add_class *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl, *parent; + int i; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if (ap->parent_handle == HFSC_NULLCLASS_HANDLE && + hif->hif_rootclass == NULL) + parent = NULL; + else if ((parent = clh_to_clp(hif, ap->parent_handle)) == NULL) + return (EINVAL); + + /* assign a class handle (use a free slot number for now) */ + for (i = 1; i < HFSC_MAX_CLASSES; i++) + if (hif->hif_class_tbl[i] == NULL) + break; + if (i == HFSC_MAX_CLASSES) + return (EBUSY); + + if ((cl = hfsc_class_create(hif, &ap->service_curve, NULL, NULL, + parent, ap->qlimit, ap->flags, i)) == NULL) + return (ENOMEM); + + /* return a class handle to the user */ + ap->class_handle = i; + + return (0); +} + +static int +hfsccmd_delete_class(ap) + struct hfsc_delete_class *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) + return (EINVAL); + + return hfsc_class_destroy(cl); +} + +static int +hfsccmd_modify_class(ap) + struct hfsc_modify_class *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + struct service_curve *rsc = NULL; + struct service_curve *fsc = NULL; + struct service_curve *usc = NULL; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, 
ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) + return (EINVAL); + + if (ap->sctype & HFSC_REALTIMESC) + rsc = &ap->service_curve; + if (ap->sctype & HFSC_LINKSHARINGSC) + fsc = &ap->service_curve; + if (ap->sctype & HFSC_UPPERLIMITSC) + usc = &ap->service_curve; + + return hfsc_class_modify(cl, rsc, fsc, usc); +} + +static int +hfsccmd_add_filter(ap) + struct hfsc_add_filter *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) + return (EINVAL); + + if (is_a_parent_class(cl)) { +#ifdef ALTQ_DEBUG + printf("hfsccmd_add_filter: not a leaf class!\n"); +#endif + return (EINVAL); + } + + return acc_add_filter(&hif->hif_classifier, &ap->filter, + cl, &ap->filter_handle); +} + +static int +hfsccmd_delete_filter(ap) + struct hfsc_delete_filter *ap; +{ + struct hfsc_if *hif; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + return acc_delete_filter(&hif->hif_classifier, + ap->filter_handle); +} + +static int +hfsccmd_class_stats(ap) + struct hfsc_class_stats *ap; +{ + struct hfsc_if *hif; + struct hfsc_class *cl; + struct hfsc_classstats stats, *usp; + int n, nclasses, error; + + if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) + return (EBADF); + + ap->cur_time = read_machclk(); + ap->machclk_freq = machclk_freq; + ap->hif_classes = hif->hif_classes; + ap->hif_packets = hif->hif_packets; + + /* skip the first N classes in the tree */ + nclasses = ap->nskip; + for (cl = hif->hif_rootclass, n = 0; cl != NULL && n < nclasses; + cl = hfsc_nextclass(cl), n++) + ; + if (n != nclasses) + return (EINVAL); + + /* then, read the next N classes in the tree */ + nclasses = ap->nclasses; + usp = ap->stats; + for (n = 0; cl != NULL && n < nclasses; cl = hfsc_nextclass(cl), n++) { + + get_class_stats(&stats, cl); + + if 
((error = copyout((caddr_t)&stats, (caddr_t)usp++, + sizeof(stats))) != 0) + return (error); + } + + ap->nclasses = n; + + return (0); +} + +#ifdef KLD_MODULE + +static struct altqsw hfsc_sw = + {"hfsc", hfscopen, hfscclose, hfscioctl}; + +ALTQ_MODULE(altq_hfsc, ALTQT_HFSC, &hfsc_sw); +MODULE_DEPEND(altq_hfsc, altq_red, 1, 1, 1); +MODULE_DEPEND(altq_hfsc, altq_rio, 1, 1, 1); + +#endif /* KLD_MODULE */ +#endif /* ALTQ3_COMPAT */ + +#endif /* ALTQ_HFSC */ diff --git a/freebsd/sys/net/altq/altq_hfsc.h b/freebsd/sys/net/altq/altq_hfsc.h new file mode 100644 index 00000000..de5e89b8 --- /dev/null +++ b/freebsd/sys/net/altq/altq_hfsc.h @@ -0,0 +1,319 @@ +/*- + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + * + * $KAME: altq_hfsc.h,v 1.12 2003/12/05 05:40:46 kjc Exp $ + * $FreeBSD$ + */ +#ifndef _ALTQ_ALTQ_HFSC_H_ +#define _ALTQ_ALTQ_HFSC_H_ + +#include <net/altq/altq.h> +#include <net/altq/altq_classq.h> +#include <net/altq/altq_codel.h> +#include <net/altq/altq_red.h> +#include <net/altq/altq_rio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct service_curve { + u_int m1; /* slope of the first segment in bits/sec */ + u_int d; /* the x-projection of the first segment in msec */ + u_int m2; /* slope of the second segment in bits/sec */ +}; + +/* special class handles */ +#define HFSC_NULLCLASS_HANDLE 0 +#define HFSC_MAX_CLASSES 64 + +/* hfsc class flags */ +#define HFCF_RED 0x0001 /* use RED */ +#define HFCF_ECN 0x0002 /* use RED/ECN */ +#define HFCF_RIO 0x0004 /* use RIO */ +#define HFCF_CODEL 0x0008 /* use CoDel */ +#define HFCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define HFCF_DEFAULTCLASS 0x1000 /* default class */ + +/* service curve types */ +#define HFSC_REALTIMESC 1 +#define HFSC_LINKSHARINGSC 2 +#define HFSC_UPPERLIMITSC 4 +#define HFSC_DEFAULTSC (HFSC_REALTIMESC|HFSC_LINKSHARINGSC) + +struct hfsc_classstats { + u_int class_id; + u_int32_t class_handle; + struct service_curve rsc; + struct service_curve fsc; + struct service_curve usc; /* upper limit service curve */ + + u_int64_t total; /* total work in bytes */ + u_int64_t cumul; /* cumulative work in bytes + done by real-time criteria */ + u_int64_t d; /* deadline */ + u_int64_t e; /* eligible time */ + u_int64_t vt; /* virtual time */ + u_int64_t f; /* fit time for upper-limit */ + + /* info helpful for debugging */ + u_int64_t initvt; /* init virtual time */ + u_int64_t vtoff; /* cl_vt_ipoff */ + u_int64_t cvtmax; /* cl_maxvt */ + u_int64_t myf; 
/* cl_myf */ + u_int64_t cfmin; /* cl_mincf */ + u_int64_t cvtmin; /* cl_mincvt */ + u_int64_t myfadj; /* cl_myfadj */ + u_int64_t vtadj; /* cl_vtadj */ + u_int64_t cur_time; + u_int32_t machclk_freq; + + u_int qlength; + u_int qlimit; + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int period; + + u_int vtperiod; /* vt period sequence no */ + u_int parentperiod; /* parent's vt period seqno */ + int nactive; /* number of active children */ + + /* codel, red and rio related info */ + int qtype; + struct redstats red[3]; + struct codel_stats codel; +}; + +#ifdef ALTQ3_COMPAT +struct hfsc_interface { + char hfsc_ifname[IFNAMSIZ]; /* interface name (e.g., fxp0) */ +}; + +struct hfsc_attach { + struct hfsc_interface iface; + u_int bandwidth; /* link bandwidth in bits/sec */ +}; + +struct hfsc_add_class { + struct hfsc_interface iface; + u_int32_t parent_handle; + struct service_curve service_curve; + int qlimit; + int flags; + + u_int32_t class_handle; /* return value */ +}; + +struct hfsc_delete_class { + struct hfsc_interface iface; + u_int32_t class_handle; +}; + +struct hfsc_modify_class { + struct hfsc_interface iface; + u_int32_t class_handle; + struct service_curve service_curve; + int sctype; +}; + +struct hfsc_add_filter { + struct hfsc_interface iface; + u_int32_t class_handle; + struct flow_filter filter; + + u_long filter_handle; /* return value */ +}; + +struct hfsc_delete_filter { + struct hfsc_interface iface; + u_long filter_handle; +}; + +struct hfsc_class_stats { + struct hfsc_interface iface; + int nskip; /* skip # of classes */ + int nclasses; /* # of class stats (WR) */ + u_int64_t cur_time; /* current time */ + u_int32_t machclk_freq; /* machine clock frequency */ + u_int hif_classes; /* # of classes in the tree */ + u_int hif_packets; /* # of packets in the tree */ + struct hfsc_classstats *stats; /* pointer to stats array */ +}; + +#define HFSC_IF_ATTACH _IOW('Q', 1, struct hfsc_attach) +#define HFSC_IF_DETACH _IOW('Q', 2, struct 
hfsc_interface) +#define HFSC_ENABLE _IOW('Q', 3, struct hfsc_interface) +#define HFSC_DISABLE _IOW('Q', 4, struct hfsc_interface) +#define HFSC_CLEAR_HIERARCHY _IOW('Q', 5, struct hfsc_interface) +#define HFSC_ADD_CLASS _IOWR('Q', 7, struct hfsc_add_class) +#define HFSC_DEL_CLASS _IOW('Q', 8, struct hfsc_delete_class) +#define HFSC_MOD_CLASS _IOW('Q', 9, struct hfsc_modify_class) +#define HFSC_ADD_FILTER _IOWR('Q', 10, struct hfsc_add_filter) +#define HFSC_DEL_FILTER _IOW('Q', 11, struct hfsc_delete_filter) +#define HFSC_GETSTATS _IOWR('Q', 12, struct hfsc_class_stats) +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL +/* + * kernel internal service curve representation + * coordinates are given by 64 bit unsigned integers. + * x-axis: unit is clock count. for the intel x86 architecture, + * the raw Pentium TSC (Timestamp Counter) value is used. + * virtual time is also calculated in this time scale. + * y-axis: unit is byte. + * + * the service curve parameters are converted to the internal + * representation. + * the slope values are scaled to avoid overflow. + * the inverse slope values as well as the y-projection of the 1st + * segment are kept in order to to avoid 64-bit divide operations + * that are expensive on 32-bit architectures. + * + * note: Intel Pentium TSC never wraps around in several thousands of years. + * x-axis doesn't wrap around for 1089 years with 1GHz clock. + * y-axis doesn't wrap around for 4358 years with 1Gbps bandwidth. 
+ */ + +/* kernel internal representation of a service curve */ +struct internal_sc { + u_int64_t sm1; /* scaled slope of the 1st segment */ + u_int64_t ism1; /* scaled inverse-slope of the 1st segment */ + u_int64_t dx; /* the x-projection of the 1st segment */ + u_int64_t dy; /* the y-projection of the 1st segment */ + u_int64_t sm2; /* scaled slope of the 2nd segment */ + u_int64_t ism2; /* scaled inverse-slope of the 2nd segment */ +}; + +/* runtime service curve */ +struct runtime_sc { + u_int64_t x; /* current starting position on x-axis */ + u_int64_t y; /* current starting position on x-axis */ + u_int64_t sm1; /* scaled slope of the 1st segment */ + u_int64_t ism1; /* scaled inverse-slope of the 1st segment */ + u_int64_t dx; /* the x-projection of the 1st segment */ + u_int64_t dy; /* the y-projection of the 1st segment */ + u_int64_t sm2; /* scaled slope of the 2nd segment */ + u_int64_t ism2; /* scaled inverse-slope of the 2nd segment */ +}; + +struct hfsc_class { + u_int cl_id; /* class id (just for debug) */ + u_int32_t cl_handle; /* class handle */ + struct hfsc_if *cl_hif; /* back pointer to struct hfsc_if */ + int cl_flags; /* misc flags */ + + struct hfsc_class *cl_parent; /* parent class */ + struct hfsc_class *cl_siblings; /* sibling classes */ + struct hfsc_class *cl_children; /* child classes */ + + class_queue_t *cl_q; /* class queue structure */ + union { + struct red *cl_red; /* RED state */ + struct codel *cl_codel; /* CoDel state */ + } cl_aqm; +#define cl_red cl_aqm.cl_red +#define cl_codel cl_aqm.cl_codel + struct altq_pktattr *cl_pktattr; /* saved header used by ECN */ + + u_int64_t cl_total; /* total work in bytes */ + u_int64_t cl_cumul; /* cumulative work in bytes + done by real-time criteria */ + u_int64_t cl_d; /* deadline */ + u_int64_t cl_e; /* eligible time */ + u_int64_t cl_vt; /* virtual time */ + u_int64_t cl_f; /* time when this class will fit for + link-sharing, max(myf, cfmin) */ + u_int64_t cl_myf; /* my fit-time (as 
calculated from this + class's own upperlimit curve) */ + u_int64_t cl_myfadj; /* my fit-time adjustment + (to cancel history dependence) */ + u_int64_t cl_cfmin; /* earliest children's fit-time (used + with cl_myf to obtain cl_f) */ + u_int64_t cl_cvtmin; /* minimal virtual time among the + children fit for link-sharing + (monotonic within a period) */ + u_int64_t cl_vtadj; /* intra-period cumulative vt + adjustment */ + u_int64_t cl_vtoff; /* inter-period cumulative vt offset */ + u_int64_t cl_cvtmax; /* max child's vt in the last period */ + + u_int64_t cl_initvt; /* init virtual time (for debugging) */ + + struct internal_sc *cl_rsc; /* internal real-time service curve */ + struct internal_sc *cl_fsc; /* internal fair service curve */ + struct internal_sc *cl_usc; /* internal upperlimit service curve */ + struct runtime_sc cl_deadline; /* deadline curve */ + struct runtime_sc cl_eligible; /* eligible curve */ + struct runtime_sc cl_virtual; /* virtual curve */ + struct runtime_sc cl_ulimit; /* upperlimit curve */ + + u_int cl_vtperiod; /* vt period sequence no */ + u_int cl_parentperiod; /* parent's vt period seqno */ + int cl_nactive; /* number of active children */ + + TAILQ_HEAD(acthead, hfsc_class) cl_actc; /* active children list */ + TAILQ_ENTRY(hfsc_class) cl_actlist; /* active children list entry */ + TAILQ_ENTRY(hfsc_class) cl_ellist; /* eligible list entry */ + + struct { + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int period; + } cl_stats; +}; + +/* + * hfsc interface state + */ +struct hfsc_if { + struct hfsc_if *hif_next; /* interface state list */ + struct ifaltq *hif_ifq; /* backpointer to ifaltq */ + struct hfsc_class *hif_rootclass; /* root class */ + struct hfsc_class *hif_defaultclass; /* default class */ + struct hfsc_class *hif_class_tbl[HFSC_MAX_CLASSES]; + struct hfsc_class *hif_pollcache; /* cache for poll operation */ + + u_int hif_classes; /* # of classes in the tree */ + u_int hif_packets; /* # of packets in the tree */ 
+ u_int hif_classid; /* class id sequence number */ + + TAILQ_HEAD(elighead, hfsc_class) hif_eligible; /* eligible list */ + +#ifdef ALTQ3_CLFIER_COMPAT + struct acc_classifier hif_classifier; +#endif +}; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_HFSC_H_ */ diff --git a/freebsd/sys/net/altq/altq_priq.c b/freebsd/sys/net/altq/altq_priq.c new file mode 100644 index 00000000..d257ae3c --- /dev/null +++ b/freebsd/sys/net/altq/altq_priq.c @@ -0,0 +1,1072 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $KAME: altq_priq.c,v 1.11 2003/09/17 14:23:25 kjc Exp $ + * $FreeBSD$ + */ +/* + * priority queue + */ + +#include <rtems/bsd/local/opt_altq.h> +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + +#ifdef ALTQ_PRIQ /* priq is enabled by ALTQ_PRIQ option in opt_altq.h */ + +#include <rtems/bsd/sys/param.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <rtems/bsd/sys/errno.h> +#include <sys/kernel.h> +#include <sys/queue.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <netinet/in.h> + +#include <netpfil/pf/pf.h> +#include <netpfil/pf/pf_altq.h> +#include <netpfil/pf/pf_mtag.h> +#include <net/altq/altq.h> +#ifdef ALTQ3_COMPAT +#include <net/altq/altq_conf.h> +#endif +#include <net/altq/altq_priq.h> + +/* + * function prototypes + */ +#ifdef ALTQ3_COMPAT +static struct priq_if *priq_attach(struct ifaltq *, u_int); +static int priq_detach(struct priq_if *); +#endif +static int priq_clear_interface(struct priq_if *); +static int priq_request(struct ifaltq *, int, void *); +static void priq_purge(struct priq_if *); +static struct priq_class *priq_class_create(struct priq_if *, int, int, int, + int); +static int priq_class_destroy(struct priq_class *); +static int priq_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +static struct mbuf *priq_dequeue(struct ifaltq *, int); + +static int priq_addq(struct priq_class *, struct mbuf *); +static struct mbuf *priq_getq(struct priq_class *); +static struct mbuf *priq_pollq(struct priq_class *); +static void priq_purgeq(struct priq_class *); + +#ifdef ALTQ3_COMPAT +static int priqcmd_if_attach(struct priq_interface *); +static int priqcmd_if_detach(struct priq_interface *); +static int priqcmd_add_class(struct priq_add_class *); +static int priqcmd_delete_class(struct priq_delete_class *); +static int priqcmd_modify_class(struct priq_modify_class *); +static 
int priqcmd_add_filter(struct priq_add_filter *); +static int priqcmd_delete_filter(struct priq_delete_filter *); +static int priqcmd_class_stats(struct priq_class_stats *); +#endif /* ALTQ3_COMPAT */ + +static void get_class_stats(struct priq_classstats *, struct priq_class *); +static struct priq_class *clh_to_clp(struct priq_if *, u_int32_t); + +#ifdef ALTQ3_COMPAT +altqdev_decl(priq); + +/* pif_list keeps all priq_if's allocated. */ +static struct priq_if *pif_list = NULL; +#endif /* ALTQ3_COMPAT */ + +int +priq_pfattach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error; + + if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) + return (EINVAL); + s = splnet(); + error = altq_attach(&ifp->if_snd, ALTQT_PRIQ, a->altq_disc, + priq_enqueue, priq_dequeue, priq_request, NULL, NULL); + splx(s); + return (error); +} + +int +priq_add_altq(struct pf_altq *a) +{ + struct priq_if *pif; + struct ifnet *ifp; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + if (!ALTQ_IS_READY(&ifp->if_snd)) + return (ENODEV); + + pif = malloc(sizeof(struct priq_if), M_DEVBUF, M_NOWAIT | M_ZERO); + if (pif == NULL) + return (ENOMEM); + pif->pif_bandwidth = a->ifbandwidth; + pif->pif_maxpri = -1; + pif->pif_ifq = &ifp->if_snd; + + /* keep the state in pf_altq */ + a->altq_disc = pif; + + return (0); +} + +int +priq_remove_altq(struct pf_altq *a) +{ + struct priq_if *pif; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + a->altq_disc = NULL; + + (void)priq_clear_interface(pif); + + free(pif, M_DEVBUF); + return (0); +} + +int +priq_add_queue(struct pf_altq *a) +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + + /* check parameters */ + if (a->priority >= PRIQ_MAXPRI) + return (EINVAL); + if (a->qid == 0) + return (EINVAL); + if (pif->pif_classes[a->priority] != NULL) + return (EBUSY); + if (clh_to_clp(pif, a->qid) != NULL) + return (EBUSY); + + cl = priq_class_create(pif, a->priority, 
a->qlimit, + a->pq_u.priq_opts.flags, a->qid); + if (cl == NULL) + return (ENOMEM); + + return (0); +} + +int +priq_remove_queue(struct pf_altq *a) +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = a->altq_disc) == NULL) + return (EINVAL); + + if ((cl = clh_to_clp(pif, a->qid)) == NULL) + return (EINVAL); + + return (priq_class_destroy(cl)); +} + +int +priq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) +{ + struct priq_if *pif; + struct priq_class *cl; + struct priq_classstats stats; + int error = 0; + + if ((pif = altq_lookup(a->ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, a->qid)) == NULL) + return (EINVAL); + + if (*nbytes < sizeof(stats)) + return (EINVAL); + + get_class_stats(&stats, cl); + + if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) + return (error); + *nbytes = sizeof(stats); + return (0); +} + +/* + * bring the interface back to the initial state by discarding + * all the filters and classes. + */ +static int +priq_clear_interface(struct priq_if *pif) +{ + struct priq_class *cl; + int pri; + +#ifdef ALTQ3_CLFIER_COMPAT + /* free the filters for this interface */ + acc_discard_filters(&pif->pif_classifier, NULL, 1); +#endif + + /* clear out the classes */ + for (pri = 0; pri <= pif->pif_maxpri; pri++) + if ((cl = pif->pif_classes[pri]) != NULL) + priq_class_destroy(cl); + + return (0); +} + +static int +priq_request(struct ifaltq *ifq, int req, void *arg) +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + switch (req) { + case ALTRQ_PURGE: + priq_purge(pif); + break; + } + return (0); +} + +/* discard all the queued packets on the interface */ +static void +priq_purge(struct priq_if *pif) +{ + struct priq_class *cl; + int pri; + + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + if ((cl = pif->pif_classes[pri]) != NULL && !qempty(cl->cl_q)) + priq_purgeq(cl); + } + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + pif->pif_ifq->ifq_len = 0; +} + 
+static struct priq_class * +priq_class_create(struct priq_if *pif, int pri, int qlimit, int flags, int qid) +{ + struct priq_class *cl; + int s; + +#ifndef ALTQ_RED + if (flags & PRCF_RED) { +#ifdef ALTQ_DEBUG + printf("priq_class_create: RED not configured for PRIQ!\n"); +#endif + return (NULL); + } +#endif +#ifndef ALTQ_CODEL + if (flags & PRCF_CODEL) { +#ifdef ALTQ_DEBUG + printf("priq_class_create: CODEL not configured for PRIQ!\n"); +#endif + return (NULL); + } +#endif + + if ((cl = pif->pif_classes[pri]) != NULL) { + /* modify the class instead of creating a new one */ + s = splnet(); + IFQ_LOCK(cl->cl_pif->pif_ifq); + if (!qempty(cl->cl_q)) + priq_purgeq(cl); + IFQ_UNLOCK(cl->cl_pif->pif_ifq); + splx(s); +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->cl_q)) + codel_destroy(cl->cl_codel); +#endif + } else { + cl = malloc(sizeof(struct priq_class), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (cl == NULL) + return (NULL); + + cl->cl_q = malloc(sizeof(class_queue_t), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (cl->cl_q == NULL) + goto err_ret; + } + + pif->pif_classes[pri] = cl; + if (flags & PRCF_DEFAULTCLASS) + pif->pif_default = cl; + if (qlimit == 0) + qlimit = 50; /* use default */ + qlimit(cl->cl_q) = qlimit; + qtype(cl->cl_q) = Q_DROPTAIL; + qlen(cl->cl_q) = 0; + qsize(cl->cl_q) = 0; + cl->cl_flags = flags; + cl->cl_pri = pri; + if (pri > pif->pif_maxpri) + pif->pif_maxpri = pri; + cl->cl_pif = pif; + cl->cl_handle = qid; + +#ifdef ALTQ_RED + if (flags & (PRCF_RED|PRCF_RIO)) { + int red_flags, red_pkttime; + + red_flags = 0; + if (flags & PRCF_ECN) + red_flags |= REDF_ECN; +#ifdef ALTQ_RIO + if (flags & PRCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + if (pif->pif_bandwidth < 8) + red_pkttime = 1000 * 1000 * 1000; /* 1 sec */ + else + red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu + * 1000 * 1000 
* 1000 / (pif->pif_bandwidth / 8); +#ifdef ALTQ_RIO + if (flags & PRCF_RIO) { + cl->cl_red = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->cl_red == NULL) + goto err_ret; + qtype(cl->cl_q) = Q_RIO; + } else +#endif + if (flags & PRCF_RED) { + cl->cl_red = red_alloc(0, 0, + qlimit(cl->cl_q) * 10/100, + qlimit(cl->cl_q) * 30/100, + red_flags, red_pkttime); + if (cl->cl_red == NULL) + goto err_ret; + qtype(cl->cl_q) = Q_RED; + } + } +#endif /* ALTQ_RED */ +#ifdef ALTQ_CODEL + if (flags & PRCF_CODEL) { + cl->cl_codel = codel_alloc(5, 100, 0); + if (cl->cl_codel != NULL) + qtype(cl->cl_q) = Q_CODEL; + } +#endif + + return (cl); + + err_ret: + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->cl_q)) + codel_destroy(cl->cl_codel); +#endif + } + if (cl->cl_q != NULL) + free(cl->cl_q, M_DEVBUF); + free(cl, M_DEVBUF); + return (NULL); +} + +static int +priq_class_destroy(struct priq_class *cl) +{ + struct priq_if *pif; + int s, pri; + + s = splnet(); + IFQ_LOCK(cl->cl_pif->pif_ifq); + +#ifdef ALTQ3_CLFIER_COMPAT + /* delete filters referencing to this class */ + acc_discard_filters(&cl->cl_pif->pif_classifier, cl, 0); +#endif + + if (!qempty(cl->cl_q)) + priq_purgeq(cl); + + pif = cl->cl_pif; + pif->pif_classes[cl->cl_pri] = NULL; + if (pif->pif_maxpri == cl->cl_pri) { + for (pri = cl->cl_pri; pri >= 0; pri--) + if (pif->pif_classes[pri] != NULL) { + pif->pif_maxpri = pri; + break; + } + if (pri < 0) + pif->pif_maxpri = -1; + } + IFQ_UNLOCK(cl->cl_pif->pif_ifq); + splx(s); + + if (cl->cl_red != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_destroy((rio_t *)cl->cl_red); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_destroy(cl->cl_red); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->cl_q)) + codel_destroy(cl->cl_codel); +#endif + } + free(cl->cl_q, 
M_DEVBUF); + free(cl, M_DEVBUF); + return (0); +} + +/* + * priq_enqueue is an enqueue function to be registered to + * (*altq_enqueue) in struct ifaltq. + */ +static int +priq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + struct priq_class *cl; + struct pf_mtag *t; + int len; + + IFQ_LOCK_ASSERT(ifq); + + /* grab class set by classifier */ + if ((m->m_flags & M_PKTHDR) == 0) { + /* should not happen */ + printf("altq: packet for %s does not have pkthdr\n", + ifq->altq_ifp->if_xname); + m_freem(m); + return (ENOBUFS); + } + cl = NULL; + if ((t = pf_find_mtag(m)) != NULL) + cl = clh_to_clp(pif, t->qid); +#ifdef ALTQ3_COMPAT + else if ((ifq->altq_flags & ALTQF_CLASSIFY) && pktattr != NULL) + cl = pktattr->pattr_class; +#endif + if (cl == NULL) { + cl = pif->pif_default; + if (cl == NULL) { + m_freem(m); + return (ENOBUFS); + } + } +#ifdef ALTQ3_COMPAT + if (pktattr != NULL) + cl->cl_pktattr = pktattr; /* save proto hdr used by ECN */ + else +#endif + cl->cl_pktattr = NULL; + len = m_pktlen(m); + if (priq_addq(cl, m) != 0) { + /* drop occurred. mbuf was freed in priq_addq. */ + PKTCNTR_ADD(&cl->cl_dropcnt, len); + return (ENOBUFS); + } + IFQ_INC_LEN(ifq); + + /* successfully queued. */ + return (0); +} + +/* + * priq_dequeue is a dequeue function to be registered to + * (*altq_dequeue) in struct ifaltq. + * + * note: ALTDQ_POLL returns the next packet without removing the packet + * from the queue. ALTDQ_REMOVE is a normal dequeue operation. + * ALTDQ_REMOVE must return the same packet if called immediately + * after ALTDQ_POLL. 
+ */ +static struct mbuf * +priq_dequeue(struct ifaltq *ifq, int op) +{ + struct priq_if *pif = (struct priq_if *)ifq->altq_disc; + struct priq_class *cl; + struct mbuf *m; + int pri; + + IFQ_LOCK_ASSERT(ifq); + + if (IFQ_IS_EMPTY(ifq)) + /* no packet in the queue */ + return (NULL); + + for (pri = pif->pif_maxpri; pri >= 0; pri--) { + if ((cl = pif->pif_classes[pri]) != NULL && + !qempty(cl->cl_q)) { + if (op == ALTDQ_POLL) + return (priq_pollq(cl)); + + m = priq_getq(cl); + if (m != NULL) { + IFQ_DEC_LEN(ifq); + if (qempty(cl->cl_q)) + cl->cl_period++; + PKTCNTR_ADD(&cl->cl_xmitcnt, m_pktlen(m)); + } + return (m); + } + } + return (NULL); +} + +static int +priq_addq(struct priq_class *cl, struct mbuf *m) +{ + +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_addq((rio_t *)cl->cl_red, cl->cl_q, m, + cl->cl_pktattr); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->cl_q)) + return codel_addq(cl->cl_codel, cl->cl_q, m); +#endif + if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) { + m_freem(m); + return (-1); + } + + if (cl->cl_flags & PRCF_CLEARDSCP) + write_dsfield(m, cl->cl_pktattr, 0); + + _addq(cl->cl_q, m); + + return (0); +} + +static struct mbuf * +priq_getq(struct priq_class *cl) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + return rio_getq((rio_t *)cl->cl_red, cl->cl_q); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + return red_getq(cl->cl_red, cl->cl_q); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->cl_q)) + return codel_getq(cl->cl_codel, cl->cl_q); +#endif + return _getq(cl->cl_q); +} + +static struct mbuf * +priq_pollq(cl) + struct priq_class *cl; +{ + return qhead(cl->cl_q); +} + +static void +priq_purgeq(struct priq_class *cl) +{ + struct mbuf *m; + + if (qempty(cl->cl_q)) + return; + + while ((m = _getq(cl->cl_q)) != NULL) { + PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(m)); + m_freem(m); + } + ASSERT(qlen(cl->cl_q) == 0); +} + +static void 
+get_class_stats(struct priq_classstats *sp, struct priq_class *cl) +{ + sp->class_handle = cl->cl_handle; + sp->qlength = qlen(cl->cl_q); + sp->qlimit = qlimit(cl->cl_q); + sp->period = cl->cl_period; + sp->xmitcnt = cl->cl_xmitcnt; + sp->dropcnt = cl->cl_dropcnt; + + sp->qtype = qtype(cl->cl_q); +#ifdef ALTQ_RED + if (q_is_red(cl->cl_q)) + red_getstats(cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_RIO + if (q_is_rio(cl->cl_q)) + rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->cl_q)) + codel_getstats(cl->cl_codel, &sp->codel); +#endif +} + +/* convert a class handle to the corresponding class pointer */ +static struct priq_class * +clh_to_clp(struct priq_if *pif, u_int32_t chandle) +{ + struct priq_class *cl; + int idx; + + if (chandle == 0) + return (NULL); + + for (idx = pif->pif_maxpri; idx >= 0; idx--) + if ((cl = pif->pif_classes[idx]) != NULL && + cl->cl_handle == chandle) + return (cl); + + return (NULL); +} + + +#ifdef ALTQ3_COMPAT + +static struct priq_if * +priq_attach(ifq, bandwidth) + struct ifaltq *ifq; + u_int bandwidth; +{ + struct priq_if *pif; + + pif = malloc(sizeof(struct priq_if), + M_DEVBUF, M_WAITOK); + if (pif == NULL) + return (NULL); + bzero(pif, sizeof(struct priq_if)); + pif->pif_bandwidth = bandwidth; + pif->pif_maxpri = -1; + pif->pif_ifq = ifq; + + /* add this state to the priq list */ + pif->pif_next = pif_list; + pif_list = pif; + + return (pif); +} + +static int +priq_detach(pif) + struct priq_if *pif; +{ + (void)priq_clear_interface(pif); + + /* remove this interface from the pif list */ + if (pif_list == pif) + pif_list = pif->pif_next; + else { + struct priq_if *p; + + for (p = pif_list; p != NULL; p = p->pif_next) + if (p->pif_next == pif) { + p->pif_next = pif->pif_next; + break; + } + ASSERT(p != NULL); + } + + free(pif, M_DEVBUF); + return (0); +} + +/* + * priq device interface + */ +int +priqopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 
500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +priqclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct priq_if *pif; + int err, error = 0; + + while ((pif = pif_list) != NULL) { + /* destroy all */ + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + altq_disable(pif->pif_ifq); + + err = altq_detach(pif->pif_ifq); + if (err == 0) + err = priq_detach(pif); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +priqioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + struct priq_if *pif; + struct priq_interface *ifacep; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case PRIQ_GETSTATS: + break; + default: +#if (__FreeBSD_version > 700000) + if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) + return (error); +#elsif (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + + case PRIQ_IF_ATTACH: + error = priqcmd_if_attach((struct priq_interface *)addr); + break; + + case PRIQ_IF_DETACH: + error = priqcmd_if_detach((struct priq_interface *)addr); + break; + + case PRIQ_ENABLE: + case PRIQ_DISABLE: + case PRIQ_CLEAR: + ifacep = (struct priq_interface *)addr; + if ((pif = altq_lookup(ifacep->ifname, + ALTQT_PRIQ)) == NULL) { + error = EBADF; + break; + } + + switch (cmd) { + case PRIQ_ENABLE: + if (pif->pif_default == NULL) { +#ifdef ALTQ_DEBUG + printf("priq: no default class\n"); +#endif + error = EINVAL; + break; + } + error = altq_enable(pif->pif_ifq); + break; + + case PRIQ_DISABLE: + error = altq_disable(pif->pif_ifq); + break; + + case PRIQ_CLEAR: + 
priq_clear_interface(pif); + break; + } + break; + + case PRIQ_ADD_CLASS: + error = priqcmd_add_class((struct priq_add_class *)addr); + break; + + case PRIQ_DEL_CLASS: + error = priqcmd_delete_class((struct priq_delete_class *)addr); + break; + + case PRIQ_MOD_CLASS: + error = priqcmd_modify_class((struct priq_modify_class *)addr); + break; + + case PRIQ_ADD_FILTER: + error = priqcmd_add_filter((struct priq_add_filter *)addr); + break; + + case PRIQ_DEL_FILTER: + error = priqcmd_delete_filter((struct priq_delete_filter *)addr); + break; + + case PRIQ_GETSTATS: + error = priqcmd_class_stats((struct priq_class_stats *)addr); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int +priqcmd_if_attach(ap) + struct priq_interface *ap; +{ + struct priq_if *pif; + struct ifnet *ifp; + int error; + + if ((ifp = ifunit(ap->ifname)) == NULL) + return (ENXIO); + + if ((pif = priq_attach(&ifp->if_snd, ap->arg)) == NULL) + return (ENOMEM); + + /* + * set PRIQ to this ifnet structure. 
+ */ + if ((error = altq_attach(&ifp->if_snd, ALTQT_PRIQ, pif, + priq_enqueue, priq_dequeue, priq_request, + &pif->pif_classifier, acc_classify)) != 0) + (void)priq_detach(pif); + + return (error); +} + +static int +priqcmd_if_detach(ap) + struct priq_interface *ap; +{ + struct priq_if *pif; + int error; + + if ((pif = altq_lookup(ap->ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if (ALTQ_IS_ENABLED(pif->pif_ifq)) + altq_disable(pif->pif_ifq); + + if ((error = altq_detach(pif->pif_ifq))) + return (error); + + return priq_detach(pif); +} + +static int +priqcmd_add_class(ap) + struct priq_add_class *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + int qid; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if (ap->pri < 0 || ap->pri >= PRIQ_MAXPRI) + return (EINVAL); + if (pif->pif_classes[ap->pri] != NULL) + return (EBUSY); + + qid = ap->pri + 1; + if ((cl = priq_class_create(pif, ap->pri, + ap->qlimit, ap->flags, qid)) == NULL) + return (ENOMEM); + + /* return a class handle to the user */ + ap->class_handle = cl->cl_handle; + + return (0); +} + +static int +priqcmd_delete_class(ap) + struct priq_delete_class *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) + return (EINVAL); + + return priq_class_destroy(cl); +} + +static int +priqcmd_modify_class(ap) + struct priq_modify_class *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if (ap->pri < 0 || ap->pri >= PRIQ_MAXPRI) + return (EINVAL); + + if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) + return (EINVAL); + + /* + * if priority is changed, move the class to the new priority + */ + if (pif->pif_classes[ap->pri] != cl) { + if (pif->pif_classes[ap->pri] != NULL) + return (EEXIST); + pif->pif_classes[cl->cl_pri] = 
NULL; + pif->pif_classes[ap->pri] = cl; + cl->cl_pri = ap->pri; + } + + /* call priq_class_create to change class parameters */ + if ((cl = priq_class_create(pif, ap->pri, + ap->qlimit, ap->flags, ap->class_handle)) == NULL) + return (ENOMEM); + return 0; +} + +static int +priqcmd_add_filter(ap) + struct priq_add_filter *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) + return (EINVAL); + + return acc_add_filter(&pif->pif_classifier, &ap->filter, + cl, &ap->filter_handle); +} + +static int +priqcmd_delete_filter(ap) + struct priq_delete_filter *ap; +{ + struct priq_if *pif; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + return acc_delete_filter(&pif->pif_classifier, + ap->filter_handle); +} + +static int +priqcmd_class_stats(ap) + struct priq_class_stats *ap; +{ + struct priq_if *pif; + struct priq_class *cl; + struct priq_classstats stats, *usp; + int pri, error; + + if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) + return (EBADF); + + ap->maxpri = pif->pif_maxpri; + + /* then, read the next N classes in the tree */ + usp = ap->stats; + for (pri = 0; pri <= pif->pif_maxpri; pri++) { + cl = pif->pif_classes[pri]; + if (cl != NULL) + get_class_stats(&stats, cl); + else + bzero(&stats, sizeof(stats)); + if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, + sizeof(stats))) != 0) + return (error); + } + return (0); +} + +#ifdef KLD_MODULE + +static struct altqsw priq_sw = + {"priq", priqopen, priqclose, priqioctl}; + +ALTQ_MODULE(altq_priq, ALTQT_PRIQ, &priq_sw); +MODULE_DEPEND(altq_priq, altq_red, 1, 1, 1); +MODULE_DEPEND(altq_priq, altq_rio, 1, 1, 1); + +#endif /* KLD_MODULE */ + +#endif /* ALTQ3_COMPAT */ +#endif /* ALTQ_PRIQ */ diff --git a/freebsd/sys/net/altq/altq_priq.h b/freebsd/sys/net/altq/altq_priq.h new file mode 100644 index 00000000..fcbfee98 --- 
/dev/null +++ b/freebsd/sys/net/altq/altq_priq.h @@ -0,0 +1,180 @@ +/*- + * Copyright (C) 2000-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $KAME: altq_priq.h,v 1.7 2003/10/03 05:05:15 kjc Exp $ + * $FreeBSD$ + */ + +#ifndef _ALTQ_ALTQ_PRIQ_H_ +#define _ALTQ_ALTQ_PRIQ_H_ + +#include <net/altq/altq.h> +#include <net/altq/altq_classq.h> +#include <net/altq/altq_codel.h> +#include <net/altq/altq_red.h> +#include <net/altq/altq_rio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define PRIQ_MAXPRI 16 /* upper limit of the number of priorities */ + +#ifdef ALTQ3_COMPAT +struct priq_interface { + char ifname[IFNAMSIZ]; /* interface name (e.g., fxp0) */ + u_long arg; /* request-specific argument */ +}; + +struct priq_add_class { + struct priq_interface iface; + int pri; /* priority (0 is the lowest) */ + int qlimit; /* queue size limit */ + int flags; /* misc flags (see below) */ + + u_int32_t class_handle; /* return value */ +}; +#endif /* ALTQ3_COMPAT */ + +/* priq class flags */ +#define PRCF_RED 0x0001 /* use RED */ +#define PRCF_ECN 0x0002 /* use RED/ECN */ +#define PRCF_RIO 0x0004 /* use RIO */ +#define PRCF_CODEL 0x0008 /* use CoDel */ +#define PRCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define PRCF_DEFAULTCLASS 0x1000 /* default class */ + +/* special class handles */ +#define PRIQ_NULLCLASS_HANDLE 0 + +#ifdef ALTQ3_COMPAT +struct priq_delete_class { + struct priq_interface iface; + u_int32_t class_handle; +}; + +struct priq_modify_class { + struct priq_interface iface; + u_int32_t class_handle; + int pri; + int qlimit; + int flags; +}; + +struct priq_add_filter { + struct priq_interface iface; + u_int32_t class_handle; + struct flow_filter filter; + + u_long filter_handle; /* return value */ +}; + +struct priq_delete_filter { + struct priq_interface iface; + u_long filter_handle; +}; +#endif /* ALTQ3_COMPAT */ + +struct priq_classstats { + u_int32_t class_handle; + + u_int qlength; + u_int qlimit; + u_int period; + struct pktcntr xmitcnt; /* transmitted packet counter */ + struct pktcntr dropcnt; /* dropped packet counter */ + + /* codel, red and rio related info */ + int qtype; 
+ struct redstats red[3]; /* rio has 3 red stats */ + struct codel_stats codel; +}; + +#ifdef ALTQ3_COMPAT +struct priq_class_stats { + struct priq_interface iface; + int maxpri; /* in/out */ + + struct priq_classstats *stats; /* pointer to stats array */ +}; + +#define PRIQ_IF_ATTACH _IOW('Q', 1, struct priq_interface) +#define PRIQ_IF_DETACH _IOW('Q', 2, struct priq_interface) +#define PRIQ_ENABLE _IOW('Q', 3, struct priq_interface) +#define PRIQ_DISABLE _IOW('Q', 4, struct priq_interface) +#define PRIQ_CLEAR _IOW('Q', 5, struct priq_interface) +#define PRIQ_ADD_CLASS _IOWR('Q', 7, struct priq_add_class) +#define PRIQ_DEL_CLASS _IOW('Q', 8, struct priq_delete_class) +#define PRIQ_MOD_CLASS _IOW('Q', 9, struct priq_modify_class) +#define PRIQ_ADD_FILTER _IOWR('Q', 10, struct priq_add_filter) +#define PRIQ_DEL_FILTER _IOW('Q', 11, struct priq_delete_filter) +#define PRIQ_GETSTATS _IOWR('Q', 12, struct priq_class_stats) + +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL + +struct priq_class { + u_int32_t cl_handle; /* class handle */ + class_queue_t *cl_q; /* class queue structure */ + union { + struct red *cl_red; /* RED state */ + struct codel *cl_codel; /* CoDel state */ + } cl_aqm; +#define cl_red cl_aqm.cl_red +#define cl_codel cl_aqm.cl_codel + int cl_pri; /* priority */ + int cl_flags; /* class flags */ + struct priq_if *cl_pif; /* back pointer to pif */ + struct altq_pktattr *cl_pktattr; /* saved header used by ECN */ + + /* statistics */ + u_int cl_period; /* backlog period */ + struct pktcntr cl_xmitcnt; /* transmitted packet counter */ + struct pktcntr cl_dropcnt; /* dropped packet counter */ +}; + +/* + * priq interface state + */ +struct priq_if { + struct priq_if *pif_next; /* interface state list */ + struct ifaltq *pif_ifq; /* backpointer to ifaltq */ + u_int pif_bandwidth; /* link bandwidth in bps */ + int pif_maxpri; /* max priority in use */ + struct priq_class *pif_default; /* default class */ + struct priq_class *pif_classes[PRIQ_MAXPRI]; /* classes 
*/ +#ifdef ALTQ3_CLFIER_COMPAT + struct acc_classifier pif_classifier; /* classifier */ +#endif +}; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_PRIQ_H_ */ diff --git a/freebsd/sys/net/altq/altq_red.c b/freebsd/sys/net/altq/altq_red.c new file mode 100644 index 00000000..f83b7b50 --- /dev/null +++ b/freebsd/sys/net/altq/altq_red.c @@ -0,0 +1,1494 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/*- + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $ + * $FreeBSD$ + */ + +#include <rtems/bsd/local/opt_altq.h> +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> +#ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */ + +#include <rtems/bsd/sys/param.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/systm.h> +#include <rtems/bsd/sys/errno.h> +#if 1 /* ALTQ3_COMPAT */ +#include <sys/sockio.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#ifdef ALTQ_FLOWVALVE +#include <sys/queue.h> +#include <sys/time.h> +#endif +#endif /* ALTQ3_COMPAT */ + +#include <net/if.h> +#include <net/if_var.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif + +#include <netpfil/pf/pf.h> +#include <netpfil/pf/pf_altq.h> +#include <netpfil/pf/pf_mtag.h> +#include <net/altq/altq.h> +#include <net/altq/altq_red.h> +#ifdef ALTQ3_COMPAT +#include <net/altq/altq_conf.h> +#ifdef ALTQ_FLOWVALVE +#include <net/altq/altq_flowvalve.h> +#endif +#endif + +/* + * ALTQ/RED (Random Early Detection) implementation using 32-bit + * fixed-point calculation. + * + * written by kjc using the ns code as a reference. + * you can learn more about red and ns from Sally's home page at + * http://www-nrg.ee.lbl.gov/floyd/ + * + * most of the red parameter values are fixed in this implementation + * to prevent fixed-point overflow/underflow. + * if you change the parameters, watch out for overflow/underflow! + * + * the parameters used are recommended values by Sally. + * the corresponding ns config looks: + * q_weight=0.00195 + * minthresh=5 maxthresh=15 queue-size=60 + * linterm=30 + * dropmech=drop-tail + * bytes=false (can't be handled by 32-bit fixed-point) + * doubleq=false dqthresh=false + * wait=true + */ +/* + * alternative red parameters for a slow link. 
+ * + * assume the queue length becomes from zero to L and keeps L, it takes + * N packets for q_avg to reach 63% of L. + * when q_weight is 0.002, N is about 500 packets. + * for a slow link like dial-up, 500 packets takes more than 1 minute! + * when q_weight is 0.008, N is about 127 packets. + * when q_weight is 0.016, N is about 63 packets. + * bursts of 50 packets are allowed for 0.002, bursts of 25 packets + * are allowed for 0.016. + * see Sally's paper for more details. + */ +/* normal red parameters */ +#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ + /* q_weight = 0.00195 */ + +/* red parameters for a slow link */ +#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ + /* q_weight = 0.0078125 */ + +/* red parameters for a very slow link (e.g., dialup) */ +#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ + /* q_weight = 0.015625 */ + +/* fixed-point uses 12-bit decimal places */ +#define FP_SHIFT 12 /* fixed-point shift */ + +/* red parameters for drop probability */ +#define INV_P_MAX 10 /* inverse of max drop probability */ +#define TH_MIN 5 /* min threshold */ +#define TH_MAX 15 /* max threshold */ + +#define RED_LIMIT 60 /* default max queue length */ +#define RED_STATS /* collect statistics */ + +/* + * our default policy for forced-drop is drop-tail. + * (in altq-1.1.2 or earlier, the default was random-drop. + * but it makes more sense to punish the cause of the surge.) + * to switch to the random-drop policy, define "RED_RANDOM_DROP". + */ + +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_FLOWVALVE +/* + * flow-valve is an extension to protect red from unresponsive flows + * and to promote end-to-end congestion control. + * flow-valve observes the average drop rates of the flows that have + * experienced packet drops in the recent past. + * when the average drop rate exceeds the threshold, the flow is + * blocked by the flow-valve. the trapped flow should back off + * exponentially to escape from the flow-valve. 
+ */ +#ifdef RED_RANDOM_DROP +#error "random-drop can't be used with flow-valve!" +#endif +#endif /* ALTQ_FLOWVALVE */ + +/* red_list keeps all red_queue_t's allocated. */ +static red_queue_t *red_list = NULL; + +#endif /* ALTQ3_COMPAT */ + +/* default red parameter values */ +static int default_th_min = TH_MIN; +static int default_th_max = TH_MAX; +static int default_inv_pmax = INV_P_MAX; + +#ifdef ALTQ3_COMPAT +/* internal function prototypes */ +static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +static struct mbuf *red_dequeue(struct ifaltq *, int); +static int red_request(struct ifaltq *, int, void *); +static void red_purgeq(red_queue_t *); +static int red_detach(red_queue_t *); +#ifdef ALTQ_FLOWVALVE +static __inline struct fve *flowlist_lookup(struct flowvalve *, + struct altq_pktattr *, struct timeval *); +static __inline struct fve *flowlist_reclaim(struct flowvalve *, + struct altq_pktattr *); +static __inline void flowlist_move_to_head(struct flowvalve *, struct fve *); +static __inline int fv_p2f(struct flowvalve *, int); +#if 0 /* XXX: make the compiler happy (fv_alloc unused) */ +static struct flowvalve *fv_alloc(struct red *); +#endif +static void fv_destroy(struct flowvalve *); +static int fv_checkflow(struct flowvalve *, struct altq_pktattr *, + struct fve **); +static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *, + struct fve *); +#endif +#endif /* ALTQ3_COMPAT */ + +/* + * red support routines + */ +red_t * +red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags, + int pkttime) +{ + red_t *rp; + int w, i; + int npkts_per_sec; + + rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO); + if (rp == NULL) + return (NULL); + + if (weight == 0) + rp->red_weight = W_WEIGHT; + else + rp->red_weight = weight; + + /* allocate weight table */ + rp->red_wtab = wtab_alloc(rp->red_weight); + if (rp->red_wtab == NULL) { + free(rp, M_DEVBUF); + return (NULL); + } + + rp->red_avg = 0; + rp->red_idle = 
1; + + if (inv_pmax == 0) + rp->red_inv_pmax = default_inv_pmax; + else + rp->red_inv_pmax = inv_pmax; + if (th_min == 0) + rp->red_thmin = default_th_min; + else + rp->red_thmin = th_min; + if (th_max == 0) + rp->red_thmax = default_th_max; + else + rp->red_thmax = th_max; + + rp->red_flags = flags; + + if (pkttime == 0) + /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ + rp->red_pkttime = 800; + else + rp->red_pkttime = pkttime; + + if (weight == 0) { + /* when the link is very slow, adjust red parameters */ + npkts_per_sec = 1000000 / rp->red_pkttime; + if (npkts_per_sec < 50) { + /* up to about 400Kbps */ + rp->red_weight = W_WEIGHT_2; + } else if (npkts_per_sec < 300) { + /* up to about 2.4Mbps */ + rp->red_weight = W_WEIGHT_1; + } + } + + /* calculate wshift. weight must be power of 2 */ + w = rp->red_weight; + for (i = 0; w > 1; i++) + w = w >> 1; + rp->red_wshift = i; + w = 1 << rp->red_wshift; + if (w != rp->red_weight) { + printf("invalid weight value %d for red! use %d\n", + rp->red_weight, w); + rp->red_weight = w; + } + + /* + * thmin_s and thmax_s are scaled versions of th_min and th_max + * to be compared with avg. 
+ */ + rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); + rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); + + /* + * precompute probability denominator + * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point + */ + rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) + * rp->red_inv_pmax) << FP_SHIFT; + + microtime(&rp->red_last); + return (rp); +} + +void +red_destroy(red_t *rp) +{ +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_FLOWVALVE + if (rp->red_flowvalve != NULL) + fv_destroy(rp->red_flowvalve); +#endif +#endif /* ALTQ3_COMPAT */ + wtab_destroy(rp->red_wtab); + free(rp, M_DEVBUF); +} + +void +red_getstats(red_t *rp, struct redstats *sp) +{ + sp->q_avg = rp->red_avg >> rp->red_wshift; + sp->xmit_cnt = rp->red_stats.xmit_cnt; + sp->drop_cnt = rp->red_stats.drop_cnt; + sp->drop_forced = rp->red_stats.drop_forced; + sp->drop_unforced = rp->red_stats.drop_unforced; + sp->marked_packets = rp->red_stats.marked_packets; +} + +int +red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, + struct altq_pktattr *pktattr) +{ + int avg, droptype; + int n; +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_FLOWVALVE + struct fve *fve = NULL; + + if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0) + if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) { + m_freem(m); + return (-1); + } +#endif +#endif /* ALTQ3_COMPAT */ + + avg = rp->red_avg; + + /* + * if we were idle, we pretend that n packets arrived during + * the idle period. + */ + if (rp->red_idle) { + struct timeval now; + int t; + + rp->red_idle = 0; + microtime(&now); + t = (now.tv_sec - rp->red_last.tv_sec); + if (t > 60) { + /* + * being idle for more than 1 minute, set avg to zero. + * this prevents t from overflow. + */ + avg = 0; + } else { + t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); + n = t / rp->red_pkttime - 1; + + /* the following line does (avg = (1 - Wq)^n * avg) */ + if (n > 0) + avg = (avg >> FP_SHIFT) * + pow_w(rp->red_wtab, n); + } + } + + /* run estimator. 
(note: avg is scaled by WEIGHT in fixed-point) */ + avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); + rp->red_avg = avg; /* save the new value */ + + /* + * red_count keeps a tally of arriving traffic that has not + * been dropped. + */ + rp->red_count++; + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (avg >= rp->red_thmin_s && qlen(q) > 1) { + if (avg >= rp->red_thmax_s) { + /* avg >= th_max: forced drop */ + droptype = DTYPE_FORCED; + } else if (rp->red_old == 0) { + /* first exceeds th_min */ + rp->red_count = 1; + rp->red_old = 1; + } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, + rp->red_probd, rp->red_count)) { + /* mark or drop by red */ + if ((rp->red_flags & REDF_ECN) && + mark_ecn(m, pktattr, rp->red_flags)) { + /* successfully marked. do not drop. */ + rp->red_count = 0; +#ifdef RED_STATS + rp->red_stats.marked_packets++; +#endif + } else { + /* unforced drop by red */ + droptype = DTYPE_EARLY; + } + } + } else { + /* avg < th_min */ + rp->red_old = 0; + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + +#ifdef RED_RANDOM_DROP + /* if successful or forced drop, enqueue this packet. */ + if (droptype != DTYPE_EARLY) + _addq(q, m); +#else + /* if successful, enqueue this packet. */ + if (droptype == DTYPE_NODROP) + _addq(q, m); +#endif + if (droptype != DTYPE_NODROP) { + if (droptype == DTYPE_EARLY) { + /* drop the incoming packet */ +#ifdef RED_STATS + rp->red_stats.drop_unforced++; +#endif + } else { + /* forced drop, select a victim packet in the queue. 
*/ +#ifdef RED_RANDOM_DROP + m = _getq_random(q); +#endif +#ifdef RED_STATS + rp->red_stats.drop_forced++; +#endif + } +#ifdef RED_STATS + PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m)); +#endif + rp->red_count = 0; +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_FLOWVALVE + if (rp->red_flowvalve != NULL) + fv_dropbyred(rp->red_flowvalve, pktattr, fve); +#endif +#endif /* ALTQ3_COMPAT */ + m_freem(m); + return (-1); + } + /* successfully queued */ +#ifdef RED_STATS + PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m)); +#endif + return (0); +} + +/* + * early-drop probability is calculated as follows: + * prob = p_max * (avg - th_min) / (th_max - th_min) + * prob_a = prob / (2 - count*prob) + * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) + * here prob_a increases as successive undrop count increases. + * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), + * becomes 1 when (count >= (2 / prob))). + */ +int +drop_early(int fp_len, int fp_probd, int count) +{ + int d; /* denominator of drop-probability */ + + d = fp_probd - count * fp_len; + if (d <= 0) + /* count exceeds the hard limit: drop or mark */ + return (1); + + /* + * now the range of d is [1..600] in fixed-point. (when + * th_max-th_min=10 and p_max=1/30) + * drop probability = (avg - TH_MIN) / d + */ + + if ((arc4random() % d) < fp_len) { + /* drop or mark */ + return (1); + } + /* no drop/mark */ + return (0); +} + +/* + * try to mark CE bit to the packet. + * returns 1 if successfully marked, 0 otherwise. 
+ */ +int +mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags) +{ + struct mbuf *m0; + struct pf_mtag *at; + void *hdr; + + at = pf_find_mtag(m); + if (at != NULL) { + hdr = at->hdr; +#ifdef ALTQ3_COMPAT + } else if (pktattr != NULL) { + af = pktattr->pattr_af; + hdr = pktattr->pattr_hdr; +#endif /* ALTQ3_COMPAT */ + } else + return (0); + + /* verify that pattr_hdr is within the mbuf data */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)hdr >= m0->m_data) && + ((caddr_t)hdr < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { + /* ick, tag info is stale */ + return (0); + } + + switch (((struct ip *)hdr)->ip_v) { + case IPVERSION: + if (flags & REDF_ECN4) { + struct ip *ip = hdr; + u_int8_t otos; + int sum; + + if (ip->ip_v != 4) + return (0); /* version mismatch! */ + + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) + return (0); /* not-ECT */ + if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + return (1); /* already marked */ + + /* + * ecn-capable but not marked, + * mark CE and update checksum + */ + otos = ip->ip_tos; + ip->ip_tos |= IPTOS_ECN_CE; + /* + * update checksum (from RFC1624) + * HC' = ~(~HC + ~m + m') + */ + sum = ~ntohs(ip->ip_sum) & 0xffff; + sum += (~otos & 0xffff) + ip->ip_tos; + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); /* add carry */ + ip->ip_sum = htons(~sum & 0xffff); + return (1); + } + break; +#ifdef INET6 + case (IPV6_VERSION >> 4): + if (flags & REDF_ECN6) { + struct ip6_hdr *ip6 = hdr; + u_int32_t flowlabel; + + flowlabel = ntohl(ip6->ip6_flow); + if ((flowlabel >> 28) != 6) + return (0); /* version mismatch! 
*/ + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == + (IPTOS_ECN_NOTECT << 20)) + return (0); /* not-ECT */ + if ((flowlabel & (IPTOS_ECN_MASK << 20)) == + (IPTOS_ECN_CE << 20)) + return (1); /* already marked */ + /* + * ecn-capable but not marked, mark CE + */ + flowlabel |= (IPTOS_ECN_CE << 20); + ip6->ip6_flow = htonl(flowlabel); + return (1); + } + break; +#endif /* INET6 */ + } + + /* not marked */ + return (0); +} + +struct mbuf * +red_getq(rp, q) + red_t *rp; + class_queue_t *q; +{ + struct mbuf *m; + + if ((m = _getq(q)) == NULL) { + if (rp->red_idle == 0) { + rp->red_idle = 1; + microtime(&rp->red_last); + } + return NULL; + } + + rp->red_idle = 0; + return (m); +} + +/* + * helper routine to calibrate avg during idle. + * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point + * here Wq = 1/weight and the code assumes Wq is close to zero. + * + * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. + */ +static struct wtab *wtab_list = NULL; /* pointer to wtab list */ + +struct wtab * +wtab_alloc(int weight) +{ + struct wtab *w; + int i; + + for (w = wtab_list; w != NULL; w = w->w_next) + if (w->w_weight == weight) { + w->w_refcount++; + return (w); + } + + w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO); + if (w == NULL) + return (NULL); + w->w_weight = weight; + w->w_refcount = 1; + w->w_next = wtab_list; + wtab_list = w; + + /* initialize the weight table */ + w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; + for (i = 1; i < 32; i++) { + w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; + if (w->w_tab[i] == 0 && w->w_param_max == 0) + w->w_param_max = 1 << i; + } + + return (w); +} + +int +wtab_destroy(struct wtab *w) +{ + struct wtab *prev; + + if (--w->w_refcount > 0) + return (0); + + if (wtab_list == w) + wtab_list = w->w_next; + else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next) + if (prev->w_next == w) { + prev->w_next = w->w_next; + break; + } + + free(w, M_DEVBUF); + return (0); +} + +int32_t +pow_w(struct wtab 
*w, int n) +{ + int i, bit; + int32_t val; + + if (n >= w->w_param_max) + return (0); + + val = 1 << FP_SHIFT; + if (n <= 0) + return (val); + + bit = 1; + i = 0; + while (n) { + if (n & bit) { + val = (val * w->w_tab[i]) >> FP_SHIFT; + n &= ~bit; + } + i++; + bit <<= 1; + } + return (val); +} + +#ifdef ALTQ3_COMPAT +/* + * red device interface + */ +altqdev_decl(red); + +int +redopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +redclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + red_queue_t *rqp; + int err, error = 0; + + while ((rqp = red_list) != NULL) { + /* destroy all */ + err = red_detach(rqp); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +redioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + red_queue_t *rqp; + struct red_interface *ifacep; + struct ifnet *ifp; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case RED_GETSTATS: + break; + default: +#if (__FreeBSD_version > 700000) + if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) +#elsif (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) +#endif + return (error); + break; + } + + switch (cmd) { + + case RED_ENABLE: + ifacep = (struct red_interface *)addr; + if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + error = altq_enable(rqp->rq_ifq); + break; + + case RED_DISABLE: + ifacep = (struct red_interface *)addr; + if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + error = 
altq_disable(rqp->rq_ifq); + break; + + case RED_IF_ATTACH: + ifp = ifunit(((struct red_interface *)addr)->red_ifname); + if (ifp == NULL) { + error = ENXIO; + break; + } + + /* allocate and initialize red_queue_t */ + rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK); + if (rqp == NULL) { + error = ENOMEM; + break; + } + bzero(rqp, sizeof(red_queue_t)); + + rqp->rq_q = malloc(sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (rqp->rq_q == NULL) { + free(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + bzero(rqp->rq_q, sizeof(class_queue_t)); + + rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0); + if (rqp->rq_red == NULL) { + free(rqp->rq_q, M_DEVBUF); + free(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + + rqp->rq_ifq = &ifp->if_snd; + qtail(rqp->rq_q) = NULL; + qlen(rqp->rq_q) = 0; + qlimit(rqp->rq_q) = RED_LIMIT; + qtype(rqp->rq_q) = Q_RED; + + /* + * set RED to this ifnet structure. + */ + error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp, + red_enqueue, red_dequeue, red_request, + NULL, NULL); + if (error) { + red_destroy(rqp->rq_red); + free(rqp->rq_q, M_DEVBUF); + free(rqp, M_DEVBUF); + break; + } + + /* add this state to the red list */ + rqp->rq_next = red_list; + red_list = rqp; + break; + + case RED_IF_DETACH: + ifacep = (struct red_interface *)addr; + if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + error = red_detach(rqp); + break; + + case RED_GETSTATS: + do { + struct red_stats *q_stats; + red_t *rp; + + q_stats = (struct red_stats *)addr; + if ((rqp = altq_lookup(q_stats->iface.red_ifname, + ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + + q_stats->q_len = qlen(rqp->rq_q); + q_stats->q_limit = qlimit(rqp->rq_q); + + rp = rqp->rq_red; + q_stats->q_avg = rp->red_avg >> rp->red_wshift; + q_stats->xmit_cnt = rp->red_stats.xmit_cnt; + q_stats->drop_cnt = rp->red_stats.drop_cnt; + q_stats->drop_forced = rp->red_stats.drop_forced; + q_stats->drop_unforced = rp->red_stats.drop_unforced; + 
q_stats->marked_packets = rp->red_stats.marked_packets; + + q_stats->weight = rp->red_weight; + q_stats->inv_pmax = rp->red_inv_pmax; + q_stats->th_min = rp->red_thmin; + q_stats->th_max = rp->red_thmax; + +#ifdef ALTQ_FLOWVALVE + if (rp->red_flowvalve != NULL) { + struct flowvalve *fv = rp->red_flowvalve; + q_stats->fv_flows = fv->fv_flows; + q_stats->fv_pass = fv->fv_stats.pass; + q_stats->fv_predrop = fv->fv_stats.predrop; + q_stats->fv_alloc = fv->fv_stats.alloc; + q_stats->fv_escape = fv->fv_stats.escape; + } else { +#endif /* ALTQ_FLOWVALVE */ + q_stats->fv_flows = 0; + q_stats->fv_pass = 0; + q_stats->fv_predrop = 0; + q_stats->fv_alloc = 0; + q_stats->fv_escape = 0; +#ifdef ALTQ_FLOWVALVE + } +#endif /* ALTQ_FLOWVALVE */ + } while (/*CONSTCOND*/ 0); + break; + + case RED_CONFIG: + do { + struct red_conf *fc; + red_t *new; + int s, limit; + + fc = (struct red_conf *)addr; + if ((rqp = altq_lookup(fc->iface.red_ifname, + ALTQT_RED)) == NULL) { + error = EBADF; + break; + } + new = red_alloc(fc->red_weight, + fc->red_inv_pmax, + fc->red_thmin, + fc->red_thmax, + fc->red_flags, + fc->red_pkttime); + if (new == NULL) { + error = ENOMEM; + break; + } + + s = splnet(); + red_purgeq(rqp); + limit = fc->red_limit; + if (limit < fc->red_thmax) + limit = fc->red_thmax; + qlimit(rqp->rq_q) = limit; + fc->red_limit = limit; /* write back the new value */ + + red_destroy(rqp->rq_red); + rqp->rq_red = new; + + splx(s); + + /* write back new values */ + fc->red_limit = limit; + fc->red_inv_pmax = rqp->rq_red->red_inv_pmax; + fc->red_thmin = rqp->rq_red->red_thmin; + fc->red_thmax = rqp->rq_red->red_thmax; + + } while (/*CONSTCOND*/ 0); + break; + + case RED_SETDEFAULTS: + do { + struct redparams *rp; + + rp = (struct redparams *)addr; + + default_th_min = rp->th_min; + default_th_max = rp->th_max; + default_inv_pmax = rp->inv_pmax; + } while (/*CONSTCOND*/ 0); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +static int +red_detach(rqp) + 
red_queue_t *rqp; +{ + red_queue_t *tmp; + int error = 0; + + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + altq_disable(rqp->rq_ifq); + + if ((error = altq_detach(rqp->rq_ifq))) + return (error); + + if (red_list == rqp) + red_list = rqp->rq_next; + else { + for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next) + if (tmp->rq_next == rqp) { + tmp->rq_next = rqp->rq_next; + break; + } + if (tmp == NULL) + printf("red_detach: no state found in red_list!\n"); + } + + red_destroy(rqp->rq_red); + free(rqp->rq_q, M_DEVBUF); + free(rqp, M_DEVBUF); + return (error); +} + +/* + * enqueue routine: + * + * returns: 0 when successfully queued. + * ENOBUFS when drop occurs. + */ +static int +red_enqueue(ifq, m, pktattr) + struct ifaltq *ifq; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0) + return ENOBUFS; + ifq->ifq_len++; + return 0; +} + +/* + * dequeue routine: + * must be called in splimp. + * + * returns: mbuf dequeued. + * NULL when no packet is available in the queue. 
+ */ + +static struct mbuf * +red_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; + struct mbuf *m; + + IFQ_LOCK_ASSERT(ifq); + + if (op == ALTDQ_POLL) + return qhead(rqp->rq_q); + + /* op == ALTDQ_REMOVE */ + m = red_getq(rqp->rq_red, rqp->rq_q); + if (m != NULL) + ifq->ifq_len--; + return (m); +} + +static int +red_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + switch (req) { + case ALTRQ_PURGE: + red_purgeq(rqp); + break; + } + return (0); +} + +static void +red_purgeq(rqp) + red_queue_t *rqp; +{ + _flushq(rqp->rq_q); + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + rqp->rq_ifq->ifq_len = 0; +} + +#ifdef ALTQ_FLOWVALVE + +#define FV_PSHIFT 7 /* weight of average drop rate -- 1/128 */ +#define FV_PSCALE(x) ((x) << FV_PSHIFT) +#define FV_PUNSCALE(x) ((x) >> FV_PSHIFT) +#define FV_FSHIFT 5 /* weight of average fraction -- 1/32 */ +#define FV_FSCALE(x) ((x) << FV_FSHIFT) +#define FV_FUNSCALE(x) ((x) >> FV_FSHIFT) + +#define FV_TIMER (3 * hz) /* timer value for garbage collector */ +#define FV_FLOWLISTSIZE 64 /* how many flows in flowlist */ + +#define FV_N 10 /* update fve_f every FV_N packets */ + +#define FV_BACKOFFTHRESH 1 /* backoff threshold interval in second */ +#define FV_TTHRESH 3 /* time threshold to delete fve */ +#define FV_ALPHA 5 /* extra packet count */ + +#define FV_STATS + +#if (__FreeBSD_version > 300000) +#define FV_TIMESTAMP(tp) getmicrotime(tp) +#else +#define FV_TIMESTAMP(tp) { (*(tp)) = time; } +#endif + +/* + * Brtt table: 127 entry table to convert drop rate (p) to + * the corresponding bandwidth fraction (f) + * the following equation is implemented to use scaled values, + * fve_p and fve_f, in the fixed point format. 
+ * + * Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p)) + * f = Brtt(p) / (max_th + alpha) + */ +#define BRTT_SIZE 128 +#define BRTT_SHIFT 12 +#define BRTT_MASK 0x0007f000 +#define BRTT_PMAX (1 << (FV_PSHIFT + FP_SHIFT)) + +const int brtt_tab[BRTT_SIZE] = { + 0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728, + 392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361, + 225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333, + 145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612, + 98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957, + 67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440, + 47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184, + 33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611, + 24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062, + 18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487, + 14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222, + 10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844, + 8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079, + 6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746, + 5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722, + 4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924 +}; + +static __inline struct fve * +flowlist_lookup(fv, pktattr, now) + struct flowvalve *fv; + struct altq_pktattr *pktattr; + struct timeval *now; +{ + struct fve *fve; + int flows; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif + struct timeval tthresh; + + if (pktattr == NULL) + return (NULL); + + tthresh.tv_sec = now->tv_sec - FV_TTHRESH; + flows = 0; + /* + * search the flow list + */ + switch (pktattr->pattr_af) { + case AF_INET: + ip = (struct ip *)pktattr->pattr_hdr; + TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){ + if (fve->fve_lastdrop.tv_sec == 0) + break; + if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) { + fve->fve_lastdrop.tv_sec = 0; + break; + } + if (fve->fve_flow.flow_af == AF_INET && + fve->fve_flow.flow_ip.ip_src.s_addr == + ip->ip_src.s_addr && + 
fve->fve_flow.flow_ip.ip_dst.s_addr == + ip->ip_dst.s_addr) + return (fve); + flows++; + } + break; +#ifdef INET6 + case AF_INET6: + ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){ + if (fve->fve_lastdrop.tv_sec == 0) + break; + if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) { + fve->fve_lastdrop.tv_sec = 0; + break; + } + if (fve->fve_flow.flow_af == AF_INET6 && + IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src, + &ip6->ip6_src) && + IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst, + &ip6->ip6_dst)) + return (fve); + flows++; + } + break; +#endif /* INET6 */ + + default: + /* unknown protocol. no drop. */ + return (NULL); + } + fv->fv_flows = flows; /* save the number of active fve's */ + return (NULL); +} + +static __inline struct fve * +flowlist_reclaim(fv, pktattr) + struct flowvalve *fv; + struct altq_pktattr *pktattr; +{ + struct fve *fve; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif + + /* + * get an entry from the tail of the LRU list. 
+ */ + fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead); + + switch (pktattr->pattr_af) { + case AF_INET: + ip = (struct ip *)pktattr->pattr_hdr; + fve->fve_flow.flow_af = AF_INET; + fve->fve_flow.flow_ip.ip_src = ip->ip_src; + fve->fve_flow.flow_ip.ip_dst = ip->ip_dst; + break; +#ifdef INET6 + case AF_INET6: + ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; + fve->fve_flow.flow_af = AF_INET6; + fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src; + fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst; + break; +#endif + } + + fve->fve_state = Green; + fve->fve_p = 0.0; + fve->fve_f = 0.0; + fve->fve_ifseq = fv->fv_ifseq - 1; + fve->fve_count = 0; + + fv->fv_flows++; +#ifdef FV_STATS + fv->fv_stats.alloc++; +#endif + return (fve); +} + +static __inline void +flowlist_move_to_head(fv, fve) + struct flowvalve *fv; + struct fve *fve; +{ + if (TAILQ_FIRST(&fv->fv_flowlist) != fve) { + TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru); + TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru); + } +} + +#if 0 /* XXX: make the compiler happy (fv_alloc unused) */ +/* + * allocate flowvalve structure + */ +static struct flowvalve * +fv_alloc(rp) + struct red *rp; +{ + struct flowvalve *fv; + struct fve *fve; + int i, num; + + num = FV_FLOWLISTSIZE; + fv = malloc(sizeof(struct flowvalve), + M_DEVBUF, M_WAITOK); + if (fv == NULL) + return (NULL); + bzero(fv, sizeof(struct flowvalve)); + + fv->fv_fves = malloc(sizeof(struct fve) * num, + M_DEVBUF, M_WAITOK); + if (fv->fv_fves == NULL) { + free(fv, M_DEVBUF); + return (NULL); + } + bzero(fv->fv_fves, sizeof(struct fve) * num); + + fv->fv_flows = 0; + TAILQ_INIT(&fv->fv_flowlist); + for (i = 0; i < num; i++) { + fve = &fv->fv_fves[i]; + fve->fve_lastdrop.tv_sec = 0; + TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru); + } + + /* initialize drop rate threshold in scaled fixed-point */ + fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax; + + /* initialize drop rate to fraction table */ + fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, + 
M_DEVBUF, M_WAITOK); + if (fv->fv_p2ftab == NULL) { + free(fv->fv_fves, M_DEVBUF); + free(fv, M_DEVBUF); + return (NULL); + } + /* + * create the p2f table. + * (shift is used to keep the precision) + */ + for (i = 1; i < BRTT_SIZE; i++) { + int f; + + f = brtt_tab[i] << 8; + fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8; + } + + return (fv); +} +#endif + +static void fv_destroy(fv) + struct flowvalve *fv; +{ + free(fv->fv_p2ftab, M_DEVBUF); + free(fv->fv_fves, M_DEVBUF); + free(fv, M_DEVBUF); +} + +static __inline int +fv_p2f(fv, p) + struct flowvalve *fv; + int p; +{ + int val, f; + + if (p >= BRTT_PMAX) + f = fv->fv_p2ftab[BRTT_SIZE-1]; + else if ((val = (p & BRTT_MASK))) + f = fv->fv_p2ftab[(val >> BRTT_SHIFT)]; + else + f = fv->fv_p2ftab[1]; + return (f); +} + +/* + * check if an arriving packet should be pre-dropped. + * called from red_addq() when a packet arrives. + * returns 1 when the packet should be pre-dropped. + * should be called in splimp. + */ +static int +fv_checkflow(fv, pktattr, fcache) + struct flowvalve *fv; + struct altq_pktattr *pktattr; + struct fve **fcache; +{ + struct fve *fve; + struct timeval now; + + fv->fv_ifseq++; + FV_TIMESTAMP(&now); + + if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL) + /* no matching entry in the flowlist */ + return (0); + + *fcache = fve; + + /* update fraction f for every FV_N packets */ + if (++fve->fve_count == FV_N) { + /* + * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f + */ + fve->fve_f = + (FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq) + + fve->fve_f - FV_FUNSCALE(fve->fve_f); + fve->fve_ifseq = fv->fv_ifseq; + fve->fve_count = 0; + } + + /* + * overpumping test + */ + if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) { + int fthresh; + + /* calculate a threshold */ + fthresh = fv_p2f(fv, fve->fve_p); + if (fve->fve_f > fthresh) + fve->fve_state = Red; + } + + if (fve->fve_state == Red) { + /* + * backoff test + */ + if (now.tv_sec - fve->fve_lastdrop.tv_sec > 
FV_BACKOFFTHRESH) { + /* no drop for at least FV_BACKOFFTHRESH sec */ + fve->fve_p = 0; + fve->fve_state = Green; +#ifdef FV_STATS + fv->fv_stats.escape++; +#endif + } else { + /* block this flow */ + flowlist_move_to_head(fv, fve); + fve->fve_lastdrop = now; +#ifdef FV_STATS + fv->fv_stats.predrop++; +#endif + return (1); + } + } + + /* + * p = (1 - Wp) * p + */ + fve->fve_p -= FV_PUNSCALE(fve->fve_p); + if (fve->fve_p < 0) + fve->fve_p = 0; +#ifdef FV_STATS + fv->fv_stats.pass++; +#endif + return (0); +} + +/* + * called from red_addq when a packet is dropped by red. + * should be called in splimp. + */ +static void fv_dropbyred(fv, pktattr, fcache) + struct flowvalve *fv; + struct altq_pktattr *pktattr; + struct fve *fcache; +{ + struct fve *fve; + struct timeval now; + + if (pktattr == NULL) + return; + FV_TIMESTAMP(&now); + + if (fcache != NULL) + /* the fve of this packet is already cached */ + fve = fcache; + else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL) + fve = flowlist_reclaim(fv, pktattr); + + flowlist_move_to_head(fv, fve); + + /* + * update p: the following line cancels the update + * in fv_checkflow() and calculate + * p = Wp + (1 - Wp) * p + */ + fve->fve_p = (1 << FP_SHIFT) + fve->fve_p; + + fve->fve_lastdrop = now; +} + +#endif /* ALTQ_FLOWVALVE */ + +#ifdef KLD_MODULE + +static struct altqsw red_sw = + {"red", redopen, redclose, redioctl}; + +ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw); +MODULE_VERSION(altq_red, 1); + +#endif /* KLD_MODULE */ +#endif /* ALTQ3_COMPAT */ + +#endif /* ALTQ_RED */ diff --git a/freebsd/sys/net/altq/altq_red.h b/freebsd/sys/net/altq/altq_red.h new file mode 100644 index 00000000..8ae8d291 --- /dev/null +++ b/freebsd/sys/net/altq/altq_red.h @@ -0,0 +1,199 @@ +/*- + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $KAME: altq_red.h,v 1.8 2003/07/10 12:07:49 kjc Exp $ + * $FreeBSD$ + */ + +#ifndef _ALTQ_ALTQ_RED_H_ +#define _ALTQ_ALTQ_RED_H_ + +#include <net/altq/altq_classq.h> + +#ifdef ALTQ3_COMPAT +struct red_interface { + char red_ifname[IFNAMSIZ]; +}; + +struct red_stats { + struct red_interface iface; + int q_len; + int q_avg; + + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; + + /* static red parameters */ + int q_limit; + int weight; + int inv_pmax; + int th_min; + int th_max; + + /* flowvalve related stuff */ + u_int fv_flows; + u_int fv_pass; + u_int fv_predrop; + u_int fv_alloc; + u_int fv_escape; +}; + +struct red_conf { + struct red_interface iface; + int red_weight; /* weight for EWMA */ + int red_inv_pmax; /* inverse of max drop probability */ + int red_thmin; /* red min threshold */ + int red_thmax; /* red max threshold */ + int red_limit; /* max queue length */ + int red_pkttime; /* average packet time in usec */ + int red_flags; /* see below */ +}; +#endif /* ALTQ3_COMPAT */ + +/* red flags */ +#define REDF_ECN4 0x01 /* use packet marking for IPv4 packets */ +#define REDF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define REDF_ECN (REDF_ECN4 | REDF_ECN6) +#define REDF_FLOWVALVE 0x04 /* use flowvalve (aka penalty-box) */ + +/* + * simpler versions of red parameters and statistics used by other + * disciplines (e.g., CBQ) + */ +struct redparams { + int th_min; /* red min threshold */ + int th_max; /* red max threshold */ + int inv_pmax; /* inverse of max drop probability */ +}; + +struct redstats { + int q_avg; + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; +}; + +#ifdef ALTQ3_COMPAT +/* + * IOCTLs for RED + */ +#define RED_IF_ATTACH _IOW('Q', 1, struct red_interface) +#define RED_IF_DETACH _IOW('Q', 2, struct red_interface) +#define RED_ENABLE _IOW('Q', 3, struct red_interface) +#define 
RED_DISABLE _IOW('Q', 4, struct red_interface) +#define RED_CONFIG _IOWR('Q', 6, struct red_conf) +#define RED_GETSTATS _IOWR('Q', 12, struct red_stats) +#define RED_SETDEFAULTS _IOW('Q', 30, struct redparams) +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL + +#ifdef ALTQ3_COMPAT +struct flowvalve; +#endif + +/* weight table structure for idle time calibration */ +struct wtab { + struct wtab *w_next; + int w_weight; + int w_param_max; + int w_refcount; + int32_t w_tab[32]; +}; + +typedef struct red { + int red_pkttime; /* average packet time in micro sec + used for idle calibration */ + int red_flags; /* red flags */ + + /* red parameters */ + int red_weight; /* weight for EWMA */ + int red_inv_pmax; /* inverse of max drop probability */ + int red_thmin; /* red min threshold */ + int red_thmax; /* red max threshold */ + + /* variables for internal use */ + int red_wshift; /* log(red_weight) */ + int red_thmin_s; /* th_min scaled by avgshift */ + int red_thmax_s; /* th_max scaled by avgshift */ + int red_probd; /* drop probability denominator */ + + int red_avg; /* queue len avg scaled by avgshift */ + int red_count; /* packet count since last dropped/ + marked packet */ + int red_idle; /* queue was empty */ + int red_old; /* avg is above th_min */ + struct wtab *red_wtab; /* weight table */ + struct timeval red_last; /* time when the queue becomes idle */ + +#ifdef ALTQ3_COMPAT + struct flowvalve *red_flowvalve; /* flowvalve state */ +#endif + + struct { + struct pktcntr xmit_cnt; + struct pktcntr drop_cnt; + u_int drop_forced; + u_int drop_unforced; + u_int marked_packets; + } red_stats; +} red_t; + +#ifdef ALTQ3_COMPAT +typedef struct red_queue { + struct red_queue *rq_next; /* next red_state in the list */ + struct ifaltq *rq_ifq; /* backpointer to ifaltq */ + + class_queue_t *rq_q; + + red_t *rq_red; +} red_queue_t; +#endif /* ALTQ3_COMPAT */ + +/* red drop types */ +#define DTYPE_NODROP 0 /* no drop */ +#define DTYPE_FORCED 1 /* a "forced" drop */ +#define 
DTYPE_EARLY 2 /* an "unforced" (early) drop */ + +extern red_t *red_alloc(int, int, int, int, int, int); +extern void red_destroy(red_t *); +extern void red_getstats(red_t *, struct redstats *); +extern int red_addq(red_t *, class_queue_t *, struct mbuf *, + struct altq_pktattr *); +extern struct mbuf *red_getq(red_t *, class_queue_t *); +extern int drop_early(int, int, int); +extern int mark_ecn(struct mbuf *, struct altq_pktattr *, int); +extern struct wtab *wtab_alloc(int); +extern int wtab_destroy(struct wtab *); +extern int32_t pow_w(struct wtab *, int); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_RED_H_ */ diff --git a/freebsd/sys/net/altq/altq_rio.c b/freebsd/sys/net/altq/altq_rio.c new file mode 100644 index 00000000..bad0257c --- /dev/null +++ b/freebsd/sys/net/altq/altq_rio.c @@ -0,0 +1,846 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/*- + * Copyright (c) 1990-1994 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Computer Systems + * Engineering Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $KAME: altq_rio.c,v 1.17 2003/07/10 12:07:49 kjc Exp $ + * $FreeBSD$ + */ + +#include <rtems/bsd/local/opt_altq.h> +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> +#ifdef ALTQ_RIO /* rio is enabled by ALTQ_RIO option in opt_altq.h */ + +#include <rtems/bsd/sys/param.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/systm.h> +#include <rtems/bsd/sys/errno.h> +#if 1 /* ALTQ3_COMPAT */ +#include <sys/proc.h> +#include <sys/sockio.h> +#include <sys/kernel.h> +#endif + +#include <net/if.h> +#include <net/if_var.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif + +#include <netpfil/pf/pf.h> +#include <netpfil/pf/pf_altq.h> +#include <net/altq/altq.h> +#include <net/altq/altq_cdnr.h> +#include <net/altq/altq_red.h> +#include <net/altq/altq_rio.h> +#ifdef ALTQ3_COMPAT +#include <net/altq/altq_conf.h> +#endif + +/* + * RIO: RED with IN/OUT bit + * described in + * "Explicit Allocation of Best Effort Packet Delivery Service" + * David D. Clark and Wenjia Fang, MIT Lab for Computer Science + * http://diffserv.lcs.mit.edu/Papers/exp-alloc-ddc-wf.{ps,pdf} + * + * this implementation is extended to support more than 2 drop precedence + * values as described in RFC2597 (Assured Forwarding PHB Group). + * + */ +/* + * AF DS (differentiated service) codepoints. 
+ * (classes can be mapped to CBQ or H-FSC classes.) + * + * 0 1 2 3 4 5 6 7 + * +---+---+---+---+---+---+---+---+ + * | CLASS |DropPre| 0 | CU | + * +---+---+---+---+---+---+---+---+ + * + * class 1: 001 + * class 2: 010 + * class 3: 011 + * class 4: 100 + * + * low drop prec: 01 + * medium drop prec: 10 + * high drop prec: 01 + */ + +/* normal red parameters */ +#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ + /* q_weight = 0.00195 */ + +/* red parameters for a slow link */ +#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ + /* q_weight = 0.0078125 */ + +/* red parameters for a very slow link (e.g., dialup) */ +#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ + /* q_weight = 0.015625 */ + +/* fixed-point uses 12-bit decimal places */ +#define FP_SHIFT 12 /* fixed-point shift */ + +/* red parameters for drop probability */ +#define INV_P_MAX 10 /* inverse of max drop probability */ +#define TH_MIN 5 /* min threshold */ +#define TH_MAX 15 /* max threshold */ + +#define RIO_LIMIT 60 /* default max queue length */ +#define RIO_STATS /* collect statistics */ + +#define TV_DELTA(a, b, delta) { \ + register int xxs; \ + \ + delta = (a)->tv_usec - (b)->tv_usec; \ + if ((xxs = (a)->tv_sec - (b)->tv_sec) != 0) { \ + if (xxs < 0) { \ + delta = 60000000; \ + } else if (xxs > 4) { \ + if (xxs > 60) \ + delta = 60000000; \ + else \ + delta += xxs * 1000000; \ + } else while (xxs > 0) { \ + delta += 1000000; \ + xxs--; \ + } \ + } \ +} + +#ifdef ALTQ3_COMPAT +/* rio_list keeps all rio_queue_t's allocated. 
*/ +static rio_queue_t *rio_list = NULL; +#endif +/* default rio parameter values */ +static struct redparams default_rio_params[RIO_NDROPPREC] = { + /* th_min, th_max, inv_pmax */ + { TH_MAX * 2 + TH_MIN, TH_MAX * 3, INV_P_MAX }, /* low drop precedence */ + { TH_MAX + TH_MIN, TH_MAX * 2, INV_P_MAX }, /* medium drop precedence */ + { TH_MIN, TH_MAX, INV_P_MAX } /* high drop precedence */ +}; + +/* internal function prototypes */ +static int dscp2index(u_int8_t); +#ifdef ALTQ3_COMPAT +static int rio_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +static struct mbuf *rio_dequeue(struct ifaltq *, int); +static int rio_request(struct ifaltq *, int, void *); +static int rio_detach(rio_queue_t *); + +/* + * rio device interface + */ +altqdev_decl(rio); + +#endif /* ALTQ3_COMPAT */ + +rio_t * +rio_alloc(int weight, struct redparams *params, int flags, int pkttime) +{ + rio_t *rp; + int w, i; + int npkts_per_sec; + + rp = malloc(sizeof(rio_t), M_DEVBUF, M_NOWAIT | M_ZERO); + if (rp == NULL) + return (NULL); + + rp->rio_flags = flags; + if (pkttime == 0) + /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ + rp->rio_pkttime = 800; + else + rp->rio_pkttime = pkttime; + + if (weight != 0) + rp->rio_weight = weight; + else { + /* use default */ + rp->rio_weight = W_WEIGHT; + + /* when the link is very slow, adjust red parameters */ + npkts_per_sec = 1000000 / rp->rio_pkttime; + if (npkts_per_sec < 50) { + /* up to about 400Kbps */ + rp->rio_weight = W_WEIGHT_2; + } else if (npkts_per_sec < 300) { + /* up to about 2.4Mbps */ + rp->rio_weight = W_WEIGHT_1; + } + } + + /* calculate wshift. weight must be power of 2 */ + w = rp->rio_weight; + for (i = 0; w > 1; i++) + w = w >> 1; + rp->rio_wshift = i; + w = 1 << rp->rio_wshift; + if (w != rp->rio_weight) { + printf("invalid weight value %d for red! 
use %d\n", + rp->rio_weight, w); + rp->rio_weight = w; + } + + /* allocate weight table */ + rp->rio_wtab = wtab_alloc(rp->rio_weight); + + for (i = 0; i < RIO_NDROPPREC; i++) { + struct dropprec_state *prec = &rp->rio_precstate[i]; + + prec->avg = 0; + prec->idle = 1; + + if (params == NULL || params[i].inv_pmax == 0) + prec->inv_pmax = default_rio_params[i].inv_pmax; + else + prec->inv_pmax = params[i].inv_pmax; + if (params == NULL || params[i].th_min == 0) + prec->th_min = default_rio_params[i].th_min; + else + prec->th_min = params[i].th_min; + if (params == NULL || params[i].th_max == 0) + prec->th_max = default_rio_params[i].th_max; + else + prec->th_max = params[i].th_max; + + /* + * th_min_s and th_max_s are scaled versions of th_min + * and th_max to be compared with avg. + */ + prec->th_min_s = prec->th_min << (rp->rio_wshift + FP_SHIFT); + prec->th_max_s = prec->th_max << (rp->rio_wshift + FP_SHIFT); + + /* + * precompute probability denominator + * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point + */ + prec->probd = (2 * (prec->th_max - prec->th_min) + * prec->inv_pmax) << FP_SHIFT; + + microtime(&prec->last); + } + + return (rp); +} + +void +rio_destroy(rio_t *rp) +{ + wtab_destroy(rp->rio_wtab); + free(rp, M_DEVBUF); +} + +void +rio_getstats(rio_t *rp, struct redstats *sp) +{ + int i; + + for (i = 0; i < RIO_NDROPPREC; i++) { + bcopy(&rp->q_stats[i], sp, sizeof(struct redstats)); + sp->q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift; + sp++; + } +} + +#if (RIO_NDROPPREC == 3) +/* + * internally, a drop precedence value is converted to an index + * starting from 0. + */ +static int +dscp2index(u_int8_t dscp) +{ + int dpindex = dscp & AF_DROPPRECMASK; + + if (dpindex == 0) + return (0); + return ((dpindex >> 3) - 1); +} +#endif + +#if 1 +/* + * kludge: when a packet is dequeued, we need to know its drop precedence + * in order to keep the queue length of each drop precedence. + * use m_pkthdr.rcvif to pass this info. 
+ */ +#define RIOM_SET_PRECINDEX(m, idx) \ + do { (m)->m_pkthdr.rcvif = (void *)((long)(idx)); } while (0) +#define RIOM_GET_PRECINDEX(m) \ + ({ long idx; idx = (long)((m)->m_pkthdr.rcvif); \ + (m)->m_pkthdr.rcvif = NULL; idx; }) +#endif + +int +rio_addq(rio_t *rp, class_queue_t *q, struct mbuf *m, + struct altq_pktattr *pktattr) +{ + int avg, droptype; + u_int8_t dsfield, odsfield; + int dpindex, i, n, t; + struct timeval now; + struct dropprec_state *prec; + + dsfield = odsfield = read_dsfield(m, pktattr); + dpindex = dscp2index(dsfield); + + /* + * update avg of the precedence states whose drop precedence + * is larger than or equal to the drop precedence of the packet + */ + now.tv_sec = 0; + for (i = dpindex; i < RIO_NDROPPREC; i++) { + prec = &rp->rio_precstate[i]; + avg = prec->avg; + if (prec->idle) { + prec->idle = 0; + if (now.tv_sec == 0) + microtime(&now); + t = (now.tv_sec - prec->last.tv_sec); + if (t > 60) + avg = 0; + else { + t = t * 1000000 + + (now.tv_usec - prec->last.tv_usec); + n = t / rp->rio_pkttime; + /* calculate (avg = (1 - Wq)^n * avg) */ + if (n > 0) + avg = (avg >> FP_SHIFT) * + pow_w(rp->rio_wtab, n); + } + } + + /* run estimator. (avg is scaled by WEIGHT in fixed-point) */ + avg += (prec->qlen << FP_SHIFT) - (avg >> rp->rio_wshift); + prec->avg = avg; /* save the new value */ + /* + * count keeps a tally of arriving traffic that has not + * been dropped. 
+ */ + prec->count++; + } + + prec = &rp->rio_precstate[dpindex]; + avg = prec->avg; + + /* see if we drop early */ + droptype = DTYPE_NODROP; + if (avg >= prec->th_min_s && prec->qlen > 1) { + if (avg >= prec->th_max_s) { + /* avg >= th_max: forced drop */ + droptype = DTYPE_FORCED; + } else if (prec->old == 0) { + /* first exceeds th_min */ + prec->count = 1; + prec->old = 1; + } else if (drop_early((avg - prec->th_min_s) >> rp->rio_wshift, + prec->probd, prec->count)) { + /* unforced drop by red */ + droptype = DTYPE_EARLY; + } + } else { + /* avg < th_min */ + prec->old = 0; + } + + /* + * if the queue length hits the hard limit, it's a forced drop. + */ + if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) + droptype = DTYPE_FORCED; + + if (droptype != DTYPE_NODROP) { + /* always drop incoming packet (as opposed to randomdrop) */ + for (i = dpindex; i < RIO_NDROPPREC; i++) + rp->rio_precstate[i].count = 0; +#ifdef RIO_STATS + if (droptype == DTYPE_EARLY) + rp->q_stats[dpindex].drop_unforced++; + else + rp->q_stats[dpindex].drop_forced++; + PKTCNTR_ADD(&rp->q_stats[dpindex].drop_cnt, m_pktlen(m)); +#endif + m_freem(m); + return (-1); + } + + for (i = dpindex; i < RIO_NDROPPREC; i++) + rp->rio_precstate[i].qlen++; + + /* save drop precedence index in mbuf hdr */ + RIOM_SET_PRECINDEX(m, dpindex); + + if (rp->rio_flags & RIOF_CLEARDSCP) + dsfield &= ~DSCP_MASK; + + if (dsfield != odsfield) + write_dsfield(m, pktattr, dsfield); + + _addq(q, m); + +#ifdef RIO_STATS + PKTCNTR_ADD(&rp->q_stats[dpindex].xmit_cnt, m_pktlen(m)); +#endif + return (0); +} + +struct mbuf * +rio_getq(rio_t *rp, class_queue_t *q) +{ + struct mbuf *m; + int dpindex, i; + + if ((m = _getq(q)) == NULL) + return NULL; + + dpindex = RIOM_GET_PRECINDEX(m); + for (i = dpindex; i < RIO_NDROPPREC; i++) { + if (--rp->rio_precstate[i].qlen == 0) { + if (rp->rio_precstate[i].idle == 0) { + rp->rio_precstate[i].idle = 1; + microtime(&rp->rio_precstate[i].last); + } + } + } + return (m); +} + +#ifdef 
ALTQ3_COMPAT +int +rioopen(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + /* everything will be done when the queueing scheme is attached. */ + return 0; +} + +int +rioclose(dev, flag, fmt, p) + dev_t dev; + int flag, fmt; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + rio_queue_t *rqp; + int err, error = 0; + + while ((rqp = rio_list) != NULL) { + /* destroy all */ + err = rio_detach(rqp); + if (err != 0 && error == 0) + error = err; + } + + return error; +} + +int +rioioctl(dev, cmd, addr, flag, p) + dev_t dev; + ioctlcmd_t cmd; + caddr_t addr; + int flag; +#if (__FreeBSD_version > 500000) + struct thread *p; +#else + struct proc *p; +#endif +{ + rio_queue_t *rqp; + struct rio_interface *ifacep; + struct ifnet *ifp; + int error = 0; + + /* check super-user privilege */ + switch (cmd) { + case RIO_GETSTATS: + break; + default: +#if (__FreeBSD_version > 700000) + if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) + return (error); +#elsif (__FreeBSD_version > 400000) + if ((error = suser(p)) != 0) + return (error); +#else + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); +#endif + break; + } + + switch (cmd) { + + case RIO_ENABLE: + ifacep = (struct rio_interface *)addr; + if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + error = altq_enable(rqp->rq_ifq); + break; + + case RIO_DISABLE: + ifacep = (struct rio_interface *)addr; + if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + error = altq_disable(rqp->rq_ifq); + break; + + case RIO_IF_ATTACH: + ifp = ifunit(((struct rio_interface *)addr)->rio_ifname); + if (ifp == NULL) { + error = ENXIO; + break; + } + + /* allocate and initialize rio_queue_t */ + rqp = malloc(sizeof(rio_queue_t), M_DEVBUF, M_WAITOK); + if (rqp == NULL) { + error = ENOMEM; + break; + } + 
bzero(rqp, sizeof(rio_queue_t)); + + rqp->rq_q = malloc(sizeof(class_queue_t), + M_DEVBUF, M_WAITOK); + if (rqp->rq_q == NULL) { + free(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + bzero(rqp->rq_q, sizeof(class_queue_t)); + + rqp->rq_rio = rio_alloc(0, NULL, 0, 0); + if (rqp->rq_rio == NULL) { + free(rqp->rq_q, M_DEVBUF); + free(rqp, M_DEVBUF); + error = ENOMEM; + break; + } + + rqp->rq_ifq = &ifp->if_snd; + qtail(rqp->rq_q) = NULL; + qlen(rqp->rq_q) = 0; + qlimit(rqp->rq_q) = RIO_LIMIT; + qtype(rqp->rq_q) = Q_RIO; + + /* + * set RIO to this ifnet structure. + */ + error = altq_attach(rqp->rq_ifq, ALTQT_RIO, rqp, + rio_enqueue, rio_dequeue, rio_request, + NULL, NULL); + if (error) { + rio_destroy(rqp->rq_rio); + free(rqp->rq_q, M_DEVBUF); + free(rqp, M_DEVBUF); + break; + } + + /* add this state to the rio list */ + rqp->rq_next = rio_list; + rio_list = rqp; + break; + + case RIO_IF_DETACH: + ifacep = (struct rio_interface *)addr; + if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + error = rio_detach(rqp); + break; + + case RIO_GETSTATS: + do { + struct rio_stats *q_stats; + rio_t *rp; + int i; + + q_stats = (struct rio_stats *)addr; + if ((rqp = altq_lookup(q_stats->iface.rio_ifname, + ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + + rp = rqp->rq_rio; + + q_stats->q_limit = qlimit(rqp->rq_q); + q_stats->weight = rp->rio_weight; + q_stats->flags = rp->rio_flags; + + for (i = 0; i < RIO_NDROPPREC; i++) { + q_stats->q_len[i] = rp->rio_precstate[i].qlen; + bcopy(&rp->q_stats[i], &q_stats->q_stats[i], + sizeof(struct redstats)); + q_stats->q_stats[i].q_avg = + rp->rio_precstate[i].avg >> rp->rio_wshift; + + q_stats->q_params[i].inv_pmax + = rp->rio_precstate[i].inv_pmax; + q_stats->q_params[i].th_min + = rp->rio_precstate[i].th_min; + q_stats->q_params[i].th_max + = rp->rio_precstate[i].th_max; + } + } while (/*CONSTCOND*/ 0); + break; + + case RIO_CONFIG: + do { + struct rio_conf *fc; + rio_t *new; + int s, 
limit, i; + + fc = (struct rio_conf *)addr; + if ((rqp = altq_lookup(fc->iface.rio_ifname, + ALTQT_RIO)) == NULL) { + error = EBADF; + break; + } + + new = rio_alloc(fc->rio_weight, &fc->q_params[0], + fc->rio_flags, fc->rio_pkttime); + if (new == NULL) { + error = ENOMEM; + break; + } + + s = splnet(); + _flushq(rqp->rq_q); + limit = fc->rio_limit; + if (limit < fc->q_params[RIO_NDROPPREC-1].th_max) + limit = fc->q_params[RIO_NDROPPREC-1].th_max; + qlimit(rqp->rq_q) = limit; + + rio_destroy(rqp->rq_rio); + rqp->rq_rio = new; + + splx(s); + + /* write back new values */ + fc->rio_limit = limit; + for (i = 0; i < RIO_NDROPPREC; i++) { + fc->q_params[i].inv_pmax = + rqp->rq_rio->rio_precstate[i].inv_pmax; + fc->q_params[i].th_min = + rqp->rq_rio->rio_precstate[i].th_min; + fc->q_params[i].th_max = + rqp->rq_rio->rio_precstate[i].th_max; + } + } while (/*CONSTCOND*/ 0); + break; + + case RIO_SETDEFAULTS: + do { + struct redparams *rp; + int i; + + rp = (struct redparams *)addr; + for (i = 0; i < RIO_NDROPPREC; i++) + default_rio_params[i] = rp[i]; + } while (/*CONSTCOND*/ 0); + break; + + default: + error = EINVAL; + break; + } + + return error; +} + +static int +rio_detach(rqp) + rio_queue_t *rqp; +{ + rio_queue_t *tmp; + int error = 0; + + if (ALTQ_IS_ENABLED(rqp->rq_ifq)) + altq_disable(rqp->rq_ifq); + + if ((error = altq_detach(rqp->rq_ifq))) + return (error); + + if (rio_list == rqp) + rio_list = rqp->rq_next; + else { + for (tmp = rio_list; tmp != NULL; tmp = tmp->rq_next) + if (tmp->rq_next == rqp) { + tmp->rq_next = rqp->rq_next; + break; + } + if (tmp == NULL) + printf("rio_detach: no state found in rio_list!\n"); + } + + rio_destroy(rqp->rq_rio); + free(rqp->rq_q, M_DEVBUF); + free(rqp, M_DEVBUF); + return (error); +} + +/* + * rio support routines + */ +static int +rio_request(ifq, req, arg) + struct ifaltq *ifq; + int req; + void *arg; +{ + rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; + + IFQ_LOCK_ASSERT(ifq); + + switch (req) { + case ALTRQ_PURGE: + 
_flushq(rqp->rq_q); + if (ALTQ_IS_ENABLED(ifq)) + ifq->ifq_len = 0; + break; + } + return (0); +} + +/* + * enqueue routine: + * + * returns: 0 when successfully queued. + * ENOBUFS when drop occurs. + */ +static int +rio_enqueue(ifq, m, pktattr) + struct ifaltq *ifq; + struct mbuf *m; + struct altq_pktattr *pktattr; +{ + rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; + int error = 0; + + IFQ_LOCK_ASSERT(ifq); + + if (rio_addq(rqp->rq_rio, rqp->rq_q, m, pktattr) == 0) + ifq->ifq_len++; + else + error = ENOBUFS; + return error; +} + +/* + * dequeue routine: + * must be called in splimp. + * + * returns: mbuf dequeued. + * NULL when no packet is available in the queue. + */ + +static struct mbuf * +rio_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; + struct mbuf *m = NULL; + + IFQ_LOCK_ASSERT(ifq); + + if (op == ALTDQ_POLL) + return qhead(rqp->rq_q); + + m = rio_getq(rqp->rq_rio, rqp->rq_q); + if (m != NULL) + ifq->ifq_len--; + return m; +} + +#ifdef KLD_MODULE + +static struct altqsw rio_sw = + {"rio", rioopen, rioclose, rioioctl}; + +ALTQ_MODULE(altq_rio, ALTQT_RIO, &rio_sw); +MODULE_VERSION(altq_rio, 1); +MODULE_DEPEND(altq_rio, altq_red, 1, 1, 1); + +#endif /* KLD_MODULE */ +#endif /* ALTQ3_COMPAT */ + +#endif /* ALTQ_RIO */ diff --git a/freebsd/sys/net/altq/altq_rio.h b/freebsd/sys/net/altq/altq_rio.h new file mode 100644 index 00000000..ce9dc0e0 --- /dev/null +++ b/freebsd/sys/net/altq/altq_rio.h @@ -0,0 +1,145 @@ +/*- + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $KAME: altq_rio.h,v 1.9 2003/07/10 12:07:49 kjc Exp $ + * $FreeBSD$ + */ + +#ifndef _ALTQ_ALTQ_RIO_H_ +#define _ALTQ_ALTQ_RIO_H_ + +#include <net/altq/altq_classq.h> + +/* + * RIO: RED with IN/OUT bit + * (extended to support more than 2 drop precedence values) + */ +#define RIO_NDROPPREC 3 /* number of drop precedence values */ + +#ifdef ALTQ3_COMPAT +struct rio_interface { + char rio_ifname[IFNAMSIZ]; +}; + +struct rio_stats { + struct rio_interface iface; + int q_len[RIO_NDROPPREC]; + struct redstats q_stats[RIO_NDROPPREC]; + + /* static red parameters */ + int q_limit; + int weight; + int flags; + struct redparams q_params[RIO_NDROPPREC]; +}; + +struct rio_conf { + struct rio_interface iface; + struct redparams q_params[RIO_NDROPPREC]; + int rio_weight; /* weight for EWMA */ + int rio_limit; /* max queue length */ + int rio_pkttime; /* average packet time in usec */ + int rio_flags; /* see below */ +}; +#endif /* ALTQ3_COMPAT */ + +/* rio flags */ +#define RIOF_ECN4 0x01 /* use 
packet marking for IPv4 packets */ +#define RIOF_ECN6 0x02 /* use packet marking for IPv6 packets */ +#define RIOF_ECN (RIOF_ECN4 | RIOF_ECN6) +#define RIOF_CLEARDSCP 0x200 /* clear diffserv codepoint */ + +#ifdef ALTQ3_COMPAT +/* + * IOCTLs for RIO + */ +#define RIO_IF_ATTACH _IOW('Q', 1, struct rio_interface) +#define RIO_IF_DETACH _IOW('Q', 2, struct rio_interface) +#define RIO_ENABLE _IOW('Q', 3, struct rio_interface) +#define RIO_DISABLE _IOW('Q', 4, struct rio_interface) +#define RIO_CONFIG _IOWR('Q', 6, struct rio_conf) +#define RIO_GETSTATS _IOWR('Q', 12, struct rio_stats) +#define RIO_SETDEFAULTS _IOW('Q', 30, struct redparams[RIO_NDROPPREC]) +#endif /* ALTQ3_COMPAT */ + +#ifdef _KERNEL + +typedef struct rio { + /* per drop precedence structure */ + struct dropprec_state { + /* red parameters */ + int inv_pmax; /* inverse of max drop probability */ + int th_min; /* red min threshold */ + int th_max; /* red max threshold */ + + /* variables for internal use */ + int th_min_s; /* th_min scaled by avgshift */ + int th_max_s; /* th_max scaled by avgshift */ + int probd; /* drop probability denominator */ + + int qlen; /* queue length */ + int avg; /* (scaled) queue length average */ + int count; /* packet count since the last dropped/ + marked packet */ + int idle; /* queue was empty */ + int old; /* avg is above th_min */ + struct timeval last; /* timestamp when queue becomes idle */ + } rio_precstate[RIO_NDROPPREC]; + + int rio_wshift; /* log(red_weight) */ + int rio_weight; /* weight for EWMA */ + struct wtab *rio_wtab; /* weight table */ + + int rio_pkttime; /* average packet time in micro sec + used for idle calibration */ + int rio_flags; /* rio flags */ + + u_int8_t rio_codepoint; /* codepoint value to tag packets */ + u_int8_t rio_codepointmask; /* codepoint mask bits */ + + struct redstats q_stats[RIO_NDROPPREC]; /* statistics */ +} rio_t; + +#ifdef ALTQ3_COMPAT +typedef struct rio_queue { + struct rio_queue *rq_next; /* next red_state in the list */ 
+ struct ifaltq *rq_ifq; /* backpointer to ifaltq */ + + class_queue_t *rq_q; + + rio_t *rq_rio; +} rio_queue_t; +#endif /* ALTQ3_COMPAT */ + +extern rio_t *rio_alloc(int, struct redparams *, int, int); +extern void rio_destroy(rio_t *); +extern void rio_getstats(rio_t *, struct redstats *); +extern int rio_addq(rio_t *, class_queue_t *, struct mbuf *, + struct altq_pktattr *); +extern struct mbuf *rio_getq(rio_t *, class_queue_t *); + +#endif /* _KERNEL */ + +#endif /* _ALTQ_ALTQ_RIO_H_ */ diff --git a/freebsd/sys/net/altq/altq_rmclass.c b/freebsd/sys/net/altq/altq_rmclass.c new file mode 100644 index 00000000..160884e2 --- /dev/null +++ b/freebsd/sys/net/altq/altq_rmclass.c @@ -0,0 +1,1841 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * LBL code modified by speer@eng.sun.com, May 1977. + * For questions and/or comments, please send mail to cbq@ee.lbl.gov + * + * @(#)rm_class.c 1.48 97/12/05 SMI + * $KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $ + * $FreeBSD$ + */ +#include <rtems/bsd/local/opt_altq.h> +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> +#ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */ + +#include <rtems/bsd/sys/param.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/systm.h> +#include <rtems/bsd/sys/errno.h> +#include <sys/time.h> +#ifdef ALTQ3_COMPAT +#include <sys/kernel.h> +#endif + +#include <net/if.h> +#include <net/if_var.h> +#ifdef ALTQ3_COMPAT +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#endif + +#include <net/altq/if_altq.h> +#include <net/altq/altq.h> +#include <net/altq/altq_codel.h> +#include <net/altq/altq_rmclass.h> +#include <net/altq/altq_rmclass_debug.h> +#include <net/altq/altq_red.h> +#include <net/altq/altq_rio.h> + +/* + * Local Macros + */ + +#define reset_cutoff(ifd) { ifd->cutoff_ = RM_MAXDEPTH; } + +/* + * Local routines. 
+ */ + +static int rmc_satisfied(struct rm_class *, struct timeval *); +static void rmc_wrr_set_weights(struct rm_ifdat *); +static void rmc_depth_compute(struct rm_class *); +static void rmc_depth_recompute(rm_class_t *); + +static mbuf_t *_rmc_wrr_dequeue_next(struct rm_ifdat *, int); +static mbuf_t *_rmc_prr_dequeue_next(struct rm_ifdat *, int); + +static int _rmc_addq(rm_class_t *, mbuf_t *); +static void _rmc_dropq(rm_class_t *); +static mbuf_t *_rmc_getq(rm_class_t *); +static mbuf_t *_rmc_pollq(rm_class_t *); + +static int rmc_under_limit(struct rm_class *, struct timeval *); +static void rmc_tl_satisfied(struct rm_ifdat *, struct timeval *); +static void rmc_drop_action(struct rm_class *); +static void rmc_restart(struct rm_class *); +static void rmc_root_overlimit(struct rm_class *, struct rm_class *); + +#define BORROW_OFFTIME +/* + * BORROW_OFFTIME (experimental): + * borrow the offtime of the class borrowing from. + * the reason is that when its own offtime is set, the class is unable + * to borrow much, especially when cutoff is taking effect. + * but when the borrowed class is overloaded (advidle is close to minidle), + * use the borrowing class's offtime to avoid overload. + */ +#define ADJUST_CUTOFF +/* + * ADJUST_CUTOFF (experimental): + * if no underlimit class is found due to cutoff, increase cutoff and + * retry the scheduling loop. + * also, don't invoke delay_actions while cutoff is taking effect, + * since a sleeping class won't have a chance to be scheduled in the + * next loop. + * + * now heuristics for setting the top-level variable (cutoff_) becomes: + * 1. if a packet arrives for a not-overlimit class, set cutoff + * to the depth of the class. + * 2. if cutoff is i, and a packet arrives for an overlimit class + * with an underlimit ancestor at a lower level than i (say j), + * then set cutoff to j. + * 3. 
at scheduling a packet, if there is no underlimit class + * due to the current cutoff level, increase cutoff by 1 and + * then try to schedule again. + */ + +/* + * rm_class_t * + * rmc_newclass(...) - Create a new resource management class at priority + * 'pri' on the interface given by 'ifd'. + * + * nsecPerByte is the data rate of the interface in nanoseconds/byte. + * E.g., 800 for a 10Mb/s ethernet. If the class gets less + * than 100% of the bandwidth, this number should be the + * 'effective' rate for the class. Let f be the + * bandwidth fraction allocated to this class, and let + * nsPerByte be the data rate of the output link in + * nanoseconds/byte. Then nsecPerByte is set to + * nsPerByte / f. E.g., 1600 (= 800 / .5) + * for a class that gets 50% of an ethernet's bandwidth. + * + * action the routine to call when the class is over limit. + * + * maxq max allowable queue size for class (in packets). + * + * parent parent class pointer. + * + * borrow class to borrow from (should be either 'parent' or null). + * + * maxidle max value allowed for class 'idle' time estimate (this + * parameter determines how large an initial burst of packets + * can be before overlimit action is invoked. + * + * offtime how long 'delay' action will delay when class goes over + * limit (this parameter determines the steady-state burst + * size when a class is running over its limit). 
+ * + * Maxidle and offtime have to be computed from the following: If the + * average packet size is s, the bandwidth fraction allocated to this + * class is f, we want to allow b packet bursts, and the gain of the + * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then: + * + * ptime = s * nsPerByte * (1 - f) / f + * maxidle = ptime * (1 - g^b) / g^b + * minidle = -ptime * (1 / (f - 1)) + * offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1) + * + * Operationally, it's convenient to specify maxidle & offtime in units + * independent of the link bandwidth so the maxidle & offtime passed to + * this routine are the above values multiplied by 8*f/(1000*nsPerByte). + * (The constant factor is a scale factor needed to make the parameters + * integers. This scaling also means that the 'unscaled' values of + * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds, + * not nanoseconds.) Also note that the 'idle' filter computation keeps + * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of + * maxidle also must be scaled upward by this value. 
Thus, the passed + * values for maxidle and offtime can be computed as follows: + * + * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte) + * offtime = offtime * 8 / (1000 * nsecPerByte) + * + * When USE_HRTIME is employed, then maxidle and offtime become: + * maxidle = maxilde * (8.0 / nsecPerByte); + * offtime = offtime * (8.0 / nsecPerByte); + */ +struct rm_class * +rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte, + void (*action)(rm_class_t *, rm_class_t *), int maxq, + struct rm_class *parent, struct rm_class *borrow, u_int maxidle, + int minidle, u_int offtime, int pktsize, int flags) +{ + struct rm_class *cl; + struct rm_class *peer; + int s; + + if (pri >= RM_MAXPRIO) + return (NULL); +#ifndef ALTQ_RED + if (flags & RMCF_RED) { +#ifdef ALTQ_DEBUG + printf("rmc_newclass: RED not configured for CBQ!\n"); +#endif + return (NULL); + } +#endif +#ifndef ALTQ_RIO + if (flags & RMCF_RIO) { +#ifdef ALTQ_DEBUG + printf("rmc_newclass: RIO not configured for CBQ!\n"); +#endif + return (NULL); + } +#endif +#ifndef ALTQ_CODEL + if (flags & RMCF_CODEL) { +#ifdef ALTQ_DEBUG + printf("rmc_newclass: CODEL not configured for CBQ!\n"); +#endif + return (NULL); + } +#endif + + cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_NOWAIT | M_ZERO); + if (cl == NULL) + return (NULL); + CALLOUT_INIT(&cl->callout_); + cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO); + if (cl->q_ == NULL) { + free(cl, M_DEVBUF); + return (NULL); + } + + /* + * Class initialization. 
+ */ + cl->children_ = NULL; + cl->parent_ = parent; + cl->borrow_ = borrow; + cl->leaf_ = 1; + cl->ifdat_ = ifd; + cl->pri_ = pri; + cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ + cl->depth_ = 0; + cl->qthresh_ = 0; + cl->ns_per_byte_ = nsecPerByte; + + qlimit(cl->q_) = maxq; + qtype(cl->q_) = Q_DROPHEAD; + qlen(cl->q_) = 0; + cl->flags_ = flags; + +#if 1 /* minidle is also scaled in ALTQ */ + cl->minidle_ = (minidle * (int)nsecPerByte) / 8; + if (cl->minidle_ > 0) + cl->minidle_ = 0; +#else + cl->minidle_ = minidle; +#endif + cl->maxidle_ = (maxidle * nsecPerByte) / 8; + if (cl->maxidle_ == 0) + cl->maxidle_ = 1; +#if 1 /* offtime is also scaled in ALTQ */ + cl->avgidle_ = cl->maxidle_; + cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; + if (cl->offtime_ == 0) + cl->offtime_ = 1; +#else + cl->avgidle_ = 0; + cl->offtime_ = (offtime * nsecPerByte) / 8; +#endif + cl->overlimit = action; + +#ifdef ALTQ_RED + if (flags & (RMCF_RED|RMCF_RIO)) { + int red_flags, red_pkttime; + + red_flags = 0; + if (flags & RMCF_ECN) + red_flags |= REDF_ECN; + if (flags & RMCF_FLOWVALVE) + red_flags |= REDF_FLOWVALVE; +#ifdef ALTQ_RIO + if (flags & RMCF_CLEARDSCP) + red_flags |= RIOF_CLEARDSCP; +#endif + red_pkttime = nsecPerByte * pktsize / 1000; + + if (flags & RMCF_RED) { + cl->red_ = red_alloc(0, 0, + qlimit(cl->q_) * 10/100, + qlimit(cl->q_) * 30/100, + red_flags, red_pkttime); + if (cl->red_ != NULL) + qtype(cl->q_) = Q_RED; + } +#ifdef ALTQ_RIO + else { + cl->red_ = (red_t *)rio_alloc(0, NULL, + red_flags, red_pkttime); + if (cl->red_ != NULL) + qtype(cl->q_) = Q_RIO; + } +#endif + } +#endif /* ALTQ_RED */ +#ifdef ALTQ_CODEL + if (flags & RMCF_CODEL) { + cl->codel_ = codel_alloc(5, 100, 0); + if (cl->codel_ != NULL) + qtype(cl->q_) = Q_CODEL; + } +#endif + + /* + * put the class into the class tree + */ + s = splnet(); + IFQ_LOCK(ifd->ifq_); + if ((peer = ifd->active_[pri]) != NULL) { + /* find the last class at this pri */ + cl->peer_ = 
peer; + while (peer->peer_ != ifd->active_[pri]) + peer = peer->peer_; + peer->peer_ = cl; + } else { + ifd->active_[pri] = cl; + cl->peer_ = cl; + } + + if (cl->parent_) { + cl->next_ = parent->children_; + parent->children_ = cl; + parent->leaf_ = 0; + } + + /* + * Compute the depth of this class and its ancestors in the class + * hierarchy. + */ + rmc_depth_compute(cl); + + /* + * If CBQ's WRR is enabled, then initialize the class WRR state. + */ + if (ifd->wrr_) { + ifd->num_[pri]++; + ifd->alloc_[pri] += cl->allotment_; + rmc_wrr_set_weights(ifd); + } + IFQ_UNLOCK(ifd->ifq_); + splx(s); + return (cl); +} + +int +rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle, + int minidle, u_int offtime, int pktsize) +{ + struct rm_ifdat *ifd; + u_int old_allotment; + int s; + + ifd = cl->ifdat_; + old_allotment = cl->allotment_; + + s = splnet(); + IFQ_LOCK(ifd->ifq_); + cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ + cl->qthresh_ = 0; + cl->ns_per_byte_ = nsecPerByte; + + qlimit(cl->q_) = maxq; + +#if 1 /* minidle is also scaled in ALTQ */ + cl->minidle_ = (minidle * nsecPerByte) / 8; + if (cl->minidle_ > 0) + cl->minidle_ = 0; +#else + cl->minidle_ = minidle; +#endif + cl->maxidle_ = (maxidle * nsecPerByte) / 8; + if (cl->maxidle_ == 0) + cl->maxidle_ = 1; +#if 1 /* offtime is also scaled in ALTQ */ + cl->avgidle_ = cl->maxidle_; + cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; + if (cl->offtime_ == 0) + cl->offtime_ = 1; +#else + cl->avgidle_ = 0; + cl->offtime_ = (offtime * nsecPerByte) / 8; +#endif + + /* + * If CBQ's WRR is enabled, then initialize the class WRR state. 
+ */ + if (ifd->wrr_) { + ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment; + rmc_wrr_set_weights(ifd); + } + IFQ_UNLOCK(ifd->ifq_); + splx(s); + return (0); +} + +/* + * static void + * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes + * the appropriate run robin weights for the CBQ weighted round robin + * algorithm. + * + * Returns: NONE + */ + +static void +rmc_wrr_set_weights(struct rm_ifdat *ifd) +{ + int i; + struct rm_class *cl, *clh; + + for (i = 0; i < RM_MAXPRIO; i++) { + /* + * This is inverted from that of the simulator to + * maintain precision. + */ + if (ifd->num_[i] == 0) + ifd->M_[i] = 0; + else + ifd->M_[i] = ifd->alloc_[i] / + (ifd->num_[i] * ifd->maxpkt_); + /* + * Compute the weighted allotment for each class. + * This takes the expensive div instruction out + * of the main loop for the wrr scheduling path. + * These only get recomputed when a class comes or + * goes. + */ + if (ifd->active_[i] != NULL) { + clh = cl = ifd->active_[i]; + do { + /* safe-guard for slow link or alloc_ == 0 */ + if (ifd->M_[i] == 0) + cl->w_allotment_ = 0; + else + cl->w_allotment_ = cl->allotment_ / + ifd->M_[i]; + cl = cl->peer_; + } while ((cl != NULL) && (cl != clh)); + } + } +} + +int +rmc_get_weight(struct rm_ifdat *ifd, int pri) +{ + if ((pri >= 0) && (pri < RM_MAXPRIO)) + return (ifd->M_[pri]); + else + return (0); +} + +/* + * static void + * rmc_depth_compute(struct rm_class *cl) - This function computes the + * appropriate depth of class 'cl' and its ancestors. + * + * Returns: NONE + */ + +static void +rmc_depth_compute(struct rm_class *cl) +{ + rm_class_t *t = cl, *p; + + /* + * Recompute the depth for the branch of the tree. + */ + while (t != NULL) { + p = t->parent_; + if (p && (t->depth_ >= p->depth_)) { + p->depth_ = t->depth_ + 1; + t = p; + } else + t = NULL; + } +} + +/* + * static void + * rmc_depth_recompute(struct rm_class *cl) - This function re-computes + * the depth of the tree after a class has been deleted. 
+ * + * Returns: NONE + */ + +static void +rmc_depth_recompute(rm_class_t *cl) +{ +#if 1 /* ALTQ */ + rm_class_t *p, *t; + + p = cl; + while (p != NULL) { + if ((t = p->children_) == NULL) { + p->depth_ = 0; + } else { + int cdepth = 0; + + while (t != NULL) { + if (t->depth_ > cdepth) + cdepth = t->depth_; + t = t->next_; + } + + if (p->depth_ == cdepth + 1) + /* no change to this parent */ + return; + + p->depth_ = cdepth + 1; + } + + p = p->parent_; + } +#else + rm_class_t *t; + + if (cl->depth_ >= 1) { + if (cl->children_ == NULL) { + cl->depth_ = 0; + } else if ((t = cl->children_) != NULL) { + while (t != NULL) { + if (t->children_ != NULL) + rmc_depth_recompute(t); + t = t->next_; + } + } else + rmc_depth_compute(cl); + } +#endif +} + +/* + * void + * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This + * function deletes a class from the link-sharing structure and frees + * all resources associated with the class. + * + * Returns: NONE + */ + +void +rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl) +{ + struct rm_class *p, *head, *previous; + int s; + + ASSERT(cl->children_ == NULL); + + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + + s = splnet(); + IFQ_LOCK(ifd->ifq_); + /* + * Free packets in the packet queue. + * XXX - this may not be a desired behavior. Packets should be + * re-queued. + */ + rmc_dropall(cl); + + /* + * If the class has a parent, then remove the class from the + * class from the parent's children chain. + */ + if (cl->parent_ != NULL) { + head = cl->parent_->children_; + p = previous = head; + if (head->next_ == NULL) { + ASSERT(head == cl); + cl->parent_->children_ = NULL; + cl->parent_->leaf_ = 1; + } else while (p != NULL) { + if (p == cl) { + if (cl == head) + cl->parent_->children_ = cl->next_; + else + previous->next_ = cl->next_; + cl->next_ = NULL; + p = NULL; + } else { + previous = p; + p = p->next_; + } + } + } + + /* + * Delete class from class priority peer list. 
+ */ + if ((p = ifd->active_[cl->pri_]) != NULL) { + /* + * If there is more than one member of this priority + * level, then look for class(cl) in the priority level. + */ + if (p != p->peer_) { + while (p->peer_ != cl) + p = p->peer_; + p->peer_ = cl->peer_; + + if (ifd->active_[cl->pri_] == cl) + ifd->active_[cl->pri_] = cl->peer_; + } else { + ASSERT(p == cl); + ifd->active_[cl->pri_] = NULL; + } + } + + /* + * Recompute the WRR weights. + */ + if (ifd->wrr_) { + ifd->alloc_[cl->pri_] -= cl->allotment_; + ifd->num_[cl->pri_]--; + rmc_wrr_set_weights(ifd); + } + + /* + * Re-compute the depth of the tree. + */ +#if 1 /* ALTQ */ + rmc_depth_recompute(cl->parent_); +#else + rmc_depth_recompute(ifd->root_); +#endif + + IFQ_UNLOCK(ifd->ifq_); + splx(s); + + /* + * Free the class structure. + */ + if (cl->red_ != NULL) { +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + rio_destroy((rio_t *)cl->red_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + red_destroy(cl->red_); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->q_)) + codel_destroy(cl->codel_); +#endif + } + free(cl->q_, M_DEVBUF); + free(cl, M_DEVBUF); +} + + +/* + * void + * rmc_init(...) - Initialize the resource management data structures + * associated with the output portion of interface 'ifp'. 'ifd' is + * where the structures will be built (for backwards compatibility, the + * structures aren't kept in the ifnet struct). 'nsecPerByte' + * gives the link speed (inverse of bandwidth) in nanoseconds/byte. + * 'restart' is the driver-specific routine that the generic 'delay + * until under limit' action will call to restart output. `maxq' + * is the queue size of the 'link' & 'default' classes. 'maxqueued' + * is the maximum number of packets that the resource management + * code will allow to be queued 'downstream' (this is typically 1). 
+ * + * Returns: NONE + */ + +void +rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte, + void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle, + int minidle, u_int offtime, int flags) +{ + int i, mtu; + + /* + * Initialize the CBQ tracing/debug facility. + */ + CBQTRACEINIT(); + + bzero((char *)ifd, sizeof (*ifd)); + mtu = ifq->altq_ifp->if_mtu; + ifd->ifq_ = ifq; + ifd->restart = restart; + ifd->maxqueued_ = maxqueued; + ifd->ns_per_byte_ = nsecPerByte; + ifd->maxpkt_ = mtu; + ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0; + ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0; +#if 1 + ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16; + if (mtu * nsecPerByte > 10 * 1000000) + ifd->maxiftime_ /= 4; +#endif + + reset_cutoff(ifd); + CBQTRACE(rmc_init, 'INIT', ifd->cutoff_); + + /* + * Initialize the CBQ's WRR state. + */ + for (i = 0; i < RM_MAXPRIO; i++) { + ifd->alloc_[i] = 0; + ifd->M_[i] = 0; + ifd->num_[i] = 0; + ifd->na_[i] = 0; + ifd->active_[i] = NULL; + } + + /* + * Initialize current packet state. + */ + ifd->qi_ = 0; + ifd->qo_ = 0; + for (i = 0; i < RM_MAXQUEUED; i++) { + ifd->class_[i] = NULL; + ifd->curlen_[i] = 0; + ifd->borrowed_[i] = NULL; + } + + /* + * Create the root class of the link-sharing structure. + */ + if ((ifd->root_ = rmc_newclass(0, ifd, + nsecPerByte, + rmc_root_overlimit, maxq, 0, 0, + maxidle, minidle, offtime, + 0, 0)) == NULL) { + printf("rmc_init: root class not allocated\n"); + return ; + } + ifd->root_->depth_ = 0; +} + +/* + * void + * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by + * mbuf 'm' to queue for resource class 'cl'. This routine is called + * by a driver's if_output routine. This routine must be called with + * output packet completion interrupts locked out (to avoid racing with + * rmc_dequeue_next). 
+ * + * Returns: 0 on successful queueing + * -1 when packet drop occurs + */ +int +rmc_queue_packet(struct rm_class *cl, mbuf_t *m) +{ + struct timeval now; + struct rm_ifdat *ifd = cl->ifdat_; + int cpri = cl->pri_; + int is_empty = qempty(cl->q_); + + RM_GETTIME(now); + if (ifd->cutoff_ > 0) { + if (TV_LT(&cl->undertime_, &now)) { + if (ifd->cutoff_ > cl->depth_) + ifd->cutoff_ = cl->depth_; + CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_); + } +#if 1 /* ALTQ */ + else { + /* + * the class is overlimit. if the class has + * underlimit ancestors, set cutoff to the lowest + * depth among them. + */ + struct rm_class *borrow = cl->borrow_; + + while (borrow != NULL && + borrow->depth_ < ifd->cutoff_) { + if (TV_LT(&borrow->undertime_, &now)) { + ifd->cutoff_ = borrow->depth_; + CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_); + break; + } + borrow = borrow->borrow_; + } + } +#else /* !ALTQ */ + else if ((ifd->cutoff_ > 1) && cl->borrow_) { + if (TV_LT(&cl->borrow_->undertime_, &now)) { + ifd->cutoff_ = cl->borrow_->depth_; + CBQTRACE(rmc_queue_packet, 'ffob', + cl->borrow_->depth_); + } + } +#endif /* !ALTQ */ + } + + if (_rmc_addq(cl, m) < 0) + /* failed */ + return (-1); + + if (is_empty) { + CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle); + ifd->na_[cpri]++; + } + + if (qlen(cl->q_) > qlimit(cl->q_)) { + /* note: qlimit can be set to 0 or 1 */ + rmc_drop_action(cl); + return (-1); + } + return (0); +} + +/* + * void + * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all + * classes to see if there are satified. + */ + +static void +rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) +{ + int i; + rm_class_t *p, *bp; + + for (i = RM_MAXPRIO - 1; i >= 0; i--) { + if ((bp = ifd->active_[i]) != NULL) { + p = bp; + do { + if (!rmc_satisfied(p, now)) { + ifd->cutoff_ = p->depth_; + return; + } + p = p->peer_; + } while (p != bp); + } + } + + reset_cutoff(ifd); +} + +/* + * rmc_satisfied - Return 1 of the class is satisfied. 
O, otherwise. + */ + +static int +rmc_satisfied(struct rm_class *cl, struct timeval *now) +{ + rm_class_t *p; + + if (cl == NULL) + return (1); + if (TV_LT(now, &cl->undertime_)) + return (1); + if (cl->depth_ == 0) { + if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_)) + return (0); + else + return (1); + } + if (cl->children_ != NULL) { + p = cl->children_; + while (p != NULL) { + if (!rmc_satisfied(p, now)) + return (0); + p = p->next_; + } + } + + return (1); +} + +/* + * Return 1 if class 'cl' is under limit or can borrow from a parent, + * 0 if overlimit. As a side-effect, this routine will invoke the + * class overlimit action if the class if overlimit. + */ + +static int +rmc_under_limit(struct rm_class *cl, struct timeval *now) +{ + rm_class_t *p = cl; + rm_class_t *top; + struct rm_ifdat *ifd = cl->ifdat_; + + ifd->borrowed_[ifd->qi_] = NULL; + /* + * If cl is the root class, then always return that it is + * underlimit. Otherwise, check to see if the class is underlimit. + */ + if (cl->parent_ == NULL) + return (1); + + if (cl->sleeping_) { + if (TV_LT(now, &cl->undertime_)) + return (0); + + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; + return (1); + } + + top = NULL; + while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) { + if (((cl = cl->borrow_) == NULL) || + (cl->depth_ > ifd->cutoff_)) { +#ifdef ADJUST_CUTOFF + if (cl != NULL) + /* cutoff is taking effect, just + return false without calling + the delay action. */ + return (0); +#endif +#ifdef BORROW_OFFTIME + /* + * check if the class can borrow offtime too. + * borrow offtime from the top of the borrow + * chain if the top class is not overloaded. + */ + if (cl != NULL) { + /* cutoff is taking effect, use this class as top. 
*/ + top = cl; + CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_); + } + if (top != NULL && top->avgidle_ == top->minidle_) + top = NULL; + p->overtime_ = *now; + (p->overlimit)(p, top); +#else + p->overtime_ = *now; + (p->overlimit)(p, NULL); +#endif + return (0); + } + top = cl; + } + + if (cl != p) + ifd->borrowed_[ifd->qi_] = cl; + return (1); +} + +/* + * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to + * Packet-by-packet round robin. + * + * The heart of the weighted round-robin scheduler, which decides which + * class next gets to send a packet. Highest priority first, then + * weighted round-robin within priorites. + * + * Each able-to-send class gets to send until its byte allocation is + * exhausted. Thus, the active pointer is only changed after a class has + * exhausted its allocation. + * + * If the scheduler finds no class that is underlimit or able to borrow, + * then the first class found that had a nonzero queue and is allowed to + * borrow gets to send. + */ + +static mbuf_t * +_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op) +{ + struct rm_class *cl = NULL, *first = NULL; + u_int deficit; + int cpri; + mbuf_t *m; + struct timeval now; + + RM_GETTIME(now); + + /* + * if the driver polls the top of the queue and then removes + * the polled packet, we must return the same packet. 
+ */ + if (op == ALTDQ_REMOVE && ifd->pollcache_) { + cl = ifd->pollcache_; + cpri = cl->pri_; + if (ifd->efficient_) { + /* check if this class is overlimit */ + if (cl->undertime_.tv_sec != 0 && + rmc_under_limit(cl, &now) == 0) + first = cl; + } + ifd->pollcache_ = NULL; + goto _wrr_out; + } + else { + /* mode == ALTDQ_POLL || pollcache == NULL */ + ifd->pollcache_ = NULL; + ifd->borrowed_[ifd->qi_] = NULL; + } +#ifdef ADJUST_CUTOFF + _again: +#endif + for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { + if (ifd->na_[cpri] == 0) + continue; + deficit = 0; + /* + * Loop through twice for a priority level, if some class + * was unable to send a packet the first round because + * of the weighted round-robin mechanism. + * During the second loop at this level, deficit==2. + * (This second loop is not needed if for every class, + * "M[cl->pri_])" times "cl->allotment" is greater than + * the byte size for the largest packet in the class.) + */ + _wrr_loop: + cl = ifd->active_[cpri]; + ASSERT(cl != NULL); + do { + if ((deficit < 2) && (cl->bytes_alloc_ <= 0)) + cl->bytes_alloc_ += cl->w_allotment_; + if (!qempty(cl->q_)) { + if ((cl->undertime_.tv_sec == 0) || + rmc_under_limit(cl, &now)) { + if (cl->bytes_alloc_ > 0 || deficit > 1) + goto _wrr_out; + + /* underlimit but no alloc */ + deficit = 1; +#if 1 + ifd->borrowed_[ifd->qi_] = NULL; +#endif + } + else if (first == NULL && cl->borrow_ != NULL) + first = cl; /* borrowing candidate */ + } + + cl->bytes_alloc_ = 0; + cl = cl->peer_; + } while (cl != ifd->active_[cpri]); + + if (deficit == 1) { + /* first loop found an underlimit class with deficit */ + /* Loop on same priority level, with new deficit. */ + deficit = 2; + goto _wrr_loop; + } + } + +#ifdef ADJUST_CUTOFF + /* + * no underlimit class found. if cutoff is taking effect, + * increase cutoff and try again. 
+ */ + if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { + ifd->cutoff_++; + CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_); + goto _again; + } +#endif /* ADJUST_CUTOFF */ + /* + * If LINK_EFFICIENCY is turned on, then the first overlimit + * class we encounter will send a packet if all the classes + * of the link-sharing structure are overlimit. + */ + reset_cutoff(ifd); + CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_); + + if (!ifd->efficient_ || first == NULL) + return (NULL); + + cl = first; + cpri = cl->pri_; +#if 0 /* too time-consuming for nothing */ + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; +#endif + ifd->borrowed_[ifd->qi_] = cl->borrow_; + ifd->cutoff_ = cl->borrow_->depth_; + + /* + * Deque the packet and do the book keeping... + */ + _wrr_out: + if (op == ALTDQ_REMOVE) { + m = _rmc_getq(cl); + if (m == NULL) + panic("_rmc_wrr_dequeue_next"); + if (qempty(cl->q_)) + ifd->na_[cpri]--; + + /* + * Update class statistics and link data. + */ + if (cl->bytes_alloc_ > 0) + cl->bytes_alloc_ -= m_pktlen(m); + + if ((cl->bytes_alloc_ <= 0) || first == cl) + ifd->active_[cl->pri_] = cl->peer_; + else + ifd->active_[cl->pri_] = cl; + + ifd->class_[ifd->qi_] = cl; + ifd->curlen_[ifd->qi_] = m_pktlen(m); + ifd->now_[ifd->qi_] = now; + ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; + ifd->queued_++; + } else { + /* mode == ALTDQ_PPOLL */ + m = _rmc_pollq(cl); + ifd->pollcache_ = cl; + } + return (m); +} + +/* + * Dequeue & return next packet from the highest priority class that + * has a packet to send & has enough allocation to send it. This + * routine is called by a driver whenever it needs a new packet to + * output. 
+ */ +static mbuf_t * +_rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op) +{ + mbuf_t *m; + int cpri; + struct rm_class *cl, *first = NULL; + struct timeval now; + + RM_GETTIME(now); + + /* + * if the driver polls the top of the queue and then removes + * the polled packet, we must return the same packet. + */ + if (op == ALTDQ_REMOVE && ifd->pollcache_) { + cl = ifd->pollcache_; + cpri = cl->pri_; + ifd->pollcache_ = NULL; + goto _prr_out; + } else { + /* mode == ALTDQ_POLL || pollcache == NULL */ + ifd->pollcache_ = NULL; + ifd->borrowed_[ifd->qi_] = NULL; + } +#ifdef ADJUST_CUTOFF + _again: +#endif + for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { + if (ifd->na_[cpri] == 0) + continue; + cl = ifd->active_[cpri]; + ASSERT(cl != NULL); + do { + if (!qempty(cl->q_)) { + if ((cl->undertime_.tv_sec == 0) || + rmc_under_limit(cl, &now)) + goto _prr_out; + if (first == NULL && cl->borrow_ != NULL) + first = cl; + } + cl = cl->peer_; + } while (cl != ifd->active_[cpri]); + } + +#ifdef ADJUST_CUTOFF + /* + * no underlimit class found. if cutoff is taking effect, increase + * cutoff and try again. + */ + if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { + ifd->cutoff_++; + goto _again; + } +#endif /* ADJUST_CUTOFF */ + /* + * If LINK_EFFICIENCY is turned on, then the first overlimit + * class we encounter will send a packet if all the classes + * of the link-sharing structure are overlimit. + */ + reset_cutoff(ifd); + if (!ifd->efficient_ || first == NULL) + return (NULL); + + cl = first; + cpri = cl->pri_; +#if 0 /* too time-consuming for nothing */ + if (cl->sleeping_) + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; +#endif + ifd->borrowed_[ifd->qi_] = cl->borrow_; + ifd->cutoff_ = cl->borrow_->depth_; + + /* + * Deque the packet and do the book keeping... 
+ */ + _prr_out: + if (op == ALTDQ_REMOVE) { + m = _rmc_getq(cl); + if (m == NULL) + panic("_rmc_prr_dequeue_next"); + if (qempty(cl->q_)) + ifd->na_[cpri]--; + + ifd->active_[cpri] = cl->peer_; + + ifd->class_[ifd->qi_] = cl; + ifd->curlen_[ifd->qi_] = m_pktlen(m); + ifd->now_[ifd->qi_] = now; + ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; + ifd->queued_++; + } else { + /* mode == ALTDQ_POLL */ + m = _rmc_pollq(cl); + ifd->pollcache_ = cl; + } + return (m); +} + +/* + * mbuf_t * + * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function + * is invoked by the packet driver to get the next packet to be + * dequeued and output on the link. If WRR is enabled, then the + * WRR dequeue next routine will determine the next packet to sent. + * Otherwise, packet-by-packet round robin is invoked. + * + * Returns: NULL, if a packet is not available or if all + * classes are overlimit. + * + * Otherwise, Pointer to the next packet. + */ + +mbuf_t * +rmc_dequeue_next(struct rm_ifdat *ifd, int mode) +{ + if (ifd->queued_ >= ifd->maxqueued_) + return (NULL); + else if (ifd->wrr_) + return (_rmc_wrr_dequeue_next(ifd, mode)); + else + return (_rmc_prr_dequeue_next(ifd, mode)); +} + +/* + * Update the utilization estimate for the packet that just completed. + * The packet's class & the parent(s) of that class all get their + * estimators updated. This routine is called by the driver's output- + * packet-completion interrupt service routine. + */ + +/* + * a macro to approximate "divide by 1000" that gives 0.000999, + * if a value has enough effective digits. + * (on pentium, mul takes 9 cycles but div takes 46!) + */ +#define NSEC_TO_USEC(t) (((t) >> 10) + ((t) >> 16) + ((t) >> 17)) +void +rmc_update_class_util(struct rm_ifdat *ifd) +{ + int idle, avgidle, pktlen; + int pkt_time, tidle; + rm_class_t *cl, *borrowed; + rm_class_t *borrows; + struct timeval *nowp; + + /* + * Get the most recent completed class. 
+ */ + if ((cl = ifd->class_[ifd->qo_]) == NULL) + return; + + pktlen = ifd->curlen_[ifd->qo_]; + borrowed = ifd->borrowed_[ifd->qo_]; + borrows = borrowed; + + PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); + + /* + * Run estimator on class and its ancestors. + */ + /* + * rm_update_class_util is designed to be called when the + * transfer is completed from a xmit complete interrupt, + * but most drivers don't implement an upcall for that. + * so, just use estimated completion time. + * as a result, ifd->qi_ and ifd->qo_ are always synced. + */ + nowp = &ifd->now_[ifd->qo_]; + /* get pkt_time (for link) in usec */ +#if 1 /* use approximation */ + pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_; + pkt_time = NSEC_TO_USEC(pkt_time); +#else + pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000; +#endif +#if 1 /* ALTQ4PPP */ + if (TV_LT(nowp, &ifd->ifnow_)) { + int iftime; + + /* + * make sure the estimated completion time does not go + * too far. it can happen when the link layer supports + * data compression or the interface speed is set to + * a much lower value. + */ + TV_DELTA(&ifd->ifnow_, nowp, iftime); + if (iftime+pkt_time < ifd->maxiftime_) { + TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); + } else { + TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_); + } + } else { + TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); + } +#else + if (TV_LT(nowp, &ifd->ifnow_)) { + TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); + } else { + TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); + } +#endif + + while (cl != NULL) { + TV_DELTA(&ifd->ifnow_, &cl->last_, idle); + if (idle >= 2000000) + /* + * this class is idle enough, reset avgidle. + * (TV_DELTA returns 2000000 us when delta is large.) 
+ */ + cl->avgidle_ = cl->maxidle_; + + /* get pkt_time (for class) in usec */ +#if 1 /* use approximation */ + pkt_time = pktlen * cl->ns_per_byte_; + pkt_time = NSEC_TO_USEC(pkt_time); +#else + pkt_time = pktlen * cl->ns_per_byte_ / 1000; +#endif + idle -= pkt_time; + + avgidle = cl->avgidle_; + avgidle += idle - (avgidle >> RM_FILTER_GAIN); + cl->avgidle_ = avgidle; + + /* Are we overlimit ? */ + if (avgidle <= 0) { + CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle); +#if 1 /* ALTQ */ + /* + * need some lower bound for avgidle, otherwise + * a borrowing class gets unbounded penalty. + */ + if (avgidle < cl->minidle_) + avgidle = cl->avgidle_ = cl->minidle_; +#endif + /* set next idle to make avgidle 0 */ + tidle = pkt_time + + (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN); + TV_ADD_DELTA(nowp, tidle, &cl->undertime_); + ++cl->stats_.over; + } else { + cl->avgidle_ = + (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle; + cl->undertime_.tv_sec = 0; + if (cl->sleeping_) { + CALLOUT_STOP(&cl->callout_); + cl->sleeping_ = 0; + } + } + + if (borrows != NULL) { + if (borrows != cl) + ++cl->stats_.borrows; + else + borrows = NULL; + } + cl->last_ = ifd->ifnow_; + cl->last_pkttime_ = pkt_time; + +#if 1 + if (cl->parent_ == NULL) { + /* take stats of root class */ + PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); + } +#endif + + cl = cl->parent_; + } + + /* + * Check to see if cutoff needs to set to a new level. 
+ */ + cl = ifd->class_[ifd->qo_]; + if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) { +#if 1 /* ALTQ */ + if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) { + rmc_tl_satisfied(ifd, nowp); + CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); + } else { + ifd->cutoff_ = borrowed->depth_; + CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_); + } +#else /* !ALTQ */ + if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) { + reset_cutoff(ifd); +#ifdef notdef + rmc_tl_satisfied(ifd, &now); +#endif + CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); + } else { + ifd->cutoff_ = borrowed->depth_; + CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_); + } +#endif /* !ALTQ */ + } + + /* + * Release class slot + */ + ifd->borrowed_[ifd->qo_] = NULL; + ifd->class_[ifd->qo_] = NULL; + ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_; + ifd->queued_--; +} + +/* + * void + * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific) + * over-limit action routines. These get invoked by rmc_under_limit() + * if a class with packets to send if over its bandwidth limit & can't + * borrow from a parent class. 
+ * + * Returns: NONE + */ + +static void +rmc_drop_action(struct rm_class *cl) +{ + struct rm_ifdat *ifd = cl->ifdat_; + + ASSERT(qlen(cl->q_) > 0); + _rmc_dropq(cl); + if (qempty(cl->q_)) + ifd->na_[cl->pri_]--; +} + +void rmc_dropall(struct rm_class *cl) +{ + struct rm_ifdat *ifd = cl->ifdat_; + + if (!qempty(cl->q_)) { + _flushq(cl->q_); + + ifd->na_[cl->pri_]--; + } +} + +#if (__FreeBSD_version > 300000) +/* hzto() is removed from FreeBSD-3.0 */ +static int hzto(struct timeval *); + +static int +hzto(tv) + struct timeval *tv; +{ + struct timeval t2; + + getmicrotime(&t2); + t2.tv_sec = tv->tv_sec - t2.tv_sec; + t2.tv_usec = tv->tv_usec - t2.tv_usec; + return (tvtohz(&t2)); +} +#endif /* __FreeBSD_version > 300000 */ + +/* + * void + * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ + * delay action routine. It is invoked via rmc_under_limit when the + * packet is discoverd to be overlimit. + * + * If the delay action is result of borrow class being overlimit, then + * delay for the offtime of the borrowing class that is overlimit. + * + * Returns: NONE + */ + +void +rmc_delay_action(struct rm_class *cl, struct rm_class *borrow) +{ + int delay, t, extradelay; + + cl->stats_.overactions++; + TV_DELTA(&cl->undertime_, &cl->overtime_, delay); +#ifndef BORROW_OFFTIME + delay += cl->offtime_; +#endif + + if (!cl->sleeping_) { + CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle); +#ifdef BORROW_OFFTIME + if (borrow != NULL) + extradelay = borrow->offtime_; + else +#endif + extradelay = cl->offtime_; + +#ifdef ALTQ + /* + * XXX recalculate suspend time: + * current undertime is (tidle + pkt_time) calculated + * from the last transmission. 
+ * tidle: time required to bring avgidle back to 0 + * pkt_time: target waiting time for this class + * we need to replace pkt_time by offtime + */ + extradelay -= cl->last_pkttime_; +#endif + if (extradelay > 0) { + TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_); + delay += extradelay; + } + + cl->sleeping_ = 1; + cl->stats_.delays++; + + /* + * Since packets are phased randomly with respect to the + * clock, 1 tick (the next clock tick) can be an arbitrarily + * short time so we have to wait for at least two ticks. + * NOTE: If there's no other traffic, we need the timer as + * a 'backstop' to restart this class. + */ + if (delay > tick * 2) { + /* FreeBSD rounds up the tick */ + t = hzto(&cl->undertime_); + } else + t = 2; + CALLOUT_RESET(&cl->callout_, t, + (timeout_t *)rmc_restart, (caddr_t)cl); + } +} + +/* + * void + * rmc_restart() - is just a helper routine for rmc_delay_action -- it is + * called by the system timer code & is responsible checking if the + * class is still sleeping (it might have been restarted as a side + * effect of the queue scan on a packet arrival) and, if so, restarting + * output for the class. Inspecting the class state & restarting output + * require locking the class structure. In general the driver is + * responsible for locking but this is the only routine that is not + * called directly or indirectly from the interface driver so it has + * know about system locking conventions. Under bsd, locking is done + * by raising IPL to splimp so that's what's implemented here. On a + * different system this would probably need to be changed. 
+ * + * Returns: NONE + */ + +static void +rmc_restart(struct rm_class *cl) +{ + struct rm_ifdat *ifd = cl->ifdat_; + int s; + + s = splnet(); + IFQ_LOCK(ifd->ifq_); + if (cl->sleeping_) { + cl->sleeping_ = 0; + cl->undertime_.tv_sec = 0; + + if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) { + CBQTRACE(rmc_restart, 'trts', cl->stats_.handle); + (ifd->restart)(ifd->ifq_); + } + } + IFQ_UNLOCK(ifd->ifq_); + splx(s); +} + +/* + * void + * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit + * handling routine for the root class of the link sharing structure. + * + * Returns: NONE + */ + +static void +rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow) +{ + panic("rmc_root_overlimit"); +} + +/* + * Packet Queue handling routines. Eventually, this is to localize the + * effects on the code whether queues are red queues or droptail + * queues. + */ + +static int +_rmc_addq(rm_class_t *cl, mbuf_t *m) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + return red_addq(cl->red_, cl->q_, m, cl->pktattr_); +#endif /* ALTQ_RED */ +#ifdef ALTQ_CODEL + if (q_is_codel(cl->q_)) + return codel_addq(cl->codel_, cl->q_, m); +#endif + + if (cl->flags_ & RMCF_CLEARDSCP) + write_dsfield(m, cl->pktattr_, 0); + + _addq(cl->q_, m); + return (0); +} + +/* note: _rmc_dropq is not called for red */ +static void +_rmc_dropq(rm_class_t *cl) +{ + mbuf_t *m; + + if ((m = _getq(cl->q_)) != NULL) + m_freem(m); +} + +static mbuf_t * +_rmc_getq(rm_class_t *cl) +{ +#ifdef ALTQ_RIO + if (q_is_rio(cl->q_)) + return rio_getq((rio_t *)cl->red_, cl->q_); +#endif +#ifdef ALTQ_RED + if (q_is_red(cl->q_)) + return red_getq(cl->red_, cl->q_); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->q_)) + return codel_getq(cl->codel_, cl->q_); +#endif + return _getq(cl->q_); +} + +static mbuf_t * +_rmc_pollq(rm_class_t *cl) +{ + return qhead(cl->q_); +} + +#ifdef CBQ_TRACE 
+ +struct cbqtrace cbqtrace_buffer[NCBQTRACE+1]; +struct cbqtrace *cbqtrace_ptr = NULL; +int cbqtrace_count; + +/* + * DDB hook to trace cbq events: + * the last 1024 events are held in a circular buffer. + * use "call cbqtrace_dump(N)" to display 20 events from Nth event. + */ +void cbqtrace_dump(int); +static char *rmc_funcname(void *); + +static struct rmc_funcs { + void *func; + char *name; +} rmc_funcs[] = +{ + rmc_init, "rmc_init", + rmc_queue_packet, "rmc_queue_packet", + rmc_under_limit, "rmc_under_limit", + rmc_update_class_util, "rmc_update_class_util", + rmc_delay_action, "rmc_delay_action", + rmc_restart, "rmc_restart", + _rmc_wrr_dequeue_next, "_rmc_wrr_dequeue_next", + NULL, NULL +}; + +static char *rmc_funcname(void *func) +{ + struct rmc_funcs *fp; + + for (fp = rmc_funcs; fp->func != NULL; fp++) + if (fp->func == func) + return (fp->name); + return ("unknown"); +} + +void cbqtrace_dump(int counter) +{ + int i, *p; + char *cp; + + counter = counter % NCBQTRACE; + p = (int *)&cbqtrace_buffer[counter]; + + for (i=0; i<20; i++) { + printf("[0x%x] ", *p++); + printf("%s: ", rmc_funcname((void *)*p++)); + cp = (char *)p++; + printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]); + printf("%d\n",*p++); + + if (p >= (int *)&cbqtrace_buffer[NCBQTRACE]) + p = (int *)cbqtrace_buffer; + } +} +#endif /* CBQ_TRACE */ +#endif /* ALTQ_CBQ */ + +#if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || \ + defined(ALTQ_HFSC) || defined(ALTQ_PRIQ) || defined(ALTQ_CODEL) +#if !defined(__GNUC__) || defined(ALTQ_DEBUG) + +void +_addq(class_queue_t *q, mbuf_t *m) +{ + mbuf_t *m0; + + if ((m0 = qtail(q)) != NULL) + m->m_nextpkt = m0->m_nextpkt; + else + m0 = m; + m0->m_nextpkt = m; + qtail(q) = m; + qlen(q)++; +} + +mbuf_t * +_getq(class_queue_t *q) +{ + mbuf_t *m, *m0; + + if ((m = qtail(q)) == NULL) + return (NULL); + if ((m0 = m->m_nextpkt) != m) + m->m_nextpkt = m0->m_nextpkt; + else { + ASSERT(qlen(q) == 1); + qtail(q) = NULL; + } + qlen(q)--; + m0->m_nextpkt = 
NULL; + return (m0); +} + +/* drop a packet at the tail of the queue */ +mbuf_t * +_getq_tail(class_queue_t *q) +{ + mbuf_t *m, *m0, *prev; + + if ((m = m0 = qtail(q)) == NULL) + return NULL; + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) { + ASSERT(qlen(q) == 1); + qtail(q) = NULL; + } else + qtail(q) = prev; + qlen(q)--; + m->m_nextpkt = NULL; + return (m); +} + +/* randomly select a packet in the queue */ +mbuf_t * +_getq_random(class_queue_t *q) +{ + struct mbuf *m; + int i, n; + + if ((m = qtail(q)) == NULL) + return NULL; + if (m->m_nextpkt == m) { + ASSERT(qlen(q) == 1); + qtail(q) = NULL; + } else { + struct mbuf *prev = NULL; + + n = arc4random() % qlen(q) + 1; + for (i = 0; i < n; i++) { + prev = m; + m = m->m_nextpkt; + } + prev->m_nextpkt = m->m_nextpkt; + if (m == qtail(q)) + qtail(q) = prev; + } + qlen(q)--; + m->m_nextpkt = NULL; + return (m); +} + +void +_removeq(class_queue_t *q, mbuf_t *m) +{ + mbuf_t *m0, *prev; + + m0 = qtail(q); + do { + prev = m0; + m0 = m0->m_nextpkt; + } while (m0 != m); + prev->m_nextpkt = m->m_nextpkt; + if (prev == m) + qtail(q) = NULL; + else if (qtail(q) == m) + qtail(q) = prev; + qlen(q)--; +} + +void +_flushq(class_queue_t *q) +{ + mbuf_t *m; + + while ((m = _getq(q)) != NULL) + m_freem(m); + ASSERT(qlen(q) == 0); +} + +#endif /* !__GNUC__ || ALTQ_DEBUG */ +#endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */ diff --git a/freebsd/sys/net/altq/altq_rmclass.h b/freebsd/sys/net/altq/altq_rmclass.h new file mode 100644 index 00000000..6130c4ff --- /dev/null +++ b/freebsd/sys/net/altq/altq_rmclass.h @@ -0,0 +1,273 @@ +/*- + * Copyright (c) 1991-1997 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the Network Research + * Group at Lawrence Berkeley Laboratory. + * 4. Neither the name of the University nor of the Laboratory may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $KAME: altq_rmclass.h,v 1.10 2003/08/20 23:30:23 itojun Exp $ + * $FreeBSD$ + */ + +#ifndef _ALTQ_ALTQ_RMCLASS_H_ +#define _ALTQ_ALTQ_RMCLASS_H_ + +#include <net/altq/altq_classq.h> + +/* #pragma ident "@(#)rm_class.h 1.20 97/10/23 SMI" */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define RM_MAXPRIO 8 /* Max priority */ + +#ifdef _KERNEL + +typedef struct mbuf mbuf_t; +typedef struct rm_ifdat rm_ifdat_t; +typedef struct rm_class rm_class_t; + +struct red; + +/* + * Macros for dealing with time values. We assume all times are + * 'timevals'. `microtime' is used to get the best available clock + * resolution. If `microtime' *doesn't* return a value that's about + * ten times smaller than the average packet time on the fastest + * link that will use these routines, a slightly different clock + * scheme than this one should be used. + * (Bias due to truncation error in this scheme will overestimate utilization + * and discriminate against high bandwidth classes. To remove this bias an + * integrator needs to be added. The simplest integrator uses a history of + * 10 * avg.packet.time / min.tick.time packet completion entries. This is + * straight forward to add but we don't want to pay the extra memory + * traffic to maintain it if it's not necessary (occasionally a vendor + * accidentally builds a workstation with a decent clock - e.g., Sun & HP).) 
+ */ + +#define RM_GETTIME(now) microtime(&now) + +#define TV_LT(a, b) (((a)->tv_sec < (b)->tv_sec) || \ + (((a)->tv_usec < (b)->tv_usec) && ((a)->tv_sec <= (b)->tv_sec))) + +#define TV_DELTA(a, b, delta) { \ + register int xxs; \ + \ + delta = (a)->tv_usec - (b)->tv_usec; \ + if ((xxs = (a)->tv_sec - (b)->tv_sec)) { \ + switch (xxs) { \ + default: \ + /* if (xxs < 0) \ + printf("rm_class: bogus time values\n"); */ \ + delta = 0; \ + /* fall through */ \ + case 2: \ + delta += 1000000; \ + /* fall through */ \ + case 1: \ + delta += 1000000; \ + break; \ + } \ + } \ +} + +#define TV_ADD_DELTA(a, delta, res) { \ + register int xxus = (a)->tv_usec + (delta); \ + \ + (res)->tv_sec = (a)->tv_sec; \ + while (xxus >= 1000000) { \ + ++((res)->tv_sec); \ + xxus -= 1000000; \ + } \ + (res)->tv_usec = xxus; \ +} + +#define RM_TIMEOUT 2 /* 1 Clock tick. */ + +#if 1 +#define RM_MAXQUEUED 1 /* this isn't used in ALTQ/CBQ */ +#else +#define RM_MAXQUEUED 16 /* Max number of packets downstream of CBQ */ +#endif +#define RM_MAXQUEUE 64 /* Max queue length */ +#define RM_FILTER_GAIN 5 /* log2 of gain, e.g., 5 => 31/32 */ +#define RM_POWER (1 << RM_FILTER_GAIN) +#define RM_MAXDEPTH 32 +#define RM_NS_PER_SEC (1000000000) + +typedef struct _rm_class_stats_ { + u_int handle; + u_int depth; + + struct pktcntr xmit_cnt; /* packets sent in this class */ + struct pktcntr drop_cnt; /* dropped packets */ + u_int over; /* # times went over limit */ + u_int borrows; /* # times tried to borrow */ + u_int overactions; /* # times invoked overlimit action */ + u_int delays; /* # times invoked delay actions */ +} rm_class_stats_t; + +/* + * CBQ Class state structure + */ +struct rm_class { + class_queue_t *q_; /* Queue of packets */ + rm_ifdat_t *ifdat_; + int pri_; /* Class priority. */ + int depth_; /* Class depth */ + u_int ns_per_byte_; /* NanoSeconds per byte. */ + u_int maxrate_; /* Bytes per second for this class. */ + u_int allotment_; /* Fraction of link bandwidth. 
*/ + u_int w_allotment_; /* Weighted allotment for WRR */ + int bytes_alloc_; /* Allocation for round of WRR */ + + int avgidle_; + int maxidle_; + int minidle_; + int offtime_; + int sleeping_; /* != 0 if delaying */ + int qthresh_; /* Queue threshold for formal link sharing */ + int leaf_; /* Note whether leaf class or not.*/ + + rm_class_t *children_; /* Children of this class */ + rm_class_t *next_; /* Next pointer, used if child */ + + rm_class_t *peer_; /* Peer class */ + rm_class_t *borrow_; /* Borrow class */ + rm_class_t *parent_; /* Parent class */ + + void (*overlimit)(struct rm_class *, struct rm_class *); + void (*drop)(struct rm_class *); /* Class drop action. */ + + union { + struct red *red_; /* RED state pointer */ + struct codel *codel_; /* codel state pointer */ + } cl_aqm_; +#define red_ cl_aqm_.red_ +#define codel_ cl_aqm_.codel_ + struct altq_pktattr *pktattr_; /* saved hdr used by RED/ECN */ + int flags_; + + int last_pkttime_; /* saved pkt_time */ + struct timeval undertime_; /* time can next send */ + struct timeval last_; /* time last packet sent */ + struct timeval overtime_; + struct callout callout_; /* for timeout() calls */ + + rm_class_stats_t stats_; /* Class Statistics */ +}; + +/* + * CBQ Interface state + */ +struct rm_ifdat { + int queued_; /* # pkts queued downstream */ + int efficient_; /* Link Efficiency bit */ + int wrr_; /* Enable Weighted Round-Robin */ + u_long ns_per_byte_; /* Link byte speed. */ + int maxqueued_; /* Max packets to queue */ + int maxpkt_; /* Max packet size. */ + int qi_; /* In/out pointers for downstream */ + int qo_; /* packets */ + + /* + * Active class state and WRR state. + */ + rm_class_t *active_[RM_MAXPRIO]; /* Active cl's in each pri */ + int na_[RM_MAXPRIO]; /* # of active cl's in a pri */ + int num_[RM_MAXPRIO]; /* # of cl's per pri */ + int alloc_[RM_MAXPRIO]; /* Byte Allocation */ + u_long M_[RM_MAXPRIO]; /* WRR weights. */ + + /* + * Network Interface/Solaris Queue state pointer. 
+ */ + struct ifaltq *ifq_; + rm_class_t *default_; /* Default Pkt class, BE */ + rm_class_t *root_; /* Root Link class. */ + rm_class_t *ctl_; /* Control Traffic class. */ + void (*restart)(struct ifaltq *); /* Restart routine. */ + + /* + * Current packet downstream packet state and dynamic state. + */ + rm_class_t *borrowed_[RM_MAXQUEUED]; /* Class borrowed last */ + rm_class_t *class_[RM_MAXQUEUED]; /* class sending */ + int curlen_[RM_MAXQUEUED]; /* Current pktlen */ + struct timeval now_[RM_MAXQUEUED]; /* Current packet time. */ + int is_overlimit_[RM_MAXQUEUED];/* Current packet time. */ + + int cutoff_; /* Cut-off depth for borrowing */ + + struct timeval ifnow_; /* expected xmit completion time */ +#if 1 /* ALTQ4PPP */ + int maxiftime_; /* max delay inside interface */ +#endif + rm_class_t *pollcache_; /* cached rm_class by poll operation */ +}; + +/* flags for rmc_init and rmc_newclass */ +/* class flags */ +#define RMCF_RED 0x0001 +#define RMCF_ECN 0x0002 +#define RMCF_RIO 0x0004 +#define RMCF_FLOWVALVE 0x0008 /* use flowvalve (aka penalty-box) */ +#define RMCF_CLEARDSCP 0x0010 /* clear diffserv codepoint */ +#define RMCF_CODEL 0x0020 + +/* flags for rmc_init */ +#define RMCF_WRR 0x0100 +#define RMCF_EFFICIENT 0x0200 + +#define is_a_parent_class(cl) ((cl)->children_ != NULL) + +extern rm_class_t *rmc_newclass(int, struct rm_ifdat *, u_int, + void (*)(struct rm_class *, struct rm_class *), + int, struct rm_class *, struct rm_class *, + u_int, int, u_int, int, int); +extern void rmc_delete_class(struct rm_ifdat *, struct rm_class *); +extern int rmc_modclass(struct rm_class *, u_int, int, + u_int, int, u_int, int); +extern void rmc_init(struct ifaltq *, struct rm_ifdat *, u_int, + void (*)(struct ifaltq *), + int, int, u_int, int, u_int, int); +extern int rmc_queue_packet(struct rm_class *, mbuf_t *); +extern mbuf_t *rmc_dequeue_next(struct rm_ifdat *, int); +extern void rmc_update_class_util(struct rm_ifdat *); +extern void rmc_delay_action(struct 
rm_class *, struct rm_class *); +extern void rmc_dropall(struct rm_class *); +extern int rmc_get_weight(struct rm_ifdat *, int); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_RMCLASS_H_ */ diff --git a/freebsd/sys/net/altq/altq_rmclass_debug.h b/freebsd/sys/net/altq/altq_rmclass_debug.h new file mode 100644 index 00000000..7adbaec4 --- /dev/null +++ b/freebsd/sys/net/altq/altq_rmclass_debug.h @@ -0,0 +1,113 @@ +/*- + * Copyright (c) Sun Microsystems, Inc. 1998 All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the SMCC Technology + * Development Group at Sun Microsystems, Inc. + * + * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or + * promote products derived from this software without specific prior + * written permission. + * + * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE + * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is + * provided "as is" without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this software. 
+ * + * $KAME: altq_rmclass_debug.h,v 1.3 2002/11/29 04:36:24 kjc Exp $ + * $FreeBSD$ + */ + +#ifndef _ALTQ_ALTQ_RMCLASS_DEBUG_H_ +#define _ALTQ_ALTQ_RMCLASS_DEBUG_H_ + +/* #pragma ident "@(#)rm_class_debug.h 1.7 98/05/04 SMI" */ + +/* + * Cbq debugging macros + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef CBQ_TRACE +#ifndef NCBQTRACE +#define NCBQTRACE (16 * 1024) +#endif + +/* + * To view the trace output, using adb, type: + * adb -k /dev/ksyms /dev/mem <cr>, then type + * cbqtrace_count/D to get the count, then type + * cbqtrace_buffer,0tcount/Dp4C" "Xn + * This will dump the trace buffer from 0 to count. + */ +/* + * in ALTQ, "call cbqtrace_dump(N)" from DDB to display 20 events + * from Nth event in the circular buffer. + */ + +struct cbqtrace { + int count; + int function; /* address of function */ + int trace_action; /* descriptive 4 characters */ + int object; /* object operated on */ +}; + +extern struct cbqtrace cbqtrace_buffer[]; +extern struct cbqtrace *cbqtrace_ptr; +extern int cbqtrace_count; + +#define CBQTRACEINIT() { \ + if (cbqtrace_ptr == NULL) \ + cbqtrace_ptr = cbqtrace_buffer; \ + else { \ + cbqtrace_ptr = cbqtrace_buffer; \ + bzero((void *)cbqtrace_ptr, sizeof(cbqtrace_buffer)); \ + cbqtrace_count = 0; \ + } \ +} + +#define LOCK_TRACE() splimp() +#define UNLOCK_TRACE(x) splx(x) + +#define CBQTRACE(func, act, obj) { \ + int __s = LOCK_TRACE(); \ + int *_p = &cbqtrace_ptr->count; \ + *_p++ = ++cbqtrace_count; \ + *_p++ = (int)(func); \ + *_p++ = (int)(act); \ + *_p++ = (int)(obj); \ + if ((struct cbqtrace *)(void *)_p >= &cbqtrace_buffer[NCBQTRACE])\ + cbqtrace_ptr = cbqtrace_buffer; \ + else \ + cbqtrace_ptr = (struct cbqtrace *)(void *)_p; \ + UNLOCK_TRACE(__s); \ + } +#else + +/* If no tracing, define no-ops */ +#define CBQTRACEINIT() +#define CBQTRACE(a, b, c) + +#endif /* !CBQ_TRACE */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ALTQ_ALTQ_RMCLASS_DEBUG_H_ */ diff --git a/freebsd/sys/net/altq/altq_subr.c 
b/freebsd/sys/net/altq/altq_subr.c new file mode 100644 index 00000000..66ff441d --- /dev/null +++ b/freebsd/sys/net/altq/altq_subr.c @@ -0,0 +1,1978 @@ +#include <machine/rtems-bsd-kernel-space.h> + +/*- + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $KAME: altq_subr.c,v 1.21 2003/11/06 06:32:53 kjc Exp $ + * $FreeBSD$ + */ + +#include <rtems/bsd/local/opt_altq.h> +#include <rtems/bsd/local/opt_inet.h> +#include <rtems/bsd/local/opt_inet6.h> + +#include <rtems/bsd/sys/param.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/kernel.h> +#include <rtems/bsd/sys/errno.h> +#include <sys/syslog.h> +#include <sys/sysctl.h> +#include <sys/queue.h> + +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_dl.h> +#include <net/if_types.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/tcp.h> +#include <netinet/udp.h> + +#include <netpfil/pf/pf.h> +#include <netpfil/pf/pf_altq.h> +#include <net/altq/altq.h> +#ifdef ALTQ3_COMPAT +#include <net/altq/altq_conf.h> +#endif + +/* machine dependent clock related includes */ +#include <sys/bus.h> +#include <sys/cpu.h> +#include <sys/eventhandler.h> +#include <machine/clock.h> +#if defined(__amd64__) || defined(__i386__) +#include <machine/cpufunc.h> /* for pentium tsc */ +#include <machine/specialreg.h> /* for CPUID_TSC */ +#include <machine/md_var.h> /* for cpu_feature */ +#endif /* __amd64 || __i386__ */ + +/* + * internal function prototypes + */ +static void tbr_timeout(void *); +int (*altq_input)(struct mbuf *, int) = NULL; +static struct mbuf *tbr_dequeue(struct ifaltq *, int); +static int tbr_timer = 0; /* token bucket regulator timer */ +#if !defined(__FreeBSD__) || (__FreeBSD_version < 600000) +static struct callout tbr_callout = CALLOUT_INITIALIZER; +#else +static struct callout tbr_callout; +#endif + +#ifdef ALTQ3_CLFIER_COMPAT +static int extract_ports4(struct mbuf *, struct ip *, struct flowinfo_in *); +#ifdef INET6 +static int extract_ports6(struct mbuf *, struct ip6_hdr *, + struct flowinfo_in6 *); +#endif 
+static int apply_filter4(u_int32_t, struct flow_filter *, + struct flowinfo_in *); +static int apply_ppfilter4(u_int32_t, struct flow_filter *, + struct flowinfo_in *); +#ifdef INET6 +static int apply_filter6(u_int32_t, struct flow_filter6 *, + struct flowinfo_in6 *); +#endif +static int apply_tosfilter4(u_int32_t, struct flow_filter *, + struct flowinfo_in *); +static u_long get_filt_handle(struct acc_classifier *, int); +static struct acc_filter *filth_to_filtp(struct acc_classifier *, u_long); +static u_int32_t filt2fibmask(struct flow_filter *); + +static void ip4f_cache(struct ip *, struct flowinfo_in *); +static int ip4f_lookup(struct ip *, struct flowinfo_in *); +static int ip4f_init(void); +static struct ip4_frag *ip4f_alloc(void); +static void ip4f_free(struct ip4_frag *); +#endif /* ALTQ3_CLFIER_COMPAT */ + +/* + * alternate queueing support routines + */ + +/* look up the queue state by the interface name and the queueing type. */ +void * +altq_lookup(name, type) + char *name; + int type; +{ + struct ifnet *ifp; + + if ((ifp = ifunit(name)) != NULL) { + /* read if_snd unlocked */ + if (type != ALTQT_NONE && ifp->if_snd.altq_type == type) + return (ifp->if_snd.altq_disc); + } + + return NULL; +} + +int +altq_attach(ifq, type, discipline, enqueue, dequeue, request, clfier, classify) + struct ifaltq *ifq; + int type; + void *discipline; + int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *); + struct mbuf *(*dequeue)(struct ifaltq *, int); + int (*request)(struct ifaltq *, int, void *); + void *clfier; + void *(*classify)(void *, struct mbuf *, int); +{ + IFQ_LOCK(ifq); + if (!ALTQ_IS_READY(ifq)) { + IFQ_UNLOCK(ifq); + return ENXIO; + } + +#ifdef ALTQ3_COMPAT + /* + * pfaltq can override the existing discipline, but altq3 cannot. + * check these if clfier is not NULL (which implies altq3). 
+ */ + if (clfier != NULL) { + if (ALTQ_IS_ENABLED(ifq)) { + IFQ_UNLOCK(ifq); + return EBUSY; + } + if (ALTQ_IS_ATTACHED(ifq)) { + IFQ_UNLOCK(ifq); + return EEXIST; + } + } +#endif + ifq->altq_type = type; + ifq->altq_disc = discipline; + ifq->altq_enqueue = enqueue; + ifq->altq_dequeue = dequeue; + ifq->altq_request = request; + ifq->altq_clfier = clfier; + ifq->altq_classify = classify; + ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED); +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_KLD + altq_module_incref(type); +#endif +#endif + IFQ_UNLOCK(ifq); + return 0; +} + +int +altq_detach(ifq) + struct ifaltq *ifq; +{ + IFQ_LOCK(ifq); + + if (!ALTQ_IS_READY(ifq)) { + IFQ_UNLOCK(ifq); + return ENXIO; + } + if (ALTQ_IS_ENABLED(ifq)) { + IFQ_UNLOCK(ifq); + return EBUSY; + } + if (!ALTQ_IS_ATTACHED(ifq)) { + IFQ_UNLOCK(ifq); + return (0); + } +#ifdef ALTQ3_COMPAT +#ifdef ALTQ_KLD + altq_module_declref(ifq->altq_type); +#endif +#endif + + ifq->altq_type = ALTQT_NONE; + ifq->altq_disc = NULL; + ifq->altq_enqueue = NULL; + ifq->altq_dequeue = NULL; + ifq->altq_request = NULL; + ifq->altq_clfier = NULL; + ifq->altq_classify = NULL; + ifq->altq_flags &= ALTQF_CANTCHANGE; + + IFQ_UNLOCK(ifq); + return 0; +} + +int +altq_enable(ifq) + struct ifaltq *ifq; +{ + int s; + + IFQ_LOCK(ifq); + + if (!ALTQ_IS_READY(ifq)) { + IFQ_UNLOCK(ifq); + return ENXIO; + } + if (ALTQ_IS_ENABLED(ifq)) { + IFQ_UNLOCK(ifq); + return 0; + } + + s = splnet(); + IFQ_PURGE_NOLOCK(ifq); + ASSERT(ifq->ifq_len == 0); + ifq->ifq_drv_maxlen = 0; /* disable bulk dequeue */ + ifq->altq_flags |= ALTQF_ENABLED; + if (ifq->altq_clfier != NULL) + ifq->altq_flags |= ALTQF_CLASSIFY; + splx(s); + + IFQ_UNLOCK(ifq); + return 0; +} + +int +altq_disable(ifq) + struct ifaltq *ifq; +{ + int s; + + IFQ_LOCK(ifq); + if (!ALTQ_IS_ENABLED(ifq)) { + IFQ_UNLOCK(ifq); + return 0; + } + + s = splnet(); + IFQ_PURGE_NOLOCK(ifq); + ASSERT(ifq->ifq_len == 0); + ifq->altq_flags &= ~(ALTQF_ENABLED|ALTQF_CLASSIFY); + splx(s); + + IFQ_UNLOCK(ifq); + 
return 0; +} + +#ifdef ALTQ_DEBUG +void +altq_assert(file, line, failedexpr) + const char *file, *failedexpr; + int line; +{ + (void)printf("altq assertion \"%s\" failed: file \"%s\", line %d\n", + failedexpr, file, line); + panic("altq assertion"); + /* NOTREACHED */ +} +#endif + +/* + * internal representation of token bucket parameters + * rate: byte_per_unittime << 32 + * (((bits_per_sec) / 8) << 32) / machclk_freq + * depth: byte << 32 + * + */ +#define TBR_SHIFT 32 +#define TBR_SCALE(x) ((int64_t)(x) << TBR_SHIFT) +#define TBR_UNSCALE(x) ((x) >> TBR_SHIFT) + +static struct mbuf * +tbr_dequeue(ifq, op) + struct ifaltq *ifq; + int op; +{ + struct tb_regulator *tbr; + struct mbuf *m; + int64_t interval; + u_int64_t now; + + IFQ_LOCK_ASSERT(ifq); + tbr = ifq->altq_tbr; + if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) { + /* if this is a remove after poll, bypass tbr check */ + } else { + /* update token only when it is negative */ + if (tbr->tbr_token <= 0) { + now = read_machclk(); + interval = now - tbr->tbr_last; + if (interval >= tbr->tbr_filluptime) + tbr->tbr_token = tbr->tbr_depth; + else { + tbr->tbr_token += interval * tbr->tbr_rate; + if (tbr->tbr_token > tbr->tbr_depth) + tbr->tbr_token = tbr->tbr_depth; + } + tbr->tbr_last = now; + } + /* if token is still negative, don't allow dequeue */ + if (tbr->tbr_token <= 0) + return (NULL); + } + + if (ALTQ_IS_ENABLED(ifq)) + m = (*ifq->altq_dequeue)(ifq, op); + else { + if (op == ALTDQ_POLL) + _IF_POLL(ifq, m); + else + _IF_DEQUEUE(ifq, m); + } + + if (m != NULL && op == ALTDQ_REMOVE) + tbr->tbr_token -= TBR_SCALE(m_pktlen(m)); + tbr->tbr_lastop = op; + return (m); +} + +/* + * set a token bucket regulator. + * if the specified rate is zero, the token bucket regulator is deleted. 
+ */ +int +tbr_set(ifq, profile) + struct ifaltq *ifq; + struct tb_profile *profile; +{ + struct tb_regulator *tbr, *otbr; + + if (tbr_dequeue_ptr == NULL) + tbr_dequeue_ptr = tbr_dequeue; + + if (machclk_freq == 0) + init_machclk(); + if (machclk_freq == 0) { + printf("tbr_set: no cpu clock available!\n"); + return (ENXIO); + } + + IFQ_LOCK(ifq); + if (profile->rate == 0) { + /* delete this tbr */ + if ((tbr = ifq->altq_tbr) == NULL) { + IFQ_UNLOCK(ifq); + return (ENOENT); + } + ifq->altq_tbr = NULL; + free(tbr, M_DEVBUF); + IFQ_UNLOCK(ifq); + return (0); + } + + tbr = malloc(sizeof(struct tb_regulator), M_DEVBUF, M_NOWAIT | M_ZERO); + if (tbr == NULL) { + IFQ_UNLOCK(ifq); + return (ENOMEM); + } + + tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq; + tbr->tbr_depth = TBR_SCALE(profile->depth); + if (tbr->tbr_rate > 0) + tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate; + else + tbr->tbr_filluptime = 0xffffffffffffffffLL; + tbr->tbr_token = tbr->tbr_depth; + tbr->tbr_last = read_machclk(); + tbr->tbr_lastop = ALTDQ_REMOVE; + + otbr = ifq->altq_tbr; + ifq->altq_tbr = tbr; /* set the new tbr */ + + if (otbr != NULL) + free(otbr, M_DEVBUF); + else { + if (tbr_timer == 0) { + CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); + tbr_timer = 1; + } + } + IFQ_UNLOCK(ifq); + return (0); +} + +/* + * tbr_timeout goes through the interface list, and kicks the drivers + * if necessary. 
+ * + * MPSAFE + */ +static void +tbr_timeout(arg) + void *arg; +{ + VNET_ITERATOR_DECL(vnet_iter); + struct ifnet *ifp; + int active, s; + + active = 0; + s = splnet(); + IFNET_RLOCK_NOSLEEP(); + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + for (ifp = TAILQ_FIRST(&V_ifnet); ifp; + ifp = TAILQ_NEXT(ifp, if_list)) { + /* read from if_snd unlocked */ + if (!TBR_IS_ENABLED(&ifp->if_snd)) + continue; + active++; + if (!IFQ_IS_EMPTY(&ifp->if_snd) && + ifp->if_start != NULL) + (*ifp->if_start)(ifp); + } + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); + IFNET_RUNLOCK_NOSLEEP(); + splx(s); + if (active > 0) + CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); + else + tbr_timer = 0; /* don't need tbr_timer anymore */ +} + +/* + * get token bucket regulator profile + */ +int +tbr_get(ifq, profile) + struct ifaltq *ifq; + struct tb_profile *profile; +{ + struct tb_regulator *tbr; + + IFQ_LOCK(ifq); + if ((tbr = ifq->altq_tbr) == NULL) { + profile->rate = 0; + profile->depth = 0; + } else { + profile->rate = + (u_int)TBR_UNSCALE(tbr->tbr_rate * 8 * machclk_freq); + profile->depth = (u_int)TBR_UNSCALE(tbr->tbr_depth); + } + IFQ_UNLOCK(ifq); + return (0); +} + +/* + * attach a discipline to the interface. if one already exists, it is + * overridden. + * Locking is done in the discipline specific attach functions. Basically + * they call back to altq_attach which takes care of the attach and locking. 
+ */ +int +altq_pfattach(struct pf_altq *a) +{ + int error = 0; + + switch (a->scheduler) { + case ALTQT_NONE: + break; +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_pfattach(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_pfattach(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_pfattach(a); + break; +#endif +#ifdef ALTQ_FAIRQ + case ALTQT_FAIRQ: + error = fairq_pfattach(a); + break; +#endif +#ifdef ALTQ_CODEL + case ALTQT_CODEL: + error = codel_pfattach(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * detach a discipline from the interface. + * it is possible that the discipline was already overridden by another + * discipline. + */ +int +altq_pfdetach(struct pf_altq *a) +{ + struct ifnet *ifp; + int s, error = 0; + + if ((ifp = ifunit(a->ifname)) == NULL) + return (EINVAL); + + /* if this discipline is no longer referenced, just return */ + /* read unlocked from if_snd */ + if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc) + return (0); + + s = splnet(); + /* read unlocked from if_snd, _disable and _detach take care */ + if (ALTQ_IS_ENABLED(&ifp->if_snd)) + error = altq_disable(&ifp->if_snd); + if (error == 0) + error = altq_detach(&ifp->if_snd); + splx(s); + + return (error); +} + +/* + * add a discipline or a queue + * Locking is done in the discipline specific functions with regards to + * malloc with WAITOK, also it is not yet clear which lock to use. 
+ */ +int +altq_add(struct pf_altq *a) +{ + int error = 0; + + if (a->qname[0] != 0) + return (altq_add_queue(a)); + + if (machclk_freq == 0) + init_machclk(); + if (machclk_freq == 0) + panic("altq_add: no cpu clock"); + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_add_altq(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_add_altq(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_add_altq(a); + break; +#endif +#ifdef ALTQ_FAIRQ + case ALTQT_FAIRQ: + error = fairq_add_altq(a); + break; +#endif +#ifdef ALTQ_CODEL + case ALTQT_CODEL: + error = codel_add_altq(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * remove a discipline or a queue + * It is yet unclear what lock to use to protect this operation, the + * discipline specific functions will determine and grab it + */ +int +altq_remove(struct pf_altq *a) +{ + int error = 0; + + if (a->qname[0] != 0) + return (altq_remove_queue(a)); + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_remove_altq(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_remove_altq(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_remove_altq(a); + break; +#endif +#ifdef ALTQ_FAIRQ + case ALTQT_FAIRQ: + error = fairq_remove_altq(a); + break; +#endif +#ifdef ALTQ_CODEL + case ALTQT_CODEL: + error = codel_remove_altq(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * add a queue to the discipline + * It is yet unclear what lock to use to protect this operation, the + * discipline specific functions will determine and grab it + */ +int +altq_add_queue(struct pf_altq *a) +{ + int error = 0; + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_add_queue(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_add_queue(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = 
hfsc_add_queue(a); + break; +#endif +#ifdef ALTQ_FAIRQ + case ALTQT_FAIRQ: + error = fairq_add_queue(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * remove a queue from the discipline + * It is yet unclear what lock to use to protect this operation, the + * discipline specific functions will determine and grab it + */ +int +altq_remove_queue(struct pf_altq *a) +{ + int error = 0; + + switch (a->scheduler) { +#ifdef ALTQ_CBQ + case ALTQT_CBQ: + error = cbq_remove_queue(a); + break; +#endif +#ifdef ALTQ_PRIQ + case ALTQT_PRIQ: + error = priq_remove_queue(a); + break; +#endif +#ifdef ALTQ_HFSC + case ALTQT_HFSC: + error = hfsc_remove_queue(a); + break; +#endif +#ifdef ALTQ_FAIRQ + case ALTQT_FAIRQ: + error = fairq_remove_queue(a); + break; +#endif + default: + error = ENXIO; + } + + return (error); +} + +/* + * get queue statistics + * Locking is done in the discipline specific functions with regards to + * copyout operations, also it is not yet clear which lock to use. 
 */
int
altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes)
{
	int error = 0;

	switch (a->scheduler) {
#ifdef ALTQ_CBQ
	case ALTQT_CBQ:
		error = cbq_getqstats(a, ubuf, nbytes);
		break;
#endif
#ifdef ALTQ_PRIQ
	case ALTQT_PRIQ:
		error = priq_getqstats(a, ubuf, nbytes);
		break;
#endif
#ifdef ALTQ_HFSC
	case ALTQT_HFSC:
		error = hfsc_getqstats(a, ubuf, nbytes);
		break;
#endif
#ifdef ALTQ_FAIRQ
	case ALTQT_FAIRQ:
		error = fairq_getqstats(a, ubuf, nbytes);
		break;
#endif
#ifdef ALTQ_CODEL
	case ALTQT_CODEL:
		error = codel_getqstats(a, ubuf, nbytes);
		break;
#endif
	default:
		error = ENXIO;
	}

	return (error);
}

/*
 * read and write diffserv field in IPv4 or IPv6 header
 */
/*
 * Return the DS/TOS byte of the packet whose header is cached in
 * pktattr, or 0 when the address family is unsupported, the cached
 * header pointer is stale (no longer inside the mbuf chain), or the
 * IP version does not match the family.
 */
u_int8_t
read_dsfield(m, pktattr)
	struct mbuf *m;
	struct altq_pktattr *pktattr;
{
	struct mbuf *m0;
	u_int8_t ds_field = 0;

	if (pktattr == NULL ||
	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
		return ((u_int8_t)0);

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if ((pktattr->pattr_hdr >= m0->m_data) &&
		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, pattr_hdr is stale */
		pktattr->pattr_af = AF_UNSPEC;
#ifdef ALTQ_DEBUG
		printf("read_dsfield: can't locate header!\n");
#endif
		return ((u_int8_t)0);
	}

	if (pktattr->pattr_af == AF_INET) {
		struct ip *ip = (struct ip *)pktattr->pattr_hdr;

		if (ip->ip_v != 4)
			return ((u_int8_t)0);	/* version mismatch! */
		ds_field = ip->ip_tos;
	}
#ifdef INET6
	else if (pktattr->pattr_af == AF_INET6) {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		u_int32_t flowlabel;

		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel >> 28) != 6)
			return ((u_int8_t)0);	/* version mismatch! */
		/* traffic class occupies bits 20-27 of the v6 flow word */
		ds_field = (flowlabel >> 20) & 0xff;
	}
#endif
	return (ds_field);
}

/*
 * Overwrite the DS field of the packet cached in pktattr, preserving
 * the two low (CU) bits, and incrementally update the IPv4 header
 * checksum per RFC 1624.  Silently returns on a stale header pointer
 * or version mismatch.
 */
void
write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, u_int8_t dsfield)
{
	struct mbuf *m0;

	if (pktattr == NULL ||
	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
		return;

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if ((pktattr->pattr_hdr >= m0->m_data) &&
		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, pattr_hdr is stale */
		pktattr->pattr_af = AF_UNSPEC;
#ifdef ALTQ_DEBUG
		printf("write_dsfield: can't locate header!\n");
#endif
		return;
	}

	if (pktattr->pattr_af == AF_INET) {
		struct ip *ip = (struct ip *)pktattr->pattr_hdr;
		u_int8_t old;
		int32_t sum;

		if (ip->ip_v != 4)
			return;		/* version mismatch! */
		old = ip->ip_tos;
		dsfield |= old & 3;	/* leave CU bits */
		if (old == dsfield)
			return;
		ip->ip_tos = dsfield;
		/*
		 * update checksum (from RFC1624)
		 *	   HC' = ~(~HC + ~m + m')
		 */
		sum = ~ntohs(ip->ip_sum) & 0xffff;
		sum += 0xff00 + (~old & 0xff) + dsfield;
		sum = (sum >> 16) + (sum & 0xffff);
		sum += (sum >> 16);  /* add carry */

		ip->ip_sum = htons(~sum & 0xffff);
	}
#ifdef INET6
	else if (pktattr->pattr_af == AF_INET6) {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		u_int32_t flowlabel;

		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel >> 28) != 6)
			return;		/* version mismatch! */
		/* replace bits 20-27 (traffic class) only */
		flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20);
		ip6->ip6_flow = htonl(flowlabel);
	}
#endif
	return;
}


/*
 * high resolution clock support taking advantage of a machine dependent
 * high resolution time counter (e.g., timestamp counter of intel pentium).
 *
 * we assume
 *  - 64-bit-long monotonically-increasing counter
 *  - frequency range is 100M-4GHz (CPU speed)
 */
/* if pcc is not available or disabled, emulate 256MHz using microtime() */
#define	MACHCLK_SHIFT	8

int machclk_usepcc;		/* nonzero: read the CPU cycle counter */
u_int32_t machclk_freq;		/* machine clock frequency in Hz */
u_int32_t machclk_per_tick;	/* machine clock ticks per hz tick */

#if defined(__i386__) && defined(__NetBSD__)
extern u_int64_t cpu_tsc_freq;
#endif

#if (__FreeBSD_version >= 700035)
/* Update TSC freq with the value indicated by the caller. */
static void
tsc_freq_changed(void *arg, const struct cf_level *level, int status)
{
	/* If there was an error during the transition, don't do anything. */
	if (status != 0)
		return;

#if (__FreeBSD_version >= 701102) && (defined(__amd64__) || defined(__i386__))
	/* If TSC is P-state invariant, don't do anything. */
	if (tsc_is_invariant)
		return;
#endif

	/* Total setting for this level gives the new frequency in MHz. */
	init_machclk();
}
EVENTHANDLER_DEFINE(cpufreq_post_change, tsc_freq_changed, NULL,
    EVENTHANDLER_PRI_LAST);
#endif /* __FreeBSD_version >= 700035 */

/*
 * One-time setup: decide whether the CPU cycle counter may be used
 * (it is avoided on SMP and when the TSC is missing or unusable).
 */
static void
init_machclk_setup(void)
{
#if (__FreeBSD_version >= 600000)
	callout_init(&tbr_callout, 0);
#endif

	machclk_usepcc = 1;

#if (!defined(__amd64__) && !defined(__i386__)) || defined(ALTQ_NOPCC)
	machclk_usepcc = 0;
#endif
#if defined(__FreeBSD__) && defined(SMP)
	machclk_usepcc = 0;
#endif
#if defined(__NetBSD__) && defined(MULTIPROCESSOR)
	machclk_usepcc = 0;
#endif
#if defined(__amd64__) || defined(__i386__)
	/* check if TSC is available */
	if ((cpu_feature & CPUID_TSC) == 0 ||
	    atomic_load_acq_64(&tsc_freq) == 0)
		machclk_usepcc = 0;
#endif
}

/*
 * Determine machclk_freq and machclk_per_tick.  Falls back to a
 * 256MHz clock emulated via microtime(); if the TSC frequency is not
 * known it is measured against a one-second tsleep().  NOTE(review):
 * may sleep in the measurement path — callers apparently invoke this
 * from process context; confirm before adding new call sites.
 */
void
init_machclk(void)
{
	static int called;

	/* Call one-time initialization function. */
	if (!called) {
		init_machclk_setup();
		called = 1;
	}

	if (machclk_usepcc == 0) {
		/* emulate 256MHz using microtime() */
		machclk_freq = 1000000 << MACHCLK_SHIFT;
		machclk_per_tick = machclk_freq / hz;
#ifdef ALTQ_DEBUG
		printf("altq: emulate %uHz cpu clock\n", machclk_freq);
#endif
		return;
	}

	/*
	 * if the clock frequency (of Pentium TSC or Alpha PCC) is
	 * accessible, just use it.
	 */
#if defined(__amd64__) || defined(__i386__)
	machclk_freq = atomic_load_acq_64(&tsc_freq);
#endif

	/*
	 * if we don't know the clock frequency, measure it.
	 */
	if (machclk_freq == 0) {
		static int wait;
		struct timeval tv_start, tv_end;
		u_int64_t start, end, diff;
		int timo;

		microtime(&tv_start);
		start = read_machclk();
		timo = hz;	/* 1 sec */
		(void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo);
		microtime(&tv_end);
		end = read_machclk();
		diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000
		    + tv_end.tv_usec - tv_start.tv_usec;
		if (diff != 0)
			machclk_freq = (u_int)((end - start) * 1000000 / diff);
	}

	machclk_per_tick = machclk_freq / hz;

#ifdef ALTQ_DEBUG
	printf("altq: CPU clock: %uHz\n", machclk_freq);
#endif
}

#if defined(__OpenBSD__) && defined(__i386__)
/* raw RDTSC (opcode 0f 31) for OpenBSD/i386 which lacks the intrinsic */
static __inline u_int64_t
rdtsc(void)
{
	u_int64_t rv;
	__asm __volatile(".byte 0x0f, 0x31" : "=A" (rv));
	return (rv);
}
#endif /* __OpenBSD__ && __i386__ */

/*
 * Read the machine clock: the CPU cycle counter when permitted,
 * otherwise microseconds since boot scaled by MACHCLK_SHIFT (the
 * emulated 256MHz clock).
 */
u_int64_t
read_machclk(void)
{
	u_int64_t val;

	if (machclk_usepcc) {
#if defined(__amd64__) || defined(__i386__)
		val = rdtsc();
#else
		panic("read_machclk");
#endif
	} else {
		struct timeval tv, boottime;

		microtime(&tv);
		getboottime(&boottime);
		val = (((u_int64_t)(tv.tv_sec - boottime.tv_sec) * 1000000
		    + tv.tv_usec) << MACHCLK_SHIFT);
	}
	return (val);
}

#ifdef ALTQ3_CLFIER_COMPAT

#ifndef IPPROTO_ESP
#define	IPPROTO_ESP	50		/* encapsulating security payload */
#endif
#ifndef IPPROTO_AH
#define	IPPROTO_AH	51		/* authentication header */
+#endif + +/* + * extract flow information from a given packet. + * filt_mask shows flowinfo fields required. + * we assume the ip header is in one mbuf, and addresses and ports are + * in network byte order. + */ +int +altq_extractflow(m, af, flow, filt_bmask) + struct mbuf *m; + int af; + struct flowinfo *flow; + u_int32_t filt_bmask; +{ + + switch (af) { + case PF_INET: { + struct flowinfo_in *fin; + struct ip *ip; + + ip = mtod(m, struct ip *); + + if (ip->ip_v != 4) + break; + + fin = (struct flowinfo_in *)flow; + fin->fi_len = sizeof(struct flowinfo_in); + fin->fi_family = AF_INET; + + fin->fi_proto = ip->ip_p; + fin->fi_tos = ip->ip_tos; + + fin->fi_src.s_addr = ip->ip_src.s_addr; + fin->fi_dst.s_addr = ip->ip_dst.s_addr; + + if (filt_bmask & FIMB4_PORTS) + /* if port info is required, extract port numbers */ + extract_ports4(m, ip, fin); + else { + fin->fi_sport = 0; + fin->fi_dport = 0; + fin->fi_gpi = 0; + } + return (1); + } + +#ifdef INET6 + case PF_INET6: { + struct flowinfo_in6 *fin6; + struct ip6_hdr *ip6; + + ip6 = mtod(m, struct ip6_hdr *); + /* should we check the ip version? */ + + fin6 = (struct flowinfo_in6 *)flow; + fin6->fi6_len = sizeof(struct flowinfo_in6); + fin6->fi6_family = AF_INET6; + + fin6->fi6_proto = ip6->ip6_nxt; + fin6->fi6_tclass = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + + fin6->fi6_flowlabel = ip6->ip6_flow & htonl(0x000fffff); + fin6->fi6_src = ip6->ip6_src; + fin6->fi6_dst = ip6->ip6_dst; + + if ((filt_bmask & FIMB6_PORTS) || + ((filt_bmask & FIMB6_PROTO) + && ip6->ip6_nxt > IPPROTO_IPV6)) + /* + * if port info is required, or proto is required + * but there are option headers, extract port + * and protocol numbers. 
+ */ + extract_ports6(m, ip6, fin6); + else { + fin6->fi6_sport = 0; + fin6->fi6_dport = 0; + fin6->fi6_gpi = 0; + } + return (1); + } +#endif /* INET6 */ + + default: + break; + } + + /* failed */ + flow->fi_len = sizeof(struct flowinfo); + flow->fi_family = AF_UNSPEC; + return (0); +} + +/* + * helper routine to extract port numbers + */ +/* structure for ipsec and ipv6 option header template */ +struct _opt6 { + u_int8_t opt6_nxt; /* next header */ + u_int8_t opt6_hlen; /* header extension length */ + u_int16_t _pad; + u_int32_t ah_spi; /* security parameter index + for authentication header */ +}; + +/* + * extract port numbers from a ipv4 packet. + */ +static int +extract_ports4(m, ip, fin) + struct mbuf *m; + struct ip *ip; + struct flowinfo_in *fin; +{ + struct mbuf *m0; + u_short ip_off; + u_int8_t proto; + int off; + + fin->fi_sport = 0; + fin->fi_dport = 0; + fin->fi_gpi = 0; + + ip_off = ntohs(ip->ip_off); + /* if it is a fragment, try cached fragment info */ + if (ip_off & IP_OFFMASK) { + ip4f_lookup(ip, fin); + return (1); + } + + /* locate the mbuf containing the protocol header */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)ip >= m0->m_data) && + ((caddr_t)ip < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { +#ifdef ALTQ_DEBUG + printf("extract_ports4: can't locate header! ip=%p\n", ip); +#endif + return (0); + } + off = ((caddr_t)ip - m0->m_data) + (ip->ip_hl << 2); + proto = ip->ip_p; + +#ifdef ALTQ_IPSEC + again: +#endif + while (off >= m0->m_len) { + off -= m0->m_len; + m0 = m0->m_next; + if (m0 == NULL) + return (0); /* bogus ip_hl! 
*/ + } + if (m0->m_len < off + 4) + return (0); + + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: { + struct udphdr *udp; + + udp = (struct udphdr *)(mtod(m0, caddr_t) + off); + fin->fi_sport = udp->uh_sport; + fin->fi_dport = udp->uh_dport; + fin->fi_proto = proto; + } + break; + +#ifdef ALTQ_IPSEC + case IPPROTO_ESP: + if (fin->fi_gpi == 0){ + u_int32_t *gpi; + + gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); + fin->fi_gpi = *gpi; + } + fin->fi_proto = proto; + break; + + case IPPROTO_AH: { + /* get next header and header length */ + struct _opt6 *opt6; + + opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); + proto = opt6->opt6_nxt; + off += 8 + (opt6->opt6_hlen * 4); + if (fin->fi_gpi == 0 && m0->m_len >= off + 8) + fin->fi_gpi = opt6->ah_spi; + } + /* goto the next header */ + goto again; +#endif /* ALTQ_IPSEC */ + + default: + fin->fi_proto = proto; + return (0); + } + + /* if this is a first fragment, cache it. */ + if (ip_off & IP_MF) + ip4f_cache(ip, fin); + + return (1); +} + +#ifdef INET6 +static int +extract_ports6(m, ip6, fin6) + struct mbuf *m; + struct ip6_hdr *ip6; + struct flowinfo_in6 *fin6; +{ + struct mbuf *m0; + int off; + u_int8_t proto; + + fin6->fi6_gpi = 0; + fin6->fi6_sport = 0; + fin6->fi6_dport = 0; + + /* locate the mbuf containing the protocol header */ + for (m0 = m; m0 != NULL; m0 = m0->m_next) + if (((caddr_t)ip6 >= m0->m_data) && + ((caddr_t)ip6 < m0->m_data + m0->m_len)) + break; + if (m0 == NULL) { +#ifdef ALTQ_DEBUG + printf("extract_ports6: can't locate header! 
ip6=%p\n", ip6); +#endif + return (0); + } + off = ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr); + + proto = ip6->ip6_nxt; + do { + while (off >= m0->m_len) { + off -= m0->m_len; + m0 = m0->m_next; + if (m0 == NULL) + return (0); + } + if (m0->m_len < off + 4) + return (0); + + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: { + struct udphdr *udp; + + udp = (struct udphdr *)(mtod(m0, caddr_t) + off); + fin6->fi6_sport = udp->uh_sport; + fin6->fi6_dport = udp->uh_dport; + fin6->fi6_proto = proto; + } + return (1); + + case IPPROTO_ESP: + if (fin6->fi6_gpi == 0) { + u_int32_t *gpi; + + gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); + fin6->fi6_gpi = *gpi; + } + fin6->fi6_proto = proto; + return (1); + + case IPPROTO_AH: { + /* get next header and header length */ + struct _opt6 *opt6; + + opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); + if (fin6->fi6_gpi == 0 && m0->m_len >= off + 8) + fin6->fi6_gpi = opt6->ah_spi; + proto = opt6->opt6_nxt; + off += 8 + (opt6->opt6_hlen * 4); + /* goto the next header */ + break; + } + + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: { + /* get next header and header length */ + struct _opt6 *opt6; + + opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); + proto = opt6->opt6_nxt; + off += (opt6->opt6_hlen + 1) * 8; + /* goto the next header */ + break; + } + + case IPPROTO_FRAGMENT: + /* ipv6 fragmentations are not supported yet */ + default: + fin6->fi6_proto = proto; + return (0); + } + } while (1); + /*NOTREACHED*/ +} +#endif /* INET6 */ + +/* + * altq common classifier + */ +int +acc_add_filter(classifier, filter, class, phandle) + struct acc_classifier *classifier; + struct flow_filter *filter; + void *class; + u_long *phandle; +{ + struct acc_filter *afp, *prev, *tmp; + int i, s; + +#ifdef INET6 + if (filter->ff_flow.fi_family != AF_INET && + filter->ff_flow.fi_family != AF_INET6) + return (EINVAL); +#else + if (filter->ff_flow.fi_family != AF_INET) + return (EINVAL); +#endif + + afp = 
malloc(sizeof(struct acc_filter), + M_DEVBUF, M_WAITOK); + if (afp == NULL) + return (ENOMEM); + bzero(afp, sizeof(struct acc_filter)); + + afp->f_filter = *filter; + afp->f_class = class; + + i = ACC_WILDCARD_INDEX; + if (filter->ff_flow.fi_family == AF_INET) { + struct flow_filter *filter4 = &afp->f_filter; + + /* + * if address is 0, it's a wildcard. if address mask + * isn't set, use full mask. + */ + if (filter4->ff_flow.fi_dst.s_addr == 0) + filter4->ff_mask.mask_dst.s_addr = 0; + else if (filter4->ff_mask.mask_dst.s_addr == 0) + filter4->ff_mask.mask_dst.s_addr = 0xffffffff; + if (filter4->ff_flow.fi_src.s_addr == 0) + filter4->ff_mask.mask_src.s_addr = 0; + else if (filter4->ff_mask.mask_src.s_addr == 0) + filter4->ff_mask.mask_src.s_addr = 0xffffffff; + + /* clear extra bits in addresses */ + filter4->ff_flow.fi_dst.s_addr &= + filter4->ff_mask.mask_dst.s_addr; + filter4->ff_flow.fi_src.s_addr &= + filter4->ff_mask.mask_src.s_addr; + + /* + * if dst address is a wildcard, use hash-entry + * ACC_WILDCARD_INDEX. 
+ */ + if (filter4->ff_mask.mask_dst.s_addr != 0xffffffff) + i = ACC_WILDCARD_INDEX; + else + i = ACC_GET_HASH_INDEX(filter4->ff_flow.fi_dst.s_addr); + } +#ifdef INET6 + else if (filter->ff_flow.fi_family == AF_INET6) { + struct flow_filter6 *filter6 = + (struct flow_filter6 *)&afp->f_filter; +#ifndef IN6MASK0 /* taken from kame ipv6 */ +#define IN6MASK0 {{{ 0, 0, 0, 0 }}} +#define IN6MASK128 {{{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }}} + const struct in6_addr in6mask0 = IN6MASK0; + const struct in6_addr in6mask128 = IN6MASK128; +#endif + + if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_dst)) + filter6->ff_mask6.mask6_dst = in6mask0; + else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_dst)) + filter6->ff_mask6.mask6_dst = in6mask128; + if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_src)) + filter6->ff_mask6.mask6_src = in6mask0; + else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_src)) + filter6->ff_mask6.mask6_src = in6mask128; + + /* clear extra bits in addresses */ + for (i = 0; i < 16; i++) + filter6->ff_flow6.fi6_dst.s6_addr[i] &= + filter6->ff_mask6.mask6_dst.s6_addr[i]; + for (i = 0; i < 16; i++) + filter6->ff_flow6.fi6_src.s6_addr[i] &= + filter6->ff_mask6.mask6_src.s6_addr[i]; + + if (filter6->ff_flow6.fi6_flowlabel == 0) + i = ACC_WILDCARD_INDEX; + else + i = ACC_GET_HASH_INDEX(filter6->ff_flow6.fi6_flowlabel); + } +#endif /* INET6 */ + + afp->f_handle = get_filt_handle(classifier, i); + + /* update filter bitmask */ + afp->f_fbmask = filt2fibmask(filter); + classifier->acc_fbmask |= afp->f_fbmask; + + /* + * add this filter to the filter list. + * filters are ordered from the highest rule number. 
+ */ + s = splnet(); + prev = NULL; + LIST_FOREACH(tmp, &classifier->acc_filters[i], f_chain) { + if (tmp->f_filter.ff_ruleno > afp->f_filter.ff_ruleno) + prev = tmp; + else + break; + } + if (prev == NULL) + LIST_INSERT_HEAD(&classifier->acc_filters[i], afp, f_chain); + else + LIST_INSERT_AFTER(prev, afp, f_chain); + splx(s); + + *phandle = afp->f_handle; + return (0); +} + +int +acc_delete_filter(classifier, handle) + struct acc_classifier *classifier; + u_long handle; +{ + struct acc_filter *afp; + int s; + + if ((afp = filth_to_filtp(classifier, handle)) == NULL) + return (EINVAL); + + s = splnet(); + LIST_REMOVE(afp, f_chain); + splx(s); + + free(afp, M_DEVBUF); + + /* todo: update filt_bmask */ + + return (0); +} + +/* + * delete filters referencing to the specified class. + * if the all flag is not 0, delete all the filters. + */ +int +acc_discard_filters(classifier, class, all) + struct acc_classifier *classifier; + void *class; + int all; +{ + struct acc_filter *afp; + int i, s; + + s = splnet(); + for (i = 0; i < ACC_FILTER_TABLESIZE; i++) { + do { + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if (all || afp->f_class == class) { + LIST_REMOVE(afp, f_chain); + free(afp, M_DEVBUF); + /* start again from the head */ + break; + } + } while (afp != NULL); + } + splx(s); + + if (all) + classifier->acc_fbmask = 0; + + return (0); +} + +void * +acc_classify(clfier, m, af) + void *clfier; + struct mbuf *m; + int af; +{ + struct acc_classifier *classifier; + struct flowinfo flow; + struct acc_filter *afp; + int i; + + classifier = (struct acc_classifier *)clfier; + altq_extractflow(m, af, &flow, classifier->acc_fbmask); + + if (flow.fi_family == AF_INET) { + struct flowinfo_in *fp = (struct flowinfo_in *)&flow; + + if ((classifier->acc_fbmask & FIMB4_ALL) == FIMB4_TOS) { + /* only tos is used */ + LIST_FOREACH(afp, + &classifier->acc_filters[ACC_WILDCARD_INDEX], + f_chain) + if (apply_tosfilter4(afp->f_fbmask, + &afp->f_filter, fp)) + /* filter 
matched */ + return (afp->f_class); + } else if ((classifier->acc_fbmask & + (~(FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) & FIMB4_ALL)) + == 0) { + /* only proto and ports are used */ + LIST_FOREACH(afp, + &classifier->acc_filters[ACC_WILDCARD_INDEX], + f_chain) + if (apply_ppfilter4(afp->f_fbmask, + &afp->f_filter, fp)) + /* filter matched */ + return (afp->f_class); + } else { + /* get the filter hash entry from its dest address */ + i = ACC_GET_HASH_INDEX(fp->fi_dst.s_addr); + do { + /* + * go through this loop twice. first for dst + * hash, second for wildcards. + */ + LIST_FOREACH(afp, &classifier->acc_filters[i], + f_chain) + if (apply_filter4(afp->f_fbmask, + &afp->f_filter, fp)) + /* filter matched */ + return (afp->f_class); + + /* + * check again for filters with a dst addr + * wildcard. + * (daddr == 0 || dmask != 0xffffffff). + */ + if (i != ACC_WILDCARD_INDEX) + i = ACC_WILDCARD_INDEX; + else + break; + } while (1); + } + } +#ifdef INET6 + else if (flow.fi_family == AF_INET6) { + struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)&flow; + + /* get the filter hash entry from its flow ID */ + if (fp6->fi6_flowlabel != 0) + i = ACC_GET_HASH_INDEX(fp6->fi6_flowlabel); + else + /* flowlable can be zero */ + i = ACC_WILDCARD_INDEX; + + /* go through this loop twice. first for flow hash, second + for wildcards. */ + do { + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if (apply_filter6(afp->f_fbmask, + (struct flow_filter6 *)&afp->f_filter, + fp6)) + /* filter matched */ + return (afp->f_class); + + /* + * check again for filters with a wildcard. 
+ */ + if (i != ACC_WILDCARD_INDEX) + i = ACC_WILDCARD_INDEX; + else + break; + } while (1); + } +#endif /* INET6 */ + + /* no filter matched */ + return (NULL); +} + +static int +apply_filter4(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter *filt; + struct flowinfo_in *pkt; +{ + if (filt->ff_flow.fi_family != AF_INET) + return (0); + if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) + return (0); + if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) + return (0); + if ((fbmask & FIMB4_DADDR) && + filt->ff_flow.fi_dst.s_addr != + (pkt->fi_dst.s_addr & filt->ff_mask.mask_dst.s_addr)) + return (0); + if ((fbmask & FIMB4_SADDR) && + filt->ff_flow.fi_src.s_addr != + (pkt->fi_src.s_addr & filt->ff_mask.mask_src.s_addr)) + return (0); + if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) + return (0); + if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != + (pkt->fi_tos & filt->ff_mask.mask_tos)) + return (0); + if ((fbmask & FIMB4_GPI) && filt->ff_flow.fi_gpi != (pkt->fi_gpi)) + return (0); + /* match */ + return (1); +} + +/* + * filter matching function optimized for a common case that checks + * only protocol and port numbers + */ +static int +apply_ppfilter4(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter *filt; + struct flowinfo_in *pkt; +{ + if (filt->ff_flow.fi_family != AF_INET) + return (0); + if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) + return (0); + if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) + return (0); + if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) + return (0); + /* match */ + return (1); +} + +/* + * filter matching function only for tos field. 
+ */ +static int +apply_tosfilter4(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter *filt; + struct flowinfo_in *pkt; +{ + if (filt->ff_flow.fi_family != AF_INET) + return (0); + if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != + (pkt->fi_tos & filt->ff_mask.mask_tos)) + return (0); + /* match */ + return (1); +} + +#ifdef INET6 +static int +apply_filter6(fbmask, filt, pkt) + u_int32_t fbmask; + struct flow_filter6 *filt; + struct flowinfo_in6 *pkt; +{ + int i; + + if (filt->ff_flow6.fi6_family != AF_INET6) + return (0); + if ((fbmask & FIMB6_FLABEL) && + filt->ff_flow6.fi6_flowlabel != pkt->fi6_flowlabel) + return (0); + if ((fbmask & FIMB6_PROTO) && + filt->ff_flow6.fi6_proto != pkt->fi6_proto) + return (0); + if ((fbmask & FIMB6_SPORT) && + filt->ff_flow6.fi6_sport != pkt->fi6_sport) + return (0); + if ((fbmask & FIMB6_DPORT) && + filt->ff_flow6.fi6_dport != pkt->fi6_dport) + return (0); + if (fbmask & FIMB6_SADDR) { + for (i = 0; i < 4; i++) + if (filt->ff_flow6.fi6_src.s6_addr32[i] != + (pkt->fi6_src.s6_addr32[i] & + filt->ff_mask6.mask6_src.s6_addr32[i])) + return (0); + } + if (fbmask & FIMB6_DADDR) { + for (i = 0; i < 4; i++) + if (filt->ff_flow6.fi6_dst.s6_addr32[i] != + (pkt->fi6_dst.s6_addr32[i] & + filt->ff_mask6.mask6_dst.s6_addr32[i])) + return (0); + } + if ((fbmask & FIMB6_TCLASS) && + filt->ff_flow6.fi6_tclass != + (pkt->fi6_tclass & filt->ff_mask6.mask6_tclass)) + return (0); + if ((fbmask & FIMB6_GPI) && + filt->ff_flow6.fi6_gpi != pkt->fi6_gpi) + return (0); + /* match */ + return (1); +} +#endif /* INET6 */ + +/* + * filter handle: + * bit 20-28: index to the filter hash table + * bit 0-19: unique id in the hash bucket. 
+ */ +static u_long +get_filt_handle(classifier, i) + struct acc_classifier *classifier; + int i; +{ + static u_long handle_number = 1; + u_long handle; + struct acc_filter *afp; + + while (1) { + handle = handle_number++ & 0x000fffff; + + if (LIST_EMPTY(&classifier->acc_filters[i])) + break; + + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if ((afp->f_handle & 0x000fffff) == handle) + break; + if (afp == NULL) + break; + /* this handle is already used, try again */ + } + + return ((i << 20) | handle); +} + +/* convert filter handle to filter pointer */ +static struct acc_filter * +filth_to_filtp(classifier, handle) + struct acc_classifier *classifier; + u_long handle; +{ + struct acc_filter *afp; + int i; + + i = ACC_GET_HINDEX(handle); + + LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) + if (afp->f_handle == handle) + return (afp); + + return (NULL); +} + +/* create flowinfo bitmask */ +static u_int32_t +filt2fibmask(filt) + struct flow_filter *filt; +{ + u_int32_t mask = 0; +#ifdef INET6 + struct flow_filter6 *filt6; +#endif + + switch (filt->ff_flow.fi_family) { + case AF_INET: + if (filt->ff_flow.fi_proto != 0) + mask |= FIMB4_PROTO; + if (filt->ff_flow.fi_tos != 0) + mask |= FIMB4_TOS; + if (filt->ff_flow.fi_dst.s_addr != 0) + mask |= FIMB4_DADDR; + if (filt->ff_flow.fi_src.s_addr != 0) + mask |= FIMB4_SADDR; + if (filt->ff_flow.fi_sport != 0) + mask |= FIMB4_SPORT; + if (filt->ff_flow.fi_dport != 0) + mask |= FIMB4_DPORT; + if (filt->ff_flow.fi_gpi != 0) + mask |= FIMB4_GPI; + break; +#ifdef INET6 + case AF_INET6: + filt6 = (struct flow_filter6 *)filt; + + if (filt6->ff_flow6.fi6_proto != 0) + mask |= FIMB6_PROTO; + if (filt6->ff_flow6.fi6_tclass != 0) + mask |= FIMB6_TCLASS; + if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_dst)) + mask |= FIMB6_DADDR; + if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_src)) + mask |= FIMB6_SADDR; + if (filt6->ff_flow6.fi6_sport != 0) + mask |= FIMB6_SPORT; + if (filt6->ff_flow6.fi6_dport != 0) + 
mask |= FIMB6_DPORT; + if (filt6->ff_flow6.fi6_gpi != 0) + mask |= FIMB6_GPI; + if (filt6->ff_flow6.fi6_flowlabel != 0) + mask |= FIMB6_FLABEL; + break; +#endif /* INET6 */ + } + return (mask); +} + + +/* + * helper functions to handle IPv4 fragments. + * currently only in-sequence fragments are handled. + * - fragment info is cached in a LRU list. + * - when a first fragment is found, cache its flow info. + * - when a non-first fragment is found, lookup the cache. + */ + +struct ip4_frag { + TAILQ_ENTRY(ip4_frag) ip4f_chain; + char ip4f_valid; + u_short ip4f_id; + struct flowinfo_in ip4f_info; +}; + +static TAILQ_HEAD(ip4f_list, ip4_frag) ip4f_list; /* IPv4 fragment cache */ + +#define IP4F_TABSIZE 16 /* IPv4 fragment cache size */ + + +static void +ip4f_cache(ip, fin) + struct ip *ip; + struct flowinfo_in *fin; +{ + struct ip4_frag *fp; + + if (TAILQ_EMPTY(&ip4f_list)) { + /* first time call, allocate fragment cache entries. */ + if (ip4f_init() < 0) + /* allocation failed! */ + return; + } + + fp = ip4f_alloc(); + fp->ip4f_id = ip->ip_id; + fp->ip4f_info.fi_proto = ip->ip_p; + fp->ip4f_info.fi_src.s_addr = ip->ip_src.s_addr; + fp->ip4f_info.fi_dst.s_addr = ip->ip_dst.s_addr; + + /* save port numbers */ + fp->ip4f_info.fi_sport = fin->fi_sport; + fp->ip4f_info.fi_dport = fin->fi_dport; + fp->ip4f_info.fi_gpi = fin->fi_gpi; +} + +static int +ip4f_lookup(ip, fin) + struct ip *ip; + struct flowinfo_in *fin; +{ + struct ip4_frag *fp; + + for (fp = TAILQ_FIRST(&ip4f_list); fp != NULL && fp->ip4f_valid; + fp = TAILQ_NEXT(fp, ip4f_chain)) + if (ip->ip_id == fp->ip4f_id && + ip->ip_src.s_addr == fp->ip4f_info.fi_src.s_addr && + ip->ip_dst.s_addr == fp->ip4f_info.fi_dst.s_addr && + ip->ip_p == fp->ip4f_info.fi_proto) { + + /* found the matching entry */ + fin->fi_sport = fp->ip4f_info.fi_sport; + fin->fi_dport = fp->ip4f_info.fi_dport; + fin->fi_gpi = fp->ip4f_info.fi_gpi; + + if ((ntohs(ip->ip_off) & IP_MF) == 0) + /* this is the last fragment, + release the entry. 
*/ + ip4f_free(fp); + + return (1); + } + + /* no matching entry found */ + return (0); +} + +static int +ip4f_init(void) +{ + struct ip4_frag *fp; + int i; + + TAILQ_INIT(&ip4f_list); + for (i=0; i<IP4F_TABSIZE; i++) { + fp = malloc(sizeof(struct ip4_frag), + M_DEVBUF, M_NOWAIT); + if (fp == NULL) { + printf("ip4f_init: can't alloc %dth entry!\n", i); + if (i == 0) + return (-1); + return (0); + } + fp->ip4f_valid = 0; + TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain); + } + return (0); +} + +static struct ip4_frag * +ip4f_alloc(void) +{ + struct ip4_frag *fp; + + /* reclaim an entry at the tail, put it at the head */ + fp = TAILQ_LAST(&ip4f_list, ip4f_list); + TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain); + fp->ip4f_valid = 1; + TAILQ_INSERT_HEAD(&ip4f_list, fp, ip4f_chain); + return (fp); +} + +static void +ip4f_free(fp) + struct ip4_frag *fp; +{ + TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain); + fp->ip4f_valid = 0; + TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain); +} + +#endif /* ALTQ3_CLFIER_COMPAT */ diff --git a/freebsd/sys/net/altq/altq_var.h b/freebsd/sys/net/altq/altq_var.h new file mode 100644 index 00000000..2ddcb211 --- /dev/null +++ b/freebsd/sys/net/altq/altq_var.h @@ -0,0 +1,243 @@ +/*- + * Copyright (C) 1998-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $KAME: altq_var.h,v 1.16 2003/10/03 05:05:15 kjc Exp $ + * $FreeBSD$ + */ +#ifndef _ALTQ_ALTQ_VAR_H_ +#define _ALTQ_ALTQ_VAR_H_ + +#ifdef _KERNEL + +#include <rtems/bsd/sys/param.h> +#include <sys/kernel.h> +#include <sys/queue.h> + +#ifdef ALTQ3_CLFIER_COMPAT +/* + * filter structure for altq common classifier + */ +struct acc_filter { + LIST_ENTRY(acc_filter) f_chain; + void *f_class; /* pointer to the class */ + u_long f_handle; /* filter id */ + u_int32_t f_fbmask; /* filter bitmask */ + struct flow_filter f_filter; /* filter value */ +}; + +/* + * XXX ACC_FILTER_TABLESIZE can't be larger than 2048 unless we fix + * the handle assignment. 
+ */ +#define ACC_FILTER_TABLESIZE (256+1) +#define ACC_FILTER_MASK (ACC_FILTER_TABLESIZE - 2) +#define ACC_WILDCARD_INDEX (ACC_FILTER_TABLESIZE - 1) +#ifdef __GNUC__ +#define ACC_GET_HASH_INDEX(addr) \ + ({int x = (addr) + ((addr) >> 16); (x + (x >> 8)) & ACC_FILTER_MASK;}) +#else +#define ACC_GET_HASH_INDEX(addr) \ + (((addr) + ((addr) >> 8) + ((addr) >> 16) + ((addr) >> 24)) \ + & ACC_FILTER_MASK) +#endif +#define ACC_GET_HINDEX(handle) ((handle) >> 20) + +#if (__FreeBSD_version > 500000) +#define ACC_LOCK_INIT(ac) mtx_init(&(ac)->acc_mtx, "classifier", MTX_DEF) +#define ACC_LOCK_DESTROY(ac) mtx_destroy(&(ac)->acc_mtx) +#define ACC_LOCK(ac) mtx_lock(&(ac)->acc_mtx) +#define ACC_UNLOCK(ac) mtx_unlock(&(ac)->acc_mtx) +#else +#define ACC_LOCK_INIT(ac) +#define ACC_LOCK_DESTROY(ac) +#define ACC_LOCK(ac) +#define ACC_UNLOCK(ac) +#endif + +struct acc_classifier { + u_int32_t acc_fbmask; + LIST_HEAD(filt, acc_filter) acc_filters[ACC_FILTER_TABLESIZE]; + +#if (__FreeBSD_version > 500000) + struct mtx acc_mtx; +#endif +}; + +/* + * flowinfo mask bits used by classifier + */ +/* for ipv4 */ +#define FIMB4_PROTO 0x0001 +#define FIMB4_TOS 0x0002 +#define FIMB4_DADDR 0x0004 +#define FIMB4_SADDR 0x0008 +#define FIMB4_DPORT 0x0010 +#define FIMB4_SPORT 0x0020 +#define FIMB4_GPI 0x0040 +#define FIMB4_ALL 0x007f +/* for ipv6 */ +#define FIMB6_PROTO 0x0100 +#define FIMB6_TCLASS 0x0200 +#define FIMB6_DADDR 0x0400 +#define FIMB6_SADDR 0x0800 +#define FIMB6_DPORT 0x1000 +#define FIMB6_SPORT 0x2000 +#define FIMB6_GPI 0x4000 +#define FIMB6_FLABEL 0x8000 +#define FIMB6_ALL 0xff00 + +#define FIMB_ALL (FIMB4_ALL|FIMB6_ALL) + +#define FIMB4_PORTS (FIMB4_DPORT|FIMB4_SPORT|FIMB4_GPI) +#define FIMB6_PORTS (FIMB6_DPORT|FIMB6_SPORT|FIMB6_GPI) +#endif /* ALTQ3_CLFIER_COMPAT */ + +/* + * machine dependent clock + * a 64bit high resolution time counter. 
+ */ +extern int machclk_usepcc; +extern u_int32_t machclk_freq; +extern u_int32_t machclk_per_tick; +extern void init_machclk(void); +extern u_int64_t read_machclk(void); + +/* + * debug support + */ +#ifdef ALTQ_DEBUG +#ifdef __STDC__ +#define ASSERT(e) ((e) ? (void)0 : altq_assert(__FILE__, __LINE__, #e)) +#else /* PCC */ +#define ASSERT(e) ((e) ? (void)0 : altq_assert(__FILE__, __LINE__, "e")) +#endif +#else +#define ASSERT(e) ((void)0) +#endif + +/* + * misc stuff for compatibility + */ +/* ioctl cmd type */ +typedef u_long ioctlcmd_t; + +/* + * queue macros: + * the interface of TAILQ_LAST macro changed after the introduction + * of softupdate. redefine it here to make it work with pre-2.2.7. + */ +#undef TAILQ_LAST +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#ifndef TAILQ_EMPTY +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) +#endif +#ifndef TAILQ_FOREACH +#define TAILQ_FOREACH(var, head, field) \ + for (var = TAILQ_FIRST(head); var; var = TAILQ_NEXT(var, field)) +#endif + +/* macro for timeout/untimeout */ +/* use callout */ +#include <sys/callout.h> + +#if (__FreeBSD_version > 500000) +#define CALLOUT_INIT(c) callout_init((c), 0) +#else +#define CALLOUT_INIT(c) callout_init((c)) +#endif +#define CALLOUT_RESET(c,t,f,a) callout_reset((c),(t),(f),(a)) +#define CALLOUT_STOP(c) callout_stop((c)) +#if !defined(CALLOUT_INITIALIZER) && (__FreeBSD_version < 600000) +#define CALLOUT_INITIALIZER { { { NULL } }, 0, NULL, NULL, 0 } +#endif + +#define m_pktlen(m) ((m)->m_pkthdr.len) + +struct ifnet; struct mbuf; +struct pf_altq; +#ifdef ALTQ3_CLFIER_COMPAT +struct flowinfo; +#endif + +void *altq_lookup(char *, int); +#ifdef ALTQ3_CLFIER_COMPAT +int altq_extractflow(struct mbuf *, int, struct flowinfo *, u_int32_t); +int acc_add_filter(struct acc_classifier *, struct flow_filter *, + void *, u_long *); +int acc_delete_filter(struct acc_classifier *, u_long); +int acc_discard_filters(struct acc_classifier *, 
void *, int); +void *acc_classify(void *, struct mbuf *, int); +#endif +u_int8_t read_dsfield(struct mbuf *, struct altq_pktattr *); +void write_dsfield(struct mbuf *, struct altq_pktattr *, u_int8_t); +void altq_assert(const char *, int, const char *); +int tbr_set(struct ifaltq *, struct tb_profile *); +int tbr_get(struct ifaltq *, struct tb_profile *); + +int altq_pfattach(struct pf_altq *); +int altq_pfdetach(struct pf_altq *); +int altq_add(struct pf_altq *); +int altq_remove(struct pf_altq *); +int altq_add_queue(struct pf_altq *); +int altq_remove_queue(struct pf_altq *); +int altq_getqstats(struct pf_altq *, void *, int *); + +int cbq_pfattach(struct pf_altq *); +int cbq_add_altq(struct pf_altq *); +int cbq_remove_altq(struct pf_altq *); +int cbq_add_queue(struct pf_altq *); +int cbq_remove_queue(struct pf_altq *); +int cbq_getqstats(struct pf_altq *, void *, int *); + +int codel_pfattach(struct pf_altq *); +int codel_add_altq(struct pf_altq *); +int codel_remove_altq(struct pf_altq *); +int codel_getqstats(struct pf_altq *, void *, int *); + +int priq_pfattach(struct pf_altq *); +int priq_add_altq(struct pf_altq *); +int priq_remove_altq(struct pf_altq *); +int priq_add_queue(struct pf_altq *); +int priq_remove_queue(struct pf_altq *); +int priq_getqstats(struct pf_altq *, void *, int *); + +int hfsc_pfattach(struct pf_altq *); +int hfsc_add_altq(struct pf_altq *); +int hfsc_remove_altq(struct pf_altq *); +int hfsc_add_queue(struct pf_altq *); +int hfsc_remove_queue(struct pf_altq *); +int hfsc_getqstats(struct pf_altq *, void *, int *); + +int fairq_pfattach(struct pf_altq *); +int fairq_add_altq(struct pf_altq *); +int fairq_remove_altq(struct pf_altq *); +int fairq_add_queue(struct pf_altq *); +int fairq_remove_queue(struct pf_altq *); +int fairq_getqstats(struct pf_altq *, void *, int *); + +#endif /* _KERNEL */ +#endif /* _ALTQ_ALTQ_VAR_H_ */ diff --git a/freebsd/sys/net/altq/if_altq.h b/freebsd/sys/net/altq/if_altq.h new file mode 100644 index 
00000000..c5ad2875 --- /dev/null +++ b/freebsd/sys/net/altq/if_altq.h @@ -0,0 +1,182 @@ +/*- + * Copyright (C) 1997-2003 + * Sony Computer Science Laboratories Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $KAME: if_altq.h,v 1.12 2005/04/13 03:44:25 suz Exp $ + * $FreeBSD$ + */ +#ifndef _ALTQ_IF_ALTQ_H_ +#define _ALTQ_IF_ALTQ_H_ + +#include <rtems/bsd/sys/lock.h> /* XXX */ +#include <sys/mutex.h> /* XXX */ +#include <sys/event.h> /* XXX */ + +struct altq_pktattr; struct tb_regulator; struct top_cdnr; + +/* + * Structure defining a queue for a network interface. 
+ */ +struct ifaltq { + /* fields compatible with struct ifqueue */ + struct mbuf *ifq_head; + struct mbuf *ifq_tail; + int ifq_len; + int ifq_maxlen; + struct mtx ifq_mtx; + + /* driver owned queue (used for bulk dequeue and prepend) UNLOCKED */ + struct mbuf *ifq_drv_head; + struct mbuf *ifq_drv_tail; + int ifq_drv_len; + int ifq_drv_maxlen; + + /* alternate queueing related fields */ + int altq_type; /* discipline type */ + int altq_flags; /* flags (e.g. ready, in-use) */ + void *altq_disc; /* for discipline-specific use */ + struct ifnet *altq_ifp; /* back pointer to interface */ + + int (*altq_enqueue)(struct ifaltq *, struct mbuf *, + struct altq_pktattr *); + struct mbuf *(*altq_dequeue)(struct ifaltq *, int); + int (*altq_request)(struct ifaltq *, int, void *); + + /* classifier fields */ + void *altq_clfier; /* classifier-specific use */ + void *(*altq_classify)(void *, struct mbuf *, int); + + /* token bucket regulator */ + struct tb_regulator *altq_tbr; + + /* input traffic conditioner (doesn't belong to the output queue...) */ + struct top_cdnr *altq_cdnr; +}; + + +#ifdef _KERNEL + +/* + * packet attributes used by queueing disciplines. + * pattr_class is a discipline-dependent scheduling class that is + * set by a classifier. + * pattr_hdr and pattr_af may be used by a discipline to access + * the header within a mbuf. (e.g. ECN needs to update the CE bit) + * note that pattr_hdr could be stale after m_pullup, though link + * layer output routines usually don't use m_pullup. link-level + * compression also invalidates these fields. thus, pattr_hdr needs + * to be verified when a discipline touches the header. + */ +struct altq_pktattr { + void *pattr_class; /* sched class set by classifier */ + int pattr_af; /* address family */ + caddr_t pattr_hdr; /* saved header position in mbuf */ +}; + +/* + * mbuf tag to carry a queue id (and hints for ECN). 
+ */ +struct altq_tag { + u_int32_t qid; /* queue id */ + /* hints for ecn */ + int af; /* address family */ + void *hdr; /* saved header position in mbuf */ +}; + +/* + * a token-bucket regulator limits the rate that a network driver can + * dequeue packets from the output queue. + * modern cards are able to buffer a large amount of packets and dequeue + * too many packets at a time. this bursty dequeue behavior makes it + * impossible to schedule packets by queueing disciplines. + * a token-bucket is used to control the burst size in a device + * independent manner. + */ +struct tb_regulator { + int64_t tbr_rate; /* (scaled) token bucket rate */ + int64_t tbr_depth; /* (scaled) token bucket depth */ + + int64_t tbr_token; /* (scaled) current token */ + int64_t tbr_filluptime; /* (scaled) time to fill up bucket */ + u_int64_t tbr_last; /* last time token was updated */ + + int tbr_lastop; /* last dequeue operation type + needed for poll-and-dequeue */ +}; + +/* if_altqflags */ +#define ALTQF_READY 0x01 /* driver supports alternate queueing */ +#define ALTQF_ENABLED 0x02 /* altq is in use */ +#define ALTQF_CLASSIFY 0x04 /* classify packets */ +#define ALTQF_CNDTNING 0x08 /* altq traffic conditioning is enabled */ +#define ALTQF_DRIVER1 0x40 /* driver specific */ + +/* if_altqflags set internally only: */ +#define ALTQF_CANTCHANGE (ALTQF_READY) + +/* altq_dequeue 2nd arg */ +#define ALTDQ_REMOVE 1 /* dequeue mbuf from the queue */ +#define ALTDQ_POLL 2 /* don't dequeue mbuf from the queue */ + +/* altq request types (currently only purge is defined) */ +#define ALTRQ_PURGE 1 /* purge all packets */ + +#define ALTQ_IS_READY(ifq) ((ifq)->altq_flags & ALTQF_READY) +#define ALTQ_IS_ENABLED(ifq) ((ifq)->altq_flags & ALTQF_ENABLED) +#define ALTQ_NEEDS_CLASSIFY(ifq) ((ifq)->altq_flags & ALTQF_CLASSIFY) +#define ALTQ_IS_CNDTNING(ifq) ((ifq)->altq_flags & ALTQF_CNDTNING) + +#define ALTQ_SET_CNDTNING(ifq) ((ifq)->altq_flags |= ALTQF_CNDTNING) +#define ALTQ_CLEAR_CNDTNING(ifq) 
((ifq)->altq_flags &= ~ALTQF_CNDTNING) +#define ALTQ_IS_ATTACHED(ifq) ((ifq)->altq_disc != NULL) + +#define ALTQ_ENQUEUE(ifq, m, pa, err) \ + (err) = (*(ifq)->altq_enqueue)((ifq),(m),(pa)) +#define ALTQ_DEQUEUE(ifq, m) \ + (m) = (*(ifq)->altq_dequeue)((ifq), ALTDQ_REMOVE) +#define ALTQ_POLL(ifq, m) \ + (m) = (*(ifq)->altq_dequeue)((ifq), ALTDQ_POLL) +#define ALTQ_PURGE(ifq) \ + (void)(*(ifq)->altq_request)((ifq), ALTRQ_PURGE, (void *)0) +#define ALTQ_IS_EMPTY(ifq) ((ifq)->ifq_len == 0) +#define TBR_IS_ENABLED(ifq) ((ifq)->altq_tbr != NULL) + +extern int altq_attach(struct ifaltq *, int, void *, + int (*)(struct ifaltq *, struct mbuf *, + struct altq_pktattr *), + struct mbuf *(*)(struct ifaltq *, int), + int (*)(struct ifaltq *, int, void *), + void *, + void *(*)(void *, struct mbuf *, int)); +extern int altq_detach(struct ifaltq *); +extern int altq_enable(struct ifaltq *); +extern int altq_disable(struct ifaltq *); +extern struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int); +extern int (*altq_input)(struct mbuf *, int); +#if 0 /* ALTQ3_CLFIER_COMPAT */ +void altq_etherclassify(struct ifaltq *, struct mbuf *, struct altq_pktattr *); +#endif +#endif /* _KERNEL */ + +#endif /* _ALTQ_IF_ALTQ_H_ */ diff --git a/freebsd/sys/net/bpf.c b/freebsd/sys/net/bpf.c index f74ac9a1..e7822586 100644 --- a/freebsd/sys/net/bpf.c +++ b/freebsd/sys/net/bpf.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_bpf.h> #include <rtems/bsd/local/opt_compat.h> +#include <rtems/bsd/local/opt_ddb.h> #include <rtems/bsd/local/opt_netgraph.h> #include <sys/types.h> @@ -69,8 +70,13 @@ __FBSDID("$FreeBSD$"); #include <sys/socket.h> +#ifdef DDB +#include <ddb/ddb.h> +#endif + #include <net/if.h> -#define BPF_INTERNAL +#include <net/if_var.h> +#include <net/if_dl.h> #include <net/bpf.h> #include <net/bpf_buffer.h> #ifdef BPF_JITTER @@ -78,6 +84,7 @@ __FBSDID("$FreeBSD$"); #endif #include <net/bpf_zerocopy.h> #include <net/bpfdesc.h> +#include <net/route.h> #include 
<net/vnet.h> #include <netinet/in.h> @@ -96,6 +103,20 @@ __FBSDID("$FreeBSD$"); MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); +struct bpf_if { +#define bif_next bif_ext.bif_next +#define bif_dlist bif_ext.bif_dlist + struct bpf_if_ext bif_ext; /* public members */ + u_int bif_dlt; /* link layer type */ + u_int bif_hdrlen; /* length of link header */ + struct ifnet *bif_ifp; /* corresponding interface */ + struct rwlock bif_lock; /* interface lock */ + LIST_HEAD(, bpf_d) bif_wlist; /* writer-only list */ + int bif_flags; /* Interface flags */ +}; + +CTASSERT(offsetof(struct bpf_if, bif_ext) == 0); + #if defined(DEV_BPF) || defined(NETGRAPH_BPF) #define PRINET 26 /* interruptible */ @@ -107,7 +128,7 @@ MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); #include <sys/mount.h> #include <compat/freebsd32/freebsd32.h> #define BPF_ALIGNMENT32 sizeof(int32_t) -#define BPF_WORDALIGN32(x) (((x)+(BPF_ALIGNMENT32-1))&~(BPF_ALIGNMENT32-1)) +#define BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32) #ifndef BURN_BRIDGES /* @@ -148,7 +169,7 @@ struct bpf_dltlist32 { * structures registered by different layers in the stack (i.e., 802.11 * frames, ethernet frames, etc). 
*/ -static LIST_HEAD(, bpf_if) bpf_iflist; +static LIST_HEAD(, bpf_if) bpf_iflist, bpf_freelist; static struct mtx bpf_mtx; /* bpf global lock */ static int bpf_bpfd_cnt; @@ -157,7 +178,7 @@ static void bpf_detachd(struct bpf_d *); static void bpf_detachd_locked(struct bpf_d *); static void bpf_freed(struct bpf_d *); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, - struct sockaddr *, int *, struct bpf_insn *); + struct sockaddr *, int *, struct bpf_d *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void @@ -188,8 +209,8 @@ static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW, static VNET_DEFINE(int, bpf_optimize_writers) = 0; #define V_bpf_optimize_writers VNET(bpf_optimize_writers) -SYSCTL_VNET_INT(_net_bpf, OID_AUTO, optimize_writers, - CTLFLAG_RW, &VNET_NAME(bpf_optimize_writers), 0, +SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(bpf_optimize_writers), 0, "Do not send packets until BPF program is set"); #ifndef __rtems__ @@ -479,7 +500,7 @@ bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) */ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, - struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter) + struct sockaddr *sockp, int *hdrlen, struct bpf_d *d) { const struct ieee80211_bpf_params *p; struct ether_header *eh; @@ -561,37 +582,20 @@ bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, } len = uio->uio_resid; - - if (len - hlen > ifp->if_mtu) + if (len < hlen || len - hlen > ifp->if_mtu) return (EMSGSIZE); - if ((unsigned)len > MJUM16BYTES) + m = m_get2(len, M_WAITOK, MT_DATA, M_PKTHDR); + if (m == NULL) return (EIO); - - if (len <= MHLEN) - MGETHDR(m, M_WAIT, MT_DATA); - else if (len <= MCLBYTES) - m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR); - else - m = m_getjcl(M_WAIT, MT_DATA, M_PKTHDR, -#if (MJUMPAGESIZE > 
MCLBYTES) - len <= MJUMPAGESIZE ? MJUMPAGESIZE : -#endif - (len <= MJUM9BYTES ? MJUM9BYTES : MJUM16BYTES)); m->m_pkthdr.len = m->m_len = len; - m->m_pkthdr.rcvif = NULL; *mp = m; - if (m->m_len < hlen) { - error = EPERM; - goto bad; - } - error = uiomove(mtod(m, u_char *), len, uio); if (error) goto bad; - slen = bpf_filter(wfilter, mtod(m, u_char *), len, len); + slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; @@ -608,6 +612,10 @@ bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, else m->m_flags |= M_MCAST; } + if (d->bd_hdrcmplt == 0) { + memcpy(eh->ether_shost, IF_LLADDR(ifp), + sizeof(eh->ether_shost)); + } break; } @@ -632,7 +640,7 @@ bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, goto bad; } } - bcopy(m->m_data, sockp->sa_data, hlen); + bcopy(mtod(m, const void *), sockp->sa_data, hlen); } *hdrlen = hlen; @@ -656,13 +664,13 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) * Save sysctl value to protect from sysctl change * between reads */ - op_w = V_bpf_optimize_writers; + op_w = V_bpf_optimize_writers || d->bd_writer; if (d->bd_bif != NULL) bpf_detachd_locked(d); /* * Point d at bp, and add d to the interface's list. - * Since there are many applicaiotns using BPF for + * Since there are many applications using BPF for * sending raw packets only (dhcpd, cdpd are good examples) * we can delay adding d to the list of active listeners until * some filter is configured. @@ -760,7 +768,7 @@ bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) /* * Add d to the list of active bp filters. - * Reuqires bpf_attachd() to be called before + * Requires bpf_attachd() to be called before. 
*/ static void bpf_upgraded(struct bpf_d *d) @@ -909,7 +917,7 @@ bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct bpf_d *d; #ifndef __rtems__ - int error, size; + int error; d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO); error = devfs_set_cdevpriv(d, bpf_dtor); @@ -932,6 +940,8 @@ bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) * particular buffer method. */ bpf_buffer_init(d); + if ((flags & FREAD) == 0) + d->bd_writer = 2; d->bd_hbuf_in_use = 0; d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; @@ -945,10 +955,6 @@ bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) callout_init_mtx(&d->bd_callout, &d->bd_lock, 0); knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock); - /* Allocate default buffers */ - size = d->bd_bufsize; - bpf_buffer_ioctl_sblen(d, &size); - #ifndef __rtems__ return (0); #else /* __rtems__ */ @@ -1163,6 +1169,7 @@ bpfwrite(struct bpf_d *d, struct uio *uio, int ioflag) struct ifnet *ifp; struct mbuf *m, *mc; struct sockaddr dst; + struct route ro; int error, hlen; error = devfs_get_cdevpriv((void **)&d); @@ -1194,7 +1201,7 @@ bpfwrite(struct bpf_d *d, struct uio *uio, int ioflag) hlen = 0; /* XXX: bpf_movein() can sleep */ error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, - &m, &dst, &hlen, d->bd_wfilter); + &m, &dst, &hlen, d); if (error) { d->bd_wdcount++; return (error); @@ -1204,7 +1211,7 @@ bpfwrite(struct bpf_d *d, struct uio *uio, int ioflag) dst.sa_family = pseudo_AF_HDRCMPLT; if (d->bd_feedback) { - mc = m_dup(m, M_DONTWAIT); + mc = m_dup(m, M_NOWAIT); if (mc != NULL) mc->m_pkthdr.rcvif = ifp; /* Set M_PROMISC for outgoing packets to be discarded. 
*/ @@ -1226,7 +1233,14 @@ bpfwrite(struct bpf_d *d, struct uio *uio, int ioflag) BPFD_UNLOCK(d); #endif - error = (*ifp->if_output)(ifp, m, &dst, NULL); + bzero(&ro, sizeof(ro)); + if (hlen != 0) { + ro.ro_prepend = (u_char *)&dst.sa_data; + ro.ro_plen = hlen; + ro.ro_flags = RT_HAS_HEADER; + } + + error = (*ifp->if_output)(ifp, m, &dst, &ro); if (error) d->bd_wdcount++; @@ -1278,7 +1292,6 @@ reset_d(struct bpf_d *d) /* * FIONREAD Check for read packet available. - * SIOCGIFADDR Get interface address - convenient hook to driver. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set read filter. * BIOCSETFNR Set read filter without resetting descriptor. @@ -1347,7 +1360,7 @@ bpfioctl(struct bpf_d *d, u_long cmd, caddr_t addr, int flags, #endif case BIOCGETIF: case BIOCGRTIMEOUT: -#ifdef COMPAT_FREEBSD32 +#if defined(COMPAT_FREEBSD32) && !defined(__mips__) case BIOCGRTIMEOUT32: #endif case BIOCGSTATS: @@ -1359,7 +1372,7 @@ bpfioctl(struct bpf_d *d, u_long cmd, caddr_t addr, int flags, case FIONREAD: case BIOCLOCK: case BIOCSRTIMEOUT: -#ifdef COMPAT_FREEBSD32 +#if defined(COMPAT_FREEBSD32) && !defined(__mips__) case BIOCSRTIMEOUT32: #endif case BIOCIMMEDIATE: @@ -1415,19 +1428,6 @@ bpfioctl(struct bpf_d *d, u_long cmd, caddr_t addr, int flags, break; } - case SIOCGIFADDR: - { - struct ifnet *ifp; - - if (d->bd_bif == NULL) - error = EINVAL; - else { - ifp = d->bd_bif->bif_ifp; - error = (*ifp->if_ioctl)(ifp, cmd, addr); - } - break; - } - /* * Get buffer len [for read()]. */ @@ -1564,21 +1564,44 @@ bpfioctl(struct bpf_d *d, u_long cmd, caddr_t addr, int flags, * Set interface. */ case BIOCSETIF: - BPF_LOCK(); - error = bpf_setif(d, (struct ifreq *)addr); - BPF_UNLOCK(); - break; + { + int alloc_buf, size; + + /* + * Behavior here depends on the buffering model. If + * we're using kernel memory buffers, then we can + * allocate them here. If we're using zero-copy, + * then the user process must have registered buffers + * by the time we get here. 
+ */ + alloc_buf = 0; + BPFD_LOCK(d); + if (d->bd_bufmode == BPF_BUFMODE_BUFFER && + d->bd_sbuf == NULL) + alloc_buf = 1; + BPFD_UNLOCK(d); + if (alloc_buf) { + size = d->bd_bufsize; + error = bpf_buffer_ioctl_sblen(d, &size); + if (error != 0) + break; + } + BPF_LOCK(); + error = bpf_setif(d, (struct ifreq *)addr); + BPF_UNLOCK(); + break; + } /* * Set read timeout. */ case BIOCSRTIMEOUT: -#ifdef COMPAT_FREEBSD32 +#if defined(COMPAT_FREEBSD32) && !defined(__mips__) case BIOCSRTIMEOUT32: #endif { struct timeval *tv = (struct timeval *)addr; -#ifdef COMPAT_FREEBSD32 +#if defined(COMPAT_FREEBSD32) && !defined(__mips__) struct timeval32 *tv32; struct timeval tv64; @@ -1604,12 +1627,12 @@ bpfioctl(struct bpf_d *d, u_long cmd, caddr_t addr, int flags, * Get read timeout. */ case BIOCGRTIMEOUT: -#ifdef COMPAT_FREEBSD32 +#if defined(COMPAT_FREEBSD32) && !defined(__mips__) case BIOCGRTIMEOUT32: #endif { struct timeval *tv; -#ifdef COMPAT_FREEBSD32 +#if defined(COMPAT_FREEBSD32) && !defined(__mips__) struct timeval32 *tv32; struct timeval tv64; @@ -1621,7 +1644,7 @@ bpfioctl(struct bpf_d *d, u_long cmd, caddr_t addr, int flags, tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; -#ifdef COMPAT_FREEBSD32 +#if defined(COMPAT_FREEBSD32) && !defined(__mips__) if (cmd == BIOCGRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv32->tv_sec = tv->tv_sec; @@ -2001,17 +2024,15 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr) /* Check if interface is not being detached from BPF */ BPFIF_RLOCK(bp); - if (bp->flags & BPFIF_FLAG_DYING) { + if (bp->bif_flags & BPFIF_FLAG_DYING) { BPFIF_RUNLOCK(bp); return (ENXIO); } BPFIF_RUNLOCK(bp); /* - * Behavior here depends on the buffering model. If we're using - * kernel memory buffers, then we can allocate them here. If we're - * using zero-copy, then the user process must have registered - * buffers by the time we get here. If not, return an error. + * At this point, we expect the buffer is already allocated. 
If not, + * return an error. */ switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: @@ -2131,10 +2152,10 @@ filt_bpfread(struct knote *kn, long hint) ready = bpf_ready(d); if (ready) { kn->kn_data = d->bd_slen; - while (d->bd_hbuf_in_use) - mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, - PRINET, "bd_hbuf", 0); - if (d->bd_hbuf) + /* + * Ignore the hold buffer if it is being copied to user space. + */ + if (!d->bd_hbuf_in_use && d->bd_hbuf) kn->kn_data += d->bd_hlen; } else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, @@ -2405,12 +2426,19 @@ bpf_hdrlen(struct bpf_d *d) static void bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype) { +#ifndef __rtems__ + struct bintime bt2, boottimebin; +#else /* __rtems__ */ struct bintime bt2; +#endif /* __rtems__ */ struct timeval tsm; struct timespec tsn; if ((tstype & BPF_T_MONOTONIC) == 0) { bt2 = *bt; +#ifndef __rtems__ + getboottimebin(&boottimebin); +#endif /* __rtems__ */ bintime_add(&bt2, &boottimebin); bt = &bt2; } @@ -2466,9 +2494,6 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, * spot to do it. 
*/ if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) { - while (d->bd_hbuf_in_use) - mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, - PRINET, "bd_hbuf", 0); d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; @@ -2511,9 +2536,7 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, ++d->bd_dcount; return; } - while (d->bd_hbuf_in_use) - mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, - PRINET, "bd_hbuf", 0); + KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use")); ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; @@ -2652,10 +2675,36 @@ bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) bp->bif_hdrlen = hdrlen; - if (bootverbose) + if (bootverbose && IS_DEFAULT_VNET(curvnet)) if_printf(ifp, "bpf attached\n"); } +#ifdef VIMAGE +/* + * When moving interfaces between vnet instances we need a way to + * query the dlt and hdrlen before detach so we can re-attch the if_bpf + * after the vmove. We unfortunately have no device driver infrastructure + * to query the interface for these values after creation/attach, thus + * add this as a workaround. + */ +int +bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen) +{ + + if (bp == NULL) + return (ENXIO); + if (bif_dlt == NULL && bif_hdrlen == NULL) + return (0); + + if (bif_dlt != NULL) + *bif_dlt = bp->bif_dlt; + if (bif_hdrlen != NULL) + *bif_hdrlen = bp->bif_hdrlen; + + return (0); +} +#endif + /* * Detach bpf from an interface. This involves detaching each descriptor * associated with the interface. Notify each descriptor as it's detached @@ -2664,52 +2713,51 @@ bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) void bpfdetach(struct ifnet *ifp) { - struct bpf_if *bp; + struct bpf_if *bp, *bp_temp; struct bpf_d *d; -#ifdef INVARIANTS int ndetached; ndetached = 0; -#endif BPF_LOCK(); /* Find all bpf_if struct's which reference ifp and detach them. 
*/ - do { - LIST_FOREACH(bp, &bpf_iflist, bif_next) { - if (ifp == bp->bif_ifp) - break; - } - if (bp != NULL) - LIST_REMOVE(bp, bif_next); + LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) { + if (ifp != bp->bif_ifp) + continue; - if (bp != NULL) { -#ifdef INVARIANTS - ndetached++; -#endif - while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) { - bpf_detachd_locked(d); - BPFD_LOCK(d); - bpf_wakeup(d); - BPFD_UNLOCK(d); - } - /* Free writer-only descriptors */ - while ((d = LIST_FIRST(&bp->bif_wlist)) != NULL) { - bpf_detachd_locked(d); - BPFD_LOCK(d); - bpf_wakeup(d); - BPFD_UNLOCK(d); - } + LIST_REMOVE(bp, bif_next); + /* Add to to-be-freed list */ + LIST_INSERT_HEAD(&bpf_freelist, bp, bif_next); - /* - * Delay freing bp till interface is detached - * and all routes through this interface are removed. - * Mark bp as detached to restrict new consumers. - */ - BPFIF_WLOCK(bp); - bp->flags |= BPFIF_FLAG_DYING; - BPFIF_WUNLOCK(bp); + ndetached++; + /* + * Delay freeing bp till interface is detached + * and all routes through this interface are removed. + * Mark bp as detached to restrict new consumers. + */ + BPFIF_WLOCK(bp); + bp->bif_flags |= BPFIF_FLAG_DYING; + BPFIF_WUNLOCK(bp); + + CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p", + __func__, bp->bif_dlt, bp, ifp); + + /* Free common descriptors */ + while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) { + bpf_detachd_locked(d); + BPFD_LOCK(d); + bpf_wakeup(d); + BPFD_UNLOCK(d); } - } while (bp != NULL); + + /* Free writer-only descriptors */ + while ((d = LIST_FIRST(&bp->bif_wlist)) != NULL) { + bpf_detachd_locked(d); + BPFD_LOCK(d); + bpf_wakeup(d); + BPFD_UNLOCK(d); + } + } BPF_UNLOCK(); #ifdef INVARIANTS @@ -2721,32 +2769,46 @@ bpfdetach(struct ifnet *ifp) /* * Interface departure handler. * Note departure event does not guarantee interface is going down. + * Interface renaming is currently done via departure/arrival event set. 
+ * + * Departure handled is called after all routes pointing to + * given interface are removed and interface is in down state + * restricting any packets to be sent/received. We assume it is now safe + * to free data allocated by BPF. */ static void bpf_ifdetach(void *arg __unused, struct ifnet *ifp) { - struct bpf_if *bp; + struct bpf_if *bp, *bp_temp; + int nmatched = 0; BPF_LOCK(); - if ((bp = ifp->if_bpf) == NULL) { - BPF_UNLOCK(); - return; - } + /* + * Find matching entries in free list. + * Nothing should be found if bpfdetach() was not called. + */ + LIST_FOREACH_SAFE(bp, &bpf_freelist, bif_next, bp_temp) { + if (ifp != bp->bif_ifp) + continue; - /* Check if bpfdetach() was called previously */ - if ((bp->flags & BPFIF_FLAG_DYING) == 0) { - BPF_UNLOCK(); - return; - } + CTR3(KTR_NET, "%s: freeing BPF instance %p for interface %p", + __func__, bp, ifp); - CTR3(KTR_NET, "%s: freing BPF instance %p for interface %p", - __func__, bp, ifp); + LIST_REMOVE(bp, bif_next); - ifp->if_bpf = NULL; + rw_destroy(&bp->bif_lock); + free(bp, M_BPF); + + nmatched++; + } BPF_UNLOCK(); - rw_destroy(&bp->bif_lock); - free(bp, M_BPF); + /* + * Note that we cannot zero other pointers to + * custom DLTs possibly used by given interface. 
+ */ + if (nmatched != 0) + ifp->if_bpf = NULL; } /* @@ -2755,26 +2817,44 @@ bpf_ifdetach(void *arg __unused, struct ifnet *ifp) static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { - int n, error; struct ifnet *ifp; struct bpf_if *bp; + u_int *lst; + int error, n, n1; BPF_LOCK_ASSERT(); ifp = d->bd_bif->bif_ifp; +again: + n1 = 0; + LIST_FOREACH(bp, &bpf_iflist, bif_next) { + if (bp->bif_ifp == ifp) + n1++; + } + if (bfl->bfl_list == NULL) { + bfl->bfl_len = n1; + return (0); + } + if (n1 > bfl->bfl_len) + return (ENOMEM); + BPF_UNLOCK(); + lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK); n = 0; - error = 0; + BPF_LOCK(); LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; - if (bfl->bfl_list != NULL) { - if (n >= bfl->bfl_len) - return (ENOMEM); - error = copyout(&bp->bif_dlt, - bfl->bfl_list + n, sizeof(u_int)); + if (n >= n1) { + free(lst, M_TEMP); + goto again; } + lst[n] = bp->bif_dlt; n++; } + BPF_UNLOCK(); + error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n); + free(lst, M_TEMP); + BPF_LOCK(); bfl->bfl_len = n; return (error); } @@ -2999,6 +3079,7 @@ bpf_drvinit(void *unused) mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF); LIST_INIT(&bpf_iflist); + LIST_INIT(&bpf_freelist); #ifndef __rtems__ dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); @@ -3214,3 +3295,34 @@ bpf_validate(const struct bpf_insn *f, int len) } #endif /* !DEV_BPF && !NETGRAPH_BPF */ + +#ifdef DDB +static void +bpf_show_bpf_if(struct bpf_if *bpf_if) +{ + + if (bpf_if == NULL) + return; + db_printf("%p:\n", bpf_if); +#define BPF_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, bpf_if->e); + /* bif_ext.bif_next */ + /* bif_ext.bif_dlist */ + BPF_DB_PRINTF("%#x", bif_dlt); + BPF_DB_PRINTF("%u", bif_hdrlen); + BPF_DB_PRINTF("%p", bif_ifp); + /* bif_lock */ + /* bif_wlist */ + BPF_DB_PRINTF("%#x", bif_flags); +} + +DB_SHOW_COMMAND(bpf_if, db_show_bpf_if) +{ + + if (!have_addr) { + db_printf("usage: show bpf_if <struct bpf_if 
*>\n"); + return; + } + + bpf_show_bpf_if((struct bpf_if *)addr); +} +#endif diff --git a/freebsd/sys/net/bpf.h b/freebsd/sys/net/bpf.h index bfe8cfe0..f707f436 100644 --- a/freebsd/sys/net/bpf.h +++ b/freebsd/sys/net/bpf.h @@ -582,7 +582,7 @@ struct bpf_zbuf_header { * input packets such as port scans, packets from old lost connections, * etc. to force the connection to stay up). * - * The first byte of the PPP header (0xff03) is modified to accomodate + * The first byte of the PPP header (0xff03) is modified to accommodate * the direction - 0x00 = IN, 0x01 = OUT. */ #define DLT_PPP_PPPD 166 @@ -1096,7 +1096,7 @@ struct bpf_zbuf_header { #define DLT_NETANALYZER_TRANSPARENT 241 /* - * IP-over-Infiniband, as specified by RFC 4391. + * IP-over-InfiniBand, as specified by RFC 4391. * * Requested by Petr Sumbera <petr.sumbera@oracle.com>. */ @@ -1138,7 +1138,145 @@ struct bpf_zbuf_header { #define DLT_PFSYNC 246 #endif -#define DLT_MATCHING_MAX 246 /* highest value in the "matching" range */ +/* + * Raw InfiniBand packets, starting with the Local Routing Header. + * + * Requested by Oren Kladnitsky <orenk@mellanox.com>. + */ +#define DLT_INFINIBAND 247 + +/* + * SCTP, with no lower-level protocols (i.e., no IPv4 or IPv6). + * + * Requested by Michael Tuexen <Michael.Tuexen@lurchi.franken.de>. + */ +#define DLT_SCTP 248 + +/* + * USB packets, beginning with a USBPcap header. + * + * Requested by Tomasz Mon <desowin@gmail.com> + */ +#define DLT_USBPCAP 249 + +/* + * Schweitzer Engineering Laboratories "RTAC" product serial-line + * packets. + * + * Requested by Chris Bontje <chris_bontje@selinc.com>. + */ +#define DLT_RTAC_SERIAL 250 + +/* + * Bluetooth Low Energy air interface link-layer packets. + * + * Requested by Mike Kershaw <dragorn@kismetwireless.net>. + */ +#define DLT_BLUETOOTH_LE_LL 251 + +/* + * DLT type for upper-protocol layer PDU saves from wireshark. 
+ * + * the actual contents are determined by two TAGs stored with each + * packet: + * EXP_PDU_TAG_LINKTYPE the link type (LINKTYPE_ value) of the + * original packet. + * + * EXP_PDU_TAG_PROTO_NAME the name of the wireshark dissector + * that can make sense of the data stored. + */ +#define DLT_WIRESHARK_UPPER_PDU 252 + +/* + * DLT type for the netlink protocol (nlmon devices). + */ +#define DLT_NETLINK 253 + +/* + * Bluetooth Linux Monitor headers for the BlueZ stack. + */ +#define DLT_BLUETOOTH_LINUX_MONITOR 254 + +/* + * Bluetooth Basic Rate/Enhanced Data Rate baseband packets, as + * captured by Ubertooth. + */ +#define DLT_BLUETOOTH_BREDR_BB 255 + +/* + * Bluetooth Low Energy link layer packets, as captured by Ubertooth. + */ +#define DLT_BLUETOOTH_LE_LL_WITH_PHDR 256 + +/* + * PROFIBUS data link layer. + */ +#define DLT_PROFIBUS_DL 257 + +/* + * Apple's DLT_PKTAP headers. + * + * Sadly, the folks at Apple either had no clue that the DLT_USERn values + * are for internal use within an organization and partners only, and + * didn't know that the right way to get a link-layer header type is to + * ask tcpdump.org for one, or knew and didn't care, so they just + * used DLT_USER2, which causes problems for everything except for + * their version of tcpdump. + * + * So I'll just give them one; hopefully this will show up in a + * libpcap release in time for them to get this into 10.10 Big Sur + * or whatever Mavericks' successor is called. LINKTYPE_PKTAP + * will be 258 *even on OS X*; that is *intentional*, so that + * PKTAP files look the same on *all* OSes (different OSes can have + * different numerical values for a given DLT_, but *MUST NOT* have + * different values for what goes in a file, as files can be moved + * between OSes!). 
+ * + * When capturing, on a system with a Darwin-based OS, on a device + * that returns 149 (DLT_USER2 and Apple's DLT_PKTAP) with this + * version of libpcap, the DLT_ value for the pcap_t will be DLT_PKTAP, + * and that will continue to be DLT_USER2 on Darwin-based OSes. That way, + * binary compatibility with Mavericks is preserved for programs using + * this version of libpcap. This does mean that if you were using + * DLT_USER2 for some capture device on OS X, you can't do so with + * this version of libpcap, just as you can't with Apple's libpcap - + * on OS X, they define DLT_PKTAP to be DLT_USER2, so programs won't + * be able to distinguish between PKTAP and whatever you were using + * DLT_USER2 for. + * + * If the program saves the capture to a file using this version of + * libpcap's pcap_dump code, the LINKTYPE_ value in the file will be + * LINKTYPE_PKTAP, which will be 258, even on Darwin-based OSes. + * That way, the file will *not* be a DLT_USER2 file. That means + * that the latest version of tcpdump, when built with this version + * of libpcap, and sufficiently recent versions of Wireshark will + * be able to read those files and interpret them correctly; however, + * Apple's version of tcpdump in OS X 10.9 won't be able to handle + * them. (Hopefully, Apple will pick up this version of libpcap, + * and the corresponding version of tcpdump, so that tcpdump will + * be able to handle the old LINKTYPE_USER2 captures *and* the new + * LINKTYPE_PKTAP captures.) + */ +#ifdef __APPLE__ +#define DLT_PKTAP DLT_USER2 +#else +#define DLT_PKTAP 258 +#endif + +/* + * Ethernet packets preceded by a header giving the last 6 octets + * of the preamble specified by 802.3-2012 Clause 65, section + * 65.1.3.2 "Transmit". + */ +#define DLT_EPON 259 + +/* + * IPMI trace packets, as specified by Table 3-20 "Trace Data Block Format" + * in the PICMG HPM.2 specification. 
+ */ +#define DLT_IPMI_HPM_2 260 + +#define DLT_MATCHING_MAX 260 /* highest value in the "matching" range */ /* * DLT and savefile link type values are split into a class and @@ -1149,7 +1287,17 @@ struct bpf_zbuf_header { /* * The instruction encodings. + * + * Please inform tcpdump-workers@lists.tcpdump.org if you use any + * of the reserved values, so that we can note that they're used + * (and perhaps implement it in the reference BPF implementation + * and encourage its implementation elsewhere). */ + +/* + * The upper 8 bits of the opcode aren't used. BSD/OS used 0x8000. + */ + /* instruction classes */ #define BPF_CLASS(code) ((code) & 0x07) #define BPF_LD 0x00 @@ -1166,6 +1314,7 @@ struct bpf_zbuf_header { #define BPF_W 0x00 #define BPF_H 0x08 #define BPF_B 0x10 +/* 0x18 reserved; used by BSD/OS */ #define BPF_MODE(code) ((code) & 0xe0) #define BPF_IMM 0x00 #define BPF_ABS 0x20 @@ -1173,6 +1322,8 @@ struct bpf_zbuf_header { #define BPF_MEM 0x60 #define BPF_LEN 0x80 #define BPF_MSH 0xa0 +/* 0xc0 reserved; used by BSD/OS */ +/* 0xe0 reserved; used by BSD/OS */ /* alu/jmp fields */ #define BPF_OP(code) ((code) & 0xf0) @@ -1185,11 +1336,30 @@ struct bpf_zbuf_header { #define BPF_LSH 0x60 #define BPF_RSH 0x70 #define BPF_NEG 0x80 +#define BPF_MOD 0x90 +#define BPF_XOR 0xa0 +/* 0xb0 reserved */ +/* 0xc0 reserved */ +/* 0xd0 reserved */ +/* 0xe0 reserved */ +/* 0xf0 reserved */ + #define BPF_JA 0x00 #define BPF_JEQ 0x10 #define BPF_JGT 0x20 #define BPF_JGE 0x30 #define BPF_JSET 0x40 +/* 0x50 reserved; used on BSD/OS */ +/* 0x60 reserved */ +/* 0x70 reserved */ +/* 0x80 reserved */ +/* 0x90 reserved */ +/* 0xa0 reserved */ +/* 0xb0 reserved */ +/* 0xc0 reserved */ +/* 0xd0 reserved */ +/* 0xe0 reserved */ +/* 0xf0 reserved */ #define BPF_SRC(code) ((code) & 0x08) #define BPF_K 0x00 #define BPF_X 0x08 @@ -1197,11 +1367,43 @@ struct bpf_zbuf_header { /* ret - BPF_K and BPF_X also apply */ #define BPF_RVAL(code) ((code) & 0x18) #define BPF_A 0x10 +/* 0x18 reserved */ 
/* misc */ #define BPF_MISCOP(code) ((code) & 0xf8) #define BPF_TAX 0x00 +/* 0x08 reserved */ +/* 0x10 reserved */ +/* 0x18 reserved */ +/* #define BPF_COP 0x20 NetBSD "coprocessor" extensions */ +/* 0x28 reserved */ +/* 0x30 reserved */ +/* 0x38 reserved */ +/* #define BPF_COPX 0x40 NetBSD "coprocessor" extensions */ +/* also used on BSD/OS */ +/* 0x48 reserved */ +/* 0x50 reserved */ +/* 0x58 reserved */ +/* 0x60 reserved */ +/* 0x68 reserved */ +/* 0x70 reserved */ +/* 0x78 reserved */ #define BPF_TXA 0x80 +/* 0x88 reserved */ +/* 0x90 reserved */ +/* 0x98 reserved */ +/* 0xa0 reserved */ +/* 0xa8 reserved */ +/* 0xb0 reserved */ +/* 0xb8 reserved */ +/* 0xc0 reserved; used on BSD/OS */ +/* 0xc8 reserved */ +/* 0xd0 reserved */ +/* 0xd8 reserved */ +/* 0xe0 reserved */ +/* 0xe8 reserved */ +/* 0xf0 reserved */ +/* 0xf8 reserved */ /* * The instruction data structure. @@ -1237,9 +1439,9 @@ SYSCTL_DECL(_net_bpf); /* * Rotate the packet buffers in descriptor d. Move the store buffer into the - * hold slot, and the free buffer ino the store slot. Zero the length of the - * new store buffer. Descriptor lock should be held. Hold buffer must - * not be marked "in use". + * hold slot, and the free buffer into the store slot. Zero the length of the + * new store buffer. Descriptor lock should be held. One must be careful to + * not rotate the buffers twice, i.e. if fbuf != NULL. */ #define ROTATE_BUFFERS(d) do { \ (d)->bd_hbuf = (d)->bd_sbuf; \ @@ -1252,21 +1454,14 @@ SYSCTL_DECL(_net_bpf); /* * Descriptor associated with each attached hardware interface. - * FIXME: this structure is exposed to external callers to speed up - * bpf_peers_present() call. However we cover all fields not needed by - * this function via BPF_INTERNAL define + * Part of this structure is exposed to external callers to speed up + * bpf_peers_present() calls. 
*/ -struct bpf_if { +struct bpf_if; + +struct bpf_if_ext { LIST_ENTRY(bpf_if) bif_next; /* list of all interfaces */ LIST_HEAD(, bpf_d) bif_dlist; /* descriptor list */ -#ifdef BPF_INTERNAL - u_int bif_dlt; /* link layer type */ - u_int bif_hdrlen; /* length of link header */ - struct ifnet *bif_ifp; /* corresponding interface */ - struct rwlock bif_lock; /* interface lock */ - LIST_HEAD(, bpf_d) bif_wlist; /* writer-only list */ - int flags; /* Interface flags */ -#endif }; void bpf_bufheld(struct bpf_d *d); @@ -1277,6 +1472,9 @@ void bpf_mtap2(struct bpf_if *, void *, u_int, struct mbuf *); void bpfattach(struct ifnet *, u_int, u_int); void bpfattach2(struct ifnet *, u_int, u_int, struct bpf_if **); void bpfdetach(struct ifnet *); +#ifdef VIMAGE +int bpf_get_bp_params(struct bpf_if *, u_int *, u_int *); +#endif void bpfilterattach(int); u_int bpf_filter(const struct bpf_insn *, u_char *, u_int, u_int); @@ -1284,8 +1482,10 @@ u_int bpf_filter(const struct bpf_insn *, u_char *, u_int, u_int); static __inline int bpf_peers_present(struct bpf_if *bpf) { + struct bpf_if_ext *ext; - if (!LIST_EMPTY(&bpf->bif_dlist)) + ext = (struct bpf_if_ext *)bpf; + if (!LIST_EMPTY(&ext->bif_dlist)) return (1); return (0); } @@ -1313,4 +1513,12 @@ bpf_peers_present(struct bpf_if *bpf) */ #define BPF_MEMWORDS 16 +#ifdef _SYS_EVENTHANDLER_H_ +/* BPF attach/detach events */ +struct ifnet; +typedef void (*bpf_track_fn)(void *, struct ifnet *, int /* dlt */, + int /* 1 =>'s attach */); +EVENTHANDLER_DECLARE(bpf_track, bpf_track_fn); +#endif /* _SYS_EVENTHANDLER_H_ */ + #endif /* _NET_BPF_H_ */ diff --git a/freebsd/sys/net/bpf_buffer.c b/freebsd/sys/net/bpf_buffer.c index ec6aed74..d42df1b0 100644 --- a/freebsd/sys/net/bpf_buffer.c +++ b/freebsd/sys/net/bpf_buffer.c @@ -81,8 +81,6 @@ __FBSDID("$FreeBSD$"); #include <net/bpf_buffer.h> #include <net/bpfdesc.h> -#define PRINET 26 /* interruptible */ - /* * Implement historical kernel memory buffering model for BPF: two malloc(9) * kernel 
buffers are hung off of the descriptor. The size is fixed prior to @@ -193,9 +191,6 @@ bpf_buffer_ioctl_sblen(struct bpf_d *d, u_int *i) return (EINVAL); } - while (d->bd_hbuf_in_use) - mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, - PRINET, "bd_hbuf", 0); /* Free old buffers if set */ if (d->bd_fbuf != NULL) free(d->bd_fbuf, M_BPF); diff --git a/freebsd/sys/net/bpf_filter.c b/freebsd/sys/net/bpf_filter.c index a313f4bd..941fa290 100644 --- a/freebsd/sys/net/bpf_filter.c +++ b/freebsd/sys/net/bpf_filter.c @@ -41,6 +41,9 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/sys/param.h> +#if !defined(_KERNEL) +#include <strings.h> +#endif #if !defined(_KERNEL) || defined(sun) #include <netinet/in.h> #endif @@ -98,7 +101,7 @@ m_xword(struct mbuf *m, bpf_u_int32 k, int *err) while (k >= len) { k -= len; m = m->m_next; - if (m == 0) + if (m == NULL) goto bad; len = m->m_len; } @@ -108,7 +111,7 @@ m_xword(struct mbuf *m, bpf_u_int32 k, int *err) return (EXTRACT_LONG(cp)); } m0 = m->m_next; - if (m0 == 0 || m0->m_len + len - k < 4) + if (m0 == NULL || m0->m_len + len - k < 4) goto bad; *err = 0; np = mtod(m0, u_char *); @@ -147,7 +150,7 @@ m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err) while (k >= len) { k -= len; m = m->m_next; - if (m == 0) + if (m == NULL) goto bad; len = m->m_len; } @@ -157,7 +160,7 @@ m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err) return (EXTRACT_SHORT(cp)); } m0 = m->m_next; - if (m0 == 0) + if (m0 == NULL) goto bad; *err = 0; return ((cp[0] << 8) | mtod(m0, u_char *)[0]); diff --git a/freebsd/sys/net/bridgestp.c b/freebsd/sys/net/bridgestp.c index 167bc59f..5fea7ae7 100644 --- a/freebsd/sys/net/bridgestp.c +++ b/freebsd/sys/net/bridgestp.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include <sys/socket.h> #include <sys/sockio.h> #include <sys/kernel.h> +#include <sys/malloc.h> #include <sys/callout.h> #include <sys/module.h> #include <sys/proc.h> @@ -53,6 +54,7 @@ __FBSDID("$FreeBSD$"); #include <sys/taskqueue.h> #include <net/if.h> +#include 
<net/if_var.h> #include <net/if_dl.h> #include <net/if_types.h> #include <net/if_llc.h> @@ -236,7 +238,7 @@ bstp_transmit_tcn(struct bstp_state *bs, struct bstp_port *bp) if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; - MGETHDR(m, M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; @@ -350,7 +352,7 @@ bstp_send_bpdu(struct bstp_state *bs, struct bstp_port *bp, if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; - MGETHDR(m, M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; @@ -789,7 +791,7 @@ bstp_assign_roles(struct bstp_state *bs) bs->bs_root_htime = bs->bs_bridge_htime; bs->bs_root_port = NULL; - /* check if any recieved info supersedes us */ + /* check if any received info supersedes us */ LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { if (bp->bp_infois != BSTP_INFO_RECEIVED) continue; diff --git a/freebsd/sys/net/ethernet.h b/freebsd/sys/net/ethernet.h index ae7341ee..bc5fa9cb 100644 --- a/freebsd/sys/net/ethernet.h +++ b/freebsd/sys/net/ethernet.h @@ -71,6 +71,28 @@ struct ether_addr { } __packed; #define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */ +#define ETHER_IS_BROADCAST(addr) \ + (((addr)[0] & (addr)[1] & (addr)[2] & \ + (addr)[3] & (addr)[4] & (addr)[5]) == 0xff) + +/* + * 802.1q Virtual LAN header. + */ +struct ether_vlan_header { + uint8_t evl_dhost[ETHER_ADDR_LEN]; + uint8_t evl_shost[ETHER_ADDR_LEN]; + uint16_t evl_encap_proto; + uint16_t evl_tag; + uint16_t evl_proto; +} __packed; + +#define EVL_VLID_MASK 0x0FFF +#define EVL_PRI_MASK 0xE000 +#define EVL_VLANOFTAG(tag) ((tag) & EVL_VLID_MASK) +#define EVL_PRIOFTAG(tag) (((tag) >> 13) & 7) +#define EVL_CFIOFTAG(tag) (((tag) >> 12) & 1) +#define EVL_MAKETAG(vlid, pri, cfi) \ + ((((((pri) & 7) << 1) | ((cfi) & 1)) << 12) | ((vlid) & EVL_VLID_MASK)) /* * NOTE: 0x0000-0x05DC (0..1500) are generally IEEE 802.3 length fields. 
@@ -314,6 +336,7 @@ struct ether_addr { #define ETHERTYPE_SLOW 0x8809 /* 802.3ad link aggregation (LACP) */ #define ETHERTYPE_PPP 0x880B /* PPP (obsolete by PPPoE) */ #define ETHERTYPE_HITACHI 0x8820 /* Hitachi Cable (Optoelectronic Systems Laboratory) */ +#define ETHERTYPE_TEST 0x8822 /* Network Conformance Testing */ #define ETHERTYPE_MPLS 0x8847 /* MPLS Unicast */ #define ETHERTYPE_MPLS_MCAST 0x8848 /* MPLS Multicast */ #define ETHERTYPE_AXIS 0x8856 /* Axis Communications AB proprietary bootstrap/config */ @@ -375,8 +398,8 @@ extern void ether_demux(struct ifnet *, struct mbuf *); extern void ether_ifattach(struct ifnet *, const u_int8_t *); extern void ether_ifdetach(struct ifnet *); extern int ether_ioctl(struct ifnet *, u_long, caddr_t); -extern int ether_output(struct ifnet *, - struct mbuf *, struct sockaddr *, struct route *); +extern int ether_output(struct ifnet *, struct mbuf *, + const struct sockaddr *, struct route *); extern int ether_output_frame(struct ifnet *, struct mbuf *); extern char *ether_sprintf(const u_int8_t *); void ether_vlan_mtap(struct bpf_if *, struct mbuf *, diff --git a/freebsd/sys/net/flowtable.h b/freebsd/sys/net/flowtable.h index d810fa33..5a1d9273 100644 --- a/freebsd/sys/net/flowtable.h +++ b/freebsd/sys/net/flowtable.h @@ -1,83 +1,56 @@ -/************************************************************************** - -Copyright (c) 2008-2010, BitGravity Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the BitGravity Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -$FreeBSD$ - -***************************************************************************/ +/*- + * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org> + * Copyright (c) 2008-2010, BitGravity Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the BitGravity Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ #ifndef _NET_FLOWTABLE_H_ #define _NET_FLOWTABLE_H_ -#ifdef _KERNEL - -#define FL_HASH_ALL (1<<0) /* hash 4-tuple + protocol */ -#define FL_PCPU (1<<1) /* pcpu cache */ -#define FL_NOAUTO (1<<2) /* don't automatically add flentry on miss */ -#define FL_IPV6 (1<<9) - -#define FL_TCP (1<<11) -#define FL_SCTP (1<<12) -#define FL_UDP (1<<13) -#define FL_DEBUG (1<<14) -#define FL_DEBUG_ALL (1<<15) - -struct flowtable; -struct flentry; -struct route; -struct route_in6; +struct flowtable_stat { + uint64_t ft_collisions; + uint64_t ft_misses; + uint64_t ft_free_checks; + uint64_t ft_frees; + uint64_t ft_hits; + uint64_t ft_lookups; + uint64_t ft_fail_lle_invalid; + uint64_t ft_inserts; +}; -VNET_DECLARE(struct flowtable *, ip_ft); -#define V_ip_ft VNET(ip_ft) - -VNET_DECLARE(struct flowtable *, ip6_ft); -#define V_ip6_ft VNET(ip6_ft) - -struct flowtable *flowtable_alloc(char *name, int nentry, int flags); +#ifdef _KERNEL /* - * Given a flow table, look up the L3 and L2 information and - * return it in the route. - * + * Given a flow table, look up the L3 and L2 information + * and return it in the route. 
*/ -struct flentry *flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af); - -struct flentry *flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa, - struct sockaddr_storage *dsa, uint32_t fibnum, int flags); - -int kern_flowtable_insert(struct flowtable *ft, struct sockaddr_storage *ssa, - struct sockaddr_storage *dsa, struct route *ro, uint32_t fibnum, int flags); - -void flow_invalidate(struct flentry *fl); -void flowtable_route_flush(struct flowtable *ft, struct rtentry *rt); - -void flow_to_route(struct flentry *fl, struct route *ro); - -void flow_to_route_in6(struct flentry *fl, struct route_in6 *ro); - +int flowtable_lookup(sa_family_t, struct mbuf *, struct route *); +void flowtable_route_flush(sa_family_t, struct rtentry *); #endif /* _KERNEL */ -#endif +#endif /* !_NET_FLOWTABLE_H_ */ diff --git a/freebsd/sys/net/ieee8023ad_lacp.c b/freebsd/sys/net/ieee8023ad_lacp.c index 5172ad54..619db8af 100644 --- a/freebsd/sys/net/ieee8023ad_lacp.c +++ b/freebsd/sys/net/ieee8023ad_lacp.c @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/sys/param.h> #include <sys/callout.h> +#include <sys/eventhandler.h> #include <sys/mbuf.h> #include <sys/systm.h> #include <sys/malloc.h> @@ -44,8 +45,10 @@ __FBSDID("$FreeBSD$"); #include <machine/stdarg.h> #include <rtems/bsd/sys/lock.h> #include <sys/rwlock.h> +#include <sys/taskqueue.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_dl.h> #include <net/ethernet.h> #include <net/if_media.h> @@ -189,30 +192,37 @@ static const char *lacp_format_portid(const struct lacp_portid *, char *, static void lacp_dprintf(const struct lacp_port *, const char *, ...) 
__attribute__((__format__(__printf__, 2, 3))); -static int lacp_debug = 0; -SYSCTL_INT(_net, OID_AUTO, lacp_debug, CTLFLAG_RW | CTLFLAG_TUN, - &lacp_debug, 0, "Enable LACP debug logging (1=debug, 2=trace)"); -TUNABLE_INT("net.lacp_debug", &lacp_debug); +static VNET_DEFINE(int, lacp_debug); +#define V_lacp_debug VNET(lacp_debug) +SYSCTL_NODE(_net_link_lagg, OID_AUTO, lacp, CTLFLAG_RD, 0, "ieee802.3ad"); +SYSCTL_INT(_net_link_lagg_lacp, OID_AUTO, debug, CTLFLAG_RWTUN | CTLFLAG_VNET, + &VNET_NAME(lacp_debug), 0, "Enable LACP debug logging (1=debug, 2=trace)"); -#define LACP_DPRINTF(a) if (lacp_debug > 0) { lacp_dprintf a ; } -#define LACP_TRACE(a) if (lacp_debug > 1) { lacp_dprintf(a,"%s\n",__func__); } +static VNET_DEFINE(int, lacp_default_strict_mode) = 1; +SYSCTL_INT(_net_link_lagg_lacp, OID_AUTO, default_strict_mode, CTLFLAG_RWTUN, + &VNET_NAME(lacp_default_strict_mode), 0, + "LACP strict protocol compliance default"); + +#define LACP_DPRINTF(a) if (V_lacp_debug & 0x01) { lacp_dprintf a ; } +#define LACP_TRACE(a) if (V_lacp_debug & 0x02) { lacp_dprintf(a,"%s\n",__func__); } +#define LACP_TPRINTF(a) if (V_lacp_debug & 0x04) { lacp_dprintf a ; } /* * partner administration variables. * XXX should be configurable. 
*/ -static const struct lacp_peerinfo lacp_partner_admin = { +static const struct lacp_peerinfo lacp_partner_admin_optimistic = { .lip_systemid = { .lsi_prio = 0xffff }, .lip_portid = { .lpi_prio = 0xffff }, -#if 1 - /* optimistic */ .lip_state = LACP_STATE_SYNC | LACP_STATE_AGGREGATION | LACP_STATE_COLLECTING | LACP_STATE_DISTRIBUTING, -#else - /* pessimistic */ +}; + +static const struct lacp_peerinfo lacp_partner_admin_strict = { + .lip_systemid = { .lsi_prio = 0xffff }, + .lip_portid = { .lpi_prio = 0xffff }, .lip_state = 0, -#endif }; static const lacp_timer_func_t lacp_timer_funcs[LACP_NTIMER] = { @@ -298,11 +308,16 @@ lacp_pdu_input(struct lacp_port *lp, struct mbuf *m) goto bad; } - if (lacp_debug > 0) { + if (V_lacp_debug > 0) { lacp_dprintf(lp, "lacpdu receive\n"); lacp_dump_lacpdu(du); } + if ((1 << lp->lp_ifp->if_dunit) & lp->lp_lsc->lsc_debug.lsc_rx_test) { + LACP_TPRINTF((lp, "Dropping RX PDU\n")); + goto bad; + } + LACP_LOCK(lsc); lacp_sm_rx(lp, du); LACP_UNLOCK(lsc); @@ -350,7 +365,7 @@ lacp_xmit_lacpdu(struct lacp_port *lp) LACP_LOCK_ASSERT(lp->lp_lsc); - m = m_gethdr(M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { return (ENOMEM); } @@ -378,7 +393,7 @@ lacp_xmit_lacpdu(struct lacp_port *lp) sizeof(du->ldu_collector)); du->ldu_collector.lci_maxdelay = 0; - if (lacp_debug > 0) { + if (V_lacp_debug > 0) { lacp_dprintf(lp, "lacpdu transmit\n"); lacp_dump_lacpdu(du); } @@ -404,7 +419,7 @@ lacp_xmit_marker(struct lacp_port *lp) LACP_LOCK_ASSERT(lp->lp_lsc); - m = m_gethdr(M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { return (ENOMEM); } @@ -490,12 +505,14 @@ lacp_tick(void *arg) if ((lp->lp_state & LACP_STATE_AGGREGATION) == 0) continue; + CURVNET_SET(lp->lp_ifp->if_vnet); lacp_run_timers(lp); lacp_select(lp); lacp_sm_mux(lp); lacp_sm_tx(lp); lacp_sm_ptx_tx_schedule(lp); + CURVNET_RESTORE(); } callout_reset(&lsc->lsc_callout, hz, lacp_tick, lsc); } @@ -512,20 +529,17 @@ lacp_port_create(struct 
lagg_port *lgp) int error; boolean_t active = TRUE; /* XXX should be configurable */ - boolean_t fast = FALSE; /* XXX should be configurable */ + boolean_t fast = FALSE; /* Configurable via ioctl */ - bzero((char *)&sdl, sizeof(sdl)); - sdl.sdl_len = sizeof(sdl); - sdl.sdl_family = AF_LINK; - sdl.sdl_index = ifp->if_index; - sdl.sdl_type = IFT_ETHER; + link_init_sdl(ifp, (struct sockaddr *)&sdl, IFT_ETHER); sdl.sdl_alen = ETHER_ADDR_LEN; bcopy(ðermulticastaddr_slowprotocols, LLADDR(&sdl), ETHER_ADDR_LEN); error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma); if (error) { - printf("%s: ADDMULTI failed on %s\n", __func__, lgp->lp_ifname); + printf("%s: ADDMULTI failed on %s\n", __func__, + lgp->lp_ifp->if_xname); return (error); } @@ -535,7 +549,7 @@ lacp_port_create(struct lagg_port *lgp) return (ENOMEM); LACP_LOCK(lsc); - lgp->lp_psc = (caddr_t)lp; + lgp->lp_psc = lp; lp->lp_ifp = ifp; lp->lp_lagg = lgp; lp->lp_lsc = lsc; @@ -572,17 +586,18 @@ lacp_port_destroy(struct lagg_port *lgp) lacp_disable_distributing(lp); lacp_unselect(lp); + LIST_REMOVE(lp, lp_next); + LACP_UNLOCK(lsc); + /* The address may have already been removed by if_purgemaddrs() */ if (!lgp->lp_detaching) if_delmulti_ifma(lp->lp_ifma); - LIST_REMOVE(lp, lp_next); - LACP_UNLOCK(lsc); free(lp, M_DEVBUF); } void -lacp_req(struct lagg_softc *sc, caddr_t data) +lacp_req(struct lagg_softc *sc, void *data) { struct lacp_opreq *req = (struct lacp_opreq *)data; struct lacp_softc *lsc = LACP_SOFTC(sc); @@ -590,7 +605,7 @@ lacp_req(struct lagg_softc *sc, caddr_t data) bzero(req, sizeof(struct lacp_opreq)); - /* + /* * If the LACP softc is NULL, return with the opreq structure full of * zeros. It is normal for the softc to be NULL while the lagg is * being destroyed. 
@@ -621,7 +636,7 @@ lacp_req(struct lagg_softc *sc, caddr_t data) } void -lacp_portreq(struct lagg_port *lgp, caddr_t data) +lacp_portreq(struct lagg_port *lgp, void *data) { struct lacp_opreq *req = (struct lacp_opreq *)data; struct lacp_port *lp = LACP_PORT(lgp); @@ -665,6 +680,7 @@ lacp_disable_distributing(struct lacp_port *lp) { struct lacp_aggregator *la = lp->lp_aggregator; struct lacp_softc *lsc = lp->lp_lsc; + struct lagg_softc *sc = lsc->lsc_softc; char buf[LACP_LAGIDSTR_MAX+1]; LACP_LOCK_ASSERT(lsc); @@ -684,6 +700,7 @@ lacp_disable_distributing(struct lacp_port *lp) TAILQ_REMOVE(&la->la_ports, lp, lp_dist_q); la->la_nports--; + sc->sc_active = la->la_nports; if (lsc->lsc_active_aggregator == la) { lacp_suppress_distributing(lsc, la); @@ -700,6 +717,7 @@ lacp_enable_distributing(struct lacp_port *lp) { struct lacp_aggregator *la = lp->lp_aggregator; struct lacp_softc *lsc = lp->lp_lsc; + struct lagg_softc *sc = lsc->lsc_softc; char buf[LACP_LAGIDSTR_MAX+1]; LACP_LOCK_ASSERT(lsc); @@ -716,6 +734,7 @@ lacp_enable_distributing(struct lacp_port *lp) KASSERT(la->la_refcnt > la->la_nports, ("aggregator refcnt invalid")); TAILQ_INSERT_HEAD(&la->la_ports, lp, lp_dist_q); la->la_nports++; + sc->sc_active = la->la_nports; lp->lp_state |= LACP_STATE_DISTRIBUTING; @@ -734,26 +753,26 @@ lacp_transit_expire(void *vp) LACP_LOCK_ASSERT(lsc); + CURVNET_SET(lsc->lsc_softc->sc_ifp->if_vnet); LACP_TRACE(NULL); + CURVNET_RESTORE(); lsc->lsc_suppress_distributing = FALSE; } -int +void lacp_attach(struct lagg_softc *sc) { struct lacp_softc *lsc; - lsc = malloc(sizeof(struct lacp_softc), - M_DEVBUF, M_NOWAIT|M_ZERO); - if (lsc == NULL) - return (ENOMEM); + lsc = malloc(sizeof(struct lacp_softc), M_DEVBUF, M_WAITOK | M_ZERO); - sc->sc_psc = (caddr_t)lsc; + sc->sc_psc = lsc; lsc->lsc_softc = sc; - lsc->lsc_hashkey = arc4random(); + lsc->lsc_hashkey = m_ether_tcpip_hash_init(); lsc->lsc_active_aggregator = NULL; + lsc->lsc_strict_mode = VNET(lacp_default_strict_mode); 
LACP_LOCK_INIT(lsc); TAILQ_INIT(&lsc->lsc_aggregators); LIST_INIT(&lsc->lsc_ports); @@ -764,27 +783,23 @@ lacp_attach(struct lagg_softc *sc) /* if the lagg is already up then do the same */ if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) lacp_init(sc); - - return (0); } -int -lacp_detach(struct lagg_softc *sc) +void +lacp_detach(void *psc) { - struct lacp_softc *lsc = LACP_SOFTC(sc); + struct lacp_softc *lsc = (struct lacp_softc *)psc; KASSERT(TAILQ_EMPTY(&lsc->lsc_aggregators), ("aggregators still active")); KASSERT(lsc->lsc_active_aggregator == NULL, ("aggregator still attached")); - sc->sc_psc = NULL; callout_drain(&lsc->lsc_transit_callout); callout_drain(&lsc->lsc_callout); LACP_LOCK_DESTROY(lsc); free(lsc, M_DEVBUF); - return (0); } void @@ -827,10 +842,11 @@ lacp_select_tx_port(struct lagg_softc *sc, struct mbuf *m) return (NULL); } - if (sc->use_flowid && (m->m_flags & M_FLOWID)) - hash = m->m_pkthdr.flowid; + if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && + M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) + hash = m->m_pkthdr.flowid >> sc->flowid_shift; else - hash = lagg_hashmbuf(sc, m, lsc->lsc_hashkey); + hash = m_ether_tcpip_hash(sc->sc_flags, m, lsc->lsc_hashkey); hash %= pm->pm_count; lp = pm->pm_map[hash]; @@ -920,7 +936,6 @@ lacp_aggregator_bandwidth(struct lacp_aggregator *la) static void lacp_select_active_aggregator(struct lacp_softc *lsc) { - struct lagg_softc *sc = lsc->lsc_softc; struct lacp_aggregator *la; struct lacp_aggregator *best_la = NULL; uint64_t best_speed = 0; @@ -940,13 +955,13 @@ lacp_select_active_aggregator(struct lacp_softc *lsc) lacp_format_lagid_aggregator(la, buf, sizeof(buf)), speed, la->la_nports)); - /* This aggregator is chosen if - * the partner has a better system priority - * or, the total aggregated speed is higher - * or, it is already the chosen aggregator + /* + * This aggregator is chosen if the partner has a better + * system priority or, the total aggregated speed is higher + * or, it is already the chosen aggregator */ if 
((best_la != NULL && LACP_SYS_PRI(la->la_partner) < - LACP_SYS_PRI(best_la->la_partner)) || + LACP_SYS_PRI(best_la->la_partner)) || speed > best_speed || (speed == best_speed && la == lsc->lsc_active_aggregator)) { @@ -972,7 +987,6 @@ lacp_select_active_aggregator(struct lacp_softc *lsc) lacp_format_lagid_aggregator(best_la, buf, sizeof(buf)))); if (lsc->lsc_active_aggregator != best_la) { - sc->sc_ifp->if_baudrate = best_speed; lsc->lsc_active_aggregator = best_la; lacp_update_portmap(lsc); if (best_la) { @@ -988,15 +1002,18 @@ lacp_select_active_aggregator(struct lacp_softc *lsc) static void lacp_update_portmap(struct lacp_softc *lsc) { + struct lagg_softc *sc = lsc->lsc_softc; struct lacp_aggregator *la; struct lacp_portmap *p; struct lacp_port *lp; + uint64_t speed; u_int newmap; int i; newmap = lsc->lsc_activemap == 0 ? 1 : 0; p = &lsc->lsc_pmap[newmap]; la = lsc->lsc_active_aggregator; + speed = 0; bzero(p, sizeof(struct lacp_portmap)); if (la != NULL && la->la_nports > 0) { @@ -1005,7 +1022,9 @@ lacp_update_portmap(struct lacp_softc *lsc) TAILQ_FOREACH(lp, &la->la_ports, lp_dist_q) p->pm_map[i++] = lp; KASSERT(i == p->pm_count, ("Invalid port count")); + speed = lacp_aggregator_bandwidth(la); } + sc->sc_ifp->if_baudrate = speed; /* switch the active portmap over */ atomic_store_rel_int(&lsc->lsc_activemap, newmap); @@ -1054,12 +1073,16 @@ lacp_compose_key(struct lacp_port *lp) case IFM_100_T4: case IFM_100_VG: case IFM_100_T2: + case IFM_100_T: key = IFM_100_TX; break; case IFM_1000_SX: case IFM_1000_LX: case IFM_1000_CX: case IFM_1000_T: + case IFM_1000_KX: + case IFM_1000_SGMII: + case IFM_1000_CX_SGMII: key = IFM_1000_SX; break; case IFM_10G_LR: @@ -1069,15 +1092,53 @@ lacp_compose_key(struct lacp_port *lp) case IFM_10G_TWINAX_LONG: case IFM_10G_LRM: case IFM_10G_T: + case IFM_10G_KX4: + case IFM_10G_KR: + case IFM_10G_CR1: + case IFM_10G_ER: + case IFM_10G_SFI: key = IFM_10G_LR; break; + case IFM_20G_KR2: + key = IFM_20G_KR2; + break; + case IFM_2500_KX: 
+ case IFM_2500_T: + key = IFM_2500_KX; + break; + case IFM_5000_T: + key = IFM_5000_T; + break; + case IFM_50G_PCIE: + case IFM_50G_CR2: + case IFM_50G_KR2: + key = IFM_50G_PCIE; + break; + case IFM_56G_R4: + key = IFM_56G_R4; + break; + case IFM_25G_PCIE: + case IFM_25G_CR: + case IFM_25G_KR: + case IFM_25G_SR: + key = IFM_25G_PCIE; + break; case IFM_40G_CR4: case IFM_40G_SR4: case IFM_40G_LR4: + case IFM_40G_XLPPI: + case IFM_40G_KR4: key = IFM_40G_CR4; break; + case IFM_100G_CR4: + case IFM_100G_SR4: + case IFM_100G_KR4: + case IFM_100G_LR4: + key = IFM_100G_CR4; + break; default: key = subtype; + break; } /* bit 5..14: (some bits of) if_index of lagg device */ key |= 0x7fe0 & ((sc->sc_ifp->if_index) << 5); @@ -1313,6 +1374,8 @@ lacp_unselect(struct lacp_port *lp) static void lacp_sm_mux(struct lacp_port *lp) { + struct lagg_port *lgp = lp->lp_lagg; + struct lagg_softc *sc = lgp->lp_softc; enum lacp_mux_state new_state; boolean_t p_sync = (lp->lp_partner.lip_state & LACP_STATE_SYNC) != 0; @@ -1321,8 +1384,10 @@ lacp_sm_mux(struct lacp_port *lp) enum lacp_selected selected = lp->lp_selected; struct lacp_aggregator *la; - if (lacp_debug > 1) - lacp_dprintf(lp, "%s: state %d\n", __func__, lp->lp_mux_state); + if (V_lacp_debug > 1) + lacp_dprintf(lp, "%s: state= 0x%x, selected= 0x%x, " + "p_sync= 0x%x, p_collecting= 0x%x\n", __func__, + lp->lp_mux_state, selected, p_sync, p_collecting); re_eval: la = lp->lp_aggregator; @@ -1362,6 +1427,8 @@ re_eval: case LACP_MUX_DISTRIBUTING: if (selected != LACP_SELECTED || !p_sync || !p_collecting) { new_state = LACP_MUX_COLLECTING; + lacp_dprintf(lp, "Interface stopped DISTRIBUTING, possible flapping\n"); + sc->sc_flapping++; } break; default: @@ -1610,6 +1677,10 @@ lacp_sm_rx_record_pdu(struct lacp_port *lp, const struct lacpdu *du) sizeof(buf)))); } + /* XXX Hack, still need to implement 5.4.9 para 2,3,4 */ + if (lp->lp_lsc->lsc_strict_mode) + lp->lp_partner.lip_state |= LACP_STATE_SYNC; + lacp_sm_ptx_update_timeout(lp, 
oldpstate); } @@ -1635,7 +1706,10 @@ lacp_sm_rx_record_default(struct lacp_port *lp) LACP_TRACE(lp); oldpstate = lp->lp_partner.lip_state; - lp->lp_partner = lacp_partner_admin; + if (lp->lp_lsc->lsc_strict_mode) + lp->lp_partner = lacp_partner_admin_strict; + else + lp->lp_partner = lacp_partner_admin_optimistic; lp->lp_state |= LACP_STATE_DEFAULTED; lacp_sm_ptx_update_timeout(lp, oldpstate); } @@ -1670,7 +1744,12 @@ lacp_sm_rx_update_default_selected(struct lacp_port *lp) LACP_TRACE(lp); - lacp_sm_rx_update_selected_from_peerinfo(lp, &lacp_partner_admin); + if (lp->lp_lsc->lsc_strict_mode) + lacp_sm_rx_update_selected_from_peerinfo(lp, + &lacp_partner_admin_strict); + else + lacp_sm_rx_update_selected_from_peerinfo(lp, + &lacp_partner_admin_optimistic); } /* transmit machine */ @@ -1678,7 +1757,7 @@ lacp_sm_rx_update_default_selected(struct lacp_port *lp) static void lacp_sm_tx(struct lacp_port *lp) { - int error; + int error = 0; if (!(lp->lp_state & LACP_STATE_AGGREGATION) #if 1 @@ -1700,7 +1779,11 @@ lacp_sm_tx(struct lacp_port *lp) return; } - error = lacp_xmit_lacpdu(lp); + if (((1 << lp->lp_ifp->if_dunit) & lp->lp_lsc->lsc_debug.lsc_tx_test) == 0) { + error = lacp_xmit_lacpdu(lp); + } else { + LACP_TPRINTF((lp, "Dropping TX PDU\n")); + } if (error == 0) { lp->lp_flags &= ~LACP_PORT_NTT; diff --git a/freebsd/sys/net/ieee8023ad_lacp.h b/freebsd/sys/net/ieee8023ad_lacp.h index 9cebc591..8f0f51a7 100644 --- a/freebsd/sys/net/ieee8023ad_lacp.h +++ b/freebsd/sys/net/ieee8023ad_lacp.h @@ -75,6 +75,7 @@ "\007DEFAULTED" \ "\010EXPIRED" +#ifdef _KERNEL /* * IEEE802.3 slow protocols * @@ -245,6 +246,12 @@ struct lacp_softc { struct lacp_portmap lsc_pmap[2]; volatile u_int lsc_activemap; u_int32_t lsc_hashkey; + struct { + u_int32_t lsc_rx_test; + u_int32_t lsc_tx_test; + } lsc_debug; + u_int32_t lsc_strict_mode; + boolean_t lsc_fast_timeout; /* if set, fast timeout */ }; #define LACP_TYPE_ACTORINFO 1 @@ -277,15 +284,15 @@ struct lacp_softc { struct mbuf 
*lacp_input(struct lagg_port *, struct mbuf *); struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *); -int lacp_attach(struct lagg_softc *); -int lacp_detach(struct lagg_softc *); +void lacp_attach(struct lagg_softc *); +void lacp_detach(void *); void lacp_init(struct lagg_softc *); void lacp_stop(struct lagg_softc *); int lacp_port_create(struct lagg_port *); void lacp_port_destroy(struct lagg_port *); void lacp_linkstate(struct lagg_port *); -void lacp_req(struct lagg_softc *, caddr_t); -void lacp_portreq(struct lagg_port *, caddr_t); +void lacp_req(struct lagg_softc *, void *); +void lacp_portreq(struct lagg_port *, void *); static __inline int lacp_isactive(struct lagg_port *lgp) @@ -331,3 +338,4 @@ lacp_isdistributing(struct lagg_port *lgp) #define LACP_LAGIDSTR_MAX \ (1 + LACP_PARTNERSTR_MAX + 1 + LACP_PARTNERSTR_MAX + 1) #define LACP_STATESTR_MAX (255) /* XXX */ +#endif /* _KERNEL */ diff --git a/freebsd/sys/net/if.c b/freebsd/sys/net/if.c index 2c638a37..8bfa9e21 100644 --- a/freebsd/sys/net/if.c +++ b/freebsd/sys/net/if.c @@ -65,12 +65,16 @@ #include <machine/stdarg.h> #include <vm/uma.h> +#include <net/bpf.h> +#include <net/ethernet.h> #include <net/if.h> #include <net/if_arp.h> #include <net/if_clone.h> #include <net/if_dl.h> #include <net/if_types.h> #include <net/if_var.h> +#include <net/if_media.h> +#include <net/if_vlan_var.h> #include <net/radix.h> #include <net/route.h> #include <net/vnet.h> @@ -97,14 +101,9 @@ #include <compat/freebsd32/freebsd32.h> #endif -struct ifindex_entry { - struct ifnet *ife_ifnet; -}; - SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); -TUNABLE_INT("net.link.ifqmaxlen", &ifqmaxlen); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); @@ -115,6 +114,13 @@ SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link 
state change events"); +/* Log promiscuous mode change events */ +static int log_promisc_mode_change = 1; + +SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN, + &log_promisc_mode_change, 1, + "log promiscuous mode change events"); + /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, @@ -132,18 +138,22 @@ void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); +void (*carp_demote_adj_p)(int, char *); +int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) -struct ifnet *(*carp_forus_p)(struct ifnet *ifp, u_char *dhost); +int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *sa, struct rtentry *rt); + const struct sockaddr *sa); +int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); +int (*carp_attach_p)(struct ifaddr *, int); +void (*carp_detach_p)(struct ifaddr *); #endif #ifdef INET -int (*carp_iamatch_p)(struct ifnet *, struct in_ifaddr *, struct in_addr *, - u_int8_t **); +int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); -caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, +caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif @@ -158,23 +168,25 @@ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); -static void if_init(void *); static void if_grow(void); static void if_input_default(struct ifnet *, struct mbuf *); +static int if_requestencap_default(struct ifnet *, struct if_encap_req *); static void if_route(struct ifnet *, int flag, int fam); 
static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); -static int if_rtdel(struct radix_node *, void *); static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); -static void if_attach_internal(struct ifnet *, int); -static void if_detach_internal(struct ifnet *, int); +static void if_attach_internal(struct ifnet *, int, struct if_clone *); +static int if_detach_internal(struct ifnet *, int, struct if_clone **); +#ifdef VIMAGE +static void if_vmove(struct ifnet *, struct vnet *); +#endif #ifdef INET6 /* @@ -184,6 +196,10 @@ static void if_detach_internal(struct ifnet *, int); extern void nd6_setmtu(struct ifnet *); #endif +/* ipsec helper hooks */ +VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); +VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); + VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ @@ -192,7 +208,7 @@ VNET_DEFINE(struct ifgrouphead, ifg_head); static VNET_DEFINE(int, if_indexlim) = 8; /* Table of ifnet by index. */ -VNET_DEFINE(struct ifindex_entry *, ifindex_table); +VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) @@ -207,7 +223,9 @@ VNET_DEFINE(struct ifindex_entry *, ifindex_table); * inversions and deadlocks. 
*/ struct rwlock ifnet_rwlock; +RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE); struct sx ifnet_sxlock; +SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); /* * The allocation of network interfaces is a rather non-atomic affair; we @@ -229,9 +247,9 @@ ifnet_byindex_locked(u_short idx) if (idx > V_if_index) return (NULL); - if (V_ifindex_table[idx].ife_ifnet == IFNET_HOLD) + if (V_ifindex_table[idx] == IFNET_HOLD) return (NULL); - return (V_ifindex_table[idx].ife_ifnet); + return (V_ifindex_table[idx]); } struct ifnet * @@ -265,34 +283,30 @@ ifnet_byindex_ref(u_short idx) * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ -static int -ifindex_alloc_locked(u_short *idxp) +static u_short +ifindex_alloc(void) { u_short idx; IFNET_WLOCK_ASSERT(); - retry: /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { - if (V_ifindex_table[idx].ife_ifnet == NULL) + if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. 
*/ - if (idx < 1) - return (ENOSPC); if (idx >= V_if_indexlim) { if_grow(); goto retry; } if (idx > V_if_index) V_if_index = idx; - *idxp = idx; - return (0); + return (idx); } static void @@ -301,9 +315,9 @@ ifindex_free_locked(u_short idx) IFNET_WLOCK_ASSERT(); - V_ifindex_table[idx].ife_ifnet = NULL; + V_ifindex_table[idx] = NULL; while (V_if_index > 0 && - V_ifindex_table[V_if_index].ife_ifnet == NULL) + V_ifindex_table[V_if_index] == NULL) V_if_index--; } @@ -322,7 +336,7 @@ ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp) IFNET_WLOCK_ASSERT(); - V_ifindex_table[idx].ife_ifnet = ifp; + V_ifindex_table[idx] = ifp; } static void @@ -337,11 +351,12 @@ ifnet_setbyindex(u_short idx, struct ifnet *ifp) struct ifaddr * ifaddr_byindex(u_short idx) { - struct ifaddr *ifa; + struct ifnet *ifp; + struct ifaddr *ifa = NULL; IFNET_RLOCK_NOSLEEP(); - ifa = ifnet_byindex_locked(idx)->if_addr; - if (ifa != NULL) + ifp = ifnet_byindex_locked(idx); + if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); IFNET_RUNLOCK_NOSLEEP(); return (ifa); @@ -368,17 +383,6 @@ vnet_if_init(const void *unused __unused) VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); -/* ARGSUSED*/ -static void -if_init(void *dummy __unused) -{ - - IFNET_LOCK_INIT(); - if_clone_init(); -} -SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_FIRST, if_init, NULL); - - #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) @@ -393,6 +397,20 @@ vnet_if_uninit(const void *unused __unused) } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); + +static void +vnet_if_return(const void *unused __unused) +{ + struct ifnet *ifp, *nifp; + + /* Return all inherited interfaces to their parent vnets. 
*/ + TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { + if (ifp->if_home_vnet != ifp->if_vnet) + if_vmove(ifp, ifp->if_home_vnet); + } +} +VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, + vnet_if_return, NULL); #endif static void @@ -400,7 +418,7 @@ if_grow(void) { int oldlim; u_int n; - struct ifindex_entry *e; + struct ifnet **e; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; @@ -433,16 +451,15 @@ if_alloc(u_char type) ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); IFNET_WLOCK(); - if (ifindex_alloc_locked(&idx) != 0) { - IFNET_WUNLOCK(); - free(ifp, M_IFNET); - return (NULL); - } + idx = ifindex_alloc(); ifnet_setbyindex_locked(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; +#ifdef VIMAGE + ifp->if_vnet = curvnet; +#endif if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { @@ -457,7 +474,6 @@ if_alloc(u_char type) ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); TAILQ_INIT(&ifp->if_addrhead); - TAILQ_INIT(&ifp->if_prefixhead); TAILQ_INIT(&ifp->if_multiaddrs); TAILQ_INIT(&ifp->if_groups); #ifdef MAC @@ -466,6 +482,9 @@ if_alloc(u_char type) ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ + for (int i = 0; i < IFCOUNTERS; i++) + ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); + ifp->if_get_counter = if_get_counter_default; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } @@ -494,23 +513,20 @@ if_free_internal(struct ifnet *ifp) IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); + + for (int i = 0; i < IFCOUNTERS; i++) + counter_u64_free(ifp->if_counters[i]); + free(ifp, M_IFNET); } /* - * This version should only be called by intefaces that switch their type - * after calling if_alloc(). if_free_type() will go away again now that we - * have if_alloctype to cache the original allocation type. 
For now, assert - * that they match, since we require that in practice. + * Deregister an interface and free the associated storage. */ void -if_free_type(struct ifnet *ifp, u_char type) +if_free(struct ifnet *ifp) { - KASSERT(ifp->if_alloctype == type, - ("if_free_type: type (%d) != alloctype (%d)", type, - ifp->if_alloctype)); - ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); @@ -527,18 +543,6 @@ if_free_type(struct ifnet *ifp, u_char type) } /* - * This is the normal version of if_free(), used by device drivers to free a - * detached network interface. The contents of if_free_type() will move into - * here when if_free_type() goes away. - */ -void -if_free(struct ifnet *ifp) -{ - - if_free_type(ifp, ifp->if_alloctype); -} - -/* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. @@ -583,12 +587,21 @@ ifq_delete(struct ifaltq *ifq) } /* - * Perform generic interface initalization tasks and attach the interface + * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * + * Note that if_detach_internal() removes group membership unconditionally + * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. + * Thus, when if_vmove() is applied to a cloned interface, group membership + * is lost while a cloned one always joins a group whose name is + * ifc->ifc_name. To recover this after if_detach_internal() and + * if_attach_internal(), the cloner should be specified to + * if_attach_internal() via ifc. If it is non-NULL, if_attach_internal() + * attempts to join a group whose name is ifc->ifc_name. 
+ * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. @@ -599,14 +612,14 @@ void if_attach(struct ifnet *ifp) { - if_attach_internal(ifp, 0); + if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void -if_hw_tsomax_common(struct ifnet *ifp, struct ifnet_hw_tsomax *pmax) +if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from @@ -635,7 +648,7 @@ if_hw_tsomax_common(struct ifnet *ifp, struct ifnet_hw_tsomax *pmax) * Returns zero if no change. Else non-zero. */ int -if_hw_tsomax_update(struct ifnet *ifp, struct ifnet_hw_tsomax *pmax) +if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { @@ -654,7 +667,7 @@ if_hw_tsomax_update(struct ifnet *ifp, struct ifnet_hw_tsomax *pmax) } static void -if_attach_internal(struct ifnet *ifp, int vmove) +if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) { unsigned socksize, ifasize; int namelen, masklen; @@ -673,9 +686,12 @@ if_attach_internal(struct ifnet *ifp, int vmove) if_addgroup(ifp, IFG_ALL); + /* Restore group membership for cloned interfaces. 
*/ + if (vmove && ifc != NULL) + if_clone_addgroup(ifp, ifc); + getmicrotime(&ifp->if_lastchange); - ifp->if_data.ifi_epoch = time_uptime; - ifp->if_data.ifi_datalen = sizeof(struct if_data); + ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), @@ -687,6 +703,9 @@ if_attach_internal(struct ifnet *ifp, int vmove) if (ifp->if_input == NULL) ifp->if_input = if_input_default; + if (ifp->if_requestencap == NULL) + ifp->if_requestencap = if_requestencap_default; + if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); @@ -706,8 +725,7 @@ if_attach_internal(struct ifnet *ifp, int vmove) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; - ifa = malloc(ifasize, M_IFADDR, M_WAITOK | M_ZERO); - ifa_init(ifa); + ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; @@ -792,12 +810,9 @@ static void if_attachdomain(void *dummy) { struct ifnet *ifp; - int s; - s = splnet(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); - splx(s); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); @@ -806,23 +821,16 @@ static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; - int s; - - s = splnet(); /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. 
*/ - if (IF_AFDATA_TRYLOCK(ifp) == 0) { - splx(s); - return; - } + IF_AFDATA_LOCK(ifp); if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); - splx(s); - printf("if_attachdomain called more than once on %s\n", - ifp->if_xname); + log(LOG_WARNING, "%s called more than once on %s\n", + __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; @@ -835,8 +843,6 @@ if_attachdomain1(struct ifnet *ifp) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } - - splx(s); } /* @@ -847,6 +853,7 @@ if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa, *next; + /* XXX cannot hold IF_ADDR_WLOCK over called functions. */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family == AF_LINK) continue; @@ -871,7 +878,9 @@ if_purgeaddrs(struct ifnet *ifp) continue; } #endif /* INET6 */ + IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); + IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } @@ -906,20 +915,34 @@ if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); - if_detach_internal(ifp, 0); + if_detach_internal(ifp, 0, NULL); CURVNET_RESTORE(); } -static void -if_detach_internal(struct ifnet *ifp, int vmove) +/* + * The vmove flag, if set, indicates that we are called from a callpath + * that is moving an interface to a different vnet instance. + * + * The shutdown flag, if set, indicates that we are called in the + * process of shutting down a vnet instance. Currently only the + * vnet_if_return SYSUNINIT function sets it. Note: we can be called + * on a vnet instance shutdown without this flag being set, e.g., when + * the cloned interfaces are destoyed as first thing of teardown. 
+ */ +static int +if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) { struct ifaddr *ifa; - struct radix_node_head *rnh; - int i, j; + int i; struct domain *dp; struct ifnet *iter; int found = 0; +#ifdef VIMAGE + int shutdown; + shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && + ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; +#endif IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { @@ -927,28 +950,77 @@ if_detach_internal(struct ifnet *ifp, int vmove) found = 1; break; } -#ifdef VIMAGE - if (found) - curvnet->vnet_ifcnt--; -#endif IFNET_WUNLOCK(); if (!found) { + /* + * While we would want to panic here, we cannot + * guarantee that the interface is indeed still on + * the list given we don't hold locks all the way. + */ + return (ENOENT); +#if 0 if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ +#endif } /* - * Remove/wait for pending events. + * At this point we know the interface still was on the ifnet list + * and we removed it so we are in a stable state. */ +#ifdef VIMAGE + curvnet->vnet_ifcnt--; +#endif + + /* + * In any case (destroy or vmove) detach us from the groups + * and remove/wait for pending events on the taskq. + * XXX-BZ in theory an interface could still enqueue a taskq change? + */ + if_delgroups(ifp); + taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* - * Remove routes and flush queues. + * Check if this is a cloned interface or not. Must do even if + * shutting down as a if_vmove_reclaim() would move the ifp and + * the if_clone_addgroup() will have a corrupted string overwise + * from a gibberish pointer. */ + if (vmove && ifcp != NULL) + *ifcp = if_clone_findifc(ifp); + if_down(ifp); + +#ifdef VIMAGE + /* + * On VNET shutdown abort here as the stack teardown will do all + * the work top-down for us. + */ + if (shutdown) { + /* + * In case of a vmove we are done here without error. 
+ * If we would signal an error it would lead to the same + * abort as if we did not find the ifnet anymore. + * if_detach() calls us in void context and does not care + * about an early abort notification, so life is splendid :) + */ + goto finish_vnet_shutdown; + } +#endif + + /* + * At this point we are not tearing down a VNET and are either + * going to destroy or vmove the interface and have to cleanup + * accordingly. + */ + + /* + * Remove routes and flush queues. + */ #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); @@ -973,6 +1045,12 @@ if_detach_internal(struct ifnet *ifp, int vmove) #endif if_purgemaddrs(ifp); + /* Announce that the interface is gone. */ + rt_ifannouncemsg(ifp, IFAN_DEPARTURE); + EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); + if (IS_DEFAULT_VNET(curvnet)) + devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); + if (!vmove) { /* * Prevent further calls into the device driver via ifnet. @@ -986,37 +1064,21 @@ if_detach_internal(struct ifnet *ifp, int vmove) ifp->if_addr = NULL; /* We can now free link ifaddr. */ + IF_ADDR_WLOCK(ifp); if (!TAILQ_EMPTY(&ifp->if_addrhead)) { ifa = TAILQ_FIRST(&ifp->if_addrhead); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); + IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); - } - } - - /* - * Delete all remaining routes using this interface - * Unfortuneatly the only way to do this is to slog through - * the entire routing table looking for routes which point - * to this interface...oh well... - */ - for (i = 1; i <= AF_MAX; i++) { - for (j = 0; j < rt_numfibs; j++) { - rnh = rt_tables_get_rnh(j, i); - if (rnh == NULL) - continue; - RADIX_NODE_HEAD_LOCK(rnh); - (void) rnh->rnh_walktree(rnh, if_rtdel, ifp); - RADIX_NODE_HEAD_UNLOCK(rnh); - } + } else + IF_ADDR_WUNLOCK(ifp); } - /* Announce that the interface is gone. 
*/ - rt_ifannouncemsg(ifp, IFAN_DEPARTURE); - EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); - if (IS_DEFAULT_VNET(curvnet)) - devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); - if_delgroups(ifp); + rt_flushifroutes(ifp); +#ifdef VIMAGE +finish_vnet_shutdown: +#endif /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the @@ -1027,10 +1089,14 @@ if_detach_internal(struct ifnet *ifp, int vmove) ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { - if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) + if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) { (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); + ifp->if_afdata[dp->dom_family] = NULL; + } } + + return (0); } #ifdef VIMAGE @@ -1041,16 +1107,28 @@ if_detach_internal(struct ifnet *ifp, int vmove) * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ -void +static void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { - u_short idx; + struct if_clone *ifc; + u_int bif_dlt, bif_hdrlen; + int rc; + + /* + * if_detach_internal() will call the eventhandler to notify + * interface departure. That will detach if_bpf. We need to + * safe the dlt and hdrlen so we can re-attach it later. + */ + bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. + * If we cannot find it, we lost the race to someone else. 
*/ - if_detach_internal(ifp, 1); + rc = if_detach_internal(ifp, 1, &ifc); + if (rc != 0) + return; /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink @@ -1076,15 +1154,14 @@ if_vmove(struct ifnet *ifp, struct vnet *new_vnet) CURVNET_SET_QUIET(new_vnet); IFNET_WLOCK(); - if (ifindex_alloc_locked(&idx) != 0) { - IFNET_WUNLOCK(); - panic("if_index overflow"); - } - ifp->if_index = idx; + ifp->if_index = ifindex_alloc(); ifnet_setbyindex_locked(ifp->if_index, ifp); IFNET_WUNLOCK(); - if_attach_internal(ifp, 1); + if_attach_internal(ifp, 1, ifc); + + if (ifp->if_bpf == NULL) + bpfattach(ifp, bif_dlt, bif_hdrlen); CURVNET_RESTORE(); } @@ -1097,6 +1174,7 @@ if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; + int shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); @@ -1117,12 +1195,22 @@ if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); - CURVNET_RESTORE(); if (difp != NULL) { + CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } + /* Make sure the VNET is stable. */ + shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && + ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; + if (shutdown) { + CURVNET_RESTORE(); + prison_free(pr); + return (EBUSY); + } + CURVNET_RESTORE(); + /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); @@ -1139,6 +1227,7 @@ if_vmove_reclaim(struct thread *td, char *ifname, int jid) struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; + int shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); @@ -1166,6 +1255,15 @@ if_vmove_reclaim(struct thread *td, char *ifname, int jid) return (EEXIST); } + /* Make sure the VNET is stable. */ + shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && + ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 
1 : 0; + if (shutdown) { + CURVNET_RESTORE(); + prison_free(pr); + return (EBUSY); + } + /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); @@ -1187,6 +1285,7 @@ if_addgroup(struct ifnet *ifp, const char *groupname) struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; + int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') @@ -1227,8 +1326,8 @@ if_addgroup(struct ifnet *ifp, const char *groupname) strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); - EVENTHANDLER_INVOKE(group_attach_event, ifg); TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); + new = 1; } ifg->ifg_refcnt++; @@ -1242,6 +1341,8 @@ if_addgroup(struct ifnet *ifp, const char *groupname) IFNET_WUNLOCK(); + if (new) + EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); @@ -1280,10 +1381,11 @@ if_delgroup(struct ifnet *ifp, const char *groupname) if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); + IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); - } - IFNET_WUNLOCK(); + } else + IFNET_WUNLOCK(); free(ifgl, M_TEMP); @@ -1324,11 +1426,12 @@ if_delgroups(struct ifnet *ifp) if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); + IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); - } - IFNET_WUNLOCK(); + } else + IFNET_WUNLOCK(); free(ifgl, M_TEMP); @@ -1434,46 +1537,63 @@ if_getgroupmembers(struct ifgroupreq *data) } /* - * Delete Routes for a Network Interface - * - * Called for each routing entry via the rnh->rnh_walktree() call above - * to delete all route entries referencing a detaching network interface. 
- * - * Arguments: - * rn pointer to node in the routing table - * arg argument passed to rnh->rnh_walktree() - detaching interface - * - * Returns: - * 0 successful - * errno failed - reason indicated - * + * Return counter values from counter(9)s stored in ifnet. */ -static int -if_rtdel(struct radix_node *rn, void *arg) +uint64_t +if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { - struct rtentry *rt = (struct rtentry *)rn; - struct ifnet *ifp = arg; - int err; - if (rt->rt_ifp == ifp) { + KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); - /* - * Protect (sorta) against walktree recursion problems - * with cloned routes - */ - if ((rt->rt_flags & RTF_UP) == 0) - return (0); + return (counter_u64_fetch(ifp->if_counters[cnt])); +} - err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway, - rt_mask(rt), - rt->rt_flags|RTF_RNH_LOCKED|RTF_PINNED, - (struct rtentry **) NULL, rt->rt_fibnum); - if (err) { - log(LOG_WARNING, "if_rtdel: error %d\n", err); - } - } +/* + * Increase an ifnet counter. Usually used for counters shared + * between the stack and a driver, but function supports them all. + */ +void +if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) +{ - return (0); + KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); + + counter_u64_add(ifp->if_counters[cnt], inc); +} + +/* + * Copy data from ifnet to userland API structure if_data. 
+ */ +void +if_data_copy(struct ifnet *ifp, struct if_data *ifd) +{ + + ifd->ifi_type = ifp->if_type; + ifd->ifi_physical = 0; + ifd->ifi_addrlen = ifp->if_addrlen; + ifd->ifi_hdrlen = ifp->if_hdrlen; + ifd->ifi_link_state = ifp->if_link_state; + ifd->ifi_vhid = 0; + ifd->ifi_datalen = sizeof(struct if_data); + ifd->ifi_mtu = ifp->if_mtu; + ifd->ifi_metric = ifp->if_metric; + ifd->ifi_baudrate = ifp->if_baudrate; + ifd->ifi_hwassist = ifp->if_hwassist; + ifd->ifi_epoch = ifp->if_epoch; + ifd->ifi_lastchange = ifp->if_lastchange; + + ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); + ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); + ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); + ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); + ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); + ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); + ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); + ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); + ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); + ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); + ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); + ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* @@ -1497,28 +1617,56 @@ if_addr_runlock(struct ifnet *ifp) } void -if_maddr_rlock(struct ifnet *ifp) +if_maddr_rlock(if_t ifp) { - IF_ADDR_RLOCK(ifp); + IF_ADDR_RLOCK((struct ifnet *)ifp); } void -if_maddr_runlock(struct ifnet *ifp) +if_maddr_runlock(if_t ifp) { - IF_ADDR_RUNLOCK(ifp); + IF_ADDR_RUNLOCK((struct ifnet *)ifp); } /* - * Reference count functions for ifaddrs. + * Initialization, destruction and refcounting functions for ifaddrs. 
*/ -void -ifa_init(struct ifaddr *ifa) +struct ifaddr * +ifa_alloc(size_t size, int flags) { + struct ifaddr *ifa; + + KASSERT(size >= sizeof(struct ifaddr), + ("%s: invalid size %zu", __func__, size)); + + ifa = malloc(size, M_IFADDR, M_ZERO | flags); + if (ifa == NULL) + return (NULL); + + if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) + goto fail; + if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) + goto fail; + if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) + goto fail; + if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) + goto fail; - mtx_init(&ifa->ifa_mtx, "ifaddr", NULL, MTX_DEF); refcount_init(&ifa->ifa_refcnt, 1); + + return (ifa); + +fail: + /* free(NULL) is okay */ + counter_u64_free(ifa->ifa_opackets); + counter_u64_free(ifa->ifa_ipackets); + counter_u64_free(ifa->ifa_obytes); + counter_u64_free(ifa->ifa_ibytes); + free(ifa, M_IFADDR); + + return (NULL); } void @@ -1533,62 +1681,61 @@ ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) { - mtx_destroy(&ifa->ifa_mtx); + counter_u64_free(ifa->ifa_opackets); + counter_u64_free(ifa->ifa_ipackets); + counter_u64_free(ifa->ifa_obytes); + counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } } -int -ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) +static int +ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, + struct sockaddr *ia) { - int error = 0; - struct rtentry *rt = NULL; + int error; struct rt_addrinfo info; - static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + struct sockaddr_dl null_sdl; + struct ifnet *ifp; + + ifp = ifa->ifa_ifp; bzero(&info, sizeof(info)); - info.rti_ifp = V_loif; + if (cmd != RTM_DELETE) + info.rti_ifp = V_loif; info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; - error = rtrequest1_fib(RTM_ADD, &info, &rt, ifa->ifa_ifp->if_fib); - - if (error == 0 && rt != NULL) 
{ - RT_LOCK(rt); - ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = - ifa->ifa_ifp->if_type; - ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = - ifa->ifa_ifp->if_index; - RT_REMREF(rt); - RT_UNLOCK(rt); - } else if (error != 0) - log(LOG_INFO, "ifa_add_loopback_route: insertion failed\n"); + link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); + + error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); + + if (error != 0) + log(LOG_DEBUG, "%s: %s failed for interface %s: %u\n", + __func__, otype, if_name(ifp), error); return (error); } int +ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) +{ + + return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia)); +} + +int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { - int error = 0; - struct rt_addrinfo info; - struct sockaddr_dl null_sdl; - bzero(&null_sdl, sizeof(null_sdl)); - null_sdl.sdl_len = sizeof(null_sdl); - null_sdl.sdl_family = AF_LINK; - null_sdl.sdl_type = ifa->ifa_ifp->if_type; - null_sdl.sdl_index = ifa->ifa_ifp->if_index; - bzero(&info, sizeof(info)); - info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; - info.rti_info[RTAX_DST] = ia; - info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; - error = rtrequest1_fib(RTM_DELETE, &info, NULL, ifa->ifa_ifp->if_fib); + return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia)); +} - if (error != 0) - log(LOG_INFO, "ifa_del_loopback_route: deletion failed\n"); +int +ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) +{ - return (error); + return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia)); } /* @@ -1597,22 +1744,19 @@ ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) * to perform a different comparison. 
*/ -#define sa_equal(a1, a2) \ - (bcmp((a1), (a2), ((a1))->sa_len) == 0) - #define sa_dl_equal(a1, a2) \ - ((((struct sockaddr_dl *)(a1))->sdl_len == \ - ((struct sockaddr_dl *)(a2))->sdl_len) && \ - (bcmp(LLADDR((struct sockaddr_dl *)(a1)), \ - LLADDR((struct sockaddr_dl *)(a2)), \ - ((struct sockaddr_dl *)(a1))->sdl_alen) == 0)) + ((((const struct sockaddr_dl *)(a1))->sdl_len == \ + ((const struct sockaddr_dl *)(a2))->sdl_len) && \ + (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ + CLLADDR((const struct sockaddr_dl *)(a2)), \ + ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ static struct ifaddr * -ifa_ifwithaddr_internal(struct sockaddr *addr, int getref) +ifa_ifwithaddr_internal(const struct sockaddr *addr, int getref) { struct ifnet *ifp; struct ifaddr *ifa; @@ -1649,14 +1793,14 @@ done: } struct ifaddr * -ifa_ifwithaddr(struct sockaddr *addr) +ifa_ifwithaddr(const struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 1)); } int -ifa_ifwithaddr_check(struct sockaddr *addr) +ifa_ifwithaddr_check(const struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 0) != NULL); @@ -1667,13 +1811,15 @@ ifa_ifwithaddr_check(struct sockaddr *addr) */ /* ARGSUSED */ struct ifaddr * -ifa_ifwithbroadaddr(struct sockaddr *addr) +ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) + continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) @@ -1700,7 +1846,7 @@ done: */ /*ARGSUSED*/ struct ifaddr * -ifa_ifwithdstaddr_fib(struct sockaddr *addr, int fibnum) +ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; @@ -1730,32 +1876,25 @@ done: return (ifa); } -struct ifaddr * -ifa_ifwithdstaddr(struct 
sockaddr *addr) -{ - - return (ifa_ifwithdstaddr_fib(addr, RT_ALL_FIBS)); -} - /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * -ifa_ifwithnet_fib(struct sockaddr *addr, int ignore_ptp, int fibnum) +ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; - char *addr_data = addr->sa_data, *cplim; + const char *addr_data = addr->sa_data, *cplim; /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { - struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr; + const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } @@ -1772,7 +1911,7 @@ ifa_ifwithnet_fib(struct sockaddr *addr, int ignore_ptp, int fibnum) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - char *cp, *cp2, *cp3; + const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; @@ -1794,19 +1933,6 @@ next: continue; } } else { /* - * if we have a special address handler, - * then use it instead of the generic one. - */ - if (ifa->ifa_claim_addr) { - if ((*ifa->ifa_claim_addr)(ifa, addr)) { - ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); - goto done; - } - continue; - } - - /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask @@ -1826,11 +1952,13 @@ next: continue; /* * If the netmask of what we just found * is more specific than what we had before - * (if we had one) then remember the new one - * before continuing to search - * for an even better one. + * (if we had one), or if the virtual status + * of new prefix is better than of the old one, + * then remember the new one before continuing + * to search for an even better one. 
*/ if (ifa_maybe == NULL || + ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { if (ifa_maybe != NULL) @@ -1851,22 +1979,15 @@ done: return (ifa); } -struct ifaddr * -ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp) -{ - - return (ifa_ifwithnet_fib(addr, ignore_ptp, RT_ALL_FIBS)); -} - /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * -ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp) +ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; - char *cp, *cp2, *cp3; + const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; @@ -1909,6 +2030,21 @@ done: return (ifa); } +/* + * See whether new ifa is better than current one: + * 1) A non-virtual one is preferred over virtual. + * 2) A virtual in master state preferred over any other state. + * + * Used in several address selecting functions. + */ +int +ifa_preferred(struct ifaddr *cur, struct ifaddr *next) +{ + + return (cur->ifa_carp && (!next->ifa_carp || + ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); +} + #include <net/if_llatbl.h> /* @@ -1923,10 +2059,8 @@ link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) struct sockaddr *dst; struct ifnet *ifp; - RT_LOCK_ASSERT(rt); - - if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) || - ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0)) + if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == NULL) || + ((ifp = ifa->ifa_ifp) == NULL) || ((dst = rt_key(rt)) == NULL)) return; ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { @@ -1938,10 +2072,41 @@ link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) } } +struct sockaddr_dl * +link_alloc_sdl(size_t size, int flags) +{ + + return (malloc(size, M_TEMP, flags)); +} + +void +link_free_sdl(struct sockaddr *sa) +{ + free(sa, M_TEMP); +} + +/* + * Fills in given sdl with interface basic info. 
+ * Returns pointer to filled sdl. + */ +struct sockaddr_dl * +link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) +{ + struct sockaddr_dl *sdl; + + sdl = (struct sockaddr_dl *)paddr; + memset(sdl, 0, sizeof(struct sockaddr_dl)); + sdl->sdl_len = sizeof(struct sockaddr_dl); + sdl->sdl_family = AF_LINK; + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = iftype; + + return (sdl); +} + /* * Mark an interface down and notify protocols of * the transition. - * NOTE: must be called at splnet or eqivalent. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) @@ -1965,7 +2130,6 @@ if_unroute(struct ifnet *ifp, int flag, int fam) /* * Mark an interface up and notify protocols of * the transition. - * NOTE: must be called at splnet or eqivalent. */ static void if_route(struct ifnet *ifp, int flag, int fam) @@ -2026,7 +2190,7 @@ do_link_state_change(void *arg, int pending) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && - IFP2AC(ifp)->ac_netgraph != NULL) + ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); @@ -2051,7 +2215,6 @@ do_link_state_change(void *arg, int pending) /* * Mark an interface down and notify protocols of * the transition. - * NOTE: must be called at splnet or eqivalent. */ void if_down(struct ifnet *ifp) @@ -2063,7 +2226,6 @@ if_down(struct ifnet *ifp) /* * Mark an interface up and notify protocols of * the transition. - * NOTE: must be called at splnet or eqivalent. 
*/ void if_up(struct ifnet *ifp) @@ -2088,8 +2250,8 @@ if_qflush(struct ifnet *ifp) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; - while ((m = n) != 0) { - n = m->m_act; + while ((m = n) != NULL) { + n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; @@ -2140,7 +2302,6 @@ static int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; - struct ifstat *ifs; int error = 0; int new_flags, temp_flags; size_t namelen, onamelen; @@ -2182,7 +2343,8 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) break; case SIOCGIFPHYS: - ifr->ifr_phys = ifp->if_physical; + /* XXXGL: did this ever worked? */ + ifr->ifr_phys = 0; break; case SIOCGIFDESCR: @@ -2262,18 +2424,12 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); - if (ifp->if_flags & IFF_SMART) { - /* Smart drivers twiddle their own routes */ - } else if (ifp->if_flags & IFF_UP && + if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { - int s = splimp(); if_down(ifp); - splx(s); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { - int s = splimp(); if_up(ifp); - splx(s); } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { @@ -2281,9 +2437,11 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; - log(LOG_INFO, "%s: permanently promiscuous mode %s\n", - ifp->if_xname, - (new_flags & IFF_PPROMISC) ? "enabled" : "disabled"); + if (log_promisc_mode_change) + log(LOG_INFO, "%s: permanently promiscuous mode %s\n", + ifp->if_xname, + ((new_flags & IFF_PPROMISC) ? 
+ "enabled" : "disabled")); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); @@ -2321,6 +2479,11 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) return (error); if (new_name[0] == '\0') return (EINVAL); + if (new_name[IFNAMSIZ-1] != '\0') { + new_name[IFNAMSIZ-1] = '\0'; + if (strlen(new_name) == IFNAMSIZ-1) + return (EINVAL); + } if (ifunit(new_name) != NULL) return (EEXIST); @@ -2339,9 +2502,9 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) log(LOG_INFO, "%s: changing name to '%s'\n", ifp->if_xname, new_name); + IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; - IFA_LOCK(ifa); sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; @@ -2360,7 +2523,7 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; - IFA_UNLOCK(ifa); + IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. 
*/ @@ -2420,6 +2583,7 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) #ifdef INET6 nd6_setmtu(ifp); #endif + rt_updatemtu(ifp); } break; } @@ -2470,7 +2634,6 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif - case SIOCSLIFPHYADDR: case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); @@ -2484,13 +2647,10 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) break; case SIOCGIFSTATUS: - ifs = (struct ifstat *)data; - ifs->ascii[0] = '\0'; - case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: - case SIOCGLIFPHYADDR: case SIOCGIFMEDIA: + case SIOCGIFXMEDIA: case SIOCGIFGENERIC: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); @@ -2503,7 +2663,6 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); - EVENTHANDLER_INVOKE(iflladdr_event, ifp); break; case SIOCAIFGROUP: @@ -2542,6 +2701,9 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) return (error); } +/* COMPAT_SVR4 */ +#define OSIOCGIFCONF _IOWR('i', 20, struct ifconf) + #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; @@ -2563,11 +2725,25 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) struct ifreq *ifr; int error; int oif_flags; +#ifdef VIMAGE + int shutdown; +#endif CURVNET_SET(so->so_vnet); +#ifdef VIMAGE + /* Make sure the VNET is stable. */ + shutdown = (so->so_vnet->vnet_state > SI_SUB_VNET && + so->so_vnet->vnet_state < SI_SUB_VNET_DONE) ? 
1 : 0; + if (shutdown) { + CURVNET_RESTORE(); + return (EBUSY); + } +#endif + + switch (cmd) { case SIOCGIFCONF: - case OSIOCGIFCONF: + case OSIOCGIFCONF: /* COMPAT_SVR4 */ error = ifconf(cmd, data); CURVNET_RESTORE(); return (error); @@ -2626,6 +2802,16 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) error = if_getgroupmembers((struct ifgroupreq *)data); CURVNET_RESTORE(); return (error); +#if defined(INET) || defined(INET6) + case SIOCSVH: + case SIOCGVH: + if (carp_ioctl_p == NULL) + error = EPROTONOSUPPORT; + else + error = (*carp_ioctl_p)(ifr, cmd, td); + CURVNET_RESTORE(); + return (error); +#endif } ifp = ifunit_ref(ifr->ifr_name); @@ -2657,79 +2843,17 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) * layer, and do not perform any credentials checks or input * validation. */ -#ifndef COMPAT_43 - error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, - data, - ifp, td)); + error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, + ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); -#else - { - u_long ocmd = cmd; - - switch (cmd) { - - case SIOCSIFDSTADDR: - case SIOCSIFADDR: - case SIOCSIFBRDADDR: - case SIOCSIFNETMASK: -#if BYTE_ORDER != BIG_ENDIAN - if (ifr->ifr_addr.sa_family == 0 && - ifr->ifr_addr.sa_len < 16) { - ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len; - ifr->ifr_addr.sa_len = 16; - } -#else - if (ifr->ifr_addr.sa_len == 0) - ifr->ifr_addr.sa_len = 16; -#endif - break; - - case OSIOCGIFADDR: - cmd = SIOCGIFADDR; - break; - - case OSIOCGIFDSTADDR: - cmd = SIOCGIFDSTADDR; - break; - - case OSIOCGIFBRDADDR: - cmd = SIOCGIFBRDADDR; - break; - - case OSIOCGIFNETMASK: - cmd = SIOCGIFNETMASK; - } - error = ((*so->so_proto->pr_usrreqs->pru_control)(so, - cmd, - data, - ifp, td)); - if (error == EOPNOTSUPP && ifp != NULL && - 
ifp->if_ioctl != NULL && - cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && - cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) - error = (*ifp->if_ioctl)(ifp, cmd, data); - switch (ocmd) { - - case OSIOCGIFADDR: - case OSIOCGIFDSTADDR: - case OSIOCGIFBRDADDR: - case OSIOCGIFNETMASK: - *(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family; - - } - } -#endif /* COMPAT_43 */ if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 - if (ifp->if_flags & IFF_UP) { - int s = splimp(); + if (ifp->if_flags & IFF_UP) in6_if_up(ifp); - splx(s); - } #endif } if_rele(ifp); @@ -2825,7 +2949,8 @@ ifpromisc(struct ifnet *ifp, int pswitch) error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ - if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC)) + if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) && + log_promisc_mode_change) log(LOG_INFO, "%s: promiscuous mode %s\n", ifp->if_xname, (ifp->if_flags & IFF_PROMISC) ? 
"enabled" : "disabled"); @@ -2890,16 +3015,15 @@ again: if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; -#ifdef COMPAT_43 + /* COMPAT_SVR4 */ if (cmd == OSIOCGIFCONF) { struct osockaddr *osa = - (struct osockaddr *)&ifr.ifr_addr; + (struct osockaddr *)&ifr.ifr_addr; ifr.ifr_addr = *sa; osa->sa_family = sa->sa_family; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else -#endif if (sa->sa_len <= sizeof(*sa)) { ifr.ifr_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); @@ -2955,7 +3079,7 @@ if_allmulti(struct ifnet *ifp, int onswitch) } struct ifmultiaddr * -if_findmulti(struct ifnet *ifp, struct sockaddr *sa) +if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; @@ -3034,8 +3158,6 @@ if_freemulti(struct ifmultiaddr *ifma) KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); - KASSERT(ifma->ifma_protospec == NULL, - ("if_freemulti: protospec not NULL")); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); @@ -3067,6 +3189,7 @@ if_addmulti(struct ifnet *ifp, struct sockaddr *sa, { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; + struct sockaddr_dl sdl; int error; /* @@ -3086,12 +3209,18 @@ if_addmulti(struct ifnet *ifp, struct sockaddr *sa, /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its - * refcount or allocate an ifma for that also. If 'llsa' was - * returned, we will need to free it later. + * refcount or allocate an ifma for that also. + * Most link layer resolving functions returns address data which + * fits inside default sockaddr_dl structure. However callback + * can allocate another sockaddr structure, in that case we need to + * free it later. 
*/ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { + /* Provide called function with buffer size information */ + sdl.sdl_len = sizeof(sdl); + llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; @@ -3155,14 +3284,14 @@ if_addmulti(struct ifnet *ifp, struct sockaddr *sa, (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); } - if (llsa != NULL) - free(llsa, M_IFMADDR); + if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) + link_free_sdl(llsa); return (0); free_llsa_out: - if (llsa != NULL) - free(llsa, M_IFMADDR); + if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) + link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); @@ -3363,8 +3492,10 @@ if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. + * + * Set noinline to be dtrace-friendly */ -int +__noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; @@ -3422,17 +3553,45 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } -#ifdef INET - /* - * Also send gratuitous ARPs to notify other nodes about - * the address change. - */ - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family == AF_INET) - arp_ifinit(ifp, ifa); - } -#endif } + EVENTHANDLER_INVOKE(iflladdr_event, ifp); + return (0); +} + +/* + * Compat function for handling basic encapsulation requests. + * Not converted stacks (FDDI, IB, ..) supports traditional + * output model: ARP (and other similar L2 protocols) are handled + * inside output routine, arpresolve/nd6_resolve() returns MAC + * address instead of full prepend. 
+ * + * This function creates calculated header==MAC for IPv4/IPv6 and + * returns EAFNOSUPPORT (which is then handled in ARP code) for other + * address families. + */ +static int +if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) +{ + + if (req->rtype != IFENCAP_LL) + return (EOPNOTSUPP); + + if (req->bufsize < req->lladdr_len) + return (ENOMEM); + + switch (req->family) { + case AF_INET: + case AF_INET6: + break; + default: + return (EAFNOSUPPORT); + } + + /* Copy lladdr to storage as is */ + memmove(req->buf, req->lladdr, req->lladdr_len); + req->bufsize = req->lladdr_len; + req->lladdr_off = 0; + return (0); } @@ -3500,15 +3659,15 @@ if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) IF_LOCK(ifq); if (_IF_QFULL(ifq)) { - _IF_DROP(ifq); IF_UNLOCK(ifq); + if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { - ifp->if_obytes += m->m_pkthdr.len + adjust; + if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) - ifp->if_omcasts++; + if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); @@ -3543,3 +3702,465 @@ if_deregister_com_alloc(u_char type) if_com_alloc[type] = NULL; if_com_free[type] = NULL; } + +/* API for driver access to network stack owned ifnet.*/ +uint64_t +if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) +{ + uint64_t oldbrate; + + oldbrate = ifp->if_baudrate; + ifp->if_baudrate = baudrate; + return (oldbrate); +} + +uint64_t +if_getbaudrate(if_t ifp) +{ + + return (((struct ifnet *)ifp)->if_baudrate); +} + +int +if_setcapabilities(if_t ifp, int capabilities) +{ + ((struct ifnet *)ifp)->if_capabilities = capabilities; + return (0); +} + +int +if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) +{ + ((struct ifnet *)ifp)->if_capabilities |= setbit; + ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; + + return (0); +} + +int +if_getcapabilities(if_t ifp) +{ 
+ return ((struct ifnet *)ifp)->if_capabilities; +} + +int +if_setcapenable(if_t ifp, int capabilities) +{ + ((struct ifnet *)ifp)->if_capenable = capabilities; + return (0); +} + +int +if_setcapenablebit(if_t ifp, int setcap, int clearcap) +{ + if(setcap) + ((struct ifnet *)ifp)->if_capenable |= setcap; + if(clearcap) + ((struct ifnet *)ifp)->if_capenable &= ~clearcap; + + return (0); +} + +const char * +if_getdname(if_t ifp) +{ + return ((struct ifnet *)ifp)->if_dname; +} + +int +if_togglecapenable(if_t ifp, int togglecap) +{ + ((struct ifnet *)ifp)->if_capenable ^= togglecap; + return (0); +} + +int +if_getcapenable(if_t ifp) +{ + return ((struct ifnet *)ifp)->if_capenable; +} + +/* + * This is largely undesirable because it ties ifnet to a device, but does + * provide flexiblity for an embedded product vendor. Should be used with + * the understanding that it violates the interface boundaries, and should be + * a last resort only. + */ +int +if_setdev(if_t ifp, void *dev) +{ + return (0); +} + +int +if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) +{ + ((struct ifnet *)ifp)->if_drv_flags |= set_flags; + ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; + + return (0); +} + +int +if_getdrvflags(if_t ifp) +{ + return ((struct ifnet *)ifp)->if_drv_flags; +} + +int +if_setdrvflags(if_t ifp, int flags) +{ + ((struct ifnet *)ifp)->if_drv_flags = flags; + return (0); +} + + +int +if_setflags(if_t ifp, int flags) +{ + ((struct ifnet *)ifp)->if_flags = flags; + return (0); +} + +int +if_setflagbits(if_t ifp, int set, int clear) +{ + ((struct ifnet *)ifp)->if_flags |= set; + ((struct ifnet *)ifp)->if_flags &= ~clear; + + return (0); +} + +int +if_getflags(if_t ifp) +{ + return ((struct ifnet *)ifp)->if_flags; +} + +int +if_clearhwassist(if_t ifp) +{ + ((struct ifnet *)ifp)->if_hwassist = 0; + return (0); +} + +int +if_sethwassistbits(if_t ifp, int toset, int toclear) +{ + ((struct ifnet *)ifp)->if_hwassist |= toset; + ((struct ifnet *)ifp)->if_hwassist &= 
~toclear; + + return (0); +} + +int +if_sethwassist(if_t ifp, int hwassist_bit) +{ + ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; + return (0); +} + +int +if_gethwassist(if_t ifp) +{ + return ((struct ifnet *)ifp)->if_hwassist; +} + +int +if_setmtu(if_t ifp, int mtu) +{ + ((struct ifnet *)ifp)->if_mtu = mtu; + return (0); +} + +int +if_getmtu(if_t ifp) +{ + return ((struct ifnet *)ifp)->if_mtu; +} + +int +if_getmtu_family(if_t ifp, int family) +{ + struct domain *dp; + + for (dp = domains; dp; dp = dp->dom_next) { + if (dp->dom_family == family && dp->dom_ifmtu != NULL) + return (dp->dom_ifmtu((struct ifnet *)ifp)); + } + + return (((struct ifnet *)ifp)->if_mtu); +} + +int +if_setsoftc(if_t ifp, void *softc) +{ + ((struct ifnet *)ifp)->if_softc = softc; + return (0); +} + +void * +if_getsoftc(if_t ifp) +{ + return ((struct ifnet *)ifp)->if_softc; +} + +void +if_setrcvif(struct mbuf *m, if_t ifp) +{ + m->m_pkthdr.rcvif = (struct ifnet *)ifp; +} + +void +if_setvtag(struct mbuf *m, uint16_t tag) +{ + m->m_pkthdr.ether_vtag = tag; +} + +uint16_t +if_getvtag(struct mbuf *m) +{ + + return (m->m_pkthdr.ether_vtag); +} + +int +if_sendq_empty(if_t ifp) +{ + return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); +} + +struct ifaddr * +if_getifaddr(if_t ifp) +{ + return ((struct ifnet *)ifp)->if_addr; +} + +int +if_getamcount(if_t ifp) +{ + return ((struct ifnet *)ifp)->if_amcount; +} + + +int +if_setsendqready(if_t ifp) +{ + IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); + return (0); +} + +int +if_setsendqlen(if_t ifp, int tx_desc_count) +{ + IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); + ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; + + return (0); +} + +int +if_vlantrunkinuse(if_t ifp) +{ + return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; +} + +int +if_input(if_t ifp, struct mbuf* sendmp) +{ + (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); + return (0); + +} + +/* XXX */ +#ifndef ETH_ADDR_LEN +#define 
ETH_ADDR_LEN 6 +#endif + +int +if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) +{ + struct ifmultiaddr *ifma; + uint8_t *lmta = (uint8_t *)mta; + int mcnt = 0; + + TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_LINK) + continue; + + if (mcnt == max) + break; + + bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), + &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); + mcnt++; + } + *cnt = mcnt; + + return (0); +} + +int +if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) +{ + int error; + + if_maddr_rlock(ifp); + error = if_setupmultiaddr(ifp, mta, cnt, max); + if_maddr_runlock(ifp); + return (error); +} + +int +if_multiaddr_count(if_t ifp, int max) +{ + struct ifmultiaddr *ifma; + int count; + + count = 0; + if_maddr_rlock(ifp); + TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_LINK) + continue; + count++; + if (count == max) + break; + } + if_maddr_runlock(ifp); + return (count); +} + +int +if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg) +{ + struct ifmultiaddr *ifma; + int cnt = 0; + + if_maddr_rlock(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) + cnt += filter(arg, ifma, cnt); + if_maddr_runlock(ifp); + return (cnt); +} + +struct mbuf * +if_dequeue(if_t ifp) +{ + struct mbuf *m; + IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); + + return (m); +} + +int +if_sendq_prepend(if_t ifp, struct mbuf *m) +{ + IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); + return (0); +} + +int +if_setifheaderlen(if_t ifp, int len) +{ + ((struct ifnet *)ifp)->if_hdrlen = len; + return (0); +} + +caddr_t +if_getlladdr(if_t ifp) +{ + return (IF_LLADDR((struct ifnet *)ifp)); +} + +void * +if_gethandle(u_char type) +{ + return (if_alloc(type)); +} + +void +if_bpfmtap(if_t ifh, struct mbuf *m) +{ + struct ifnet *ifp = (struct ifnet *)ifh; + + BPF_MTAP(ifp, m); +} + +void 
+if_etherbpfmtap(if_t ifh, struct mbuf *m) +{ + struct ifnet *ifp = (struct ifnet *)ifh; + + ETHER_BPF_MTAP(ifp, m); +} + +void +if_vlancap(if_t ifh) +{ + struct ifnet *ifp = (struct ifnet *)ifh; + VLAN_CAPABILITIES(ifp); +} + +void +if_setinitfn(if_t ifp, void (*init_fn)(void *)) +{ + ((struct ifnet *)ifp)->if_init = init_fn; +} + +void +if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) +{ + ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; +} + +void +if_setstartfn(if_t ifp, void (*start_fn)(if_t)) +{ + ((struct ifnet *)ifp)->if_start = (void *)start_fn; +} + +void +if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) +{ + ((struct ifnet *)ifp)->if_transmit = start_fn; +} + +void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) +{ + ((struct ifnet *)ifp)->if_qflush = flush_fn; + +} + +void +if_setgetcounterfn(if_t ifp, if_get_counter_t fn) +{ + + ifp->if_get_counter = fn; +} + +/* Revisit these - These are inline functions originally. */ +int +drbr_inuse_drv(if_t ifh, struct buf_ring *br) +{ + return drbr_inuse(ifh, br); +} + +struct mbuf* +drbr_dequeue_drv(if_t ifh, struct buf_ring *br) +{ + return drbr_dequeue(ifh, br); +} + +int +drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) +{ + return drbr_needs_enqueue(ifh, br); +} + +int +drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) +{ + return drbr_enqueue(ifh, br, m); + +} diff --git a/freebsd/sys/net/if.h b/freebsd/sys/net/if.h index e182db54..98ae0a82 100644 --- a/freebsd/sys/net/if.h +++ b/freebsd/sys/net/if.h @@ -35,10 +35,6 @@ #include <sys/cdefs.h> -#ifdef _KERNEL -#include <sys/queue.h> -#endif - #if __BSD_VISIBLE /* * <net/if.h> does not depend on <sys/time.h> on most other systems. 
This @@ -49,8 +45,6 @@ #include <sys/time.h> #include <sys/socket.h> #endif - -struct ifnet; #endif /* @@ -80,32 +74,45 @@ struct if_clonereq { */ struct if_data { /* generic interface information */ - u_char ifi_type; /* ethernet, tokenring, etc */ - u_char ifi_physical; /* e.g., AUI, Thinnet, 10base-T, etc */ - u_char ifi_addrlen; /* media address length */ - u_char ifi_hdrlen; /* media header length */ - u_char ifi_link_state; /* current link state */ - u_char ifi_spare_char1; /* spare byte */ - u_char ifi_spare_char2; /* spare byte */ - u_char ifi_datalen; /* length of this data struct */ - u_long ifi_mtu; /* maximum transmission unit */ - u_long ifi_metric; /* routing metric (external only) */ - u_long ifi_baudrate; /* linespeed */ + uint8_t ifi_type; /* ethernet, tokenring, etc */ + uint8_t ifi_physical; /* e.g., AUI, Thinnet, 10base-T, etc */ + uint8_t ifi_addrlen; /* media address length */ + uint8_t ifi_hdrlen; /* media header length */ + uint8_t ifi_link_state; /* current link state */ + uint8_t ifi_vhid; /* carp vhid */ + uint16_t ifi_datalen; /* length of this data struct */ + uint32_t ifi_mtu; /* maximum transmission unit */ + uint32_t ifi_metric; /* routing metric (external only) */ + uint64_t ifi_baudrate; /* linespeed */ /* volatile statistics */ - u_long ifi_ipackets; /* packets received on interface */ - u_long ifi_ierrors; /* input errors on interface */ - u_long ifi_opackets; /* packets sent on interface */ - u_long ifi_oerrors; /* output errors on interface */ - u_long ifi_collisions; /* collisions on csma interfaces */ - u_long ifi_ibytes; /* total number of octets received */ - u_long ifi_obytes; /* total number of octets sent */ - u_long ifi_imcasts; /* packets received via multicast */ - u_long ifi_omcasts; /* packets sent via multicast */ - u_long ifi_iqdrops; /* dropped on input, this interface */ - u_long ifi_noproto; /* destined for unsupported protocol */ - u_long ifi_hwassist; /* HW offload capabilities, see IFCAP */ - time_t 
ifi_epoch; /* uptime at attach or stat reset */ - struct timeval ifi_lastchange; /* time of last administrative change */ + uint64_t ifi_ipackets; /* packets received on interface */ + uint64_t ifi_ierrors; /* input errors on interface */ + uint64_t ifi_opackets; /* packets sent on interface */ + uint64_t ifi_oerrors; /* output errors on interface */ + uint64_t ifi_collisions; /* collisions on csma interfaces */ + uint64_t ifi_ibytes; /* total number of octets received */ + uint64_t ifi_obytes; /* total number of octets sent */ + uint64_t ifi_imcasts; /* packets received via multicast */ + uint64_t ifi_omcasts; /* packets sent via multicast */ + uint64_t ifi_iqdrops; /* dropped on input */ + uint64_t ifi_oqdrops; /* dropped on output */ + uint64_t ifi_noproto; /* destined for unsupported protocol */ + uint64_t ifi_hwassist; /* HW offload capabilities, see IFCAP */ + + /* Unions are here to make sizes MI. */ + union { /* uptime at attach or stat reset */ + time_t tt; + uint64_t ph; + } __ifi_epoch; +#define ifi_epoch __ifi_epoch.tt + union { /* time of last administrative change */ + struct timeval tv; + struct { + uint64_t ph1; + uint64_t ph2; + } ph; + } __ifi_lastchange; +#define ifi_lastchange __ifi_lastchange.tv }; /*- @@ -135,7 +142,7 @@ struct if_data { #define IFF_DEBUG 0x4 /* (n) turn on debugging */ #define IFF_LOOPBACK 0x8 /* (i) is a loopback net */ #define IFF_POINTOPOINT 0x10 /* (i) is a point-to-point link */ -#define IFF_SMART 0x20 /* (i) interface manages own routes */ +/* 0x20 was IFF_SMART */ #define IFF_DRV_RUNNING 0x40 /* (d) resources allocated */ #define IFF_NOARP 0x80 /* (n) no address resolution protocol */ #define IFF_PROMISC 0x100 /* (n) receive all packets */ @@ -153,7 +160,6 @@ struct if_data { #define IFF_STATICARP 0x80000 /* (n) static ARP */ #define IFF_DYING 0x200000 /* (n) interface is winding down */ #define IFF_RENAMING 0x400000 /* (n) interface is being renamed */ - /* * Old names for driver flags so that user space tools can 
continue to use * the old (portable) names. @@ -166,7 +172,7 @@ struct if_data { /* flags set internally only: */ #define IFF_CANTCHANGE \ (IFF_BROADCAST|IFF_POINTOPOINT|IFF_DRV_RUNNING|IFF_DRV_OACTIVE|\ - IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_SMART|IFF_PROMISC|\ + IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_PROMISC|\ IFF_DYING|IFF_CANTCONFIG) /* @@ -180,7 +186,7 @@ struct if_data { * Some convenience macros used for setting ifi_baudrate. * XXX 1000 vs. 1024? --thorpej@netbsd.org */ -#define IF_Kbps(x) ((x) * 1000) /* kilobits/sec. */ +#define IF_Kbps(x) ((uintmax_t)(x) * 1000) /* kilobits/sec. */ #define IF_Mbps(x) (IF_Kbps((x) * 1000)) /* megabits/sec. */ #define IF_Gbps(x) (IF_Mbps((x) * 1000)) /* gigabits/sec. */ @@ -232,6 +238,7 @@ struct if_data { #define IFCAP_NETMAP 0x100000 /* netmap mode supported/enabled */ #define IFCAP_RXCSUM_IPV6 0x200000 /* can offload checksum on IPv6 RX */ #define IFCAP_TXCSUM_IPV6 0x400000 /* can offload checksum on IPv6 TX */ +#define IFCAP_HWSTATS 0x800000 /* manages counters internally */ #define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6) @@ -297,7 +304,7 @@ struct ifa_msghdr { int ifam_addrs; /* like rtm_addrs */ int ifam_flags; /* value of ifa_flags */ u_short ifam_index; /* index for associated ifp */ - int ifam_metric; /* value of ifa_metric */ + int ifam_metric; /* value of ifa_ifp->if_metric */ }; /* @@ -322,7 +329,7 @@ struct ifa_msghdrl { u_short _ifam_spare1; /* spare space to grow if_index, see if_var.h */ u_short ifam_len; /* length of ifa_msghdrl incl. 
if_data */ u_short ifam_data_off; /* offset of if_data from beginning */ - int ifam_metric; /* value of ifa_metric */ + int ifam_metric; /* value of ifa_ifp->if_metric */ struct if_data ifam_data;/* statistics and other data about if or * address */ }; @@ -386,6 +393,7 @@ struct ifreq { caddr_t ifru_data; int ifru_cap[2]; u_int ifru_fib; + u_char ifru_vlan_pcp; } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ @@ -403,6 +411,7 @@ struct ifreq { #define ifr_curcap ifr_ifru.ifru_cap[1] /* current capabilities */ #define ifr_index ifr_ifru.ifru_index /* interface index */ #define ifr_fib ifr_ifru.ifru_fib /* interface fib */ +#define ifr_vlan_pcp ifr_ifru.ifru_vlan_pcp /* VLAN priority */ }; #define _SIZEOF_ADDR_IFREQ(ifr) \ @@ -415,6 +424,15 @@ struct ifaliasreq { struct sockaddr ifra_addr; struct sockaddr ifra_broadaddr; struct sockaddr ifra_mask; + int ifra_vhid; +}; + +/* 9.x compat */ +struct oifaliasreq { + char ifra_name[IFNAMSIZ]; + struct sockaddr ifra_addr; + struct sockaddr ifra_broadaddr; + struct sockaddr ifra_mask; }; struct ifmediareq { @@ -495,16 +513,17 @@ struct ifgroupreq { }; /* - * Structure for SIOC[AGD]LIFADDR + * Structure used to request i2c data + * from interface transceivers. */ -struct if_laddrreq { - char iflr_name[IFNAMSIZ]; - u_int flags; -#define IFLR_PREFIX 0x8000 /* in: prefix given out: kernel fills id */ - u_int prefixlen; /* in/out */ - struct sockaddr_storage addr; /* in/out */ - struct sockaddr_storage dstaddr; /* out */ -}; +struct ifi2creq { + uint8_t dev_addr; /* i2c address (0xA0, 0xA2) */ + uint8_t offset; /* read offset */ + uint8_t len; /* read length */ + uint8_t spare0; + uint32_t spare1; + uint8_t data[8]; /* read buffer */ +}; #endif /* __BSD_VISIBLE */ @@ -528,10 +547,4 @@ struct if_nameindex *if_nameindex(void); unsigned int if_nametoindex(const char *); __END_DECLS #endif - -#ifdef _KERNEL -/* XXX - this should go away soon. 
*/ -#include <net/if_var.h> -#endif - #endif /* !_NET_IF_H_ */ diff --git a/freebsd/sys/net/if_arc.h b/freebsd/sys/net/if_arc.h index 88a72403..23139aa6 100644 --- a/freebsd/sys/net/if_arc.h +++ b/freebsd/sys/net/if_arc.h @@ -133,7 +133,7 @@ void arc_storelladdr(struct ifnet *, u_int8_t); int arc_isphds(u_int8_t); void arc_input(struct ifnet *, struct mbuf *); int arc_output(struct ifnet *, struct mbuf *, - struct sockaddr *, struct route *); + const struct sockaddr *, struct route *); int arc_ioctl(struct ifnet *, u_long, caddr_t); void arc_frag_init(struct ifnet *); diff --git a/freebsd/sys/net/if_arcsubr.c b/freebsd/sys/net/if_arcsubr.c index fae432ad..1954e262 100644 --- a/freebsd/sys/net/if_arcsubr.c +++ b/freebsd/sys/net/if_arcsubr.c @@ -42,7 +42,6 @@ */ #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> -#include <rtems/bsd/local/opt_ipx.h> #include <rtems/bsd/sys/param.h> #include <sys/systm.h> @@ -59,6 +58,7 @@ #include <machine/cpu.h> #include <net/if.h> +#include <net/if_var.h> #include <net/netisr.h> #include <net/route.h> #include <net/if_dl.h> @@ -78,11 +78,6 @@ #include <netinet6/nd6.h> #endif -#ifdef IPX -#include <netipx/ipx.h> -#include <netipx/ipx_if.h> -#endif - #define ARCNET_ALLOW_BROKEN_ARP static struct mbuf *arc_defrag(struct ifnet *, struct mbuf *); @@ -94,8 +89,7 @@ u_int8_t arcbroadcastaddr = 0; #define ARC_LLADDR(ifp) (*(u_int8_t *)IF_LLADDR(ifp)) #define senderr(e) { error = (e); goto bad;} -#define SIN(s) ((struct sockaddr_in *)s) -#define SIPX(s) ((struct sockaddr_ipx *)s) +#define SIN(s) ((const struct sockaddr_in *)(s)) /* * ARCnet output routine. @@ -103,7 +97,7 @@ u_int8_t arcbroadcastaddr = 0; * Assumes that ifp is actually pointer to arccom structure. 
*/ int -arc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, +arc_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct arc_header *ah; @@ -112,7 +106,7 @@ arc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, int loop_copy = 0; int isphds; #if defined(INET) || defined(INET6) - struct llentry *lle; + int is_gw = 0; #endif if (!((ifp->if_flags & IFF_UP) && @@ -120,6 +114,10 @@ arc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, return(ENETDOWN); /* m, m1 aren't initialized yet */ error = 0; +#if defined(INET) || defined(INET6) + if (ro != NULL) + is_gw = (ro->ro_flags & RT_HAS_GW) != 0; +#endif switch (dst->sa_family) { #ifdef INET @@ -133,8 +131,8 @@ arc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, else if (ifp->if_flags & IFF_NOARP) adst = ntohl(SIN(dst)->sin_addr.s_addr) & 0xFF; else { - error = arpresolve(ifp, ro ? ro->ro_rt : NULL, - m, dst, &adst, &lle); + error = arpresolve(ifp, is_gw, m, dst, &adst, NULL, + NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); } @@ -172,24 +170,23 @@ arc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, #endif #ifdef INET6 case AF_INET6: - error = nd6_storelladdr(ifp, m, dst, (u_char *)&adst, &lle); - if (error) - return (error); + if ((m->m_flags & M_MCAST) != 0) + adst = arcbroadcastaddr; /* ARCnet broadcast address */ + else { + error = nd6_resolve(ifp, is_gw, m, dst, &adst, NULL, + NULL); + if (error != 0) + return (error == EWOULDBLOCK ? 
0 : error); + } atype = ARCTYPE_INET6; break; #endif -#ifdef IPX - case AF_IPX: - adst = SIPX(dst)->sipx_addr.x_host.c_host[5]; - atype = ARCTYPE_IPX; - if (adst == 0xff) - adst = arcbroadcastaddr; - break; -#endif - case AF_UNSPEC: + { + const struct arc_header *ah; + loop_copy = -1; - ah = (struct arc_header *)dst->sa_data; + ah = (const struct arc_header *)dst->sa_data; adst = ah->arc_dhost; atype = ah->arc_type; @@ -209,15 +206,15 @@ arc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, #endif } break; - + } default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); senderr(EAFNOSUPPORT); } isphds = arc_isphds(atype); - M_PREPEND(m, isphds ? ARC_HDRNEWLEN : ARC_HDRLEN, M_DONTWAIT); - if (m == 0) + M_PREPEND(m, isphds ? ARC_HDRNEWLEN : ARC_HDRLEN, M_NOWAIT); + if (m == NULL) senderr(ENOBUFS); ah = mtod(m, struct arc_header *); ah->arc_type = atype; @@ -268,12 +265,12 @@ arc_frag_next(struct ifnet *ifp) struct arc_header *ah; ac = (struct arccom *)ifp->if_l2com; - if ((m = ac->curr_frag) == 0) { + if ((m = ac->curr_frag) == NULL) { int tfrags; /* dequeue new packet */ IF_DEQUEUE(&ifp->if_snd, m); - if (m == 0) + if (m == NULL) return 0; ah = mtod(m, struct arc_header *); @@ -281,7 +278,7 @@ arc_frag_next(struct ifnet *ifp) return m; ++ac->ac_seqid; /* make the seqid unique */ - tfrags = (m->m_pkthdr.len + ARC_MAX_DATA - 1) / ARC_MAX_DATA; + tfrags = howmany(m->m_pkthdr.len, ARC_MAX_DATA); ac->fsflag = 2 * tfrags - 3; ac->sflag = 0; ac->rsflag = ac->fsflag; @@ -296,14 +293,14 @@ arc_frag_next(struct ifnet *ifp) /* split out next fragment and return it */ if (ac->sflag < ac->fsflag) { /* we CAN'T have short packets here */ - ac->curr_frag = m_split(m, ARC_MAX_DATA, M_DONTWAIT); + ac->curr_frag = m_split(m, ARC_MAX_DATA, M_NOWAIT); if (ac->curr_frag == 0) { m_freem(m); return 0; } - M_PREPEND(m, ARC_HDRNEWLEN, M_DONTWAIT); - if (m == 0) { + M_PREPEND(m, ARC_HDRNEWLEN, M_NOWAIT); + if (m == NULL) { m_freem(ac->curr_frag); ac->curr_frag = 0; return 
0; @@ -321,8 +318,8 @@ arc_frag_next(struct ifnet *ifp) ARC_MAX_FORBID_LEN - ARC_HDRNEWLEN + 2)) { ac->curr_frag = 0; - M_PREPEND(m, ARC_HDRNEWLEN_EXC, M_DONTWAIT); - if (m == 0) + M_PREPEND(m, ARC_HDRNEWLEN_EXC, M_NOWAIT); + if (m == NULL) return 0; ah = mtod(m, struct arc_header *); @@ -334,8 +331,8 @@ arc_frag_next(struct ifnet *ifp) } else { ac->curr_frag = 0; - M_PREPEND(m, ARC_HDRNEWLEN, M_DONTWAIT); - if (m == 0) + M_PREPEND(m, ARC_HDRNEWLEN, M_NOWAIT); + if (m == NULL) return 0; ah = mtod(m, struct arc_header *); @@ -352,7 +349,7 @@ arc_frag_next(struct ifnet *ifp) /* * Defragmenter. Returns mbuf if last packet found, else - * NULL. frees imcoming mbuf as necessary. + * NULL. frees incoming mbuf as necessary. */ static __inline struct mbuf * @@ -371,7 +368,7 @@ arc_defrag(struct ifnet *ifp, struct mbuf *m) if (m->m_len < ARC_HDRNEWLEN) { m = m_pullup(m, ARC_HDRNEWLEN); if (m == NULL) { - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return NULL; } } @@ -391,7 +388,7 @@ arc_defrag(struct ifnet *ifp, struct mbuf *m) if (m->m_len < ARC_HDRNEWLEN) { m = m_pullup(m, ARC_HDRNEWLEN); if (m == NULL) { - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return NULL; } } @@ -544,11 +541,11 @@ arc_input(struct ifnet *ifp, struct mbuf *m) return; } - ifp->if_ibytes += m->m_pkthdr.len; + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if (ah->arc_dhost == arcbroadcastaddr) { m->m_flags |= M_BCAST|M_MCAST; - ifp->if_imcasts++; + if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } atype = ah->arc_type; @@ -556,15 +553,11 @@ arc_input(struct ifnet *ifp, struct mbuf *m) #ifdef INET case ARCTYPE_IP: m_adj(m, ARC_HDRNEWLEN); - if ((m = ip_fastforward(m)) == NULL) - return; isr = NETISR_IP; break; case ARCTYPE_IP_OLD: m_adj(m, ARC_HDRLEN); - if ((m = ip_fastforward(m)) == NULL) - return; isr = NETISR_IP; break; @@ -600,12 +593,6 @@ arc_input(struct ifnet *ifp, struct mbuf *m) isr = NETISR_IPV6; break; #endif -#ifdef IPX - case ARCTYPE_IPX: - 
m_adj(m, ARC_HDRNEWLEN); - isr = NETISR_IPX; - break; -#endif default: m_freem(m); return; @@ -640,11 +627,7 @@ arc_ifattach(struct ifnet *ifp, u_int8_t lla) ifp->if_resolvemulti = arc_resolvemulti; if (ifp->if_baudrate == 0) ifp->if_baudrate = 2500000; -#if __FreeBSD_version < 500000 - ifa = ifnet_addrs[ifp->if_index - 1]; -#else ifa = ifp->if_addr; -#endif KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ARCNET; @@ -691,26 +674,6 @@ arc_ioctl(struct ifnet *ifp, u_long command, caddr_t data) arp_ifinit(ifp, ifa); break; #endif -#ifdef IPX - /* - * XXX This code is probably wrong - */ - case AF_IPX: - { - struct ipx_addr *ina = &(IA_SIPX(ifa)->sipx_addr); - - if (ipx_nullhost(*ina)) - ina->x_host.c_host[5] = ARC_LLADDR(ifp); - else - arc_storelladdr(ifp, ina->x_host.c_host[5]); - - /* - * Set new address - */ - ifp->if_init(ifp->if_softc); - break; - } -#endif default: ifp->if_init(ifp->if_softc); break; @@ -781,21 +744,14 @@ arc_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, sdl = (struct sockaddr_dl *)sa; if (*LLADDR(sdl) != arcbroadcastaddr) return EADDRNOTAVAIL; - *llsa = 0; + *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; - sdl = malloc(sizeof *sdl, M_IFMADDR, - M_NOWAIT | M_ZERO); - if (sdl == NULL) - return ENOMEM; - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_ARCNET; + sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ARC_ADDR_LEN; *LLADDR(sdl) = 0; *llsa = (struct sockaddr *)sdl; @@ -811,19 +767,12 @@ arc_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, * (This is used for multicast routers.) 
*/ ifp->if_flags |= IFF_ALLMULTI; - *llsa = 0; + *llsa = NULL; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; - sdl = malloc(sizeof *sdl, M_IFMADDR, - M_NOWAIT | M_ZERO); - if (sdl == NULL) - return ENOMEM; - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_ARCNET; + sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ARC_ADDR_LEN; *LLADDR(sdl) = 0; *llsa = (struct sockaddr *)sdl; diff --git a/freebsd/sys/net/if_arp.h b/freebsd/sys/net/if_arp.h index 38c64020..7d141f37 100644 --- a/freebsd/sys/net/if_arp.h +++ b/freebsd/sys/net/if_arp.h @@ -97,43 +97,37 @@ struct arpreq { #define ATF_PUBL 0x08 /* publish entry (respond for other host) */ #define ATF_USETRAILERS 0x10 /* has requested trailers */ -#ifdef _KERNEL -/* - * Structure shared between the ethernet driver modules and - * the address resolution code. - */ -struct arpcom { - struct ifnet *ac_ifp; /* network-visible interface */ - void *ac_netgraph; /* ng_ether(4) netgraph node info */ -}; -#define IFP2AC(ifp) ((struct arpcom *)(ifp->if_l2com)) -#define AC2IFP(ac) ((ac)->ac_ifp) - -#endif /* _KERNEL */ - struct arpstat { /* Normal things that happen: */ - u_long txrequests; /* # of ARP requests sent by this host. */ - u_long txreplies; /* # of ARP replies sent by this host. */ - u_long rxrequests; /* # of ARP requests received by this host. */ - u_long rxreplies; /* # of ARP replies received by this host. */ - u_long received; /* # of ARP packets received by this host. */ + uint64_t txrequests; /* # of ARP requests sent by this host. */ + uint64_t txreplies; /* # of ARP replies sent by this host. */ + uint64_t rxrequests; /* # of ARP requests received by this host. */ + uint64_t rxreplies; /* # of ARP replies received by this host. */ + uint64_t received; /* # of ARP packets received by this host. */ - u_long arp_spares[4]; /* For either the upper or lower half. 
*/ + uint64_t arp_spares[4]; /* For either the upper or lower half. */ /* Abnormal event and error counting: */ - u_long dropped; /* # of packets dropped waiting for a reply. */ - u_long timeouts; /* # of times with entries removed */ + uint64_t dropped; /* # of packets dropped waiting for a reply. */ + uint64_t timeouts; /* # of times with entries removed */ /* due to timeout. */ - u_long dupips; /* # of duplicate IPs detected. */ + uint64_t dupips; /* # of duplicate IPs detected. */ }; +#ifdef _KERNEL +#include <sys/counter.h> +#include <net/vnet.h> + +VNET_PCPUSTAT_DECLARE(struct arpstat, arpstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define ARPSTAT_ADD(name, val) V_arpstat.name += (val) -#define ARPSTAT_SUB(name, val) V_arpstat.name -= (val) +#define ARPSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct arpstat, arpstat, name, (val)) +#define ARPSTAT_SUB(name, val) ARPSTAT_ADD(name, -(val)) #define ARPSTAT_INC(name) ARPSTAT_ADD(name, 1) #define ARPSTAT_DEC(name) ARPSTAT_SUB(name, 1) +#endif /* _KERNEL */ + #endif /* !_NET_IF_ARP_H_ */ diff --git a/freebsd/sys/net/if_atm.h b/freebsd/sys/net/if_atm.h index e8f69da0..a0900eee 100644 --- a/freebsd/sys/net/if_atm.h +++ b/freebsd/sys/net/if_atm.h @@ -96,7 +96,7 @@ struct ifatm_mib { /* * Traffic parameters for ATM connections. This contains all parameters - * to accomodate UBR, UBR+MCR, CBR, VBR and ABR connections. + * to accommodate UBR, UBR+MCR, CBR, VBR and ABR connections. 
* * Keep in sync with ng_atm.h */ @@ -292,7 +292,7 @@ void atm_ifattach(struct ifnet *); void atm_ifdetach(struct ifnet *); void atm_input(struct ifnet *, struct atm_pseudohdr *, struct mbuf *, void *); -int atm_output(struct ifnet *, struct mbuf *, struct sockaddr *, +int atm_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); struct atmio_vcctable *atm_getvccs(struct atmio_vcc **, u_int, u_int, struct mtx *, int); diff --git a/freebsd/sys/net/if_atmsubr.c b/freebsd/sys/net/if_atmsubr.c index a4cbeb09..fff233c4 100644 --- a/freebsd/sys/net/if_atmsubr.c +++ b/freebsd/sys/net/if_atmsubr.c @@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include <sys/malloc.h> #include <net/if.h> +#include <net/if_var.h> #include <net/netisr.h> #include <net/route.h> #include <net/if_dl.h> @@ -123,7 +124,7 @@ static MALLOC_DEFINE(M_IFATM, "ifatm", "atm interface internals"); * ro->ro_rt must also be NULL. */ int -atm_output(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, +atm_output(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro) { u_int16_t etype = 0; /* if using LLC/SNAP */ @@ -131,7 +132,7 @@ atm_output(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, struct atm_pseudohdr atmdst, *ad; struct mbuf *m = m0; struct atmllc *atmllc; - struct atmllc *llc_hdr = NULL; + const struct atmllc *llc_hdr = NULL; u_int32_t atm_flags; #ifdef MAC @@ -175,7 +176,7 @@ atm_output(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, * (atm pseudo header (4) + LLC/SNAP (8)) */ bcopy(dst->sa_data, &atmdst, sizeof(atmdst)); - llc_hdr = (struct atmllc *)(dst->sa_data + + llc_hdr = (const struct atmllc *)(dst->sa_data + sizeof(atmdst)); break; @@ -192,8 +193,8 @@ atm_output(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, atm_flags = ATM_PH_FLAGS(&atmdst); if (atm_flags & ATM_PH_LLCSNAP) sz += 8; /* sizeof snap == 8 */ - M_PREPEND(m, sz, M_DONTWAIT); - if (m == 0) + M_PREPEND(m, sz, M_NOWAIT); + if (m == NULL) 
senderr(ENOBUFS); ad = mtod(m, struct atm_pseudohdr *); *ad = atmdst; @@ -253,7 +254,7 @@ atm_input(struct ifnet *ifp, struct atm_pseudohdr *ah, struct mbuf *m, #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif - ifp->if_ibytes += m->m_pkthdr.len; + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if (ng_atm_input_p != NULL) { (*ng_atm_input_p)(ifp, &m, ah, rxhand); @@ -296,7 +297,7 @@ atm_input(struct ifnet *ifp, struct atm_pseudohdr *ah, struct mbuf *m, struct atmllc *alc; if (m->m_len < sizeof(*alc) && - (m = m_pullup(m, sizeof(*alc))) == 0) + (m = m_pullup(m, sizeof(*alc))) == NULL) return; /* failed */ alc = mtod(m, struct atmllc *); if (bcmp(alc, ATMLLC_HDR, 6)) { diff --git a/freebsd/sys/net/if_bridge.c b/freebsd/sys/net/if_bridge.c index 65553092..77b376b9 100644 --- a/freebsd/sys/net/if_bridge.c +++ b/freebsd/sys/net/if_bridge.c @@ -73,7 +73,7 @@ * - Currently only supports Ethernet-like interfaces (Ethernet, * 802.11, VLANs on Ethernet, etc.) Figure out a nice way * to bridge other types of interfaces (FDDI-FDDI, and maybe - * consider heterogenous bridges). + * consider heterogeneous bridges). 
*/ #include <sys/cdefs.h> @@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/sys/param.h> +#include <sys/eventhandler.h> #include <sys/mbuf.h> #include <sys/malloc.h> #include <sys/protosw.h> @@ -102,7 +103,6 @@ __FBSDID("$FreeBSD$"); #include <sys/proc.h> #include <rtems/bsd/sys/lock.h> #include <sys/mutex.h> -#include <sys/rwlock.h> #include <net/bpf.h> #include <net/if.h> @@ -113,7 +113,7 @@ __FBSDID("$FreeBSD$"); #include <net/pfil.h> #include <net/vnet.h> -#include <netinet/in.h> /* for struct arpcom */ +#include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> @@ -127,15 +127,13 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip_carp.h> #endif #include <machine/in_cksum.h> -#include <netinet/if_ether.h> /* for struct arpcom */ +#include <netinet/if_ether.h> #include <net/bridgestp.h> #include <net/if_bridgevar.h> #include <net/if_llc.h> #include <net/if_vlan_var.h> #include <net/route.h> -#include <netinet/ip_fw.h> -#include <netpfil/ipfw/ip_fw_private.h> /* * Size of the route hash table. Must be a power of two. @@ -170,7 +168,8 @@ __FBSDID("$FreeBSD$"); /* * List of capabilities to possibly mask on the member interface. 
*/ -#define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM) +#define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM|\ + IFCAP_TXCSUM_IPV6) /* * List of capabilities to strip @@ -230,8 +229,9 @@ struct bridge_softc { u_char sc_defaddr[6]; /* Default MAC address */ }; -static struct mtx bridge_list_mtx; -eventhandler_tag bridge_detach_cookie = NULL; +static VNET_DEFINE(struct mtx, bridge_list_mtx); +#define V_bridge_list_mtx VNET(bridge_list_mtx) +static eventhandler_tag bridge_detach_cookie; int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD; @@ -248,11 +248,12 @@ static void bridge_ifdetach(void *arg __unused, struct ifnet *); static void bridge_init(void *); static void bridge_dummynet(struct mbuf *, struct ifnet *); static void bridge_stop(struct ifnet *, int); -static void bridge_start(struct ifnet *); +static int bridge_transmit(struct ifnet *, struct mbuf *); +static void bridge_qflush(struct ifnet *); static struct mbuf *bridge_input(struct ifnet *, struct mbuf *); static int bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); -static void bridge_enqueue(struct bridge_softc *, struct ifnet *, +static int bridge_enqueue(struct bridge_softc *, struct ifnet *, struct mbuf *); static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int); @@ -275,7 +276,7 @@ static void bridge_rtflush(struct bridge_softc *, int); static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *, uint16_t); -static int bridge_rtable_init(struct bridge_softc *); +static void bridge_rtable_init(struct bridge_softc *); static void bridge_rtable_fini(struct bridge_softc *); static int bridge_rtnode_addr_cmp(const uint8_t *, const uint8_t *); @@ -353,43 +354,64 @@ static struct bstp_cb_ops bridge_ops = { SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW, 0, "Bridge"); -static int pfil_onlyip = 1; /* only pass IP[46] packets when pfil is enabled */ -static int pfil_bridge = 1; /* run pfil 
hooks on the bridge interface */ -static int pfil_member = 1; /* run pfil hooks on the member interface */ -static int pfil_ipfw = 0; /* layer2 filter with ipfw */ -static int pfil_ipfw_arp = 0; /* layer2 filter with ipfw */ -static int pfil_local_phys = 0; /* run pfil hooks on the physical interface for - locally destined packets */ -static int log_stp = 0; /* log STP state changes */ -static int bridge_inherit_mac = 0; /* share MAC with first bridge member */ -TUNABLE_INT("net.link.bridge.pfil_onlyip", &pfil_onlyip); -SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, CTLFLAG_RW, - &pfil_onlyip, 0, "Only pass IP packets when pfil is enabled"); -TUNABLE_INT("net.link.bridge.ipfw_arp", &pfil_ipfw_arp); -SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, CTLFLAG_RW, - &pfil_ipfw_arp, 0, "Filter ARP packets through IPFW layer2"); -TUNABLE_INT("net.link.bridge.pfil_bridge", &pfil_bridge); -SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, CTLFLAG_RW, - &pfil_bridge, 0, "Packet filter on the bridge interface"); -TUNABLE_INT("net.link.bridge.pfil_member", &pfil_member); -SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, CTLFLAG_RW, - &pfil_member, 0, "Packet filter on the member interface"); -TUNABLE_INT("net.link.bridge.pfil_local_phys", &pfil_local_phys); -SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, CTLFLAG_RW, - &pfil_local_phys, 0, +/* only pass IP[46] packets when pfil is enabled */ +static VNET_DEFINE(int, pfil_onlyip) = 1; +#define V_pfil_onlyip VNET(pfil_onlyip) +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, + CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_onlyip), 0, + "Only pass IP packets when pfil is enabled"); + +/* run pfil hooks on the bridge interface */ +static VNET_DEFINE(int, pfil_bridge) = 1; +#define V_pfil_bridge VNET(pfil_bridge) +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, + CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_bridge), 0, + "Packet filter on the bridge interface"); + +/* layer2 filter with ipfw */ +static 
VNET_DEFINE(int, pfil_ipfw); +#define V_pfil_ipfw VNET(pfil_ipfw) + +/* layer2 ARP filter with ipfw */ +static VNET_DEFINE(int, pfil_ipfw_arp); +#define V_pfil_ipfw_arp VNET(pfil_ipfw_arp) +SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, + CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_ipfw_arp), 0, + "Filter ARP packets through IPFW layer2"); + +/* run pfil hooks on the member interface */ +static VNET_DEFINE(int, pfil_member) = 1; +#define V_pfil_member VNET(pfil_member) +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, + CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_member), 0, + "Packet filter on the member interface"); + +/* run pfil hooks on the physical interface for locally destined packets */ +static VNET_DEFINE(int, pfil_local_phys); +#define V_pfil_local_phys VNET(pfil_local_phys) +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, + CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_local_phys), 0, "Packet filter on the physical interface for locally destined packets"); -TUNABLE_INT("net.link.bridge.log_stp", &log_stp); -SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp, CTLFLAG_RW, - &log_stp, 0, "Log STP state changes"); -TUNABLE_INT("net.link.bridge.inherit_mac", &bridge_inherit_mac); -SYSCTL_INT(_net_link_bridge, OID_AUTO, inherit_mac, CTLFLAG_RW, - &bridge_inherit_mac, 0, + +/* log STP state changes */ +static VNET_DEFINE(int, log_stp); +#define V_log_stp VNET(log_stp) +SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp, + CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(log_stp), 0, + "Log STP state changes"); + +/* share MAC with first bridge member */ +static VNET_DEFINE(int, bridge_inherit_mac); +#define V_bridge_inherit_mac VNET(bridge_inherit_mac) +SYSCTL_INT(_net_link_bridge, OID_AUTO, inherit_mac, + CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(bridge_inherit_mac), 0, "Inherit MAC address from the first bridge member"); static VNET_DEFINE(int, allow_llz_overlap) = 0; #define V_allow_llz_overlap VNET(allow_llz_overlap) -SYSCTL_VNET_INT(_net_link_bridge, 
OID_AUTO, allow_llz_overlap, CTLFLAG_RW, - &VNET_NAME(allow_llz_overlap), 0, "Allow overlap of link-local scope " +SYSCTL_INT(_net_link_bridge, OID_AUTO, allow_llz_overlap, + CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(allow_llz_overlap), 0, + "Allow overlap of link-local scope " "zones of a bridge interface and the member interfaces"); struct bridge_control { @@ -487,12 +509,43 @@ const struct bridge_control bridge_control_table[] = { BC_F_COPYIN|BC_F_SUSER }, }; -const int bridge_control_table_size = - sizeof(bridge_control_table) / sizeof(bridge_control_table[0]); +const int bridge_control_table_size = nitems(bridge_control_table); + +static VNET_DEFINE(LIST_HEAD(, bridge_softc), bridge_list); +#define V_bridge_list VNET(bridge_list) +#define BRIDGE_LIST_LOCK_INIT(x) mtx_init(&V_bridge_list_mtx, \ + "if_bridge list", NULL, MTX_DEF) +#define BRIDGE_LIST_LOCK_DESTROY(x) mtx_destroy(&V_bridge_list_mtx) +#define BRIDGE_LIST_LOCK(x) mtx_lock(&V_bridge_list_mtx) +#define BRIDGE_LIST_UNLOCK(x) mtx_unlock(&V_bridge_list_mtx) + +static VNET_DEFINE(struct if_clone *, bridge_cloner); +#define V_bridge_cloner VNET(bridge_cloner) -LIST_HEAD(, bridge_softc) bridge_list; +static const char bridge_name[] = "bridge"; + +static void +vnet_bridge_init(const void *unused __unused) +{ + + BRIDGE_LIST_LOCK_INIT(); + LIST_INIT(&V_bridge_list); + V_bridge_cloner = if_clone_simple(bridge_name, + bridge_clone_create, bridge_clone_destroy, 0); +} +VNET_SYSINIT(vnet_bridge_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_bridge_init, NULL); + +static void +vnet_bridge_uninit(const void *unused __unused) +{ -IFC_SIMPLE_DECLARE(bridge, 0); + if_clone_detach(V_bridge_cloner); + V_bridge_cloner = NULL; + BRIDGE_LIST_LOCK_DESTROY(); +} +VNET_SYSUNINIT(vnet_bridge_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, + vnet_bridge_uninit, NULL); static int bridge_modevent(module_t mod, int type, void *data) @@ -500,12 +553,9 @@ bridge_modevent(module_t mod, int type, void *data) switch (type) { 
case MOD_LOAD: - mtx_init(&bridge_list_mtx, "if_bridge list", NULL, MTX_DEF); - if_clone_attach(&bridge_cloner); bridge_rtnode_zone = uma_zcreate("bridge_rtnode", sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - LIST_INIT(&bridge_list); bridge_input_p = bridge_input; bridge_output_p = bridge_output; bridge_dn_p = bridge_dummynet; @@ -517,13 +567,11 @@ bridge_modevent(module_t mod, int type, void *data) case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, bridge_detach_cookie); - if_clone_detach(&bridge_cloner); uma_zdestroy(bridge_rtnode_zone); bridge_input_p = NULL; bridge_output_p = NULL; bridge_dn_p = NULL; bridge_linkstate_p = NULL; - mtx_destroy(&bridge_list_mtx); break; default: return (EOPNOTSUPP); @@ -541,19 +589,19 @@ DECLARE_MODULE(if_bridge, bridge_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_DEPEND(if_bridge, bridgestp, 1, 1, 1); /* - * handler for net.link.bridge.pfil_ipfw + * handler for net.link.bridge.ipfw */ static int sysctl_pfil_ipfw(SYSCTL_HANDLER_ARGS) { - int enable = pfil_ipfw; + int enable = V_pfil_ipfw; int error; error = sysctl_handle_int(oidp, &enable, 0, req); - enable = (enable) ? 1 : 0; + enable &= 1; - if (enable != pfil_ipfw) { - pfil_ipfw = enable; + if (enable != V_pfil_ipfw) { + V_pfil_ipfw = enable; /* * Disable pfil so that ipfw doesnt run twice, if the user @@ -561,17 +609,19 @@ sysctl_pfil_ipfw(SYSCTL_HANDLER_ARGS) * pfil_member. Also allow non-ip packets as ipfw can filter by * layer2 type. 
*/ - if (pfil_ipfw) { - pfil_onlyip = 0; - pfil_bridge = 0; - pfil_member = 0; + if (V_pfil_ipfw) { + V_pfil_onlyip = 0; + V_pfil_bridge = 0; + V_pfil_member = 0; } } return (error); } -SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw, CTLTYPE_INT|CTLFLAG_RW, - &pfil_ipfw, 0, &sysctl_pfil_ipfw, "I", "Layer2 filter with IPFW"); +SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET, + &VNET_NAME(pfil_ipfw), 0, &sysctl_pfil_ipfw, "I", + "Layer2 filter with IPFW"); /* * bridge_clone_create: @@ -606,15 +656,13 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) LIST_INIT(&sc->sc_spanlist); ifp->if_softc = sc; - if_initname(ifp, ifc->ifc_name, unit); + if_initname(ifp, bridge_name, unit); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = bridge_ioctl; - ifp->if_start = bridge_start; + ifp->if_transmit = bridge_transmit; + ifp->if_qflush = bridge_qflush; ifp->if_init = bridge_init; ifp->if_type = IFT_BRIDGE; - IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); - ifp->if_snd.ifq_drv_maxlen = ifqmaxlen; - IFQ_SET_READY(&ifp->if_snd); /* * Generate an ethernet address with a locally administered address. 
@@ -626,7 +674,7 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) */ fb = 0; getcredhostid(curthread->td_ucred, &hostid); - for (retry = 1; retry != 0;) { + do { if (fb || hostid == 0) { arc4rand(sc->sc_defaddr, ETHER_ADDR_LEN, 1); sc->sc_defaddr[0] &= ~1;/* clear multicast bit */ @@ -642,15 +690,17 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) fb = 1; retry = 0; - mtx_lock(&bridge_list_mtx); - LIST_FOREACH(sc2, &bridge_list, sc_list) { + BRIDGE_LIST_LOCK(); + LIST_FOREACH(sc2, &V_bridge_list, sc_list) { bifp = sc2->sc_ifp; if (memcmp(sc->sc_defaddr, - IF_LLADDR(bifp), ETHER_ADDR_LEN) == 0) + IF_LLADDR(bifp), ETHER_ADDR_LEN) == 0) { retry = 1; + break; + } } - mtx_unlock(&bridge_list_mtx); - } + BRIDGE_LIST_UNLOCK(); + } while (retry == 1); bstp_attach(&sc->sc_stp, &bridge_ops); ether_ifattach(ifp, sc->sc_defaddr); @@ -658,9 +708,9 @@ bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) ifp->if_baudrate = 0; ifp->if_type = IFT_BRIDGE; - mtx_lock(&bridge_list_mtx); - LIST_INSERT_HEAD(&bridge_list, sc, sc_list); - mtx_unlock(&bridge_list_mtx); + BRIDGE_LIST_LOCK(); + LIST_INSERT_HEAD(&V_bridge_list, sc, sc_list); + BRIDGE_LIST_UNLOCK(); return (0); } @@ -692,13 +742,13 @@ bridge_clone_destroy(struct ifnet *ifp) callout_drain(&sc->sc_brcallout); - mtx_lock(&bridge_list_mtx); + BRIDGE_LIST_LOCK(); LIST_REMOVE(sc, sc_list); - mtx_unlock(&bridge_list_mtx); + BRIDGE_LIST_UNLOCK(); bstp_detach(&sc->sc_stp); ether_ifdetach(ifp); - if_free_type(ifp, IFT_ETHER); + if_free(ifp); /* Tear down the routing table. 
*/ bridge_rtable_fini(sc); @@ -818,7 +868,7 @@ bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) BRIDGE_LOCK(sc); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (bif->bif_ifp->if_mtu != ifr->ifr_mtu) { - log(LOG_NOTICE, "%s: invalid MTU: %lu(%s)" + log(LOG_NOTICE, "%s: invalid MTU: %u(%s)" " != %d\n", sc->sc_ifp->if_xname, bif->bif_ifp->if_mtu, bif->bif_ifp->if_xname, ifr->ifr_mtu); @@ -960,7 +1010,7 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, * the mac address of the bridge to the address of the next member, or * to its default address if no members are left. */ - if (bridge_inherit_mac && sc->sc_ifaddr == ifs) { + if (V_bridge_inherit_mac && sc->sc_ifaddr == ifs) { if (LIST_EMPTY(&sc->sc_iflist)) { bcopy(sc->sc_defaddr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); @@ -986,9 +1036,12 @@ bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, case IFT_ETHER: case IFT_L2VLAN: /* - * Take the interface out of promiscuous mode. + * Take the interface out of promiscuous mode, but only + * if it was promiscuous in the first place. It might + * not be if we're in the bridge_ioctl_add() error path. */ - (void) ifpromisc(ifs, 0); + if (ifs->if_flags & IFF_PROMISC) + (void) ifpromisc(ifs, 0); break; case IFT_GIF: @@ -1108,7 +1161,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) if (LIST_EMPTY(&sc->sc_iflist)) sc->sc_ifp->if_mtu = ifs->if_mtu; else if (sc->sc_ifp->if_mtu != ifs->if_mtu) { - if_printf(sc->sc_ifp, "invalid MTU: %lu(%s) != %lu\n", + if_printf(sc->sc_ifp, "invalid MTU: %u(%s) != %u\n", ifs->if_mtu, ifs->if_xname, sc->sc_ifp->if_mtu); return (EINVAL); } @@ -1126,7 +1179,7 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) * member and the MAC address of the bridge has not been changed from * the default randomly generated one. 
*/ - if (bridge_inherit_mac && LIST_EMPTY(&sc->sc_iflist) && + if (V_bridge_inherit_mac && LIST_EMPTY(&sc->sc_iflist) && !memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr, ETHER_ADDR_LEN)) { bcopy(IF_LLADDR(ifs), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = ifs; @@ -1156,10 +1209,8 @@ bridge_ioctl_add(struct bridge_softc *sc, void *arg) break; } - if (error) { + if (error) bridge_delete_member(sc, bif, 0); - free(bif, M_DEVBUF); - } return (error); } @@ -1751,7 +1802,13 @@ bridge_ifdetach(void *arg __unused, struct ifnet *ifp) if (ifp->if_flags & IFF_RENAMING) return; - + if (V_bridge_cloner == NULL) { + /* + * This detach handler can be called after + * vnet_bridge_uninit(). Just return in that case. + */ + return; + } /* Check if the interface is a bridge member */ if (sc != NULL) { BRIDGE_LOCK(sc); @@ -1765,8 +1822,8 @@ bridge_ifdetach(void *arg __unused, struct ifnet *ifp) } /* Check if the interface is a span port */ - mtx_lock(&bridge_list_mtx); - LIST_FOREACH(sc, &bridge_list, sc_list) { + BRIDGE_LIST_LOCK(); + LIST_FOREACH(sc, &V_bridge_list, sc_list) { BRIDGE_LOCK(sc); LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifp == bif->bif_ifp) { @@ -1776,7 +1833,7 @@ bridge_ifdetach(void *arg __unused, struct ifnet *ifp) BRIDGE_UNLOCK(sc); } - mtx_unlock(&bridge_list_mtx); + BRIDGE_LIST_UNLOCK(); } /* @@ -1832,20 +1889,19 @@ bridge_stop(struct ifnet *ifp, int disable) * Enqueue a packet on a bridge member interface. 
* */ -static void +static int bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) { int len, err = 0; short mflags; struct mbuf *m0; - len = m->m_pkthdr.len; - mflags = m->m_flags; - /* We may be sending a fragment so traverse the mbuf */ for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = NULL; + len = m->m_pkthdr.len; + mflags = m->m_flags; /* * If underlying interface can not do VLAN tag insertion itself @@ -1857,7 +1913,7 @@ bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) if (m == NULL) { if_printf(dst_ifp, "unable to prepend VLAN header\n"); - dst_ifp->if_oerrors++; + if_inc_counter(dst_ifp, IFCOUNTER_OERRORS, 1); continue; } m->m_flags &= ~M_VLANTAG; @@ -1865,16 +1921,17 @@ bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) if ((err = dst_ifp->if_transmit(dst_ifp, m))) { m_freem(m0); + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); break; } - } - if (err == 0) { - sc->sc_ifp->if_opackets++; - sc->sc_ifp->if_obytes += len; + if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1); + if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, len); if (mflags & M_MCAST) - sc->sc_ifp->if_omcasts++; + if_inc_counter(sc->sc_ifp, IFCOUNTER_OMCASTS, 1); } + + return (err); } /* @@ -2000,9 +2057,9 @@ bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, used = 1; mc = m; } else { - mc = m_copypacket(m, M_DONTWAIT); + mc = m_copypacket(m, M_NOWAIT); if (mc == NULL) { - sc->sc_ifp->if_oerrors++; + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } } @@ -2033,44 +2090,42 @@ sendunicast: } /* - * bridge_start: + * bridge_transmit: * - * Start output on a bridge. + * Do output on a bridge. 
* */ -static void -bridge_start(struct ifnet *ifp) +static int +bridge_transmit(struct ifnet *ifp, struct mbuf *m) { struct bridge_softc *sc; - struct mbuf *m; struct ether_header *eh; struct ifnet *dst_if; + int error = 0; sc = ifp->if_softc; - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - for (;;) { - IFQ_DEQUEUE(&ifp->if_snd, m); - if (m == 0) - break; - ETHER_BPF_MTAP(ifp, m); + ETHER_BPF_MTAP(ifp, m); - eh = mtod(m, struct ether_header *); - dst_if = NULL; + eh = mtod(m, struct ether_header *); - BRIDGE_LOCK(sc); - if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) { - dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1); - } + BRIDGE_LOCK(sc); + if (((m->m_flags & (M_BCAST|M_MCAST)) == 0) && + (dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1)) != NULL) { + BRIDGE_UNLOCK(sc); + error = bridge_enqueue(sc, dst_if, m); + } else + bridge_broadcast(sc, ifp, m, 0); - if (dst_if == NULL) - bridge_broadcast(sc, ifp, m, 0); - else { - BRIDGE_UNLOCK(sc); - bridge_enqueue(sc, dst_if, m); - } - } - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + return (error); +} + +/* + * The ifp->if_qflush entry point for if_bridge(4) is no-op. + */ +static void +bridge_qflush(struct ifnet *ifp __unused) +{ } /* @@ -2094,8 +2149,8 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, src_if = m->m_pkthdr.rcvif; ifp = sc->sc_ifp; - ifp->if_ipackets++; - ifp->if_ibytes += m->m_pkthdr.len; + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); vlan = VLANTAGOF(m); if ((sbif->bif_flags & IFBIF_STP) && @@ -2147,7 +2202,7 @@ bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, goto drop; /* ...forward it to all interfaces. 
*/ - ifp->if_imcasts++; + if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); dst_if = NULL; } @@ -2255,8 +2310,8 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) if ((bifp->if_flags & IFF_MONITOR) != 0) { m->m_pkthdr.rcvif = bifp; ETHER_BPF_MTAP(bifp, m); - bifp->if_ipackets++; - bifp->if_ibytes += m->m_pkthdr.len; + if_inc_counter(bifp, IFCOUNTER_IPACKETS, 1); + if_inc_counter(bifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); m_freem(m); return (NULL); } @@ -2291,7 +2346,7 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) * for bridge processing; return the original packet for * local processing. */ - mc = m_dup(m, M_DONTWAIT); + mc = m_dup(m, M_NOWAIT); if (mc == NULL) { BRIDGE_UNLOCK(sc); return (m); @@ -2308,7 +2363,7 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) */ KASSERT(bifp->if_bridge == NULL, ("loop created in bridge_input")); - mc2 = m_dup(m, M_DONTWAIT); + mc2 = m_dup(m, M_NOWAIT); if (mc2 != NULL) { /* Keep the layer3 header aligned */ int i = min(mc2->m_pkthdr.len, max_protohdr); @@ -2357,9 +2412,10 @@ bridge_input(struct ifnet *ifp, struct mbuf *m) ) { \ if ((iface)->if_type == IFT_BRIDGE) { \ ETHER_BPF_MTAP(iface, m); \ - iface->if_ipackets++; \ + if_inc_counter(iface, IFCOUNTER_IPACKETS, 1); \ + if_inc_counter(iface, IFCOUNTER_IBYTES, m->m_pkthdr.len); \ /* Filter on the physical interface. 
*/ \ - if (pfil_local_phys && \ + if (V_pfil_local_phys && \ (PFIL_HOOKED(&V_inet_pfil_hook) \ OR_PFIL_HOOKED_INET6)) { \ if (bridge_pfil(&m, NULL, ifp, \ @@ -2485,9 +2541,9 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, mc = m; used = 1; } else { - mc = m_dup(m, M_DONTWAIT); + mc = m_dup(m, M_NOWAIT); if (mc == NULL) { - sc->sc_ifp->if_oerrors++; + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } } @@ -2507,7 +2563,7 @@ bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, i = min(mc->m_pkthdr.len, max_protohdr); mc = m_copyup(mc, i, ETHER_ALIGN); if (mc == NULL) { - sc->sc_ifp->if_oerrors++; + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } } @@ -2548,9 +2604,9 @@ bridge_span(struct bridge_softc *sc, struct mbuf *m) if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) continue; - mc = m_copypacket(m, M_DONTWAIT); + mc = m_copypacket(m, M_NOWAIT); if (mc == NULL) { - sc->sc_ifp->if_oerrors++; + if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } @@ -2793,24 +2849,19 @@ bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full) * * Initialize the route table for this bridge. 
*/ -static int +static void bridge_rtable_init(struct bridge_softc *sc) { int i; sc->sc_rthash = malloc(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE, - M_DEVBUF, M_NOWAIT); - if (sc->sc_rthash == NULL) - return (ENOMEM); + M_DEVBUF, M_WAITOK); for (i = 0; i < BRIDGE_RTHASH_SIZE; i++) LIST_INIT(&sc->sc_rthash[i]); sc->sc_rthash_key = arc4random(); - LIST_INIT(&sc->sc_rtlist); - - return (0); } /* @@ -3018,9 +3069,11 @@ bridge_state_change(struct ifnet *ifp, int state) "discarding" }; - if (log_stp) + CURVNET_SET(ifp->if_vnet); + if (V_log_stp) log(LOG_NOTICE, "%s: state changed to %s on %s\n", sc->sc_ifp->if_xname, stpstates[state], ifp->if_xname); + CURVNET_RESTORE(); } /* @@ -3034,7 +3087,6 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) { int snap, error, i, hlen; struct ether_header *eh1, eh2; - struct ip_fw_args args; struct ip *ip; struct llc llc1; u_int16_t ether_type; @@ -3047,7 +3099,7 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) KASSERT(M_WRITABLE(*mp), ("%s: modifying a shared mbuf", __func__)); #endif - if (pfil_bridge == 0 && pfil_member == 0 && pfil_ipfw == 0) + if (V_pfil_bridge == 0 && V_pfil_member == 0 && V_pfil_ipfw == 0) return (0); /* filtering is disabled */ i = min((*mp)->m_pkthdr.len, max_protohdr); @@ -3089,7 +3141,7 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) switch (ether_type) { case ETHERTYPE_ARP: case ETHERTYPE_REVARP: - if (pfil_ipfw_arp == 0) + if (V_pfil_ipfw_arp == 0) return (0); /* Automatically pass */ break; @@ -3104,10 +3156,20 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) * packets, these will not be checked by pfil(9) and * passed unconditionally so the default is to drop. 
*/ - if (pfil_onlyip) + if (V_pfil_onlyip) goto bad; } + /* Run the packet through pfil before stripping link headers */ + if (PFIL_HOOKED(&V_link_pfil_hook) && V_pfil_ipfw != 0 && + dir == PFIL_OUT && ifp != NULL) { + + error = pfil_run_hooks(&V_link_pfil_hook, mp, ifp, dir, NULL); + + if (*mp == NULL || error != 0) /* packet consumed by filter */ + return (error); + } + /* Strip off the Ethernet header and keep a copy. */ m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t) &eh2); m_adj(*mp, ETHER_HDR_LEN); @@ -3138,63 +3200,6 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) goto bad; } - /* XXX this section is also in if_ethersubr.c */ - // XXX PFIL_OUT or DIR_OUT ? - if (V_ip_fw_chk_ptr && pfil_ipfw != 0 && - dir == PFIL_OUT && ifp != NULL) { - struct m_tag *mtag; - - error = -1; - /* fetch the start point from existing tags, if any */ - mtag = m_tag_locate(*mp, MTAG_IPFW_RULE, 0, NULL); - if (mtag == NULL) { - args.rule.slot = 0; - } else { - struct ipfw_rule_ref *r; - - /* XXX can we free the tag after use ? */ - mtag->m_tag_id = PACKET_TAG_NONE; - r = (struct ipfw_rule_ref *)(mtag + 1); - /* packet already partially processed ? */ - if (r->info & IPFW_ONEPASS) - goto ipfwpass; - args.rule = *r; - } - - args.m = *mp; - args.oif = ifp; - args.next_hop = NULL; - args.next_hop6 = NULL; - args.eh = &eh2; - args.inp = NULL; /* used by ipfw uid/gid/jail rules */ - i = V_ip_fw_chk_ptr(&args); - *mp = args.m; - - if (*mp == NULL) - return (error); - - if (ip_dn_io_ptr && (i == IP_FW_DUMMYNET)) { - - /* put the Ethernet header back on */ - M_PREPEND(*mp, ETHER_HDR_LEN, M_DONTWAIT); - if (*mp == NULL) - return (error); - bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN); - - /* - * Pass the pkt to dummynet, which consumes it. The - * packet will return to us via bridge_dummynet(). 
- */ - args.oif = ifp; - ip_dn_io_ptr(mp, DIR_FWD | PROTO_IFB, &args); - return (error); - } - - if (i != IP_FW_PASS) /* drop */ - goto bad; - } - -ipfwpass: error = 0; /* @@ -3203,36 +3208,27 @@ ipfwpass: switch (ether_type) { case ETHERTYPE_IP: /* - * before calling the firewall, swap fields the same as - * IP does. here we assume the header is contiguous - */ - ip = mtod(*mp, struct ip *); - - ip->ip_len = ntohs(ip->ip_len); - ip->ip_off = ntohs(ip->ip_off); - - /* * Run pfil on the member interface and the bridge, both can * be skipped by clearing pfil_member or pfil_bridge. * * Keep the order: * in_if -> bridge_if -> out_if */ - if (pfil_bridge && dir == PFIL_OUT && bifp != NULL) + if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL) error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; - if (pfil_member && ifp != NULL) + if (V_pfil_member && ifp != NULL) error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; - if (pfil_bridge && dir == PFIL_IN && bifp != NULL) + if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL) error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp, dir, NULL); @@ -3240,7 +3236,7 @@ ipfwpass: break; /* check if we need to fragment the packet */ - if (pfil_member && ifp != NULL && dir == PFIL_OUT) { + if (V_pfil_member && ifp != NULL && dir == PFIL_OUT) { i = (*mp)->m_pkthdr.len; if (i > ifp->if_mtu) { error = bridge_fragment(ifp, *mp, &eh2, snap, @@ -3249,20 +3245,18 @@ ipfwpass: } } - /* Recalculate the ip checksum and restore byte ordering */ + /* Recalculate the ip checksum. 
*/ ip = mtod(*mp, struct ip *); hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) goto bad; if (hlen > (*mp)->m_len) { - if ((*mp = m_pullup(*mp, hlen)) == 0) + if ((*mp = m_pullup(*mp, hlen)) == NULL) goto bad; ip = mtod(*mp, struct ip *); if (ip == NULL) goto bad; } - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); @@ -3272,21 +3266,21 @@ ipfwpass: break; #ifdef INET6 case ETHERTYPE_IPV6: - if (pfil_bridge && dir == PFIL_OUT && bifp != NULL) + if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL) error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; - if (pfil_member && ifp != NULL) + if (V_pfil_member && ifp != NULL) error = pfil_run_hooks(&V_inet6_pfil_hook, mp, ifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; - if (pfil_bridge && dir == PFIL_IN && bifp != NULL) + if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL) error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp, dir, NULL); break; @@ -3307,13 +3301,13 @@ ipfwpass: * Finally, put everything back the way it was and return */ if (snap) { - M_PREPEND(*mp, sizeof(struct llc), M_DONTWAIT); + M_PREPEND(*mp, sizeof(struct llc), M_NOWAIT); if (*mp == NULL) return (error); bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc)); } - M_PREPEND(*mp, ETHER_HDR_LEN, M_DONTWAIT); + M_PREPEND(*mp, ETHER_HDR_LEN, M_NOWAIT); if (*mp == NULL) return (error); bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN); @@ -3375,7 +3369,7 @@ bridge_ip_checkbasic(struct mbuf **mp) goto bad; } if (hlen > m->m_len) { - if ((m = m_pullup(m, hlen)) == 0) { + if ((m = m_pullup(m, hlen)) == NULL) { KMOD_IPSTAT_INC(ips_badhlen); goto bad; } @@ -3499,8 +3493,8 @@ bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh, goto out; ip = mtod(m, struct ip *); - error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, - 
CSUM_DELAY_IP); + m->m_pkthdr.csum_flags |= CSUM_IP; + error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist); if (error) goto out; @@ -3508,7 +3502,7 @@ bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh, for (m0 = m; m0; m0 = m0->m_nextpkt) { if (error == 0) { if (snap) { - M_PREPEND(m0, sizeof(struct llc), M_DONTWAIT); + M_PREPEND(m0, sizeof(struct llc), M_NOWAIT); if (m0 == NULL) { error = ENOBUFS; continue; @@ -3516,7 +3510,7 @@ bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh, bcopy(llc, mtod(m0, caddr_t), sizeof(struct llc)); } - M_PREPEND(m0, ETHER_HDR_LEN, M_DONTWAIT); + M_PREPEND(m0, ETHER_HDR_LEN, M_NOWAIT); if (m0 == NULL) { error = ENOBUFS; continue; diff --git a/freebsd/sys/net/if_clone.c b/freebsd/sys/net/if_clone.c index 0b752139..61ba9c6c 100644 --- a/freebsd/sys/net/if_clone.c +++ b/freebsd/sys/net/if_clone.c @@ -1,6 +1,7 @@ #include <machine/rtems-bsd-kernel-space.h> /*- + * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org> * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * @@ -33,6 +34,7 @@ */ #include <rtems/bsd/sys/param.h> +#include <sys/eventhandler.h> #include <sys/malloc.h> #include <sys/limits.h> #include <rtems/bsd/sys/lock.h> @@ -43,29 +45,74 @@ #include <sys/socket.h> #include <net/if.h> -#include <net/if_clone.h> -#if 0 -#include <net/if_dl.h> -#endif -#include <net/if_types.h> #include <net/if_var.h> +#include <net/if_clone.h> #include <net/radix.h> #include <net/route.h> #include <net/vnet.h> +/* Current IF_MAXUNIT expands maximum to 5 characters. */ +#define IFCLOSIZ (IFNAMSIZ - 5) + +/* + * Structure describing a `cloning' interface. + * + * List of locks + * (c) const until freeing + * (d) driver specific data, may need external protection. + * (e) locked by if_cloners_mtx + * (i) locked by ifc_mtx mtx + */ +struct if_clone { + char ifc_name[IFCLOSIZ]; /* (c) Name of device, e.g. 
`gif' */ + struct unrhdr *ifc_unrhdr; /* (c) alloc_unr(9) header */ + int ifc_maxunit; /* (c) maximum unit number */ + long ifc_refcnt; /* (i) Reference count. */ + LIST_HEAD(, ifnet) ifc_iflist; /* (i) List of cloned interfaces */ + struct mtx ifc_mtx; /* Mutex to protect members. */ + + enum { SIMPLE, ADVANCED } ifc_type; /* (c) */ + + /* (c) Driver specific cloning functions. Called with no locks held. */ + union { + struct { /* advanced cloner */ + ifc_match_t *_ifc_match; + ifc_create_t *_ifc_create; + ifc_destroy_t *_ifc_destroy; + } A; + struct { /* simple cloner */ + ifcs_create_t *_ifcs_create; + ifcs_destroy_t *_ifcs_destroy; + int _ifcs_minifs; /* minimum ifs */ + + } S; + } U; +#define ifc_match U.A._ifc_match +#define ifc_create U.A._ifc_create +#define ifc_destroy U.A._ifc_destroy +#define ifcs_create U.S._ifcs_create +#define ifcs_destroy U.S._ifcs_destroy +#define ifcs_minifs U.S._ifcs_minifs + + LIST_ENTRY(if_clone) ifc_list; /* (e) On list of cloners */ +}; + static void if_clone_free(struct if_clone *ifc); static int if_clone_createif(struct if_clone *ifc, char *name, size_t len, caddr_t params); -static struct mtx if_cloners_mtx; +static int ifc_simple_match(struct if_clone *, const char *); +static int ifc_simple_create(struct if_clone *, char *, size_t, caddr_t); +static int ifc_simple_destroy(struct if_clone *, struct ifnet *); + +static struct mtx if_cloners_mtx; +MTX_SYSINIT(if_cloners_lock, &if_cloners_mtx, "if_cloners lock", MTX_DEF); static VNET_DEFINE(int, if_cloners_count); VNET_DEFINE(LIST_HEAD(, if_clone), if_cloners); #define V_if_cloners_count VNET(if_cloners_count) #define V_if_cloners VNET(if_cloners) -#define IF_CLONERS_LOCK_INIT() \ - mtx_init(&if_cloners_mtx, "if_cloners lock", NULL, MTX_DEF) #define IF_CLONERS_LOCK_ASSERT() mtx_assert(&if_cloners_mtx, MA_OWNED) #define IF_CLONERS_LOCK() mtx_lock(&if_cloners_mtx) #define IF_CLONERS_UNLOCK() mtx_unlock(&if_cloners_mtx) @@ -123,13 +170,6 @@ vnet_if_clone_init(void) 
LIST_INIT(&V_if_cloners); } -void -if_clone_init(void) -{ - - IF_CLONERS_LOCK_INIT(); -} - /* * Lookup and create a clone network interface. */ @@ -140,18 +180,25 @@ if_clone_create(char *name, size_t len, caddr_t params) /* Try to find an applicable cloner for this request */ IF_CLONERS_LOCK(); - LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { - if (ifc->ifc_match(ifc, name)) { - break; + LIST_FOREACH(ifc, &V_if_cloners, ifc_list) + if (ifc->ifc_type == SIMPLE) { + if (ifc_simple_match(ifc, name)) + break; + } else { + if (ifc->ifc_match(ifc, name)) + break; } - } #ifdef VIMAGE if (ifc == NULL && !IS_DEFAULT_VNET(curvnet)) { CURVNET_SET_QUIET(vnet0); - LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { - if (ifc->ifc_match(ifc, name)) - break; - } + LIST_FOREACH(ifc, &V_if_cloners, ifc_list) + if (ifc->ifc_type == SIMPLE) { + if (ifc_simple_match(ifc, name)) + break; + } else { + if (ifc->ifc_match(ifc, name)) + break; + } CURVNET_RESTORE(); } #endif @@ -175,7 +222,10 @@ if_clone_createif(struct if_clone *ifc, char *name, size_t len, caddr_t params) if (ifunit(name) != NULL) return (EEXIST); - err = (*ifc->ifc_create)(ifc, name, len, params); + if (ifc->ifc_type == SIMPLE) + err = ifc_simple_create(ifc, name, len, params); + else + err = (*ifc->ifc_create)(ifc, name, len, params); if (!err) { ifp = ifunit(name); @@ -216,10 +266,14 @@ if_clone_destroy(const char *name) #ifdef VIMAGE if (ifc == NULL && !IS_DEFAULT_VNET(curvnet)) { CURVNET_SET_QUIET(vnet0); - LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { - if (ifc->ifc_match(ifc, name)) - break; - } + LIST_FOREACH(ifc, &V_if_cloners, ifc_list) + if (ifc->ifc_type == SIMPLE) { + if (ifc_simple_match(ifc, name)) + break; + } else { + if (ifc->ifc_match(ifc, name)) + break; + } CURVNET_RESTORE(); } #endif @@ -243,7 +297,7 @@ if_clone_destroyif(struct if_clone *ifc, struct ifnet *ifp) int err; struct ifnet *ifcifp; - if (ifc->ifc_destroy == NULL) + if (ifc->ifc_type == ADVANCED && ifc->ifc_destroy == NULL) return(EOPNOTSUPP); /* 
@@ -268,7 +322,10 @@ if_clone_destroyif(struct if_clone *ifc, struct ifnet *ifp) if_delgroup(ifp, ifc->ifc_name); - err = (*ifc->ifc_destroy)(ifc, ifp); + if (ifc->ifc_type == SIMPLE) + err = ifc_simple_destroy(ifc, ifp); + else + err = (*ifc->ifc_destroy)(ifc, ifp); if (err != 0) { if_addgroup(ifp, ifc->ifc_name); @@ -281,36 +338,97 @@ if_clone_destroyif(struct if_clone *ifc, struct ifnet *ifp) return (err); } -/* - * Register a network interface cloner. - */ -void -if_clone_attach(struct if_clone *ifc) +static struct if_clone * +if_clone_alloc(const char *name, int maxunit) { - int len, maxclone; + struct if_clone *ifc; - /* - * Compute bitmap size and allocate it. - */ - maxclone = ifc->ifc_maxunit + 1; - len = maxclone >> 3; - if ((len << 3) < maxclone) - len++; - ifc->ifc_units = malloc(len, M_CLONE, M_WAITOK | M_ZERO); - ifc->ifc_bmlen = len; + KASSERT(name != NULL, ("%s: no name\n", __func__)); + + ifc = malloc(sizeof(struct if_clone), M_CLONE, M_WAITOK | M_ZERO); + strncpy(ifc->ifc_name, name, IFCLOSIZ-1); IF_CLONE_LOCK_INIT(ifc); IF_CLONE_ADDREF(ifc); + ifc->ifc_maxunit = maxunit ? 
maxunit : IF_MAXUNIT; + ifc->ifc_unrhdr = new_unrhdr(0, ifc->ifc_maxunit, &ifc->ifc_mtx); + LIST_INIT(&ifc->ifc_iflist); + + return (ifc); +} + +static int +if_clone_attach(struct if_clone *ifc) +{ + struct if_clone *ifc1; IF_CLONERS_LOCK(); + LIST_FOREACH(ifc1, &V_if_cloners, ifc_list) + if (strcmp(ifc->ifc_name, ifc1->ifc_name) == 0) { + IF_CLONERS_UNLOCK(); + IF_CLONE_REMREF(ifc); + return (EEXIST); + } LIST_INSERT_HEAD(&V_if_cloners, ifc, ifc_list); V_if_cloners_count++; IF_CLONERS_UNLOCK(); - LIST_INIT(&ifc->ifc_iflist); + return (0); +} + +struct if_clone * +if_clone_advanced(const char *name, u_int maxunit, ifc_match_t match, + ifc_create_t create, ifc_destroy_t destroy) +{ + struct if_clone *ifc; + + ifc = if_clone_alloc(name, maxunit); + ifc->ifc_type = ADVANCED; + ifc->ifc_match = match; + ifc->ifc_create = create; + ifc->ifc_destroy = destroy; + + if (if_clone_attach(ifc) != 0) { + if_clone_free(ifc); + return (NULL); + } - if (ifc->ifc_attach != NULL) - (*ifc->ifc_attach)(ifc); EVENTHANDLER_INVOKE(if_clone_event, ifc); + + return (ifc); +} + +struct if_clone * +if_clone_simple(const char *name, ifcs_create_t create, ifcs_destroy_t destroy, + u_int minifs) +{ + struct if_clone *ifc; + u_int unit; + + ifc = if_clone_alloc(name, 0); + ifc->ifc_type = SIMPLE; + ifc->ifcs_create = create; + ifc->ifcs_destroy = destroy; + ifc->ifcs_minifs = minifs; + + if (if_clone_attach(ifc) != 0) { + if_clone_free(ifc); + return (NULL); + } + + for (unit = 0; unit < minifs; unit++) { + char name[IFNAMSIZ]; + int error; + + snprintf(name, IFNAMSIZ, "%s%d", ifc->ifc_name, unit); + error = if_clone_createif(ifc, name, IFNAMSIZ, NULL); + KASSERT(error == 0, + ("%s: failed to create required interface %s", + __func__, name)); + } + + EVENTHANDLER_INVOKE(if_clone_event, ifc); + + return (ifc); } /* @@ -319,7 +437,6 @@ if_clone_attach(struct if_clone *ifc) void if_clone_detach(struct if_clone *ifc) { - struct ifc_simple_data *ifcs = ifc->ifc_data; IF_CLONERS_LOCK(); 
LIST_REMOVE(ifc, ifc_list); @@ -327,8 +444,8 @@ if_clone_detach(struct if_clone *ifc) IF_CLONERS_UNLOCK(); /* Allow all simples to be destroyed */ - if (ifc->ifc_attach == ifc_simple_attach) - ifcs->ifcs_minifs = 0; + if (ifc->ifc_type == SIMPLE) + ifc->ifcs_minifs = 0; /* destroy all interfaces for this cloner */ while (!LIST_EMPTY(&ifc->ifc_iflist)) @@ -340,16 +457,13 @@ if_clone_detach(struct if_clone *ifc) static void if_clone_free(struct if_clone *ifc) { - for (int bytoff = 0; bytoff < ifc->ifc_bmlen; bytoff++) { - KASSERT(ifc->ifc_units[bytoff] == 0x00, - ("ifc_units[%d] is not empty", bytoff)); - } KASSERT(LIST_EMPTY(&ifc->ifc_iflist), ("%s: ifc_iflist not empty", __func__)); IF_CLONE_LOCK_DESTROY(ifc); - free(ifc->ifc_units, M_CLONE); + delete_unrhdr(ifc->ifc_unrhdr); + free(ifc, M_CLONE); } /* @@ -372,7 +486,7 @@ if_clone_list(struct if_clonereq *ifcr) * below, but that's not a major problem. Not caping our * allocation to the number of cloners actually in the system * could be because that would let arbitrary users cause us to - * allocate abritrary amounts of kernel memory. + * allocate arbitrary amounts of kernel memory. */ buf_count = (V_if_cloners_count < ifcr->ifcr_count) ? V_if_cloners_count : ifcr->ifcr_count; @@ -406,6 +520,49 @@ done: } /* + * if_clone_findifc() looks up ifnet from the current + * cloner list, and returns ifc if found. Note that ifc_refcnt + * is incremented. + */ +struct if_clone * +if_clone_findifc(struct ifnet *ifp) +{ + struct if_clone *ifc, *ifc0; + struct ifnet *ifcifp; + + ifc0 = NULL; + IF_CLONERS_LOCK(); + LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { + IF_CLONE_LOCK(ifc); + LIST_FOREACH(ifcifp, &ifc->ifc_iflist, if_clones) { + if (ifp == ifcifp) { + ifc0 = ifc; + IF_CLONE_ADDREF_LOCKED(ifc); + break; + } + } + IF_CLONE_UNLOCK(ifc); + if (ifc0 != NULL) + break; + } + IF_CLONERS_UNLOCK(); + + return (ifc0); +} + +/* + * if_clone_addgroup() decrements ifc_refcnt because it is called after + * if_clone_findifc(). 
+ */ +void +if_clone_addgroup(struct ifnet *ifp, struct if_clone *ifc) +{ + + if_addgroup(ifp, ifc->ifc_name); + IF_CLONE_REMREF(ifc); +} + +/* * A utility function to extract unit numbers from interface names of * the form name###. * @@ -443,98 +600,52 @@ ifc_name2unit(const char *name, int *unit) int ifc_alloc_unit(struct if_clone *ifc, int *unit) { - int wildcard, bytoff, bitoff; - int err = 0; - - IF_CLONE_LOCK(ifc); + char name[IFNAMSIZ]; + int wildcard; - bytoff = bitoff = 0; wildcard = (*unit < 0); - /* - * Find a free unit if none was given. - */ - if (wildcard) { - while ((bytoff < ifc->ifc_bmlen) - && (ifc->ifc_units[bytoff] == 0xff)) - bytoff++; - if (bytoff >= ifc->ifc_bmlen) { - err = ENOSPC; - goto done; +retry: + if (*unit > ifc->ifc_maxunit) + return (ENOSPC); + if (*unit < 0) { + *unit = alloc_unr(ifc->ifc_unrhdr); + if (*unit == -1) + return (ENOSPC); + } else { + *unit = alloc_unr_specific(ifc->ifc_unrhdr, *unit); + if (*unit == -1) { + if (wildcard) { + (*unit)++; + goto retry; + } else + return (EEXIST); } - while ((ifc->ifc_units[bytoff] & (1 << bitoff)) != 0) - bitoff++; - *unit = (bytoff << 3) + bitoff; } - if (*unit > ifc->ifc_maxunit) { - err = ENOSPC; - goto done; + snprintf(name, IFNAMSIZ, "%s%d", ifc->ifc_name, *unit); + if (ifunit(name) != NULL) { + free_unr(ifc->ifc_unrhdr, *unit); + if (wildcard) { + (*unit)++; + goto retry; + } else + return (EEXIST); } - if (!wildcard) { - bytoff = *unit >> 3; - bitoff = *unit - (bytoff << 3); - } - - if((ifc->ifc_units[bytoff] & (1 << bitoff)) != 0) { - err = EEXIST; - goto done; - } - /* - * Allocate the unit in the bitmap. 
- */ - KASSERT((ifc->ifc_units[bytoff] & (1 << bitoff)) == 0, - ("%s: bit is already set", __func__)); - ifc->ifc_units[bytoff] |= (1 << bitoff); - IF_CLONE_ADDREF_LOCKED(ifc); + IF_CLONE_ADDREF(ifc); -done: - IF_CLONE_UNLOCK(ifc); - return (err); + return (0); } void ifc_free_unit(struct if_clone *ifc, int unit) { - int bytoff, bitoff; - - - /* - * Compute offset in the bitmap and deallocate the unit. - */ - bytoff = unit >> 3; - bitoff = unit - (bytoff << 3); - IF_CLONE_LOCK(ifc); - KASSERT((ifc->ifc_units[bytoff] & (1 << bitoff)) != 0, - ("%s: bit is already cleared", __func__)); - ifc->ifc_units[bytoff] &= ~(1 << bitoff); - IF_CLONE_REMREF_LOCKED(ifc); /* releases lock */ -} - -void -ifc_simple_attach(struct if_clone *ifc) -{ - int err; - int unit; - char name[IFNAMSIZ]; - struct ifc_simple_data *ifcs = ifc->ifc_data; - - KASSERT(ifcs->ifcs_minifs - 1 <= ifc->ifc_maxunit, - ("%s: %s requested more units than allowed (%d > %d)", - __func__, ifc->ifc_name, ifcs->ifcs_minifs, - ifc->ifc_maxunit + 1)); - - for (unit = 0; unit < ifcs->ifcs_minifs; unit++) { - snprintf(name, IFNAMSIZ, "%s%d", ifc->ifc_name, unit); - err = if_clone_createif(ifc, name, IFNAMSIZ, NULL); - KASSERT(err == 0, - ("%s: failed to create required interface %s", - __func__, name)); - } + free_unr(ifc->ifc_unrhdr, unit); + IF_CLONE_REMREF(ifc); } -int +static int ifc_simple_match(struct if_clone *ifc, const char *name) { const char *cp; @@ -555,14 +666,13 @@ ifc_simple_match(struct if_clone *ifc, const char *name) return (1); } -int +static int ifc_simple_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { char *dp; int wildcard; int unit; int err; - struct ifc_simple_data *ifcs = ifc->ifc_data; err = ifc_name2unit(name, &unit); if (err != 0) @@ -574,7 +684,7 @@ ifc_simple_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) if (err != 0) return (err); - err = ifcs->ifcs_create(ifc, unit, params); + err = ifc->ifcs_create(ifc, unit, params); if (err != 0) { 
ifc_free_unit(ifc, unit); return (err); @@ -598,18 +708,17 @@ ifc_simple_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) return (0); } -int +static int ifc_simple_destroy(struct if_clone *ifc, struct ifnet *ifp) { int unit; - struct ifc_simple_data *ifcs = ifc->ifc_data; unit = ifp->if_dunit; - if (unit < ifcs->ifcs_minifs) + if (unit < ifc->ifcs_minifs) return (EINVAL); - ifcs->ifcs_destroy(ifp); + ifc->ifcs_destroy(ifp); ifc_free_unit(ifc, unit); diff --git a/freebsd/sys/net/if_clone.h b/freebsd/sys/net/if_clone.h index f125f8b5..3a60b0a1 100644 --- a/freebsd/sys/net/if_clone.h +++ b/freebsd/sys/net/if_clone.h @@ -35,82 +35,45 @@ #ifdef _KERNEL -#define IFC_CLONE_INITIALIZER(name, data, maxunit, \ - attach, match, create, destroy) \ - { { 0 }, name, maxunit, NULL, 0, data, attach, match, create, destroy } - -/* - * Structure describing a `cloning' interface. - * - * List of locks - * (c) const until freeing - * (d) driver specific data, may need external protection. - * (e) locked by if_cloners_mtx - * (i) locked by ifc_mtx mtx - */ -struct if_clone { - LIST_ENTRY(if_clone) ifc_list; /* (e) On list of cloners */ - const char *ifc_name; /* (c) Name of device, e.g. `gif' */ - int ifc_maxunit; /* (c) Maximum unit number */ - unsigned char *ifc_units; /* (i) Bitmap to handle units. */ - /* Considered private, access */ - /* via ifc_(alloc|free)_unit(). */ - int ifc_bmlen; /* (c) Bitmap length. */ - void *ifc_data; /* (*) Data for ifc_* functions. */ - - /* (c) Driver specific cloning functions. Called with no locks held. */ - void (*ifc_attach)(struct if_clone *); - int (*ifc_match)(struct if_clone *, const char *); - int (*ifc_create)(struct if_clone *, char *, size_t, caddr_t); - int (*ifc_destroy)(struct if_clone *, struct ifnet *); - - long ifc_refcnt; /* (i) Refrence count. */ - struct mtx ifc_mtx; /* Muted to protect members. 
*/ - LIST_HEAD(, ifnet) ifc_iflist; /* (i) List of cloned interfaces */ -}; - -void if_clone_init(void); -void if_clone_attach(struct if_clone *); +struct if_clone; + +/* Methods. */ +typedef int ifc_match_t(struct if_clone *, const char *); +typedef int ifc_create_t(struct if_clone *, char *, size_t, caddr_t); +typedef int ifc_destroy_t(struct if_clone *, struct ifnet *); + +typedef int ifcs_create_t(struct if_clone *, int, caddr_t); +typedef void ifcs_destroy_t(struct ifnet *); + +/* Interface cloner (de)allocating functions. */ +struct if_clone * + if_clone_advanced(const char *, u_int, ifc_match_t, ifc_create_t, + ifc_destroy_t); +struct if_clone * + if_clone_simple(const char *, ifcs_create_t, ifcs_destroy_t, u_int); void if_clone_detach(struct if_clone *); -void vnet_if_clone_init(void); - -int if_clone_create(char *, size_t, caddr_t); -int if_clone_destroy(const char *); -int if_clone_destroyif(struct if_clone *, struct ifnet *); -int if_clone_list(struct if_clonereq *); +/* Unit (de)allocating fucntions. */ int ifc_name2unit(const char *name, int *unit); int ifc_alloc_unit(struct if_clone *, int *); void ifc_free_unit(struct if_clone *, int); -/* - * The ifc_simple functions, structures, and macros implement basic - * cloning as in 5.[012]. - */ - -struct ifc_simple_data { - int ifcs_minifs; /* minimum number of interfaces */ - - int (*ifcs_create)(struct if_clone *, int, caddr_t); - void (*ifcs_destroy)(struct ifnet *); -}; - -/* interface clone event */ +#ifdef _SYS_EVENTHANDLER_H_ +/* Interface clone event. 
*/ typedef void (*if_clone_event_handler_t)(void *, struct if_clone *); EVENTHANDLER_DECLARE(if_clone_event, if_clone_event_handler_t); +#endif -#define IFC_SIMPLE_DECLARE(name, minifs) \ -struct ifc_simple_data name##_cloner_data = \ - {minifs, name##_clone_create, name##_clone_destroy}; \ -struct if_clone name##_cloner = \ - IFC_CLONE_INITIALIZER(#name, &name##_cloner_data, IF_MAXUNIT, \ - ifc_simple_attach, ifc_simple_match, ifc_simple_create, ifc_simple_destroy) +/* The below interfaces used only by net/if.c. */ +void vnet_if_clone_init(void); +int if_clone_create(char *, size_t, caddr_t); +int if_clone_destroy(const char *); +int if_clone_list(struct if_clonereq *); +struct if_clone *if_clone_findifc(struct ifnet *); +void if_clone_addgroup(struct ifnet *, struct if_clone *); -void ifc_simple_attach(struct if_clone *); -int ifc_simple_match(struct if_clone *, const char *); -int ifc_simple_create(struct if_clone *, char *, size_t, caddr_t); -int ifc_simple_destroy(struct if_clone *, struct ifnet *); +/* The below interface used only by epair(4). 
*/ +int if_clone_destroyif(struct if_clone *, struct ifnet *); #endif /* _KERNEL */ - #endif /* !_NET_IF_CLONE_H_ */ diff --git a/freebsd/sys/net/if_dead.c b/freebsd/sys/net/if_dead.c index b85793f8..e290823c 100644 --- a/freebsd/sys/net/if_dead.c +++ b/freebsd/sys/net/if_dead.c @@ -44,7 +44,7 @@ __FBSDID("$FreeBSD$"); #include <net/if_var.h> static int -ifdead_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, +ifdead_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa, struct route *ro) { @@ -95,6 +95,13 @@ ifdead_transmit(struct ifnet *ifp, struct mbuf *m) return (ENXIO); } +static uint64_t +ifdead_get_counter(struct ifnet *ifp, ift_counter cnt) +{ + + return (0); +} + void if_dead(struct ifnet *ifp) { @@ -106,4 +113,5 @@ if_dead(struct ifnet *ifp) ifp->if_resolvemulti = ifdead_resolvemulti; ifp->if_qflush = ifdead_qflush; ifp->if_transmit = ifdead_transmit; + ifp->if_get_counter = ifdead_get_counter; } diff --git a/freebsd/sys/net/if_disc.c b/freebsd/sys/net/if_disc.c index 3d4f3159..a2e5a7e8 100644 --- a/freebsd/sys/net/if_disc.c +++ b/freebsd/sys/net/if_disc.c @@ -47,10 +47,12 @@ #include <sys/sockio.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_clone.h> #include <net/if_types.h> #include <net/route.h> #include <net/bpf.h> +#include <net/vnet.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> @@ -61,22 +63,21 @@ #define DSMTU 65532 #endif -#define DISCNAME "disc" - struct disc_softc { struct ifnet *sc_ifp; }; static int discoutput(struct ifnet *, struct mbuf *, - struct sockaddr *, struct route *); -static void discrtrequest(int, struct rtentry *, struct rt_addrinfo *); + const struct sockaddr *, struct route *); static int discioctl(struct ifnet *, u_long, caddr_t); static int disc_clone_create(struct if_clone *, int, caddr_t); static void disc_clone_destroy(struct ifnet *); -static MALLOC_DEFINE(M_DISC, DISCNAME, "Discard interface"); +static const char discname[] = "disc"; +static 
MALLOC_DEFINE(M_DISC, discname, "Discard interface"); -IFC_SIMPLE_DECLARE(disc, 0); +static VNET_DEFINE(struct if_clone *, disc_cloner); +#define V_disc_cloner VNET(disc_cloner) static int disc_clone_create(struct if_clone *ifc, int unit, caddr_t params) @@ -92,7 +93,7 @@ disc_clone_create(struct if_clone *ifc, int unit, caddr_t params) } ifp->if_softc = sc; - if_initname(ifp, ifc->ifc_name, unit); + if_initname(ifp, discname, unit); ifp->if_mtu = DSMTU; /* * IFF_LOOPBACK should not be removed from disc's flags because @@ -131,16 +132,32 @@ disc_clone_destroy(struct ifnet *ifp) free(sc, M_DISC); } +static void +vnet_disc_init(const void *unused __unused) +{ + + V_disc_cloner = if_clone_simple(discname, disc_clone_create, + disc_clone_destroy, 0); +} +VNET_SYSINIT(vnet_disc_init, SI_SUB_PSEUDO, SI_ORDER_ANY, + vnet_disc_init, NULL); + +static void +vnet_disc_uninit(const void *unused __unused) +{ + + if_clone_detach(V_disc_cloner); +} +VNET_SYSUNINIT(vnet_disc_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, + vnet_disc_uninit, NULL); + static int disc_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: - if_clone_attach(&disc_cloner); - break; case MOD_UNLOAD: - if_clone_detach(&disc_cloner); break; default: return (EOPNOTSUPP); @@ -157,7 +174,7 @@ static moduledata_t disc_mod = { DECLARE_MODULE(if_disc, disc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int -discoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, +discoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int32_t af; @@ -165,62 +182,47 @@ discoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, M_ASSERTPKTHDR(m); /* BPF writes need to be handled specially. 
*/ - if (dst->sa_family == AF_UNSPEC) { + if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); - dst->sa_family = af; - } + else + af = dst->sa_family; - if (bpf_peers_present(ifp->if_bpf)) { - u_int af = dst->sa_family; + if (bpf_peers_present(ifp->if_bpf)) bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); - } + m->m_pkthdr.rcvif = ifp; - ifp->if_opackets++; - ifp->if_obytes += m->m_pkthdr.len; + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); m_freem(m); return (0); } -/* ARGSUSED */ -static void -discrtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) -{ - RT_LOCK_ASSERT(rt); - rt->rt_rmx.rmx_mtu = DSMTU; -} - /* * Process an ioctl request. */ static int discioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { - struct ifaddr *ifa; struct ifreq *ifr = (struct ifreq *)data; int error = 0; switch (cmd) { - case SIOCSIFADDR: ifp->if_flags |= IFF_UP; - ifa = (struct ifaddr *)data; - if (ifa != 0) - ifa->ifa_rtrequest = discrtrequest; + /* * Everything else is done at a higher level. 
*/ break; - case SIOCADDMULTI: case SIOCDELMULTI: - if (ifr == 0) { + if (ifr == NULL) { error = EAFNOSUPPORT; /* XXX */ break; } switch (ifr->ifr_addr.sa_family) { - #ifdef INET case AF_INET: break; @@ -229,17 +231,14 @@ discioctl(struct ifnet *ifp, u_long cmd, caddr_t data) case AF_INET6: break; #endif - default: error = EAFNOSUPPORT; break; } break; - case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; break; - default: error = EINVAL; } diff --git a/freebsd/sys/net/if_dl.h b/freebsd/sys/net/if_dl.h index 8d88623d..f53bc5e4 100644 --- a/freebsd/sys/net/if_dl.h +++ b/freebsd/sys/net/if_dl.h @@ -67,6 +67,14 @@ struct sockaddr_dl { }; #define LLADDR(s) ((caddr_t)((s)->sdl_data + (s)->sdl_nlen)) +#define CLLADDR(s) ((c_caddr_t)((s)->sdl_data + (s)->sdl_nlen)) +#define LLINDEX(s) ((s)->sdl_index) + + +struct ifnet; +struct sockaddr_dl *link_alloc_sdl(size_t, int); +void link_free_sdl(struct sockaddr *sa); +struct sockaddr_dl *link_init_sdl(struct ifnet *, struct sockaddr *, u_char); #ifndef _KERNEL diff --git a/freebsd/sys/net/if_edsc.c b/freebsd/sys/net/if_edsc.c index 6bb80fdb..d90f072a 100644 --- a/freebsd/sys/net/if_edsc.c +++ b/freebsd/sys/net/if_edsc.c @@ -48,10 +48,14 @@ #include <net/bpf.h> /* bpf(9) */ #include <net/ethernet.h> /* Ethernet related constants and types */ -#include <net/if.h> /* basic part of ifnet(9) */ +#include <net/if.h> +#include <net/if_var.h> /* basic part of ifnet(9) */ #include <net/if_clone.h> /* network interface cloning */ #include <net/if_types.h> /* IFT_ETHER and friends */ #include <net/if_var.h> /* kernel-only part of ifnet(9) */ +#include <net/vnet.h> + +static const char edscname[] = "edsc"; /* * Software configuration of an interface specific to this device type. @@ -66,9 +70,10 @@ struct edsc_softc { }; /* - * Simple cloning methods. - * IFC_SIMPLE_DECLARE() expects precisely these names. + * Attach to the interface cloning framework. 
*/ +static VNET_DEFINE(struct if_clone *, edsc_cloner); +#define V_edsc_cloner VNET(edsc_cloner) static int edsc_clone_create(struct if_clone *, int, caddr_t); static void edsc_clone_destroy(struct ifnet *); @@ -83,15 +88,7 @@ static void edsc_start(struct ifnet *ifp); /* * We'll allocate softc instances from this. */ -static MALLOC_DEFINE(M_EDSC, "edsc", "Ethernet discard interface"); - -/* - * Attach to the interface cloning framework under the name of "edsc". - * The second argument is the number of units to be created from - * the outset. It's also the minimum number of units allowed. - * We don't want any units created as soon as the driver is loaded. - */ -IFC_SIMPLE_DECLARE(edsc, 0); +static MALLOC_DEFINE(M_EDSC, edscname, "Ethernet discard interface"); /* * Create an interface instance. @@ -118,7 +115,7 @@ edsc_clone_create(struct if_clone *ifc, int unit, caddr_t params) /* * Get a name for this particular interface in its ifnet structure. */ - if_initname(ifp, ifc->ifc_name, unit); + if_initname(ifp, edscname, unit); /* * Typical Ethernet interface flags: we can do broadcast and @@ -298,8 +295,8 @@ edsc_start(struct ifnet *ifp) /* * Update the interface counters. */ - ifp->if_obytes += m->m_pkthdr.len; - ifp->if_opackets++; + if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* * Finally, just drop the packet. @@ -314,6 +311,36 @@ edsc_start(struct ifnet *ifp) */ } +static void +vnet_edsc_init(const void *unused __unused) +{ + + /* + * Connect to the network interface cloning framework. + * The last argument is the number of units to be created + * from the outset. It's also the minimum number of units + * allowed. We don't want any units created as soon as the + * driver is loaded. 
+ */ + V_edsc_cloner = if_clone_simple(edscname, edsc_clone_create, + edsc_clone_destroy, 0); +} +VNET_SYSINIT(vnet_edsc_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_edsc_init, NULL); + +static void +vnet_edsc_uninit(const void *unused __unused) +{ + + /* + * Disconnect from the cloning framework. + * Existing interfaces will be disposed of properly. + */ + if_clone_detach(V_edsc_cloner); +} +VNET_SYSUNINIT(vnet_edsc_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, + vnet_edsc_uninit, NULL); + /* * This function provides handlers for module events, namely load and unload. */ @@ -323,20 +350,8 @@ edsc_modevent(module_t mod, int type, void *data) switch (type) { case MOD_LOAD: - /* - * Connect to the network interface cloning framework. - */ - if_clone_attach(&edsc_cloner); - break; - case MOD_UNLOAD: - /* - * Disconnect from the cloning framework. - * Existing interfaces will be disposed of properly. - */ - if_clone_detach(&edsc_cloner); break; - default: /* * There are other event types, but we don't handle them. diff --git a/freebsd/sys/net/if_ef.c b/freebsd/sys/net/if_ef.c deleted file mode 100644 index 4aa76712..00000000 --- a/freebsd/sys/net/if_ef.c +++ /dev/null @@ -1,610 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/*- - * Copyright (c) 1999, 2000 Boris Popov - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#include <rtems/bsd/local/opt_inet.h> -#include <rtems/bsd/local/opt_ipx.h> -#include <rtems/bsd/local/opt_ef.h> - -#include <rtems/bsd/sys/param.h> -#include <sys/systm.h> -#include <sys/sockio.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/socket.h> -#include <sys/syslog.h> -#include <sys/kernel.h> -#include <sys/module.h> - -#include <net/ethernet.h> -#include <net/if_llc.h> -#include <net/if.h> -#include <net/if_arp.h> -#include <net/if_dl.h> -#include <net/if_types.h> -#include <net/netisr.h> -#include <net/bpf.h> -#include <net/vnet.h> - -#ifdef INET -#include <netinet/in.h> -#include <netinet/in_var.h> -#include <netinet/if_ether.h> -#endif - -#ifdef IPX -#include <netipx/ipx.h> -#include <netipx/ipx_if.h> -#endif - -/* If none of the supported layers is enabled explicitly enable them all */ -#if !defined(ETHER_II) && !defined(ETHER_8023) && !defined(ETHER_8022) && \ - !defined(ETHER_SNAP) -#define ETHER_II 1 -#define ETHER_8023 1 -#define ETHER_8022 1 -#define ETHER_SNAP 1 -#endif - -/* internal frame types */ -#define ETHER_FT_EII 0 /* Ethernet_II - default */ -#define ETHER_FT_8023 1 /* 802.3 (Novell) */ -#define ETHER_FT_8022 2 /* 802.2 */ 
-#define ETHER_FT_SNAP 3 /* SNAP */ -#define EF_NFT 4 /* total number of frame types */ - -#ifdef EF_DEBUG -#define EFDEBUG(format, args...) printf("%s: "format, __func__ ,## args) -#else -#define EFDEBUG(format, args...) -#endif - -#define EFERROR(format, args...) printf("%s: "format, __func__ ,## args) - -struct efnet { - struct ifnet *ef_ifp; - struct ifnet *ef_pifp; - int ef_frametype; -}; - -struct ef_link { - SLIST_ENTRY(ef_link) el_next; - struct ifnet *el_ifp; /* raw device for this clones */ - struct efnet *el_units[EF_NFT]; /* our clones */ -}; - -static SLIST_HEAD(ef_link_head, ef_link) efdev = {NULL}; -static int efcount; - -extern int (*ef_inputp)(struct ifnet*, struct ether_header *eh, struct mbuf *m); -extern int (*ef_outputp)(struct ifnet *ifp, struct mbuf **mp, - struct sockaddr *dst, short *tp, int *hlen); - -/* -static void ef_reset (struct ifnet *); -*/ -static int ef_attach(struct efnet *sc); -static int ef_detach(struct efnet *sc); -static void ef_init(void *); -static int ef_ioctl(struct ifnet *, u_long, caddr_t); -static void ef_start(struct ifnet *); -static int ef_input(struct ifnet*, struct ether_header *, struct mbuf *); -static int ef_output(struct ifnet *ifp, struct mbuf **mp, - struct sockaddr *dst, short *tp, int *hlen); - -static int ef_load(void); -static int ef_unload(void); - -/* - * Install the interface, most of structure initialization done in ef_clone() - */ -static int -ef_attach(struct efnet *sc) -{ - struct ifnet *ifp = sc->ef_ifp; - - ifp->if_start = ef_start; - ifp->if_init = ef_init; - ifp->if_snd.ifq_maxlen = ifqmaxlen; - ifp->if_flags = (IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); - /* - * Attach the interface - */ - ether_ifattach(ifp, IF_LLADDR(sc->ef_pifp)); - - ifp->if_resolvemulti = 0; - ifp->if_type = IFT_XETHER; - ifp->if_drv_flags |= IFF_DRV_RUNNING; - - EFDEBUG("%s: attached\n", ifp->if_xname); - return 1; -} - -/* - * This is for _testing_only_, just removes interface from interfaces list - */ -static int 
-ef_detach(struct efnet *sc) -{ - struct ifnet *ifp = sc->ef_ifp; - int s; - - s = splimp(); - - ether_ifdetach(ifp); - if_free(ifp); - - splx(s); - return 0; -} - -static void -ef_init(void *foo) { - return; -} - -static int -ef_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) -{ - struct efnet *sc = ifp->if_softc; - struct ifaddr *ifa = (struct ifaddr*)data; - int s, error; - - EFDEBUG("IOCTL %ld for %s\n", cmd, ifp->if_xname); - error = 0; - s = splimp(); - switch (cmd) { - case SIOCSIFFLAGS: - error = 0; - break; - case SIOCSIFADDR: - if (sc->ef_frametype == ETHER_FT_8023 && - ifa->ifa_addr->sa_family != AF_IPX) { - error = EAFNOSUPPORT; - break; - } - ifp->if_flags |= IFF_UP; - /* FALL THROUGH */ - default: - error = ether_ioctl(ifp, cmd, data); - break; - } - splx(s); - return error; -} - -/* - * Currently packet prepared in the ether_output(), but this can be a better - * place. - */ -static void -ef_start(struct ifnet *ifp) -{ - struct efnet *sc = (struct efnet*)ifp->if_softc; - struct ifnet *p; - struct mbuf *m; - int error; - - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - p = sc->ef_pifp; - - EFDEBUG("\n"); - for (;;) { - IF_DEQUEUE(&ifp->if_snd, m); - if (m == 0) - break; - BPF_MTAP(ifp, m); - error = p->if_transmit(p, m); - if (error) { - ifp->if_oerrors++; - continue; - } - ifp->if_opackets++; - } - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - return; -} - -/* - * Inline functions do not put additional overhead to procedure call or - * parameter passing but simplify the code - */ -static int __inline -ef_inputEII(struct mbuf *m, struct ether_header *eh, u_short ether_type) -{ - int isr; - - switch(ether_type) { -#ifdef IPX - case ETHERTYPE_IPX: - isr = NETISR_IPX; - break; -#endif -#ifdef INET - case ETHERTYPE_IP: - if ((m = ip_fastforward(m)) == NULL) - return (0); - isr = NETISR_IP; - break; - - case ETHERTYPE_ARP: - isr = NETISR_ARP; - break; -#endif - default: - return (EPROTONOSUPPORT); - } - netisr_dispatch(isr, m); - return (0); -} - -static int 
__inline -ef_inputSNAP(struct mbuf *m, struct ether_header *eh, struct llc* l, - u_short ether_type) -{ - int isr; - - switch(ether_type) { -#ifdef IPX - case ETHERTYPE_IPX: - m_adj(m, 8); - isr = NETISR_IPX; - break; -#endif - default: - return (EPROTONOSUPPORT); - } - netisr_dispatch(isr, m); - return (0); -} - -static int __inline -ef_input8022(struct mbuf *m, struct ether_header *eh, struct llc* l, - u_short ether_type) -{ - int isr; - - switch(ether_type) { -#ifdef IPX - case 0xe0: - m_adj(m, 3); - isr = NETISR_IPX; - break; -#endif - default: - return (EPROTONOSUPPORT); - } - netisr_dispatch(isr, m); - return (0); -} - -/* - * Called from ether_input() - */ -static int -ef_input(struct ifnet *ifp, struct ether_header *eh, struct mbuf *m) -{ - u_short ether_type; - int ft = -1; - struct efnet *efp; - struct ifnet *eifp; - struct llc *l; - struct ef_link *efl; - int isr; - - ether_type = ntohs(eh->ether_type); - l = NULL; - if (ether_type < ETHERMTU) { - l = mtod(m, struct llc*); - if (l->llc_dsap == 0xff && l->llc_ssap == 0xff) { - /* - * Novell's "802.3" frame - */ - ft = ETHER_FT_8023; - } else if (l->llc_dsap == 0xaa && l->llc_ssap == 0xaa) { - /* - * 802.2/SNAP - */ - ft = ETHER_FT_SNAP; - ether_type = ntohs(l->llc_un.type_snap.ether_type); - } else if (l->llc_dsap == l->llc_ssap) { - /* - * 802.3/802.2 - */ - ft = ETHER_FT_8022; - ether_type = l->llc_ssap; - } - } else - ft = ETHER_FT_EII; - - if (ft == -1) { - EFDEBUG("Unrecognised ether_type %x\n", ether_type); - return EPROTONOSUPPORT; - } - - /* - * Check if interface configured for the given frame - */ - efp = NULL; - SLIST_FOREACH(efl, &efdev, el_next) { - if (efl->el_ifp == ifp) { - efp = efl->el_units[ft]; - break; - } - } - if (efp == NULL) { - EFDEBUG("Can't find if for %d\n", ft); - return EPROTONOSUPPORT; - } - eifp = efp->ef_ifp; - if ((eifp->if_flags & IFF_UP) == 0) - return EPROTONOSUPPORT; - eifp->if_ibytes += m->m_pkthdr.len + sizeof (*eh); - m->m_pkthdr.rcvif = eifp; - - BPF_MTAP2(eifp, 
eh, ETHER_HDR_LEN, m); - /* - * Now we ready to adjust mbufs and pass them to protocol intr's - */ - switch(ft) { - case ETHER_FT_EII: - return (ef_inputEII(m, eh, ether_type)); -#ifdef IPX - case ETHER_FT_8023: /* only IPX can be here */ - isr = NETISR_IPX; - break; -#endif - case ETHER_FT_SNAP: - return (ef_inputSNAP(m, eh, l, ether_type)); - case ETHER_FT_8022: - return (ef_input8022(m, eh, l, ether_type)); - default: - EFDEBUG("No support for frame %d and proto %04x\n", - ft, ether_type); - return (EPROTONOSUPPORT); - } - netisr_dispatch(isr, m); - return (0); -} - -static int -ef_output(struct ifnet *ifp, struct mbuf **mp, struct sockaddr *dst, short *tp, - int *hlen) -{ - struct efnet *sc = (struct efnet*)ifp->if_softc; - struct mbuf *m = *mp; - u_char *cp; - short type; - - if (ifp->if_type != IFT_XETHER) - return ENETDOWN; - switch (sc->ef_frametype) { - case ETHER_FT_EII: -#ifdef IPX - type = htons(ETHERTYPE_IPX); -#else - return EPFNOSUPPORT; -#endif - break; - case ETHER_FT_8023: - type = htons(m->m_pkthdr.len); - break; - case ETHER_FT_8022: - M_PREPEND(m, ETHER_HDR_LEN + 3, M_WAIT); - /* - * Ensure that ethernet header and next three bytes - * will fit into single mbuf - */ - m = m_pullup(m, ETHER_HDR_LEN + 3); - if (m == NULL) { - *mp = NULL; - return ENOBUFS; - } - m_adj(m, ETHER_HDR_LEN); - type = htons(m->m_pkthdr.len); - cp = mtod(m, u_char *); - *cp++ = 0xE0; - *cp++ = 0xE0; - *cp++ = 0x03; - *hlen += 3; - break; - case ETHER_FT_SNAP: - M_PREPEND(m, 8, M_WAIT); - type = htons(m->m_pkthdr.len); - cp = mtod(m, u_char *); - bcopy("\xAA\xAA\x03\x00\x00\x00\x81\x37", cp, 8); - *hlen += 8; - break; - default: - return EPFNOSUPPORT; - } - *mp = m; - *tp = type; - return 0; -} - -/* - * Create clone from the given interface - */ -static int -ef_clone(struct ef_link *efl, int ft) -{ - struct efnet *efp; - struct ifnet *eifp; - struct ifnet *ifp = efl->el_ifp; - - efp = (struct efnet*)malloc(sizeof(struct efnet), M_IFADDR, - M_WAITOK | M_ZERO); - if (efp 
== NULL) - return ENOMEM; - efp->ef_pifp = ifp; - efp->ef_frametype = ft; - eifp = efp->ef_ifp = if_alloc(IFT_ETHER); - if (eifp == NULL) { - free(efp, M_IFADDR); - return (ENOSPC); - } - snprintf(eifp->if_xname, IFNAMSIZ, - "%sf%d", ifp->if_xname, efp->ef_frametype); - eifp->if_dname = "ef"; - eifp->if_dunit = IF_DUNIT_NONE; - eifp->if_softc = efp; - if (ifp->if_ioctl) - eifp->if_ioctl = ef_ioctl; - efl->el_units[ft] = efp; - return 0; -} - -static int -ef_load(void) -{ - VNET_ITERATOR_DECL(vnet_iter); - struct ifnet *ifp; - struct efnet *efp; - struct ef_link *efl = NULL, *efl_temp; - int error = 0, d; - - VNET_LIST_RLOCK(); - VNET_FOREACH(vnet_iter) { - CURVNET_SET(vnet_iter); - - /* - * XXXRW: The following loop walks the ifnet list while - * modifying it, something not well-supported by ifnet - * locking. To avoid lock upgrade/recursion issues, manually - * acquire a write lock of ifnet_sxlock here, rather than a - * read lock, so that when if_alloc() recurses the lock, we - * don't panic. This structure, in which if_ef automatically - * attaches to all ethernet interfaces, should be replaced - * with a model like that found in if_vlan, in which - * interfaces are explicitly configured, which would avoid - * this (and other) problems. 
- */ - sx_xlock(&ifnet_sxlock); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { - if (ifp->if_type != IFT_ETHER) continue; - EFDEBUG("Found interface %s\n", ifp->if_xname); - efl = (struct ef_link*)malloc(sizeof(struct ef_link), - M_IFADDR, M_WAITOK | M_ZERO); - if (efl == NULL) { - error = ENOMEM; - break; - } - - efl->el_ifp = ifp; -#ifdef ETHER_II - error = ef_clone(efl, ETHER_FT_EII); - if (error) break; -#endif -#ifdef ETHER_8023 - error = ef_clone(efl, ETHER_FT_8023); - if (error) break; -#endif -#ifdef ETHER_8022 - error = ef_clone(efl, ETHER_FT_8022); - if (error) break; -#endif -#ifdef ETHER_SNAP - error = ef_clone(efl, ETHER_FT_SNAP); - if (error) break; -#endif - efcount++; - SLIST_INSERT_HEAD(&efdev, efl, el_next); - } - sx_xunlock(&ifnet_sxlock); - CURVNET_RESTORE(); - } - VNET_LIST_RUNLOCK(); - if (error) { - if (efl) - SLIST_INSERT_HEAD(&efdev, efl, el_next); - SLIST_FOREACH_SAFE(efl, &efdev, el_next, efl_temp) { - for (d = 0; d < EF_NFT; d++) - if (efl->el_units[d]) { - if (efl->el_units[d]->ef_pifp != NULL) - if_free(efl->el_units[d]->ef_pifp); - free(efl->el_units[d], M_IFADDR); - } - free(efl, M_IFADDR); - } - return error; - } - SLIST_FOREACH(efl, &efdev, el_next) { - for (d = 0; d < EF_NFT; d++) { - efp = efl->el_units[d]; - if (efp) - ef_attach(efp); - } - } - ef_inputp = ef_input; - ef_outputp = ef_output; - EFDEBUG("Loaded\n"); - return 0; -} - -static int -ef_unload(void) -{ - struct efnet *efp; - struct ef_link *efl; - int d; - - ef_inputp = NULL; - ef_outputp = NULL; - SLIST_FOREACH(efl, &efdev, el_next) { - for (d = 0; d < EF_NFT; d++) { - efp = efl->el_units[d]; - if (efp) { - ef_detach(efp); - } - } - } - EFDEBUG("Unloaded\n"); - return 0; -} - -static int -if_ef_modevent(module_t mod, int type, void *data) -{ - switch ((modeventtype_t)type) { - case MOD_LOAD: - return ef_load(); - case MOD_UNLOAD: - return ef_unload(); - default: - return EOPNOTSUPP; - } - return 0; -} - -static moduledata_t if_ef_mod = { - "if_ef", if_ef_modevent, NULL -}; 
- -DECLARE_MODULE(if_ef, if_ef_mod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE); diff --git a/freebsd/sys/net/if_enc.c b/freebsd/sys/net/if_enc.c index 91d34722..d0d065b8 100644 --- a/freebsd/sys/net/if_enc.c +++ b/freebsd/sys/net/if_enc.c @@ -2,6 +2,7 @@ /*- * Copyright (c) 2006 The FreeBSD Project. + * Copyright (c) 2015 Andrey V. Elsukov <ae@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,10 +32,10 @@ #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> -#include <rtems/bsd/local/opt_enc.h> #include <rtems/bsd/sys/param.h> #include <sys/systm.h> +#include <sys/hhook.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/mbuf.h> @@ -46,6 +47,8 @@ #include <sys/sysctl.h> #include <net/if.h> +#include <net/if_enc.h> +#include <net/if_var.h> #include <net/if_clone.h> #include <net/if_types.h> #include <net/pfil.h> @@ -80,56 +83,66 @@ struct enchdr { u_int32_t spi; u_int32_t flags; }; - -struct ifnet *encif; -static struct mtx enc_mtx; - struct enc_softc { struct ifnet *sc_ifp; }; +static VNET_DEFINE(struct enc_softc *, enc_sc); +#define V_enc_sc VNET(enc_sc) +static VNET_DEFINE(struct if_clone *, enc_cloner); +#define V_enc_cloner VNET(enc_cloner) static int enc_ioctl(struct ifnet *, u_long, caddr_t); -static int enc_output(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *dst, struct route *ro); +static int enc_output(struct ifnet *, struct mbuf *, + const struct sockaddr *, struct route *); static int enc_clone_create(struct if_clone *, int, caddr_t); static void enc_clone_destroy(struct ifnet *); +static int enc_add_hhooks(struct enc_softc *); +static void enc_remove_hhooks(struct enc_softc *); -IFC_SIMPLE_DECLARE(enc, 1); - -/* - * Sysctls. - */ +static const char encname[] = "enc"; /* * Before and after are relative to when we are stripping the * outer IP header. 
*/ -static SYSCTL_NODE(_net, OID_AUTO, enc, CTLFLAG_RW, 0, "enc sysctl"); +static VNET_DEFINE(int, filter_mask_in) = IPSEC_ENC_BEFORE; +static VNET_DEFINE(int, bpf_mask_in) = IPSEC_ENC_BEFORE; +static VNET_DEFINE(int, filter_mask_out) = IPSEC_ENC_BEFORE; +static VNET_DEFINE(int, bpf_mask_out) = IPSEC_ENC_BEFORE | IPSEC_ENC_AFTER; +#define V_filter_mask_in VNET(filter_mask_in) +#define V_bpf_mask_in VNET(bpf_mask_in) +#define V_filter_mask_out VNET(filter_mask_out) +#define V_bpf_mask_out VNET(bpf_mask_out) +static SYSCTL_NODE(_net, OID_AUTO, enc, CTLFLAG_RW, 0, "enc sysctl"); static SYSCTL_NODE(_net_enc, OID_AUTO, in, CTLFLAG_RW, 0, "enc input sysctl"); -static int ipsec_filter_mask_in = ENC_BEFORE; -SYSCTL_INT(_net_enc_in, OID_AUTO, ipsec_filter_mask, CTLFLAG_RW, - &ipsec_filter_mask_in, 0, "IPsec input firewall filter mask"); -static int ipsec_bpf_mask_in = ENC_BEFORE; -SYSCTL_INT(_net_enc_in, OID_AUTO, ipsec_bpf_mask, CTLFLAG_RW, - &ipsec_bpf_mask_in, 0, "IPsec input bpf mask"); - static SYSCTL_NODE(_net_enc, OID_AUTO, out, CTLFLAG_RW, 0, "enc output sysctl"); -static int ipsec_filter_mask_out = ENC_BEFORE; -SYSCTL_INT(_net_enc_out, OID_AUTO, ipsec_filter_mask, CTLFLAG_RW, - &ipsec_filter_mask_out, 0, "IPsec output firewall filter mask"); -static int ipsec_bpf_mask_out = ENC_BEFORE|ENC_AFTER; -SYSCTL_INT(_net_enc_out, OID_AUTO, ipsec_bpf_mask, CTLFLAG_RW, - &ipsec_bpf_mask_out, 0, "IPsec output bpf mask"); +SYSCTL_INT(_net_enc_in, OID_AUTO, ipsec_filter_mask, + CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(filter_mask_in), 0, + "IPsec input firewall filter mask"); +SYSCTL_INT(_net_enc_in, OID_AUTO, ipsec_bpf_mask, + CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bpf_mask_in), 0, + "IPsec input bpf mask"); +SYSCTL_INT(_net_enc_out, OID_AUTO, ipsec_filter_mask, + CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(filter_mask_out), 0, + "IPsec output firewall filter mask"); +SYSCTL_INT(_net_enc_out, OID_AUTO, ipsec_bpf_mask, + CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(bpf_mask_out), 0, + "IPsec 
output bpf mask"); static void enc_clone_destroy(struct ifnet *ifp) { - KASSERT(ifp != encif, ("%s: destroying encif", __func__)); + struct enc_softc *sc; + + sc = ifp->if_softc; + KASSERT(sc == V_enc_sc, ("sc != ifp->if_softc")); bpfdetach(ifp); if_detach(ifp); if_free(ifp); + free(sc, M_DEVBUF); + V_enc_sc = NULL; } static int @@ -138,244 +151,277 @@ enc_clone_create(struct if_clone *ifc, int unit, caddr_t params) struct ifnet *ifp; struct enc_softc *sc; - sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); + sc = malloc(sizeof(struct enc_softc), M_DEVBUF, + M_WAITOK | M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ENC); if (ifp == NULL) { free(sc, M_DEVBUF); return (ENOSPC); } - - if_initname(ifp, ifc->ifc_name, unit); + if (V_enc_sc != NULL) { + if_free(ifp); + free(sc, M_DEVBUF); + return (EEXIST); + } + V_enc_sc = sc; + if_initname(ifp, encname, unit); ifp->if_mtu = ENCMTU; ifp->if_ioctl = enc_ioctl; ifp->if_output = enc_output; - ifp->if_snd.ifq_maxlen = ifqmaxlen; ifp->if_softc = sc; if_attach(ifp); bpfattach(ifp, DLT_ENC, sizeof(struct enchdr)); - - mtx_lock(&enc_mtx); - /* grab a pointer to enc0, ignore the rest */ - if (encif == NULL) - encif = ifp; - mtx_unlock(&enc_mtx); - return (0); } static int -enc_modevent(module_t mod, int type, void *data) +enc_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, + struct route *ro) { - switch (type) { - case MOD_LOAD: - mtx_init(&enc_mtx, "enc mtx", NULL, MTX_DEF); - if_clone_attach(&enc_cloner); - break; - case MOD_UNLOAD: - printf("enc module unload - not possible for this module\n"); - return (EINVAL); - default: - return (EOPNOTSUPP); - } + + m_freem(m); return (0); } -static moduledata_t enc_mod = { - "if_enc", - enc_modevent, - 0 -}; - -DECLARE_MODULE(if_enc, enc_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); - static int -enc_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, - struct route *ro) +enc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { - m_freem(m); + + if (cmd 
!= SIOCSIFFLAGS) + return (EINVAL); + if (ifp->if_flags & IFF_UP) + ifp->if_drv_flags |= IFF_DRV_RUNNING; + else + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; return (0); } /* - * Process an ioctl request. + * One helper hook function is used by any hook points. + * + from hhook_type we can determine the packet direction: + * HHOOK_TYPE_IPSEC_IN or HHOOK_TYPE_IPSEC_OUT; + * + from hhook_id we can determine address family: AF_INET or AF_INET6; + * + udata contains pointer to enc_softc; + * + ctx_data contains pointer to struct ipsec_ctx_data. */ -/* ARGSUSED */ static int -enc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +enc_hhook(int32_t hhook_type, int32_t hhook_id, void *udata, void *ctx_data, + void *hdata, struct osd *hosd) { - int error = 0; - - mtx_lock(&enc_mtx); + struct enchdr hdr; + struct ipsec_ctx_data *ctx; + struct enc_softc *sc; + struct ifnet *ifp, *rcvif; + struct pfil_head *ph; + int pdir; - switch (cmd) { + sc = (struct enc_softc *)udata; + ifp = sc->sc_ifp; + if ((ifp->if_flags & IFF_UP) == 0) + return (0); - case SIOCSIFFLAGS: - if (ifp->if_flags & IFF_UP) - ifp->if_drv_flags |= IFF_DRV_RUNNING; - else - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + ctx = (struct ipsec_ctx_data *)ctx_data; + /* XXX: wrong hook point was used by caller? 
*/ + if (ctx->af != hhook_id) + return (EPFNOSUPPORT); + + if (((hhook_type == HHOOK_TYPE_IPSEC_IN && + (ctx->enc & V_bpf_mask_in) != 0) || + (hhook_type == HHOOK_TYPE_IPSEC_OUT && + (ctx->enc & V_bpf_mask_out) != 0)) && + bpf_peers_present(ifp->if_bpf) != 0) { + hdr.af = ctx->af; + hdr.spi = ctx->sav->spi; + hdr.flags = 0; + if (ctx->sav->alg_enc != SADB_EALG_NONE) + hdr.flags |= M_CONF; + if (ctx->sav->alg_auth != SADB_AALG_NONE) + hdr.flags |= M_AUTH; + bpf_mtap2(ifp->if_bpf, &hdr, sizeof(hdr), *ctx->mp); + } + switch (hhook_type) { + case HHOOK_TYPE_IPSEC_IN: + if (ctx->enc == IPSEC_ENC_BEFORE) { + /* Do accounting only once */ + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_IBYTES, + (*ctx->mp)->m_pkthdr.len); + } + if ((ctx->enc & V_filter_mask_in) == 0) + return (0); /* skip pfil processing */ + pdir = PFIL_IN; + break; + case HHOOK_TYPE_IPSEC_OUT: + if (ctx->enc == IPSEC_ENC_BEFORE) { + /* Do accounting only once */ + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_OBYTES, + (*ctx->mp)->m_pkthdr.len); + } + if ((ctx->enc & V_filter_mask_out) == 0) + return (0); /* skip pfil processing */ + pdir = PFIL_OUT; break; - default: - error = EINVAL; + return (EINVAL); } - mtx_unlock(&enc_mtx); - return (error); + switch (hhook_id) { +#ifdef INET + case AF_INET: + ph = &V_inet_pfil_hook; + break; +#endif +#ifdef INET6 + case AF_INET6: + ph = &V_inet6_pfil_hook; + break; +#endif + default: + ph = NULL; + } + if (ph == NULL || !PFIL_HOOKED(ph)) + return (0); + /* Make a packet looks like it was received on enc(4) */ + rcvif = (*ctx->mp)->m_pkthdr.rcvif; + (*ctx->mp)->m_pkthdr.rcvif = ifp; + if (pfil_run_hooks(ph, ctx->mp, ifp, pdir, NULL) != 0 || + *ctx->mp == NULL) { + *ctx->mp = NULL; /* consumed by filter */ + return (EACCES); + } + (*ctx->mp)->m_pkthdr.rcvif = rcvif; + return (0); } -int -ipsec_filter(struct mbuf **mp, int dir, int flags) +static int +enc_add_hhooks(struct enc_softc *sc) { - int error, i; - 
struct ip *ip; - - KASSERT(encif != NULL, ("%s: encif is null", __func__)); - KASSERT(flags & (ENC_IN|ENC_OUT), - ("%s: invalid flags: %04x", __func__, flags)); - - if ((encif->if_drv_flags & IFF_DRV_RUNNING) == 0) - return (0); + struct hookinfo hki; + int error; - if (flags & ENC_IN) { - if ((flags & ipsec_filter_mask_in) == 0) - return (0); - } else { - if ((flags & ipsec_filter_mask_out) == 0) - return (0); - } - - /* Skip pfil(9) if no filters are loaded */ - if (1 + error = EPFNOSUPPORT; + hki.hook_func = enc_hhook; + hki.hook_helper = NULL; + hki.hook_udata = sc; #ifdef INET - && !PFIL_HOOKED(&V_inet_pfil_hook) + hki.hook_id = AF_INET; + hki.hook_type = HHOOK_TYPE_IPSEC_IN; + error = hhook_add_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET], + &hki, HHOOK_WAITOK); + if (error != 0) + return (error); + hki.hook_type = HHOOK_TYPE_IPSEC_OUT; + error = hhook_add_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET], + &hki, HHOOK_WAITOK); + if (error != 0) + return (error); #endif #ifdef INET6 - && !PFIL_HOOKED(&V_inet6_pfil_hook) + hki.hook_id = AF_INET6; + hki.hook_type = HHOOK_TYPE_IPSEC_IN; + error = hhook_add_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET6], + &hki, HHOOK_WAITOK); + if (error != 0) + return (error); + hki.hook_type = HHOOK_TYPE_IPSEC_OUT; + error = hhook_add_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET6], + &hki, HHOOK_WAITOK); + if (error != 0) + return (error); #endif - ) { - return (0); - } + return (error); +} - i = min((*mp)->m_pkthdr.len, max_protohdr); - if ((*mp)->m_len < i) { - *mp = m_pullup(*mp, i); - if (*mp == NULL) { - printf("%s: m_pullup failed\n", __func__); - return (-1); - } - } +static void +enc_remove_hhooks(struct enc_softc *sc) +{ + struct hookinfo hki; - error = 0; - ip = mtod(*mp, struct ip *); - switch (ip->ip_v) { + hki.hook_func = enc_hhook; + hki.hook_helper = NULL; + hki.hook_udata = sc; #ifdef INET - case 4: - /* - * before calling the firewall, swap fields the same as - * IP does. 
here we assume the header is contiguous - */ - ip->ip_len = ntohs(ip->ip_len); - ip->ip_off = ntohs(ip->ip_off); - - error = pfil_run_hooks(&V_inet_pfil_hook, mp, - encif, dir, NULL); - - if (*mp == NULL || error != 0) - break; - - /* restore byte ordering */ - ip = mtod(*mp, struct ip *); - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); - break; + hki.hook_id = AF_INET; + hki.hook_type = HHOOK_TYPE_IPSEC_IN; + hhook_remove_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET], &hki); + hki.hook_type = HHOOK_TYPE_IPSEC_OUT; + hhook_remove_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET], &hki); #endif #ifdef INET6 - case 6: - error = pfil_run_hooks(&V_inet6_pfil_hook, mp, - encif, dir, NULL); - break; + hki.hook_id = AF_INET6; + hki.hook_type = HHOOK_TYPE_IPSEC_IN; + hhook_remove_hook(V_ipsec_hhh_in[HHOOK_IPSEC_INET6], &hki); + hki.hook_type = HHOOK_TYPE_IPSEC_OUT; + hhook_remove_hook(V_ipsec_hhh_out[HHOOK_IPSEC_INET6], &hki); #endif - default: - printf("%s: unknown IP version\n", __func__); - } +} - /* - * If the mbuf was consumed by the filter for requeueing (dummynet, etc) - * then error will be zero but we still want to return an error to our - * caller so the null mbuf isn't forwarded further. 
- */ - if (*mp == NULL && error == 0) - return (-1); /* Consumed by the filter */ - if (*mp == NULL) - return (error); - if (error != 0) - goto bad; +static void +vnet_enc_init(const void *unused __unused) +{ - return (error); + V_enc_sc = NULL; + V_enc_cloner = if_clone_simple(encname, enc_clone_create, + enc_clone_destroy, 1); +} +VNET_SYSINIT(vnet_enc_init, SI_SUB_PSEUDO, SI_ORDER_ANY, + vnet_enc_init, NULL); -bad: - m_freem(*mp); - *mp = NULL; - return (error); +static void +vnet_enc_init_proto(void *unused __unused) +{ + KASSERT(V_enc_sc != NULL, ("%s: V_enc_sc is %p\n", __func__, V_enc_sc)); + + if (enc_add_hhooks(V_enc_sc) != 0) + enc_clone_destroy(V_enc_sc->sc_ifp); } +VNET_SYSINIT(vnet_enc_init_proto, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_enc_init_proto, NULL); -void -ipsec_bpf(struct mbuf *m, struct secasvar *sav, int af, int flags) +static void +vnet_enc_uninit(const void *unused __unused) { - int mflags; - struct enchdr hdr; + KASSERT(V_enc_sc != NULL, ("%s: V_enc_sc is %p\n", __func__, V_enc_sc)); - KASSERT(encif != NULL, ("%s: encif is null", __func__)); - KASSERT(flags & (ENC_IN|ENC_OUT), - ("%s: invalid flags: %04x", __func__, flags)); + if_clone_detach(V_enc_cloner); +} +VNET_SYSUNINIT(vnet_enc_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, + vnet_enc_uninit, NULL); - if ((encif->if_drv_flags & IFF_DRV_RUNNING) == 0) - return; +/* + * The hhook consumer needs to go before ip[6]_destroy are called on + * SI_ORDER_THIRD. 
+ */ +static void +vnet_enc_uninit_hhook(const void *unused __unused) +{ + KASSERT(V_enc_sc != NULL, ("%s: V_enc_sc is %p\n", __func__, V_enc_sc)); - if (flags & ENC_IN) { - if ((flags & ipsec_bpf_mask_in) == 0) - return; - } else { - if ((flags & ipsec_bpf_mask_out) == 0) - return; - } + enc_remove_hhooks(V_enc_sc); +} +VNET_SYSUNINIT(vnet_enc_uninit_hhook, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, + vnet_enc_uninit_hhook, NULL); - if (bpf_peers_present(encif->if_bpf)) { - mflags = 0; - hdr.spi = 0; - if (!sav) { - struct m_tag *mtag; - mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); - if (mtag != NULL) { - struct tdb_ident *tdbi; - tdbi = (struct tdb_ident *) (mtag + 1); - if (tdbi->alg_enc != SADB_EALG_NONE) - mflags |= M_CONF; - if (tdbi->alg_auth != SADB_AALG_NONE) - mflags |= M_AUTH; - hdr.spi = tdbi->spi; - } - } else { - if (sav->alg_enc != SADB_EALG_NONE) - mflags |= M_CONF; - if (sav->alg_auth != SADB_AALG_NONE) - mflags |= M_AUTH; - hdr.spi = sav->spi; - } +static int +enc_modevent(module_t mod, int type, void *data) +{ - /* - * We need to prepend the address family as a four byte - * field. Cons up a dummy header to pacify bpf. This - * is safe because bpf will only read from the mbuf - * (i.e., it won't try to free it or keep a pointer a - * to it). 
- */ - hdr.af = af; - /* hdr.spi already set above */ - hdr.flags = mflags; - - bpf_mtap2(encif->if_bpf, &hdr, sizeof(hdr), m); + switch (type) { + case MOD_LOAD: + case MOD_UNLOAD: + break; + default: + return (EOPNOTSUPP); } + return (0); } + +static moduledata_t enc_mod = { + "if_enc", + enc_modevent, + 0 +}; + +DECLARE_MODULE(if_enc, enc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/freebsd/sys/net/if_enc.h b/freebsd/sys/net/if_enc.h index 59a55fcf..941ed12a 100644 --- a/freebsd/sys/net/if_enc.h +++ b/freebsd/sys/net/if_enc.h @@ -30,6 +30,13 @@ #ifndef _NET_IF_ENC_H #define _NET_IF_ENC_H -extern struct ifnet *encif; +struct ipsec_ctx_data { + struct mbuf **mp; + struct secasvar *sav; + uint8_t af; +#define IPSEC_ENC_BEFORE 0x01 +#define IPSEC_ENC_AFTER 0x02 + uint8_t enc; +}; #endif /* _NET_IF_ENC_H */ diff --git a/freebsd/sys/net/if_epair.c b/freebsd/sys/net/if_epair.c index 755e608a..b4f73d68 100644 --- a/freebsd/sys/net/if_epair.c +++ b/freebsd/sys/net/if_epair.c @@ -54,6 +54,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/sys/param.h> #include <sys/kernel.h> +#include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/module.h> #include <sys/refcount.h> @@ -67,6 +68,7 @@ __FBSDID("$FreeBSD$"); #include <net/bpf.h> #include <net/ethernet.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_clone.h> #include <net/if_media.h> #include <net/if_var.h> @@ -74,8 +76,6 @@ __FBSDID("$FreeBSD$"); #include <net/netisr.h> #include <net/vnet.h> -#define EPAIRNAME "epair" - SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, OID_AUTO, epair, CTLFLAG_RW, 0, "epair sysctl"); @@ -102,9 +102,11 @@ static int epair_clone_match(struct if_clone *, const char *); static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t); static int epair_clone_destroy(struct if_clone *, struct ifnet *); -/* Netisr realted definitions and sysctl. */ +static const char epairname[] = "epair"; + +/* Netisr related definitions and sysctl. 
*/ static struct netisr_handler epair_nh = { - .nh_name = EPAIRNAME, + .nh_name = epairname, .nh_proto = NETISR_EPAIR, .nh_policy = NETISR_POLICY_CPU, .nh_handler = epair_nh_sintr, @@ -170,12 +172,11 @@ STAILQ_HEAD(eid_list, epair_ifp_drain); #define EPAIR_REFCOUNT_ASSERT(a, p) #endif -static MALLOC_DEFINE(M_EPAIR, EPAIRNAME, +static MALLOC_DEFINE(M_EPAIR, epairname, "Pair of virtual cross-over connected Ethernet-like interfaces"); -static struct if_clone epair_cloner = IFC_CLONE_INITIALIZER( - EPAIRNAME, NULL, IF_MAXUNIT, - NULL, epair_clone_match, epair_clone_create, epair_clone_destroy); +static VNET_DEFINE(struct if_clone *, epair_cloner); +#define V_epair_cloner VNET(epair_cloner) /* * DPCPU area and functions. @@ -421,7 +422,7 @@ epair_start_locked(struct ifnet *ifp) */ if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (oifp->if_flags & IFF_UP) ==0) { - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); continue; } @@ -437,15 +438,15 @@ epair_start_locked(struct ifnet *ifp) error = netisr_queue(NETISR_EPAIR, m); CURVNET_RESTORE(); if (!error) { - ifp->if_opackets++; + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* Someone else received the packet. */ - oifp->if_ipackets++; + if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1); } else { /* The packet was freed already. 
*/ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; ifp->if_drv_flags |= IFF_DRV_OACTIVE; (void) epair_add_ifp_for_draining(ifp); - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", @@ -506,7 +507,7 @@ epair_transmit_locked(struct ifnet *ifp, struct mbuf *m) oifp = sc->oifp; if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (oifp->if_flags & IFF_UP) ==0) { - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (0); } @@ -515,17 +516,17 @@ epair_transmit_locked(struct ifnet *ifp, struct mbuf *m) DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname); #ifdef ALTQ - /* Support ALTQ via the clasic if_start() path. */ + /* Support ALTQ via the classic if_start() path. */ IF_LOCK(&ifp->if_snd); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { ALTQ_ENQUEUE(&ifp->if_snd, m, NULL, error); if (error) - ifp->if_snd.ifq_drops++; + if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); IF_UNLOCK(&ifp->if_snd); if (!error) { - ifp->if_obytes += len; + if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mflags & (M_BCAST|M_MCAST)) - ifp->if_omcasts++; + if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) epair_start_locked(ifp); @@ -559,22 +560,22 @@ epair_transmit_locked(struct ifnet *ifp, struct mbuf *m) error = netisr_queue(NETISR_EPAIR, m); CURVNET_RESTORE(); if (!error) { - ifp->if_opackets++; + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* * IFQ_HANDOFF_ADJ/ip_handoff() update statistics, * but as we bypass all this we have to duplicate * the logic another time. */ - ifp->if_obytes += len; + if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if (mflags & (M_BCAST|M_MCAST)) - ifp->if_omcasts++; + if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); /* Someone else received the packet. */ - oifp->if_ipackets++; + if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1); } else { /* The packet was freed already. 
*/ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE; ifp->if_drv_flags |= IFF_DRV_OACTIVE; - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); EPAIR_REFCOUNT_RELEASE(&sc->refcount); EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1, ("%s: ifp=%p sc->refcount not >= 1: %d", @@ -694,10 +695,10 @@ epair_clone_match(struct if_clone *ifc, const char *name) * - epair<n> * but not the epair<n>[ab] versions. */ - if (strncmp(EPAIRNAME, name, sizeof(EPAIRNAME)-1) != 0) + if (strncmp(epairname, name, sizeof(epairname)-1) != 0) return (0); - for (cp = name + sizeof(EPAIRNAME) - 1; *cp != '\0'; cp++) { + for (cp = name + sizeof(epairname) - 1; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } @@ -716,7 +717,7 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) /* * We are abusing params to create our second interface. - * Actually we already created it and called if_clone_createif() + * Actually we already created it and called if_clone_create() * for it to do the official insertion procedure the moment we knew * it cannot fail anymore. So just do attach it here. */ @@ -763,10 +764,17 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) ifc_free_unit(ifc, unit); return (ENOSPC); } - *dp = 'a'; + *dp = 'b'; /* Must not change dp so we can replace 'a' by 'b' later. */ *(dp+1) = '\0'; + /* Check if 'a' and 'b' interfaces already exist. */ + if (ifunit(name) != NULL) + return (EEXIST); + *dp = 'a'; + if (ifunit(name) != NULL) + return (EEXIST); + /* Allocate memory for both [ab] interfaces */ sca = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO); EPAIR_REFCOUNT_INIT(&sca->refcount, 1); @@ -801,15 +809,23 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) * cache locality but we can at least allow parallelism. 
*/ sca->cpuid = - netisr_get_cpuid(sca->ifp->if_index % netisr_get_cpucount()); + netisr_get_cpuid(sca->ifp->if_index); scb->cpuid = - netisr_get_cpuid(scb->ifp->if_index % netisr_get_cpucount()); + netisr_get_cpuid(scb->ifp->if_index); + + /* Initialise pseudo media types. */ + ifmedia_init(&sca->media, 0, epair_media_change, epair_media_status); + ifmedia_add(&sca->media, IFM_ETHER | IFM_10G_T, 0, NULL); + ifmedia_set(&sca->media, IFM_ETHER | IFM_10G_T); + ifmedia_init(&scb->media, 0, epair_media_change, epair_media_status); + ifmedia_add(&scb->media, IFM_ETHER | IFM_10G_T, 0, NULL); + ifmedia_set(&scb->media, IFM_ETHER | IFM_10G_T); /* Finish initialization of interface <n>a. */ ifp = sca->ifp; ifp->if_softc = sca; strlcpy(ifp->if_xname, name, IFNAMSIZ); - ifp->if_dname = ifc->ifc_name; + ifp->if_dname = epairname; ifp->if_dunit = unit; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_capabilities = IFCAP_VLAN_MTU; @@ -827,7 +843,7 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) sca->if_qflush = ifp->if_qflush; ifp->if_qflush = epair_qflush; ifp->if_transmit = epair_transmit; - ifp->if_baudrate = IF_Gbps(10UL); /* arbitrary maximum */ + ifp->if_baudrate = IF_Gbps(10); /* arbitrary maximum */ /* Swap the name and finish initialization of interface <n>b. */ *dp = 'b'; @@ -835,7 +851,7 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) ifp = scb->ifp; ifp->if_softc = scb; strlcpy(ifp->if_xname, name, IFNAMSIZ); - ifp->if_dname = ifc->ifc_name; + ifp->if_dname = epairname; ifp->if_dunit = unit; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_capabilities = IFCAP_VLAN_MTU; @@ -845,15 +861,15 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) ifp->if_init = epair_init; ifp->if_snd.ifq_maxlen = ifqmaxlen; /* We need to play some tricks here for the second interface. 
*/ - strlcpy(name, EPAIRNAME, len); + strlcpy(name, epairname, len); error = if_clone_create(name, len, (caddr_t)scb); if (error) - panic("%s: if_clone_createif() for our 2nd iface failed: %d", + panic("%s: if_clone_create() for our 2nd iface failed: %d", __func__, error); scb->if_qflush = ifp->if_qflush; ifp->if_qflush = epair_qflush; ifp->if_transmit = epair_transmit; - ifp->if_baudrate = IF_Gbps(10UL); /* arbitrary maximum */ + ifp->if_baudrate = IF_Gbps(10); /* arbitrary maximum */ /* * Restore name to <n>a as the ifp for this will go into the @@ -862,14 +878,6 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) strlcpy(name, sca->ifp->if_xname, len); DPRINTF("name='%s/%db' created sca=%p scb=%p\n", name, unit, sca, scb); - /* Initialise pseudo media types. */ - ifmedia_init(&sca->media, 0, epair_media_change, epair_media_status); - ifmedia_add(&sca->media, IFM_ETHER | IFM_10G_T, 0, NULL); - ifmedia_set(&sca->media, IFM_ETHER | IFM_10G_T); - ifmedia_init(&scb->media, 0, epair_media_change, epair_media_status); - ifmedia_add(&scb->media, IFM_ETHER | IFM_10G_T, 0, NULL); - ifmedia_set(&scb->media, IFM_ETHER | IFM_10G_T); - /* Tell the world, that we are ready to rock. 
*/ sca->ifp->if_drv_flags |= IFF_DRV_RUNNING; scb->ifp->if_drv_flags |= IFF_DRV_RUNNING; @@ -947,6 +955,31 @@ epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) return (0); } +static void +vnet_epair_init(const void *unused __unused) +{ + + V_epair_cloner = if_clone_advanced(epairname, 0, + epair_clone_match, epair_clone_create, epair_clone_destroy); +#ifdef VIMAGE + netisr_register_vnet(&epair_nh); +#endif +} +VNET_SYSINIT(vnet_epair_init, SI_SUB_PSEUDO, SI_ORDER_ANY, + vnet_epair_init, NULL); + +static void +vnet_epair_uninit(const void *unused __unused) +{ + +#ifdef VIMAGE + netisr_unregister_vnet(&epair_nh); +#endif + if_clone_detach(V_epair_cloner); +} +VNET_SYSUNINIT(vnet_epair_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, + vnet_epair_uninit, NULL); + static int epair_modevent(module_t mod, int type, void *data) { @@ -962,16 +995,14 @@ epair_modevent(module_t mod, int type, void *data) epair_nh.nh_qlimit = qlimit; #endif /* __rtems__ */ netisr_register(&epair_nh); - if_clone_attach(&epair_cloner); if (bootverbose) - printf("%s initialized.\n", EPAIRNAME); + printf("%s initialized.\n", epairname); break; case MOD_UNLOAD: - if_clone_detach(&epair_cloner); netisr_unregister(&epair_nh); epair_dpcpu_detach(); if (bootverbose) - printf("%s unloaded.\n", EPAIRNAME); + printf("%s unloaded.\n", epairname); break; default: return (EOPNOTSUPP); @@ -985,5 +1016,5 @@ static moduledata_t epair_mod = { 0 }; -DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE); MODULE_VERSION(if_epair, 1); diff --git a/freebsd/sys/net/if_ethersubr.c b/freebsd/sys/net/if_ethersubr.c index 5ee2606e..1d22c0a6 100644 --- a/freebsd/sys/net/if_ethersubr.c +++ b/freebsd/sys/net/if_ethersubr.c @@ -32,12 +32,11 @@ * $FreeBSD$ */ -#include <rtems/bsd/local/opt_atalk.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> -#include <rtems/bsd/local/opt_ipx.h> #include 
<rtems/bsd/local/opt_netgraph.h> #include <rtems/bsd/local/opt_mbuf_profiling.h> +#include <rtems/bsd/local/opt_rss.h> #include <rtems/bsd/sys/param.h> #include <sys/systm.h> @@ -47,12 +46,13 @@ #include <sys/module.h> #include <sys/mbuf.h> #include <sys/random.h> -#include <sys/rwlock.h> #include <sys/socket.h> #include <sys/sockio.h> #include <sys/sysctl.h> +#include <sys/uuid.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_arp.h> #include <net/netisr.h> #include <net/route.h> @@ -64,43 +64,22 @@ #include <net/if_bridgevar.h> #include <net/if_vlan_var.h> #include <net/if_llatbl.h> -#include <net/pf_mtag.h> +#include <net/pfil.h> +#include <net/rss_config.h> #include <net/vnet.h> +#include <netpfil/pf/pf_mtag.h> + #if defined(INET) || defined(INET6) #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/if_ether.h> #include <netinet/ip_carp.h> #include <netinet/ip_var.h> -#include <netinet/ip_fw.h> -#include <netpfil/ipfw/ip_fw_private.h> #endif #ifdef INET6 #include <netinet6/nd6.h> #endif - -#ifdef IPX -#include <netipx/ipx.h> -#include <netipx/ipx_if.h> -#endif - -int (*ef_inputp)(struct ifnet*, struct ether_header *eh, struct mbuf *m); -int (*ef_outputp)(struct ifnet *ifp, struct mbuf **mp, - struct sockaddr *dst, short *tp, int *hlen); - -#ifdef NETATALK -#include <netatalk/at.h> -#include <netatalk/at_var.h> -#include <netatalk/at_extern.h> - -#define llc_snap_org_code llc_un.type_snap.org_code -#define llc_snap_ether_type llc_un.type_snap.ether_type - -extern u_char at_org_code[3]; -extern u_char aarp_org_code[3]; -#endif /* NETATALK */ - #include <security/mac/mac_framework.h> #ifdef CTASSERT @@ -108,6 +87,8 @@ CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif +VNET_DEFINE(struct pfil_head, link_pfil_hook); /* Packet filter hooks */ + /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void 
(*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); @@ -134,22 +115,160 @@ static int ether_resolvemulti(struct ifnet *, struct sockaddr **, #ifdef VIMAGE static void ether_reassign(struct ifnet *, struct vnet *, char *); #endif +static int ether_requestencap(struct ifnet *, struct if_encap_req *); -/* XXX: should be in an arp support file, not here */ -static MALLOC_DEFINE(M_ARPCOM, "arpcom", "802.* interface internals"); - -#define ETHER_IS_BROADCAST(addr) \ - (bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0) #define senderr(e) do { error = (e); goto bad;} while (0) +static void +update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) +{ + int csum_flags = 0; + + if (src->m_pkthdr.csum_flags & CSUM_IP) + csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); + if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) + csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); + if (src->m_pkthdr.csum_flags & CSUM_SCTP) + csum_flags |= CSUM_SCTP_VALID; + dst->m_pkthdr.csum_flags |= csum_flags; + if (csum_flags & CSUM_DATA_VALID) + dst->m_pkthdr.csum_data = 0xffff; +} + +/* + * Handle link-layer encapsulation requests. 
+ */ +static int +ether_requestencap(struct ifnet *ifp, struct if_encap_req *req) +{ + struct ether_header *eh; + struct arphdr *ah; + uint16_t etype; + const u_char *lladdr; + + if (req->rtype != IFENCAP_LL) + return (EOPNOTSUPP); + + if (req->bufsize < ETHER_HDR_LEN) + return (ENOMEM); + + eh = (struct ether_header *)req->buf; + lladdr = req->lladdr; + req->lladdr_off = 0; + + switch (req->family) { + case AF_INET: + etype = htons(ETHERTYPE_IP); + break; + case AF_INET6: + etype = htons(ETHERTYPE_IPV6); + break; + case AF_ARP: + ah = (struct arphdr *)req->hdata; + ah->ar_hrd = htons(ARPHRD_ETHER); + + switch(ntohs(ah->ar_op)) { + case ARPOP_REVREQUEST: + case ARPOP_REVREPLY: + etype = htons(ETHERTYPE_REVARP); + break; + case ARPOP_REQUEST: + case ARPOP_REPLY: + default: + etype = htons(ETHERTYPE_ARP); + break; + } + + if (req->flags & IFENCAP_FLAG_BROADCAST) + lladdr = ifp->if_broadcastaddr; + break; + default: + return (EAFNOSUPPORT); + } + + memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type)); + memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN); + memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); + req->bufsize = sizeof(struct ether_header); + + return (0); +} + + +static int +ether_resolve_addr(struct ifnet *ifp, struct mbuf *m, + const struct sockaddr *dst, struct route *ro, u_char *phdr, + uint32_t *pflags, struct llentry **plle) +{ + struct ether_header *eh; + uint32_t lleflags = 0; + int error = 0; #if defined(INET) || defined(INET6) -int -ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared); -static VNET_DEFINE(int, ether_ipfw); -#define V_ether_ipfw VNET(ether_ipfw) + uint16_t etype; +#endif + + if (plle) + *plle = NULL; + eh = (struct ether_header *)phdr; + + switch (dst->sa_family) { +#ifdef INET + case AF_INET: + if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) + error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, + plle); + else { + if (m->m_flags & M_BCAST) + memcpy(eh->ether_dhost, ifp->if_broadcastaddr, + ETHER_ADDR_LEN); + else { 
+ const struct in_addr *a; + a = &(((const struct sockaddr_in *)dst)->sin_addr); + ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost); + } + etype = htons(ETHERTYPE_IP); + memcpy(&eh->ether_type, &etype, sizeof(etype)); + memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); + } + break; #endif +#ifdef INET6 + case AF_INET6: + if ((m->m_flags & M_MCAST) == 0) + error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags, + plle); + else { + const struct in6_addr *a6; + a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr); + ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost); + etype = htons(ETHERTYPE_IPV6); + memcpy(&eh->ether_type, &etype, sizeof(etype)); + memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); + } + break; +#endif + default: + if_printf(ifp, "can't handle af%d\n", dst->sa_family); + if (m != NULL) + m_freem(m); + return (EAFNOSUPPORT); + } + + if (error == EHOSTDOWN) { + if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) + error = EHOSTUNREACH; + } + if (error != 0) + return (error); + + *pflags = RT_MAY_LOOP; + if (lleflags & LLE_IFADDR) + *pflags |= RT_L2_ME; + + return (0); +} /* * Ethernet output routine. 
@@ -159,23 +278,49 @@ static VNET_DEFINE(int, ether_ipfw); */ int ether_output(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *dst, struct route *ro) + const struct sockaddr *dst, struct route *ro) { - short type; - int error = 0, hdrcmplt = 0; - u_char esrc[ETHER_ADDR_LEN], edst[ETHER_ADDR_LEN]; - struct llentry *lle = NULL; - struct rtentry *rt0 = NULL; + int error = 0; + char linkhdr[ETHER_HDR_LEN], *phdr; struct ether_header *eh; struct pf_mtag *t; int loop_copy = 1; int hlen; /* link layer header length */ + uint32_t pflags; + struct llentry *lle = NULL; + struct rtentry *rt0 = NULL; + int addref = 0; + phdr = NULL; + pflags = 0; if (ro != NULL) { - if (!(m->m_flags & (M_BCAST | M_MCAST))) - lle = ro->ro_lle; + /* XXX BPF uses ro_prepend */ + if (ro->ro_prepend != NULL) { + phdr = ro->ro_prepend; + hlen = ro->ro_plen; + } else if (!(m->m_flags & (M_BCAST | M_MCAST))) { + if ((ro->ro_flags & RT_LLE_CACHE) != 0) { + lle = ro->ro_lle; + if (lle != NULL && + (lle->la_flags & LLE_VALID) == 0) { + LLE_FREE(lle); + lle = NULL; /* redundant */ + ro->ro_lle = NULL; + } + if (lle == NULL) { + /* if we lookup, keep cache */ + addref = 1; + } + } + if (lle != NULL) { + phdr = lle->r_linkdata; + hlen = lle->r_hdrlen; + pflags = lle->r_flags; + } + } rt0 = ro->ro_rt; } + #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) @@ -189,153 +334,39 @@ ether_output(struct ifnet *ifp, struct mbuf *m, (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); - hlen = ETHER_HDR_LEN; - switch (dst->sa_family) { -#ifdef INET - case AF_INET: - if (lle != NULL && (lle->la_flags & LLE_VALID)) - memcpy(edst, &lle->ll_addr.mac16, sizeof(edst)); - else - error = arpresolve(ifp, rt0, m, dst, edst, &lle); - if (error) + if (phdr == NULL) { + /* No prepend data supplied. Try to calculate ourselves. */ + phdr = linkhdr; + hlen = ETHER_HDR_LEN; + error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags, + addref ? 
&lle : NULL); + if (addref && lle != NULL) + ro->ro_lle = lle; + if (error != 0) return (error == EWOULDBLOCK ? 0 : error); - type = htons(ETHERTYPE_IP); - break; - case AF_ARP: - { - struct arphdr *ah; - ah = mtod(m, struct arphdr *); - ah->ar_hrd = htons(ARPHRD_ETHER); - - loop_copy = 0; /* if this is for us, don't do it */ - - switch(ntohs(ah->ar_op)) { - case ARPOP_REVREQUEST: - case ARPOP_REVREPLY: - type = htons(ETHERTYPE_REVARP); - break; - case ARPOP_REQUEST: - case ARPOP_REPLY: - default: - type = htons(ETHERTYPE_ARP); - break; - } - - if (m->m_flags & M_BCAST) - bcopy(ifp->if_broadcastaddr, edst, ETHER_ADDR_LEN); - else - bcopy(ar_tha(ah), edst, ETHER_ADDR_LEN); - - } - break; -#endif -#ifdef INET6 - case AF_INET6: - if (lle != NULL && (lle->la_flags & LLE_VALID)) - memcpy(edst, &lle->ll_addr.mac16, sizeof(edst)); - else - error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle); - if (error) - return error; - type = htons(ETHERTYPE_IPV6); - break; -#endif -#ifdef IPX - case AF_IPX: - if (ef_outputp) { - error = ef_outputp(ifp, &m, dst, &type, &hlen); - if (error) - goto bad; - } else - type = htons(ETHERTYPE_IPX); - bcopy((caddr_t)&(((struct sockaddr_ipx *)dst)->sipx_addr.x_host), - (caddr_t)edst, sizeof (edst)); - break; -#endif -#ifdef NETATALK - case AF_APPLETALK: - { - struct at_ifaddr *aa; - - if ((aa = at_ifawithnet((struct sockaddr_at *)dst)) == NULL) - senderr(EHOSTUNREACH); /* XXX */ - if (!aarpresolve(ifp, m, (struct sockaddr_at *)dst, edst)) { - ifa_free(&aa->aa_ifa); - return (0); - } - /* - * In the phase 2 case, need to prepend an mbuf for the llc header. 
- */ - if ( aa->aa_flags & AFA_PHASE2 ) { - struct llc llc; - - ifa_free(&aa->aa_ifa); - M_PREPEND(m, LLC_SNAPFRAMELEN, M_DONTWAIT); - if (m == NULL) - senderr(ENOBUFS); - llc.llc_dsap = llc.llc_ssap = LLC_SNAP_LSAP; - llc.llc_control = LLC_UI; - bcopy(at_org_code, llc.llc_snap_org_code, sizeof(at_org_code)); - llc.llc_snap_ether_type = htons( ETHERTYPE_AT ); - bcopy(&llc, mtod(m, caddr_t), LLC_SNAPFRAMELEN); - type = htons(m->m_pkthdr.len); - hlen = LLC_SNAPFRAMELEN + ETHER_HDR_LEN; - } else { - ifa_free(&aa->aa_ifa); - type = htons(ETHERTYPE_AT); - } - break; - } -#endif /* NETATALK */ - - case pseudo_AF_HDRCMPLT: - hdrcmplt = 1; - eh = (struct ether_header *)dst->sa_data; - (void)memcpy(esrc, eh->ether_shost, sizeof (esrc)); - /* FALLTHROUGH */ - - case AF_UNSPEC: - loop_copy = 0; /* if this is for us, don't do it */ - eh = (struct ether_header *)dst->sa_data; - (void)memcpy(edst, eh->ether_dhost, sizeof (edst)); - type = eh->ether_type; - break; - - default: - if_printf(ifp, "can't handle af%d\n", dst->sa_family); - senderr(EAFNOSUPPORT); } - if (lle != NULL && (lle->la_flags & LLE_IFADDR)) { - int csum_flags = 0; - if (m->m_pkthdr.csum_flags & CSUM_IP) - csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) - csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); - if (m->m_pkthdr.csum_flags & CSUM_SCTP) - csum_flags |= CSUM_SCTP_VALID; - m->m_pkthdr.csum_flags |= csum_flags; - m->m_pkthdr.csum_data = 0xffff; + if ((pflags & RT_L2_ME) != 0) { + update_mbuf_csumflags(m, m); return (if_simloop(ifp, m, dst->sa_family, 0)); } + loop_copy = pflags & RT_MAY_LOOP; /* * Add local net header. If no space in first mbuf, * allocate another. + * + * Note that we do prepend regardless of RT_HAS_HEADER flag. + * This is done because BPF code shifts m_data pointer + * to the end of ethernet header prior to calling if_output(). 
*/ - M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); + M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); - eh = mtod(m, struct ether_header *); - (void)memcpy(&eh->ether_type, &type, - sizeof(eh->ether_type)); - (void)memcpy(eh->ether_dhost, edst, sizeof (edst)); - if (hdrcmplt) - (void)memcpy(eh->ether_shost, esrc, - sizeof(eh->ether_shost)); - else - (void)memcpy(eh->ether_shost, IF_LLADDR(ifp), - sizeof(eh->ether_shost)); + if ((pflags & RT_HAS_HEADER) == 0) { + eh = mtod(m, struct ether_header *); + memcpy(eh, phdr, hlen); + } /* * If a simplex interface, and the packet is being sent to our @@ -346,47 +377,27 @@ ether_output(struct ifnet *ifp, struct mbuf *m, * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ - if ((ifp->if_flags & IFF_SIMPLEX) && loop_copy && + if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) && ((t = pf_find_mtag(m)) == NULL || !t->routed)) { - int csum_flags = 0; + struct mbuf *n; - if (m->m_pkthdr.csum_flags & CSUM_IP) - csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) - csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); - if (m->m_pkthdr.csum_flags & CSUM_SCTP) - csum_flags |= CSUM_SCTP_VALID; - - if (m->m_flags & M_BCAST) { - struct mbuf *n; - - /* - * Because if_simloop() modifies the packet, we need a - * writable copy through m_dup() instead of a readonly - * one as m_copy[m] would give us. The alternative would - * be to modify if_simloop() to handle the readonly mbuf, - * but performancewise it is mostly equivalent (trading - * extra data copying vs. extra locking). - * - * XXX This is a local workaround. A number of less - * often used kernel parts suffer from the same bug. - * See PR kern/105943 for a proposed general solution. 
- */ - if ((n = m_dup(m, M_DONTWAIT)) != NULL) { - n->m_pkthdr.csum_flags |= csum_flags; - if (csum_flags & CSUM_DATA_VALID) - n->m_pkthdr.csum_data = 0xffff; - (void)if_simloop(ifp, n, dst->sa_family, hlen); - } else - ifp->if_iqdrops++; - } else if (bcmp(eh->ether_dhost, eh->ether_shost, - ETHER_ADDR_LEN) == 0) { - m->m_pkthdr.csum_flags |= csum_flags; - if (csum_flags & CSUM_DATA_VALID) - m->m_pkthdr.csum_data = 0xffff; - (void) if_simloop(ifp, m, dst->sa_family, hlen); - return (0); /* XXX */ - } + /* + * Because if_simloop() modifies the packet, we need a + * writable copy through m_dup() instead of a readonly + * one as m_copy[m] would give us. The alternative would + * be to modify if_simloop() to handle the readonly mbuf, + * but performancewise it is mostly equivalent (trading + * extra data copying vs. extra locking). + * + * XXX This is a local workaround. A number of less + * often used kernel parts suffer from the same bug. + * See PR kern/105943 for a proposed general solution. + */ + if ((n = m_dup(m, M_NOWAIT)) != NULL) { + update_mbuf_csumflags(m, n); + (void)if_simloop(ifp, n, dst->sa_family, hlen); + } else + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); } /* @@ -399,12 +410,12 @@ ether_output(struct ifnet *ifp, struct mbuf *m, #if defined(INET) || defined(INET6) if (ifp->if_carp && - (error = (*carp_output_p)(ifp, m, dst, NULL))) + (error = (*carp_output_p)(ifp, m, dst))) goto bad; #endif /* Handle ng_ether(4) processing, if any */ - if (IFP2AC(ifp)->ac_netgraph != NULL) { + if (ifp->if_l2com != NULL) { KASSERT(ng_ether_output_p != NULL, ("ng_ether_output_p is NULL")); if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { @@ -429,18 +440,17 @@ bad: if (m != NULL) int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { -#if defined(INET) || defined(INET6) + int i; - if (V_ip_fw_chk_ptr && V_ether_ipfw != 0) { - if (ether_ipfw_chk(&m, ifp, 0) == 0) { - if (m) { - m_freem(m); - return EACCES; /* pkt dropped */ - } else - return 0; /* consumed e.g. 
in a pipe */ - } + if (PFIL_HOOKED(&V_link_pfil_hook)) { + i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_OUT, NULL); + + if (i != 0) + return (EACCES); + + if (m == NULL) + return (0); } -#endif /* * Queue message on interface, update output statistics if @@ -449,116 +459,6 @@ ether_output_frame(struct ifnet *ifp, struct mbuf *m) return ((ifp->if_transmit)(ifp, m)); } -#if defined(INET) || defined(INET6) -/* - * ipfw processing for ethernet packets (in and out). - * The second parameter is NULL from ether_demux, and ifp from - * ether_output_frame. - */ -int -ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared) -{ - struct ether_header *eh; - struct ether_header save_eh; - struct mbuf *m; - int i; - struct ip_fw_args args; - struct m_tag *mtag; - - /* fetch start point from rule, if any */ - mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); - if (mtag == NULL) { - args.rule.slot = 0; - } else { - /* dummynet packet, already partially processed */ - struct ipfw_rule_ref *r; - - /* XXX can we free it after use ? */ - mtag->m_tag_id = PACKET_TAG_NONE; - r = (struct ipfw_rule_ref *)(mtag + 1); - if (r->info & IPFW_ONEPASS) - return (1); - args.rule = *r; - } - - /* - * I need some amt of data to be contiguous, and in case others need - * the packet (shared==1) also better be in the first mbuf. 
- */ - m = *m0; - i = min( m->m_pkthdr.len, max_protohdr); - if ( shared || m->m_len < i) { - m = m_pullup(m, i); - if (m == NULL) { - *m0 = m; - return 0; - } - } - eh = mtod(m, struct ether_header *); - save_eh = *eh; /* save copy for restore below */ - m_adj(m, ETHER_HDR_LEN); /* strip ethernet header */ - - args.m = m; /* the packet we are looking at */ - args.oif = dst; /* destination, if any */ - args.next_hop = NULL; /* we do not support forward yet */ - args.next_hop6 = NULL; /* we do not support forward yet */ - args.eh = &save_eh; /* MAC header for bridged/MAC packets */ - args.inp = NULL; /* used by ipfw uid/gid/jail rules */ - i = V_ip_fw_chk_ptr(&args); - m = args.m; - if (m != NULL) { - /* - * Restore Ethernet header, as needed, in case the - * mbuf chain was replaced by ipfw. - */ - M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); - if (m == NULL) { - *m0 = m; - return 0; - } - if (eh != mtod(m, struct ether_header *)) - bcopy(&save_eh, mtod(m, struct ether_header *), - ETHER_HDR_LEN); - } - *m0 = m; - - if (i == IP_FW_DENY) /* drop */ - return 0; - - KASSERT(m != NULL, ("ether_ipfw_chk: m is NULL")); - - if (i == IP_FW_PASS) /* a PASS rule. */ - return 1; - - if (ip_dn_io_ptr && (i == IP_FW_DUMMYNET)) { - int dir; - /* - * Pass the pkt to dummynet, which consumes it. - * If shared, make a copy and keep the original. - */ - if (shared) { - m = m_copypacket(m, M_DONTWAIT); - if (m == NULL) - return 0; - } else { - /* - * Pass the original to dummynet and - * nothing back to the caller - */ - *m0 = NULL ; - } - dir = PROTO_LAYER2 | (dst ? DIR_OUT : DIR_IN); - ip_dn_io_ptr(&m, dir, &args); - return 0; - } - /* - * XXX at some point add support for divert/forward actions. - * If none of the above matches, we have to drop the pkt. - */ - return 0; -} -#endif - /* * Process a received Ethernet packet; the packet is in the * mbuf chain m with the ethernet header at the front. 
@@ -580,39 +480,18 @@ ether_input_internal(struct ifnet *ifp, struct mbuf *m) return; } #endif - /* - * Do consistency checks to verify assumptions - * made by code past this point. - */ - if ((m->m_flags & M_PKTHDR) == 0) { - if_printf(ifp, "discard frame w/o packet header\n"); - ifp->if_ierrors++; - m_freem(m); - return; - } if (m->m_len < ETHER_HDR_LEN) { /* XXX maybe should pullup? */ if_printf(ifp, "discard frame w/o leading ethernet " "header (len %u pkt len %u)\n", m->m_len, m->m_pkthdr.len); - ifp->if_ierrors++; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); - if (m->m_pkthdr.rcvif == NULL) { - if_printf(ifp, "discard frame w/o interface pointer\n"); - ifp->if_ierrors++; - m_freem(m); - return; - } -#ifdef DIAGNOSTIC - if (m->m_pkthdr.rcvif != ifp) { - if_printf(ifp, "Warning, frame marked as received on %s\n", - m->m_pkthdr.rcvif->if_xname); - } -#endif + random_harvest_queue(m, sizeof(*m), 2, RANDOM_NET_ETHER); CURVNET_SET_QUIET(ifp->if_vnet); @@ -621,7 +500,7 @@ ether_input_internal(struct ifnet *ifp, struct mbuf *m) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; - ifp->if_imcasts++; + if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef MAC @@ -647,7 +526,8 @@ ether_input_internal(struct ifnet *ifp, struct mbuf *m) m->m_flags &= ~M_HASFCS; } - ifp->if_ibytes += m->m_pkthdr.len; + if (!(ifp->if_capenable & IFCAP_HWSTATS)) + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Allow monitor mode to claim this frame, after stats are updated. 
*/ if (ifp->if_flags & IFF_MONITOR) { @@ -683,8 +563,7 @@ ether_input_internal(struct ifnet *ifp, struct mbuf *m) #ifdef DIAGNOSTIC if_printf(ifp, "cannot pullup VLAN header\n"); #endif - ifp->if_ierrors++; - m_freem(m); + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); CURVNET_RESTORE(); return; } @@ -702,7 +581,7 @@ ether_input_internal(struct ifnet *ifp, struct mbuf *m) M_SETFIB(m, ifp->if_fib); /* Allow ng_ether(4) to claim this frame. */ - if (IFP2AC(ifp)->ac_netgraph != NULL) { + if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_p != NULL, ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; @@ -757,22 +636,36 @@ ether_input_internal(struct ifnet *ifp, struct mbuf *m) m->m_flags |= M_PROMISC; } - /* First chunk of an mbuf contains good entropy */ - if (harvest.ethernet) - random_harvest(m, 16, 3, 0, RANDOM_NET); - ether_demux(ifp, m); CURVNET_RESTORE(); } /* * Ethernet input dispatch; by default, direct dispatch here regardless of - * global configuration. + * global configuration. However, if RSS is enabled, hook up RSS affinity + * so that when deferred or hybrid dispatch is enabled, we can redistribute + * load based on RSS. + * + * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or + * not it had already done work distribution via multi-queue. Then we could + * direct dispatch in the event load balancing was already complete and + * handle the case of interfaces with different capabilities better. + * + * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions + * at multiple layers? + * + * XXXRW: For now, enable all this only if RSS is compiled in, although it + * works fine without RSS. Need to characterise the performance overhead + * of the detour through the netisr code in the event the result is always + * direct dispatch. 
*/ static void ether_nh_input(struct mbuf *m) { + M_ASSERTPKTHDR(m); + KASSERT(m->m_pkthdr.rcvif != NULL, + ("%s: NULL interface pointer", __func__)); ether_input_internal(m->m_pkthdr.rcvif, m); } @@ -780,8 +673,14 @@ static struct netisr_handler ether_nh = { .nh_name = "ether", .nh_handler = ether_nh_input, .nh_proto = NETISR_ETHER, +#ifdef RSS + .nh_policy = NETISR_POLICY_CPU, + .nh_dispatch = NETISR_DISPATCH_DIRECT, + .nh_m2cpuid = rss_m2cpuid, +#else .nh_policy = NETISR_POLICY_SOURCE, .nh_dispatch = NETISR_DISPATCH_DIRECT, +#endif }; static void @@ -793,16 +692,74 @@ ether_init(__unused void *arg) SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL); static void +vnet_ether_init(__unused void *arg) +{ + int i; + + /* Initialize packet filter hooks. */ + V_link_pfil_hook.ph_type = PFIL_TYPE_AF; + V_link_pfil_hook.ph_af = AF_LINK; + if ((i = pfil_head_register(&V_link_pfil_hook)) != 0) + printf("%s: WARNING: unable to register pfil link hook, " + "error %d\n", __func__, i); +#ifdef VIMAGE + netisr_register_vnet(ðer_nh); +#endif +} +VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_ether_init, NULL); + +#ifdef VIMAGE +static void +vnet_ether_pfil_destroy(__unused void *arg) +{ + int i; + + if ((i = pfil_head_unregister(&V_link_pfil_hook)) != 0) + printf("%s: WARNING: unable to unregister pfil link hook, " + "error %d\n", __func__, i); +} +VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY, + vnet_ether_pfil_destroy, NULL); + +static void +vnet_ether_destroy(__unused void *arg) +{ + + netisr_unregister_vnet(ðer_nh); +} +VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_ether_destroy, NULL); +#endif + + + +static void ether_input(struct ifnet *ifp, struct mbuf *m) { + struct mbuf *mn; + /* - * We will rely on rcvif being set properly in the deferred context, - * so assert it is correct here. + * The drivers are allowed to pass in a chain of packets linked with + * m_nextpkt. 
We split them up into separate packets here and pass + * them up. This allows the drivers to amortize the receive lock. */ - KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch", __func__)); + while (m) { + mn = m->m_nextpkt; + m->m_nextpkt = NULL; - netisr_dispatch(NETISR_ETHER, m); + /* + * We will rely on rcvif being set properly in the deferred context, + * so assert it is correct here. + */ + KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p " + "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp)); + CURVNET_SET_QUIET(ifp->if_vnet); + netisr_dispatch(NETISR_ETHER, m); + CURVNET_RESTORE(); + m = mn; + } } /* @@ -812,27 +769,19 @@ void ether_demux(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; - int isr; + int i, isr; u_short ether_type; -#if defined(NETATALK) - struct llc *l; -#endif KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); -#if defined(INET) || defined(INET6) - /* - * Allow dummynet and/or ipfw to claim the frame. - * Do not do this for PROMISC frames in case we are re-entered. - */ - if (V_ip_fw_chk_ptr && V_ether_ipfw != 0 && !(m->m_flags & M_PROMISC)) { - if (ether_ipfw_chk(&m, NULL, 0) == 0) { - if (m) - m_freem(m); /* dropped; free mbuf chain */ - return; /* consumed */ - } + /* Do not grab PROMISC frames in case we are re-entered. */ + if (PFIL_HOOKED(&V_link_pfil_hook) && !(m->m_flags & M_PROMISC)) { + i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_IN, NULL); + + if (i != 0 || m == NULL) + return; } -#endif + eh = mtod(m, struct ether_header *); ether_type = ntohs(eh->ether_type); @@ -843,7 +792,7 @@ ether_demux(struct ifnet *ifp, struct mbuf *m) if ((m->m_flags & M_VLANTAG) && EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) { if (ifp->if_vlantrunk == NULL) { - ifp->if_noproto++; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } @@ -869,7 +818,7 @@ ether_demux(struct ifnet *ifp, struct mbuf *m) * Strip off Ethernet header. 
*/ m->m_flags &= ~M_VLANTAG; - m->m_flags &= ~(M_PROTOFLAGS); + m_clrprotoflags(m); m_adj(m, ETHER_HDR_LEN); /* @@ -878,8 +827,6 @@ ether_demux(struct ifnet *ifp, struct mbuf *m) switch (ether_type) { #ifdef INET case ETHERTYPE_IP: - if ((m = ip_fastforward(m)) == NULL) - return; isr = NETISR_IP; break; @@ -892,54 +839,12 @@ ether_demux(struct ifnet *ifp, struct mbuf *m) isr = NETISR_ARP; break; #endif -#ifdef IPX - case ETHERTYPE_IPX: - if (ef_inputp && ef_inputp(ifp, eh, m) == 0) - return; - isr = NETISR_IPX; - break; -#endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif -#ifdef NETATALK - case ETHERTYPE_AT: - isr = NETISR_ATALK1; - break; - case ETHERTYPE_AARP: - isr = NETISR_AARP; - break; -#endif /* NETATALK */ default: -#ifdef IPX - if (ef_inputp && ef_inputp(ifp, eh, m) == 0) - return; -#endif /* IPX */ -#if defined(NETATALK) - if (ether_type > ETHERMTU) - goto discard; - l = mtod(m, struct llc *); - if (l->llc_dsap == LLC_SNAP_LSAP && - l->llc_ssap == LLC_SNAP_LSAP && - l->llc_control == LLC_UI) { - if (bcmp(&(l->llc_snap_org_code)[0], at_org_code, - sizeof(at_org_code)) == 0 && - ntohs(l->llc_snap_ether_type) == ETHERTYPE_AT) { - m_adj(m, LLC_SNAPFRAMELEN); - isr = NETISR_ATALK2; - break; - } - if (bcmp(&(l->llc_snap_org_code)[0], aarp_org_code, - sizeof(aarp_org_code)) == 0 && - ntohs(l->llc_snap_ether_type) == ETHERTYPE_AARP) { - m_adj(m, LLC_SNAPFRAMELEN); - isr = NETISR_AARP; - break; - } - } -#endif /* NETATALK */ goto discard; } netisr_dispatch(isr, m); @@ -951,14 +856,14 @@ discard: * hand the packet to it for last chance processing; * otherwise dispose of it. */ - if (IFP2AC(ifp)->ac_netgraph != NULL) { + if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_orphan_p != NULL, ("ng_ether_input_orphan_p is NULL")); /* * Put back the ethernet header so netgraph has a * consistent view of inbound packets. 
*/ - M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); + M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); (*ng_ether_input_orphan_p)(ifp, m); return; } @@ -998,6 +903,7 @@ ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; + ifp->if_requestencap = ether_requestencap; #ifdef VIMAGE ifp->if_reassign = ether_reassign; #endif @@ -1022,6 +928,8 @@ ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) break; if (i != ifp->if_addrlen) if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); + + uuid_ether_add(LLADDR(sdl)); } /* @@ -1030,7 +938,12 @@ ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) void ether_ifdetach(struct ifnet *ifp) { - if (IFP2AC(ifp)->ac_netgraph != NULL) { + struct sockaddr_dl *sdl; + + sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr); + uuid_ether_del(LLADDR(sdl)); + + if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); @@ -1045,7 +958,7 @@ void ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) { - if (IFP2AC(ifp)->ac_netgraph != NULL) { + if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); @@ -1061,10 +974,6 @@ ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); -#if defined(INET) || defined(INET6) -SYSCTL_VNET_INT(_net_link_ether, OID_AUTO, ipfw, CTLFLAG_RW, - &VNET_NAME(ether_ipfw), 0, "Pass ether pkts through firewall"); -#endif #if 0 /* @@ -1158,31 +1067,6 @@ ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) arp_ifinit(ifp, ifa); break; #endif -#ifdef IPX - /* - * XXX - This code is probably wrong - */ - case AF_IPX: - { - struct ipx_addr *ina = &(IA_SIPX(ifa)->sipx_addr); - - if (ipx_nullhost(*ina)) - ina->x_host = - *(union ipx_host *) - 
IF_LLADDR(ifp); - else { - bcopy((caddr_t) ina->x_host.c_host, - (caddr_t) IF_LLADDR(ifp), - ETHER_ADDR_LEN); - } - - /* - * Set new address - */ - ifp->if_init(ifp->if_softc); - break; - } -#endif default: ifp->if_init(ifp->if_softc); break; @@ -1238,7 +1122,7 @@ ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, e_addr = LLADDR(sdl); if (!ETHER_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; - *llsa = 0; + *llsa = NULL; return 0; #ifdef INET @@ -1246,14 +1130,7 @@ ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; - sdl = malloc(sizeof *sdl, M_IFMADDR, - M_NOWAIT|M_ZERO); - if (sdl == NULL) - return ENOMEM; - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_ETHER; + sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); @@ -1270,19 +1147,12 @@ ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, * (This is used for multicast routers.) 
*/ ifp->if_flags |= IFF_ALLMULTI; - *llsa = 0; + *llsa = NULL; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; - sdl = malloc(sizeof *sdl, M_IFMADDR, - M_NOWAIT|M_ZERO); - if (sdl == NULL) - return (ENOMEM); - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_ETHER; + sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); @@ -1299,46 +1169,8 @@ ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, } } -static void* -ether_alloc(u_char type, struct ifnet *ifp) -{ - struct arpcom *ac; - - ac = malloc(sizeof(struct arpcom), M_ARPCOM, M_WAITOK | M_ZERO); - ac->ac_ifp = ifp; - - return (ac); -} - -static void -ether_free(void *com, u_char type) -{ - - free(com, M_ARPCOM); -} - -static int -ether_modevent(module_t mod, int type, void *data) -{ - - switch (type) { - case MOD_LOAD: - if_register_com_alloc(IFT_ETHER, ether_alloc, ether_free); - break; - case MOD_UNLOAD: - if_deregister_com_alloc(IFT_ETHER); - break; - default: - return EOPNOTSUPP; - } - - return (0); -} - static moduledata_t ether_mod = { - "ether", - ether_modevent, - 0 + .name = "ether", }; void @@ -1386,7 +1218,7 @@ ether_vlanencap(struct mbuf *m, uint16_t tag) { struct ether_vlan_header *evl; - M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT); + M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT); if (m == NULL) return (NULL); /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ diff --git a/freebsd/sys/net/if_faith.c b/freebsd/sys/net/if_faith.c deleted file mode 100644 index cf4a7fba..00000000 --- a/freebsd/sys/net/if_faith.c +++ /dev/null @@ -1,353 +0,0 @@ -#include <machine/rtems-bsd-kernel-space.h> - -/* $KAME: if_faith.c,v 1.23 2001/12/17 13:55:29 sumikawa Exp $ */ - -/*- - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ -/* - * derived from - * @(#)if_loop.c 8.1 (Berkeley) 6/10/93 - * Id: if_loop.c,v 1.22 1996/06/19 16:24:10 wollman Exp - */ - -/* - * Loopback interface driver for protocol testing and timing. 
- */ -#include <rtems/bsd/local/opt_inet.h> -#include <rtems/bsd/local/opt_inet6.h> - -#include <rtems/bsd/sys/param.h> -#include <sys/systm.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <sys/socket.h> -#include <rtems/bsd/sys/errno.h> -#include <sys/sockio.h> -#include <sys/time.h> -#include <sys/queue.h> -#include <sys/types.h> -#include <sys/malloc.h> - -#include <net/if.h> -#include <net/if_clone.h> -#include <net/if_types.h> -#include <net/netisr.h> -#include <net/route.h> -#include <net/bpf.h> -#include <net/vnet.h> - -#ifdef INET -#include <netinet/in.h> -#include <netinet/in_systm.h> -#include <netinet/in_var.h> -#include <netinet/ip.h> -#endif - -#ifdef INET6 -#ifndef INET -#include <netinet/in.h> -#endif -#include <netinet6/in6_var.h> -#include <netinet/ip6.h> -#include <netinet6/ip6_var.h> -#endif - -#define FAITHNAME "faith" - -struct faith_softc { - struct ifnet *sc_ifp; -}; - -static int faithioctl(struct ifnet *, u_long, caddr_t); -int faithoutput(struct ifnet *, struct mbuf *, struct sockaddr *, - struct route *); -static void faithrtrequest(int, struct rtentry *, struct rt_addrinfo *); -#ifdef INET6 -static int faithprefix(struct in6_addr *); -#endif - -static int faithmodevent(module_t, int, void *); - -static MALLOC_DEFINE(M_FAITH, FAITHNAME, "Firewall Assisted Tunnel Interface"); - -static int faith_clone_create(struct if_clone *, int, caddr_t); -static void faith_clone_destroy(struct ifnet *); - -IFC_SIMPLE_DECLARE(faith, 0); - -#define FAITHMTU 1500 - -static int -faithmodevent(mod, type, data) - module_t mod; - int type; - void *data; -{ - - switch (type) { - case MOD_LOAD: - if_clone_attach(&faith_cloner); - -#ifdef INET6 - faithprefix_p = faithprefix; -#endif - - break; - case MOD_UNLOAD: -#ifdef INET6 - faithprefix_p = NULL; -#endif - - if_clone_detach(&faith_cloner); - break; - default: - return EOPNOTSUPP; - } - return 0; -} - -static moduledata_t faith_mod = { - "if_faith", - faithmodevent, - 0 
-}; - -DECLARE_MODULE(if_faith, faith_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); -MODULE_VERSION(if_faith, 1); - -static int -faith_clone_create(ifc, unit, params) - struct if_clone *ifc; - int unit; - caddr_t params; -{ - struct ifnet *ifp; - struct faith_softc *sc; - - sc = malloc(sizeof(struct faith_softc), M_FAITH, M_WAITOK | M_ZERO); - ifp = sc->sc_ifp = if_alloc(IFT_FAITH); - if (ifp == NULL) { - free(sc, M_FAITH); - return (ENOSPC); - } - - ifp->if_softc = sc; - if_initname(sc->sc_ifp, ifc->ifc_name, unit); - - ifp->if_mtu = FAITHMTU; - /* Change to BROADCAST experimentaly to announce its prefix. */ - ifp->if_flags = /* IFF_LOOPBACK */ IFF_BROADCAST | IFF_MULTICAST; - ifp->if_ioctl = faithioctl; - ifp->if_output = faithoutput; - ifp->if_hdrlen = 0; - ifp->if_addrlen = 0; - ifp->if_snd.ifq_maxlen = ifqmaxlen; - if_attach(ifp); - bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); - return (0); -} - -static void -faith_clone_destroy(ifp) - struct ifnet *ifp; -{ - struct faith_softc *sc = ifp->if_softc; - - bpfdetach(ifp); - if_detach(ifp); - if_free(ifp); - free(sc, M_FAITH); -} - -int -faithoutput(ifp, m, dst, ro) - struct ifnet *ifp; - struct mbuf *m; - struct sockaddr *dst; - struct route *ro; -{ - int isr; - u_int32_t af; - struct rtentry *rt = NULL; - - M_ASSERTPKTHDR(m); - - if (ro != NULL) - rt = ro->ro_rt; - /* BPF writes need to be handled specially. */ - if (dst->sa_family == AF_UNSPEC) { - bcopy(dst->sa_data, &af, sizeof(af)); - dst->sa_family = af; - } - - if (bpf_peers_present(ifp->if_bpf)) { - af = dst->sa_family; - bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); - } - - if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - m_freem(m); - return (rt->rt_flags & RTF_BLACKHOLE ? 0 : - rt->rt_flags & RTF_HOST ? 
EHOSTUNREACH : ENETUNREACH); - } - ifp->if_opackets++; - ifp->if_obytes += m->m_pkthdr.len; - switch (dst->sa_family) { -#ifdef INET - case AF_INET: - isr = NETISR_IP; - break; -#endif -#ifdef INET6 - case AF_INET6: - isr = NETISR_IPV6; - break; -#endif - default: - m_freem(m); - return EAFNOSUPPORT; - } - - /* XXX do we need more sanity checks? */ - - m->m_pkthdr.rcvif = ifp; - ifp->if_ipackets++; - ifp->if_ibytes += m->m_pkthdr.len; - netisr_dispatch(isr, m); - return (0); -} - -/* ARGSUSED */ -static void -faithrtrequest(cmd, rt, info) - int cmd; - struct rtentry *rt; - struct rt_addrinfo *info; -{ - RT_LOCK_ASSERT(rt); - rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; -} - -/* - * Process an ioctl request. - */ -/* ARGSUSED */ -static int -faithioctl(ifp, cmd, data) - struct ifnet *ifp; - u_long cmd; - caddr_t data; -{ - struct ifaddr *ifa; - struct ifreq *ifr = (struct ifreq *)data; - int error = 0; - - switch (cmd) { - - case SIOCSIFADDR: - ifp->if_flags |= IFF_UP; - ifp->if_drv_flags |= IFF_DRV_RUNNING; - ifa = (struct ifaddr *)data; - ifa->ifa_rtrequest = faithrtrequest; - /* - * Everything else is done at a higher level. 
- */ - break; - - case SIOCADDMULTI: - case SIOCDELMULTI: - if (ifr == 0) { - error = EAFNOSUPPORT; /* XXX */ - break; - } - switch (ifr->ifr_addr.sa_family) { -#ifdef INET - case AF_INET: - break; -#endif -#ifdef INET6 - case AF_INET6: - break; -#endif - - default: - error = EAFNOSUPPORT; - break; - } - break; - -#ifdef SIOCSIFMTU - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; - break; -#endif - - case SIOCSIFFLAGS: - break; - - default: - error = EINVAL; - } - return (error); -} - -#ifdef INET6 -/* - * XXX could be slow - * XXX could be layer violation to call sys/net from sys/netinet6 - */ -static int -faithprefix(in6) - struct in6_addr *in6; -{ - struct rtentry *rt; - struct sockaddr_in6 sin6; - int ret; - - if (V_ip6_keepfaith == 0) - return 0; - - bzero(&sin6, sizeof(sin6)); - sin6.sin6_family = AF_INET6; - sin6.sin6_len = sizeof(struct sockaddr_in6); - sin6.sin6_addr = *in6; - rt = in6_rtalloc1((struct sockaddr *)&sin6, 0, 0UL, RT_DEFAULT_FIB); - if (rt && rt->rt_ifp && rt->rt_ifp->if_type == IFT_FAITH && - (rt->rt_ifp->if_flags & IFF_UP) != 0) - ret = 1; - else - ret = 0; - if (rt) - RTFREE_LOCKED(rt); - return ret; -} -#endif diff --git a/freebsd/sys/net/if_fddisubr.c b/freebsd/sys/net/if_fddisubr.c index 7a7fb471..9df882ec 100644 --- a/freebsd/sys/net/if_fddisubr.c +++ b/freebsd/sys/net/if_fddisubr.c @@ -38,10 +38,8 @@ * $FreeBSD$ */ -#include <rtems/bsd/local/opt_atalk.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> -#include <rtems/bsd/local/opt_ipx.h> #include <rtems/bsd/sys/param.h> #include <sys/systm.h> @@ -53,6 +51,7 @@ #include <sys/sockio.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_dl.h> #include <net/if_llc.h> #include <net/if_types.h> @@ -73,24 +72,10 @@ #include <netinet6/nd6.h> #endif -#ifdef IPX -#include <netipx/ipx.h> -#include <netipx/ipx_if.h> -#endif - #ifdef DECNET #include <netdnet/dn.h> #endif -#ifdef NETATALK -#include <netatalk/at.h> -#include <netatalk/at_var.h> -#include 
<netatalk/at_extern.h> - -extern u_char at_org_code[ 3 ]; -extern u_char aarp_org_code[ 3 ]; -#endif /* NETATALK */ - #include <security/mac/mac_framework.h> static const u_char fddibroadcastaddr[FDDI_ADDR_LEN] = @@ -98,7 +83,7 @@ static const u_char fddibroadcastaddr[FDDI_ADDR_LEN] = static int fddi_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); -static int fddi_output(struct ifnet *, struct mbuf *, struct sockaddr *, +static int fddi_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void fddi_input(struct ifnet *ifp, struct mbuf *m); @@ -109,21 +94,17 @@ static void fddi_input(struct ifnet *ifp, struct mbuf *m); * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. - * Assumes that ifp is actually pointer to arpcom structure. */ static int -fddi_output(ifp, m, dst, ro) - struct ifnet *ifp; - struct mbuf *m; - struct sockaddr *dst; - struct route *ro; +fddi_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, + struct route *ro) { u_int16_t type; int loop_copy = 0, error = 0, hdrcmplt = 0; u_char esrc[FDDI_ADDR_LEN], edst[FDDI_ADDR_LEN]; struct fddi_header *fh; #if defined(INET) || defined(INET6) - struct llentry *lle; + int is_gw = 0; #endif #ifdef MAC @@ -139,14 +120,15 @@ fddi_output(ifp, m, dst, ro) senderr(ENETDOWN); getmicrotime(&ifp->if_lastchange); +#if defined(INET) || defined(INET6) + if (ro != NULL) + is_gw = (ro->ro_flags & RT_HAS_GW) != 0; +#endif + switch (dst->sa_family) { #ifdef INET case AF_INET: { - struct rtentry *rt0 = NULL; - - if (ro != NULL) - rt0 = ro->ro_rt; - error = arpresolve(ifp, rt0, m, dst, edst, &lle); + error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 
0 : error); type = htons(ETHERTYPE_IP); @@ -182,68 +164,29 @@ fddi_output(ifp, m, dst, ro) #endif /* INET */ #ifdef INET6 case AF_INET6: - error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle); + error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) - return (error); /* Something bad happened */ + return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IPV6); break; #endif /* INET6 */ -#ifdef IPX - case AF_IPX: - type = htons(ETHERTYPE_IPX); - bcopy((caddr_t)&(((struct sockaddr_ipx *)dst)->sipx_addr.x_host), - (caddr_t)edst, FDDI_ADDR_LEN); - break; -#endif /* IPX */ -#ifdef NETATALK - case AF_APPLETALK: { - struct at_ifaddr *aa; - if (!aarpresolve(ifp, m, (struct sockaddr_at *)dst, edst)) - return (0); - /* - * ifaddr is the first thing in at_ifaddr - */ - if ((aa = at_ifawithnet( (struct sockaddr_at *)dst)) == 0) - goto bad; - - /* - * In the phase 2 case, we need to prepend an mbuf for the llc header. - * Since we must preserve the value of m, which is passed to us by - * value, we m_copy() the first mbuf, and use it for our llc header. 
- */ - if (aa->aa_flags & AFA_PHASE2) { - struct llc llc; - - M_PREPEND(m, LLC_SNAPFRAMELEN, M_WAIT); - llc.llc_dsap = llc.llc_ssap = LLC_SNAP_LSAP; - llc.llc_control = LLC_UI; - bcopy(at_org_code, llc.llc_snap.org_code, sizeof(at_org_code)); - llc.llc_snap.ether_type = htons(ETHERTYPE_AT); - bcopy(&llc, mtod(m, caddr_t), LLC_SNAPFRAMELEN); - type = 0; - } else { - type = htons(ETHERTYPE_AT); - } - ifa_free(&aa->aa_ifa); - break; - } -#endif /* NETATALK */ - case pseudo_AF_HDRCMPLT: { - struct ether_header *eh; + const struct ether_header *eh; + hdrcmplt = 1; - eh = (struct ether_header *)dst->sa_data; - bcopy((caddr_t)eh->ether_shost, (caddr_t)esrc, FDDI_ADDR_LEN); + eh = (const struct ether_header *)dst->sa_data; + bcopy(eh->ether_shost, esrc, FDDI_ADDR_LEN); /* FALLTHROUGH */ } case AF_UNSPEC: { - struct ether_header *eh; + const struct ether_header *eh; + loop_copy = -1; - eh = (struct ether_header *)dst->sa_data; - bcopy((caddr_t)eh->ether_dhost, (caddr_t)edst, FDDI_ADDR_LEN); + eh = (const struct ether_header *)dst->sa_data; + bcopy(eh->ether_dhost, edst, FDDI_ADDR_LEN); if (*edst & 1) m->m_flags |= (M_BCAST|M_MCAST); type = eh->ether_type; @@ -293,8 +236,8 @@ fddi_output(ifp, m, dst, ro) */ if (type != 0) { struct llc *l; - M_PREPEND(m, LLC_SNAPFRAMELEN, M_DONTWAIT); - if (m == 0) + M_PREPEND(m, LLC_SNAPFRAMELEN, M_NOWAIT); + if (m == NULL) senderr(ENOBUFS); l = mtod(m, struct llc *); l->llc_control = LLC_UI; @@ -309,8 +252,8 @@ fddi_output(ifp, m, dst, ro) * Add local net header. If no space in first mbuf, * allocate another. 
*/ - M_PREPEND(m, FDDI_HDR_LEN, M_DONTWAIT); - if (m == 0) + M_PREPEND(m, FDDI_HDR_LEN, M_NOWAIT); + if (m == NULL) senderr(ENOBUFS); fh = mtod(m, struct fddi_header *); fh->fddi_fc = FDDIFC_LLC_ASYNC|FDDIFC_LLC_PRIO4; @@ -347,12 +290,12 @@ fddi_output(ifp, m, dst, ro) error = (ifp->if_transmit)(ifp, m); if (error) - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); bad: - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if (m) m_freem(m); return (error); @@ -376,24 +319,23 @@ fddi_input(ifp, m) */ if ((m->m_flags & M_PKTHDR) == 0) { if_printf(ifp, "discard frame w/o packet header\n"); - ifp->if_ierrors++; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); - ifp->if_ierrors++; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } m = m_pullup(m, FDDI_HDR_LEN); if (m == NULL) { - ifp->if_ierrors++; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } fh = mtod(m, struct fddi_header *); - m->m_pkthdr.header = (void *)fh; /* * Discard packet if interface is not up. @@ -422,7 +364,7 @@ fddi_input(ifp, m) /* * Update interface statistics. 
*/ - ifp->if_ibytes += m->m_pkthdr.len; + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); getmicrotime(&ifp->if_lastchange); /* @@ -443,7 +385,7 @@ fddi_input(ifp, m) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; - ifp->if_imcasts++; + if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef M_LINK0 @@ -461,7 +403,7 @@ fddi_input(ifp, m) m = m_pullup(m, LLC_SNAPFRAMELEN); if (m == 0) { - ifp->if_ierrors++; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } l = mtod(m, struct llc *); @@ -472,30 +414,13 @@ fddi_input(ifp, m) u_int16_t type; if ((l->llc_control != LLC_UI) || (l->llc_ssap != LLC_SNAP_LSAP)) { - ifp->if_noproto++; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } -#ifdef NETATALK - if (bcmp(&(l->llc_snap.org_code)[0], at_org_code, - sizeof(at_org_code)) == 0 && - ntohs(l->llc_snap.ether_type) == ETHERTYPE_AT) { - isr = NETISR_ATALK2; - m_adj(m, LLC_SNAPFRAMELEN); - break; - } - - if (bcmp(&(l->llc_snap.org_code)[0], aarp_org_code, - sizeof(aarp_org_code)) == 0 && - ntohs(l->llc_snap.ether_type) == ETHERTYPE_AARP) { - m_adj(m, LLC_SNAPFRAMELEN); - isr = NETISR_AARP; - break; - } -#endif /* NETATALK */ if (l->llc_snap.org_code[0] != 0 || l->llc_snap.org_code[1] != 0 || l->llc_snap.org_code[2] != 0) { - ifp->if_noproto++; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } @@ -505,8 +430,6 @@ fddi_input(ifp, m) switch (type) { #ifdef INET case ETHERTYPE_IP: - if ((m = ip_fastforward(m)) == NULL) - return; isr = NETISR_IP; break; @@ -521,27 +444,14 @@ fddi_input(ifp, m) isr = NETISR_IPV6; break; #endif -#ifdef IPX - case ETHERTYPE_IPX: - isr = NETISR_IPX; - break; -#endif #ifdef DECNET case ETHERTYPE_DECNET: isr = NETISR_DECNET; break; #endif -#ifdef NETATALK - case ETHERTYPE_AT: - isr = NETISR_ATALK1; - break; - case ETHERTYPE_AARP: - isr = NETISR_AARP; - break; -#endif /* NETATALK */ default: /* printf("fddi_input: unknown protocol 0x%x\n", type); */ - ifp->if_noproto++; + if_inc_counter(ifp, 
IFCOUNTER_NOPROTO, 1); goto dropanyway; } break; @@ -549,7 +459,7 @@ fddi_input(ifp, m) default: /* printf("fddi_input: unknown dsap 0x%x\n", l->llc_dsap); */ - ifp->if_noproto++; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } M_SETFIB(m, ifp->if_fib); @@ -557,7 +467,7 @@ fddi_input(ifp, m) return; dropanyway: - ifp->if_iqdrops++; + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if (m) m_freem(m); return; @@ -643,31 +553,6 @@ fddi_ioctl (ifp, command, data) arp_ifinit(ifp, ifa); break; #endif -#ifdef IPX - /* - * XXX - This code is probably wrong - */ - case AF_IPX: { - struct ipx_addr *ina; - - ina = &(IA_SIPX(ifa)->sipx_addr); - - if (ipx_nullhost(*ina)) { - ina->x_host = *(union ipx_host *) - IF_LLADDR(ifp); - } else { - bcopy((caddr_t) ina->x_host.c_host, - (caddr_t) IF_LLADDR(ifp), - ETHER_ADDR_LEN); - } - - /* - * Set new address - */ - ifp->if_init(ifp->if_softc); - } - break; -#endif default: ifp->if_init(ifp->if_softc); break; @@ -724,7 +609,7 @@ fddi_resolvemulti(ifp, llsa, sa) e_addr = LLADDR(sdl); if ((e_addr[0] & 1) != 1) return (EADDRNOTAVAIL); - *llsa = 0; + *llsa = NULL; return (0); #ifdef INET @@ -732,14 +617,7 @@ fddi_resolvemulti(ifp, llsa, sa) sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return (EADDRNOTAVAIL); - sdl = malloc(sizeof *sdl, M_IFMADDR, - M_NOWAIT | M_ZERO); - if (sdl == NULL) - return (ENOMEM); - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_FDDI; + sdl = link_init_sdl(ifp, *llsa, IFT_FDDI); sdl->sdl_nlen = 0; sdl->sdl_alen = FDDI_ADDR_LEN; sdl->sdl_slen = 0; @@ -758,19 +636,12 @@ fddi_resolvemulti(ifp, llsa, sa) * (This is used for multicast routers.) 
*/ ifp->if_flags |= IFF_ALLMULTI; - *llsa = 0; + *llsa = NULL; return (0); } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EADDRNOTAVAIL); - sdl = malloc(sizeof *sdl, M_IFMADDR, - M_NOWAIT | M_ZERO); - if (sdl == NULL) - return (ENOMEM); - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_FDDI; + sdl = link_init_sdl(ifp, *llsa, IFT_FDDI); sdl->sdl_nlen = 0; sdl->sdl_alen = FDDI_ADDR_LEN; sdl->sdl_slen = 0; diff --git a/freebsd/sys/net/if_fwsubr.c b/freebsd/sys/net/if_fwsubr.c index b022ecae..df4c38cf 100644 --- a/freebsd/sys/net/if_fwsubr.c +++ b/freebsd/sys/net/if_fwsubr.c @@ -45,6 +45,7 @@ #include <sys/sockio.h> #include <net/if.h> +#include <net/if_var.h> #include <net/netisr.h> #include <net/route.h> #include <net/if_llc.h> @@ -77,7 +78,7 @@ struct fw_hwaddr firewire_broadcastaddr = { }; static int -firewire_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, +firewire_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct fw_com *fc = IFP2FWC(ifp); @@ -91,7 +92,7 @@ firewire_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, int unicast, dgl, foff; static int next_dgl; #if defined(INET) || defined(INET6) - struct llentry *lle; + int is_gw = 0; #endif #ifdef MAC @@ -106,6 +107,10 @@ firewire_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, goto bad; } +#if defined(INET) || defined(INET6) + if (ro != NULL) + is_gw = (ro->ro_flags & RT_HAS_GW) != 0; +#endif /* * For unicast, we make a tag to store the lladdr of the * destination. This might not be the first time we have seen @@ -129,7 +134,7 @@ firewire_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, } destfw = (struct fw_hwaddr *)(mtag + 1); } else { - destfw = 0; + destfw = NULL; } switch (dst->sa_family) { @@ -141,7 +146,8 @@ firewire_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, * doesn't fit into the arp model. 
*/ if (unicast) { - error = arpresolve(ifp, ro ? ro->ro_rt : NULL, m, dst, (u_char *) destfw, &lle); + error = arpresolve(ifp, is_gw, m, dst, + (u_char *) destfw, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); } @@ -170,10 +176,10 @@ firewire_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, #ifdef INET6 case AF_INET6: if (unicast) { - error = nd6_storelladdr(fc->fc_ifp, m, dst, - (u_char *) destfw, &lle); + error = nd6_resolve(fc->fc_ifp, is_gw, m, dst, + (u_char *) destfw, NULL, NULL); if (error) - return (error); + return (error == EWOULDBLOCK ? 0 : error); } type = ETHERTYPE_IPV6; break; @@ -231,7 +237,7 @@ firewire_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, /* * No fragmentation is necessary. */ - M_PREPEND(m, sizeof(uint32_t), M_DONTWAIT); + M_PREPEND(m, sizeof(uint32_t), M_NOWAIT); if (!m) { error = ENOBUFS; goto bad; @@ -263,17 +269,17 @@ firewire_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, * Split off the tail segment from the * datagram, copying our tags over. */ - mtail = m_split(m, fsize, M_DONTWAIT); + mtail = m_split(m, fsize, M_NOWAIT); m_tag_copy_chain(mtail, m, M_NOWAIT); } else { - mtail = 0; + mtail = NULL; } /* * Add our encapsulation header to this * fragment and hand it off to the link. 
*/ - M_PREPEND(m, 2*sizeof(uint32_t), M_DONTWAIT); + M_PREPEND(m, 2*sizeof(uint32_t), M_NOWAIT); if (!m) { error = ENOBUFS; goto bad; @@ -538,7 +544,7 @@ firewire_input(struct ifnet *ifp, struct mbuf *m, uint16_t src) if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); - ifp->if_ierrors++; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } @@ -583,7 +589,7 @@ firewire_input(struct ifnet *ifp, struct mbuf *m, uint16_t src) return; } - ifp->if_ibytes += m->m_pkthdr.len; + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Discard packet if interface is not up */ if ((ifp->if_flags & IFF_UP) == 0) { @@ -592,13 +598,11 @@ firewire_input(struct ifnet *ifp, struct mbuf *m, uint16_t src) } if (m->m_flags & (M_BCAST|M_MCAST)) - ifp->if_imcasts++; + if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); switch (type) { #ifdef INET case ETHERTYPE_IP: - if ((m = ip_fastforward(m)) == NULL) - return; isr = NETISR_IP; break; @@ -700,7 +704,7 @@ firewire_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, /* * No mapping needed. */ - *llsa = 0; + *llsa = NULL; return 0; #ifdef INET @@ -708,7 +712,7 @@ firewire_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; - *llsa = 0; + *llsa = NULL; return 0; #endif #ifdef INET6 @@ -721,12 +725,12 @@ firewire_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, * (This is used for multicast routers.) 
*/ ifp->if_flags |= IFF_ALLMULTI; - *llsa = 0; + *llsa = NULL; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; - *llsa = 0; + *llsa = NULL; return 0; #endif diff --git a/freebsd/sys/net/if_gif.c b/freebsd/sys/net/if_gif.c index 27cbbdda..e07a2da0 100644 --- a/freebsd/sys/net/if_gif.c +++ b/freebsd/sys/net/if_gif.c @@ -1,8 +1,5 @@ #include <machine/rtems-bsd-kernel-space.h> -/* $FreeBSD$ */ -/* $KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $ */ - /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. @@ -30,8 +27,13 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. + * + * $KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $ */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> @@ -39,11 +41,14 @@ #include <sys/systm.h> #include <sys/jail.h> #include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/module.h> +#include <sys/rmlock.h> #include <sys/socket.h> #include <sys/sockio.h> +#include <sys/sx.h> #include <rtems/bsd/sys/errno.h> #include <sys/time.h> #include <sys/sysctl.h> @@ -55,6 +60,7 @@ #include <machine/cpu.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_clone.h> #include <net/if_types.h> #include <net/netisr.h> @@ -65,9 +71,9 @@ #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> +#include <netinet/ip_ecn.h> #ifdef INET #include <netinet/in_var.h> -#include <netinet/in_gif.h> #include <netinet/ip_var.h> #endif /* INET */ @@ -77,9 +83,9 @@ #endif #include <netinet6/in6_var.h> #include <netinet/ip6.h> +#include <netinet6/ip6_ecn.h> #include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> -#include <netinet6/in6_gif.h> #include <netinet6/ip6protosw.h> #endif /* INET6 */ @@ -90,26 +96,41 @@ 
#include <security/mac/mac_framework.h> -#define GIFNAME "gif" +static const char gifname[] = "gif"; /* - * gif_mtx protects the global gif_softc_list. + * gif_mtx protects a per-vnet gif_softc_list. */ -static struct mtx gif_mtx; +static VNET_DEFINE(struct mtx, gif_mtx); +#define V_gif_mtx VNET(gif_mtx) static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); static VNET_DEFINE(LIST_HEAD(, gif_softc), gif_softc_list); #define V_gif_softc_list VNET(gif_softc_list) +static struct sx gif_ioctl_sx; +SX_SYSINIT(gif_ioctl_sx, &gif_ioctl_sx, "gif_ioctl"); + +#define GIF_LIST_LOCK_INIT(x) mtx_init(&V_gif_mtx, "gif_mtx", \ + NULL, MTX_DEF) +#define GIF_LIST_LOCK_DESTROY(x) mtx_destroy(&V_gif_mtx) +#define GIF_LIST_LOCK(x) mtx_lock(&V_gif_mtx) +#define GIF_LIST_UNLOCK(x) mtx_unlock(&V_gif_mtx) void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af); void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af); void (*ng_gif_attach_p)(struct ifnet *ifp); void (*ng_gif_detach_p)(struct ifnet *ifp); -static void gif_start(struct ifnet *); +static int gif_check_nesting(struct ifnet *, struct mbuf *); +static int gif_set_tunnel(struct ifnet *, struct sockaddr *, + struct sockaddr *); +static void gif_delete_tunnel(struct ifnet *); +static int gif_ioctl(struct ifnet *, u_long, caddr_t); +static int gif_transmit(struct ifnet *, struct mbuf *); +static void gif_qflush(struct ifnet *); static int gif_clone_create(struct if_clone *, int, caddr_t); static void gif_clone_destroy(struct ifnet *); - -IFC_SIMPLE_DECLARE(gif, 0); +static VNET_DEFINE(struct if_clone *, gif_cloner); +#define V_gif_cloner VNET(gif_cloner) static int gifmodevent(module_t, int, void *); @@ -129,7 +150,7 @@ static SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW, 0, #endif static VNET_DEFINE(int, max_gif_nesting) = MAX_GIF_NEST; #define V_max_gif_nesting VNET(max_gif_nesting) -SYSCTL_VNET_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_RW, +SYSCTL_INT(_net_link_gif, OID_AUTO, 
max_nesting, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(max_gif_nesting), 0, "Max nested tunnels"); /* @@ -143,22 +164,12 @@ static VNET_DEFINE(int, parallel_tunnels) = 1; static VNET_DEFINE(int, parallel_tunnels) = 0; #endif #define V_parallel_tunnels VNET(parallel_tunnels) -SYSCTL_VNET_INT(_net_link_gif, OID_AUTO, parallel_tunnels, CTLFLAG_RW, - &VNET_NAME(parallel_tunnels), 0, "Allow parallel tunnels?"); - -/* copy from src/sys/net/if_ethersubr.c */ -static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; -#ifndef ETHER_IS_BROADCAST -#define ETHER_IS_BROADCAST(addr) \ - (bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0) -#endif +SYSCTL_INT(_net_link_gif, OID_AUTO, parallel_tunnels, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(parallel_tunnels), 0, + "Allow parallel tunnels?"); static int -gif_clone_create(ifc, unit, params) - struct if_clone *ifc; - int unit; - caddr_t params; +gif_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct gif_softc *sc; @@ -169,18 +180,9 @@ gif_clone_create(ifc, unit, params) sc->gif_fibnum = BSD_DEFAULT_FIB; #endif /* __rtems__ */ GIF2IFP(sc) = if_alloc(IFT_GIF); - if (GIF2IFP(sc) == NULL) { - free(sc, M_GIF); - return (ENOSPC); - } - GIF_LOCK_INIT(sc); - GIF2IFP(sc)->if_softc = sc; - if_initname(GIF2IFP(sc), ifc->ifc_name, unit); - - sc->encap_cookie4 = sc->encap_cookie6 = NULL; - sc->gif_options = GIF_ACCEPT_REVETHIP; + if_initname(GIF2IFP(sc), gifname, unit); GIF2IFP(sc)->if_addrlen = 0; GIF2IFP(sc)->if_mtu = GIF_MTU; @@ -190,56 +192,42 @@ gif_clone_create(ifc, unit, params) GIF2IFP(sc)->if_flags |= IFF_LINK2; #endif GIF2IFP(sc)->if_ioctl = gif_ioctl; - GIF2IFP(sc)->if_start = gif_start; + GIF2IFP(sc)->if_transmit = gif_transmit; + GIF2IFP(sc)->if_qflush = gif_qflush; GIF2IFP(sc)->if_output = gif_output; - GIF2IFP(sc)->if_snd.ifq_maxlen = ifqmaxlen; + GIF2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; + GIF2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(GIF2IFP(sc)); 
bpfattach(GIF2IFP(sc), DLT_NULL, sizeof(u_int32_t)); if (ng_gif_attach_p != NULL) (*ng_gif_attach_p)(GIF2IFP(sc)); - mtx_lock(&gif_mtx); + GIF_LIST_LOCK(); LIST_INSERT_HEAD(&V_gif_softc_list, sc, gif_list); - mtx_unlock(&gif_mtx); - + GIF_LIST_UNLOCK(); return (0); } static void -gif_clone_destroy(ifp) - struct ifnet *ifp; +gif_clone_destroy(struct ifnet *ifp) { -#if defined(INET) || defined(INET6) - int err; -#endif - struct gif_softc *sc = ifp->if_softc; - - mtx_lock(&gif_mtx); - LIST_REMOVE(sc, gif_list); - mtx_unlock(&gif_mtx); + struct gif_softc *sc; + sx_xlock(&gif_ioctl_sx); + sc = ifp->if_softc; gif_delete_tunnel(ifp); -#ifdef INET6 - if (sc->encap_cookie6 != NULL) { - err = encap_detach(sc->encap_cookie6); - KASSERT(err == 0, ("Unexpected error detaching encap_cookie6")); - } -#endif -#ifdef INET - if (sc->encap_cookie4 != NULL) { - err = encap_detach(sc->encap_cookie4); - KASSERT(err == 0, ("Unexpected error detaching encap_cookie4")); - } -#endif - + GIF_LIST_LOCK(); + LIST_REMOVE(sc, gif_list); + GIF_LIST_UNLOCK(); if (ng_gif_detach_p != NULL) (*ng_gif_detach_p)(ifp); bpfdetach(ifp); if_detach(ifp); - if_free(ifp); + ifp->if_softc = NULL; + sx_xunlock(&gif_ioctl_sx); + if_free(ifp); GIF_LOCK_DESTROY(sc); - free(sc, M_GIF); } @@ -248,31 +236,35 @@ vnet_gif_init(const void *unused __unused) { LIST_INIT(&V_gif_softc_list); + GIF_LIST_LOCK_INIT(); + V_gif_cloner = if_clone_simple(gifname, gif_clone_create, + gif_clone_destroy, 0); } -VNET_SYSINIT(vnet_gif_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, vnet_gif_init, - NULL); +VNET_SYSINIT(vnet_gif_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_gif_init, NULL); + +static void +vnet_gif_uninit(const void *unused __unused) +{ + + if_clone_detach(V_gif_cloner); + GIF_LIST_LOCK_DESTROY(); +} +VNET_SYSUNINIT(vnet_gif_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_gif_uninit, NULL); static int -gifmodevent(mod, type, data) - module_t mod; - int type; - void *data; +gifmodevent(module_t mod, int type, 
void *data) { switch (type) { case MOD_LOAD: - mtx_init(&gif_mtx, "gif_mtx", NULL, MTX_DEF); - if_clone_attach(&gif_cloner); - break; - case MOD_UNLOAD: - if_clone_detach(&gif_cloner); - mtx_destroy(&gif_mtx); break; default: - return EOPNOTSUPP; + return (EOPNOTSUPP); } - return 0; + return (0); } static moduledata_t gif_mod = { @@ -285,219 +277,257 @@ DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gif, 1); int -gif_encapcheck(m, off, proto, arg) - const struct mbuf *m; - int off; - int proto; - void *arg; +gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { - struct ip ip; + GIF_RLOCK_TRACKER; + const struct ip *ip; struct gif_softc *sc; + int ret; sc = (struct gif_softc *)arg; - if (sc == NULL) - return 0; + if (sc == NULL || (GIF2IFP(sc)->if_flags & IFF_UP) == 0) + return (0); - if ((GIF2IFP(sc)->if_flags & IFF_UP) == 0) - return 0; + ret = 0; + GIF_RLOCK(sc); /* no physical address */ - if (!sc->gif_psrc || !sc->gif_pdst) - return 0; + if (sc->gif_family == 0) + goto done; switch (proto) { #ifdef INET case IPPROTO_IPV4: - break; #endif #ifdef INET6 case IPPROTO_IPV6: - break; #endif case IPPROTO_ETHERIP: break; - default: - return 0; + goto done; } /* Bail on short packets */ - if (m->m_pkthdr.len < sizeof(ip)) - return 0; + M_ASSERTPKTHDR(m); + if (m->m_pkthdr.len < sizeof(struct ip)) + goto done; - m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); - - switch (ip.ip_v) { + ip = mtod(m, const struct ip *); + switch (ip->ip_v) { #ifdef INET case 4: - if (sc->gif_psrc->sa_family != AF_INET || - sc->gif_pdst->sa_family != AF_INET) - return 0; - return gif_encapcheck4(m, off, proto, arg); + if (sc->gif_family != AF_INET) + goto done; + ret = in_gif_encapcheck(m, off, proto, arg); + break; #endif #ifdef INET6 case 6: if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) - return 0; - if (sc->gif_psrc->sa_family != AF_INET6 || - sc->gif_pdst->sa_family != AF_INET6) - return 0; - return gif_encapcheck6(m, off, proto, arg); + goto 
done; + if (sc->gif_family != AF_INET6) + goto done; + ret = in6_gif_encapcheck(m, off, proto, arg); + break; #endif - default: - return 0; } +done: + GIF_RUNLOCK(sc); + return (ret); } -static void -gif_start(struct ifnet *ifp) +static int +gif_transmit(struct ifnet *ifp, struct mbuf *m) { struct gif_softc *sc; - struct mbuf *m; - - sc = ifp->if_softc; - - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - for (;;) { - IFQ_DEQUEUE(&ifp->if_snd, m); - if (m == 0) - break; - - gif_output(ifp, m, sc->gif_pdst, NULL); - - } - ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - - return; -} - -int -gif_output(ifp, m, dst, ro) - struct ifnet *ifp; - struct mbuf *m; - struct sockaddr *dst; - struct route *ro; -{ - struct gif_softc *sc = ifp->if_softc; - struct m_tag *mtag; - int error = 0; - int gif_called; - u_int32_t af; + struct etherip_header *eth; +#ifdef INET + struct ip *ip; +#endif +#ifdef INET6 + struct ip6_hdr *ip6; + uint32_t t; +#endif + uint32_t af; + uint8_t proto, ecn; + int error; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { m_freem(m); - goto end; + goto err; } #endif - - /* - * gif may cause infinite recursion calls when misconfigured. - * We'll prevent this by detecting loops. - * - * High nesting level may cause stack exhaustion. - * We'll prevent this by introducing upper limit. - */ - gif_called = 1; - mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, NULL); - while (mtag != NULL) { - if (*(struct ifnet **)(mtag + 1) == ifp) { - log(LOG_NOTICE, - "gif_output: loop detected on %s\n", - (*(struct ifnet **)(mtag + 1))->if_xname); - m_freem(m); - error = EIO; /* is there better errno? */ - goto end; - } - mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, mtag); - gif_called++; - } - if (gif_called > V_max_gif_nesting) { - log(LOG_NOTICE, - "gif_output: recursively called too many times(%d)\n", - gif_called); - m_freem(m); - error = EIO; /* is there better errno? 
*/ - goto end; - } - mtag = m_tag_alloc(MTAG_GIF, MTAG_GIF_CALLED, sizeof(struct ifnet *), - M_NOWAIT); - if (mtag == NULL) { - m_freem(m); - error = ENOMEM; - goto end; - } - *(struct ifnet **)(mtag + 1) = ifp; - m_tag_prepend(m, mtag); - - m->m_flags &= ~(M_BCAST|M_MCAST); - - GIF_LOCK(sc); - - if (!(ifp->if_flags & IFF_UP) || - sc->gif_psrc == NULL || sc->gif_pdst == NULL) { - GIF_UNLOCK(sc); + error = ENETDOWN; + sc = ifp->if_softc; + if ((ifp->if_flags & IFF_MONITOR) != 0 || + (ifp->if_flags & IFF_UP) == 0 || + sc->gif_family == 0 || + (error = gif_check_nesting(ifp, m)) != 0) { m_freem(m); - error = ENETDOWN; - goto end; - } - - /* BPF writes need to be handled specially. */ - if (dst->sa_family == AF_UNSPEC) { - bcopy(dst->sa_data, &af, sizeof(af)); - dst->sa_family = af; + goto err; } - - af = dst->sa_family; - BPF_MTAP2(ifp, &af, sizeof(af), m); - ifp->if_opackets++; - ifp->if_obytes += m->m_pkthdr.len; - - /* override to IPPROTO_ETHERIP for bridged traffic */ + /* Now pull back the af that we stashed in the csum_data. */ if (ifp->if_bridge) af = AF_LINK; - + else + af = m->m_pkthdr.csum_data; + m->m_flags &= ~(M_BCAST|M_MCAST); M_SETFIB(m, sc->gif_fibnum); + BPF_MTAP2(ifp, &af, sizeof(af), m); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); /* inner AF-specific encapsulation */ - + ecn = 0; + switch (af) { +#ifdef INET + case AF_INET: + proto = IPPROTO_IPV4; + if (m->m_len < sizeof(struct ip)) + m = m_pullup(m, sizeof(struct ip)); + if (m == NULL) { + error = ENOBUFS; + goto err; + } + ip = mtod(m, struct ip *); + ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: + ECN_NOCARE, &ecn, &ip->ip_tos); + break; +#endif +#ifdef INET6 + case AF_INET6: + proto = IPPROTO_IPV6; + if (m->m_len < sizeof(struct ip6_hdr)) + m = m_pullup(m, sizeof(struct ip6_hdr)); + if (m == NULL) { + error = ENOBUFS; + goto err; + } + t = 0; + ip6 = mtod(m, struct ip6_hdr *); + ip6_ecn_ingress((ifp->if_flags & IFF_LINK1) ? 
ECN_ALLOWED: + ECN_NOCARE, &t, &ip6->ip6_flow); + ecn = (ntohl(t) >> 20) & 0xff; + break; +#endif + case AF_LINK: + proto = IPPROTO_ETHERIP; + M_PREPEND(m, sizeof(struct etherip_header), M_NOWAIT); + if (m == NULL) { + error = ENOBUFS; + goto err; + } + eth = mtod(m, struct etherip_header *); + eth->eip_resvh = 0; + eth->eip_ver = ETHERIP_VERSION; + eth->eip_resvl = 0; + break; + default: + error = EAFNOSUPPORT; + m_freem(m); + goto err; + } /* XXX should we check if our outer source is legal? */ - /* dispatch to output logic based on outer AF */ - switch (sc->gif_psrc->sa_family) { + switch (sc->gif_family) { #ifdef INET case AF_INET: - error = in_gif_output(ifp, af, m); + error = in_gif_output(ifp, m, proto, ecn); break; #endif #ifdef INET6 case AF_INET6: - error = in6_gif_output(ifp, af, m); + error = in6_gif_output(ifp, m, proto, ecn); break; #endif default: - m_freem(m); - error = ENETDOWN; + m_freem(m); } - - GIF_UNLOCK(sc); - end: +err: if (error) - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } +static void +gif_qflush(struct ifnet *ifp __unused) +{ + +} + +#define MTAG_GIF 1080679712 +static int +gif_check_nesting(struct ifnet *ifp, struct mbuf *m) +{ + struct m_tag *mtag; + int count; + + /* + * gif may cause infinite recursion calls when misconfigured. + * We'll prevent this by detecting loops. + * + * High nesting level may cause stack exhaustion. + * We'll prevent this by introducing upper limit. 
+ */ + count = 1; + mtag = NULL; + while ((mtag = m_tag_locate(m, MTAG_GIF, 0, mtag)) != NULL) { + if (*(struct ifnet **)(mtag + 1) == ifp) { + log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp)); + return (EIO); + } + count++; + } + if (count > V_max_gif_nesting) { + log(LOG_NOTICE, + "%s: if_output recursively called too many times(%d)\n", + if_name(ifp), count); + return (EIO); + } + mtag = m_tag_alloc(MTAG_GIF, 0, sizeof(struct ifnet *), M_NOWAIT); + if (mtag == NULL) + return (ENOMEM); + *(struct ifnet **)(mtag + 1) = ifp; + m_tag_prepend(m, mtag); + return (0); +} + +int +gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, + struct route *ro) +{ + uint32_t af; + + if (dst->sa_family == AF_UNSPEC) + bcopy(dst->sa_data, &af, sizeof(af)); + else + af = dst->sa_family; + /* + * Now save the af in the inbound pkt csum data, this is a cheat since + * we are using the inbound csum_data field to carry the af over to + * the gif_transmit() routine, avoiding using yet another mtag. + */ + m->m_pkthdr.csum_data = af; + return (ifp->if_transmit(ifp, m)); +} + void -gif_input(m, af, ifp) - struct mbuf *m; - int af; - struct ifnet *ifp; +gif_input(struct mbuf *m, struct ifnet *ifp, int proto, uint8_t ecn) { - int isr, n; - struct gif_softc *sc; struct etherip_header *eip; +#ifdef INET + struct ip *ip; +#endif +#ifdef INET6 + struct ip6_hdr *ip6; + uint32_t t; +#endif + struct gif_softc *sc; struct ether_header *eh; struct ifnet *oldifp; + int isr, n, af; if (ifp == NULL) { /* just in case */ @@ -506,20 +536,67 @@ gif_input(m, af, ifp) } sc = ifp->if_softc; m->m_pkthdr.rcvif = ifp; + m_clrprotoflags(m); + switch (proto) { +#ifdef INET + case IPPROTO_IPV4: + af = AF_INET; + if (m->m_len < sizeof(struct ip)) + m = m_pullup(m, sizeof(struct ip)); + if (m == NULL) + goto drop; + ip = mtod(m, struct ip *); + if (ip_ecn_egress((ifp->if_flags & IFF_LINK1) ? 
ECN_ALLOWED: + ECN_NOCARE, &ecn, &ip->ip_tos) == 0) { + m_freem(m); + goto drop; + } + break; +#endif +#ifdef INET6 + case IPPROTO_IPV6: + af = AF_INET6; + if (m->m_len < sizeof(struct ip6_hdr)) + m = m_pullup(m, sizeof(struct ip6_hdr)); + if (m == NULL) + goto drop; + t = htonl((uint32_t)ecn << 20); + ip6 = mtod(m, struct ip6_hdr *); + if (ip6_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: + ECN_NOCARE, &t, &ip6->ip6_flow) == 0) { + m_freem(m); + goto drop; + } + break; +#endif + case IPPROTO_ETHERIP: + af = AF_LINK; + break; + default: + m_freem(m); + goto drop; + } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (bpf_peers_present(ifp->if_bpf)) { - u_int32_t af1 = af; + uint32_t af1 = af; bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m); } + if ((ifp->if_flags & IFF_MONITOR) != 0) { + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); + m_freem(m); + return; + } + if (ng_gif_input_p != NULL) { (*ng_gif_input_p)(ifp, &m, af); if (m == NULL) - return; + goto drop; } /* @@ -546,34 +623,15 @@ gif_input(m, af, ifp) #endif case AF_LINK: n = sizeof(struct etherip_header) + sizeof(struct ether_header); - if (n > m->m_len) { + if (n > m->m_len) m = m_pullup(m, n); - if (m == NULL) { - ifp->if_ierrors++; - return; - } - } - + if (m == NULL) + goto drop; eip = mtod(m, struct etherip_header *); - /* - * GIF_ACCEPT_REVETHIP (enabled by default) intentionally - * accepts an EtherIP packet with revered version field in - * the header. This is a knob for backward compatibility - * with FreeBSD 7.2R or prior. 
- */ - if (sc->gif_options & GIF_ACCEPT_REVETHIP) { - if (eip->eip_resvl != ETHERIP_VERSION - && eip->eip_ver != ETHERIP_VERSION) { - /* discard unknown versions */ - m_freem(m); - return; - } - } else { - if (eip->eip_ver != ETHERIP_VERSION) { - /* discard unknown versions */ - m_freem(m); - return; - } + if (eip->eip_ver != ETHERIP_VERSION) { + /* discard unknown versions */ + m_freem(m); + goto drop; } m_adj(m, sizeof(struct etherip_header)); @@ -588,7 +646,7 @@ gif_input(m, af, ifp) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; - ifp->if_imcasts++; + if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } BRIDGE_INPUT(ifp, m); @@ -613,59 +671,61 @@ gif_input(m, af, ifp) return; } - ifp->if_ipackets++; - ifp->if_ibytes += m->m_pkthdr.len; + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); + return; +drop: + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } /* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? 
*/ int -gif_ioctl(ifp, cmd, data) - struct ifnet *ifp; - u_long cmd; - caddr_t data; +gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { - struct gif_softc *sc = ifp->if_softc; - struct ifreq *ifr = (struct ifreq*)data; - int error = 0, size; - u_int options; + GIF_RLOCK_TRACKER; + struct ifreq *ifr = (struct ifreq*)data; struct sockaddr *dst, *src; -#ifdef SIOCSIFMTU /* xxx */ - u_long mtu; + struct gif_softc *sc; +#ifdef INET + struct sockaddr_in *sin = NULL; #endif +#ifdef INET6 + struct sockaddr_in6 *sin6 = NULL; +#endif + u_int options; + int error; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; - break; - - case SIOCSIFDSTADDR: - break; - case SIOCADDMULTI: case SIOCDELMULTI: - break; - -#ifdef SIOCSIFMTU /* xxx */ case SIOCGIFMTU: - break; - + case SIOCSIFFLAGS: + return (0); case SIOCSIFMTU: - mtu = ifr->ifr_mtu; - if (mtu < GIF_MTU_MIN || mtu > GIF_MTU_MAX) + if (ifr->ifr_mtu < GIF_MTU_MIN || + ifr->ifr_mtu > GIF_MTU_MAX) return (EINVAL); - ifp->if_mtu = mtu; - break; -#endif /* SIOCSIFMTU */ - -#ifdef INET + else + ifp->if_mtu = ifr->ifr_mtu; + return (0); + } + sx_xlock(&gif_ioctl_sx); + sc = ifp->if_softc; + if (sc == NULL) { + error = ENXIO; + goto bad; + } + error = 0; + switch (cmd) { case SIOCSIFPHYADDR: -#endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: -#endif /* INET6 */ - case SIOCSLIFPHYADDR: +#endif + error = EINVAL; switch (cmd) { #ifdef INET case SIOCSIFPHYADDR: @@ -683,199 +743,169 @@ gif_ioctl(ifp, cmd, data) &(((struct in6_aliasreq *)data)->ifra_dstaddr); break; #endif - case SIOCSLIFPHYADDR: - src = (struct sockaddr *) - &(((struct if_laddrreq *)data)->addr); - dst = (struct sockaddr *) - &(((struct if_laddrreq *)data)->dstaddr); - break; default: - return EINVAL; + goto bad; } - /* sa_family must be equal */ - if (src->sa_family != dst->sa_family) - return EINVAL; + if (src->sa_family != dst->sa_family || + src->sa_len != dst->sa_len) + goto bad; /* validate sa_len */ + /* check sa_family looks sane for the cmd */ switch 
(src->sa_family) { #ifdef INET case AF_INET: if (src->sa_len != sizeof(struct sockaddr_in)) - return EINVAL; + goto bad; + if (cmd != SIOCSIFPHYADDR) { + error = EAFNOSUPPORT; + goto bad; + } + if (satosin(src)->sin_addr.s_addr == INADDR_ANY || + satosin(dst)->sin_addr.s_addr == INADDR_ANY) { + error = EADDRNOTAVAIL; + goto bad; + } break; #endif #ifdef INET6 case AF_INET6: if (src->sa_len != sizeof(struct sockaddr_in6)) - return EINVAL; - break; -#endif - default: - return EAFNOSUPPORT; - } - switch (dst->sa_family) { -#ifdef INET - case AF_INET: - if (dst->sa_len != sizeof(struct sockaddr_in)) - return EINVAL; - break; -#endif -#ifdef INET6 - case AF_INET6: - if (dst->sa_len != sizeof(struct sockaddr_in6)) - return EINVAL; + goto bad; + if (cmd != SIOCSIFPHYADDR_IN6) { + error = EAFNOSUPPORT; + goto bad; + } + error = EADDRNOTAVAIL; + if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr) + || + IN6_IS_ADDR_UNSPECIFIED(&satosin6(dst)->sin6_addr)) + goto bad; + /* + * Check validity of the scope zone ID of the + * addresses, and convert it into the kernel + * internal form if necessary. 
+ */ + error = sa6_embedscope(satosin6(src), 0); + if (error != 0) + goto bad; + error = sa6_embedscope(satosin6(dst), 0); + if (error != 0) + goto bad; break; #endif default: - return EAFNOSUPPORT; - } - - /* check sa_family looks sane for the cmd */ - switch (cmd) { - case SIOCSIFPHYADDR: - if (src->sa_family == AF_INET) - break; - return EAFNOSUPPORT; -#ifdef INET6 - case SIOCSIFPHYADDR_IN6: - if (src->sa_family == AF_INET6) - break; - return EAFNOSUPPORT; -#endif /* INET6 */ - case SIOCSLIFPHYADDR: - /* checks done in the above */ - break; + error = EAFNOSUPPORT; + goto bad; } - - error = gif_set_tunnel(GIF2IFP(sc), src, dst); + error = gif_set_tunnel(ifp, src, dst); break; - -#ifdef SIOCDIFPHYADDR case SIOCDIFPHYADDR: - gif_delete_tunnel(GIF2IFP(sc)); + gif_delete_tunnel(ifp); break; -#endif - case SIOCGIFPSRCADDR: + case SIOCGIFPDSTADDR: #ifdef INET6 case SIOCGIFPSRCADDR_IN6: -#endif /* INET6 */ - if (sc->gif_psrc == NULL) { + case SIOCGIFPDSTADDR_IN6: +#endif + if (sc->gif_family == 0) { error = EADDRNOTAVAIL; - goto bad; + break; } - src = sc->gif_psrc; + GIF_RLOCK(sc); switch (cmd) { #ifdef INET case SIOCGIFPSRCADDR: - dst = &ifr->ifr_addr; - size = sizeof(ifr->ifr_addr); + case SIOCGIFPDSTADDR: + if (sc->gif_family != AF_INET) { + error = EADDRNOTAVAIL; + break; + } + sin = (struct sockaddr_in *)&ifr->ifr_addr; + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); break; -#endif /* INET */ +#endif #ifdef INET6 case SIOCGIFPSRCADDR_IN6: - dst = (struct sockaddr *) + case SIOCGIFPDSTADDR_IN6: + if (sc->gif_family != AF_INET6) { + error = EADDRNOTAVAIL; + break; + } + sin6 = (struct sockaddr_in6 *) &(((struct in6_ifreq *)data)->ifr_addr); - size = sizeof(((struct in6_ifreq *)data)->ifr_addr); + memset(sin6, 0, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); break; -#endif /* INET6 */ +#endif default: - error = EADDRNOTAVAIL; - goto bad; - } - if (src->sa_len > size) - return EINVAL; - 
bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); -#ifdef INET6 - if (dst->sa_family == AF_INET6) { - error = sa6_recoverscope((struct sockaddr_in6 *)dst); - if (error != 0) - return (error); + error = EAFNOSUPPORT; } + if (error == 0) { + switch (cmd) { +#ifdef INET + case SIOCGIFPSRCADDR: + sin->sin_addr = sc->gif_iphdr->ip_src; + break; + case SIOCGIFPDSTADDR: + sin->sin_addr = sc->gif_iphdr->ip_dst; + break; #endif - break; - - case SIOCGIFPDSTADDR: #ifdef INET6 - case SIOCGIFPDSTADDR_IN6: -#endif /* INET6 */ - if (sc->gif_pdst == NULL) { - error = EADDRNOTAVAIL; - goto bad; + case SIOCGIFPSRCADDR_IN6: + sin6->sin6_addr = sc->gif_ip6hdr->ip6_src; + break; + case SIOCGIFPDSTADDR_IN6: + sin6->sin6_addr = sc->gif_ip6hdr->ip6_dst; + break; +#endif + } } - src = sc->gif_pdst; + GIF_RUNLOCK(sc); + if (error != 0) + break; switch (cmd) { #ifdef INET + case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: - dst = &ifr->ifr_addr; - size = sizeof(ifr->ifr_addr); + error = prison_if(curthread->td_ucred, + (struct sockaddr *)sin); + if (error != 0) + memset(sin, 0, sizeof(*sin)); break; -#endif /* INET */ +#endif #ifdef INET6 + case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: - dst = (struct sockaddr *) - &(((struct in6_ifreq *)data)->ifr_addr); - size = sizeof(((struct in6_ifreq *)data)->ifr_addr); - break; -#endif /* INET6 */ - default: - error = EADDRNOTAVAIL; - goto bad; - } - if (src->sa_len > size) - return EINVAL; - error = prison_if(curthread->td_ucred, src); - if (error != 0) - return (error); - error = prison_if(curthread->td_ucred, dst); - if (error != 0) - return (error); - bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); -#ifdef INET6 - if (dst->sa_family == AF_INET6) { - error = sa6_recoverscope((struct sockaddr_in6 *)dst); + error = prison_if(curthread->td_ucred, + (struct sockaddr *)sin6); + if (error == 0) + error = sa6_recoverscope(sin6); if (error != 0) - return (error); - } + memset(sin6, 0, sizeof(*sin6)); #endif - break; - - case SIOCGLIFPHYADDR: - if 
(sc->gif_psrc == NULL || sc->gif_pdst == NULL) { - error = EADDRNOTAVAIL; - goto bad; } - - /* copy src */ - src = sc->gif_psrc; - dst = (struct sockaddr *) - &(((struct if_laddrreq *)data)->addr); - size = sizeof(((struct if_laddrreq *)data)->addr); - if (src->sa_len > size) - return EINVAL; - bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); - - /* copy dst */ - src = sc->gif_pdst; - dst = (struct sockaddr *) - &(((struct if_laddrreq *)data)->dstaddr); - size = sizeof(((struct if_laddrreq *)data)->dstaddr); - if (src->sa_len > size) - return EINVAL; - bcopy((caddr_t)src, (caddr_t)dst, src->sa_len); break; - - case SIOCSIFFLAGS: - /* if_ioctl() takes care of it */ + case SIOCGTUNFIB: + ifr->ifr_fib = sc->gif_fibnum; + break; + case SIOCSTUNFIB: + if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0) + break; + if (ifr->ifr_fib >= rt_numfibs) + error = EINVAL; + else + sc->gif_fibnum = ifr->ifr_fib; break; - case GIFGOPTS: options = sc->gif_options; - error = copyout(&options, ifr->ifr_data, - sizeof(options)); + error = copyout(&options, ifr->ifr_data, sizeof(options)); break; - case GIFSOPTS: if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0) break; @@ -887,151 +917,154 @@ gif_ioctl(ifp, cmd, data) else sc->gif_options = options; break; - default: error = EINVAL; break; } - bad: - return error; +bad: + sx_xunlock(&gif_ioctl_sx); + return (error); } -/* - * XXXRW: There's a general event-ordering issue here: the code to check - * if a given tunnel is already present happens before we perform a - * potentially blocking setup of the tunnel. This code needs to be - * re-ordered so that the check and replacement can be atomic using - * a mutex. 
- */ -int -gif_set_tunnel(ifp, src, dst) - struct ifnet *ifp; - struct sockaddr *src; - struct sockaddr *dst; +static void +gif_detach(struct gif_softc *sc) { - struct gif_softc *sc = ifp->if_softc; - struct gif_softc *sc2; - struct sockaddr *osrc, *odst, *sa; - int error = 0; - - mtx_lock(&gif_mtx); - LIST_FOREACH(sc2, &V_gif_softc_list, gif_list) { - if (sc2 == sc) - continue; - if (!sc2->gif_pdst || !sc2->gif_psrc) - continue; - if (sc2->gif_pdst->sa_family != dst->sa_family || - sc2->gif_pdst->sa_len != dst->sa_len || - sc2->gif_psrc->sa_family != src->sa_family || - sc2->gif_psrc->sa_len != src->sa_len) - continue; - - /* - * Disallow parallel tunnels unless instructed - * otherwise. - */ - if (!V_parallel_tunnels && - bcmp(sc2->gif_pdst, dst, dst->sa_len) == 0 && - bcmp(sc2->gif_psrc, src, src->sa_len) == 0) { - error = EADDRNOTAVAIL; - mtx_unlock(&gif_mtx); - goto bad; - } - /* XXX both end must be valid? (I mean, not 0.0.0.0) */ - } - mtx_unlock(&gif_mtx); + sx_assert(&gif_ioctl_sx, SA_XLOCKED); + if (sc->gif_ecookie != NULL) + encap_detach(sc->gif_ecookie); + sc->gif_ecookie = NULL; +} + +static int +gif_attach(struct gif_softc *sc, int af) +{ - /* XXX we can detach from both, but be polite just in case */ - if (sc->gif_psrc) - switch (sc->gif_psrc->sa_family) { + sx_assert(&gif_ioctl_sx, SA_XLOCKED); + switch (af) { #ifdef INET - case AF_INET: - (void)in_gif_detach(sc); - break; + case AF_INET: + return (in_gif_attach(sc)); #endif #ifdef INET6 - case AF_INET6: - (void)in6_gif_detach(sc); - break; + case AF_INET6: + return (in6_gif_attach(sc)); #endif - } - - osrc = sc->gif_psrc; - sa = (struct sockaddr *)malloc(src->sa_len, M_IFADDR, M_WAITOK); - bcopy((caddr_t)src, (caddr_t)sa, src->sa_len); - sc->gif_psrc = sa; + } + return (EAFNOSUPPORT); +} - odst = sc->gif_pdst; - sa = (struct sockaddr *)malloc(dst->sa_len, M_IFADDR, M_WAITOK); - bcopy((caddr_t)dst, (caddr_t)sa, dst->sa_len); - sc->gif_pdst = sa; +static int +gif_set_tunnel(struct ifnet *ifp, struct 
sockaddr *src, struct sockaddr *dst) +{ + struct gif_softc *sc = ifp->if_softc; + struct gif_softc *tsc; +#ifdef INET + struct ip *ip; +#endif +#ifdef INET6 + struct ip6_hdr *ip6; +#endif + void *hdr; + int error = 0; - switch (sc->gif_psrc->sa_family) { + if (sc == NULL) + return (ENXIO); + /* Disallow parallel tunnels unless instructed otherwise. */ + if (V_parallel_tunnels == 0) { + GIF_LIST_LOCK(); + LIST_FOREACH(tsc, &V_gif_softc_list, gif_list) { + if (tsc == sc || tsc->gif_family != src->sa_family) + continue; +#ifdef INET + if (tsc->gif_family == AF_INET && + tsc->gif_iphdr->ip_src.s_addr == + satosin(src)->sin_addr.s_addr && + tsc->gif_iphdr->ip_dst.s_addr == + satosin(dst)->sin_addr.s_addr) { + error = EADDRNOTAVAIL; + GIF_LIST_UNLOCK(); + goto bad; + } +#endif +#ifdef INET6 + if (tsc->gif_family == AF_INET6 && + IN6_ARE_ADDR_EQUAL(&tsc->gif_ip6hdr->ip6_src, + &satosin6(src)->sin6_addr) && + IN6_ARE_ADDR_EQUAL(&tsc->gif_ip6hdr->ip6_dst, + &satosin6(dst)->sin6_addr)) { + error = EADDRNOTAVAIL; + GIF_LIST_UNLOCK(); + goto bad; + } +#endif + } + GIF_LIST_UNLOCK(); + } + switch (src->sa_family) { #ifdef INET case AF_INET: - error = in_gif_attach(sc); + hdr = ip = malloc(sizeof(struct ip), M_GIF, + M_WAITOK | M_ZERO); + ip->ip_src.s_addr = satosin(src)->sin_addr.s_addr; + ip->ip_dst.s_addr = satosin(dst)->sin_addr.s_addr; break; #endif #ifdef INET6 case AF_INET6: - /* - * Check validity of the scope zone ID of the addresses, and - * convert it into the kernel internal form if necessary. 
- */ - error = sa6_embedscope((struct sockaddr_in6 *)sc->gif_psrc, 0); - if (error != 0) - break; - error = sa6_embedscope((struct sockaddr_in6 *)sc->gif_pdst, 0); - if (error != 0) - break; - error = in6_gif_attach(sc); + hdr = ip6 = malloc(sizeof(struct ip6_hdr), M_GIF, + M_WAITOK | M_ZERO); + ip6->ip6_src = satosin6(src)->sin6_addr; + ip6->ip6_dst = satosin6(dst)->sin6_addr; + ip6->ip6_vfc = IPV6_VERSION; break; #endif + default: + return (EAFNOSUPPORT); } - if (error) { - /* rollback */ - free((caddr_t)sc->gif_psrc, M_IFADDR); - free((caddr_t)sc->gif_pdst, M_IFADDR); - sc->gif_psrc = osrc; - sc->gif_pdst = odst; - goto bad; - } - - if (osrc) - free((caddr_t)osrc, M_IFADDR); - if (odst) - free((caddr_t)odst, M_IFADDR); - bad: - if (sc->gif_psrc && sc->gif_pdst) + if (sc->gif_family != src->sa_family) + gif_detach(sc); + if (sc->gif_family == 0 || + sc->gif_family != src->sa_family) + error = gif_attach(sc, src->sa_family); + + GIF_WLOCK(sc); + if (sc->gif_family != 0) + free(sc->gif_hdr, M_GIF); + sc->gif_family = src->sa_family; + sc->gif_hdr = hdr; + GIF_WUNLOCK(sc); +#if defined(INET) || defined(INET6) +bad: +#endif + if (error == 0 && sc->gif_family != 0) { ifp->if_drv_flags |= IFF_DRV_RUNNING; - else + if_link_state_change(ifp, LINK_STATE_UP); + } else { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - - return error; + if_link_state_change(ifp, LINK_STATE_DOWN); + } + return (error); } -void -gif_delete_tunnel(ifp) - struct ifnet *ifp; +static void +gif_delete_tunnel(struct ifnet *ifp) { struct gif_softc *sc = ifp->if_softc; + int family; - if (sc->gif_psrc) { - free((caddr_t)sc->gif_psrc, M_IFADDR); - sc->gif_psrc = NULL; - } - if (sc->gif_pdst) { - free((caddr_t)sc->gif_pdst, M_IFADDR); - sc->gif_pdst = NULL; + if (sc == NULL) + return; + + GIF_WLOCK(sc); + family = sc->gif_family; + sc->gif_family = 0; + GIF_WUNLOCK(sc); + if (family != 0) { + gif_detach(sc); + free(sc->gif_hdr, M_GIF); } - /* it is safe to detach from both */ -#ifdef INET - 
(void)in_gif_detach(sc); -#endif -#ifdef INET6 - (void)in6_gif_detach(sc); -#endif ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + if_link_state_change(ifp, LINK_STATE_DOWN); } diff --git a/freebsd/sys/net/if_gif.h b/freebsd/sys/net/if_gif.h index a2f214c5..28da85bd 100644 --- a/freebsd/sys/net/if_gif.h +++ b/freebsd/sys/net/if_gif.h @@ -30,21 +30,17 @@ * SUCH DAMAGE. */ -/* - * if_gif.h - */ - #ifndef _NET_IF_GIF_H_ #define _NET_IF_GIF_H_ - #ifdef _KERNEL #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <netinet/in.h> -/* xxx sigh, why route have struct route instead of pointer? */ +struct ip; +struct ip6_hdr; struct encaptab; extern void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, @@ -56,44 +52,44 @@ extern void (*ng_gif_attach_p)(struct ifnet *ifp); extern void (*ng_gif_detach_p)(struct ifnet *ifp); struct gif_softc { - struct ifnet *gif_ifp; - struct mtx gif_mtx; - struct sockaddr *gif_psrc; /* Physical src addr */ - struct sockaddr *gif_pdst; /* Physical dst addr */ + struct ifnet *gif_ifp; + struct rmlock gif_lock; + const struct encaptab *gif_ecookie; + int gif_family; + int gif_flags; + u_int gif_fibnum; + u_int gif_options; + void *gif_netgraph; /* netgraph node info */ union { - struct route gifscr_ro; /* xxx */ + void *hdr; + struct ip *iphdr; #ifdef INET6 - struct route_in6 gifscr_ro6; /* xxx */ + struct ip6_hdr *ip6hdr; #endif - } gifsc_gifscr; - int gif_flags; - u_int gif_fibnum; - const struct encaptab *encap_cookie4; - const struct encaptab *encap_cookie6; - void *gif_netgraph; /* ng_gif(4) netgraph node info */ - u_int gif_options; - LIST_ENTRY(gif_softc) gif_list; /* all gif's are linked */ + } gif_uhdr; + LIST_ENTRY(gif_softc) gif_list; /* all gif's are linked */ }; #define GIF2IFP(sc) ((sc)->gif_ifp) -#define GIF_LOCK_INIT(sc) mtx_init(&(sc)->gif_mtx, "gif softc", \ - NULL, MTX_DEF) -#define GIF_LOCK_DESTROY(sc) mtx_destroy(&(sc)->gif_mtx) -#define GIF_LOCK(sc) mtx_lock(&(sc)->gif_mtx) -#define 
GIF_UNLOCK(sc) mtx_unlock(&(sc)->gif_mtx) -#define GIF_LOCK_ASSERT(sc) mtx_assert(&(sc)->gif_mtx, MA_OWNED) - -#define gif_ro gifsc_gifscr.gifscr_ro +#define GIF_LOCK_INIT(sc) rm_init(&(sc)->gif_lock, "gif softc") +#define GIF_LOCK_DESTROY(sc) rm_destroy(&(sc)->gif_lock) +#define GIF_RLOCK_TRACKER struct rm_priotracker gif_tracker +#define GIF_RLOCK(sc) rm_rlock(&(sc)->gif_lock, &gif_tracker) +#define GIF_RUNLOCK(sc) rm_runlock(&(sc)->gif_lock, &gif_tracker) +#define GIF_RLOCK_ASSERT(sc) rm_assert(&(sc)->gif_lock, RA_RLOCKED) +#define GIF_WLOCK(sc) rm_wlock(&(sc)->gif_lock) +#define GIF_WUNLOCK(sc) rm_wunlock(&(sc)->gif_lock) +#define GIF_WLOCK_ASSERT(sc) rm_assert(&(sc)->gif_lock, RA_WLOCKED) + +#define gif_iphdr gif_uhdr.iphdr +#define gif_hdr gif_uhdr.hdr #ifdef INET6 -#define gif_ro6 gifsc_gifscr.gifscr_ro6 +#define gif_ip6hdr gif_uhdr.ip6hdr #endif #define GIF_MTU (1280) /* Default MTU */ #define GIF_MTU_MIN (1280) /* Minimum MTU */ #define GIF_MTU_MAX (8192) /* Maximum MTU */ -#define MTAG_GIF 1080679712 -#define MTAG_GIF_CALLED 0 - struct etherip_header { #if BYTE_ORDER == LITTLE_ENDIAN u_int eip_resvl:4, /* reserved */ @@ -111,20 +107,26 @@ struct etherip_header { #define ETHERIP_ALIGN 2 /* Prototypes */ -void gif_input(struct mbuf *, int, struct ifnet *); -int gif_output(struct ifnet *, struct mbuf *, struct sockaddr *, +void gif_input(struct mbuf *, struct ifnet *, int, uint8_t); +int gif_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); -int gif_ioctl(struct ifnet *, u_long, caddr_t); -int gif_set_tunnel(struct ifnet *, struct sockaddr *, struct sockaddr *); -void gif_delete_tunnel(struct ifnet *); int gif_encapcheck(const struct mbuf *, int, int, void *); +#ifdef INET +int in_gif_output(struct ifnet *, struct mbuf *, int, uint8_t); +int in_gif_encapcheck(const struct mbuf *, int, int, void *); +int in_gif_attach(struct gif_softc *); +#endif +#ifdef INET6 +int in6_gif_output(struct ifnet *, struct mbuf *, int, uint8_t); +int 
in6_gif_encapcheck(const struct mbuf *, int, int, void *); +int in6_gif_attach(struct gif_softc *); +#endif #endif /* _KERNEL */ #define GIFGOPTS _IOWR('i', 150, struct ifreq) #define GIFSOPTS _IOW('i', 151, struct ifreq) -#define GIF_ACCEPT_REVETHIP 0x0001 -#define GIF_SEND_REVETHIP 0x0010 -#define GIF_OPTMASK (GIF_ACCEPT_REVETHIP|GIF_SEND_REVETHIP) +#define GIF_IGNORE_SOURCE 0x0002 +#define GIF_OPTMASK (GIF_IGNORE_SOURCE) #endif /* _NET_IF_GIF_H_ */ diff --git a/freebsd/sys/net/if_gre.c b/freebsd/sys/net/if_gre.c index b7e0bd15..68b515ea 100644 --- a/freebsd/sys/net/if_gre.c +++ b/freebsd/sys/net/if_gre.c @@ -1,10 +1,8 @@ #include <machine/rtems-bsd-kernel-space.h> -/* $NetBSD: if_gre.c,v 1.49 2003/12/11 00:22:29 itojun Exp $ */ -/* $FreeBSD$ */ - /*- * Copyright (c) 1998 The NetBSD Foundation, Inc. + * Copyright (c) 2014 Andrey V. Elsukov <ae@FreeBSD.org> * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -32,24 +30,20 @@ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. + * + * $NetBSD: if_gre.c,v 1.49 2003/12/11 00:22:29 itojun Exp $ */ -/* - * Encapsulate L3 protocols into IP - * See RFC 2784 (successor of RFC 1701 and 1702) for more details. - * If_gre is compatible with Cisco GRE tunnels, so you can - * have a NetBSD box as the other end of a tunnel interface of a Cisco - * router. See gre(4) for more details. 
- * Also supported: IP in IP encaps (proto 55) as of RFC 2004 - */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); -#include <rtems/bsd/local/opt_atalk.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/sys/param.h> #include <sys/jail.h> #include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> #include <sys/libkern.h> #include <sys/malloc.h> #include <sys/module.h> @@ -57,97 +51,76 @@ #include <sys/priv.h> #include <sys/proc.h> #include <sys/protosw.h> +#include <sys/rmlock.h> #include <sys/socket.h> #include <sys/sockio.h> +#include <sys/sx.h> #include <sys/sysctl.h> +#include <sys/syslog.h> #include <sys/systm.h> #include <net/ethernet.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_clone.h> #include <net/if_types.h> -#include <net/route.h> +#include <net/netisr.h> #include <net/vnet.h> +#include <net/route.h> -#ifdef INET #include <netinet/in.h> +#ifdef INET #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> -#include <netinet/ip_gre.h> #include <netinet/ip_var.h> -#include <netinet/ip_encap.h> -#else -#error "Huh? if_gre without inet?" #endif -#include <net/bpf.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/in6_var.h> +#include <netinet6/ip6_var.h> +#include <netinet6/scope6_var.h> +#endif +#include <netinet/ip_encap.h> +#include <net/bpf.h> #include <net/if_gre.h> -/* - * It is not easy to calculate the right value for a GRE MTU. - * We leave this task to the admin and use the same default that - * other vendors use. - */ -#define GREMTU 1476 - -#define GRENAME "gre" - -#define MTAG_COOKIE_GRE 1307983903 -#define MTAG_GRE_NESTING 1 -struct mtag_gre_nesting { - uint16_t count; - uint16_t max; - struct ifnet *ifp[]; -}; - -/* - * gre_mtx protects all global variables in if_gre.c. - * XXX: gre_softc data not protected yet. 
- */ -struct mtx gre_mtx; -static MALLOC_DEFINE(M_GRE, GRENAME, "Generic Routing Encapsulation"); - -struct gre_softc_head gre_softc_list; +#include <machine/in_cksum.h> +#include <security/mac/mac_framework.h> + +#define GREMTU 1500 +static const char grename[] = "gre"; +static MALLOC_DEFINE(M_GRE, grename, "Generic Routing Encapsulation"); +static VNET_DEFINE(struct mtx, gre_mtx); +#define V_gre_mtx VNET(gre_mtx) +#define GRE_LIST_LOCK_INIT(x) mtx_init(&V_gre_mtx, "gre_mtx", NULL, \ + MTX_DEF) +#define GRE_LIST_LOCK_DESTROY(x) mtx_destroy(&V_gre_mtx) +#define GRE_LIST_LOCK(x) mtx_lock(&V_gre_mtx) +#define GRE_LIST_UNLOCK(x) mtx_unlock(&V_gre_mtx) + +static VNET_DEFINE(LIST_HEAD(, gre_softc), gre_softc_list); +#define V_gre_softc_list VNET(gre_softc_list) +static struct sx gre_ioctl_sx; +SX_SYSINIT(gre_ioctl_sx, &gre_ioctl_sx, "gre_ioctl"); static int gre_clone_create(struct if_clone *, int, caddr_t); static void gre_clone_destroy(struct ifnet *); -static int gre_ioctl(struct ifnet *, u_long, caddr_t); -static int gre_output(struct ifnet *, struct mbuf *, struct sockaddr *, - struct route *ro); - -IFC_SIMPLE_DECLARE(gre, 0); +static VNET_DEFINE(struct if_clone *, gre_cloner); +#define V_gre_cloner VNET(gre_cloner) -static int gre_compute_route(struct gre_softc *sc); - -static void greattach(void); +static void gre_qflush(struct ifnet *); +static int gre_transmit(struct ifnet *, struct mbuf *); +static int gre_ioctl(struct ifnet *, u_long, caddr_t); +static int gre_output(struct ifnet *, struct mbuf *, + const struct sockaddr *, struct route *); -#ifdef INET -extern struct domain inetdomain; -static const struct protosw in_gre_protosw = { - .pr_type = SOCK_RAW, - .pr_domain = &inetdomain, - .pr_protocol = IPPROTO_GRE, - .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = gre_input, - .pr_output = (pr_output_t *)rip_output, - .pr_ctlinput = rip_ctlinput, - .pr_ctloutput = rip_ctloutput, - .pr_usrreqs = &rip_usrreqs -}; -static const struct protosw in_mobile_protosw = { - 
.pr_type = SOCK_RAW, - .pr_domain = &inetdomain, - .pr_protocol = IPPROTO_MOBILE, - .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = gre_mobile_input, - .pr_output = (pr_output_t *)rip_output, - .pr_ctlinput = rip_ctlinput, - .pr_ctloutput = rip_ctloutput, - .pr_usrreqs = &rip_usrreqs -}; -#endif +static void gre_updatehdr(struct gre_softc *); +static int gre_set_tunnel(struct ifnet *, struct sockaddr *, + struct sockaddr *); +static void gre_delete_tunnel(struct ifnet *); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_TUNNEL, gre, CTLFLAG_RW, 0, @@ -163,805 +136,851 @@ static SYSCTL_NODE(_net_link, IFT_TUNNEL, gre, CTLFLAG_RW, 0, */ #define MAX_GRE_NEST 1 #endif -static int max_gre_nesting = MAX_GRE_NEST; -SYSCTL_INT(_net_link_gre, OID_AUTO, max_nesting, CTLFLAG_RW, - &max_gre_nesting, 0, "Max nested tunnels"); -/* ARGSUSED */ +static VNET_DEFINE(int, max_gre_nesting) = MAX_GRE_NEST; +#define V_max_gre_nesting VNET(max_gre_nesting) +SYSCTL_INT(_net_link_gre, OID_AUTO, max_nesting, CTLFLAG_RW | CTLFLAG_VNET, + &VNET_NAME(max_gre_nesting), 0, "Max nested tunnels"); + +static void +vnet_gre_init(const void *unused __unused) +{ + LIST_INIT(&V_gre_softc_list); + GRE_LIST_LOCK_INIT(); + V_gre_cloner = if_clone_simple(grename, gre_clone_create, + gre_clone_destroy, 0); +} +VNET_SYSINIT(vnet_gre_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_gre_init, NULL); + static void -greattach(void) +vnet_gre_uninit(const void *unused __unused) { - mtx_init(&gre_mtx, "gre_mtx", NULL, MTX_DEF); - LIST_INIT(&gre_softc_list); - if_clone_attach(&gre_cloner); + if_clone_detach(V_gre_cloner); + GRE_LIST_LOCK_DESTROY(); } +VNET_SYSUNINIT(vnet_gre_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_gre_uninit, NULL); static int -gre_clone_create(ifc, unit, params) - struct if_clone *ifc; - int unit; - caddr_t params; +gre_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct gre_softc *sc; sc = malloc(sizeof(struct gre_softc), M_GRE, M_WAITOK | M_ZERO); 
- +#ifndef __rtems__ + sc->gre_fibnum = curthread->td_proc->p_fibnum; +#else /* __rtems__ */ + sc->gre_fibnum = BSD_DEFAULT_FIB; +#endif /* __rtems__ */ GRE2IFP(sc) = if_alloc(IFT_TUNNEL); - if (GRE2IFP(sc) == NULL) { - free(sc, M_GRE); - return (ENOSPC); - } - + GRE_LOCK_INIT(sc); GRE2IFP(sc)->if_softc = sc; - if_initname(GRE2IFP(sc), ifc->ifc_name, unit); + if_initname(GRE2IFP(sc), grename, unit); - GRE2IFP(sc)->if_snd.ifq_maxlen = ifqmaxlen; - GRE2IFP(sc)->if_addrlen = 0; - GRE2IFP(sc)->if_hdrlen = 24; /* IP + GRE */ - GRE2IFP(sc)->if_mtu = GREMTU; + GRE2IFP(sc)->if_mtu = sc->gre_mtu = GREMTU; GRE2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST; GRE2IFP(sc)->if_output = gre_output; GRE2IFP(sc)->if_ioctl = gre_ioctl; - sc->g_dst.s_addr = sc->g_src.s_addr = INADDR_ANY; - sc->g_proto = IPPROTO_GRE; - GRE2IFP(sc)->if_flags |= IFF_LINK0; - sc->encap = NULL; -#ifndef __rtems__ - sc->gre_fibnum = curthread->td_proc->p_fibnum; -#else /* __rtems__ */ - sc->gre_fibnum = BSD_DEFAULT_FIB; -#endif /* __rtems__ */ - sc->wccp_ver = WCCP_V1; - sc->key = 0; + GRE2IFP(sc)->if_transmit = gre_transmit; + GRE2IFP(sc)->if_qflush = gre_qflush; + GRE2IFP(sc)->if_capabilities |= IFCAP_LINKSTATE; + GRE2IFP(sc)->if_capenable |= IFCAP_LINKSTATE; if_attach(GRE2IFP(sc)); bpfattach(GRE2IFP(sc), DLT_NULL, sizeof(u_int32_t)); - mtx_lock(&gre_mtx); - LIST_INSERT_HEAD(&gre_softc_list, sc, sc_list); - mtx_unlock(&gre_mtx); + GRE_LIST_LOCK(); + LIST_INSERT_HEAD(&V_gre_softc_list, sc, gre_list); + GRE_LIST_UNLOCK(); return (0); } static void -gre_clone_destroy(ifp) - struct ifnet *ifp; +gre_clone_destroy(struct ifnet *ifp) { - struct gre_softc *sc = ifp->if_softc; - - mtx_lock(&gre_mtx); - LIST_REMOVE(sc, sc_list); - mtx_unlock(&gre_mtx); + struct gre_softc *sc; -#ifdef INET - if (sc->encap != NULL) - encap_detach(sc->encap); -#endif + sx_xlock(&gre_ioctl_sx); + sc = ifp->if_softc; + gre_delete_tunnel(ifp); + GRE_LIST_LOCK(); + LIST_REMOVE(sc, gre_list); + GRE_LIST_UNLOCK(); bpfdetach(ifp); 
if_detach(ifp); + ifp->if_softc = NULL; + sx_xunlock(&gre_ioctl_sx); + if_free(ifp); + GRE_LOCK_DESTROY(sc); free(sc, M_GRE); } -/* - * The output routine. Takes a packet and encapsulates it in the protocol - * given by sc->g_proto. See also RFC 1701 and RFC 2004 - */ static int -gre_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, - struct route *ro) +gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { - int error = 0; - struct gre_softc *sc = ifp->if_softc; - struct greip *gh; - struct ip *ip; - struct m_tag *mtag; - struct mtag_gre_nesting *gt; - size_t len; - u_short gre_ip_id = 0; - uint8_t gre_ip_tos = 0; - u_int16_t etype = 0; - struct mobile_h mob_h; - u_int32_t af; - int extra = 0, max; - - /* - * gre may cause infinite recursion calls when misconfigured. High - * nesting level may cause stack exhaustion. We'll prevent this by - * detecting loops and by introducing upper limit. - */ - mtag = m_tag_locate(m, MTAG_COOKIE_GRE, MTAG_GRE_NESTING, NULL); - if (mtag != NULL) { - struct ifnet **ifp2; - - gt = (struct mtag_gre_nesting *)(mtag + 1); - gt->count++; - if (gt->count > min(gt->max,max_gre_nesting)) { - printf("%s: hit maximum recursion limit %u on %s\n", - __func__, gt->count - 1, ifp->if_xname); - m_freem(m); - error = EIO; /* is there better errno? */ - goto end; - } - - ifp2 = gt->ifp; - for (max = gt->count - 1; max > 0; max--) { - if (*ifp2 == ifp) - break; - ifp2++; - } - if (*ifp2 == ifp) { - printf("%s: detected loop with nexting %u on %s\n", - __func__, gt->count-1, ifp->if_xname); - m_freem(m); - error = EIO; /* is there better errno? 
*/ - goto end; - } - *ifp2 = ifp; + GRE_RLOCK_TRACKER; + struct ifreq *ifr = (struct ifreq *)data; + struct sockaddr *src, *dst; + struct gre_softc *sc; +#ifdef INET + struct sockaddr_in *sin = NULL; +#endif +#ifdef INET6 + struct sockaddr_in6 *sin6 = NULL; +#endif + uint32_t opt; + int error; - } else { - /* - * Given that people should NOT increase max_gre_nesting beyond - * their real needs, we allocate once per packet rather than - * allocating an mtag once per passing through gre. - * - * Note: the sysctl does not actually check for saneness, so we - * limit the maximum numbers of possible recursions here. - */ - max = imin(max_gre_nesting, 256); - /* If someone sets the sysctl <= 0, we want at least 1. */ - max = imax(max, 1); - len = sizeof(struct mtag_gre_nesting) + - max * sizeof(struct ifnet *); - mtag = m_tag_alloc(MTAG_COOKIE_GRE, MTAG_GRE_NESTING, len, - M_NOWAIT); - if (mtag == NULL) { - m_freem(m); - error = ENOMEM; - goto end; - } - gt = (struct mtag_gre_nesting *)(mtag + 1); - bzero(gt, len); - gt->count = 1; - gt->max = max; - *gt->ifp = ifp; - m_tag_prepend(m, mtag); + switch (cmd) { + case SIOCSIFMTU: + /* XXX: */ + if (ifr->ifr_mtu < 576) + return (EINVAL); + break; + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + case SIOCSIFFLAGS: + case SIOCADDMULTI: + case SIOCDELMULTI: + return (0); + case GRESADDRS: + case GRESADDRD: + case GREGADDRS: + case GREGADDRD: + case GRESPROTO: + case GREGPROTO: + return (EOPNOTSUPP); } - - if (!((ifp->if_flags & IFF_UP) && - (ifp->if_drv_flags & IFF_DRV_RUNNING)) || - sc->g_src.s_addr == INADDR_ANY || sc->g_dst.s_addr == INADDR_ANY) { - m_freem(m); - error = ENETDOWN; + src = dst = NULL; + sx_xlock(&gre_ioctl_sx); + sc = ifp->if_softc; + if (sc == NULL) { + error = ENXIO; goto end; } - - gh = NULL; - ip = NULL; - - /* BPF writes need to be handled specially. 
*/ - if (dst->sa_family == AF_UNSPEC) { - bcopy(dst->sa_data, &af, sizeof(af)); - dst->sa_family = af; - } - - if (bpf_peers_present(ifp->if_bpf)) { - af = dst->sa_family; - bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); - } - - if ((ifp->if_flags & IFF_MONITOR) != 0) { - m_freem(m); - error = ENETDOWN; + error = 0; + switch (cmd) { + case SIOCSIFMTU: + GRE_WLOCK(sc); + sc->gre_mtu = ifr->ifr_mtu; + gre_updatehdr(sc); + GRE_WUNLOCK(sc); goto end; - } - - m->m_flags &= ~(M_BCAST|M_MCAST); - - if (sc->g_proto == IPPROTO_MOBILE) { - if (dst->sa_family == AF_INET) { - struct mbuf *m0; - int msiz; - - ip = mtod(m, struct ip *); - - /* - * RFC2004 specifies that fragmented diagrams shouldn't - * be encapsulated. - */ - if (ip->ip_off & (IP_MF | IP_OFFMASK)) { - _IF_DROP(&ifp->if_snd); - m_freem(m); - error = EINVAL; /* is there better errno? */ - goto end; - } - memset(&mob_h, 0, MOB_H_SIZ_L); - mob_h.proto = (ip->ip_p) << 8; - mob_h.odst = ip->ip_dst.s_addr; - ip->ip_dst.s_addr = sc->g_dst.s_addr; - - /* - * If the packet comes from our host, we only change - * the destination address in the IP header. 
- * Else we also need to save and change the source - */ - if (in_hosteq(ip->ip_src, sc->g_src)) { - msiz = MOB_H_SIZ_S; - } else { - mob_h.proto |= MOB_H_SBIT; - mob_h.osrc = ip->ip_src.s_addr; - ip->ip_src.s_addr = sc->g_src.s_addr; - msiz = MOB_H_SIZ_L; - } - mob_h.proto = htons(mob_h.proto); - mob_h.hcrc = gre_in_cksum((u_int16_t *)&mob_h, msiz); - - if ((m->m_data - msiz) < m->m_pktdat) { - /* need new mbuf */ - MGETHDR(m0, M_DONTWAIT, MT_DATA); - if (m0 == NULL) { - _IF_DROP(&ifp->if_snd); - m_freem(m); - error = ENOBUFS; - goto end; - } - m0->m_next = m; - m->m_data += sizeof(struct ip); - m->m_len -= sizeof(struct ip); - m0->m_pkthdr.len = m->m_pkthdr.len + msiz; - m0->m_len = msiz + sizeof(struct ip); - m0->m_data += max_linkhdr; - memcpy(mtod(m0, caddr_t), (caddr_t)ip, - sizeof(struct ip)); - m = m0; - } else { /* we have some space left in the old one */ - m->m_data -= msiz; - m->m_len += msiz; - m->m_pkthdr.len += msiz; - bcopy(ip, mtod(m, caddr_t), - sizeof(struct ip)); - } - ip = mtod(m, struct ip *); - memcpy((caddr_t)(ip + 1), &mob_h, (unsigned)msiz); - ip->ip_len = ntohs(ip->ip_len) + msiz; - } else { /* AF_INET */ - _IF_DROP(&ifp->if_snd); - m_freem(m); - error = EINVAL; - goto end; - } - } else if (sc->g_proto == IPPROTO_GRE) { - switch (dst->sa_family) { - case AF_INET: - ip = mtod(m, struct ip *); - gre_ip_tos = ip->ip_tos; - gre_ip_id = ip->ip_id; - if (sc->wccp_ver == WCCP_V2) { - extra = sizeof(uint32_t); - etype = WCCP_PROTOCOL_TYPE; - } else { - etype = ETHERTYPE_IP; - } - break; + case SIOCSIFPHYADDR: #ifdef INET6 - case AF_INET6: - gre_ip_id = ip_newid(); - etype = ETHERTYPE_IPV6; + case SIOCSIFPHYADDR_IN6: +#endif + error = EINVAL; + switch (cmd) { +#ifdef INET + case SIOCSIFPHYADDR: + src = (struct sockaddr *) + &(((struct in_aliasreq *)data)->ifra_addr); + dst = (struct sockaddr *) + &(((struct in_aliasreq *)data)->ifra_dstaddr); break; #endif -#ifdef NETATALK - case AF_APPLETALK: - etype = ETHERTYPE_ATALK; +#ifdef INET6 + case 
SIOCSIFPHYADDR_IN6: + src = (struct sockaddr *) + &(((struct in6_aliasreq *)data)->ifra_addr); + dst = (struct sockaddr *) + &(((struct in6_aliasreq *)data)->ifra_dstaddr); break; #endif default: - _IF_DROP(&ifp->if_snd); - m_freem(m); error = EAFNOSUPPORT; goto end; } - - /* Reserve space for GRE header + optional GRE key */ - int hdrlen = sizeof(struct greip) + extra; - if (sc->key) - hdrlen += sizeof(uint32_t); - M_PREPEND(m, hdrlen, M_DONTWAIT); - } else { - _IF_DROP(&ifp->if_snd); - m_freem(m); - error = EINVAL; - goto end; - } - - if (m == NULL) { /* mbuf allocation failed */ - _IF_DROP(&ifp->if_snd); - error = ENOBUFS; - goto end; - } - - M_SETFIB(m, sc->gre_fibnum); /* The envelope may use a different FIB */ - - gh = mtod(m, struct greip *); - if (sc->g_proto == IPPROTO_GRE) { - uint32_t *options = gh->gi_options; - - memset((void *)gh, 0, sizeof(struct greip) + extra); - gh->gi_ptype = htons(etype); - gh->gi_flags = 0; - - /* Add key option */ - if (sc->key) - { - gh->gi_flags |= htons(GRE_KP); - *(options++) = htonl(sc->key); - } - } - - gh->gi_pr = sc->g_proto; - if (sc->g_proto != IPPROTO_MOBILE) { - gh->gi_src = sc->g_src; - gh->gi_dst = sc->g_dst; - ((struct ip*)gh)->ip_v = IPPROTO_IPV4; - ((struct ip*)gh)->ip_hl = (sizeof(struct ip)) >> 2; - ((struct ip*)gh)->ip_ttl = GRE_TTL; - ((struct ip*)gh)->ip_tos = gre_ip_tos; - ((struct ip*)gh)->ip_id = gre_ip_id; - gh->gi_len = m->m_pkthdr.len; - } - - ifp->if_opackets++; - ifp->if_obytes += m->m_pkthdr.len; - /* - * Send it off and with IP_FORWARD flag to prevent it from - * overwriting the ip_id again. ip_id is already set to the - * ip_id of the encapsulated packet. 
- */ - error = ip_output(m, NULL, &sc->route, IP_FORWARDING, - (struct ip_moptions *)NULL, (struct inpcb *)NULL); - end: - if (error) - ifp->if_oerrors++; - return (error); -} - -static int -gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) -{ - struct ifreq *ifr = (struct ifreq *)data; - struct if_laddrreq *lifr = (struct if_laddrreq *)data; - struct in_aliasreq *aifr = (struct in_aliasreq *)data; - struct gre_softc *sc = ifp->if_softc; - int s; - struct sockaddr_in si; - struct sockaddr *sa = NULL; - int error, adj; - struct sockaddr_in sp, sm, dp, dm; - uint32_t key; - - error = 0; - adj = 0; + /* sa_family must be equal */ + if (src->sa_family != dst->sa_family || + src->sa_len != dst->sa_len) + goto end; - s = splnet(); - switch (cmd) { - case SIOCSIFADDR: - ifp->if_flags |= IFF_UP; - break; - case SIOCSIFDSTADDR: - break; - case SIOCSIFFLAGS: - /* - * XXXRW: Isn't this priv_check() redundant to the ifnet - * layer check? - */ - if ((error = priv_check(curthread, PRIV_NET_SETIFFLAGS)) != 0) - break; - if ((ifr->ifr_flags & IFF_LINK0) != 0) - sc->g_proto = IPPROTO_GRE; - else - sc->g_proto = IPPROTO_MOBILE; - if ((ifr->ifr_flags & IFF_LINK2) != 0) - sc->wccp_ver = WCCP_V2; - else - sc->wccp_ver = WCCP_V1; - goto recompute; - case SIOCSIFMTU: - /* - * XXXRW: Isn't this priv_check() redundant to the ifnet - * layer check? - */ - if ((error = priv_check(curthread, PRIV_NET_SETIFMTU)) != 0) - break; - if (ifr->ifr_mtu < 576) { - error = EINVAL; - break; - } - ifp->if_mtu = ifr->ifr_mtu; - break; - case SIOCGIFMTU: - ifr->ifr_mtu = GRE2IFP(sc)->if_mtu; - break; - case SIOCADDMULTI: - /* - * XXXRW: Isn't this priv_checkr() redundant to the ifnet - * layer check? 
- */ - if ((error = priv_check(curthread, PRIV_NET_ADDMULTI)) != 0) - break; - if (ifr == 0) { - error = EAFNOSUPPORT; - break; - } - switch (ifr->ifr_addr.sa_family) { + /* validate sa_len */ + switch (src->sa_family) { #ifdef INET case AF_INET: + if (src->sa_len != sizeof(struct sockaddr_in)) + goto end; break; #endif #ifdef INET6 case AF_INET6: + if (src->sa_len != sizeof(struct sockaddr_in6)) + goto end; break; #endif default: error = EAFNOSUPPORT; - break; + goto end; } - break; - case SIOCDELMULTI: - /* - * XXXRW: Isn't this priv_check() redundant to the ifnet - * layer check? - */ - if ((error = priv_check(curthread, PRIV_NET_DELIFGROUP)) != 0) - break; - if (ifr == 0) { - error = EAFNOSUPPORT; - break; + /* check sa_family looks sane for the cmd */ + error = EAFNOSUPPORT; + switch (cmd) { +#ifdef INET + case SIOCSIFPHYADDR: + if (src->sa_family == AF_INET) + break; + goto end; +#endif +#ifdef INET6 + case SIOCSIFPHYADDR_IN6: + if (src->sa_family == AF_INET6) + break; + goto end; +#endif } - switch (ifr->ifr_addr.sa_family) { + error = EADDRNOTAVAIL; + switch (src->sa_family) { #ifdef INET case AF_INET: + if (satosin(src)->sin_addr.s_addr == INADDR_ANY || + satosin(dst)->sin_addr.s_addr == INADDR_ANY) + goto end; break; #endif #ifdef INET6 case AF_INET6: - break; + if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr) + || + IN6_IS_ADDR_UNSPECIFIED(&satosin6(dst)->sin6_addr)) + goto end; + /* + * Check validity of the scope zone ID of the + * addresses, and convert it into the kernel + * internal form if necessary. + */ + error = sa6_embedscope(satosin6(src), 0); + if (error != 0) + goto end; + error = sa6_embedscope(satosin6(dst), 0); + if (error != 0) + goto end; #endif - default: - error = EAFNOSUPPORT; - break; } + error = gre_set_tunnel(ifp, src, dst); break; - case GRESPROTO: - /* - * XXXRW: Isn't this priv_check() redundant to the ifnet - * layer check? 
- */ - if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) - break; - sc->g_proto = ifr->ifr_flags; - switch (sc->g_proto) { - case IPPROTO_GRE: - ifp->if_flags |= IFF_LINK0; - break; - case IPPROTO_MOBILE: - ifp->if_flags &= ~IFF_LINK0; - break; - default: - error = EPROTONOSUPPORT; - break; - } - goto recompute; - case GREGPROTO: - ifr->ifr_flags = sc->g_proto; + case SIOCDIFPHYADDR: + gre_delete_tunnel(ifp); break; - case GRESADDRS: - case GRESADDRD: - error = priv_check(curthread, PRIV_NET_GRE); - if (error) - return (error); - /* - * set tunnel endpoints, compute a less specific route - * to the remote end and mark if as up - */ - sa = &ifr->ifr_addr; - if (cmd == GRESADDRS) - sc->g_src = (satosin(sa))->sin_addr; - if (cmd == GRESADDRD) - sc->g_dst = (satosin(sa))->sin_addr; - recompute: -#ifdef INET - if (sc->encap != NULL) { - encap_detach(sc->encap); - sc->encap = NULL; - } + case SIOCGIFPSRCADDR: + case SIOCGIFPDSTADDR: +#ifdef INET6 + case SIOCGIFPSRCADDR_IN6: + case SIOCGIFPDSTADDR_IN6: #endif - if ((sc->g_src.s_addr != INADDR_ANY) && - (sc->g_dst.s_addr != INADDR_ANY)) { - bzero(&sp, sizeof(sp)); - bzero(&sm, sizeof(sm)); - bzero(&dp, sizeof(dp)); - bzero(&dm, sizeof(dm)); - sp.sin_len = sm.sin_len = dp.sin_len = dm.sin_len = - sizeof(struct sockaddr_in); - sp.sin_family = sm.sin_family = dp.sin_family = - dm.sin_family = AF_INET; - sp.sin_addr = sc->g_src; - dp.sin_addr = sc->g_dst; - sm.sin_addr.s_addr = dm.sin_addr.s_addr = - INADDR_BROADCAST; -#ifdef INET - sc->encap = encap_attach(AF_INET, sc->g_proto, - sintosa(&sp), sintosa(&sm), sintosa(&dp), - sintosa(&dm), (sc->g_proto == IPPROTO_GRE) ? 
- &in_gre_protosw : &in_mobile_protosw, sc); - if (sc->encap == NULL) - printf("%s: unable to attach encap\n", - if_name(GRE2IFP(sc))); -#endif - if (sc->route.ro_rt != 0) /* free old route */ - RTFREE(sc->route.ro_rt); - if (gre_compute_route(sc) == 0) - ifp->if_drv_flags |= IFF_DRV_RUNNING; - else - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - } - break; - case GREGADDRS: - memset(&si, 0, sizeof(si)); - si.sin_family = AF_INET; - si.sin_len = sizeof(struct sockaddr_in); - si.sin_addr.s_addr = sc->g_src.s_addr; - sa = sintosa(&si); - error = prison_if(curthread->td_ucred, sa); - if (error != 0) - break; - ifr->ifr_addr = *sa; - break; - case GREGADDRD: - memset(&si, 0, sizeof(si)); - si.sin_family = AF_INET; - si.sin_len = sizeof(struct sockaddr_in); - si.sin_addr.s_addr = sc->g_dst.s_addr; - sa = sintosa(&si); - error = prison_if(curthread->td_ucred, sa); - if (error != 0) - break; - ifr->ifr_addr = *sa; - break; - case SIOCSIFPHYADDR: - /* - * XXXRW: Isn't this priv_check() redundant to the ifnet - * layer check? - */ - if ((error = priv_check(curthread, PRIV_NET_SETIFPHYS)) != 0) - break; - if (aifr->ifra_addr.sin_family != AF_INET || - aifr->ifra_dstaddr.sin_family != AF_INET) { - error = EAFNOSUPPORT; - break; - } - if (aifr->ifra_addr.sin_len != sizeof(si) || - aifr->ifra_dstaddr.sin_len != sizeof(si)) { - error = EINVAL; - break; - } - sc->g_src = aifr->ifra_addr.sin_addr; - sc->g_dst = aifr->ifra_dstaddr.sin_addr; - goto recompute; - case SIOCSLIFPHYADDR: - /* - * XXXRW: Isn't this priv_check() redundant to the ifnet - * layer check? 
- */ - if ((error = priv_check(curthread, PRIV_NET_SETIFPHYS)) != 0) - break; - if (lifr->addr.ss_family != AF_INET || - lifr->dstaddr.ss_family != AF_INET) { - error = EAFNOSUPPORT; - break; - } - if (lifr->addr.ss_len != sizeof(si) || - lifr->dstaddr.ss_len != sizeof(si)) { - error = EINVAL; - break; - } - sc->g_src = (satosin(&lifr->addr))->sin_addr; - sc->g_dst = - (satosin(&lifr->dstaddr))->sin_addr; - goto recompute; - case SIOCDIFPHYADDR: - /* - * XXXRW: Isn't this priv_check() redundant to the ifnet - * layer check? - */ - if ((error = priv_check(curthread, PRIV_NET_SETIFPHYS)) != 0) - break; - sc->g_src.s_addr = INADDR_ANY; - sc->g_dst.s_addr = INADDR_ANY; - goto recompute; - case SIOCGLIFPHYADDR: - if (sc->g_src.s_addr == INADDR_ANY || - sc->g_dst.s_addr == INADDR_ANY) { + if (sc->gre_family == 0) { error = EADDRNOTAVAIL; break; } - memset(&si, 0, sizeof(si)); - si.sin_family = AF_INET; - si.sin_len = sizeof(struct sockaddr_in); - si.sin_addr.s_addr = sc->g_src.s_addr; - error = prison_if(curthread->td_ucred, (struct sockaddr *)&si); - if (error != 0) + GRE_RLOCK(sc); + switch (cmd) { +#ifdef INET + case SIOCGIFPSRCADDR: + case SIOCGIFPDSTADDR: + if (sc->gre_family != AF_INET) { + error = EADDRNOTAVAIL; + break; + } + sin = (struct sockaddr_in *)&ifr->ifr_addr; + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); break; - memcpy(&lifr->addr, &si, sizeof(si)); - si.sin_addr.s_addr = sc->g_dst.s_addr; - error = prison_if(curthread->td_ucred, (struct sockaddr *)&si); - if (error != 0) +#endif +#ifdef INET6 + case SIOCGIFPSRCADDR_IN6: + case SIOCGIFPDSTADDR_IN6: + if (sc->gre_family != AF_INET6) { + error = EADDRNOTAVAIL; + break; + } + sin6 = (struct sockaddr_in6 *) + &(((struct in6_ifreq *)data)->ifr_addr); + memset(sin6, 0, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); break; - memcpy(&lifr->dstaddr, &si, sizeof(si)); - break; - case SIOCGIFPSRCADDR: +#endif + } + if (error == 0) { 
+ switch (cmd) { +#ifdef INET + case SIOCGIFPSRCADDR: + sin->sin_addr = sc->gre_oip.ip_src; + break; + case SIOCGIFPDSTADDR: + sin->sin_addr = sc->gre_oip.ip_dst; + break; +#endif #ifdef INET6 - case SIOCGIFPSRCADDR_IN6: + case SIOCGIFPSRCADDR_IN6: + sin6->sin6_addr = sc->gre_oip6.ip6_src; + break; + case SIOCGIFPDSTADDR_IN6: + sin6->sin6_addr = sc->gre_oip6.ip6_dst; + break; #endif - if (sc->g_src.s_addr == INADDR_ANY) { - error = EADDRNOTAVAIL; - break; + } } - memset(&si, 0, sizeof(si)); - si.sin_family = AF_INET; - si.sin_len = sizeof(struct sockaddr_in); - si.sin_addr.s_addr = sc->g_src.s_addr; - error = prison_if(curthread->td_ucred, (struct sockaddr *)&si); + GRE_RUNLOCK(sc); if (error != 0) break; - bcopy(&si, &ifr->ifr_addr, sizeof(ifr->ifr_addr)); - break; - case SIOCGIFPDSTADDR: + switch (cmd) { +#ifdef INET + case SIOCGIFPSRCADDR: + case SIOCGIFPDSTADDR: + error = prison_if(curthread->td_ucred, + (struct sockaddr *)sin); + if (error != 0) + memset(sin, 0, sizeof(*sin)); + break; +#endif #ifdef INET6 - case SIOCGIFPDSTADDR_IN6: + case SIOCGIFPSRCADDR_IN6: + case SIOCGIFPDSTADDR_IN6: + error = prison_if(curthread->td_ucred, + (struct sockaddr *)sin6); + if (error == 0) + error = sa6_recoverscope(sin6); + if (error != 0) + memset(sin6, 0, sizeof(*sin6)); #endif - if (sc->g_dst.s_addr == INADDR_ANY) { - error = EADDRNOTAVAIL; - break; } - memset(&si, 0, sizeof(si)); - si.sin_family = AF_INET; - si.sin_len = sizeof(struct sockaddr_in); - si.sin_addr.s_addr = sc->g_dst.s_addr; - error = prison_if(curthread->td_ucred, (struct sockaddr *)&si); - if (error != 0) + break; + case SIOCGTUNFIB: + ifr->ifr_fib = sc->gre_fibnum; + break; + case SIOCSTUNFIB: + if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; - bcopy(&si, &ifr->ifr_addr, sizeof(ifr->ifr_addr)); + if (ifr->ifr_fib >= rt_numfibs) + error = EINVAL; + else + sc->gre_fibnum = ifr->ifr_fib; break; case GRESKEY: - error = priv_check(curthread, PRIV_NET_GRE); - if (error) - break; - error = 
copyin(ifr->ifr_data, &key, sizeof(key)); - if (error) + if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; - /* adjust MTU for option header */ - if (key == 0 && sc->key != 0) /* clear */ - adj += sizeof(key); - else if (key != 0 && sc->key == 0) /* set */ - adj -= sizeof(key); - - if (ifp->if_mtu + adj < 576) { - error = EINVAL; + if ((error = copyin(ifr->ifr_data, &opt, sizeof(opt))) != 0) break; + if (sc->gre_key != opt) { + GRE_WLOCK(sc); + sc->gre_key = opt; + gre_updatehdr(sc); + GRE_WUNLOCK(sc); } - ifp->if_mtu += adj; - sc->key = key; break; case GREGKEY: - error = copyout(&sc->key, ifr->ifr_data, sizeof(sc->key)); + error = copyout(&sc->gre_key, ifr->ifr_data, + sizeof(sc->gre_key)); + break; + case GRESOPTS: + if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) + break; + if ((error = copyin(ifr->ifr_data, &opt, sizeof(opt))) != 0) + break; + if (opt & ~GRE_OPTMASK) + error = EINVAL; + else { + if (sc->gre_options != opt) { + GRE_WLOCK(sc); + sc->gre_options = opt; + gre_updatehdr(sc); + GRE_WUNLOCK(sc); + } + } break; + case GREGOPTS: + error = copyout(&sc->gre_options, ifr->ifr_data, + sizeof(sc->gre_options)); + break; default: error = EINVAL; break; } - - splx(s); +end: + sx_xunlock(&gre_ioctl_sx); return (error); } -/* - * computes a route to our destination that is not the one - * which would be taken by ip_output(), as this one will loop back to - * us. If the interface is p2p as a--->b, then a routing entry exists - * If we now send a packet to b (e.g. ping b), this will come down here - * gets src=a, dst=b tacked on and would from ip_output() sent back to - * if_gre. - * Goal here is to compute a route to b that is less specific than - * a-->b. We know that this one exists as in normal operation we have - * at least a default route which matches. 
- */ +static void +gre_updatehdr(struct gre_softc *sc) +{ + struct grehdr *gh = NULL; + uint32_t *opts; + uint16_t flags; + + GRE_WLOCK_ASSERT(sc); + switch (sc->gre_family) { +#ifdef INET + case AF_INET: + sc->gre_hlen = sizeof(struct greip); + sc->gre_oip.ip_v = IPPROTO_IPV4; + sc->gre_oip.ip_hl = sizeof(struct ip) >> 2; + sc->gre_oip.ip_p = IPPROTO_GRE; + gh = &sc->gre_gihdr->gi_gre; + break; +#endif +#ifdef INET6 + case AF_INET6: + sc->gre_hlen = sizeof(struct greip6); + sc->gre_oip6.ip6_vfc = IPV6_VERSION; + sc->gre_oip6.ip6_nxt = IPPROTO_GRE; + gh = &sc->gre_gi6hdr->gi6_gre; + break; +#endif + default: + return; + } + flags = 0; + opts = gh->gre_opts; + if (sc->gre_options & GRE_ENABLE_CSUM) { + flags |= GRE_FLAGS_CP; + sc->gre_hlen += 2 * sizeof(uint16_t); + *opts++ = 0; + } + if (sc->gre_key != 0) { + flags |= GRE_FLAGS_KP; + sc->gre_hlen += sizeof(uint32_t); + *opts++ = htonl(sc->gre_key); + } + if (sc->gre_options & GRE_ENABLE_SEQ) { + flags |= GRE_FLAGS_SP; + sc->gre_hlen += sizeof(uint32_t); + *opts++ = 0; + } else + sc->gre_oseq = 0; + gh->gre_flags = htons(flags); + GRE2IFP(sc)->if_mtu = sc->gre_mtu - sc->gre_hlen; +} + +static void +gre_detach(struct gre_softc *sc) +{ + + sx_assert(&gre_ioctl_sx, SA_XLOCKED); + if (sc->gre_ecookie != NULL) + encap_detach(sc->gre_ecookie); + sc->gre_ecookie = NULL; +} + static int -gre_compute_route(struct gre_softc *sc) +gre_set_tunnel(struct ifnet *ifp, struct sockaddr *src, + struct sockaddr *dst) { - struct route *ro; - - ro = &sc->route; - - memset(ro, 0, sizeof(struct route)); - ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = sc->g_dst; - ro->ro_dst.sa_family = AF_INET; - ro->ro_dst.sa_len = sizeof(ro->ro_dst); - - /* - * toggle last bit, so our interface is not found, but a less - * specific route. I'd rather like to specify a shorter mask, - * but this is not possible. Should work though. XXX - * XXX MRT Use a different FIB for the tunnel to solve this problem. 
- */ - if ((GRE2IFP(sc)->if_flags & IFF_LINK1) == 0) { - ((struct sockaddr_in *)&ro->ro_dst)->sin_addr.s_addr ^= - htonl(0x01); + struct gre_softc *sc, *tsc; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif +#ifdef INET + struct ip *ip; +#endif + void *hdr; + int error; + + sx_assert(&gre_ioctl_sx, SA_XLOCKED); + GRE_LIST_LOCK(); + sc = ifp->if_softc; + LIST_FOREACH(tsc, &V_gre_softc_list, gre_list) { + if (tsc == sc || tsc->gre_family != src->sa_family) + continue; +#ifdef INET + if (tsc->gre_family == AF_INET && + tsc->gre_oip.ip_src.s_addr == + satosin(src)->sin_addr.s_addr && + tsc->gre_oip.ip_dst.s_addr == + satosin(dst)->sin_addr.s_addr) { + GRE_LIST_UNLOCK(); + return (EADDRNOTAVAIL); + } +#endif +#ifdef INET6 + if (tsc->gre_family == AF_INET6 && + IN6_ARE_ADDR_EQUAL(&tsc->gre_oip6.ip6_src, + &satosin6(src)->sin6_addr) && + IN6_ARE_ADDR_EQUAL(&tsc->gre_oip6.ip6_dst, + &satosin6(dst)->sin6_addr)) { + GRE_LIST_UNLOCK(); + return (EADDRNOTAVAIL); + } +#endif } + GRE_LIST_UNLOCK(); -#ifdef DIAGNOSTIC - printf("%s: searching for a route to %s", if_name(GRE2IFP(sc)), - inet_ntoa(((struct sockaddr_in *)&ro->ro_dst)->sin_addr)); + switch (src->sa_family) { +#ifdef INET + case AF_INET: + hdr = ip = malloc(sizeof(struct greip) + + 3 * sizeof(uint32_t), M_GRE, M_WAITOK | M_ZERO); + ip->ip_src = satosin(src)->sin_addr; + ip->ip_dst = satosin(dst)->sin_addr; + break; +#endif +#ifdef INET6 + case AF_INET6: + hdr = ip6 = malloc(sizeof(struct greip6) + + 3 * sizeof(uint32_t), M_GRE, M_WAITOK | M_ZERO); + ip6->ip6_src = satosin6(src)->sin6_addr; + ip6->ip6_dst = satosin6(dst)->sin6_addr; + break; +#endif + default: + return (EAFNOSUPPORT); + } + if (sc->gre_family != 0) + gre_detach(sc); + GRE_WLOCK(sc); + if (sc->gre_family != 0) + free(sc->gre_hdr, M_GRE); + sc->gre_family = src->sa_family; + sc->gre_hdr = hdr; + sc->gre_oseq = 0; + sc->gre_iseq = UINT32_MAX; + gre_updatehdr(sc); + GRE_WUNLOCK(sc); + + error = 0; + switch (src->sa_family) { +#ifdef INET + case AF_INET: + error 
= in_gre_attach(sc); + break; +#endif +#ifdef INET6 + case AF_INET6: + error = in6_gre_attach(sc); + break; #endif + } + if (error == 0) { + ifp->if_drv_flags |= IFF_DRV_RUNNING; + if_link_state_change(ifp, LINK_STATE_UP); + } + return (error); +} - rtalloc_fib(ro, sc->gre_fibnum); +static void +gre_delete_tunnel(struct ifnet *ifp) +{ + struct gre_softc *sc = ifp->if_softc; + int family; + + GRE_WLOCK(sc); + family = sc->gre_family; + sc->gre_family = 0; + GRE_WUNLOCK(sc); + if (family != 0) { + gre_detach(sc); + free(sc->gre_hdr, M_GRE); + } + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + if_link_state_change(ifp, LINK_STATE_DOWN); +} - /* - * check if this returned a route at all and this route is no - * recursion to ourself - */ - if (ro->ro_rt == NULL || ro->ro_rt->rt_ifp->if_softc == sc) { -#ifdef DIAGNOSTIC - if (ro->ro_rt == NULL) - printf(" - no route found!\n"); - else - printf(" - route loops back to ourself!\n"); +int +gre_input(struct mbuf **mp, int *offp, int proto) +{ + struct gre_softc *sc; + struct grehdr *gh; + struct ifnet *ifp; + struct mbuf *m; + uint32_t *opts; +#ifdef notyet + uint32_t key; +#endif + uint16_t flags; + int hlen, isr, af; + + m = *mp; + sc = encap_getarg(m); + KASSERT(sc != NULL, ("encap_getarg returned NULL")); + + ifp = GRE2IFP(sc); + hlen = *offp + sizeof(struct grehdr) + 4 * sizeof(uint32_t); + if (m->m_pkthdr.len < hlen) + goto drop; + if (m->m_len < hlen) { + m = m_pullup(m, hlen); + if (m == NULL) + goto drop; + } + gh = (struct grehdr *)mtodo(m, *offp); + flags = ntohs(gh->gre_flags); + if (flags & ~GRE_FLAGS_MASK) + goto drop; + opts = gh->gre_opts; + hlen = 2 * sizeof(uint16_t); + if (flags & GRE_FLAGS_CP) { + /* reserved1 field must be zero */ + if (((uint16_t *)opts)[1] != 0) + goto drop; + if (in_cksum_skip(m, m->m_pkthdr.len, *offp) != 0) + goto drop; + hlen += 2 * sizeof(uint16_t); + opts++; + } + if (flags & GRE_FLAGS_KP) { +#ifdef notyet + /* + * XXX: The current implementation uses the key only for outgoing + * 
packets. But we can check the key value here, or even in the + * encapcheck function. + */ + key = ntohl(*opts); +#endif + hlen += sizeof(uint32_t); + opts++; + } +#ifdef notyet + } else + key = 0; + + if (sc->gre_key != 0 && (key != sc->gre_key || key != 0)) + goto drop; +#endif + if (flags & GRE_FLAGS_SP) { +#ifdef notyet + seq = ntohl(*opts); +#endif + hlen += sizeof(uint32_t); + } + switch (ntohs(gh->gre_proto)) { + case ETHERTYPE_WCCP: + /* + * For WCCP skip an additional 4 bytes if after GRE header + * doesn't follow an IP header. + */ + if (flags == 0 && (*(uint8_t *)gh->gre_opts & 0xF0) != 0x40) + hlen += sizeof(uint32_t); + /* FALLTHROUGH */ + case ETHERTYPE_IP: + isr = NETISR_IP; + af = AF_INET; + break; + case ETHERTYPE_IPV6: + isr = NETISR_IPV6; + af = AF_INET6; + break; + default: + goto drop; + } + m_adj(m, *offp + hlen); + m_clrprotoflags(m); + m->m_pkthdr.rcvif = ifp; + M_SETFIB(m, ifp->if_fib); +#ifdef MAC + mac_ifnet_create_mbuf(ifp, m); #endif - return EADDRNOTAVAIL; + BPF_MTAP2(ifp, &af, sizeof(af), m); + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); + if ((ifp->if_flags & IFF_MONITOR) != 0) + m_freem(m); + else + netisr_dispatch(isr, m); + return (IPPROTO_DONE); +drop: + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); + m_freem(m); + return (IPPROTO_DONE); +} + +#define MTAG_GRE 1307983903 +static int +gre_check_nesting(struct ifnet *ifp, struct mbuf *m) +{ + struct m_tag *mtag; + int count; + + count = 1; + mtag = NULL; + while ((mtag = m_tag_locate(m, MTAG_GRE, 0, mtag)) != NULL) { + if (*(struct ifnet **)(mtag + 1) == ifp) { + log(LOG_NOTICE, "%s: loop detected\n", ifp->if_xname); + return (EIO); + } + count++; + } + if (count > V_max_gre_nesting) { + log(LOG_NOTICE, + "%s: if_output recursively called too many times(%d)\n", + ifp->if_xname, count); + return (EIO); } + mtag = m_tag_alloc(MTAG_GRE, 0, sizeof(struct ifnet *), M_NOWAIT); + if (mtag == NULL) + return (ENOMEM); + *(struct ifnet 
**)(mtag + 1) = ifp; + m_tag_prepend(m, mtag); + return (0); +} - /* - * now change it back - else ip_output will just drop - * the route and search one to this interface ... - */ - if ((GRE2IFP(sc)->if_flags & IFF_LINK1) == 0) - ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = sc->g_dst; +static int +gre_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, + struct route *ro) +{ + uint32_t af; + int error; -#ifdef DIAGNOSTIC - printf(", choosing %s with gateway %s", if_name(ro->ro_rt->rt_ifp), - inet_ntoa(((struct sockaddr_in *)(ro->ro_rt->rt_gateway))->sin_addr)); - printf("\n"); +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error != 0) + goto drop; #endif + if ((ifp->if_flags & IFF_MONITOR) != 0 || + (ifp->if_flags & IFF_UP) == 0) { + error = ENETDOWN; + goto drop; + } + + error = gre_check_nesting(ifp, m); + if (error != 0) + goto drop; - return 0; + m->m_flags &= ~(M_BCAST|M_MCAST); + if (dst->sa_family == AF_UNSPEC) + bcopy(dst->sa_data, &af, sizeof(af)); + else + af = dst->sa_family; + BPF_MTAP2(ifp, &af, sizeof(af), m); + m->m_pkthdr.csum_data = af; /* save af for if_transmit */ + return (ifp->if_transmit(ifp, m)); +drop: + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (error); } -/* - * do a checksum of a buffer - much like in_cksum, which operates on - * mbufs. 
- */ -u_int16_t -gre_in_cksum(u_int16_t *p, u_int len) +static void +gre_setseqn(struct grehdr *gh, uint32_t seq) { - u_int32_t sum = 0; - int nwords = len >> 1; - - while (nwords-- != 0) - sum += *p++; - - if (len & 1) { - union { - u_short w; - u_char c[2]; - } u; - u.c[0] = *(u_char *)p; - u.c[1] = 0; - sum += u.w; + uint32_t *opts; + uint16_t flags; + + opts = gh->gre_opts; + flags = ntohs(gh->gre_flags); + KASSERT((flags & GRE_FLAGS_SP) != 0, + ("gre_setseqn called, but GRE_FLAGS_SP isn't set ")); + if (flags & GRE_FLAGS_CP) + opts++; + if (flags & GRE_FLAGS_KP) + opts++; + *opts = htonl(seq); +} + +static int +gre_transmit(struct ifnet *ifp, struct mbuf *m) +{ + GRE_RLOCK_TRACKER; + struct gre_softc *sc; + struct grehdr *gh; + uint32_t iaf, oaf, oseq; + int error, hlen, olen, plen; + int want_seq, want_csum; + + plen = 0; + sc = ifp->if_softc; + if (sc == NULL) { + error = ENETDOWN; + m_freem(m); + goto drop; + } + GRE_RLOCK(sc); + if (sc->gre_family == 0) { + GRE_RUNLOCK(sc); + error = ENETDOWN; + m_freem(m); + goto drop; + } + iaf = m->m_pkthdr.csum_data; + oaf = sc->gre_family; + hlen = sc->gre_hlen; + want_seq = (sc->gre_options & GRE_ENABLE_SEQ) != 0; + if (want_seq) + oseq = sc->gre_oseq++; /* XXX */ + else + oseq = 0; /* Make compiler happy. 
*/ + want_csum = (sc->gre_options & GRE_ENABLE_CSUM) != 0; + M_SETFIB(m, sc->gre_fibnum); + M_PREPEND(m, hlen, M_NOWAIT); + if (m == NULL) { + GRE_RUNLOCK(sc); + error = ENOBUFS; + goto drop; + } + bcopy(sc->gre_hdr, mtod(m, void *), hlen); + GRE_RUNLOCK(sc); + switch (oaf) { +#ifdef INET + case AF_INET: + olen = sizeof(struct ip); + break; +#endif +#ifdef INET6 + case AF_INET6: + olen = sizeof(struct ip6_hdr); + break; +#endif + default: + error = ENETDOWN; + goto drop; } + gh = (struct grehdr *)mtodo(m, olen); + switch (iaf) { +#ifdef INET + case AF_INET: + gh->gre_proto = htons(ETHERTYPE_IP); + break; +#endif +#ifdef INET6 + case AF_INET6: + gh->gre_proto = htons(ETHERTYPE_IPV6); + break; +#endif + default: + error = ENETDOWN; + goto drop; + } + if (want_seq) + gre_setseqn(gh, oseq); + if (want_csum) { + *(uint16_t *)gh->gre_opts = in_cksum_skip(m, + m->m_pkthdr.len, olen); + } + plen = m->m_pkthdr.len - hlen; + switch (oaf) { +#ifdef INET + case AF_INET: + error = in_gre_output(m, iaf, hlen); + break; +#endif +#ifdef INET6 + case AF_INET6: + error = in6_gre_output(m, iaf, hlen); + break; +#endif + default: + m_freem(m); + error = ENETDOWN; + } +drop: + if (error) + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + else { + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_OBYTES, plen); + } + return (error); +} + +static void +gre_qflush(struct ifnet *ifp __unused) +{ - /* end-around-carry */ - sum = (sum >> 16) + (sum & 0xffff); - sum += (sum >> 16); - return (~sum); } static int @@ -970,16 +989,12 @@ gremodevent(module_t mod, int type, void *data) switch (type) { case MOD_LOAD: - greattach(); - break; case MOD_UNLOAD: - if_clone_detach(&gre_cloner); - mtx_destroy(&gre_mtx); break; default: - return EOPNOTSUPP; + return (EOPNOTSUPP); } - return 0; + return (0); } static moduledata_t gre_mod = { diff --git a/freebsd/sys/net/if_gre.h b/freebsd/sys/net/if_gre.h index 74d16b1c..806b0cb8 100644 --- a/freebsd/sys/net/if_gre.h +++ 
b/freebsd/sys/net/if_gre.h @@ -1,8 +1,6 @@ -/* $NetBSD: if_gre.h,v 1.13 2003/11/10 08:51:52 wiz Exp $ */ -/* $FreeBSD$ */ - /*- * Copyright (c) 1998 The NetBSD Foundation, Inc. + * Copyright (c) 2014 Andrey V. Elsukov <ae@FreeBSD.org> * All rights reserved * * This code is derived from software contributed to The NetBSD Foundation @@ -28,158 +26,111 @@ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. + * + * $NetBSD: if_gre.h,v 1.13 2003/11/10 08:51:52 wiz Exp $ + * $FreeBSD$ */ -#ifndef _NET_IF_GRE_H -#define _NET_IF_GRE_H +#ifndef _NET_IF_GRE_H_ +#define _NET_IF_GRE_H_ -#include <sys/ioccom.h> #ifdef _KERNEL -#include <sys/queue.h> - -/* - * Version of the WCCP, need to be configured manually since - * header for version 2 is the same but IP payload is prepended - * with additional 4-bytes field. - */ -typedef enum { - WCCP_V1 = 0, - WCCP_V2 -} wccp_ver_t; - -struct gre_softc { - struct ifnet *sc_ifp; - LIST_ENTRY(gre_softc) sc_list; - int gre_unit; - int gre_flags; - u_int gre_fibnum; /* use this fib for envelopes */ - struct in_addr g_src; /* source address of gre packets */ - struct in_addr g_dst; /* destination address of gre packets */ - struct route route; /* routing entry that determines, where a - encapsulated packet should go */ - u_char g_proto; /* protocol of encapsulator */ - - const struct encaptab *encap; /* encapsulation cookie */ - - uint32_t key; /* key included in outgoing GRE packets */ - /* zero means none */ - - wccp_ver_t wccp_ver; /* version of the WCCP */ -}; -#define GRE2IFP(sc) ((sc)->sc_ifp) - - -struct gre_h { - u_int16_t flags; /* GRE flags */ - u_int16_t ptype; /* protocol type of payload typically - Ether protocol type*/ - uint32_t options[0]; /* optional options */ -/* - * from here on: fields are optional, presence indicated by flags - * - u_int_16 checksum checksum (one-complements of GRE header - 
and payload - Present if (ck_pres | rt_pres == 1). - Valid if (ck_pres == 1). - u_int_16 offset offset from start of routing filed to - first octet of active SRE (see below). - Present if (ck_pres | rt_pres == 1). - Valid if (rt_pres == 1). - u_int_32 key inserted by encapsulator e.g. for - authentication - Present if (key_pres ==1 ). - u_int_32 seq_num Sequence number to allow for packet order - Present if (seq_pres ==1 ). - struct gre_sre[] routing Routing fileds (see below) - Present if (rt_pres == 1) - */ +/* GRE header according to RFC 2784 and RFC 2890 */ +struct grehdr { + uint16_t gre_flags; /* GRE flags */ +#define GRE_FLAGS_CP 0x8000 /* checksum present */ +#define GRE_FLAGS_KP 0x2000 /* key present */ +#define GRE_FLAGS_SP 0x1000 /* sequence present */ +#define GRE_FLAGS_MASK (GRE_FLAGS_CP|GRE_FLAGS_KP|GRE_FLAGS_SP) + uint16_t gre_proto; /* protocol type */ + uint32_t gre_opts[0]; /* optional fields */ } __packed; +#ifdef INET struct greip { - struct ip gi_i; - struct gre_h gi_g; + struct ip gi_ip; + struct grehdr gi_gre; } __packed; +#endif -#define gi_pr gi_i.ip_p -#define gi_len gi_i.ip_len -#define gi_src gi_i.ip_src -#define gi_dst gi_i.ip_dst -#define gi_ptype gi_g.ptype -#define gi_flags gi_g.flags -#define gi_options gi_g.options - -#define GRE_CP 0x8000 /* Checksum Present */ -#define GRE_RP 0x4000 /* Routing Present */ -#define GRE_KP 0x2000 /* Key Present */ -#define GRE_SP 0x1000 /* Sequence Present */ -#define GRE_SS 0x0800 /* Strict Source Route */ +#ifdef INET6 +struct greip6 { + struct ip6_hdr gi6_ip6; + struct grehdr gi6_gre; +} __packed; +#endif +struct gre_softc { + struct ifnet *gre_ifp; + LIST_ENTRY(gre_softc) gre_list; + struct rmlock gre_lock; + int gre_family; /* AF of delivery header */ + uint32_t gre_iseq; + uint32_t gre_oseq; + uint32_t gre_key; + uint32_t gre_options; + uint32_t gre_mtu; + u_int gre_fibnum; + u_int gre_hlen; /* header size */ + union { + void *hdr; +#ifdef INET + struct greip *gihdr; +#endif +#ifdef INET6 + 
struct greip6 *gi6hdr; +#endif + } gre_uhdr; + const struct encaptab *gre_ecookie; +}; +#define GRE2IFP(sc) ((sc)->gre_ifp) +#define GRE_LOCK_INIT(sc) rm_init(&(sc)->gre_lock, "gre softc") +#define GRE_LOCK_DESTROY(sc) rm_destroy(&(sc)->gre_lock) +#define GRE_RLOCK_TRACKER struct rm_priotracker gre_tracker +#define GRE_RLOCK(sc) rm_rlock(&(sc)->gre_lock, &gre_tracker) +#define GRE_RUNLOCK(sc) rm_runlock(&(sc)->gre_lock, &gre_tracker) +#define GRE_RLOCK_ASSERT(sc) rm_assert(&(sc)->gre_lock, RA_RLOCKED) +#define GRE_WLOCK(sc) rm_wlock(&(sc)->gre_lock) +#define GRE_WUNLOCK(sc) rm_wunlock(&(sc)->gre_lock) +#define GRE_WLOCK_ASSERT(sc) rm_assert(&(sc)->gre_lock, RA_WLOCKED) + +#define gre_hdr gre_uhdr.hdr +#define gre_gihdr gre_uhdr.gihdr +#define gre_gi6hdr gre_uhdr.gi6hdr +#define gre_oip gre_gihdr->gi_ip +#define gre_oip6 gre_gi6hdr->gi6_ip6 + +int gre_input(struct mbuf **, int *, int); +#ifdef INET +int in_gre_attach(struct gre_softc *); +int in_gre_output(struct mbuf *, int, int); +#endif +#ifdef INET6 +int in6_gre_attach(struct gre_softc *); +int in6_gre_output(struct mbuf *, int, int); +#endif /* * CISCO uses special type for GRE tunnel created as part of WCCP * connection, while in fact those packets are just IPv4 encapsulated * into GRE. */ -#define WCCP_PROTOCOL_TYPE 0x883E - -/* - * gre_sre defines a Source route Entry. These are needed if packets - * should be routed over more than one tunnel hop by hop - */ -struct gre_sre { - u_int16_t sre_family; /* address family */ - u_char sre_offset; /* offset to first octet of active entry */ - u_char sre_length; /* number of octets in the SRE. - sre_lengthl==0 -> last entry. 
*/ - u_char *sre_rtinfo; /* the routing information */ -}; - -struct greioctl { - int unit; - struct in_addr addr; -}; - -/* for mobile encaps */ - -struct mobile_h { - u_int16_t proto; /* protocol and S-bit */ - u_int16_t hcrc; /* header checksum */ - u_int32_t odst; /* original destination address */ - u_int32_t osrc; /* original source addr, if S-bit set */ -} __packed; - -struct mobip_h { - struct ip mi; - struct mobile_h mh; -} __packed; - - -#define MOB_H_SIZ_S (sizeof(struct mobile_h) - sizeof(u_int32_t)) -#define MOB_H_SIZ_L (sizeof(struct mobile_h)) -#define MOB_H_SBIT 0x0080 - -#define GRE_TTL 30 - +#define ETHERTYPE_WCCP 0x883E #endif /* _KERNEL */ -/* - * ioctls needed to manipulate the interface - */ - #define GRESADDRS _IOW('i', 101, struct ifreq) #define GRESADDRD _IOW('i', 102, struct ifreq) #define GREGADDRS _IOWR('i', 103, struct ifreq) #define GREGADDRD _IOWR('i', 104, struct ifreq) #define GRESPROTO _IOW('i' , 105, struct ifreq) #define GREGPROTO _IOWR('i', 106, struct ifreq) -#define GREGKEY _IOWR('i', 107, struct ifreq) -#define GRESKEY _IOW('i', 108, struct ifreq) -#ifdef _KERNEL -LIST_HEAD(gre_softc_head, gre_softc); -extern struct mtx gre_mtx; -extern struct gre_softc_head gre_softc_list; +#define GREGKEY _IOWR('i', 107, struct ifreq) +#define GRESKEY _IOW('i', 108, struct ifreq) +#define GREGOPTS _IOWR('i', 109, struct ifreq) +#define GRESOPTS _IOW('i', 110, struct ifreq) -u_int16_t gre_in_cksum(u_int16_t *, u_int); -#endif /* _KERNEL */ +#define GRE_ENABLE_CSUM 0x0001 +#define GRE_ENABLE_SEQ 0x0002 +#define GRE_OPTMASK (GRE_ENABLE_CSUM|GRE_ENABLE_SEQ) -#endif +#endif /* _NET_IF_GRE_H_ */ diff --git a/freebsd/sys/net/if_iso88025subr.c b/freebsd/sys/net/if_iso88025subr.c index 660dc7dd..d26d0ebd 100644 --- a/freebsd/sys/net/if_iso88025subr.c +++ b/freebsd/sys/net/if_iso88025subr.c @@ -44,7 +44,6 @@ #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> -#include <rtems/bsd/local/opt_ipx.h> #include 
<rtems/bsd/sys/param.h> #include <sys/systm.h> @@ -56,6 +55,7 @@ #include <sys/sockio.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_arp.h> #include <net/if_dl.h> #include <net/if_llc.h> @@ -77,11 +77,6 @@ #include <netinet6/nd6.h> #endif -#ifdef IPX -#include <netipx/ipx.h> -#include <netipx/ipx_if.h> -#endif - #include <security/mac/mac_framework.h> static const u_char iso88025_broadcastaddr[ISO88025_ADDR_LEN] = @@ -172,30 +167,6 @@ iso88025_ioctl(struct ifnet *ifp, u_long command, caddr_t data) arp_ifinit(ifp, ifa); break; #endif /* INET */ -#ifdef IPX - /* - * XXX - This code is probably wrong - */ - case AF_IPX: { - struct ipx_addr *ina; - - ina = &(IA_SIPX(ifa)->sipx_addr); - - if (ipx_nullhost(*ina)) - ina->x_host = *(union ipx_host *) - IF_LLADDR(ifp); - else - bcopy((caddr_t) ina->x_host.c_host, - (caddr_t) IF_LLADDR(ifp), - ISO88025_ADDR_LEN); - - /* - * Set new address - */ - ifp->if_init(ifp->if_softc); - } - break; -#endif /* IPX */ default: ifp->if_init(ifp->if_softc); break; @@ -233,11 +204,8 @@ iso88025_ioctl(struct ifnet *ifp, u_long command, caddr_t data) * ISO88025 encapsulation */ int -iso88025_output(ifp, m, dst, ro) - struct ifnet *ifp; - struct mbuf *m; - struct sockaddr *dst; - struct route *ro; +iso88025_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, + struct route *ro) { u_int16_t snap_type = 0; int loop_copy = 0, error = 0, rif_len = 0; @@ -246,13 +214,10 @@ iso88025_output(ifp, m, dst, ro) struct iso88025_header gen_th; struct sockaddr_dl *sdl = NULL; struct rtentry *rt0 = NULL; -#if defined(INET) || defined(INET6) - struct llentry *lle; -#endif + int is_gw = 0; if (ro != NULL) - rt0 = ro->ro_rt; - + is_gw = (ro->ro_flags & RT_HAS_GW) != 0; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) @@ -291,7 +256,7 @@ iso88025_output(ifp, m, dst, ro) switch (dst->sa_family) { #ifdef INET case AF_INET: - error = arpresolve(ifp, rt0, m, dst, edst, &lle); + error = arpresolve(ifp, is_gw, m, dst, 
edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); snap_type = ETHERTYPE_IP; @@ -326,34 +291,15 @@ iso88025_output(ifp, m, dst, ro) #endif /* INET */ #ifdef INET6 case AF_INET6: - error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle); + error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) - return (error); + return (error == EWOULDBLOCK ? 0 : error); snap_type = ETHERTYPE_IPV6; break; #endif /* INET6 */ -#ifdef IPX - case AF_IPX: - { - u_int8_t *cp; - - bcopy((caddr_t)&(satoipx_addr(dst).x_host), (caddr_t)edst, - ISO88025_ADDR_LEN); - - M_PREPEND(m, 3, M_WAIT); - m = m_pullup(m, 3); - if (m == 0) - senderr(ENOBUFS); - cp = mtod(m, u_int8_t *); - *cp++ = ETHERTYPE_IPX_8022; - *cp++ = ETHERTYPE_IPX_8022; - *cp++ = LLC_UI; - } - break; -#endif /* IPX */ case AF_UNSPEC: { - struct iso88025_sockaddr_data *sd; + const struct iso88025_sockaddr_data *sd; /* * For AF_UNSPEC sockaddr.sa_data must contain all of the * mac information needed to send the packet. 
This allows @@ -363,13 +309,12 @@ iso88025_output(ifp, m, dst, ro) * should be an iso88025_sockaddr_data structure see iso88025.h */ loop_copy = -1; - sd = (struct iso88025_sockaddr_data *)dst->sa_data; + sd = (const struct iso88025_sockaddr_data *)dst->sa_data; gen_th.ac = sd->ac; gen_th.fc = sd->fc; - (void)memcpy((caddr_t)edst, (caddr_t)sd->ether_dhost, - ISO88025_ADDR_LEN); - (void)memcpy((caddr_t)gen_th.iso88025_shost, - (caddr_t)sd->ether_shost, ISO88025_ADDR_LEN); + (void)memcpy(edst, sd->ether_dhost, ISO88025_ADDR_LEN); + (void)memcpy(gen_th.iso88025_shost, sd->ether_shost, + ISO88025_ADDR_LEN); rif_len = 0; break; } @@ -384,8 +329,8 @@ iso88025_output(ifp, m, dst, ro) */ if (snap_type != 0) { struct llc *l; - M_PREPEND(m, LLC_SNAPFRAMELEN, M_DONTWAIT); - if (m == 0) + M_PREPEND(m, LLC_SNAPFRAMELEN, M_NOWAIT); + if (m == NULL) senderr(ENOBUFS); l = mtod(m, struct llc *); l->llc_control = LLC_UI; @@ -400,8 +345,8 @@ iso88025_output(ifp, m, dst, ro) * Add local net header. If no space in first mbuf, * allocate another. 
*/ - M_PREPEND(m, ISO88025_HDR_LEN + rif_len, M_DONTWAIT); - if (m == 0) + M_PREPEND(m, ISO88025_HDR_LEN + rif_len, M_NOWAIT); + if (m == NULL) senderr(ENOBUFS); th = mtod(m, struct iso88025_header *); bcopy((caddr_t)edst, (caddr_t)&gen_th.iso88025_dhost, ISO88025_ADDR_LEN); @@ -435,12 +380,12 @@ iso88025_output(ifp, m, dst, ro) IFQ_HANDOFF_ADJ(ifp, m, ISO88025_HDR_LEN + LLC_SNAPFRAMELEN, error); if (error) { printf("iso88025_output: packet dropped QFULL.\n"); - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } return (error); bad: - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if (m) m_freem(m); return (error); @@ -465,24 +410,23 @@ iso88025_input(ifp, m) */ if ((m->m_flags & M_PKTHDR) == 0) { if_printf(ifp, "discard frame w/o packet header\n"); - ifp->if_ierrors++; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); - ifp->if_ierrors++; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } m = m_pullup(m, ISO88025_HDR_LEN); if (m == NULL) { - ifp->if_ierrors++; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } th = mtod(m, struct iso88025_header *); - m->m_pkthdr.header = (void *)th; /* * Discard packet if interface is not up. @@ -511,7 +455,7 @@ iso88025_input(ifp, m) /* * Update interface statistics. 
*/ - ifp->if_ibytes += m->m_pkthdr.len; + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); getmicrotime(&ifp->if_lastchange); /* @@ -533,7 +477,7 @@ iso88025_input(ifp, m) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; - ifp->if_imcasts++; + if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } mac_hdr_len = ISO88025_HDR_LEN; @@ -546,37 +490,24 @@ iso88025_input(ifp, m) m = m_pullup(m, LLC_SNAPFRAMELEN); if (m == 0) { - ifp->if_ierrors++; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } l = mtod(m, struct llc *); switch (l->llc_dsap) { -#ifdef IPX - case ETHERTYPE_IPX_8022: /* Thanks a bunch Novell */ - if ((l->llc_control != LLC_UI) || - (l->llc_ssap != ETHERTYPE_IPX_8022)) { - ifp->if_noproto++; - goto dropanyway; - } - - th->iso88025_shost[0] &= ~(TR_RII); - m_adj(m, 3); - isr = NETISR_IPX; - break; -#endif /* IPX */ case LLC_SNAP_LSAP: { u_int16_t type; if ((l->llc_control != LLC_UI) || (l->llc_ssap != LLC_SNAP_LSAP)) { - ifp->if_noproto++; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } if (l->llc_snap.org_code[0] != 0 || l->llc_snap.org_code[1] != 0 || l->llc_snap.org_code[2] != 0) { - ifp->if_noproto++; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } @@ -586,8 +517,6 @@ iso88025_input(ifp, m) #ifdef INET case ETHERTYPE_IP: th->iso88025_shost[0] &= ~(TR_RII); - if ((m = ip_fastforward(m)) == NULL) - return; isr = NETISR_IP; break; @@ -597,12 +526,6 @@ iso88025_input(ifp, m) isr = NETISR_ARP; break; #endif /* INET */ -#ifdef IPX_SNAP /* XXX: Not supported! 
*/ - case ETHERTYPE_IPX: - th->iso88025_shost[0] &= ~(TR_RII); - isr = NETISR_IPX; - break; -#endif /* IPX_SNAP */ #ifdef INET6 case ETHERTYPE_IPV6: th->iso88025_shost[0] &= ~(TR_RII); @@ -611,7 +534,7 @@ iso88025_input(ifp, m) #endif /* INET6 */ default: printf("iso88025_input: unexpected llc_snap ether_type 0x%02x\n", type); - ifp->if_noproto++; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } break; @@ -620,7 +543,7 @@ iso88025_input(ifp, m) case LLC_ISO_LSAP: switch (l->llc_control) { case LLC_UI: - ifp->if_noproto++; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; break; case LLC_XID: @@ -636,7 +559,6 @@ iso88025_input(ifp, m) case LLC_TEST_P: { struct sockaddr sa; - struct arpcom *ac; struct iso88025_sockaddr_data *th2; int i; u_char c; @@ -669,7 +591,7 @@ iso88025_input(ifp, m) } default: printf("iso88025_input: unexpected llc control 0x%02x\n", l->llc_control); - ifp->if_noproto++; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; break; } @@ -677,7 +599,7 @@ iso88025_input(ifp, m) #endif /* ISO */ default: printf("iso88025_input: unknown dsap 0x%x\n", l->llc_dsap); - ifp->if_noproto++; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; break; } @@ -687,7 +609,7 @@ iso88025_input(ifp, m) return; dropanyway: - ifp->if_iqdrops++; + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if (m) m_freem(m); return; @@ -718,7 +640,7 @@ iso88025_resolvemulti (ifp, llsa, sa) if ((e_addr[0] & 1) != 1) { return (EADDRNOTAVAIL); } - *llsa = 0; + *llsa = NULL; return (0); #ifdef INET @@ -727,14 +649,7 @@ iso88025_resolvemulti (ifp, llsa, sa) if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { return (EADDRNOTAVAIL); } - sdl = malloc(sizeof *sdl, M_IFMADDR, - M_NOWAIT|M_ZERO); - if (sdl == NULL) - return (ENOMEM); - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_ISO88025; + sdl = link_init_sdl(ifp, *llsa, IFT_ISO88025); sdl->sdl_alen = ISO88025_ADDR_LEN; e_addr = 
LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); @@ -751,20 +666,13 @@ iso88025_resolvemulti (ifp, llsa, sa) * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; - *llsa = 0; + *llsa = NULL; return (0); } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { return (EADDRNOTAVAIL); } - sdl = malloc(sizeof *sdl, M_IFMADDR, - M_NOWAIT|M_ZERO); - if (sdl == NULL) - return (ENOMEM); - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_ISO88025; + sdl = link_init_sdl(ifp, *llsa, IFT_ISO88025); sdl->sdl_alen = ISO88025_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); @@ -783,49 +691,8 @@ iso88025_resolvemulti (ifp, llsa, sa) return (0); } -static MALLOC_DEFINE(M_ISO88025, "arpcom", "802.5 interface internals"); - -static void* -iso88025_alloc(u_char type, struct ifnet *ifp) -{ - struct arpcom *ac; - - ac = malloc(sizeof(struct arpcom), M_ISO88025, M_WAITOK | M_ZERO); - ac->ac_ifp = ifp; - - return (ac); -} - -static void -iso88025_free(void *com, u_char type) -{ - - free(com, M_ISO88025); -} - -static int -iso88025_modevent(module_t mod, int type, void *data) -{ - - switch (type) { - case MOD_LOAD: - if_register_com_alloc(IFT_ISO88025, iso88025_alloc, - iso88025_free); - break; - case MOD_UNLOAD: - if_deregister_com_alloc(IFT_ISO88025); - break; - default: - return EOPNOTSUPP; - } - - return (0); -} - static moduledata_t iso88025_mod = { - "iso88025", - iso88025_modevent, - 0 + .name = "iso88025", }; DECLARE_MODULE(iso88025, iso88025_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/freebsd/sys/net/if_lagg.c b/freebsd/sys/net/if_lagg.c index 46f3f46c..9cfb7b8b 100644 --- a/freebsd/sys/net/if_lagg.c +++ b/freebsd/sys/net/if_lagg.c @@ -5,6 +5,7 @@ /* * Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org> * Copyright (c) 2007 Andrew Thompson <thompsa@FreeBSD.org> + * Copyright (c) 2014, 2016 Marcelo Araujo <araujo@FreeBSD.org> * * Permission to use, 
copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -37,9 +38,8 @@ __FBSDID("$FreeBSD$"); #include <sys/priv.h> #include <sys/systm.h> #include <sys/proc.h> -#include <sys/hash.h> #include <rtems/bsd/sys/lock.h> -#include <sys/rwlock.h> +#include <sys/rmlock.h> #include <sys/taskqueue.h> #include <sys/eventhandler.h> @@ -48,11 +48,11 @@ __FBSDID("$FreeBSD$"); #include <net/if_clone.h> #include <net/if_arp.h> #include <net/if_dl.h> -#include <net/if_llc.h> #include <net/if_media.h> #include <net/if_types.h> #include <net/if_var.h> #include <net/bpf.h> +#include <net/vnet.h> #if defined(INET) || defined(INET6) #include <netinet/in.h> @@ -83,15 +83,26 @@ static struct { {0, NULL} }; -SLIST_HEAD(__trhead, lagg_softc) lagg_list; /* list of laggs */ -static struct mtx lagg_list_mtx; +VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */ +#define V_lagg_list VNET(lagg_list) +static VNET_DEFINE(struct mtx, lagg_list_mtx); +#define V_lagg_list_mtx VNET(lagg_list_mtx) +#define LAGG_LIST_LOCK_INIT(x) mtx_init(&V_lagg_list_mtx, \ + "if_lagg list", NULL, MTX_DEF) +#define LAGG_LIST_LOCK_DESTROY(x) mtx_destroy(&V_lagg_list_mtx) +#define LAGG_LIST_LOCK(x) mtx_lock(&V_lagg_list_mtx) +#define LAGG_LIST_UNLOCK(x) mtx_unlock(&V_lagg_list_mtx) eventhandler_tag lagg_detach_cookie = NULL; static int lagg_clone_create(struct if_clone *, int, caddr_t); static void lagg_clone_destroy(struct ifnet *); +static VNET_DEFINE(struct if_clone *, lagg_cloner); +#define V_lagg_cloner VNET(lagg_cloner) +static const char laggname[] = "lagg"; + static void lagg_lladdr(struct lagg_softc *, uint8_t *); static void lagg_capabilities(struct lagg_softc *); -static void lagg_port_lladdr(struct lagg_port *, uint8_t *); +static void lagg_port_lladdr(struct lagg_port *, uint8_t *, lagg_llqtype); static void lagg_port_setlladdr(void *, int); static int lagg_port_create(struct lagg_softc *, struct ifnet *); static 
int lagg_port_destroy(struct lagg_port *, int); @@ -100,7 +111,7 @@ static void lagg_linkstate(struct lagg_softc *); static void lagg_port_state(struct ifnet *, int); static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_port_output(struct ifnet *, struct mbuf *, - struct sockaddr *, struct route *); + const struct sockaddr *, struct route *); static void lagg_port_ifdetach(void *arg __unused, struct ifnet *); #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *); @@ -114,33 +125,28 @@ static int lagg_ether_cmdmulti(struct lagg_port *, int); static int lagg_setflag(struct lagg_port *, int, int, int (*func)(struct ifnet *, int)); static int lagg_setflags(struct lagg_port *, int status); +static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt); static int lagg_transmit(struct ifnet *, struct mbuf *); static void lagg_qflush(struct ifnet *); static int lagg_media_change(struct ifnet *); static void lagg_media_status(struct ifnet *, struct ifmediareq *); static struct lagg_port *lagg_link_active(struct lagg_softc *, struct lagg_port *); -static const void *lagg_gethdr(struct mbuf *, u_int, u_int, void *); - -IFC_SIMPLE_DECLARE(lagg, 0); /* Simple round robin */ -static int lagg_rr_attach(struct lagg_softc *); -static int lagg_rr_detach(struct lagg_softc *); +static void lagg_rr_attach(struct lagg_softc *); static int lagg_rr_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Active failover */ -static int lagg_fail_attach(struct lagg_softc *); -static int lagg_fail_detach(struct lagg_softc *); static int lagg_fail_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Loadbalancing */ -static int lagg_lb_attach(struct lagg_softc *); -static int lagg_lb_detach(struct lagg_softc *); +static void lagg_lb_attach(struct lagg_softc *); +static 
void lagg_lb_detach(struct lagg_softc *); static int lagg_lb_port_create(struct lagg_port *); static void lagg_lb_port_destroy(struct lagg_port *); static int lagg_lb_start(struct lagg_softc *, struct mbuf *); @@ -148,50 +154,134 @@ static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); +/* Broadcast */ +static int lagg_bcast_start(struct lagg_softc *, struct mbuf *); +static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *, + struct mbuf *); + /* 802.3ad LACP */ -static int lagg_lacp_attach(struct lagg_softc *); -static int lagg_lacp_detach(struct lagg_softc *); +static void lagg_lacp_attach(struct lagg_softc *); +static void lagg_lacp_detach(struct lagg_softc *); static int lagg_lacp_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static void lagg_lacp_lladdr(struct lagg_softc *); /* lagg protocol table */ -static const struct { - int ti_proto; - int (*ti_attach)(struct lagg_softc *); +static const struct lagg_proto { + lagg_proto pr_num; + void (*pr_attach)(struct lagg_softc *); + void (*pr_detach)(struct lagg_softc *); + int (*pr_start)(struct lagg_softc *, struct mbuf *); + struct mbuf * (*pr_input)(struct lagg_softc *, struct lagg_port *, + struct mbuf *); + int (*pr_addport)(struct lagg_port *); + void (*pr_delport)(struct lagg_port *); + void (*pr_linkstate)(struct lagg_port *); + void (*pr_init)(struct lagg_softc *); + void (*pr_stop)(struct lagg_softc *); + void (*pr_lladdr)(struct lagg_softc *); + void (*pr_request)(struct lagg_softc *, void *); + void (*pr_portreq)(struct lagg_port *, void *); } lagg_protos[] = { - { LAGG_PROTO_ROUNDROBIN, lagg_rr_attach }, - { LAGG_PROTO_FAILOVER, lagg_fail_attach }, - { LAGG_PROTO_LOADBALANCE, lagg_lb_attach }, - { LAGG_PROTO_ETHERCHANNEL, lagg_lb_attach }, - { LAGG_PROTO_LACP, lagg_lacp_attach }, - { 
LAGG_PROTO_NONE, NULL } + { + .pr_num = LAGG_PROTO_NONE + }, + { + .pr_num = LAGG_PROTO_ROUNDROBIN, + .pr_attach = lagg_rr_attach, + .pr_start = lagg_rr_start, + .pr_input = lagg_rr_input, + }, + { + .pr_num = LAGG_PROTO_FAILOVER, + .pr_start = lagg_fail_start, + .pr_input = lagg_fail_input, + }, + { + .pr_num = LAGG_PROTO_LOADBALANCE, + .pr_attach = lagg_lb_attach, + .pr_detach = lagg_lb_detach, + .pr_start = lagg_lb_start, + .pr_input = lagg_lb_input, + .pr_addport = lagg_lb_port_create, + .pr_delport = lagg_lb_port_destroy, + }, + { + .pr_num = LAGG_PROTO_LACP, + .pr_attach = lagg_lacp_attach, + .pr_detach = lagg_lacp_detach, + .pr_start = lagg_lacp_start, + .pr_input = lagg_lacp_input, + .pr_addport = lacp_port_create, + .pr_delport = lacp_port_destroy, + .pr_linkstate = lacp_linkstate, + .pr_init = lacp_init, + .pr_stop = lacp_stop, + .pr_lladdr = lagg_lacp_lladdr, + .pr_request = lacp_req, + .pr_portreq = lacp_portreq, + }, + { + .pr_num = LAGG_PROTO_BROADCAST, + .pr_start = lagg_bcast_start, + .pr_input = lagg_bcast_input, + }, }; SYSCTL_DECL(_net_link); -static SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0, +SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0, "Link Aggregation"); -static int lagg_failover_rx_all = 0; /* Allow input on any failover links */ -SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW, - &lagg_failover_rx_all, 0, +/* Allow input on any failover links */ +static VNET_DEFINE(int, lagg_failover_rx_all); +#define V_lagg_failover_rx_all VNET(lagg_failover_rx_all) +SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET, + &VNET_NAME(lagg_failover_rx_all), 0, "Accept input from any interface in a failover lagg"); -static int def_use_flowid = 1; /* Default value for using M_FLOWID */ -TUNABLE_INT("net.link.lagg.default_use_flowid", &def_use_flowid); -SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RW, - &def_use_flowid, 0, + +/* Default value for using flowid */ +static 
VNET_DEFINE(int, def_use_flowid) = 1; +#define V_def_use_flowid VNET(def_use_flowid) +SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN, + &VNET_NAME(def_use_flowid), 0, "Default setting for using flow id for load sharing"); +/* Default value for flowid shift */ +static VNET_DEFINE(int, def_flowid_shift) = 16; +#define V_def_flowid_shift VNET(def_flowid_shift) +SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN, + &VNET_NAME(def_flowid_shift), 0, + "Default setting for flowid shift for load sharing"); + +static void +vnet_lagg_init(const void *unused __unused) +{ + + LAGG_LIST_LOCK_INIT(); + SLIST_INIT(&V_lagg_list); + V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create, + lagg_clone_destroy, 0); +} +VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + vnet_lagg_init, NULL); + +static void +vnet_lagg_uninit(const void *unused __unused) +{ + + if_clone_detach(V_lagg_cloner); + LAGG_LIST_LOCK_DESTROY(); +} +VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, + vnet_lagg_uninit, NULL); + static int lagg_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: - mtx_init(&lagg_list_mtx, "if_lagg list", NULL, MTX_DEF); - SLIST_INIT(&lagg_list); - if_clone_attach(&lagg_cloner); lagg_input_p = lagg_input; lagg_linkstate_p = lagg_port_state; lagg_detach_cookie = EVENTHANDLER_REGISTER( @@ -201,10 +291,8 @@ lagg_modevent(module_t mod, int type, void *data) case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, lagg_detach_cookie); - if_clone_detach(&lagg_cloner); lagg_input_p = NULL; lagg_linkstate_p = NULL; - mtx_destroy(&lagg_list_mtx); break; default: return (EOPNOTSUPP); @@ -221,7 +309,117 @@ static moduledata_t lagg_mod = { DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_lagg, 1); -#if __FreeBSD_version >= 800000 +static void +lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr) +{ + + KASSERT(sc->sc_proto == 
LAGG_PROTO_NONE, ("%s: sc %p has proto", + __func__, sc)); + + if (sc->sc_ifflags & IFF_DEBUG) + if_printf(sc->sc_ifp, "using proto %u\n", pr); + + if (lagg_protos[pr].pr_attach != NULL) + lagg_protos[pr].pr_attach(sc); + sc->sc_proto = pr; +} + +static void +lagg_proto_detach(struct lagg_softc *sc) +{ + lagg_proto pr; + + LAGG_WLOCK_ASSERT(sc); + + pr = sc->sc_proto; + sc->sc_proto = LAGG_PROTO_NONE; + + if (lagg_protos[pr].pr_detach != NULL) + lagg_protos[pr].pr_detach(sc); + else + LAGG_WUNLOCK(sc); +} + +static int +lagg_proto_start(struct lagg_softc *sc, struct mbuf *m) +{ + + return (lagg_protos[sc->sc_proto].pr_start(sc, m)); +} + +static struct mbuf * +lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) +{ + + return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m)); +} + +static int +lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp) +{ + + if (lagg_protos[sc->sc_proto].pr_addport == NULL) + return (0); + else + return (lagg_protos[sc->sc_proto].pr_addport(lp)); +} + +static void +lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp) +{ + + if (lagg_protos[sc->sc_proto].pr_delport != NULL) + lagg_protos[sc->sc_proto].pr_delport(lp); +} + +static void +lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp) +{ + + if (lagg_protos[sc->sc_proto].pr_linkstate != NULL) + lagg_protos[sc->sc_proto].pr_linkstate(lp); +} + +static void +lagg_proto_init(struct lagg_softc *sc) +{ + + if (lagg_protos[sc->sc_proto].pr_init != NULL) + lagg_protos[sc->sc_proto].pr_init(sc); +} + +static void +lagg_proto_stop(struct lagg_softc *sc) +{ + + if (lagg_protos[sc->sc_proto].pr_stop != NULL) + lagg_protos[sc->sc_proto].pr_stop(sc); +} + +static void +lagg_proto_lladdr(struct lagg_softc *sc) +{ + + if (lagg_protos[sc->sc_proto].pr_lladdr != NULL) + lagg_protos[sc->sc_proto].pr_lladdr(sc); +} + +static void +lagg_proto_request(struct lagg_softc *sc, void *v) +{ + + if (lagg_protos[sc->sc_proto].pr_request != NULL) + 
lagg_protos[sc->sc_proto].pr_request(sc, v); +} + +static void +lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v) +{ + + if (lagg_protos[sc->sc_proto].pr_portreq != NULL) + lagg_protos[sc->sc_proto].pr_portreq(lp, v); +} + /* * This routine is run via an vlan * config EVENT @@ -229,18 +427,19 @@ MODULE_VERSION(if_lagg, 1); static void lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { - struct lagg_softc *sc = ifp->if_softc; - struct lagg_port *lp; + struct lagg_softc *sc = ifp->if_softc; + struct lagg_port *lp; + struct rm_priotracker tracker; - if (ifp->if_softc != arg) /* Not our event */ - return; + if (ifp->if_softc != arg) /* Not our event */ + return; - LAGG_RLOCK(sc); - if (!SLIST_EMPTY(&sc->sc_ports)) { - SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) - EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag); - } - LAGG_RUNLOCK(sc); + LAGG_RLOCK(sc, &tracker); + if (!SLIST_EMPTY(&sc->sc_ports)) { + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag); + } + LAGG_RUNLOCK(sc, &tracker); } /* @@ -250,30 +449,27 @@ lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) static void lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { - struct lagg_softc *sc = ifp->if_softc; - struct lagg_port *lp; + struct lagg_softc *sc = ifp->if_softc; + struct lagg_port *lp; + struct rm_priotracker tracker; - if (ifp->if_softc != arg) /* Not our event */ - return; + if (ifp->if_softc != arg) /* Not our event */ + return; - LAGG_RLOCK(sc); - if (!SLIST_EMPTY(&sc->sc_ports)) { - SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) - EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag); - } - LAGG_RUNLOCK(sc); + LAGG_RLOCK(sc, &tracker); + if (!SLIST_EMPTY(&sc->sc_ports)) { + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag); + } + LAGG_RUNLOCK(sc, &tracker); } -#endif static int lagg_clone_create(struct if_clone *ifc, int unit, 
caddr_t params) { struct lagg_softc *sc; struct ifnet *ifp; - int i, error = 0; static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ - struct sysctl_oid *oid; - char num[14]; /* sufficient for 32 bits */ sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); @@ -282,32 +478,15 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) return (ENOSPC); } - sysctl_ctx_init(&sc->ctx); - snprintf(num, sizeof(num), "%u", unit); - sc->use_flowid = def_use_flowid; - oid = SYSCTL_ADD_NODE(&sc->ctx, &SYSCTL_NODE_CHILDREN(_net_link, lagg), - OID_AUTO, num, CTLFLAG_RD, NULL, ""); - SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, - "use_flowid", CTLFLAG_RW, &sc->use_flowid, sc->use_flowid, - "Use flow id for load sharing"); - SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, - "count", CTLFLAG_RD, &sc->sc_count, sc->sc_count, - "Total number of ports"); + if (V_def_use_flowid) + sc->sc_opts |= LAGG_OPT_USE_FLOWID; + sc->flowid_shift = V_def_flowid_shift; + /* Hash all layers by default */ - sc->sc_flags = LAGG_F_HASHL2|LAGG_F_HASHL3|LAGG_F_HASHL4; + sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4; + + lagg_proto_attach(sc, LAGG_PROTO_DEFAULT); - sc->sc_proto = LAGG_PROTO_NONE; - for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) { - if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) { - sc->sc_proto = lagg_protos[i].ti_proto; - if ((error = lagg_protos[i].ti_attach(sc)) != 0) { - if_free_type(ifp, IFT_ETHER); - free(sc, M_DEVBUF); - return (error); - } - break; - } - } LAGG_LOCK_INIT(sc); SLIST_INIT(&sc->sc_ports); TASK_INIT(&sc->sc_lladdr_task, 0, lagg_port_setlladdr, sc); @@ -318,32 +497,31 @@ lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); - if_initname(ifp, ifc->ifc_name, unit); - ifp->if_type = IFT_ETHER; + if_initname(ifp, laggname, unit); ifp->if_softc = 
sc; ifp->if_transmit = lagg_transmit; ifp->if_qflush = lagg_qflush; ifp->if_init = lagg_init; ifp->if_ioctl = lagg_ioctl; + ifp->if_get_counter = lagg_get_counter; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; + ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; /* - * Attach as an ordinary ethernet device, childs will be attached + * Attach as an ordinary ethernet device, children will be attached * as special device IFT_IEEE8023ADLAG. */ ether_ifattach(ifp, eaddr); -#if __FreeBSD_version >= 800000 sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); -#endif /* Insert into the global list of laggs */ - mtx_lock(&lagg_list_mtx); - SLIST_INSERT_HEAD(&lagg_list, sc, sc_entries); - mtx_unlock(&lagg_list_mtx); + LAGG_LIST_LOCK(); + SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries); + LAGG_LIST_UNLOCK(); return (0); } @@ -359,47 +537,64 @@ lagg_clone_destroy(struct ifnet *ifp) lagg_stop(sc); ifp->if_flags &= ~IFF_UP; -#if __FreeBSD_version >= 800000 EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); -#endif /* Shutdown and remove lagg ports */ while ((lp = SLIST_FIRST(&sc->sc_ports)) != NULL) lagg_port_destroy(lp, 1); /* Unhook the aggregation protocol */ - if (sc->sc_detach != NULL) - (*sc->sc_detach)(sc); + lagg_proto_detach(sc); + LAGG_UNLOCK_ASSERT(sc); - LAGG_WUNLOCK(sc); - - sysctl_ctx_free(&sc->ctx); ifmedia_removeall(&sc->sc_media); ether_ifdetach(ifp); - if_free_type(ifp, IFT_ETHER); + if_free(ifp); - mtx_lock(&lagg_list_mtx); - SLIST_REMOVE(&lagg_list, sc, lagg_softc, sc_entries); - mtx_unlock(&lagg_list_mtx); + LAGG_LIST_LOCK(); + SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries); + LAGG_LIST_UNLOCK(); taskqueue_drain(taskqueue_swi, &sc->sc_lladdr_task); LAGG_LOCK_DESTROY(sc); free(sc, M_DEVBUF); } -static void +/* + * Set 
link-layer address on the lagg interface itself. + * + * Set noinline to be dtrace-friendly + */ +static __noinline void lagg_lladdr(struct lagg_softc *sc, uint8_t *lladdr) { struct ifnet *ifp = sc->sc_ifp; + struct lagg_port lp; if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) return; + LAGG_WLOCK_ASSERT(sc); + /* + * Set the link layer address on the lagg interface. + * lagg_proto_lladdr() notifies the MAC change to + * the aggregation protocol. iflladdr_event handler which + * may trigger gratuitous ARPs for INET will be handled in + * a taskqueue. + */ bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN); - /* Let the protocol know the MAC has changed */ - if (sc->sc_lladdr != NULL) - (*sc->sc_lladdr)(sc); - EVENTHANDLER_INVOKE(iflladdr_event, ifp); + lagg_proto_lladdr(sc); + + /* + * Send notification request for lagg interface + * itself. Note that new lladdr is already set. + */ + bzero(&lp, sizeof(lp)); + lp.lp_ifp = sc->sc_ifp; + lp.lp_softc = sc; + + /* Do not request lladdr change */ + lagg_port_lladdr(&lp, lladdr, LAGG_LLQTYPE_VIRT); } static void @@ -440,54 +635,63 @@ lagg_capabilities(struct lagg_softc *sc) } } -static void -lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr) +/* + * Enqueue interface lladdr notification. + * If request is already queued, it is updated. + * If setting lladdr is also desired, @do_change has to be set to 1. + * + * Set noinline to be dtrace-friendly + */ +static __noinline void +lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr, lagg_llqtype llq_type) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *ifp = lp->lp_ifp; struct lagg_llq *llq; - int pending = 0; LAGG_WLOCK_ASSERT(sc); - if (lp->lp_detaching || - memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0) + /* + * Do not enqueue requests where lladdr is the same for + * "physical" interfaces (e.g. 
ports in lagg) + */ + if (llq_type == LAGG_LLQTYPE_PHYS && + memcmp(IF_LLADDR(ifp), lladdr, ETHER_ADDR_LEN) == 0) return; /* Check to make sure its not already queued to be changed */ SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) { if (llq->llq_ifp == ifp) { - pending = 1; - break; + /* Update lladdr, it may have changed */ + bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN); + return; } } - if (!pending) { - llq = malloc(sizeof(struct lagg_llq), M_DEVBUF, M_NOWAIT); - if (llq == NULL) /* XXX what to do */ - return; - } + llq = malloc(sizeof(struct lagg_llq), M_DEVBUF, M_NOWAIT | M_ZERO); + if (llq == NULL) /* XXX what to do */ + return; - /* Update the lladdr even if pending, it may have changed */ llq->llq_ifp = ifp; + llq->llq_type = llq_type; bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN); - - if (!pending) - SLIST_INSERT_HEAD(&sc->sc_llq_head, llq, llq_entries); + /* XXX: We should insert to tail */ + SLIST_INSERT_HEAD(&sc->sc_llq_head, llq, llq_entries); taskqueue_enqueue(taskqueue_swi, &sc->sc_lladdr_task); } /* * Set the interface MAC address from a taskqueue to avoid a LOR. + * + * Set noinline to be dtrace-friendly */ -static void +static __noinline void lagg_port_setlladdr(void *arg, int pending) { struct lagg_softc *sc = (struct lagg_softc *)arg; struct lagg_llq *llq, *head; struct ifnet *ifp; - int error; /* Grab a local reference of the queue and remove it from the softc */ LAGG_WLOCK(sc); @@ -502,14 +706,19 @@ lagg_port_setlladdr(void *arg, int pending) for (llq = head; llq != NULL; llq = head) { ifp = llq->llq_ifp; - /* Set the link layer address */ CURVNET_SET(ifp->if_vnet); - error = if_setlladdr(ifp, llq->llq_lladdr, ETHER_ADDR_LEN); - CURVNET_RESTORE(); - if (error) - printf("%s: setlladdr failed on %s\n", __func__, - ifp->if_xname); + /* + * Set the link layer address on the laggport interface. + * Note that if_setlladdr() or iflladdr_event handler + * may result in arp transmission / lltable updates. 
+ */ + if (llq->llq_type == LAGG_LLQTYPE_PHYS) + if_setlladdr(ifp, llq->llq_lladdr, + ETHER_ADDR_LEN); + else + EVENTHANDLER_INVOKE(iflladdr_event, ifp); + CURVNET_RESTORE(); head = SLIST_NEXT(llq, llq_entries); free(llq, M_DEVBUF); } @@ -520,7 +729,8 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) { struct lagg_softc *sc_ptr; struct lagg_port *lp, *tlp; - int error = 0; + int error, i; + uint64_t *pval; LAGG_WLOCK_ASSERT(sc); @@ -538,37 +748,9 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) } /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ - if (ifp->if_type != IFT_ETHER) + if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) return (EPROTONOSUPPORT); -#ifdef INET6 - /* - * The member interface should not have inet6 address because - * two interfaces with a valid link-local scope zone must not be - * merged in any form. This restriction is needed to - * prevent violation of link-local scope zone. Attempts to - * add a member interface which has inet6 addresses triggers - * removal of all inet6 addresses on the member interface. 
- */ - SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { - if (in6ifa_llaonifp(lp->lp_ifp)) { - in6_ifdetach(lp->lp_ifp); - if_printf(sc->sc_ifp, - "IPv6 addresses on %s have been removed " - "before adding it as a member to prevent " - "IPv6 address scope violation.\n", - lp->lp_ifp->if_xname); - } - } - if (in6ifa_llaonifp(ifp)) { - in6_ifdetach(ifp); - if_printf(sc->sc_ifp, - "IPv6 addresses on %s have been removed " - "before adding it as a member to prevent " - "IPv6 address scope violation.\n", - ifp->if_xname); - } -#endif /* Allow the first Ethernet member to define the MTU */ if (SLIST_EMPTY(&sc->sc_ports)) sc->sc_ifp->if_mtu = ifp->if_mtu; @@ -583,10 +765,10 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) return (ENOMEM); /* Check if port is a stacked lagg */ - mtx_lock(&lagg_list_mtx); - SLIST_FOREACH(sc_ptr, &lagg_list, sc_entries) { + LAGG_LIST_LOCK(); + SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) { if (ifp == sc_ptr->sc_ifp) { - mtx_unlock(&lagg_list_mtx); + LAGG_LIST_UNLOCK(); free(lp, M_DEVBUF); return (EINVAL); /* XXX disable stacking for the moment, its untested */ @@ -594,14 +776,14 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) lp->lp_flags |= LAGG_PORT_STACK; if (lagg_port_checkstacking(sc_ptr) >= LAGG_MAX_STACKING) { - mtx_unlock(&lagg_list_mtx); + LAGG_LIST_UNLOCK(); free(lp, M_DEVBUF); return (E2BIG); } #endif } } - mtx_unlock(&lagg_list_mtx); + LAGG_LIST_UNLOCK(); /* Change the interface type */ lp->lp_iftype = ifp->if_type; @@ -620,10 +802,15 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) if (SLIST_EMPTY(&sc->sc_ports)) { sc->sc_primary = lp; + /* First port in lagg. Update/notify lagg lladdress */ lagg_lladdr(sc, IF_LLADDR(ifp)); } else { - /* Update link layer address for this port */ - lagg_port_lladdr(lp, IF_LLADDR(sc->sc_ifp)); + + /* + * Update link layer address for this port and + * send notifications to other subsystems. 
+ */ + lagg_port_lladdr(lp, IF_LLADDR(sc->sc_ifp), LAGG_LLQTYPE_PHYS); } /* @@ -649,19 +836,21 @@ lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) lagg_capabilities(sc); lagg_linkstate(sc); + /* Read port counters */ + pval = lp->port_counters.val; + for (i = 0; i < IFCOUNTERS; i++, pval++) + *pval = ifp->if_get_counter(ifp, i); /* Add multicast addresses and interface flags to this port */ lagg_ether_cmdmulti(lp, 1); lagg_setflags(lp, 1); - if (sc->sc_port_create != NULL) - error = (*sc->sc_port_create)(lp); - if (error) { - /* remove the port again, without calling sc_port_destroy */ + if ((error = lagg_proto_addport(sc, lp)) != 0) { + /* Remove the port, without calling pr_delport. */ lagg_port_destroy(lp, 0); return (error); } - return (error); + return (0); } #ifdef LAGG_PORT_STACKING @@ -686,17 +875,19 @@ lagg_port_checkstacking(struct lagg_softc *sc) #endif static int -lagg_port_destroy(struct lagg_port *lp, int runpd) +lagg_port_destroy(struct lagg_port *lp, int rundelport) { struct lagg_softc *sc = lp->lp_softc; - struct lagg_port *lp_ptr; + struct lagg_port *lp_ptr, *lp0; struct lagg_llq *llq; struct ifnet *ifp = lp->lp_ifp; + uint64_t *pval, vdiff; + int i; LAGG_WLOCK_ASSERT(sc); - if (runpd && sc->sc_port_destroy != NULL) - (*sc->sc_port_destroy)(lp); + if (rundelport) + lagg_proto_delport(sc, lp); /* * Remove multicast addresses and interface flags from this port and @@ -705,7 +896,7 @@ lagg_port_destroy(struct lagg_port *lp, int runpd) if (!lp->lp_detaching) { lagg_ether_cmdmulti(lp, 0); lagg_setflags(lp, 0); - lagg_port_lladdr(lp, lp->lp_lladdr); + lagg_port_lladdr(lp, lp->lp_lladdr, LAGG_LLQTYPE_PHYS); } /* Restore interface */ @@ -714,6 +905,13 @@ lagg_port_destroy(struct lagg_port *lp, int runpd) ifp->if_output = lp->lp_output; ifp->if_lagg = NULL; + /* Update detached port counters */ + pval = lp->port_counters.val; + for (i = 0; i < IFCOUNTERS; i++, pval++) { + vdiff = ifp->if_get_counter(ifp, i) - *pval; + sc->detached_counters.val[i] 
+= vdiff; + } + /* Finally, remove the port from the lagg */ SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries); sc->sc_count--; @@ -722,18 +920,24 @@ lagg_port_destroy(struct lagg_port *lp, int runpd) if (lp == sc->sc_primary) { uint8_t lladdr[ETHER_ADDR_LEN]; - if ((lp_ptr = SLIST_FIRST(&sc->sc_ports)) == NULL) { + if ((lp0 = SLIST_FIRST(&sc->sc_ports)) == NULL) { bzero(&lladdr, ETHER_ADDR_LEN); } else { - bcopy(lp_ptr->lp_lladdr, + bcopy(lp0->lp_lladdr, lladdr, ETHER_ADDR_LEN); } lagg_lladdr(sc, lladdr); - sc->sc_primary = lp_ptr; - /* Update link layer address for each port */ + /* Mark lp0 as new primary */ + sc->sc_primary = lp0; + + /* + * Enqueue lladdr update/notification for each port + * (new primary needs update as well, to switch from + * old lladdr to its 'real' one). + */ SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries) - lagg_port_lladdr(lp_ptr, lladdr); + lagg_port_lladdr(lp_ptr, lladdr, LAGG_LLQTYPE_PHYS); } /* Remove any pending lladdr changes from the queue */ @@ -767,6 +971,7 @@ lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) struct lagg_softc *sc; struct lagg_port *lp = NULL; int error = 0; + struct rm_priotracker tracker; /* Should be checked by the caller */ if (ifp->if_type != IFT_IEEE8023ADLAG || @@ -781,15 +986,15 @@ lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) break; } - LAGG_RLOCK(sc); + LAGG_RLOCK(sc, &tracker); if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; - LAGG_RUNLOCK(sc); + LAGG_RUNLOCK(sc, &tracker); break; } lagg_port2req(lp, rp); - LAGG_RUNLOCK(sc); + LAGG_RUNLOCK(sc, &tracker); break; case SIOCSIFCAP: @@ -826,11 +1031,66 @@ fallback: } /* + * Requests counter @cnt data. + * + * Counter value is calculated the following way: + * 1) for each port, sum difference between current and "initial" measurements. + * 2) add lagg logical interface counters. + * 3) add data from detached_counters array. 
+ * + * We also do the following things on ports attach/detach: + * 1) On port attach we store all counters it has into port_counter array. + * 2) On port detach we add the different between "initial" and + * current counters data to detached_counters array. + */ +static uint64_t +lagg_get_counter(struct ifnet *ifp, ift_counter cnt) +{ + struct lagg_softc *sc; + struct lagg_port *lp; + struct ifnet *lpifp; + struct rm_priotracker tracker; + uint64_t newval, oldval, vsum; + + /* Revise this when we've got non-generic counters. */ + KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); + + sc = (struct lagg_softc *)ifp->if_softc; + LAGG_RLOCK(sc, &tracker); + + vsum = 0; + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + /* Saved attached value */ + oldval = lp->port_counters.val[cnt]; + /* current value */ + lpifp = lp->lp_ifp; + newval = lpifp->if_get_counter(lpifp, cnt); + /* Calculate diff and save new */ + vsum += newval - oldval; + } + + /* + * Add counter data which might be added by upper + * layer protocols operating on logical interface. + */ + vsum += if_get_counter_default(ifp, cnt); + + /* + * Add counter data from detached ports counters + */ + vsum += sc->detached_counters.val[cnt]; + + LAGG_RUNLOCK(sc, &tracker); + + return (vsum); +} + +/* * For direct output to child ports. 
*/ static int lagg_port_output(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *dst, struct route *ro) + const struct sockaddr *dst, struct route *ro) { struct lagg_port *lp = ifp->if_lagg; @@ -874,8 +1134,7 @@ lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp) strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname)); rp->rp_prio = lp->lp_prio; rp->rp_flags = lp->lp_flags; - if (sc->sc_portreq != NULL) - (*sc->sc_portreq)(lp, (caddr_t)&rp->rp_psc); + lagg_proto_portreq(sc, lp, &rp->rp_psc); /* Add protocol specific flags */ switch (sc->sc_proto) { @@ -888,7 +1147,7 @@ lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp) case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: - case LAGG_PROTO_ETHERCHANNEL: + case LAGG_PROTO_BROADCAST: if (LAGG_PORTACTIVE(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; @@ -910,8 +1169,8 @@ static void lagg_init(void *xsc) { struct lagg_softc *sc = (struct lagg_softc *)xsc; - struct lagg_port *lp; struct ifnet *ifp = sc->sc_ifp; + struct lagg_port *lp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; @@ -919,12 +1178,16 @@ lagg_init(void *xsc) LAGG_WLOCK(sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; - /* Update the port lladdrs */ + + /* + * Update the port lladdrs if needed. + * This might be if_setlladdr() notification + * that lladdr has been changed. 
+ */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) - lagg_port_lladdr(lp, IF_LLADDR(ifp)); + lagg_port_lladdr(lp, IF_LLADDR(ifp), LAGG_LLQTYPE_PHYS); - if (sc->sc_init != NULL) - (*sc->sc_init)(sc); + lagg_proto_init(sc); LAGG_WUNLOCK(sc); } @@ -941,8 +1204,7 @@ lagg_stop(struct lagg_softc *sc) ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - if (sc->sc_stop != NULL) - (*sc->sc_stop)(sc); + lagg_proto_stop(sc); } static int @@ -950,6 +1212,7 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_reqall *ra = (struct lagg_reqall *)data; + struct lagg_reqopts *ro = (struct lagg_reqopts *)data; struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; struct lagg_reqflags *rf = (struct lagg_reqflags *)data; struct ifreq *ifr = (struct ifreq *)data; @@ -958,25 +1221,24 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) struct thread *td = curthread; char *buf, *outbuf; int count, buflen, len, error = 0; + struct rm_priotracker tracker; bzero(&rpbuf, sizeof(rpbuf)); switch (cmd) { case SIOCGLAGG: - LAGG_RLOCK(sc); + LAGG_RLOCK(sc, &tracker); count = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) count++; buflen = count * sizeof(struct lagg_reqport); - LAGG_RUNLOCK(sc); + LAGG_RUNLOCK(sc, &tracker); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); - LAGG_RLOCK(sc); + LAGG_RLOCK(sc, &tracker); ra->ra_proto = sc->sc_proto; - if (sc->sc_req != NULL) - (*sc->sc_req)(sc, (caddr_t)&ra->ra_psc); - + lagg_proto_request(sc, &ra->ra_psc); count = 0; buf = outbuf; len = min(ra->ra_size, buflen); @@ -990,7 +1252,7 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) buf += sizeof(rpbuf); len -= sizeof(rpbuf); } - LAGG_RUNLOCK(sc); + LAGG_RUNLOCK(sc, &tracker); ra->ra_ports = count; ra->ra_size = count * sizeof(rpbuf); error = copyout(outbuf, ra->ra_port, ra->ra_size); @@ -1004,49 +1266,150 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = EPROTONOSUPPORT; break; } + 
LAGG_WLOCK(sc); - if (sc->sc_proto != LAGG_PROTO_NONE) { - /* Reset protocol first in case detach unlocks */ - sc->sc_proto = LAGG_PROTO_NONE; - error = sc->sc_detach(sc); - sc->sc_detach = NULL; - sc->sc_start = NULL; - sc->sc_input = NULL; - sc->sc_port_create = NULL; - sc->sc_port_destroy = NULL; - sc->sc_linkstate = NULL; - sc->sc_init = NULL; - sc->sc_stop = NULL; - sc->sc_lladdr = NULL; - sc->sc_req = NULL; - sc->sc_portreq = NULL; - } else if (sc->sc_input != NULL) { - /* Still detaching */ - error = EBUSY; + lagg_proto_detach(sc); + LAGG_UNLOCK_ASSERT(sc); + lagg_proto_attach(sc, ra->ra_proto); + break; + case SIOCGLAGGOPTS: + ro->ro_opts = sc->sc_opts; + if (sc->sc_proto == LAGG_PROTO_LACP) { + struct lacp_softc *lsc; + + lsc = (struct lacp_softc *)sc->sc_psc; + if (lsc->lsc_debug.lsc_tx_test != 0) + ro->ro_opts |= LAGG_OPT_LACP_TXTEST; + if (lsc->lsc_debug.lsc_rx_test != 0) + ro->ro_opts |= LAGG_OPT_LACP_RXTEST; + if (lsc->lsc_strict_mode != 0) + ro->ro_opts |= LAGG_OPT_LACP_STRICT; + if (lsc->lsc_fast_timeout != 0) + ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT; + + ro->ro_active = sc->sc_active; + } else { + ro->ro_active = 0; + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) + ro->ro_active += LAGG_PORTACTIVE(lp); } - if (error != 0) { - LAGG_WUNLOCK(sc); + ro->ro_bkt = sc->sc_bkt; + ro->ro_flapping = sc->sc_flapping; + ro->ro_flowid_shift = sc->flowid_shift; + break; + case SIOCSLAGGOPTS: + if (sc->sc_proto == LAGG_PROTO_ROUNDROBIN) { + if (ro->ro_bkt == 0) + sc->sc_bkt = 1; // Minimum 1 packet per iface. + else + sc->sc_bkt = ro->ro_bkt; + } + error = priv_check(td, PRIV_NET_LAGG); + if (error) + break; + if (ro->ro_opts == 0) + break; + /* + * Set options. LACP options are stored in sc->sc_psc, + * not in sc_opts. 
+ */ + int valid, lacp; + + switch (ro->ro_opts) { + case LAGG_OPT_USE_FLOWID: + case -LAGG_OPT_USE_FLOWID: + case LAGG_OPT_FLOWIDSHIFT: + valid = 1; + lacp = 0; + break; + case LAGG_OPT_LACP_TXTEST: + case -LAGG_OPT_LACP_TXTEST: + case LAGG_OPT_LACP_RXTEST: + case -LAGG_OPT_LACP_RXTEST: + case LAGG_OPT_LACP_STRICT: + case -LAGG_OPT_LACP_STRICT: + case LAGG_OPT_LACP_TIMEOUT: + case -LAGG_OPT_LACP_TIMEOUT: + valid = lacp = 1; + break; + default: + valid = lacp = 0; break; } - for (int i = 0; i < (sizeof(lagg_protos) / - sizeof(lagg_protos[0])); i++) { - if (lagg_protos[i].ti_proto == ra->ra_proto) { - if (sc->sc_ifflags & IFF_DEBUG) - printf("%s: using proto %u\n", - sc->sc_ifname, - lagg_protos[i].ti_proto); - sc->sc_proto = lagg_protos[i].ti_proto; - if (sc->sc_proto != LAGG_PROTO_NONE) - error = lagg_protos[i].ti_attach(sc); - LAGG_WUNLOCK(sc); - return (error); + + LAGG_WLOCK(sc); + + if (valid == 0 || + (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) { + /* Invalid combination of options specified. */ + error = EINVAL; + LAGG_WUNLOCK(sc); + break; /* Return from SIOCSLAGGOPTS. */ + } + /* + * Store new options into sc->sc_opts except for + * FLOWIDSHIFT and LACP options. 
+ */ + if (lacp == 0) { + if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT) + sc->flowid_shift = ro->ro_flowid_shift; + else if (ro->ro_opts > 0) + sc->sc_opts |= ro->ro_opts; + else + sc->sc_opts &= ~ro->ro_opts; + } else { + struct lacp_softc *lsc; + struct lacp_port *lp; + + lsc = (struct lacp_softc *)sc->sc_psc; + + switch (ro->ro_opts) { + case LAGG_OPT_LACP_TXTEST: + lsc->lsc_debug.lsc_tx_test = 1; + break; + case -LAGG_OPT_LACP_TXTEST: + lsc->lsc_debug.lsc_tx_test = 0; + break; + case LAGG_OPT_LACP_RXTEST: + lsc->lsc_debug.lsc_rx_test = 1; + break; + case -LAGG_OPT_LACP_RXTEST: + lsc->lsc_debug.lsc_rx_test = 0; + break; + case LAGG_OPT_LACP_STRICT: + lsc->lsc_strict_mode = 1; + break; + case -LAGG_OPT_LACP_STRICT: + lsc->lsc_strict_mode = 0; + break; + case LAGG_OPT_LACP_TIMEOUT: + LACP_LOCK(lsc); + LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) + lp->lp_state |= LACP_STATE_TIMEOUT; + LACP_UNLOCK(lsc); + lsc->lsc_fast_timeout = 1; + break; + case -LAGG_OPT_LACP_TIMEOUT: + LACP_LOCK(lsc); + LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) + lp->lp_state &= ~LACP_STATE_TIMEOUT; + LACP_UNLOCK(lsc); + lsc->lsc_fast_timeout = 0; + break; } } LAGG_WUNLOCK(sc); - error = EPROTONOSUPPORT; break; case SIOCGLAGGFLAGS: - rf->rf_flags = sc->sc_flags; + rf->rf_flags = 0; + LAGG_RLOCK(sc, &tracker); + if (sc->sc_flags & MBUF_HASHFLAG_L2) + rf->rf_flags |= LAGG_F_HASHL2; + if (sc->sc_flags & MBUF_HASHFLAG_L3) + rf->rf_flags |= LAGG_F_HASHL3; + if (sc->sc_flags & MBUF_HASHFLAG_L4) + rf->rf_flags |= LAGG_F_HASHL4; + LAGG_RUNLOCK(sc, &tracker); break; case SIOCSLAGGHASH: error = priv_check(td, PRIV_NET_LAGG); @@ -1057,8 +1420,13 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) break; } LAGG_WLOCK(sc); - sc->sc_flags &= ~LAGG_F_HASHMASK; - sc->sc_flags |= rf->rf_flags & LAGG_F_HASHMASK; + sc->sc_flags = 0; + if (rf->rf_flags & LAGG_F_HASHL2) + sc->sc_flags |= MBUF_HASHFLAG_L2; + if (rf->rf_flags & LAGG_F_HASHL3) + sc->sc_flags |= MBUF_HASHFLAG_L3; + if (rf->rf_flags & LAGG_F_HASHL4) + 
sc->sc_flags |= MBUF_HASHFLAG_L4; LAGG_WUNLOCK(sc); break; case SIOCGLAGGPORT: @@ -1068,16 +1436,16 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) break; } - LAGG_RLOCK(sc); + LAGG_RLOCK(sc, &tracker); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; - LAGG_RUNLOCK(sc); + LAGG_RUNLOCK(sc, &tracker); break; } lagg_port2req(lp, rp); - LAGG_RUNLOCK(sc); + LAGG_RUNLOCK(sc, &tracker); break; case SIOCSLAGGPORT: error = priv_check(td, PRIV_NET_LAGG); @@ -1088,6 +1456,26 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = EINVAL; break; } +#ifdef INET6 + /* + * A laggport interface should not have inet6 address + * because two interfaces with a valid link-local + * scope zone must not be merged in any form. This + * restriction is needed to prevent violation of + * link-local scope zone. Attempts to add a laggport + * interface which has inet6 addresses triggers + * removal of all inet6 addresses on the member + * interface. 
+ */ + if (in6ifa_llaonifp(tpif)) { + in6_ifdetach(tpif); + if_printf(sc->sc_ifp, + "IPv6 addresses on %s have been removed " + "before adding it as a member to prevent " + "IPv6 address scope violation.\n", + tpif->if_xname); + } +#endif LAGG_WLOCK(sc); error = lagg_port_create(sc, tpif); LAGG_WUNLOCK(sc); @@ -1186,39 +1574,39 @@ lagg_ether_cmdmulti(struct lagg_port *lp, int set) struct ifnet *ifp = lp->lp_ifp; struct ifnet *scifp = sc->sc_ifp; struct lagg_mc *mc; - struct ifmultiaddr *ifma, *rifma = NULL; - struct sockaddr_dl sdl; + struct ifmultiaddr *ifma; int error; LAGG_WLOCK_ASSERT(sc); - bzero((char *)&sdl, sizeof(sdl)); - sdl.sdl_len = sizeof(sdl); - sdl.sdl_family = AF_LINK; - sdl.sdl_type = IFT_ETHER; - sdl.sdl_alen = ETHER_ADDR_LEN; - sdl.sdl_index = ifp->if_index; - if (set) { + IF_ADDR_WLOCK(scifp); TAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; - bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), - LLADDR(&sdl), ETHER_ADDR_LEN); - - error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma); - if (error) - return (error); mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT); - if (mc == NULL) + if (mc == NULL) { + IF_ADDR_WUNLOCK(scifp); return (ENOMEM); - mc->mc_ifma = rifma; + } + bcopy(ifma->ifma_addr, &mc->mc_addr, + ifma->ifma_addr->sa_len); + mc->mc_addr.sdl_index = ifp->if_index; + mc->mc_ifma = NULL; SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries); } + IF_ADDR_WUNLOCK(scifp); + SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) { + error = if_addmulti(ifp, + (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma); + if (error) + return (error); + } } else { while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) { SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries); - if_delmulti_ifma(mc->mc_ifma); + if (mc->mc_ifma && !lp->lp_detaching) + if_delmulti_ifma(mc->mc_ifma); free(mc, M_DEVBUF); } } @@ -1228,7 +1616,7 @@ lagg_ether_cmdmulti(struct lagg_port *lp, int set) /* Handle a ref 
counted flag that should be set on the lagg port as well */ static int lagg_setflag(struct lagg_port *lp, int flag, int status, - int (*func)(struct ifnet *, int)) + int (*func)(struct ifnet *, int)) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; @@ -1283,30 +1671,27 @@ lagg_transmit(struct ifnet *ifp, struct mbuf *m) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error, len, mcast; + struct rm_priotracker tracker; len = m->m_pkthdr.len; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; - LAGG_RLOCK(sc); + LAGG_RLOCK(sc, &tracker); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { - LAGG_RUNLOCK(sc); + LAGG_RUNLOCK(sc, &tracker); m_freem(m); - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENXIO); } ETHER_BPF_MTAP(ifp, m); - error = (*sc->sc_start)(sc, m); - LAGG_RUNLOCK(sc); + error = lagg_proto_start(sc, m); + LAGG_RUNLOCK(sc, &tracker); - if (error == 0) { - ifp->if_opackets++; - ifp->if_omcasts += mcast; - ifp->if_obytes += len; - } else - ifp->if_oerrors++; + if (error != 0) + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } @@ -1325,31 +1710,33 @@ lagg_input(struct ifnet *ifp, struct mbuf *m) struct lagg_port *lp = ifp->if_lagg; struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; + struct rm_priotracker tracker; - LAGG_RLOCK(sc); + LAGG_RLOCK(sc, &tracker); if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (lp->lp_flags & LAGG_PORT_DISABLED) || sc->sc_proto == LAGG_PROTO_NONE) { - LAGG_RUNLOCK(sc); + LAGG_RUNLOCK(sc, &tracker); m_freem(m); return (NULL); } ETHER_BPF_MTAP(scifp, m); - m = (*sc->sc_input)(sc, lp, m); + if (lp->lp_detaching != 0) { + m_freem(m); + m = NULL; + } else + m = lagg_proto_input(sc, lp, m); if (m != NULL) { - scifp->if_ipackets++; - scifp->if_ibytes += m->m_pkthdr.len; - if (scifp->if_flags & IFF_MONITOR) { m_freem(m); m = NULL; } } - LAGG_RUNLOCK(sc); + LAGG_RUNLOCK(sc, 
&tracker); return (m); } @@ -1370,16 +1757,17 @@ lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; + struct rm_priotracker tracker; imr->ifm_status = IFM_AVALID; imr->ifm_active = IFM_ETHER | IFM_AUTO; - LAGG_RLOCK(sc); + LAGG_RLOCK(sc, &tracker); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp)) imr->ifm_status |= IFM_ACTIVE; } - LAGG_RUNLOCK(sc); + LAGG_RUNLOCK(sc, &tracker); } static void @@ -1391,7 +1779,7 @@ lagg_linkstate(struct lagg_softc *sc) /* Our link is considered up if at least one of our ports is active */ SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { - if (lp->lp_link_state == LINK_STATE_UP) { + if (lp->lp_ifp->if_link_state == LINK_STATE_UP) { new_link = LINK_STATE_UP; break; } @@ -1406,7 +1794,7 @@ lagg_linkstate(struct lagg_softc *sc) break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: - case LAGG_PROTO_ETHERCHANNEL: + case LAGG_PROTO_BROADCAST: speed = 0; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) speed += lp->lp_ifp->if_baudrate; @@ -1431,8 +1819,7 @@ lagg_port_state(struct ifnet *ifp, int state) LAGG_WLOCK(sc); lagg_linkstate(sc); - if (sc->sc_linkstate != NULL) - (*sc->sc_linkstate)(lp); + lagg_proto_linkstate(sc, lp); LAGG_WUNLOCK(sc); } @@ -1487,120 +1874,6 @@ found: return (rval); } -static const void * -lagg_gethdr(struct mbuf *m, u_int off, u_int len, void *buf) -{ - if (m->m_pkthdr.len < (off + len)) { - return (NULL); - } else if (m->m_len < (off + len)) { - m_copydata(m, off, len, buf); - return (buf); - } - return (mtod(m, char *) + off); -} - -uint32_t -lagg_hashmbuf(struct lagg_softc *sc, struct mbuf *m, uint32_t key) -{ - uint16_t etype; - uint32_t p = key; - int off; - struct ether_header *eh; - const struct ether_vlan_header *vlan; -#ifdef INET - const struct ip *ip; - const uint32_t *ports; - int iphlen; -#endif -#ifdef INET6 - const struct ip6_hdr *ip6; - uint32_t flow; -#endif - union { -#ifdef 
INET - struct ip ip; -#endif -#ifdef INET6 - struct ip6_hdr ip6; -#endif - struct ether_vlan_header vlan; - uint32_t port; - } buf; - - - off = sizeof(*eh); - if (m->m_len < off) - goto out; - eh = mtod(m, struct ether_header *); - etype = ntohs(eh->ether_type); - if (sc->sc_flags & LAGG_F_HASHL2) { - p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, p); - p = hash32_buf(&eh->ether_dhost, ETHER_ADDR_LEN, p); - } - - /* Special handling for encapsulating VLAN frames */ - if ((m->m_flags & M_VLANTAG) && (sc->sc_flags & LAGG_F_HASHL2)) { - p = hash32_buf(&m->m_pkthdr.ether_vtag, - sizeof(m->m_pkthdr.ether_vtag), p); - } else if (etype == ETHERTYPE_VLAN) { - vlan = lagg_gethdr(m, off, sizeof(*vlan), &buf); - if (vlan == NULL) - goto out; - - if (sc->sc_flags & LAGG_F_HASHL2) - p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p); - etype = ntohs(vlan->evl_proto); - off += sizeof(*vlan) - sizeof(*eh); - } - - switch (etype) { -#ifdef INET - case ETHERTYPE_IP: - ip = lagg_gethdr(m, off, sizeof(*ip), &buf); - if (ip == NULL) - goto out; - - if (sc->sc_flags & LAGG_F_HASHL3) { - p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p); - p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p); - } - if (!(sc->sc_flags & LAGG_F_HASHL4)) - break; - switch (ip->ip_p) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_SCTP: - iphlen = ip->ip_hl << 2; - if (iphlen < sizeof(*ip)) - break; - off += iphlen; - ports = lagg_gethdr(m, off, sizeof(*ports), &buf); - if (ports == NULL) - break; - p = hash32_buf(ports, sizeof(*ports), p); - break; - } - break; -#endif -#ifdef INET6 - case ETHERTYPE_IPV6: - if (!(sc->sc_flags & LAGG_F_HASHL3)) - break; - ip6 = lagg_gethdr(m, off, sizeof(*ip6), &buf); - if (ip6 == NULL) - goto out; - - p = hash32_buf(&ip6->ip6_src, sizeof(struct in6_addr), p); - p = hash32_buf(&ip6->ip6_dst, sizeof(struct in6_addr), p); - flow = ip6->ip6_flow & IPV6_FLOWLABEL_MASK; - p = hash32_buf(&flow, sizeof(flow), p); /* IPv6 flow label */ - break; -#endif - } 
-out: - return (p); -} - int lagg_enqueue(struct ifnet *ifp, struct mbuf *m) { @@ -1611,24 +1884,12 @@ lagg_enqueue(struct ifnet *ifp, struct mbuf *m) /* * Simple round robin aggregation */ - -static int +static void lagg_rr_attach(struct lagg_softc *sc) { - sc->sc_detach = lagg_rr_detach; - sc->sc_start = lagg_rr_start; - sc->sc_input = lagg_rr_input; - sc->sc_port_create = NULL; sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX; sc->sc_seq = 0; - - return (0); -} - -static int -lagg_rr_detach(struct lagg_softc *sc) -{ - return (0); + sc->sc_bkt_count = sc->sc_bkt; } static int @@ -1637,9 +1898,21 @@ lagg_rr_start(struct lagg_softc *sc, struct mbuf *m) struct lagg_port *lp; uint32_t p; - p = atomic_fetchadd_32(&sc->sc_seq, 1); + if (sc->sc_bkt_count == 0 && sc->sc_bkt > 0) + sc->sc_bkt_count = sc->sc_bkt; + + if (sc->sc_bkt > 0) { + atomic_subtract_int(&sc->sc_bkt_count, 1); + if (atomic_cmpset_int(&sc->sc_bkt_count, 0, sc->sc_bkt)) + p = atomic_fetchadd_32(&sc->sc_seq, 1); + else + p = sc->sc_seq; + } else + p = atomic_fetchadd_32(&sc->sc_seq, 1); + p %= sc->sc_count; lp = SLIST_FIRST(&sc->sc_ports); + while (p--) lp = SLIST_NEXT(lp, lp_entries); @@ -1668,27 +1941,69 @@ lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) } /* - * Active failover + * Broadcast mode */ - static int -lagg_fail_attach(struct lagg_softc *sc) +lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m) { - sc->sc_detach = lagg_fail_detach; - sc->sc_start = lagg_fail_start; - sc->sc_input = lagg_fail_input; - sc->sc_port_create = NULL; - sc->sc_port_destroy = NULL; + int active_ports = 0; + int errors = 0; + int ret; + struct lagg_port *lp, *last = NULL; + struct mbuf *m0; + + SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { + if (!LAGG_PORTACTIVE(lp)) + continue; + + active_ports++; + + if (last != NULL) { + m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT); + if (m0 == NULL) { + ret = ENOBUFS; + errors++; + break; + } + + ret = lagg_enqueue(last->lp_ifp, m0); + if (ret != 0) + 
errors++; + } + last = lp; + } + if (last == NULL) { + m_freem(m); + return (ENOENT); + } + if ((last = lagg_link_active(sc, last)) == NULL) { + m_freem(m); + return (ENETDOWN); + } + + ret = lagg_enqueue(last->lp_ifp, m); + if (ret != 0) + errors++; + + if (errors == 0) + return (ret); return (0); } -static int -lagg_fail_detach(struct lagg_softc *sc) +static struct mbuf* +lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { - return (0); + struct ifnet *ifp = sc->sc_ifp; + + /* Just pass in the packet to our lagg device */ + m->m_pkthdr.rcvif = ifp; + return (m); } +/* + * Active failover + */ static int lagg_fail_start(struct lagg_softc *sc, struct mbuf *m) { @@ -1710,7 +2025,7 @@ lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) struct ifnet *ifp = sc->sc_ifp; struct lagg_port *tmp_tp; - if (lp == sc->sc_primary || lagg_failover_rx_all) { + if (lp == sc->sc_primary || V_lagg_failover_rx_all) { m->m_pkthdr.rcvif = ifp; return (m); } @@ -1718,7 +2033,7 @@ lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) if (!LAGG_PORTACTIVE(sc->sc_primary)) { tmp_tp = lagg_link_active(sc, sc->sc_primary); /* - * If tmp_tp is null, we've recieved a packet when all + * If tmp_tp is null, we've received a packet when all * our links are down. Weird, but process it anyways. 
*/ if ((tmp_tp == NULL || tmp_tp == lp)) { @@ -1734,40 +2049,32 @@ lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) /* * Loadbalancing */ - -static int +static void lagg_lb_attach(struct lagg_softc *sc) { struct lagg_port *lp; struct lagg_lb *lb; - if ((lb = (struct lagg_lb *)malloc(sizeof(struct lagg_lb), - M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL) - return (ENOMEM); + lb = malloc(sizeof(struct lagg_lb), M_DEVBUF, M_WAITOK | M_ZERO); - sc->sc_detach = lagg_lb_detach; - sc->sc_start = lagg_lb_start; - sc->sc_input = lagg_lb_input; - sc->sc_port_create = lagg_lb_port_create; - sc->sc_port_destroy = lagg_lb_port_destroy; sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX; - lb->lb_key = arc4random(); - sc->sc_psc = (caddr_t)lb; + lb->lb_key = m_ether_tcpip_hash_init(); + sc->sc_psc = lb; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_lb_port_create(lp); - - return (0); } -static int +static void lagg_lb_detach(struct lagg_softc *sc) { - struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; + struct lagg_lb *lb; + + lb = (struct lagg_lb *)sc->sc_psc; + LAGG_WUNLOCK(sc); if (lb != NULL) free(lb, M_DEVBUF); - return (0); } static int @@ -1785,7 +2092,7 @@ lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) return (EINVAL); if (sc->sc_ifflags & IFF_DEBUG) printf("%s: port %s at index %d\n", - sc->sc_ifname, lp_next->lp_ifname, i); + sc->sc_ifname, lp_next->lp_ifp->if_xname, i); lb->lb_ports[i++] = lp_next; } @@ -1813,10 +2120,11 @@ lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) struct lagg_port *lp = NULL; uint32_t p = 0; - if (sc->use_flowid && (m->m_flags & M_FLOWID)) - p = m->m_pkthdr.flowid; + if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && + M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) + p = m->m_pkthdr.flowid >> sc->flowid_shift; else - p = lagg_hashmbuf(sc, m, lb->lb_key); + p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key); p %= sc->sc_count; lp = lb->lb_ports[p]; @@ -1847,50 +2155,30 @@ lagg_lb_input(struct lagg_softc *sc, struct lagg_port 
*lp, struct mbuf *m) /* * 802.3ad LACP */ - -static int +static void lagg_lacp_attach(struct lagg_softc *sc) { struct lagg_port *lp; - int error; - - sc->sc_detach = lagg_lacp_detach; - sc->sc_port_create = lacp_port_create; - sc->sc_port_destroy = lacp_port_destroy; - sc->sc_linkstate = lacp_linkstate; - sc->sc_start = lagg_lacp_start; - sc->sc_input = lagg_lacp_input; - sc->sc_init = lacp_init; - sc->sc_stop = lacp_stop; - sc->sc_lladdr = lagg_lacp_lladdr; - sc->sc_req = lacp_req; - sc->sc_portreq = lacp_portreq; - - error = lacp_attach(sc); - if (error) - return (error); + lacp_attach(sc); SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); - - return (error); } -static int +static void lagg_lacp_detach(struct lagg_softc *sc) { struct lagg_port *lp; - int error; + void *psc; SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); - /* unlocking is safe here */ + psc = sc->sc_psc; + sc->sc_psc = NULL; LAGG_WUNLOCK(sc); - error = lacp_detach(sc); - LAGG_WLOCK(sc); - return (error); + lacp_detach(psc); } static void @@ -1951,3 +2239,4 @@ lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) m->m_pkthdr.rcvif = ifp; return (m); } + diff --git a/freebsd/sys/net/if_lagg.h b/freebsd/sys/net/if_lagg.h index 27ab46f2..334995e5 100644 --- a/freebsd/sys/net/if_lagg.h +++ b/freebsd/sys/net/if_lagg.h @@ -21,8 +21,6 @@ #ifndef _NET_LAGG_H #define _NET_LAGG_H -#include <sys/sysctl.h> - /* * Global definitions */ @@ -49,26 +47,28 @@ "\05DISTRIBUTING\06DISABLED" /* Supported lagg PROTOs */ -#define LAGG_PROTO_NONE 0 /* no lagg protocol defined */ -#define LAGG_PROTO_ROUNDROBIN 1 /* simple round robin */ -#define LAGG_PROTO_FAILOVER 2 /* active failover */ -#define LAGG_PROTO_LOADBALANCE 3 /* loadbalance */ -#define LAGG_PROTO_LACP 4 /* 802.3ad lacp */ -#define LAGG_PROTO_ETHERCHANNEL 5 /* Cisco FEC */ -#define LAGG_PROTO_MAX 6 +typedef enum { + LAGG_PROTO_NONE = 0, /* no lagg protocol defined */ + LAGG_PROTO_ROUNDROBIN, /* simple 
round robin */ + LAGG_PROTO_FAILOVER, /* active failover */ + LAGG_PROTO_LOADBALANCE, /* loadbalance */ + LAGG_PROTO_LACP, /* 802.3ad lacp */ + LAGG_PROTO_BROADCAST, /* broadcast */ + LAGG_PROTO_MAX, +} lagg_proto; struct lagg_protos { const char *lpr_name; - int lpr_proto; + lagg_proto lpr_proto; }; #define LAGG_PROTO_DEFAULT LAGG_PROTO_FAILOVER #define LAGG_PROTOS { \ - { "failover", LAGG_PROTO_FAILOVER }, \ - { "fec", LAGG_PROTO_ETHERCHANNEL }, \ + { "failover", LAGG_PROTO_FAILOVER }, \ { "lacp", LAGG_PROTO_LACP }, \ { "loadbalance", LAGG_PROTO_LOADBALANCE }, \ - { "roundrobin", LAGG_PROTO_ROUNDROBIN }, \ + { "roundrobin", LAGG_PROTO_ROUNDROBIN }, \ + { "broadcast", LAGG_PROTO_BROADCAST }, \ { "none", LAGG_PROTO_NONE }, \ { "default", LAGG_PROTO_DEFAULT } \ } @@ -136,16 +136,40 @@ struct lagg_reqflags { #define SIOCGLAGGFLAGS _IOWR('i', 145, struct lagg_reqflags) #define SIOCSLAGGHASH _IOW('i', 146, struct lagg_reqflags) +struct lagg_reqopts { + char ro_ifname[IFNAMSIZ]; /* name of the lagg */ + + int ro_opts; /* Option bitmap */ +#define LAGG_OPT_NONE 0x00 +#define LAGG_OPT_USE_FLOWID 0x01 /* enable use of flowid */ +/* Pseudo flags which are used in ro_opts but not stored into sc_opts. 
*/ +#define LAGG_OPT_FLOWIDSHIFT 0x02 /* set flowid shift */ +#define LAGG_OPT_FLOWIDSHIFT_MASK 0x1f /* flowid is uint32_t */ +#define LAGG_OPT_LACP_STRICT 0x10 /* LACP strict mode */ +#define LAGG_OPT_LACP_TXTEST 0x20 /* LACP debug: txtest */ +#define LAGG_OPT_LACP_RXTEST 0x40 /* LACP debug: rxtest */ +#define LAGG_OPT_LACP_TIMEOUT 0x80 /* LACP timeout */ + u_int ro_count; /* number of ports */ + u_int ro_active; /* active port count */ + u_int ro_flapping; /* number of flapping */ + int ro_flowid_shift; /* shift the flowid */ + uint32_t ro_bkt; /* packet bucket for roundrobin */ +}; + +#define SIOCGLAGGOPTS _IOWR('i', 152, struct lagg_reqopts) +#define SIOCSLAGGOPTS _IOW('i', 153, struct lagg_reqopts) + +#define LAGG_OPT_BITS "\020\001USE_FLOWID\005LACP_STRICT" \ + "\006LACP_TXTEST\007LACP_RXTEST" + #ifdef _KERNEL + /* * Internal kernel part */ -#define lp_ifname lp_ifp->if_xname /* interface name */ -#define lp_link_state lp_ifp->if_link_state /* link state */ - #define LAGG_PORTACTIVE(_tp) ( \ - ((_tp)->lp_link_state == LINK_STATE_UP) && \ + ((_tp)->lp_ifp->if_link_state == LINK_STATE_UP) && \ ((_tp)->lp_ifp->if_flags & IFF_UP) \ ) @@ -173,25 +197,39 @@ struct lagg_lb { }; struct lagg_mc { + struct sockaddr_dl mc_addr; struct ifmultiaddr *mc_ifma; SLIST_ENTRY(lagg_mc) mc_entries; }; +typedef enum { + LAGG_LLQTYPE_PHYS = 0, /* Task related to physical (underlying) port */ + LAGG_LLQTYPE_VIRT, /* Task related to lagg interface itself */ +} lagg_llqtype; + /* List of interfaces to have the MAC address modified */ struct lagg_llq { struct ifnet *llq_ifp; uint8_t llq_lladdr[ETHER_ADDR_LEN]; + lagg_llqtype llq_type; SLIST_ENTRY(lagg_llq) llq_entries; }; +struct lagg_counters { + uint64_t val[IFCOUNTERS]; +}; + struct lagg_softc { struct ifnet *sc_ifp; /* virtual interface */ - struct rwlock sc_mtx; + struct rmlock sc_mtx; int sc_proto; /* lagg protocol */ u_int sc_count; /* number of ports */ + u_int sc_active; /* active port count */ + u_int sc_flapping; /* number 
of flapping + * events */ struct lagg_port *sc_primary; /* primary port */ struct ifmedia sc_media; /* media config */ - caddr_t sc_psc; /* protocol data */ + void *sc_psc; /* protocol data */ uint32_t sc_seq; /* sequence counter */ uint32_t sc_flags; @@ -201,26 +239,14 @@ struct lagg_softc { struct task sc_lladdr_task; SLIST_HEAD(__llqhd, lagg_llq) sc_llq_head; /* interfaces to program the lladdr on */ - - /* lagg protocol callbacks */ - int (*sc_detach)(struct lagg_softc *); - int (*sc_start)(struct lagg_softc *, struct mbuf *); - struct mbuf *(*sc_input)(struct lagg_softc *, struct lagg_port *, - struct mbuf *); - int (*sc_port_create)(struct lagg_port *); - void (*sc_port_destroy)(struct lagg_port *); - void (*sc_linkstate)(struct lagg_port *); - void (*sc_init)(struct lagg_softc *); - void (*sc_stop)(struct lagg_softc *); - void (*sc_lladdr)(struct lagg_softc *); - void (*sc_req)(struct lagg_softc *, caddr_t); - void (*sc_portreq)(struct lagg_port *, caddr_t); -#if __FreeBSD_version >= 800000 eventhandler_tag vlan_attach; eventhandler_tag vlan_detach; -#endif - struct sysctl_ctx_list ctx; /* sysctl variables */ - int use_flowid; /* use M_FLOWID */ + struct callout sc_callout; + u_int sc_opts; + int flowid_shift; /* shift the flowid */ + uint32_t sc_bkt; /* packates bucket for roundrobin */ + uint32_t sc_bkt_count; /* packates bucket count for roundrobin */ + struct lagg_counters detached_counters; /* detached ports sum */ }; struct lagg_port { @@ -233,33 +259,36 @@ struct lagg_port { uint32_t lp_flags; /* port flags */ int lp_ifflags; /* saved ifp flags */ void *lh_cookie; /* if state hook */ - caddr_t lp_psc; /* protocol data */ + void *lp_psc; /* protocol data */ int lp_detaching; /* ifnet is detaching */ SLIST_HEAD(__mclhd, lagg_mc) lp_mc_head; /* multicast addresses */ /* Redirected callbacks */ int (*lp_ioctl)(struct ifnet *, u_long, caddr_t); - int (*lp_output)(struct ifnet *, struct mbuf *, struct sockaddr *, - struct route *); + int (*lp_output)(struct 
ifnet *, struct mbuf *, + const struct sockaddr *, struct route *); + struct lagg_counters port_counters; /* ifp counters copy */ SLIST_ENTRY(lagg_port) lp_entries; }; -#define LAGG_LOCK_INIT(_sc) rw_init(&(_sc)->sc_mtx, "if_lagg rwlock") -#define LAGG_LOCK_DESTROY(_sc) rw_destroy(&(_sc)->sc_mtx) -#define LAGG_RLOCK(_sc) rw_rlock(&(_sc)->sc_mtx) -#define LAGG_WLOCK(_sc) rw_wlock(&(_sc)->sc_mtx) -#define LAGG_RUNLOCK(_sc) rw_runlock(&(_sc)->sc_mtx) -#define LAGG_WUNLOCK(_sc) rw_wunlock(&(_sc)->sc_mtx) -#define LAGG_RLOCK_ASSERT(_sc) rw_assert(&(_sc)->sc_mtx, RA_RLOCKED) -#define LAGG_WLOCK_ASSERT(_sc) rw_assert(&(_sc)->sc_mtx, RA_WLOCKED) +#define LAGG_LOCK_INIT(_sc) rm_init(&(_sc)->sc_mtx, "if_lagg rmlock") +#define LAGG_LOCK_DESTROY(_sc) rm_destroy(&(_sc)->sc_mtx) +#define LAGG_RLOCK(_sc, _p) rm_rlock(&(_sc)->sc_mtx, (_p)) +#define LAGG_WLOCK(_sc) rm_wlock(&(_sc)->sc_mtx) +#define LAGG_RUNLOCK(_sc, _p) rm_runlock(&(_sc)->sc_mtx, (_p)) +#define LAGG_WUNLOCK(_sc) rm_wunlock(&(_sc)->sc_mtx) +#define LAGG_RLOCK_ASSERT(_sc) rm_assert(&(_sc)->sc_mtx, RA_RLOCKED) +#define LAGG_WLOCK_ASSERT(_sc) rm_assert(&(_sc)->sc_mtx, RA_WLOCKED) +#define LAGG_UNLOCK_ASSERT(_sc) rm_assert(&(_sc)->sc_mtx, RA_UNLOCKED) extern struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); extern void (*lagg_linkstate_p)(struct ifnet *, int ); int lagg_enqueue(struct ifnet *, struct mbuf *); -uint32_t lagg_hashmbuf(struct lagg_softc *, struct mbuf *, uint32_t); + +SYSCTL_DECL(_net_link_lagg); #endif /* _KERNEL */ diff --git a/freebsd/sys/net/if_llatbl.c b/freebsd/sys/net/if_llatbl.c index 55b816a7..20c0b9d2 100644 --- a/freebsd/sys/net/if_llatbl.c +++ b/freebsd/sys/net/if_llatbl.c @@ -64,17 +64,43 @@ __FBSDID("$FreeBSD$"); MALLOC_DEFINE(M_LLTABLE, "lltable", "link level address tables"); -static VNET_DEFINE(SLIST_HEAD(, lltable), lltables); +static VNET_DEFINE(SLIST_HEAD(, lltable), lltables) = + SLIST_HEAD_INITIALIZER(lltables); #define V_lltables VNET(lltables) -extern void 
arprequest(struct ifnet *, struct in_addr *, struct in_addr *, - u_char *); - -static void vnet_lltable_init(void); - struct rwlock lltable_rwlock; RW_SYSINIT(lltable_rwlock, &lltable_rwlock, "lltable_rwlock"); +static void lltable_unlink(struct lltable *llt); +static void llentries_unlink(struct lltable *llt, struct llentries *head); + +static void htable_unlink_entry(struct llentry *lle); +static void htable_link_entry(struct lltable *llt, struct llentry *lle); +static int htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, + void *farg); + +/* + * Dump lle state for a specific address family. + */ +static int +lltable_dump_af(struct lltable *llt, struct sysctl_req *wr) +{ + int error; + + LLTABLE_LOCK_ASSERT(); + + if (llt->llt_ifp->if_flags & IFF_LOOPBACK) + return (0); + error = 0; + + IF_AFDATA_RLOCK(llt->llt_ifp); + error = lltable_foreach_lle(llt, + (llt_foreach_cb_t *)llt->llt_dump_entry, wr); + IF_AFDATA_RUNLOCK(llt->llt_ifp); + + return (error); +} + /* * Dump arp state for a specific address family. */ @@ -87,7 +113,7 @@ lltable_sysctl_dumparp(int af, struct sysctl_req *wr) LLTABLE_RLOCK(); SLIST_FOREACH(llt, &V_lltables, llt_link) { if (llt->llt_af == af) { - error = llt->llt_dump(llt, wr); + error = lltable_dump_af(llt, wr); if (error != 0) goto done; } @@ -98,25 +124,144 @@ done: } /* - * Deletes an address from the address table. - * This function is called by the timer functions - * such as arptimer() and nd6_llinfo_timer(), and - * the caller does the locking. + * Common function helpers for chained hash table. + */ + +/* + * Runs specified callback for each entry in @llt. + * Caller does the locking. 
+ * + */ +static int +htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg) +{ + struct llentry *lle, *next; + int i, error; + + error = 0; + + for (i = 0; i < llt->llt_hsize; i++) { + LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { + error = f(llt, lle, farg); + if (error != 0) + break; + } + } + + return (error); +} + +static void +htable_link_entry(struct lltable *llt, struct llentry *lle) +{ + struct llentries *lleh; + uint32_t hashidx; + + if ((lle->la_flags & LLE_LINKED) != 0) + return; + + IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); + + hashidx = llt->llt_hash(lle, llt->llt_hsize); + lleh = &llt->lle_head[hashidx]; + + lle->lle_tbl = llt; + lle->lle_head = lleh; + lle->la_flags |= LLE_LINKED; + LIST_INSERT_HEAD(lleh, lle, lle_next); +} + +static void +htable_unlink_entry(struct llentry *lle) +{ + + if ((lle->la_flags & LLE_LINKED) != 0) { + IF_AFDATA_WLOCK_ASSERT(lle->lle_tbl->llt_ifp); + LIST_REMOVE(lle, lle_next); + lle->la_flags &= ~(LLE_VALID | LLE_LINKED); +#if 0 + lle->lle_tbl = NULL; + lle->lle_head = NULL; +#endif + } +} + +struct prefix_match_data { + const struct sockaddr *addr; + const struct sockaddr *mask; + struct llentries dchain; + u_int flags; +}; + +static int +htable_prefix_free_cb(struct lltable *llt, struct llentry *lle, void *farg) +{ + struct prefix_match_data *pmd; + + pmd = (struct prefix_match_data *)farg; + + if (llt->llt_match_prefix(pmd->addr, pmd->mask, pmd->flags, lle)) { + LLE_WLOCK(lle); + LIST_INSERT_HEAD(&pmd->dchain, lle, lle_chain); + } + + return (0); +} + +static void +htable_prefix_free(struct lltable *llt, const struct sockaddr *addr, + const struct sockaddr *mask, u_int flags) +{ + struct llentry *lle, *next; + struct prefix_match_data pmd; + + bzero(&pmd, sizeof(pmd)); + pmd.addr = addr; + pmd.mask = mask; + pmd.flags = flags; + LIST_INIT(&pmd.dchain); + + IF_AFDATA_WLOCK(llt->llt_ifp); + /* Push matching lles to chain */ + lltable_foreach_lle(llt, htable_prefix_free_cb, &pmd); + + 
llentries_unlink(llt, &pmd.dchain); + IF_AFDATA_WUNLOCK(llt->llt_ifp); + + LIST_FOREACH_SAFE(lle, &pmd.dchain, lle_chain, next) + lltable_free_entry(llt, lle); +} + +static void +htable_free_tbl(struct lltable *llt) +{ + + free(llt->lle_head, M_LLTABLE); + free(llt, M_LLTABLE); +} + +static void +llentries_unlink(struct lltable *llt, struct llentries *head) +{ + struct llentry *lle, *next; + + LIST_FOREACH_SAFE(lle, head, lle_chain, next) + llt->llt_unlink_entry(lle); +} + +/* + * Helper function used to drop all mbufs in hold queue. * * Returns the number of held packets, if any, that were dropped. */ size_t -llentry_free(struct llentry *lle) +lltable_drop_entry_queue(struct llentry *lle) { size_t pkts_dropped; struct mbuf *next; - IF_AFDATA_WLOCK_ASSERT(lle->lle_tbl->llt_ifp); LLE_WLOCK_ASSERT(lle); - LIST_REMOVE(lle, lle_next); - lle->la_flags &= ~(LLE_VALID | LLE_LINKED); - pkts_dropped = 0; while ((lle->la_numheld > 0) && (lle->la_hold != NULL)) { next = lle->la_hold->m_nextpkt; @@ -130,6 +275,162 @@ llentry_free(struct llentry *lle) ("%s: la_numheld %d > 0, pkts_droped %zd", __func__, lle->la_numheld, pkts_dropped)); + return (pkts_dropped); +} + +void +lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle, + const char *linkhdr, size_t linkhdrsize, int lladdr_off) +{ + + memcpy(lle->r_linkdata, linkhdr, linkhdrsize); + lle->r_hdrlen = linkhdrsize; + lle->ll_addr = &lle->r_linkdata[lladdr_off]; + lle->la_flags |= LLE_VALID; + lle->r_flags |= RLLE_VALID; +} + +/* + * Tries to update @lle link-level address. + * Since update requires AFDATA WLOCK, function + * drops @lle lock, acquires AFDATA lock and then acquires + * @lle lock to maintain lock order. + * + * Returns 1 on success. 
+ */ +int +lltable_try_set_entry_addr(struct ifnet *ifp, struct llentry *lle, + const char *linkhdr, size_t linkhdrsize, int lladdr_off) +{ + + /* Perform real LLE update */ + /* use afdata WLOCK to update fields */ + LLE_WLOCK_ASSERT(lle); + LLE_ADDREF(lle); + LLE_WUNLOCK(lle); + IF_AFDATA_WLOCK(ifp); + LLE_WLOCK(lle); + + /* + * Since we droppped LLE lock, other thread might have deleted + * this lle. Check and return + */ + if ((lle->la_flags & LLE_DELETED) != 0) { + IF_AFDATA_WUNLOCK(ifp); + LLE_FREE_LOCKED(lle); + return (0); + } + + /* Update data */ + lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); + + IF_AFDATA_WUNLOCK(ifp); + + LLE_REMREF(lle); + + return (1); +} + + /* + * Helper function used to pre-compute full/partial link-layer + * header data suitable for feeding into if_output(). + */ +int +lltable_calc_llheader(struct ifnet *ifp, int family, char *lladdr, + char *buf, size_t *bufsize, int *lladdr_off) +{ + struct if_encap_req ereq; + int error; + + bzero(buf, *bufsize); + bzero(&ereq, sizeof(ereq)); + ereq.buf = buf; + ereq.bufsize = *bufsize; + ereq.rtype = IFENCAP_LL; + ereq.family = family; + ereq.lladdr = lladdr; + ereq.lladdr_len = ifp->if_addrlen; + error = ifp->if_requestencap(ifp, &ereq); + if (error == 0) { + *bufsize = ereq.bufsize; + *lladdr_off = ereq.lladdr_off; + } + + return (error); +} + +/* + * Update link-layer header for given @lle after + * interface lladdr was changed. 
+ */ +static int +llentry_update_ifaddr(struct lltable *llt, struct llentry *lle, void *farg) +{ + struct ifnet *ifp; + u_char linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + u_char *lladdr; + int lladdr_off; + + ifp = (struct ifnet *)farg; + + lladdr = lle->ll_addr; + + LLE_WLOCK(lle); + if ((lle->la_flags & LLE_VALID) == 0) { + LLE_WUNLOCK(lle); + return (0); + } + + if ((lle->la_flags & LLE_IFADDR) != 0) + lladdr = IF_LLADDR(ifp); + + linkhdrsize = sizeof(linkhdr); + lltable_calc_llheader(ifp, llt->llt_af, lladdr, linkhdr, &linkhdrsize, + &lladdr_off); + memcpy(lle->r_linkdata, linkhdr, linkhdrsize); + LLE_WUNLOCK(lle); + + return (0); +} + +/* + * Update all calculated headers for given @llt + */ +void +lltable_update_ifaddr(struct lltable *llt) +{ + + if (llt->llt_ifp->if_flags & IFF_LOOPBACK) + return; + + IF_AFDATA_WLOCK(llt->llt_ifp); + lltable_foreach_lle(llt, llentry_update_ifaddr, llt->llt_ifp); + IF_AFDATA_WUNLOCK(llt->llt_ifp); +} + +/* + * + * Performs generic cleanup routines and frees lle. + * + * Called for non-linked entries, with callouts and + * other AF-specific cleanups performed. + * + * @lle must be passed WLOCK'ed + * + * Returns the number of held packets, if any, that were dropped. 
+ */ +size_t +llentry_free(struct llentry *lle) +{ + size_t pkts_dropped; + + LLE_WLOCK_ASSERT(lle); + + KASSERT((lle->la_flags & LLE_LINKED) == 0, ("freeing linked lle")); + + pkts_dropped = lltable_drop_entry_queue(lle); + LLE_FREE_LOCKED(lle); return (pkts_dropped); @@ -144,22 +445,35 @@ struct llentry * llentry_alloc(struct ifnet *ifp, struct lltable *lt, struct sockaddr_storage *dst) { - struct llentry *la; + struct llentry *la, *la_tmp; IF_AFDATA_RLOCK(ifp); la = lla_lookup(lt, LLE_EXCLUSIVE, (struct sockaddr *)dst); IF_AFDATA_RUNLOCK(ifp); - if ((la == NULL) && - (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { - IF_AFDATA_WLOCK(ifp); - la = lla_lookup(lt, (LLE_CREATE | LLE_EXCLUSIVE), - (struct sockaddr *)dst); - IF_AFDATA_WUNLOCK(ifp); - } if (la != NULL) { LLE_ADDREF(la); LLE_WUNLOCK(la); + return (la); + } + + if ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { + la = lltable_alloc_entry(lt, 0, (struct sockaddr *)dst); + if (la == NULL) + return (NULL); + IF_AFDATA_WLOCK(ifp); + LLE_WLOCK(la); + /* Prefer any existing LLE over newly-created one */ + la_tmp = lla_lookup(lt, LLE_EXCLUSIVE, (struct sockaddr *)dst); + if (la_tmp == NULL) + lltable_link_entry(lt, la); + IF_AFDATA_WUNLOCK(ifp); + if (la_tmp != NULL) { + lltable_free_entry(lt, la); + la = la_tmp; + } + LLE_ADDREF(la); + LLE_WUNLOCK(la); } return (la); @@ -168,30 +482,47 @@ llentry_alloc(struct ifnet *ifp, struct lltable *lt, /* * Free all entries from given table and free itself. */ + +static int +lltable_free_cb(struct lltable *llt, struct llentry *lle, void *farg) +{ + struct llentries *dchain; + + dchain = (struct llentries *)farg; + + LLE_WLOCK(lle); + LIST_INSERT_HEAD(dchain, lle, lle_chain); + + return (0); +} + +/* + * Free all entries from given table and free itself. 
+ */ void lltable_free(struct lltable *llt) { struct llentry *lle, *next; - int i; + struct llentries dchain; KASSERT(llt != NULL, ("%s: llt is NULL", __func__)); - LLTABLE_WLOCK(); - SLIST_REMOVE(&V_lltables, llt, lltable, llt_link); - LLTABLE_WUNLOCK(); + lltable_unlink(llt); + LIST_INIT(&dchain); IF_AFDATA_WLOCK(llt->llt_ifp); - for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { - LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { - LLE_WLOCK(lle); - if (callout_stop(&lle->la_timer)) - LLE_REMREF(lle); - llentry_free(lle); - } - } + /* Push all lles to @dchain */ + lltable_foreach_lle(llt, lltable_free_cb, &dchain); + llentries_unlink(llt, &dchain); IF_AFDATA_WUNLOCK(llt->llt_ifp); - free(llt, M_LLTABLE); + LIST_FOREACH_SAFE(lle, &dchain, lle_chain, next) { + if (callout_stop(&lle->lle_timer) > 0) + LLE_REMREF(lle); + llentry_free(lle); + } + + llt->llt_free_tbl(llt); } #if 0 @@ -207,7 +538,7 @@ lltable_drain(int af) if (llt->llt_af != af) continue; - for (i=0; i < LLTBL_HASHTBL_SIZE; i++) { + for (i=0; i < llt->llt_hsize; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { LLE_WLOCK(lle); if (lle->la_hold) { @@ -222,8 +553,42 @@ lltable_drain(int af) } #endif +/* + * Deletes an address from given lltable. + * Used for userland interaction to remove + * individual entries. Skips entries added by OS. 
+ */ +int +lltable_delete_addr(struct lltable *llt, u_int flags, + const struct sockaddr *l3addr) +{ + struct llentry *lle; + struct ifnet *ifp; + + ifp = llt->llt_ifp; + IF_AFDATA_WLOCK(ifp); + lle = lla_lookup(llt, LLE_EXCLUSIVE, l3addr); + + if (lle == NULL) { + IF_AFDATA_WUNLOCK(ifp); + return (ENOENT); + } + if ((lle->la_flags & LLE_IFADDR) != 0 && (flags & LLE_IFADDR) == 0) { + IF_AFDATA_WUNLOCK(ifp); + LLE_WUNLOCK(lle); + return (EPERM); + } + + lltable_unlink_entry(llt, lle); + IF_AFDATA_WUNLOCK(ifp); + + llt->llt_delete_entry(llt, lle); + + return (0); +} + void -lltable_prefix_free(int af, struct sockaddr *prefix, struct sockaddr *mask, +lltable_prefix_free(int af, struct sockaddr *addr, struct sockaddr *mask, u_int flags) { struct lltable *llt; @@ -233,38 +598,122 @@ lltable_prefix_free(int af, struct sockaddr *prefix, struct sockaddr *mask, if (llt->llt_af != af) continue; - llt->llt_prefix_free(llt, prefix, mask, flags); + llt->llt_prefix_free(llt, addr, mask, flags); } LLTABLE_RUNLOCK(); } - - -/* - * Create a new lltable. - */ struct lltable * -lltable_init(struct ifnet *ifp, int af) +lltable_allocate_htbl(uint32_t hsize) { struct lltable *llt; - register int i; + int i; - llt = malloc(sizeof(struct lltable), M_LLTABLE, M_WAITOK); + llt = malloc(sizeof(struct lltable), M_LLTABLE, M_WAITOK | M_ZERO); + llt->llt_hsize = hsize; + llt->lle_head = malloc(sizeof(struct llentries) * hsize, + M_LLTABLE, M_WAITOK | M_ZERO); - llt->llt_af = af; - llt->llt_ifp = ifp; - for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) + for (i = 0; i < llt->llt_hsize; i++) LIST_INIT(&llt->lle_head[i]); + /* Set some default callbacks */ + llt->llt_link_entry = htable_link_entry; + llt->llt_unlink_entry = htable_unlink_entry; + llt->llt_prefix_free = htable_prefix_free; + llt->llt_foreach_entry = htable_foreach_lle; + llt->llt_free_tbl = htable_free_tbl; + + return (llt); +} + +/* + * Links lltable to global llt list. 
+ */ +void +lltable_link(struct lltable *llt) +{ + LLTABLE_WLOCK(); SLIST_INSERT_HEAD(&V_lltables, llt, llt_link); LLTABLE_WUNLOCK(); +} - return (llt); +static void +lltable_unlink(struct lltable *llt) +{ + + LLTABLE_WLOCK(); + SLIST_REMOVE(&V_lltables, llt, lltable, llt_link); + LLTABLE_WUNLOCK(); + +} + +/* + * External methods used by lltable consumers + */ + +int +lltable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg) +{ + + return (llt->llt_foreach_entry(llt, f, farg)); +} + +struct llentry * +lltable_alloc_entry(struct lltable *llt, u_int flags, + const struct sockaddr *l3addr) +{ + + return (llt->llt_alloc_entry(llt, flags, l3addr)); +} + +void +lltable_free_entry(struct lltable *llt, struct llentry *lle) +{ + + llt->llt_free_entry(llt, lle); +} + +void +lltable_link_entry(struct lltable *llt, struct llentry *lle) +{ + + llt->llt_link_entry(llt, lle); +} + +void +lltable_unlink_entry(struct lltable *llt, struct llentry *lle) +{ + + llt->llt_unlink_entry(lle); +} + +void +lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) +{ + struct lltable *llt; + + llt = lle->lle_tbl; + llt->llt_fill_sa_entry(lle, sa); +} + +struct ifnet * +lltable_get_ifp(const struct lltable *llt) +{ + + return (llt->llt_ifp); +} + +int +lltable_get_af(const struct lltable *llt) +{ + + return (llt->llt_af); } /* - * Called in route_output when adding/deleting a route to an interface. + * Called in route_output when rtm_flags contains RTF_LLDATA. 
*/ int lla_rt_output(struct rt_msghdr *rtm, struct rt_addrinfo *info) @@ -274,14 +723,16 @@ lla_rt_output(struct rt_msghdr *rtm, struct rt_addrinfo *info) struct sockaddr *dst = (struct sockaddr *)info->rti_info[RTAX_DST]; struct ifnet *ifp; struct lltable *llt; - struct llentry *lle; - u_int laflags = 0, flags = 0; - int error = 0; + struct llentry *lle, *lle_tmp; + uint8_t linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + int lladdr_off; + u_int laflags = 0; + int error; + + KASSERT(dl != NULL && dl->sdl_family == AF_LINK, + ("%s: invalid dl\n", __func__)); - if (dl == NULL || dl->sdl_family != AF_LINK) { - log(LOG_INFO, "%s: invalid dl\n", __func__); - return EINVAL; - } ifp = ifnet_byindex(dl->sdl_index); if (ifp == NULL) { log(LOG_INFO, "%s: invalid ifp (sdl_index %d)\n", @@ -289,44 +740,6 @@ lla_rt_output(struct rt_msghdr *rtm, struct rt_addrinfo *info) return EINVAL; } - switch (rtm->rtm_type) { - case RTM_ADD: - if (rtm->rtm_flags & RTF_ANNOUNCE) { - flags |= LLE_PUB; -#ifdef INET - if (dst->sa_family == AF_INET && - ((struct sockaddr_inarp *)dst)->sin_other != 0) { - struct rtentry *rt; - ((struct sockaddr_inarp *)dst)->sin_other = 0; - rt = rtalloc1(dst, 0, 0); - if (rt == NULL || !(rt->rt_flags & RTF_HOST)) { - log(LOG_INFO, "%s: RTM_ADD publish " - "(proxy only) is invalid\n", - __func__); - if (rt) - RTFREE_LOCKED(rt); - return EINVAL; - } - RTFREE_LOCKED(rt); - - flags |= LLE_PROXY; - } -#endif - } - flags |= LLE_CREATE; - break; - - case RTM_DELETE: - flags |= LLE_DELETE; - break; - - case RTM_CHANGE: - break; - - default: - return EINVAL; /* XXX not implemented yet */ - } - /* XXX linked list may be too expensive */ LLTABLE_RLOCK(); SLIST_FOREACH(llt, &V_lltables, llt_link) { @@ -337,73 +750,82 @@ lla_rt_output(struct rt_msghdr *rtm, struct rt_addrinfo *info) LLTABLE_RUNLOCK(); KASSERT(llt != NULL, ("Yep, ugly hacks are bad\n")); - if (flags & LLE_CREATE) - flags |= LLE_EXCLUSIVE; - - IF_AFDATA_LOCK(ifp); - lle = lla_lookup(llt, flags, dst); - 
IF_AFDATA_UNLOCK(ifp); - if (LLE_IS_VALID(lle)) { - if (flags & LLE_CREATE) { - /* - * If we delay the delete, then a subsequent - * "arp add" should look up this entry, reset the - * LLE_DELETED flag, and reset the expiration timer - */ - bcopy(LLADDR(dl), &lle->ll_addr, ifp->if_addrlen); - lle->la_flags |= (flags & (LLE_PUB | LLE_PROXY)); - lle->la_flags |= LLE_VALID; - lle->la_flags &= ~LLE_DELETED; -#ifdef INET6 - /* - * ND6 - */ - if (dst->sa_family == AF_INET6) - lle->ln_state = ND6_LLINFO_REACHABLE; -#endif - /* - * NB: arp and ndp always set (RTF_STATIC | RTF_HOST) - */ - - if (rtm->rtm_rmx.rmx_expire == 0) { - lle->la_flags |= LLE_STATIC; - lle->la_expire = 0; - } else - lle->la_expire = rtm->rtm_rmx.rmx_expire; - laflags = lle->la_flags; - LLE_WUNLOCK(lle); -#ifdef INET - /* gratuitous ARP */ - if ((laflags & LLE_PUB) && dst->sa_family == AF_INET) { - arprequest(ifp, - &((struct sockaddr_in *)dst)->sin_addr, - &((struct sockaddr_in *)dst)->sin_addr, - ((laflags & LLE_PROXY) ? - (u_char *)IF_LLADDR(ifp) : - (u_char *)LLADDR(dl))); + error = 0; + + switch (rtm->rtm_type) { + case RTM_ADD: + /* Add static LLE */ + laflags = 0; + if (rtm->rtm_rmx.rmx_expire == 0) + laflags = LLE_STATIC; + lle = lltable_alloc_entry(llt, laflags, dst); + if (lle == NULL) + return (ENOMEM); + + linkhdrsize = sizeof(linkhdr); + if (lltable_calc_llheader(ifp, dst->sa_family, LLADDR(dl), + linkhdr, &linkhdrsize, &lladdr_off) != 0) + return (EINVAL); + lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, + lladdr_off); + if ((rtm->rtm_flags & RTF_ANNOUNCE)) + lle->la_flags |= LLE_PUB; + lle->la_expire = rtm->rtm_rmx.rmx_expire; + + laflags = lle->la_flags; + + /* Try to link new entry */ + lle_tmp = NULL; + IF_AFDATA_WLOCK(ifp); + LLE_WLOCK(lle); + lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, dst); + if (lle_tmp != NULL) { + /* Check if we are trying to replace immutable entry */ + if ((lle_tmp->la_flags & LLE_IFADDR) != 0) { + IF_AFDATA_WUNLOCK(ifp); + LLE_WUNLOCK(lle_tmp); + 
lltable_free_entry(llt, lle); + return (EPERM); } -#endif - } else { - if (flags & LLE_EXCLUSIVE) - LLE_WUNLOCK(lle); - else - LLE_RUNLOCK(lle); + /* Unlink existing entry from table */ + lltable_unlink_entry(llt, lle_tmp); } - } else if ((lle == NULL) && (flags & LLE_DELETE)) - error = EINVAL; + lltable_link_entry(llt, lle); + IF_AFDATA_WUNLOCK(ifp); + if (lle_tmp != NULL) { + EVENTHANDLER_INVOKE(lle_event, lle_tmp,LLENTRY_EXPIRED); + lltable_free_entry(llt, lle_tmp); + } - return (error); -} + /* + * By invoking LLE handler here we might get + * two events on static LLE entry insertion + * in routing socket. However, since we might have + * other subscribers we need to generate this event. + */ + EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); + LLE_WUNLOCK(lle); +#ifdef INET + /* gratuitous ARP */ + if ((laflags & LLE_PUB) && dst->sa_family == AF_INET) + arprequest(ifp, + &((struct sockaddr_in *)dst)->sin_addr, + &((struct sockaddr_in *)dst)->sin_addr, + (u_char *)LLADDR(dl)); +#endif -static void -vnet_lltable_init() -{ + break; - SLIST_INIT(&V_lltables); + case RTM_DELETE: + return (lltable_delete_addr(llt, 0, dst)); + + default: + error = EINVAL; + } + + return (error); } -VNET_SYSINIT(vnet_lltable_init, SI_SUB_PSEUDO, SI_ORDER_FIRST, - vnet_lltable_init, NULL); #ifdef DDB struct llentry_sa { @@ -429,15 +851,14 @@ llatbl_lle_show(struct llentry_sa *la) db_printf(" la_flags=0x%04x\n", lle->la_flags); db_printf(" la_asked=%u\n", lle->la_asked); db_printf(" la_preempt=%u\n", lle->la_preempt); - db_printf(" ln_byhint=%u\n", lle->ln_byhint); db_printf(" ln_state=%d\n", lle->ln_state); db_printf(" ln_router=%u\n", lle->ln_router); db_printf(" ln_ntick=%ju\n", (uintmax_t)lle->ln_ntick); db_printf(" lle_refcnt=%d\n", lle->lle_refcnt); - bcopy(&lle->ll_addr.mac16, octet, sizeof(octet)); + bcopy(lle->ll_addr, octet, sizeof(octet)); db_printf(" ll_addr=%02x:%02x:%02x:%02x:%02x:%02x\n", octet[0], octet[1], octet[2], octet[3], octet[4], octet[5]); - db_printf(" 
la_timer=%p\n", &lle->la_timer); + db_printf(" lle_timer=%p\n", &lle->lle_timer); switch (la->l3_addr.sa_family) { #ifdef INET @@ -490,7 +911,7 @@ llatbl_llt_show(struct lltable *llt) db_printf("llt=%p llt_af=%d llt_ifp=%p\n", llt, llt->llt_af, llt->llt_ifp); - for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { + for (i = 0; i < llt->llt_hsize; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { llatbl_lle_show((struct llentry_sa *)lle); diff --git a/freebsd/sys/net/if_llatbl.h b/freebsd/sys/net/if_llatbl.h index 8ac72c4f..51de726a 100644 --- a/freebsd/sys/net/if_llatbl.h +++ b/freebsd/sys/net/if_llatbl.h @@ -30,8 +30,6 @@ __FBSDID("$FreeBSD$"); #ifndef _NET_IF_LLATBL_H_ #define _NET_IF_LLATBL_H_ -#include <rtems/bsd/local/opt_ofed.h> - #include <sys/_rwlock.h> #include <netinet/in.h> @@ -50,42 +48,44 @@ extern struct rwlock lltable_rwlock; #define LLTABLE_WUNLOCK() rw_wunlock(&lltable_rwlock) #define LLTABLE_LOCK_ASSERT() rw_assert(&lltable_rwlock, RA_LOCKED) +#define LLE_MAX_LINKHDR 24 /* Full IB header */ /* * Code referencing llentry must at least hold * a shared lock */ struct llentry { LIST_ENTRY(llentry) lle_next; - struct rwlock lle_lock; + union { + struct in_addr addr4; + struct in6_addr addr6; + } r_l3addr; + char r_linkdata[LLE_MAX_LINKHDR]; /* L2 data */ + uint8_t r_hdrlen; /* length for LL header */ + uint8_t spare0[3]; + uint16_t r_flags; /* LLE runtime flags */ + uint16_t r_skip_req; /* feedback from fast path */ + struct lltable *lle_tbl; struct llentries *lle_head; - void (*lle_free)(struct lltable *, struct llentry *); + void (*lle_free)(struct llentry *); struct mbuf *la_hold; int la_numheld; /* # of packets currently held */ time_t la_expire; uint16_t la_flags; uint16_t la_asked; uint16_t la_preempt; - uint16_t ln_byhint; int16_t ln_state; /* IPv6 has ND6_LLINFO_NOSTATE == -2 */ uint16_t ln_router; time_t ln_ntick; + time_t lle_remtime; /* Real time remaining */ + time_t lle_hittime; /* Time when r_skip_req was unset */ int lle_refcnt; + char 
*ll_addr; /* link-layer address */ - union { - uint64_t mac_aligned; - uint16_t mac16[3]; -#ifdef OFED - uint8_t mac8[20]; /* IB needs 20 bytes. */ -#endif - } ll_addr; - - /* XXX af-private? */ - union { - struct callout ln_timer_ch; - struct callout la_timer; - } lle_timer; - /* NB: struct sockaddr must immediately follow */ + LIST_ENTRY(llentry) lle_chain; /* chain of deleted items */ + struct callout lle_timer; + struct rwlock lle_lock; + struct mtx req_mtx; }; #define LLE_WLOCK(lle) rw_wlock(&(lle)->lle_lock) @@ -98,6 +98,12 @@ struct llentry { #define LLE_LOCK_DESTROY(lle) rw_destroy(&(lle)->lle_lock) #define LLE_WLOCK_ASSERT(lle) rw_assert(&(lle)->lle_lock, RA_WLOCKED) +#define LLE_REQ_INIT(lle) mtx_init(&(lle)->req_mtx, "lle req", \ + NULL, MTX_DEF) +#define LLE_REQ_DESTROY(lle) mtx_destroy(&(lle)->req_mtx) +#define LLE_REQ_LOCK(lle) mtx_lock(&(lle)->req_mtx) +#define LLE_REQ_UNLOCK(lle) mtx_unlock(&(lle)->req_mtx) + #define LLE_IS_VALID(lle) (((lle) != NULL) && ((lle) != (void *)-1)) #define LLE_ADDREF(lle) do { \ @@ -118,7 +124,7 @@ struct llentry { #define LLE_FREE_LOCKED(lle) do { \ if ((lle)->lle_refcnt == 1) \ - (lle)->lle_free((lle)->lle_tbl, (lle)); \ + (lle)->lle_free(lle); \ else { \ LLE_REMREF(lle); \ LLE_WUNLOCK(lle); \ @@ -132,58 +138,77 @@ struct llentry { LLE_FREE_LOCKED(lle); \ } while (0) +typedef struct llentry *(llt_lookup_t)(struct lltable *, u_int flags, + const struct sockaddr *l3addr); +typedef struct llentry *(llt_alloc_t)(struct lltable *, u_int flags, + const struct sockaddr *l3addr); +typedef void (llt_delete_t)(struct lltable *, struct llentry *); +typedef void (llt_prefix_free_t)(struct lltable *, + const struct sockaddr *addr, const struct sockaddr *mask, u_int flags); +typedef int (llt_dump_entry_t)(struct lltable *, struct llentry *, + struct sysctl_req *); +typedef uint32_t (llt_hash_t)(const struct llentry *, uint32_t); +typedef int (llt_match_prefix_t)(const struct sockaddr *, + const struct sockaddr *, u_int, struct 
llentry *); +typedef void (llt_free_entry_t)(struct lltable *, struct llentry *); +typedef void (llt_fill_sa_entry_t)(const struct llentry *, struct sockaddr *); +typedef void (llt_free_tbl_t)(struct lltable *); +typedef void (llt_link_entry_t)(struct lltable *, struct llentry *); +typedef void (llt_unlink_entry_t)(struct llentry *); -#define ln_timer_ch lle_timer.ln_timer_ch -#define la_timer lle_timer.la_timer - -/* XXX bad name */ -#define L3_ADDR(lle) ((struct sockaddr *)(&lle[1])) -#define L3_ADDR_LEN(lle) (((struct sockaddr *)(&lle[1]))->sa_len) - -#ifndef LLTBL_HASHTBL_SIZE -#define LLTBL_HASHTBL_SIZE 32 /* default 32 ? */ -#endif - -#ifndef LLTBL_HASHMASK -#define LLTBL_HASHMASK (LLTBL_HASHTBL_SIZE - 1) -#endif +typedef int (llt_foreach_cb_t)(struct lltable *, struct llentry *, void *); +typedef int (llt_foreach_entry_t)(struct lltable *, llt_foreach_cb_t *, void *); struct lltable { SLIST_ENTRY(lltable) llt_link; - struct llentries lle_head[LLTBL_HASHTBL_SIZE]; int llt_af; + int llt_hsize; + struct llentries *lle_head; struct ifnet *llt_ifp; - void (*llt_prefix_free)(struct lltable *, - const struct sockaddr *prefix, - const struct sockaddr *mask, - u_int flags); - struct llentry * (*llt_lookup)(struct lltable *, u_int flags, - const struct sockaddr *l3addr); - int (*llt_dump)(struct lltable *, - struct sysctl_req *); + llt_lookup_t *llt_lookup; + llt_alloc_t *llt_alloc_entry; + llt_delete_t *llt_delete_entry; + llt_prefix_free_t *llt_prefix_free; + llt_dump_entry_t *llt_dump_entry; + llt_hash_t *llt_hash; + llt_match_prefix_t *llt_match_prefix; + llt_free_entry_t *llt_free_entry; + llt_foreach_entry_t *llt_foreach_entry; + llt_link_entry_t *llt_link_entry; + llt_unlink_entry_t *llt_unlink_entry; + llt_fill_sa_entry_t *llt_fill_sa_entry; + llt_free_tbl_t *llt_free_tbl; }; + MALLOC_DECLARE(M_LLTABLE); /* - * flags to be passed to arplookup. 
+ * LLentry flags */ #define LLE_DELETED 0x0001 /* entry must be deleted */ #define LLE_STATIC 0x0002 /* entry is static */ #define LLE_IFADDR 0x0004 /* entry is interface addr */ #define LLE_VALID 0x0008 /* ll_addr is valid */ -#define LLE_PROXY 0x0010 /* proxy entry ??? */ +#define LLE_REDIRECT 0x0010 /* installed by redirect; has host rtentry */ #define LLE_PUB 0x0020 /* publish entry ??? */ #define LLE_LINKED 0x0040 /* linked to lookup structure */ +/* LLE request flags */ #define LLE_EXCLUSIVE 0x2000 /* return lle xlocked */ -#define LLE_DELETE 0x4000 /* delete on a lookup - match LLE_IFADDR */ -#define LLE_CREATE 0x8000 /* create on a lookup miss */ +#define LLE_UNLOCKED 0x4000 /* return lle unlocked */ +#define LLE_ADDRONLY 0x4000 /* return lladdr instead of full header */ +#define LLE_CREATE 0x8000 /* hint to avoid lle lookup */ + +/* LLE flags used by fastpath code */ +#define RLLE_VALID 0x0001 /* entry is valid */ +#define RLLE_IFADDR LLE_IFADDR /* entry is ifaddr */ #define LLATBL_HASH(key, mask) \ (((((((key >> 8) ^ key) >> 8) ^ key) >> 8) ^ key) & mask) -struct lltable *lltable_init(struct ifnet *, int); +struct lltable *lltable_allocate_htbl(uint32_t hsize); void lltable_free(struct lltable *); +void lltable_link(struct lltable *llt); void lltable_prefix_free(int, struct sockaddr *, struct sockaddr *, u_int); #if 0 @@ -195,13 +220,37 @@ size_t llentry_free(struct llentry *); struct llentry *llentry_alloc(struct ifnet *, struct lltable *, struct sockaddr_storage *); +/* helper functions */ +size_t lltable_drop_entry_queue(struct llentry *); +void lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle, + const char *linkhdr, size_t linkhdrsize, int lladdr_off); +int lltable_try_set_entry_addr(struct ifnet *ifp, struct llentry *lle, + const char *linkhdr, size_t linkhdrsize, int lladdr_off); + +int lltable_calc_llheader(struct ifnet *ifp, int family, char *lladdr, + char *buf, size_t *bufsize, int *lladdr_off); +void lltable_update_ifaddr(struct 
lltable *llt); +struct llentry *lltable_alloc_entry(struct lltable *llt, u_int flags, + const struct sockaddr *l4addr); +void lltable_free_entry(struct lltable *llt, struct llentry *lle); +int lltable_delete_addr(struct lltable *llt, u_int flags, + const struct sockaddr *l3addr); +void lltable_link_entry(struct lltable *llt, struct llentry *lle); +void lltable_unlink_entry(struct lltable *llt, struct llentry *lle); +void lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa); +struct ifnet *lltable_get_ifp(const struct lltable *llt); +int lltable_get_af(const struct lltable *llt); + +int lltable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, + void *farg); /* * Generic link layer address lookup function. */ static __inline struct llentry * lla_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { - return llt->llt_lookup(llt, flags, l3addr); + + return (llt->llt_lookup(llt, flags, l3addr)); } int lla_rt_output(struct rt_msghdr *, struct rt_addrinfo *); diff --git a/freebsd/sys/net/if_loop.c b/freebsd/sys/net/if_loop.c index b40dec8e..aa5109eb 100644 --- a/freebsd/sys/net/if_loop.c +++ b/freebsd/sys/net/if_loop.c @@ -36,10 +36,8 @@ * Loopback interface driver for protocol testing and timing. 
*/ -#include <rtems/bsd/local/opt_atalk.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> -#include <rtems/bsd/local/opt_ipx.h> #include <rtems/bsd/sys/param.h> #include <sys/systm.h> @@ -53,6 +51,7 @@ #include <sys/sysctl.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_clone.h> #include <net/if_types.h> #include <net/netisr.h> @@ -65,11 +64,6 @@ #include <netinet/in_var.h> #endif -#ifdef IPX -#include <netipx/ipx.h> -#include <netipx/ipx_if.h> -#endif - #ifdef INET6 #ifndef INET #include <netinet/in.h> @@ -78,11 +72,6 @@ #include <netinet/ip6.h> #endif -#ifdef NETATALK -#include <netatalk/at.h> -#include <netatalk/at_var.h> -#endif - #include <security/mac/mac_framework.h> #ifdef TINY_LOMTU @@ -101,22 +90,20 @@ CSUM_SCTP_VALID) int loioctl(struct ifnet *, u_long, caddr_t); -static void lortrequest(int, struct rtentry *, struct rt_addrinfo *); int looutput(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *dst, struct route *ro); + const struct sockaddr *dst, struct route *ro); static int lo_clone_create(struct if_clone *, int, caddr_t); static void lo_clone_destroy(struct ifnet *); VNET_DEFINE(struct ifnet *, loif); /* Used externally */ #ifdef VIMAGE -static VNET_DEFINE(struct ifc_simple_data, lo_cloner_data); -static VNET_DEFINE(struct if_clone, lo_cloner); -#define V_lo_cloner_data VNET(lo_cloner_data) +static VNET_DEFINE(struct if_clone *, lo_cloner); #define V_lo_cloner VNET(lo_cloner) #endif -IFC_SIMPLE_DECLARE(lo, 1); +static struct if_clone *lo_cloner; +static const char loname[] = "lo"; static void lo_clone_destroy(struct ifnet *ifp) @@ -141,7 +128,7 @@ lo_clone_create(struct if_clone *ifc, int unit, caddr_t params) if (ifp == NULL) return (ENOSPC); - if_initname(ifp, ifc->ifc_name, unit); + if_initname(ifp, loname, unit); ifp->if_mtu = LOMTU; ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; ifp->if_ioctl = loioctl; @@ -163,15 +150,15 @@ vnet_loif_init(const void *unused __unused) { #ifdef VIMAGE + lo_cloner 
= if_clone_simple(loname, lo_clone_create, lo_clone_destroy, + 1); V_lo_cloner = lo_cloner; - V_lo_cloner_data = lo_cloner_data; - V_lo_cloner.ifc_data = &V_lo_cloner_data; - if_clone_attach(&V_lo_cloner); #else - if_clone_attach(&lo_cloner); + lo_cloner = if_clone_simple(loname, lo_clone_create, lo_clone_destroy, + 1); #endif } -VNET_SYSINIT(vnet_loif_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, +VNET_SYSINIT(vnet_loif_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_loif_init, NULL); #ifdef VIMAGE @@ -179,10 +166,10 @@ static void vnet_loif_uninit(const void *unused __unused) { - if_clone_detach(&V_lo_cloner); + if_clone_detach(V_lo_cloner); V_loif = NULL; } -VNET_SYSUNINIT(vnet_loif_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, +VNET_SYSUNINIT(vnet_loif_uninit, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_loif_uninit, NULL); #endif @@ -213,19 +200,16 @@ static moduledata_t loop_mod = { DECLARE_MODULE(if_lo, loop_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); int -looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, +looutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int32_t af; - struct rtentry *rt = NULL; #ifdef MAC int error; #endif M_ASSERTPKTHDR(m); /* check if we have the packet header */ - if (ro != NULL) - rt = ro->ro_rt; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) { @@ -234,23 +218,22 @@ looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, } #endif - if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + if (ro != NULL && ro->ro_flags & (RT_REJECT|RT_BLACKHOLE)) { m_freem(m); - return (rt->rt_flags & RTF_BLACKHOLE ? 0 : - rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); + return (ro->ro_flags & RT_BLACKHOLE ? 0 : EHOSTUNREACH); } - ifp->if_opackets++; - ifp->if_obytes += m->m_pkthdr.len; + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); /* BPF writes need to be handled specially. 
*/ - if (dst->sa_family == AF_UNSPEC) { + if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) bcopy(dst->sa_data, &af, sizeof(af)); - dst->sa_family = af; - } + else + af = dst->sa_family; #if 1 /* XXX */ - switch (dst->sa_family) { + switch (af) { case AF_INET: if (ifp->if_capenable & IFCAP_RXCSUM) { m->m_pkthdr.csum_data = 0xffff; @@ -275,16 +258,13 @@ looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, #endif m->m_pkthdr.csum_flags &= ~LO_CSUM_FEATURES6; break; - case AF_IPX: - case AF_APPLETALK: - break; default: - printf("looutput: af=%d unexpected\n", dst->sa_family); + printf("looutput: af=%d unexpected\n", af); m_freem(m); return (EAFNOSUPPORT); } #endif - return (if_simloop(ifp, m, dst->sa_family, 0)); + return (if_simloop(ifp, m, af, 0)); } /* @@ -370,36 +350,17 @@ if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen) isr = NETISR_IPV6; break; #endif -#ifdef IPX - case AF_IPX: - isr = NETISR_IPX; - break; -#endif -#ifdef NETATALK - case AF_APPLETALK: - isr = NETISR_ATALK2; - break; -#endif default: printf("if_simloop: can't handle af=%d\n", af); m_freem(m); return (EAFNOSUPPORT); } - ifp->if_ipackets++; - ifp->if_ibytes += m->m_pkthdr.len; + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); netisr_queue(isr, m); /* mbuf is free'd on failure. */ return (0); } -/* ARGSUSED */ -static void -lortrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) -{ - - RT_LOCK_ASSERT(rt); - rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; -} - /* * Process an ioctl request. 
*/ @@ -407,7 +368,6 @@ lortrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) int loioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { - struct ifaddr *ifa; struct ifreq *ifr = (struct ifreq *)data; int error = 0, mask; @@ -415,8 +375,6 @@ loioctl(struct ifnet *ifp, u_long cmd, caddr_t data) case SIOCSIFADDR: ifp->if_flags |= IFF_UP; ifp->if_drv_flags |= IFF_DRV_RUNNING; - ifa = (struct ifaddr *)data; - ifa->ifa_rtrequest = lortrequest; /* * Everything else is done at a higher level. */ @@ -424,7 +382,7 @@ loioctl(struct ifnet *ifp, u_long cmd, caddr_t data) case SIOCADDMULTI: case SIOCDELMULTI: - if (ifr == 0) { + if (ifr == NULL) { error = EAFNOSUPPORT; /* XXX */ break; } diff --git a/freebsd/sys/net/if_media.c b/freebsd/sys/net/if_media.c index 264d3535..66b13568 100644 --- a/freebsd/sys/net/if_media.c +++ b/freebsd/sys/net/if_media.c @@ -48,6 +48,8 @@ * to implement this interface. */ +#include <rtems/bsd/local/opt_ifmedia.h> + #include <rtems/bsd/sys/param.h> #include <sys/systm.h> #include <sys/socket.h> @@ -70,6 +72,7 @@ static struct ifmedia_entry *ifmedia_match(struct ifmedia *ifm, int flags, int mask); #ifdef IFMEDIA_DEBUG +#include <net/if_var.h> int ifmedia_debug = 0; SYSCTL_INT(_debug, OID_AUTO, ifmedia, CTLFLAG_RW, &ifmedia_debug, 0, "if_media debugging msgs"); @@ -195,6 +198,21 @@ ifmedia_set(ifm, target) } /* + * Given a media word, return one suitable for an application + * using the original encoding. + */ +static int +compat_media(int media) +{ + + if (IFM_TYPE(media) == IFM_ETHER && IFM_SUBTYPE(media) > IFM_OTHER) { + media &= ~(IFM_ETH_XTYPE|IFM_TMASK); + media |= IFM_OTHER; + } + return (media); +} + +/* * Device-independent media ioctl support function. 
*/ int @@ -206,7 +224,7 @@ ifmedia_ioctl(ifp, ifr, ifm, cmd) { struct ifmedia_entry *match; struct ifmediareq *ifmr = (struct ifmediareq *) ifr; - int error = 0, sticky; + int error = 0; if (ifp == NULL || ifr == NULL || ifm == NULL) return(EINVAL); @@ -273,80 +291,42 @@ ifmedia_ioctl(ifp, ifr, ifm, cmd) * Get list of available media and current media on interface. */ case SIOCGIFMEDIA: + case SIOCGIFXMEDIA: { struct ifmedia_entry *ep; - int *kptr, count; - int usermax; /* user requested max */ + int i; - kptr = NULL; /* XXX gcc */ + if (ifmr->ifm_count < 0) + return (EINVAL); - ifmr->ifm_active = ifmr->ifm_current = ifm->ifm_cur ? - ifm->ifm_cur->ifm_media : IFM_NONE; + if (cmd == SIOCGIFMEDIA) { + ifmr->ifm_active = ifmr->ifm_current = ifm->ifm_cur ? + compat_media(ifm->ifm_cur->ifm_media) : IFM_NONE; + } else { + ifmr->ifm_active = ifmr->ifm_current = ifm->ifm_cur ? + ifm->ifm_cur->ifm_media : IFM_NONE; + } ifmr->ifm_mask = ifm->ifm_mask; ifmr->ifm_status = 0; (*ifm->ifm_status)(ifp, ifmr); - count = 0; - usermax = 0; - /* * If there are more interfaces on the list, count * them. This allows the caller to set ifmr->ifm_count * to 0 on the first call to know how much space to * allocate. */ + i = 0; LIST_FOREACH(ep, &ifm->ifm_list, ifm_list) - usermax++; - - /* - * Don't allow the user to ask for too many - * or a negative number. - */ - if (ifmr->ifm_count > usermax) - ifmr->ifm_count = usermax; - else if (ifmr->ifm_count < 0) - return (EINVAL); - - if (ifmr->ifm_count != 0) { - kptr = (int *)malloc(ifmr->ifm_count * sizeof(int), - M_TEMP, M_NOWAIT); - - if (kptr == NULL) - return (ENOMEM); - /* - * Get the media words from the interface's list. - */ - ep = LIST_FIRST(&ifm->ifm_list); - for (; ep != NULL && count < ifmr->ifm_count; - ep = LIST_NEXT(ep, ifm_list), count++) - kptr[count] = ep->ifm_media; - - if (ep != NULL) - error = E2BIG; /* oops! 
*/ - } else { - count = usermax; - } - - /* - * We do the copyout on E2BIG, because that's - * just our way of telling userland that there - * are more. This is the behavior I've observed - * under BSD/OS 3.0 - */ - sticky = error; - if ((error == 0 || error == E2BIG) && ifmr->ifm_count != 0) { - error = copyout((caddr_t)kptr, - (caddr_t)ifmr->ifm_ulist, - ifmr->ifm_count * sizeof(int)); - } - - if (error == 0) - error = sticky; - - if (ifmr->ifm_count != 0) - free(kptr, M_TEMP); - - ifmr->ifm_count = count; + if (i++ < ifmr->ifm_count) { + error = copyout(&ep->ifm_media, + ifmr->ifm_ulist + i - 1, sizeof(int)); + if (error) + break; + } + if (error == 0 && i > ifmr->ifm_count) + error = ifmr->ifm_count ? E2BIG : 0; + ifmr->ifm_count = i; break; } @@ -400,8 +380,7 @@ ifmedia_baudrate(int mword) int i; for (i = 0; ifmedia_baudrate_descriptions[i].ifmb_word != 0; i++) { - if ((mword & (IFM_NMASK|IFM_TMASK)) == - ifmedia_baudrate_descriptions[i].ifmb_word) + if (IFM_TYPE_MATCH(mword, ifmedia_baudrate_descriptions[i].ifmb_word)) return (ifmedia_baudrate_descriptions[i].ifmb_baudrate); } @@ -507,7 +486,7 @@ ifmedia_printword(ifmw) printf("<unknown type>\n"); return; } - printf(desc->ifmt_string); + printf("%s", desc->ifmt_string); /* Any mode. */ for (desc = ttos->modes; desc && desc->ifmt_string != NULL; desc++) diff --git a/freebsd/sys/net/if_media.h b/freebsd/sys/net/if_media.h index 12585095..86439950 100644 --- a/freebsd/sys/net/if_media.h +++ b/freebsd/sys/net/if_media.h @@ -54,11 +54,13 @@ #include <sys/queue.h> +struct ifnet; + /* * Driver callbacks for media status and change requests. */ -typedef int (*ifm_change_cb_t)(struct ifnet *ifp); -typedef void (*ifm_stat_cb_t)(struct ifnet *ifp, struct ifmediareq *req); +typedef int (*ifm_change_cb_t)(struct ifnet *); +typedef void (*ifm_stat_cb_t)(struct ifnet *, struct ifmediareq *req); /* * In-kernel representation of a single supported media type. 
@@ -104,6 +106,7 @@ void ifmedia_set(struct ifmedia *ifm, int mword); int ifmedia_ioctl(struct ifnet *ifp, struct ifreq *ifr, struct ifmedia *ifm, u_long cmd); + /* Compute baudrate for a given media. */ uint64_t ifmedia_baudrate(int); @@ -115,7 +118,7 @@ uint64_t ifmedia_baudrate(int); * ---- ------- * 0-4 Media variant * 5-7 Media type - * 8-15 Type specific options + * 8-15 Type specific options (includes added variant bits on Ethernet) * 16-18 Mode (for multi-mode devices) * 19 RFU * 20-27 Shared (global) options @@ -124,8 +127,18 @@ uint64_t ifmedia_baudrate(int); /* * Ethernet + * In order to use more than 31 subtypes, Ethernet uses some of the option + * bits as part of the subtype field. See the options section below for + * relevant definitions */ #define IFM_ETHER 0x00000020 +#define IFM_ETHER_SUBTYPE(x) (((x) & IFM_TMASK) | \ + (((x) & (IFM_ETH_XTYPE >> IFM_ETH_XSHIFT)) << IFM_ETH_XSHIFT)) +#define IFM_X(x) IFM_ETHER_SUBTYPE(x) /* internal shorthand */ +#define IFM_ETHER_SUBTYPE_SET(x) (IFM_ETHER_SUBTYPE(x) | IFM_ETHER) +#define IFM_ETHER_SUBTYPE_GET(x) ((x) & (IFM_TMASK|IFM_ETH_XTYPE)) +#define IFM_ETHER_IS_EXTENDED(x) ((x) & IFM_ETH_XTYPE) + #define IFM_10_T 3 /* 10BaseT - RJ45 */ #define IFM_10_2 4 /* 10Base2 - Thinnet */ #define IFM_10_5 5 /* 10Base5 - AUI */ @@ -153,15 +166,49 @@ uint64_t ifmedia_baudrate(int); #define IFM_40G_CR4 27 /* 40GBase-CR4 */ #define IFM_40G_SR4 28 /* 40GBase-SR4 */ #define IFM_40G_LR4 29 /* 40GBase-LR4 */ +#define IFM_1000_KX 30 /* 1000Base-KX backplane */ +#define IFM_OTHER 31 /* Other: one of the following */ + +/* following types are not visible to old binaries using only IFM_TMASK */ +#define IFM_10G_KX4 IFM_X(32) /* 10GBase-KX4 backplane */ +#define IFM_10G_KR IFM_X(33) /* 10GBase-KR backplane */ +#define IFM_10G_CR1 IFM_X(34) /* 10GBase-CR1 Twinax splitter */ +#define IFM_20G_KR2 IFM_X(35) /* 20GBase-KR2 backplane */ +#define IFM_2500_KX IFM_X(36) /* 2500Base-KX backplane */ +#define IFM_2500_T IFM_X(37) /* 
2500Base-T - RJ45 (NBaseT) */ +#define IFM_5000_T IFM_X(38) /* 5000Base-T - RJ45 (NBaseT) */ +#define IFM_50G_PCIE IFM_X(39) /* 50G Ethernet over PCIE */ +#define IFM_25G_PCIE IFM_X(40) /* 25G Ethernet over PCIE */ +#define IFM_1000_SGMII IFM_X(41) /* 1G media interface */ +#define IFM_10G_SFI IFM_X(42) /* 10G media interface */ +#define IFM_40G_XLPPI IFM_X(43) /* 40G media interface */ +#define IFM_1000_CX_SGMII IFM_X(44) /* 1000Base-CX-SGMII */ +#define IFM_40G_KR4 IFM_X(45) /* 40GBase-KR4 */ +#define IFM_10G_ER IFM_X(46) /* 10GBase-ER */ +#define IFM_100G_CR4 IFM_X(47) /* 100GBase-CR4 */ +#define IFM_100G_SR4 IFM_X(48) /* 100GBase-SR4 */ +#define IFM_100G_KR4 IFM_X(49) /* 100GBase-KR4 */ +#define IFM_100G_LR4 IFM_X(50) /* 100GBase-LR4 */ +#define IFM_56G_R4 IFM_X(51) /* 56GBase-R4 */ +#define IFM_100_T IFM_X(52) /* 100BaseT - RJ45 */ +#define IFM_25G_CR IFM_X(53) /* 25GBase-CR */ +#define IFM_25G_KR IFM_X(54) /* 25GBase-KR */ +#define IFM_25G_SR IFM_X(55) /* 25GBase-SR */ +#define IFM_50G_CR2 IFM_X(56) /* 50GBase-CR2 */ +#define IFM_50G_KR2 IFM_X(57) /* 50GBase-KR2 */ + /* * Please update ieee8023ad_lacp.c:lacp_compose_key() * after adding new Ethernet media types. */ -/* note 31 is the max! */ +/* Note IFM_X(511) is the max! 
*/ +/* Ethernet option values; includes bits used for extended variant field */ #define IFM_ETH_MASTER 0x00000100 /* master mode (1000baseT) */ #define IFM_ETH_RXPAUSE 0x00000200 /* receive PAUSE frames */ #define IFM_ETH_TXPAUSE 0x00000400 /* transmit PAUSE frames */ +#define IFM_ETH_XTYPE 0x00007800 /* extended media variants */ +#define IFM_ETH_XSHIFT 6 /* shift XTYPE next to TMASK */ /* * Token ring @@ -253,11 +300,6 @@ uint64_t ifmedia_baudrate(int); #define IFM_ATM_UNASSIGNED 0x00000400 /* unassigned cells */ /* - * CARP Common Address Redundancy Protocol - */ -#define IFM_CARP 0x000000c0 - -/* * Shared media sub-types */ #define IFM_AUTO 0 /* Autoselect best media */ @@ -309,7 +351,10 @@ uint64_t ifmedia_baudrate(int); * Macros to extract various bits of information from the media word. */ #define IFM_TYPE(x) ((x) & IFM_NMASK) -#define IFM_SUBTYPE(x) ((x) & IFM_TMASK) +#define IFM_SUBTYPE(x) \ + (IFM_TYPE(x) == IFM_ETHER ? IFM_ETHER_SUBTYPE_GET(x) : ((x) & IFM_TMASK)) +#define IFM_TYPE_MATCH(x,y) \ + (IFM_TYPE(x) == IFM_TYPE(y) && IFM_SUBTYPE(x) == IFM_SUBTYPE(y)) #define IFM_TYPE_OPTIONS(x) ((x) & IFM_OMASK) #define IFM_INST(x) (((x) & IFM_IMASK) >> IFM_ISHIFT) #define IFM_OPTIONS(x) ((x) & (IFM_OMASK | IFM_GMASK)) @@ -343,7 +388,6 @@ struct ifmedia_description { { IFM_FDDI, "FDDI" }, \ { IFM_IEEE80211, "IEEE 802.11 Wireless Ethernet" }, \ { IFM_ATM, "ATM" }, \ - { IFM_CARP, "Common Address Redundancy Protocol" }, \ { 0, NULL }, \ } @@ -375,6 +419,34 @@ struct ifmedia_description { { IFM_40G_CR4, "40Gbase-CR4" }, \ { IFM_40G_SR4, "40Gbase-SR4" }, \ { IFM_40G_LR4, "40Gbase-LR4" }, \ + { IFM_1000_KX, "1000Base-KX" }, \ + { IFM_OTHER, "Other" }, \ + { IFM_10G_KX4, "10GBase-KX4" }, \ + { IFM_10G_KR, "10GBase-KR" }, \ + { IFM_10G_CR1, "10GBase-CR1" }, \ + { IFM_20G_KR2, "20GBase-KR2" }, \ + { IFM_2500_KX, "2500Base-KX" }, \ + { IFM_2500_T, "2500Base-T" }, \ + { IFM_5000_T, "5000Base-T" }, \ + { IFM_50G_PCIE, "PCIExpress-50G" }, \ + { IFM_25G_PCIE, 
"PCIExpress-25G" }, \ + { IFM_1000_SGMII, "1000Base-SGMII" }, \ + { IFM_10G_SFI, "10GBase-SFI" }, \ + { IFM_40G_XLPPI, "40GBase-XLPPI" }, \ + { IFM_1000_CX_SGMII, "1000Base-CX-SGMII" }, \ + { IFM_40G_KR4, "40GBase-KR4" }, \ + { IFM_10G_ER, "10GBase-ER" }, \ + { IFM_100G_CR4, "100GBase-CR4" }, \ + { IFM_100G_SR4, "100GBase-SR4" }, \ + { IFM_100G_KR4, "100GBase-KR4" }, \ + { IFM_100G_LR4, "100GBase-LR4" }, \ + { IFM_56G_R4, "56GBase-R4" }, \ + { IFM_100_T, "100BaseT" }, \ + { IFM_25G_CR, "25GBase-CR" }, \ + { IFM_25G_KR, "25GBase-KR" }, \ + { IFM_25G_SR, "25GBase-SR" }, \ + { IFM_50G_CR2, "50GBase-CR2" }, \ + { IFM_50G_KR2, "50GBase-KR2" }, \ { 0, NULL }, \ } @@ -676,6 +748,33 @@ struct ifmedia_baudrate { { IFM_ETHER | IFM_40G_CR4, IF_Gbps(40ULL) }, \ { IFM_ETHER | IFM_40G_SR4, IF_Gbps(40ULL) }, \ { IFM_ETHER | IFM_40G_LR4, IF_Gbps(40ULL) }, \ + { IFM_ETHER | IFM_1000_KX, IF_Mbps(1000) }, \ + { IFM_ETHER | IFM_10G_KX4, IF_Gbps(10ULL) }, \ + { IFM_ETHER | IFM_10G_KR, IF_Gbps(10ULL) }, \ + { IFM_ETHER | IFM_10G_CR1, IF_Gbps(10ULL) }, \ + { IFM_ETHER | IFM_20G_KR2, IF_Gbps(20ULL) }, \ + { IFM_ETHER | IFM_2500_KX, IF_Mbps(2500) }, \ + { IFM_ETHER | IFM_2500_T, IF_Mbps(2500) }, \ + { IFM_ETHER | IFM_5000_T, IF_Mbps(5000) }, \ + { IFM_ETHER | IFM_50G_PCIE, IF_Gbps(50ULL) }, \ + { IFM_ETHER | IFM_25G_PCIE, IF_Gbps(25ULL) }, \ + { IFM_ETHER | IFM_1000_SGMII, IF_Mbps(1000) }, \ + { IFM_ETHER | IFM_10G_SFI, IF_Gbps(10ULL) }, \ + { IFM_ETHER | IFM_40G_XLPPI, IF_Gbps(40ULL) }, \ + { IFM_ETHER | IFM_1000_CX_SGMII, IF_Mbps(1000) }, \ + { IFM_ETHER | IFM_40G_KR4, IF_Gbps(40ULL) }, \ + { IFM_ETHER | IFM_10G_ER, IF_Gbps(10ULL) }, \ + { IFM_ETHER | IFM_100G_CR4, IF_Gbps(100ULL) }, \ + { IFM_ETHER | IFM_100G_SR4, IF_Gbps(100ULL) }, \ + { IFM_ETHER | IFM_100G_KR4, IF_Gbps(100ULL) }, \ + { IFM_ETHER | IFM_100G_LR4, IF_Gbps(100ULL) }, \ + { IFM_ETHER | IFM_56G_R4, IF_Gbps(56ULL) }, \ + { IFM_ETHER | IFM_100_T, IF_Mbps(100ULL) }, \ + { IFM_ETHER | IFM_25G_CR, IF_Gbps(25ULL) }, \ + { 
IFM_ETHER | IFM_25G_KR, IF_Gbps(25ULL) }, \ + { IFM_ETHER | IFM_25G_SR, IF_Gbps(25ULL) }, \ + { IFM_ETHER | IFM_50G_CR2, IF_Gbps(50ULL) }, \ + { IFM_ETHER | IFM_50G_KR2, IF_Gbps(50ULL) }, \ \ { IFM_TOKEN | IFM_TOK_STP4, IF_Mbps(4) }, \ { IFM_TOKEN | IFM_TOK_STP16, IF_Mbps(16) }, \ @@ -730,8 +829,6 @@ struct ifmedia_status_description { { "no network", "active" } }, \ { IFM_ATM, IFM_AVALID, IFM_ACTIVE, \ { "no network", "active" } }, \ - { IFM_CARP, IFM_AVALID, IFM_ACTIVE, \ - { "backup", "master" } }, \ { 0, 0, 0, \ { NULL, NULL } } \ } diff --git a/freebsd/sys/net/if_mib.c b/freebsd/sys/net/if_mib.c index ec7a6984..d91c94ab 100644 --- a/freebsd/sys/net/if_mib.c +++ b/freebsd/sys/net/if_mib.c @@ -34,10 +34,12 @@ #include <rtems/bsd/sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> +#include <sys/malloc.h> #include <sys/socket.h> #include <sys/sysctl.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_mib.h> #include <net/vnet.h> @@ -68,9 +70,9 @@ SYSCTL_DECL(_net_link_generic); static SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RW, 0, "Variables global to all interfaces"); -SYSCTL_VNET_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD, - &VNET_NAME(if_index), 0, - "Number of configured interfaces"); +SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, + CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(if_index), 0, + "Number of configured interfaces"); static int sysctl_ifdata(SYSCTL_HANDLER_ARGS) /* XXX bad syntax! */ @@ -100,37 +102,18 @@ sysctl_ifdata(SYSCTL_HANDLER_ARGS) /* XXX bad syntax! 
*/ bzero(&ifmd, sizeof(ifmd)); strlcpy(ifmd.ifmd_name, ifp->if_xname, sizeof(ifmd.ifmd_name)); -#define COPY(fld) ifmd.ifmd_##fld = ifp->if_##fld - COPY(pcount); - COPY(data); -#undef COPY + ifmd.ifmd_pcount = ifp->if_pcount; + if_data_copy(ifp, &ifmd.ifmd_data); + ifmd.ifmd_flags = ifp->if_flags | ifp->if_drv_flags; ifmd.ifmd_snd_len = ifp->if_snd.ifq_len; ifmd.ifmd_snd_maxlen = ifp->if_snd.ifq_maxlen; - ifmd.ifmd_snd_drops = ifp->if_snd.ifq_drops; + ifmd.ifmd_snd_drops = + ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); error = SYSCTL_OUT(req, &ifmd, sizeof ifmd); - if (error || !req->newptr) - goto out; - - error = SYSCTL_IN(req, &ifmd, sizeof ifmd); if (error) goto out; - -#define DONTCOPY(fld) ifmd.ifmd_data.ifi_##fld = ifp->if_data.ifi_##fld - DONTCOPY(type); - DONTCOPY(physical); - DONTCOPY(addrlen); - DONTCOPY(hdrlen); - DONTCOPY(mtu); - DONTCOPY(metric); - DONTCOPY(baudrate); -#undef DONTCOPY -#define COPY(fld) ifp->if_##fld = ifmd.ifmd_##fld - COPY(data); - ifp->if_snd.ifq_maxlen = ifmd.ifmd_snd_maxlen; - ifp->if_snd.ifq_drops = ifmd.ifmd_snd_drops; -#undef COPY break; case IFDATA_LINKSPECIFIC: diff --git a/freebsd/sys/net/if_pflog.h b/freebsd/sys/net/if_pflog.h new file mode 100644 index 00000000..0faeb7d4 --- /dev/null +++ b/freebsd/sys/net/if_pflog.h @@ -0,0 +1,66 @@ +/* $OpenBSD: if_pflog.h,v 1.13 2006/10/23 12:46:09 henning Exp $ */ +/* + * Copyright 2001 Niels Provos <provos@citi.umich.edu> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _NET_IF_PFLOG_H_ +#define _NET_IF_PFLOG_H_ + +#define PFLOGIFS_MAX 16 + +#define PFLOG_RULESET_NAME_SIZE 16 + +struct pfloghdr { + u_int8_t length; + sa_family_t af; + u_int8_t action; + u_int8_t reason; + char ifname[IFNAMSIZ]; + char ruleset[PFLOG_RULESET_NAME_SIZE]; + u_int32_t rulenr; + u_int32_t subrulenr; + uid_t uid; + pid_t pid; + uid_t rule_uid; + pid_t rule_pid; + u_int8_t dir; + u_int8_t pad[3]; +}; + +#define PFLOG_HDRLEN sizeof(struct pfloghdr) +/* minus pad, also used as a signature */ +#define PFLOG_REAL_HDRLEN offsetof(struct pfloghdr, pad) + +#ifdef _KERNEL +struct pf_rule; +struct pf_ruleset; +struct pfi_kif; +struct pf_pdesc; + +#define PFLOG_PACKET(i,a,b,c,d,e,f,g,h,di) do { \ + if (pflog_packet_ptr != NULL) \ + pflog_packet_ptr(i,a,b,c,d,e,f,g,h,di); \ +} while (0) +#endif /* _KERNEL */ +#endif /* _NET_IF_PFLOG_H_ */ diff --git a/freebsd/sys/net/if_pfsync.h b/freebsd/sys/net/if_pfsync.h new file mode 100644 index 00000000..5c4ba631 --- /dev/null +++ b/freebsd/sys/net/if_pfsync.h @@ -0,0 +1,265 @@ +/*- + * Copyright (c) 2001 Michael Shalayeff + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 2008 David Gwynne <dlg@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * $OpenBSD: if_pfsync.h,v 1.35 2008/06/29 08:42:15 mcbride Exp $ + * $FreeBSD$ + */ + + +#ifndef _NET_IF_PFSYNC_H_ +#define _NET_IF_PFSYNC_H_ + +#define PFSYNC_VERSION 5 +#define PFSYNC_DFLTTL 255 + +#define PFSYNC_ACT_CLR 0 /* clear all states */ +#define PFSYNC_ACT_INS 1 /* insert state */ +#define PFSYNC_ACT_INS_ACK 2 /* ack of insterted state */ +#define PFSYNC_ACT_UPD 3 /* update state */ +#define PFSYNC_ACT_UPD_C 4 /* "compressed" update state */ +#define PFSYNC_ACT_UPD_REQ 5 /* request "uncompressed" state */ +#define PFSYNC_ACT_DEL 6 /* delete state */ +#define PFSYNC_ACT_DEL_C 7 /* "compressed" delete state */ +#define PFSYNC_ACT_INS_F 8 /* insert fragment */ +#define PFSYNC_ACT_DEL_F 9 /* delete fragments */ +#define PFSYNC_ACT_BUS 10 /* bulk update status */ +#define PFSYNC_ACT_TDB 11 /* TDB replay counter update */ +#define PFSYNC_ACT_EOF 12 /* end of frame */ +#define PFSYNC_ACT_MAX 13 + +/* + * A pfsync frame is built from a header followed by several sections which + * are all prefixed with their own subheaders. Frames must be terminated with + * an EOF subheader. + * + * | ... | + * | IP header | + * +============================+ + * | pfsync_header | + * +----------------------------+ + * | pfsync_subheader | + * +----------------------------+ + * | first action fields | + * | ... | + * +----------------------------+ + * | pfsync_subheader | + * +----------------------------+ + * | second action fields | + * | ... 
| + * +----------------------------+ + * | EOF pfsync_subheader | + * +----------------------------+ + * | HMAC | + * +============================+ + */ + +/* + * Frame header + */ + +struct pfsync_header { + u_int8_t version; + u_int8_t _pad; + u_int16_t len; + u_int8_t pfcksum[PF_MD5_DIGEST_LENGTH]; +} __packed; + +/* + * Frame region subheader + */ + +struct pfsync_subheader { + u_int8_t action; + u_int8_t _pad; + u_int16_t count; +} __packed; + +/* + * CLR + */ + +struct pfsync_clr { + char ifname[IFNAMSIZ]; + u_int32_t creatorid; +} __packed; + +/* + * INS, UPD, DEL + */ + +/* these use struct pfsync_state in pfvar.h */ + +/* + * INS_ACK + */ + +struct pfsync_ins_ack { + u_int64_t id; + u_int32_t creatorid; +} __packed; + +/* + * UPD_C + */ + +struct pfsync_upd_c { + u_int64_t id; + struct pfsync_state_peer src; + struct pfsync_state_peer dst; + u_int32_t creatorid; + u_int32_t expire; + u_int8_t timeout; + u_int8_t _pad[3]; +} __packed; + +/* + * UPD_REQ + */ + +struct pfsync_upd_req { + u_int64_t id; + u_int32_t creatorid; +} __packed; + +/* + * DEL_C + */ + +struct pfsync_del_c { + u_int64_t id; + u_int32_t creatorid; +} __packed; + +/* + * INS_F, DEL_F + */ + +/* not implemented (yet) */ + +/* + * BUS + */ + +struct pfsync_bus { + u_int32_t creatorid; + u_int32_t endtime; + u_int8_t status; +#define PFSYNC_BUS_START 1 +#define PFSYNC_BUS_END 2 + u_int8_t _pad[3]; +} __packed; + +/* + * TDB + */ + +struct pfsync_tdb { + u_int32_t spi; + union sockaddr_union dst; + u_int32_t rpl; + u_int64_t cur_bytes; + u_int8_t sproto; + u_int8_t updates; + u_int8_t _pad[2]; +} __packed; + +#define PFSYNC_HDRLEN sizeof(struct pfsync_header) + +struct pfsyncstats { + u_int64_t pfsyncs_ipackets; /* total input packets, IPv4 */ + u_int64_t pfsyncs_ipackets6; /* total input packets, IPv6 */ + u_int64_t pfsyncs_badif; /* not the right interface */ + u_int64_t pfsyncs_badttl; /* TTL is not PFSYNC_DFLTTL */ + u_int64_t pfsyncs_hdrops; /* packets shorter than hdr */ + u_int64_t 
pfsyncs_badver; /* bad (incl unsupp) version */ + u_int64_t pfsyncs_badact; /* bad action */ + u_int64_t pfsyncs_badlen; /* data length does not match */ + u_int64_t pfsyncs_badauth; /* bad authentication */ + u_int64_t pfsyncs_stale; /* stale state */ + u_int64_t pfsyncs_badval; /* bad values */ + u_int64_t pfsyncs_badstate; /* insert/lookup failed */ + + u_int64_t pfsyncs_opackets; /* total output packets, IPv4 */ + u_int64_t pfsyncs_opackets6; /* total output packets, IPv6 */ + u_int64_t pfsyncs_onomem; /* no memory for an mbuf */ + u_int64_t pfsyncs_oerrors; /* ip output error */ + + u_int64_t pfsyncs_iacts[PFSYNC_ACT_MAX]; + u_int64_t pfsyncs_oacts[PFSYNC_ACT_MAX]; +}; + +/* + * Configuration structure for SIOCSETPFSYNC SIOCGETPFSYNC + */ +struct pfsyncreq { + char pfsyncr_syncdev[IFNAMSIZ]; + struct in_addr pfsyncr_syncpeer; + int pfsyncr_maxupdates; + int pfsyncr_defer; +}; + +#define SIOCSETPFSYNC _IOW('i', 247, struct ifreq) +#define SIOCGETPFSYNC _IOWR('i', 248, struct ifreq) + +#ifdef _KERNEL + +/* + * this shows where a pf state is with respect to the syncing. + */ +#define PFSYNC_S_INS 0x00 +#define PFSYNC_S_IACK 0x01 +#define PFSYNC_S_UPD 0x02 +#define PFSYNC_S_UPD_C 0x03 +#define PFSYNC_S_DEL 0x04 +#define PFSYNC_S_COUNT 0x05 + +#define PFSYNC_S_DEFER 0xfe +#define PFSYNC_S_NONE 0xff + +#define PFSYNC_SI_IOCTL 0x01 +#define PFSYNC_SI_CKSUM 0x02 +#define PFSYNC_SI_ACK 0x04 + +#endif /* _KERNEL */ + +#endif /* _NET_IF_PFSYNC_H_ */ diff --git a/freebsd/sys/net/if_sppp.h b/freebsd/sys/net/if_sppp.h index 97f94b39..23a08e77 100644 --- a/freebsd/sys/net/if_sppp.h +++ b/freebsd/sys/net/if_sppp.h @@ -78,7 +78,7 @@ struct sauth { /* * Don't change the order of this. Ordering the phases this way allows - * for a comparision of ``pp_phase >= PHASE_AUTHENTICATE'' in order to + * for a comparison of ``pp_phase >= PHASE_AUTHENTICATE'' in order to * know whether LCP is up. 
*/ enum ppp_phase { diff --git a/freebsd/sys/net/if_spppfr.c b/freebsd/sys/net/if_spppfr.c index 93bbaeba..d30509d5 100644 --- a/freebsd/sys/net/if_spppfr.c +++ b/freebsd/sys/net/if_spppfr.c @@ -27,10 +27,9 @@ #include <rtems/bsd/sys/param.h> -#if defined(__FreeBSD__) && __FreeBSD__ >= 3 +#if defined(__FreeBSD__) #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> -#include <rtems/bsd/local/opt_ipx.h> #endif #ifdef NetBSD1_3 @@ -47,7 +46,7 @@ #include <sys/sockio.h> #include <sys/socket.h> #include <sys/syslog.h> -#if defined(__FreeBSD__) && __FreeBSD__ >= 3 +#if defined(__FreeBSD__) #include <sys/random.h> #endif #include <sys/malloc.h> @@ -60,6 +59,7 @@ #endif #include <net/if.h> +#include <net/if_var.h> #include <net/netisr.h> #include <net/if_types.h> #include <net/route.h> @@ -86,11 +86,6 @@ # include <net/ethertypes.h> #endif -#ifdef IPX -#include <netipx/ipx.h> -#include <netipx/ipx_if.h> -#endif - #include <net/if_sppp.h> /* @@ -151,7 +146,7 @@ struct arp_req { unsigned short ptarget2; } __packed; -#if defined(__FreeBSD__) && __FreeBSD__ >= 3 && __FreeBSD_version < 501113 +#if defined(__FreeBSD__) && __FreeBSD_version < 501113 #define SPP_FMT "%s%d: " #define SPP_ARGS(ifp) (ifp)->if_name, (ifp)->if_unit #else @@ -257,9 +252,9 @@ bad: m_freem (m); switch (proto) { default: - ++ifp->if_noproto; -drop: ++ifp->if_ierrors; - ++ifp->if_iqdrops; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); +drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); m_freem (m); return; #ifdef INET @@ -267,16 +262,6 @@ drop: ++ifp->if_ierrors; isr = NETISR_IP; break; #endif -#ifdef IPX - case ETHERTYPE_IPX: - isr = NETISR_IPX; - break; -#endif -#ifdef NETATALK - case ETHERTYPE_AT: - isr = NETISR_ATALK; - break; -#endif } if (! (ifp->if_flags & IFF_UP)) @@ -306,7 +291,7 @@ struct mbuf *sppp_fr_header (struct sppp *sp, struct mbuf *m, /* Prepend the space for Frame Relay header. */ hlen = (family == AF_INET) ? 
4 : 10; - M_PREPEND (m, hlen, M_DONTWAIT); + M_PREPEND (m, hlen, M_NOWAIT); if (! m) return 0; h = mtod (m, u_char*); @@ -346,21 +331,11 @@ struct mbuf *sppp_fr_header (struct sppp *sp, struct mbuf *m, h[3] = FR_IP; return m; #endif -#ifdef IPX - case AF_IPX: - type = ETHERTYPE_IPX; - break; -#endif #ifdef NS case AF_NS: type = 0x8137; break; #endif -#ifdef NETATALK - case AF_APPLETALK: - type = ETHERTYPE_AT; - break; -#endif } h[3] = FR_PADDING; h[4] = FR_SNAP; @@ -383,7 +358,7 @@ void sppp_fr_keepalive (struct sppp *sp) unsigned char *h, *p; struct mbuf *m; - MGETHDR (m, M_DONTWAIT, MT_DATA); + MGETHDR (m, M_NOWAIT, MT_DATA); if (! m) return; m->m_pkthdr.rcvif = 0; @@ -421,7 +396,7 @@ void sppp_fr_keepalive (struct sppp *sp) (u_char) sp->pp_rseq[IDX_LCP]); if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) - ++ifp->if_oerrors; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* @@ -503,7 +478,7 @@ static void sppp_fr_arp (struct sppp *sp, struct arp_req *req, (unsigned char) his_ip_address); /* Send the Inverse ARP reply. */ - MGETHDR (m, M_DONTWAIT, MT_DATA); + MGETHDR (m, M_NOWAIT, MT_DATA); if (! m) return; m->m_pkthdr.len = m->m_len = 10 + sizeof (*reply); @@ -535,7 +510,7 @@ static void sppp_fr_arp (struct sppp *sp, struct arp_req *req, reply->ptarget2 = htonl (his_ip_address) >> 16; if (! 
IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) - ++ifp->if_oerrors; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* diff --git a/freebsd/sys/net/if_spppsubr.c b/freebsd/sys/net/if_spppsubr.c index fa6a7c1b..e7a62277 100644 --- a/freebsd/sys/net/if_spppsubr.c +++ b/freebsd/sys/net/if_spppsubr.c @@ -27,11 +27,12 @@ #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> -#include <rtems/bsd/local/opt_ipx.h> #include <sys/systm.h> #include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> #include <sys/module.h> +#include <sys/rmlock.h> #include <sys/sockio.h> #include <sys/socket.h> #include <sys/syslog.h> @@ -42,6 +43,7 @@ #include <sys/md5.h> #include <net/if.h> +#include <net/if_var.h> #include <net/netisr.h> #include <net/if_types.h> #include <net/route.h> @@ -66,11 +68,6 @@ #include <netinet/if_ether.h> -#ifdef IPX -#include <netipx/ipx.h> -#include <netipx/ipx_if.h> -#endif - #include <net/if_sppp.h> #define IOCTL_CMD_T u_long @@ -264,7 +261,7 @@ static const u_short interactive_ports[8] = { int debug = ifp->if_flags & IFF_DEBUG static int sppp_output(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *dst, struct route *ro); + const struct sockaddr *dst, struct route *ro); static void sppp_cisco_send(struct sppp *sp, int type, long par1, long par2); static void sppp_cisco_input(struct sppp *sp, struct mbuf *m); @@ -525,7 +522,7 @@ sppp_input(struct ifnet *ifp, struct mbuf *m) if (ifp->if_flags & IFF_UP) /* Count received bytes, add FCS and one flag */ - ifp->if_ibytes += m->m_pkthdr.len + 3; + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len + 3); if (m->m_pkthdr.len <= PPP_HEADER_LEN) { /* Too small packet, drop it. 
*/ @@ -537,8 +534,8 @@ sppp_input(struct ifnet *ifp, struct mbuf *m) m_freem (m); SPPP_UNLOCK(sp); drop2: - ++ifp->if_ierrors; - ++ifp->if_iqdrops; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return; } @@ -577,7 +574,7 @@ sppp_input(struct ifnet *ifp, struct mbuf *m) sppp_cp_send (sp, PPP_LCP, PROTO_REJ, ++sp->pp_seq[IDX_LCP], m->m_pkthdr.len + 2, &h->protocol); - ++ifp->if_noproto; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto drop; case PPP_LCP: sppp_cp_input(&lcp, sp, m); @@ -631,7 +628,7 @@ sppp_input(struct ifnet *ifp, struct mbuf *m) * enough leading space in the existing mbuf). */ m_adj(m, vjlen); - M_PREPEND(m, hlen, M_DONTWAIT); + M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) { SPPP_UNLOCK(sp); goto drop2; @@ -673,14 +670,6 @@ sppp_input(struct ifnet *ifp, struct mbuf *m) do_account++; break; #endif -#ifdef IPX - case PPP_IPX: - /* IPX IPXCP not implemented yet */ - if (sp->pp_phase == PHASE_NETWORK) - isr = NETISR_IPX; - do_account++; - break; -#endif } break; case CISCO_MULTICAST: @@ -697,7 +686,7 @@ sppp_input(struct ifnet *ifp, struct mbuf *m) } switch (ntohs (h->protocol)) { default: - ++ifp->if_noproto; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto invalid; case CISCO_KEEPALIVE: sppp_cisco_input (sp, m); @@ -716,12 +705,6 @@ sppp_input(struct ifnet *ifp, struct mbuf *m) do_account++; break; #endif -#ifdef IPX - case ETHERTYPE_IPX: - isr = NETISR_IPX; - do_account++; - break; -#endif } break; default: /* Invalid PPP packet. */ @@ -787,19 +770,18 @@ sppp_ifstart(struct ifnet *ifp) * Enqueue transmit packet. 
*/ static int -sppp_output(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *dst, struct route *ro) +sppp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, + struct route *ro) { struct sppp *sp = IFP2SP(ifp); struct ppp_header *h; struct ifqueue *ifq = NULL; - int s, error, rv = 0; + int error, rv = 0; #ifdef INET int ipproto = PPP_IP; #endif int debug = ifp->if_flags & IFF_DEBUG; - s = splimp(); SPPP_LOCK(sp); if (!(ifp->if_flags & IFF_UP) || @@ -810,7 +792,6 @@ sppp_output(struct ifnet *ifp, struct mbuf *m, #endif m_freem (m); SPPP_UNLOCK(sp); - splx (s); return (ENETDOWN); } @@ -834,9 +815,7 @@ sppp_output(struct ifnet *ifp, struct mbuf *m, * to start LCP for it. */ ifp->if_drv_flags |= IFF_DRV_RUNNING; - splx(s); lcp.Open(sp); - s = splimp(); } #ifdef INET @@ -860,7 +839,6 @@ sppp_output(struct ifnet *ifp, struct mbuf *m, { m_freem(m); SPPP_UNLOCK(sp); - splx(s); if(ip->ip_p == IPPROTO_TCP) return(EADDRNOTAVAIL); else @@ -905,7 +883,6 @@ sppp_output(struct ifnet *ifp, struct mbuf *m, default: m_freem(m); SPPP_UNLOCK(sp); - splx(s); return (EINVAL); } } @@ -928,14 +905,13 @@ sppp_output(struct ifnet *ifp, struct mbuf *m, /* * Prepend general data packet PPP header. For now, IP only. */ - M_PREPEND (m, PPP_HEADER_LEN, M_DONTWAIT); + M_PREPEND (m, PPP_HEADER_LEN, M_NOWAIT); if (! m) { nobufs: if (debug) log(LOG_DEBUG, SPP_FMT "no memory for transmit header\n", SPP_ARGS(ifp)); - ++ifp->if_oerrors; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SPPP_UNLOCK(sp); - splx (s); return (ENOBUFS); } /* @@ -992,17 +968,10 @@ nobufs: if (debug) } break; #endif -#ifdef IPX - case AF_IPX: /* Novell IPX Protocol */ - h->protocol = htons (sp->pp_mode == IFF_CISCO ? 
- ETHERTYPE_IPX : PPP_IPX); - break; -#endif default: m_freem (m); - ++ifp->if_oerrors; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SPPP_UNLOCK(sp); - splx (s); return (EAFNOSUPPORT); } @@ -1016,13 +985,11 @@ out: else IFQ_HANDOFF_ADJ(ifp, m, 3, error); if (error) { - ++ifp->if_oerrors; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); SPPP_UNLOCK(sp); - splx (s); return (rv? rv: ENOBUFS); } SPPP_UNLOCK(sp); - splx (s); /* * Unlike in sppp_input(), we can always bump the timestamp * here since sppp_output() is only called on behalf of @@ -1042,7 +1009,7 @@ sppp_attach(struct ifnet *ifp) mtx_init(&sp->mtx, "sppp", MTX_NETWORK_LOCK, MTX_DEF | MTX_RECURSE); /* Initialize keepalive handler. */ - callout_init(&sp->keepalive_callout, CALLOUT_MPSAFE); + callout_init(&sp->keepalive_callout, 1); callout_reset(&sp->keepalive_callout, hz * 10, sppp_keepalive, (void *)sp); @@ -1074,7 +1041,7 @@ sppp_attach(struct ifnet *ifp) #ifdef INET6 sp->confflags |= CONF_ENABLE_IPV6; #endif - callout_init(&sp->ifstart_callout, CALLOUT_MPSAFE); + callout_init(&sp->ifstart_callout, 1); sp->if_start = ifp->if_start; ifp->if_start = sppp_ifstart; sp->pp_comp = malloc(sizeof(struct slcompress), M_TEMP, M_WAITOK); @@ -1139,14 +1106,12 @@ int sppp_isempty(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); - int empty, s; + int empty; - s = splimp(); SPPP_LOCK(sp); empty = !sp->pp_fastq.ifq_head && !sp->pp_cpq.ifq_head && !SP2IFP(sp)->if_snd.ifq_head; SPPP_UNLOCK(sp); - splx(s); return (empty); } @@ -1158,9 +1123,7 @@ sppp_dequeue(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); struct mbuf *m; - int s; - s = splimp(); SPPP_LOCK(sp); /* * Process only the control protocol queue until we have at @@ -1177,7 +1140,6 @@ sppp_dequeue(struct ifnet *ifp) IF_DEQUEUE (&SP2IFP(sp)->if_snd, m); } SPPP_UNLOCK(sp); - splx(s); return m; } @@ -1189,9 +1151,7 @@ sppp_pick(struct ifnet *ifp) { struct sppp *sp = IFP2SP(ifp); struct mbuf *m; - int s; - s = splimp (); SPPP_LOCK(sp); m = sp->pp_cpq.ifq_head; @@ 
-1202,7 +1162,6 @@ sppp_pick(struct ifnet *ifp) if ((m = sp->pp_fastq.ifq_head) == NULL) m = SP2IFP(sp)->if_snd.ifq_head; SPPP_UNLOCK(sp); - splx (s); return (m); } @@ -1214,14 +1173,12 @@ sppp_ioctl(struct ifnet *ifp, IOCTL_CMD_T cmd, void *data) { struct ifreq *ifr = (struct ifreq*) data; struct sppp *sp = IFP2SP(ifp); - int s, rv, going_up, going_down, newmode; + int rv, going_up, going_down, newmode; - s = splimp(); SPPP_LOCK(sp); rv = 0; switch (cmd) { case SIOCAIFADDR: - case SIOCSIFDSTADDR: break; case SIOCSIFADDR: @@ -1322,7 +1279,6 @@ sppp_ioctl(struct ifnet *ifp, IOCTL_CMD_T cmd, void *data) rv = ENOTTY; } SPPP_UNLOCK(sp); - splx(s); return rv; } @@ -1414,7 +1370,7 @@ sppp_cisco_send(struct sppp *sp, int type, long par1, long par2) getmicrouptime(&tv); - MGETHDR (m, M_DONTWAIT, MT_DATA); + MGETHDR (m, M_NOWAIT, MT_DATA); if (! m) return; m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + CISCO_PACKET_LEN; @@ -1441,7 +1397,7 @@ sppp_cisco_send(struct sppp *sp, int type, long par1, long par2) (u_long)ch->par2, (u_int)ch->rel, (u_int)ch->time0, (u_int)ch->time1); if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* @@ -1462,7 +1418,7 @@ sppp_cp_send(struct sppp *sp, u_short proto, u_char type, if (len > MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN) len = MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN; - MGETHDR (m, M_DONTWAIT, MT_DATA); + MGETHDR (m, M_NOWAIT, MT_DATA); if (! m) return; m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + LCP_HEADER_LEN + len; @@ -1490,7 +1446,7 @@ sppp_cp_send(struct sppp *sp, u_short proto, u_char type, log(-1, ">\n"); } if (! 
IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* @@ -1532,7 +1488,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) log(-1, SPP_FMT "%s invalid conf-req length %d\n", SPP_ARGS(ifp), cp->name, len); - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } /* handle states where RCR doesn't get a SCA/SCN */ @@ -1588,7 +1544,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case CONF_ACK: @@ -1597,7 +1553,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) log(-1, SPP_FMT "%s id mismatch 0x%x != 0x%x\n", SPP_ARGS(ifp), cp->name, h->ident, sp->confid[cp->protoidx]); - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } switch (sp->state[cp->protoidx]) { @@ -1632,7 +1588,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case CONF_NAK: @@ -1642,7 +1598,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) log(-1, SPP_FMT "%s id mismatch 0x%x != 0x%x\n", SPP_ARGS(ifp), cp->name, h->ident, sp->confid[cp->protoidx]); - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } if (h->type == CONF_NAK) @@ -1682,7 +1638,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; @@ -1715,7 +1671,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), 
sppp_state_name(sp->state[cp->protoidx])); - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case TERM_ACK: @@ -1746,7 +1702,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case CODE_REJ: @@ -1773,7 +1729,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; case PROTO_REJ: @@ -1832,7 +1788,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) SPP_ARGS(ifp), cp->name, sppp_cp_type_name(h->type), sppp_state_name(sp->state[cp->protoidx])); - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } break; } @@ -1848,7 +1804,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) if (debug) log(-1, SPP_FMT "lcp echo req but lcp closed\n", SPP_ARGS(ifp)); - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } if (len < 8) { @@ -1882,7 +1838,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) if (cp->proto != PPP_LCP) goto illegal; if (h->ident != sp->lcp.echoid) { - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); break; } if (len < 8) { @@ -1907,7 +1863,7 @@ sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m) SPP_ARGS(ifp), cp->name, h->type); sppp_cp_send(sp, cp->proto, CODE_REJ, ++sp->pp_seq[cp->protoidx], m->m_pkthdr.len, h); - ++ifp->if_ierrors; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } } @@ -2072,9 +2028,7 @@ static void sppp_to_event(const struct cp *cp, struct sppp *sp) { STDDCL; - int s; - s = splimp(); SPPP_LOCK(sp); if (debug) log(LOG_DEBUG, SPP_FMT "%s TO(%s) rst_counter = %d\n", @@ -2124,7 +2078,6 @@ sppp_to_event(const struct cp *cp, 
struct sppp *sp) } SPPP_UNLOCK(sp); - splx(s); } /* @@ -2196,7 +2149,7 @@ sppp_lcp_init(struct sppp *sp) sp->lcp.max_terminate = 2; sp->lcp.max_configure = 10; sp->lcp.max_failure = 10; - callout_init(&sp->ch[IDX_LCP], CALLOUT_MPSAFE); + callout_init(&sp->ch[IDX_LCP], 1); } static void @@ -2887,7 +2840,7 @@ sppp_ipcp_init(struct sppp *sp) sp->fail_counter[IDX_IPCP] = 0; sp->pp_seq[IDX_IPCP] = 0; sp->pp_rseq[IDX_IPCP] = 0; - callout_init(&sp->ch[IDX_IPCP], CALLOUT_MPSAFE); + callout_init(&sp->ch[IDX_IPCP], 1); } static void @@ -3011,7 +2964,7 @@ sppp_ipcp_RCR(struct sppp *sp, struct lcp_header *h, int len) * since our algorithm always uses the * original option to NAK it with new values, * things would become more complicated. In - * pratice, the only commonly implemented IP + * practice, the only commonly implemented IP * compression option is VJ anyway, so the * difference is negligible. */ @@ -3446,7 +3399,7 @@ sppp_ipv6cp_init(struct sppp *sp) sp->fail_counter[IDX_IPV6CP] = 0; sp->pp_seq[IDX_IPV6CP] = 0; sp->pp_rseq[IDX_IPV6CP] = 0; - callout_init(&sp->ch[IDX_IPV6CP], CALLOUT_MPSAFE); + callout_init(&sp->ch[IDX_IPV6CP], 1); } static void @@ -4027,7 +3980,7 @@ sppp_chap_input(struct sppp *sp, struct mbuf *m) { STDDCL; struct lcp_header *h; - int len, x; + int len; u_char *value, *name, digest[AUTHKEYLEN], dsize; int value_len, name_len; MD5_CTX ctx; @@ -4104,7 +4057,6 @@ sppp_chap_input(struct sppp *sp, struct mbuf *m) } log(-1, "\n"); } - x = splimp(); SPPP_LOCK(sp); sp->pp_flags &= ~PP_NEEDAUTH; if (sp->myauth.proto == PPP_CHAP && @@ -4116,11 +4068,9 @@ sppp_chap_input(struct sppp *sp, struct mbuf *m) * to network phase. 
*/ SPPP_UNLOCK(sp); - splx(x); break; } SPPP_UNLOCK(sp); - splx(x); sppp_phase_network(sp); break; @@ -4254,7 +4204,7 @@ sppp_chap_init(struct sppp *sp) sp->fail_counter[IDX_CHAP] = 0; sp->pp_seq[IDX_CHAP] = 0; sp->pp_rseq[IDX_CHAP] = 0; - callout_init(&sp->ch[IDX_CHAP], CALLOUT_MPSAFE); + callout_init(&sp->ch[IDX_CHAP], 1); } static void @@ -4282,9 +4232,7 @@ sppp_chap_TO(void *cookie) { struct sppp *sp = (struct sppp *)cookie; STDDCL; - int s; - s = splimp(); SPPP_LOCK(sp); if (debug) log(LOG_DEBUG, SPP_FMT "chap TO(%s) rst_counter = %d\n", @@ -4315,14 +4263,13 @@ sppp_chap_TO(void *cookie) } SPPP_UNLOCK(sp); - splx(s); } static void sppp_chap_tlu(struct sppp *sp) { STDDCL; - int i, x; + int i; i = 0; sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure; @@ -4350,10 +4297,9 @@ sppp_chap_tlu(struct sppp *sp) if ((sp->hisauth.flags & AUTHFLAG_NORECHALLENGE) == 0) log(-1, "next re-challenge in %d seconds\n", i); else - log(-1, "re-challenging supressed\n"); + log(-1, "re-challenging suppressed\n"); } - x = splimp(); SPPP_LOCK(sp); /* indicate to LCP that we need to be closed down */ sp->lcp.protos |= (1 << IDX_CHAP); @@ -4365,11 +4311,9 @@ sppp_chap_tlu(struct sppp *sp) * phase. */ SPPP_UNLOCK(sp); - splx(x); return; } SPPP_UNLOCK(sp); - splx(x); /* * If we are already in phase network, we are done here. This @@ -4438,7 +4382,7 @@ sppp_pap_input(struct sppp *sp, struct mbuf *m) { STDDCL; struct lcp_header *h; - int len, x; + int len; u_char *name, *passwd, mlen; int name_len, passwd_len; @@ -4525,7 +4469,6 @@ sppp_pap_input(struct sppp *sp, struct mbuf *m) } log(-1, "\n"); } - x = splimp(); SPPP_LOCK(sp); sp->pp_flags &= ~PP_NEEDAUTH; if (sp->myauth.proto == PPP_PAP && @@ -4537,11 +4480,9 @@ sppp_pap_input(struct sppp *sp, struct mbuf *m) * to network phase. 
*/ SPPP_UNLOCK(sp); - splx(x); break; } SPPP_UNLOCK(sp); - splx(x); sppp_phase_network(sp); break; @@ -4585,8 +4526,8 @@ sppp_pap_init(struct sppp *sp) sp->fail_counter[IDX_PAP] = 0; sp->pp_seq[IDX_PAP] = 0; sp->pp_rseq[IDX_PAP] = 0; - callout_init(&sp->ch[IDX_PAP], CALLOUT_MPSAFE); - callout_init(&sp->pap_my_to_ch, CALLOUT_MPSAFE); + callout_init(&sp->ch[IDX_PAP], 1); + callout_init(&sp->pap_my_to_ch, 1); } static void @@ -4622,9 +4563,7 @@ sppp_pap_TO(void *cookie) { struct sppp *sp = (struct sppp *)cookie; STDDCL; - int s; - s = splimp(); SPPP_LOCK(sp); if (debug) log(LOG_DEBUG, SPP_FMT "pap TO(%s) rst_counter = %d\n", @@ -4650,7 +4589,6 @@ sppp_pap_TO(void *cookie) } SPPP_UNLOCK(sp); - splx(s); } /* @@ -4677,7 +4615,6 @@ static void sppp_pap_tlu(struct sppp *sp) { STDDCL; - int x; sp->rst_counter[IDX_PAP] = sp->lcp.max_configure; @@ -4685,7 +4622,6 @@ sppp_pap_tlu(struct sppp *sp) log(LOG_DEBUG, SPP_FMT "%s tlu\n", SPP_ARGS(ifp), pap.name); - x = splimp(); SPPP_LOCK(sp); /* indicate to LCP that we need to be closed down */ sp->lcp.protos |= (1 << IDX_PAP); @@ -4697,11 +4633,9 @@ sppp_pap_tlu(struct sppp *sp) * phase. */ SPPP_UNLOCK(sp); - splx(x); return; } SPPP_UNLOCK(sp); - splx(x); sppp_phase_network(sp); } @@ -4766,7 +4700,7 @@ sppp_auth_send(const struct cp *cp, struct sppp *sp, const char *msg; va_list ap; - MGETHDR (m, M_DONTWAIT, MT_DATA); + MGETHDR (m, M_NOWAIT, MT_DATA); if (! m) return; m->m_pkthdr.rcvif = 0; @@ -4810,7 +4744,7 @@ sppp_auth_send(const struct cp *cp, struct sppp *sp, log(-1, ">\n"); } if (! 
IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3)) - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } /* @@ -4823,7 +4757,7 @@ sppp_qflush(struct ifqueue *ifq) n = ifq->ifq_head; while ((m = n)) { - n = m->m_act; + n = m->m_nextpkt; m_freem (m); } ifq->ifq_head = 0; @@ -4839,9 +4773,7 @@ sppp_keepalive(void *dummy) { struct sppp *sp = (struct sppp*)dummy; struct ifnet *ifp = SP2IFP(sp); - int s; - s = splimp(); SPPP_LOCK(sp); /* Keepalive mode disabled or channel down? */ if (! (sp->pp_flags & PP_KEEPALIVE) || @@ -4884,7 +4816,6 @@ sppp_keepalive(void *dummy) } out: SPPP_UNLOCK(sp); - splx(s); callout_reset(&sp->keepalive_callout, hz * 10, sppp_keepalive, (void *)sp); } @@ -4906,7 +4837,7 @@ sppp_get_ip_addrs(struct sppp *sp, u_long *src, u_long *dst, u_long *srcmask) * Pick the first AF_INET address from the list, * aliases don't make any sense on a p2p link anyway. */ - si = 0; + si = NULL; if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { @@ -4934,7 +4865,7 @@ sppp_get_ip_addrs(struct sppp *sp, u_long *src, u_long *dst, u_long *srcmask) #ifdef INET /* - * Set my IP address. Must be called at splimp. + * Set my IP address. */ static void sppp_set_ip_addr(struct sppp *sp, u_long src) @@ -4948,7 +4879,7 @@ sppp_set_ip_addr(struct sppp *sp, u_long src) * Pick the first AF_INET address from the list, * aliases don't make any sense on a p2p link anyway. */ - si = 0; + si = NULL; if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) { @@ -5051,7 +4982,7 @@ sppp_gen_ip6_addr(struct sppp *sp, struct in6_addr *addr) } /* - * Set my IPv6 address. Must be called at splimp. + * Set my IPv6 address. 
*/ static void sppp_set_ip6_addr(struct sppp *sp, const struct in6_addr *src) @@ -5126,14 +5057,15 @@ sppp_params(struct sppp *sp, u_long cmd, void *data) struct spppreq *spr; int rv = 0; - if ((spr = malloc(sizeof(struct spppreq), M_TEMP, M_NOWAIT)) == 0) + if ((spr = malloc(sizeof(struct spppreq), M_TEMP, M_NOWAIT)) == NULL) return (EAGAIN); /* * ifr->ifr_data is supposed to point to a struct spppreq. * Check the cmd word first before attempting to fetch all the * data. */ - if ((subcmd = fuword(ifr->ifr_data)) == -1) { + rv = fueword(ifr->ifr_data, &subcmd); + if (rv == -1) { rv = EFAULT; goto quit; } diff --git a/freebsd/sys/net/if_stf.c b/freebsd/sys/net/if_stf.c index e88fd34d..7c1b7075 100644 --- a/freebsd/sys/net/if_stf.c +++ b/freebsd/sys/net/if_stf.c @@ -76,9 +76,6 @@ * Note that there is no way to be 100% secure. */ -#include <rtems/bsd/local/opt_inet.h> -#include <rtems/bsd/local/opt_inet6.h> - #include <rtems/bsd/sys/param.h> #include <sys/systm.h> #include <sys/socket.h> @@ -86,24 +83,27 @@ #include <sys/mbuf.h> #include <rtems/bsd/sys/errno.h> #include <sys/kernel.h> +#include <rtems/bsd/sys/lock.h> #include <sys/module.h> #include <sys/protosw.h> #include <sys/proc.h> #include <sys/queue.h> +#include <sys/rmlock.h> #include <sys/sysctl.h> #include <machine/cpu.h> #include <sys/malloc.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_clone.h> #include <net/route.h> #include <net/netisr.h> #include <net/if_types.h> -#include <net/if_stf.h> #include <net/vnet.h> #include <netinet/in.h> +#include <netinet/in_fib.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/ip_var.h> @@ -125,16 +125,10 @@ SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_STF, stf, CTLFLAG_RW, 0, "6to4 Interface"); -static int stf_route_cache = 1; -SYSCTL_INT(_net_link_stf, OID_AUTO, route_cache, CTLFLAG_RW, - &stf_route_cache, 0, "Caching of IPv4 routes for 6to4 Output"); - static int stf_permit_rfc1918 = 0; 
-TUNABLE_INT("net.link.stf.permit_rfc1918", &stf_permit_rfc1918); -SYSCTL_INT(_net_link_stf, OID_AUTO, permit_rfc1918, CTLFLAG_RW | CTLFLAG_TUN, +SYSCTL_INT(_net_link_stf, OID_AUTO, permit_rfc1918, CTLFLAG_RWTUN, &stf_permit_rfc1918, 0, "Permit the use of private IPv4 addresses"); -#define STFNAME "stf" #define STFUNIT 0 #define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) @@ -143,36 +137,34 @@ SYSCTL_INT(_net_link_stf, OID_AUTO, permit_rfc1918, CTLFLAG_RW | CTLFLAG_TUN, * XXX: Return a pointer with 16-bit aligned. Don't cast it to * struct in_addr *; use bcopy() instead. */ -#define GET_V4(x) ((caddr_t)(&(x)->s6_addr16[1])) +#define GET_V4(x) (&(x)->s6_addr16[1]) struct stf_softc { struct ifnet *sc_ifp; - union { - struct route __sc_ro4; - struct route_in6 __sc_ro6; /* just for safety */ - } __sc_ro46; -#define sc_ro __sc_ro46.__sc_ro4 struct mtx sc_ro_mtx; u_int sc_fibnum; const struct encaptab *encap_cookie; }; #define STF2IFP(sc) ((sc)->sc_ifp) +static const char stfname[] = "stf"; + /* * Note that mutable fields in the softc are not currently locked. * We do lock sc_ro in stf_output though. 
*/ -static MALLOC_DEFINE(M_STF, STFNAME, "6to4 Tunnel Interface"); +static MALLOC_DEFINE(M_STF, stfname, "6to4 Tunnel Interface"); static const int ip_stf_ttl = 40; extern struct domain inetdomain; -struct protosw in_stf_protosw = { +static int in_stf_input(struct mbuf **, int *, int); +static struct protosw in_stf_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = in_stf_input, - .pr_output = (pr_output_t *)rip_output, + .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; @@ -181,22 +173,20 @@ static char *stfnames[] = {"stf0", "stf", "6to4", NULL}; static int stfmodevent(module_t, int, void *); static int stf_encapcheck(const struct mbuf *, int, int, void *); -static struct in6_ifaddr *stf_getsrcifa6(struct ifnet *); -static int stf_output(struct ifnet *, struct mbuf *, struct sockaddr *, +static int stf_getsrcifa6(struct ifnet *, struct in6_addr *, struct in6_addr *); +static int stf_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int isrfc1918addr(struct in_addr *); static int stf_checkaddr4(struct stf_softc *, struct in_addr *, struct ifnet *); static int stf_checkaddr6(struct stf_softc *, struct in6_addr *, struct ifnet *); -static void stf_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int stf_ioctl(struct ifnet *, u_long, caddr_t); static int stf_clone_match(struct if_clone *, const char *); static int stf_clone_create(struct if_clone *, char *, size_t, caddr_t); static int stf_clone_destroy(struct if_clone *, struct ifnet *); -struct if_clone stf_cloner = IFC_CLONE_INITIALIZER(STFNAME, NULL, 0, - NULL, stf_clone_match, stf_clone_create, stf_clone_destroy); +static struct if_clone *stf_cloner; static int stf_clone_match(struct if_clone *ifc, const char *name) @@ -247,7 +237,7 @@ stf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) * we don't conform to the default 
naming convention for interfaces. */ strlcpy(ifp->if_xname, name, IFNAMSIZ); - ifp->if_dname = ifc->ifc_name; + ifp->if_dname = stfname; ifp->if_dunit = IF_DUNIT_NONE; mtx_init(&(sc)->sc_ro_mtx, "stf ro", NULL, MTX_DEF); @@ -289,18 +279,16 @@ stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) } static int -stfmodevent(mod, type, data) - module_t mod; - int type; - void *data; +stfmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: - if_clone_attach(&stf_cloner); + stf_cloner = if_clone_advanced(stfname, 0, stf_clone_match, + stf_clone_create, stf_clone_destroy); break; case MOD_UNLOAD: - if_clone_detach(&stf_cloner); + if_clone_detach(stf_cloner); break; default: return (EOPNOTSUPP); @@ -318,16 +306,12 @@ static moduledata_t stf_mod = { DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); static int -stf_encapcheck(m, off, proto, arg) - const struct mbuf *m; - int off; - int proto; - void *arg; +stf_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip ip; - struct in6_ifaddr *ia6; struct stf_softc *sc; struct in_addr a, b, mask; + struct in6_addr addr6, mask6; sc = (struct stf_softc *)arg; if (sc == NULL) @@ -349,20 +333,16 @@ stf_encapcheck(m, off, proto, arg) if (ip.ip_v != 4) return 0; - ia6 = stf_getsrcifa6(STF2IFP(sc)); - if (ia6 == NULL) - return 0; + if (stf_getsrcifa6(STF2IFP(sc), &addr6, &mask6) != 0) + return (0); /* * check if IPv4 dst matches the IPv4 address derived from the * local 6to4 address. * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... 
*/ - if (bcmp(GET_V4(&ia6->ia_addr.sin6_addr), &ip.ip_dst, - sizeof(ip.ip_dst)) != 0) { - ifa_free(&ia6->ia_ifa); + if (bcmp(GET_V4(&addr6), &ip.ip_dst, sizeof(ip.ip_dst)) != 0) return 0; - } /* * check if IPv4 src matches the IPv4 address derived from the @@ -371,9 +351,8 @@ stf_encapcheck(m, off, proto, arg) * fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24 */ bzero(&a, sizeof(a)); - bcopy(GET_V4(&ia6->ia_addr.sin6_addr), &a, sizeof(a)); - bcopy(GET_V4(&ia6->ia_prefixmask.sin6_addr), &mask, sizeof(mask)); - ifa_free(&ia6->ia_ifa); + bcopy(GET_V4(&addr6), &a, sizeof(a)); + bcopy(GET_V4(&mask6), &mask, sizeof(mask)); a.s_addr &= mask.s_addr; b = ip.ip_src; b.s_addr &= mask.s_addr; @@ -384,12 +363,12 @@ stf_encapcheck(m, off, proto, arg) return 32; } -static struct in6_ifaddr * -stf_getsrcifa6(ifp) - struct ifnet *ifp; +static int +stf_getsrcifa6(struct ifnet *ifp, struct in6_addr *addr, struct in6_addr *mask) { struct ifaddr *ia; struct in_ifaddr *ia4; + struct in6_ifaddr *ia6; struct sockaddr_in6 *sin6; struct in_addr in; @@ -408,33 +387,30 @@ stf_getsrcifa6(ifp) if (ia4 == NULL) continue; - ifa_ref(ia); + ia6 = (struct in6_ifaddr *)ia; + + *addr = sin6->sin6_addr; + *mask = ia6->ia_prefixmask.sin6_addr; if_addr_runlock(ifp); - return (struct in6_ifaddr *)ia; + return (0); } if_addr_runlock(ifp); - return NULL; + return (ENOENT); } static int -stf_output(ifp, m, dst, ro) - struct ifnet *ifp; - struct mbuf *m; - struct sockaddr *dst; - struct route *ro; +stf_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, + struct route *ro) { struct stf_softc *sc; - struct sockaddr_in6 *dst6; - struct route *cached_route; + const struct sockaddr_in6 *dst6; struct in_addr in4; - caddr_t ptr; - struct sockaddr_in *dst4; + const void *ptr; u_int8_t tos; struct ip *ip; struct ip6_hdr *ip6; - struct in6_ifaddr *ia6; - u_int32_t af; + struct in6_addr addr6, mask6; int error; #ifdef MAC @@ -446,12 +422,12 @@ stf_output(ifp, m, dst, ro) #endif sc = 
ifp->if_softc; - dst6 = (struct sockaddr_in6 *)dst; + dst6 = (const struct sockaddr_in6 *)dst; /* just in case */ if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } @@ -460,18 +436,16 @@ stf_output(ifp, m, dst, ro) * we shouldn't generate output. Without this check, we'll end up * using wrong IPv4 source. */ - ia6 = stf_getsrcifa6(ifp); - if (ia6 == NULL) { + if (stf_getsrcifa6(ifp, &addr6, &mask6) != 0) { m_freem(m); - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETDOWN; } if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) { - ifa_free(&ia6->ia_ifa); - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } } @@ -479,15 +453,6 @@ stf_output(ifp, m, dst, ro) tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; /* - * BPF writes need to be handled specially. - * This is a null operation, nothing here checks dst->sa_family. - */ - if (dst->sa_family == AF_UNSPEC) { - bcopy(dst->sa_data, &af, sizeof(af)); - dst->sa_family = af; - } - - /* * Pickup the right outer dst addr from the list of candidates. * ip6_dst has priority as it may be able to give us shorter IPv4 hops. */ @@ -497,9 +462,8 @@ stf_output(ifp, m, dst, ro) else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr)) ptr = GET_V4(&dst6->sin6_addr); else { - ifa_free(&ia6->ia_ifa); m_freem(m); - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENETUNREACH; } bcopy(ptr, &in4, sizeof(in4)); @@ -512,78 +476,38 @@ stf_output(ifp, m, dst, ro) * will only read from the mbuf (i.e., it won't * try to free it or keep a pointer a to it). 
*/ - af = AF_INET6; + u_int af = AF_INET6; bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m); } - M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); + M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) { - ifa_free(&ia6->ia_ifa); - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return ENOBUFS; } ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); - bcopy(GET_V4(&((struct sockaddr_in6 *)&ia6->ia_addr)->sin6_addr), - &ip->ip_src, sizeof(ip->ip_src)); - ifa_free(&ia6->ia_ifa); + bcopy(GET_V4(&addr6), &ip->ip_src, sizeof(ip->ip_src)); bcopy(&in4, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; - ip->ip_len = m->m_pkthdr.len; /*host order*/ + ip->ip_len = htons(m->m_pkthdr.len); if (ifp->if_flags & IFF_LINK1) ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos); else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); - if (!stf_route_cache) { - cached_route = NULL; - goto sendit; - } - - /* - * Do we have a cached route? - */ - mtx_lock(&(sc)->sc_ro_mtx); - dst4 = (struct sockaddr_in *)&sc->sc_ro.ro_dst; - if (dst4->sin_family != AF_INET || - bcmp(&dst4->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)) != 0) { - /* cache route doesn't match */ - dst4->sin_family = AF_INET; - dst4->sin_len = sizeof(struct sockaddr_in); - bcopy(&ip->ip_dst, &dst4->sin_addr, sizeof(dst4->sin_addr)); - if (sc->sc_ro.ro_rt) { - RTFREE(sc->sc_ro.ro_rt); - sc->sc_ro.ro_rt = NULL; - } - } - - if (sc->sc_ro.ro_rt == NULL) { - rtalloc_fib(&sc->sc_ro, sc->sc_fibnum); - if (sc->sc_ro.ro_rt == NULL) { - m_freem(m); - mtx_unlock(&(sc)->sc_ro_mtx); - ifp->if_oerrors++; - return ENETUNREACH; - } - } - cached_route = &sc->sc_ro; - -sendit: M_SETFIB(m, sc->sc_fibnum); - ifp->if_opackets++; - error = ip_output(m, NULL, cached_route, 0, NULL, NULL); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + error = ip_output(m, NULL, NULL, 0, NULL, NULL); - if (cached_route != NULL) - mtx_unlock(&(sc)->sc_ro_mtx); return error; } static int -isrfc1918addr(in) - struct in_addr *in; 
+isrfc1918addr(struct in_addr *in) { /* * returns 1 if private address range: @@ -599,11 +523,9 @@ isrfc1918addr(in) } static int -stf_checkaddr4(sc, in, inifp) - struct stf_softc *sc; - struct in_addr *in; - struct ifnet *inifp; /* incoming interface */ +stf_checkaddr4(struct stf_softc *sc, struct in_addr *in, struct ifnet *inifp) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia4; /* @@ -627,54 +549,35 @@ stf_checkaddr4(sc, in, inifp) /* * reject packets with broadcast */ - IN_IFADDR_RLOCK(); - for (ia4 = TAILQ_FIRST(&V_in_ifaddrhead); - ia4; - ia4 = TAILQ_NEXT(ia4, ia_link)) - { + IN_IFADDR_RLOCK(&in_ifa_tracker); + TAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) { if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) continue; if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return -1; } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * perform ingress filter */ if (sc && (STF2IFP(sc)->if_flags & IFF_LINK2) == 0 && inifp) { - struct sockaddr_in sin; - struct rtentry *rt; - - bzero(&sin, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_len = sizeof(struct sockaddr_in); - sin.sin_addr = *in; - rt = rtalloc1_fib((struct sockaddr *)&sin, 0, - 0UL, sc->sc_fibnum); - if (!rt || rt->rt_ifp != inifp) { -#if 0 - log(LOG_WARNING, "%s: packet from 0x%x dropped " - "due to ingress filter\n", if_name(STF2IFP(sc)), - (u_int32_t)ntohl(sin.sin_addr.s_addr)); -#endif - if (rt) - RTFREE_LOCKED(rt); - return -1; - } - RTFREE_LOCKED(rt); + struct nhop4_basic nh4; + + if (fib4_lookup_nh_basic(sc->sc_fibnum, *in, 0, 0, &nh4) != 0) + return (-1); + + if (nh4.nh_ifp != inifp) + return (-1); } return 0; } static int -stf_checkaddr6(sc, in6, inifp) - struct stf_softc *sc; - struct in6_addr *in6; - struct ifnet *inifp; /* incoming interface */ +stf_checkaddr6(struct stf_softc *sc, struct in6_addr *in6, struct ifnet *inifp) { /* * check 6to4 addresses @@ -697,23 +600,23 @@ 
stf_checkaddr6(sc, in6, inifp) return 0; } -void -in_stf_input(m, off) - struct mbuf *m; - int off; +static int +in_stf_input(struct mbuf **mp, int *offp, int proto) { - int proto; struct stf_softc *sc; struct ip *ip; struct ip6_hdr *ip6; + struct mbuf *m; u_int8_t otos, itos; struct ifnet *ifp; + int off; - proto = mtod(m, struct ip *)->ip_p; + m = *mp; + off = *offp; if (proto != IPPROTO_IPV6) { m_freem(m); - return; + return (IPPROTO_DONE); } ip = mtod(m, struct ip *); @@ -722,7 +625,7 @@ in_stf_input(m, off) if (sc == NULL || (STF2IFP(sc)->if_flags & IFF_UP) == 0) { m_freem(m); - return; + return (IPPROTO_DONE); } ifp = STF2IFP(sc); @@ -738,7 +641,7 @@ in_stf_input(m, off) if (stf_checkaddr4(sc, &ip->ip_dst, NULL) < 0 || stf_checkaddr4(sc, &ip->ip_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); - return; + return (IPPROTO_DONE); } otos = ip->ip_tos; @@ -747,7 +650,7 @@ in_stf_input(m, off) if (m->m_len < sizeof(*ip6)) { m = m_pullup(m, sizeof(*ip6)); if (!m) - return; + return (IPPROTO_DONE); } ip6 = mtod(m, struct ip6_hdr *); @@ -758,7 +661,7 @@ in_stf_input(m, off) if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 || stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) { m_freem(m); - return; + return (IPPROTO_DONE); } itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; @@ -789,28 +692,15 @@ in_stf_input(m, off) * See net/if_gif.c for possible issues with packet processing * reorder due to extra queueing. 
*/ - ifp->if_ipackets++; - ifp->if_ibytes += m->m_pkthdr.len; + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(NETISR_IPV6, m); -} - -/* ARGSUSED */ -static void -stf_rtrequest(cmd, rt, info) - int cmd; - struct rtentry *rt; - struct rt_addrinfo *info; -{ - RT_LOCK_ASSERT(rt); - rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; + return (IPPROTO_DONE); } static int -stf_ioctl(ifp, cmd, data) - struct ifnet *ifp; - u_long cmd; - caddr_t data; +stf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifaddr *ifa; struct ifreq *ifr; @@ -837,7 +727,6 @@ stf_ioctl(ifp, cmd, data) break; } - ifa->ifa_rtrequest = stf_rtrequest; ifp->if_flags |= IFF_UP; break; diff --git a/freebsd/sys/net/if_tap.c b/freebsd/sys/net/if_tap.c index 599905e8..24ae0092 100644 --- a/freebsd/sys/net/if_tap.c +++ b/freebsd/sys/net/if_tap.c @@ -65,6 +65,7 @@ #include <net/bpf.h> #include <net/ethernet.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_clone.h> #include <net/if_dl.h> #include <net/if_media.h> @@ -81,8 +82,8 @@ #define CDEV_NAME "tap" #define TAPDEBUG if (tapdebug) printf -#define TAP "tap" -#define VMNET "vmnet" +static const char tapname[] = "tap"; +static const char vmnetname[] = "vmnet"; #define TAPMAXUNIT 0x7fff #define VMNET_DEV_MASK CLONE_FLAG0 @@ -101,11 +102,10 @@ static void tapifinit(void *); static int tap_clone_create(struct if_clone *, int, caddr_t); static void tap_clone_destroy(struct ifnet *); +static struct if_clone *tap_cloner; static int vmnet_clone_create(struct if_clone *, int, caddr_t); static void vmnet_clone_destroy(struct ifnet *); - -IFC_SIMPLE_DECLARE(tap, 0); -IFC_SIMPLE_DECLARE(vmnet, 0); +static struct if_clone *vmnet_cloner; /* character device */ static d_open_t tapopen; @@ -137,7 +137,7 @@ static struct filterops tap_write_filterops = { static struct cdevsw tap_cdevsw = { .d_version = D_VERSION, - .d_flags = D_PSEUDO | D_NEEDMINOR, + .d_flags 
= D_NEEDMINOR, .d_open = tapopen, .d_close = tapclose, .d_read = tapread, @@ -172,12 +172,10 @@ SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tapuopen, 0, "Allow user to open /dev/tap (based on node permissions)"); SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, "Bring interface up when /dev/tap is opened"); -SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RW, &tapdclone, 0, - "Enably legacy devfs interface creation"); +SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, + "Enable legacy devfs interface creation"); SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tapdebug, 0, ""); -TUNABLE_INT("net.link.tap.devfs_cloning", &tapdclone); - DEV_MODULE(if_tap, tapmodevent, NULL); static int @@ -185,18 +183,12 @@ tap_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct cdev *dev; int i; - int extra; - if (strcmp(ifc->ifc_name, VMNET) == 0) - extra = VMNET_DEV_MASK; - else - extra = 0; - - /* find any existing device, or allocate new unit number */ - i = clone_create(&tapclones, &tap_cdevsw, &unit, &dev, extra); + /* Find any existing device, or allocate new unit number. */ + i = clone_create(&tapclones, &tap_cdevsw, &unit, &dev, 0); if (i) { - dev = make_dev(&tap_cdevsw, unit | extra, - UID_ROOT, GID_WHEEL, 0600, "%s%d", ifc->ifc_name, unit); + dev = make_dev(&tap_cdevsw, unit, UID_ROOT, GID_WHEEL, 0600, + "%s%d", tapname, unit); } tapcreate(dev); @@ -207,7 +199,18 @@ tap_clone_create(struct if_clone *ifc, int unit, caddr_t params) static int vmnet_clone_create(struct if_clone *ifc, int unit, caddr_t params) { - return tap_clone_create(ifc, unit, params); + struct cdev *dev; + int i; + + /* Find any existing device, or allocate new unit number. 
*/ + i = clone_create(&tapclones, &tap_cdevsw, &unit, &dev, VMNET_DEV_MASK); + if (i) { + dev = make_dev(&tap_cdevsw, unit | VMNET_DEV_MASK, UID_ROOT, + GID_WHEEL, 0600, "%s%d", vmnetname, unit); + } + + tapcreate(dev); + return (0); } static void @@ -218,9 +221,10 @@ tap_destroy(struct tap_softc *tp) CURVNET_SET(ifp->if_vnet); destroy_dev(tp->tap_dev); seldrain(&tp->tap_rsel); + knlist_clear(&tp->tap_rsel.si_note, 0); knlist_destroy(&tp->tap_rsel.si_note); ether_ifdetach(ifp); - if_free_type(ifp, IFT_ETHER); + if_free(ifp); mtx_destroy(&tp->tap_mtx); free(tp, M_TAP); @@ -272,8 +276,10 @@ tapmodevent(module_t mod, int type, void *data) mtx_destroy(&tapmtx); return (ENOMEM); } - if_clone_attach(&tap_cloner); - if_clone_attach(&vmnet_cloner); + tap_cloner = if_clone_simple(tapname, tap_clone_create, + tap_clone_destroy, 0); + vmnet_cloner = if_clone_simple(vmnetname, vmnet_clone_create, + vmnet_clone_destroy, 0); return (0); case MOD_UNLOAD: @@ -295,8 +301,8 @@ tapmodevent(module_t mod, int type, void *data) mtx_unlock(&tapmtx); EVENTHANDLER_DEREGISTER(dev_clone, eh_tag); - if_clone_detach(&tap_cloner); - if_clone_detach(&vmnet_cloner); + if_clone_detach(tap_cloner); + if_clone_detach(vmnet_cloner); drain_dev_clone_events(); mtx_lock(&tapmtx); @@ -350,13 +356,13 @@ tapclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **d extra = 0; /* We're interested in only tap/vmnet devices. 
*/ - if (strcmp(name, TAP) == 0) { + if (strcmp(name, tapname) == 0) { unit = -1; - } else if (strcmp(name, VMNET) == 0) { + } else if (strcmp(name, vmnetname) == 0) { unit = -1; extra = VMNET_DEV_MASK; - } else if (dev_stdclone(name, NULL, TAP, &unit) != 1) { - if (dev_stdclone(name, NULL, VMNET, &unit) != 1) { + } else if (dev_stdclone(name, NULL, tapname, &unit) != 1) { + if (dev_stdclone(name, NULL, vmnetname, &unit) != 1) { return; } else { extra = VMNET_DEV_MASK; @@ -402,11 +408,9 @@ tapcreate(struct cdev *dev) unsigned short macaddr_hi; uint32_t macaddr_mid; int unit; - char *name = NULL; + const char *name = NULL; u_char eaddr[6]; - dev->si_flags &= ~SI_CHEAPCLONE; - /* allocate driver storage and create device */ tp = malloc(sizeof(*tp), M_TAP, M_WAITOK | M_ZERO); mtx_init(&tp->tap_mtx, "tap_mtx", NULL, MTX_DEF); @@ -418,10 +422,10 @@ tapcreate(struct cdev *dev) /* select device: tap or vmnet */ if (unit & VMNET_DEV_MASK) { - name = VMNET; + name = vmnetname; tp->tap_flags |= TAP_VMNET; } else - name = TAP; + name = tapname; unit &= TAPMAXUNIT; @@ -534,11 +538,11 @@ tapclose(struct cdev *dev, int foo, int bar, struct thread *td) IF_DRAIN(&ifp->if_snd); /* - * do not bring the interface down, and do not anything with - * interface, if we are in VMnet mode. just close the device. + * Do not bring the interface down, and do not anything with + * interface, if we are in VMnet mode. Just close the device. 
*/ - - if (((tp->tap_flags & TAP_VMNET) == 0) && (ifp->if_flags & IFF_UP)) { + if (((tp->tap_flags & TAP_VMNET) == 0) && + (ifp->if_flags & (IFF_UP | IFF_LINK0)) == IFF_UP) { mtx_unlock(&tp->tap_mtx); if_down(ifp); mtx_lock(&tp->tap_mtx); @@ -636,12 +640,12 @@ tapifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) case SIOCGIFSTATUS: ifs = (struct ifstat *)data; - dummy = strlen(ifs->ascii); mtx_lock(&tp->tap_mtx); - if (tp->tap_pid != 0 && dummy < sizeof(ifs->ascii)) - snprintf(ifs->ascii + dummy, - sizeof(ifs->ascii) - dummy, + if (tp->tap_pid != 0) + snprintf(ifs->ascii, sizeof(ifs->ascii), "\tOpened by PID %d\n", tp->tap_pid); + else + ifs->ascii[0] = '\0'; mtx_unlock(&tp->tap_mtx); break; @@ -684,7 +688,7 @@ tapifstart(struct ifnet *ifp) IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) { m_freem(m); - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else break; } @@ -709,7 +713,7 @@ tapifstart(struct ifnet *ifp) selwakeuppri(&tp->tap_rsel, PZERO+1); KNOTE_LOCKED(&tp->tap_rsel.si_note, 0); - ifp->if_opackets ++; /* obytes are counted in ether_output */ + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; @@ -829,8 +833,7 @@ tapioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td mtx_unlock(&tp->tap_mtx); break; - case OSIOCGIFADDR: /* get MAC address of the remote side */ - case SIOCGIFADDR: + case SIOCGIFADDR: /* get MAC address of the remote side */ mtx_lock(&tp->tap_mtx); bcopy(tp->ether_addr, data, sizeof(tp->ether_addr)); mtx_unlock(&tp->tap_mtx); @@ -948,9 +951,9 @@ tapwrite(struct cdev *dev, struct uio *uio, int flag) return (EIO); } - if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, ETHER_ALIGN, + if ((m = m_uiotombuf(uio, M_NOWAIT, 0, ETHER_ALIGN, M_PKTHDR)) == NULL) { - ifp->if_ierrors ++; + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (ENOBUFS); } @@ -977,7 +980,7 @@ tapwrite(struct cdev *dev, struct uio *uio, int flag) 
CURVNET_SET(ifp->if_vnet); (*ifp->if_input)(ifp, m); CURVNET_RESTORE(); - ifp->if_ipackets ++; /* ibytes are counted in parent */ + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); /* ibytes are counted in parent */ return (0); } /* tapwrite */ diff --git a/freebsd/sys/net/if_tun.c b/freebsd/sys/net/if_tun.c index 556a4860..edb30d04 100644 --- a/freebsd/sys/net/if_tun.c +++ b/freebsd/sys/net/if_tun.c @@ -18,10 +18,8 @@ * $FreeBSD$ */ -#include <rtems/bsd/local/opt_atalk.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> -#include <rtems/bsd/local/opt_ipx.h> #include <rtems/bsd/sys/param.h> #include <sys/priv.h> @@ -47,6 +45,7 @@ #include <sys/random.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_clone.h> #include <net/if_types.h> #include <net/netisr.h> @@ -101,7 +100,6 @@ struct tun_softc { #define TUN2IFP(sc) ((sc)->tun_ifp) #define TUNDEBUG if (tundebug) if_printf -#define TUNNAME "tun" /* * All mutable global variables in if_tun are locked using tunmtx, with @@ -109,7 +107,8 @@ struct tun_softc { * which is static after setup. 
*/ static struct mtx tunmtx; -static MALLOC_DEFINE(M_TUN, TUNNAME, "Tunnel Interface"); +static const char tunname[] = "tun"; +static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); static int tundebug = 0; static int tundclone = 1; static struct clonedevs *tunclones; @@ -119,25 +118,22 @@ SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, "IP tunnel software network interface."); -SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RW, &tundclone, 0, +SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, "Enable legacy devfs interface creation."); -TUNABLE_INT("net.link.tun.devfs_cloning", &tundclone); - static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev); static void tuncreate(const char *name, struct cdev *dev); static int tunifioctl(struct ifnet *, u_long, caddr_t); static void tuninit(struct ifnet *); static int tunmodevent(module_t, int, void *); -static int tunoutput(struct ifnet *, struct mbuf *, struct sockaddr *, - struct route *ro); +static int tunoutput(struct ifnet *, struct mbuf *, + const struct sockaddr *, struct route *ro); static void tunstart(struct ifnet *); static int tun_clone_create(struct if_clone *, int, caddr_t); static void tun_clone_destroy(struct ifnet *); - -IFC_SIMPLE_DECLARE(tun, 0); +static struct if_clone *tun_cloner; static d_open_t tunopen; static d_close_t tunclose; @@ -167,7 +163,7 @@ static struct filterops tun_write_filterops = { static struct cdevsw tun_cdevsw = { .d_version = D_VERSION, - .d_flags = D_PSEUDO | D_NEEDMINOR, + .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, @@ -175,7 +171,7 @@ static struct cdevsw tun_cdevsw = { .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, - .d_name = TUNNAME, + .d_name = tunname, }; static int @@ -189,9 +185,9 @@ tun_clone_create(struct if_clone *ifc, 
int unit, caddr_t params) if (i) { /* No preexisting struct cdev *, create one */ dev = make_dev(&tun_cdevsw, unit, - UID_UUCP, GID_DIALER, 0600, "%s%d", ifc->ifc_name, unit); + UID_UUCP, GID_DIALER, 0600, "%s%d", tunname, unit); } - tuncreate(ifc->ifc_name, dev); + tuncreate(tunname, dev); return (0); } @@ -213,9 +209,9 @@ tunclone(void *arg, struct ucred *cred, char *name, int namelen, if (!tundclone || priv_check_cred(cred, PRIV_NET_IFCREATE, 0) != 0) return; - if (strcmp(name, TUNNAME) == 0) { + if (strcmp(name, tunname) == 0) { u = -1; - } else if (dev_stdclone(name, NULL, TUNNAME, &u) != 1) + } else if (dev_stdclone(name, NULL, tunname, &u) != 1) return; /* Don't recognise the name */ if (u != -1 && u > IF_MAXUNIT) return; /* Unit number too high */ @@ -248,7 +244,6 @@ tun_destroy(struct tun_softc *tp) { struct cdev *dev; - /* Unlocked read. */ mtx_lock(&tp->tun_mtx); if ((tp->tun_flags & TUN_OPEN) != 0) cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); @@ -262,6 +257,7 @@ tun_destroy(struct tun_softc *tp) if_free(TUN2IFP(tp)); destroy_dev(dev); seldrain(&tp->tun_rsel); + knlist_clear(&tp->tun_rsel.si_note, 0); knlist_destroy(&tp->tun_rsel.si_note); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); @@ -293,10 +289,11 @@ tunmodevent(module_t mod, int type, void *data) tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); if (tag == NULL) return (ENOMEM); - if_clone_attach(&tun_cloner); + tun_cloner = if_clone_simple(tunname, tun_clone_create, + tun_clone_destroy, 0); break; case MOD_UNLOAD: - if_clone_detach(&tun_cloner); + if_clone_detach(tun_cloner); EVENTHANDLER_DEREGISTER(dev_clone, tag); drain_dev_clone_events(); @@ -364,8 +361,6 @@ tuncreate(const char *name, struct cdev *dev) struct tun_softc *sc; struct ifnet *ifp; - dev->si_flags &= ~SI_CHEAPCLONE; - sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO); mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF); cv_init(&sc->tun_cv, "tun_condvar"); @@ -412,7 +407,7 @@ tunopen(struct cdev *dev, int flag, int mode, 
struct thread *td) */ tp = dev->si_drv1; if (!tp) { - tuncreate(TUNNAME, dev); + tuncreate(tunname, dev); tp = dev->si_drv1; } @@ -557,18 +552,16 @@ tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) ifs = (struct ifstat *)data; mtx_lock(&tp->tun_mtx); if (tp->tun_pid) - sprintf(ifs->ascii + strlen(ifs->ascii), + snprintf(ifs->ascii, sizeof(ifs->ascii), "\tOpened by PID %d\n", tp->tun_pid); + else + ifs->ascii[0] = '\0'; mtx_unlock(&tp->tun_mtx); break; case SIOCSIFADDR: tuninit(ifp); TUNDEBUG(ifp, "address set\n"); break; - case SIOCSIFDSTADDR: - tuninit(ifp); - TUNDEBUG(ifp, "destination address set\n"); - break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; TUNDEBUG(ifp, "mtu set\n"); @@ -587,7 +580,7 @@ tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) * tunoutput - queue packets from higher level ready to put out. */ static int -tunoutput(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, +tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro) { struct tun_softc *tp = ifp->if_softc; @@ -621,25 +614,23 @@ tunoutput(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, } /* BPF writes need to be handled specially. */ - if (dst->sa_family == AF_UNSPEC) { + if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); - dst->sa_family = af; - } - - if (bpf_peers_present(ifp->if_bpf)) { + else af = dst->sa_family; + + if (bpf_peers_present(ifp->if_bpf)) bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); - } /* prepend sockaddr? 
this may abort if the mbuf allocation fails */ if (cached_tun_flags & TUN_LMODE) { /* allocate space for sockaddr */ - M_PREPEND(m0, dst->sa_len, M_DONTWAIT); + M_PREPEND(m0, dst->sa_len, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { - ifp->if_iqdrops++; - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else { bcopy(dst, m0->m_data, dst->sa_len); @@ -648,18 +639,18 @@ tunoutput(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, if (cached_tun_flags & TUN_IFHEAD) { /* Prepend the address family */ - M_PREPEND(m0, 4, M_DONTWAIT); + M_PREPEND(m0, 4, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { - ifp->if_iqdrops++; - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else - *(u_int32_t *)m0->m_data = htonl(dst->sa_family); + *(u_int32_t *)m0->m_data = htonl(af); } else { #ifdef INET - if (dst->sa_family != AF_INET) + if (af != AF_INET) #endif { m_freem(m0); @@ -670,7 +661,7 @@ tunoutput(struct ifnet *ifp, struct mbuf *m0, struct sockaddr *dst, error = (ifp->if_transmit)(ifp, m0); if (error) return (ENOBUFS); - ifp->if_opackets++; + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); return (0); } @@ -871,7 +862,7 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag) struct tun_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; - uint32_t family; + uint32_t family, mru; int isr; TUNDEBUG(ifp, "tunwrite\n"); @@ -883,13 +874,16 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag) if (uio->uio_resid == 0) return (0); - if (uio->uio_resid < 0 || uio->uio_resid > TUNMRU) { + mru = TUNMRU; + if (tp->tun_flags & TUN_IFHEAD) + mru += sizeof(family); + if (uio->uio_resid < 0 || uio->uio_resid > mru) { TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); return (EIO); } - if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, 0, M_PKTHDR)) == NULL) { - ifp->if_ierrors++; 
+ if ((m = m_uiotombuf(uio, M_NOWAIT, 0, 0, M_PKTHDR)) == NULL) { + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (ENOBUFS); } @@ -925,25 +919,13 @@ tunwrite(struct cdev *dev, struct uio *uio, int flag) isr = NETISR_IPV6; break; #endif -#ifdef IPX - case AF_IPX: - isr = NETISR_IPX; - break; -#endif -#ifdef NETATALK - case AF_APPLETALK: - isr = NETISR_ATALK2; - break; -#endif default: m_freem(m); return (EAFNOSUPPORT); } - /* First chunk of an mbuf contains good junk */ - if (harvest.point_to_point) - random_harvest(m, 16, 3, 0, RANDOM_NET); - ifp->if_ibytes += m->m_pkthdr.len; - ifp->if_ipackets++; + random_harvest_queue(m, sizeof(*m), 2, RANDOM_NET_TUN); + if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); + if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); CURVNET_SET(ifp->if_vnet); M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); diff --git a/freebsd/sys/net/if_tun.h b/freebsd/sys/net/if_tun.h index 382881cb..1ea375f7 100644 --- a/freebsd/sys/net/if_tun.h +++ b/freebsd/sys/net/if_tun.h @@ -25,11 +25,11 @@ #define TUNMTU 1500 /* Maximum receive packet size (hard limit) */ -#define TUNMRU 16384 +#define TUNMRU 65535 struct tuninfo { int baudrate; /* linespeed */ - short mtu; /* maximum transmission unit */ + unsigned short mtu; /* maximum transmission unit */ u_char type; /* ethernet, tokenring, etc. 
*/ u_char dummy; /* place holder */ }; diff --git a/freebsd/sys/net/if_types.h b/freebsd/sys/net/if_types.h index c2effacd..92e101ac 100644 --- a/freebsd/sys/net/if_types.h +++ b/freebsd/sys/net/if_types.h @@ -42,214 +42,232 @@ * http://www.iana.org/assignments/smi-numbers */ -#define IFT_OTHER 0x1 /* none of the following */ -#define IFT_1822 0x2 /* old-style arpanet imp */ -#define IFT_HDH1822 0x3 /* HDH arpanet imp */ -#define IFT_X25DDN 0x4 /* x25 to imp */ -#define IFT_X25 0x5 /* PDN X25 interface (RFC877) */ -#define IFT_ETHER 0x6 /* Ethernet CSMA/CD */ -#define IFT_ISO88023 0x7 /* CMSA/CD */ -#define IFT_ISO88024 0x8 /* Token Bus */ -#define IFT_ISO88025 0x9 /* Token Ring */ -#define IFT_ISO88026 0xa /* MAN */ -#define IFT_STARLAN 0xb -#define IFT_P10 0xc /* Proteon 10MBit ring */ -#define IFT_P80 0xd /* Proteon 80MBit ring */ -#define IFT_HY 0xe /* Hyperchannel */ -#define IFT_FDDI 0xf -#define IFT_LAPB 0x10 -#define IFT_SDLC 0x11 -#define IFT_T1 0x12 -#define IFT_CEPT 0x13 /* E1 - european T1 */ -#define IFT_ISDNBASIC 0x14 -#define IFT_ISDNPRIMARY 0x15 -#define IFT_PTPSERIAL 0x16 /* Proprietary PTP serial */ -#define IFT_PPP 0x17 /* RFC 1331 */ -#define IFT_LOOP 0x18 /* loopback */ -#define IFT_EON 0x19 /* ISO over IP */ -#define IFT_XETHER 0x1a /* obsolete 3MB experimental ethernet */ -#define IFT_NSIP 0x1b /* XNS over IP */ -#define IFT_SLIP 0x1c /* IP over generic TTY */ -#define IFT_ULTRA 0x1d /* Ultra Technologies */ -#define IFT_DS3 0x1e /* Generic T3 */ -#define IFT_SIP 0x1f /* SMDS */ -#define IFT_FRELAY 0x20 /* Frame Relay DTE only */ -#define IFT_RS232 0x21 -#define IFT_PARA 0x22 /* parallel-port */ -#define IFT_ARCNET 0x23 -#define IFT_ARCNETPLUS 0x24 -#define IFT_ATM 0x25 /* ATM cells */ -#define IFT_MIOX25 0x26 -#define IFT_SONET 0x27 /* SONET or SDH */ -#define IFT_X25PLE 0x28 -#define IFT_ISO88022LLC 0x29 -#define IFT_LOCALTALK 0x2a -#define IFT_SMDSDXI 0x2b -#define IFT_FRELAYDCE 0x2c /* Frame Relay DCE */ -#define IFT_V35 0x2d -#define 
IFT_HSSI 0x2e -#define IFT_HIPPI 0x2f -#define IFT_MODEM 0x30 /* Generic Modem */ -#define IFT_AAL5 0x31 /* AAL5 over ATM */ -#define IFT_SONETPATH 0x32 -#define IFT_SONETVT 0x33 -#define IFT_SMDSICIP 0x34 /* SMDS InterCarrier Interface */ -#define IFT_PROPVIRTUAL 0x35 /* Proprietary Virtual/internal */ -#define IFT_PROPMUX 0x36 /* Proprietary Multiplexing */ -#define IFT_IEEE80212 0x37 /* 100BaseVG */ -#define IFT_FIBRECHANNEL 0x38 /* Fibre Channel */ -#define IFT_HIPPIINTERFACE 0x39 /* HIPPI interfaces */ -#define IFT_FRAMERELAYINTERCONNECT 0x3a /* Obsolete, use either 0x20 or 0x2c */ -#define IFT_AFLANE8023 0x3b /* ATM Emulated LAN for 802.3 */ -#define IFT_AFLANE8025 0x3c /* ATM Emulated LAN for 802.5 */ -#define IFT_CCTEMUL 0x3d /* ATM Emulated circuit */ -#define IFT_FASTETHER 0x3e /* Fast Ethernet (100BaseT) */ -#define IFT_ISDN 0x3f /* ISDN and X.25 */ -#define IFT_V11 0x40 /* CCITT V.11/X.21 */ -#define IFT_V36 0x41 /* CCITT V.36 */ -#define IFT_G703AT64K 0x42 /* CCITT G703 at 64Kbps */ -#define IFT_G703AT2MB 0x43 /* Obsolete see DS1-MIB */ -#define IFT_QLLC 0x44 /* SNA QLLC */ -#define IFT_FASTETHERFX 0x45 /* Fast Ethernet (100BaseFX) */ -#define IFT_CHANNEL 0x46 /* channel */ -#define IFT_IEEE80211 0x47 /* radio spread spectrum */ -#define IFT_IBM370PARCHAN 0x48 /* IBM System 360/370 OEMI Channel */ -#define IFT_ESCON 0x49 /* IBM Enterprise Systems Connection */ -#define IFT_DLSW 0x4a /* Data Link Switching */ -#define IFT_ISDNS 0x4b /* ISDN S/T interface */ -#define IFT_ISDNU 0x4c /* ISDN U interface */ -#define IFT_LAPD 0x4d /* Link Access Protocol D */ -#define IFT_IPSWITCH 0x4e /* IP Switching Objects */ -#define IFT_RSRB 0x4f /* Remote Source Route Bridging */ -#define IFT_ATMLOGICAL 0x50 /* ATM Logical Port */ -#define IFT_DS0 0x51 /* Digital Signal Level 0 */ -#define IFT_DS0BUNDLE 0x52 /* group of ds0s on the same ds1 */ -#define IFT_BSC 0x53 /* Bisynchronous Protocol */ -#define IFT_ASYNC 0x54 /* Asynchronous Protocol */ -#define IFT_CNR 0x55 /* 
Combat Net Radio */ -#define IFT_ISO88025DTR 0x56 /* ISO 802.5r DTR */ -#define IFT_EPLRS 0x57 /* Ext Pos Loc Report Sys */ -#define IFT_ARAP 0x58 /* Appletalk Remote Access Protocol */ -#define IFT_PROPCNLS 0x59 /* Proprietary Connectionless Protocol*/ -#define IFT_HOSTPAD 0x5a /* CCITT-ITU X.29 PAD Protocol */ -#define IFT_TERMPAD 0x5b /* CCITT-ITU X.3 PAD Facility */ -#define IFT_FRAMERELAYMPI 0x5c /* Multiproto Interconnect over FR */ -#define IFT_X213 0x5d /* CCITT-ITU X213 */ -#define IFT_ADSL 0x5e /* Asymmetric Digital Subscriber Loop */ -#define IFT_RADSL 0x5f /* Rate-Adapt. Digital Subscriber Loop*/ -#define IFT_SDSL 0x60 /* Symmetric Digital Subscriber Loop */ -#define IFT_VDSL 0x61 /* Very H-Speed Digital Subscrib. Loop*/ -#define IFT_ISO88025CRFPINT 0x62 /* ISO 802.5 CRFP */ -#define IFT_MYRINET 0x63 /* Myricom Myrinet */ -#define IFT_VOICEEM 0x64 /* voice recEive and transMit */ -#define IFT_VOICEFXO 0x65 /* voice Foreign Exchange Office */ -#define IFT_VOICEFXS 0x66 /* voice Foreign Exchange Station */ -#define IFT_VOICEENCAP 0x67 /* voice encapsulation */ -#define IFT_VOICEOVERIP 0x68 /* voice over IP encapsulation */ -#define IFT_ATMDXI 0x69 /* ATM DXI */ -#define IFT_ATMFUNI 0x6a /* ATM FUNI */ -#define IFT_ATMIMA 0x6b /* ATM IMA */ -#define IFT_PPPMULTILINKBUNDLE 0x6c /* PPP Multilink Bundle */ -#define IFT_IPOVERCDLC 0x6d /* IBM ipOverCdlc */ -#define IFT_IPOVERCLAW 0x6e /* IBM Common Link Access to Workstn */ -#define IFT_STACKTOSTACK 0x6f /* IBM stackToStack */ -#define IFT_VIRTUALIPADDRESS 0x70 /* IBM VIPA */ -#define IFT_MPC 0x71 /* IBM multi-protocol channel support */ -#define IFT_IPOVERATM 0x72 /* IBM ipOverAtm */ -#define IFT_ISO88025FIBER 0x73 /* ISO 802.5j Fiber Token Ring */ -#define IFT_TDLC 0x74 /* IBM twinaxial data link control */ -#define IFT_GIGABITETHERNET 0x75 /* Gigabit Ethernet */ -#define IFT_HDLC 0x76 /* HDLC */ -#define IFT_LAPF 0x77 /* LAP F */ -#define IFT_V37 0x78 /* V.37 */ -#define IFT_X25MLP 0x79 /* Multi-Link 
Protocol */ -#define IFT_X25HUNTGROUP 0x7a /* X25 Hunt Group */ -#define IFT_TRANSPHDLC 0x7b /* Transp HDLC */ -#define IFT_INTERLEAVE 0x7c /* Interleave channel */ -#define IFT_FAST 0x7d /* Fast channel */ -#define IFT_IP 0x7e /* IP (for APPN HPR in IP networks) */ -#define IFT_DOCSCABLEMACLAYER 0x7f /* CATV Mac Layer */ -#define IFT_DOCSCABLEDOWNSTREAM 0x80 /* CATV Downstream interface */ -#define IFT_DOCSCABLEUPSTREAM 0x81 /* CATV Upstream interface */ -#define IFT_A12MPPSWITCH 0x82 /* Avalon Parallel Processor */ -#define IFT_TUNNEL 0x83 /* Encapsulation interface */ -#define IFT_COFFEE 0x84 /* coffee pot */ -#define IFT_CES 0x85 /* Circiut Emulation Service */ -#define IFT_ATMSUBINTERFACE 0x86 /* (x) ATM Sub Interface */ -#define IFT_L2VLAN 0x87 /* Layer 2 Virtual LAN using 802.1Q */ -#define IFT_L3IPVLAN 0x88 /* Layer 3 Virtual LAN - IP Protocol */ -#define IFT_L3IPXVLAN 0x89 /* Layer 3 Virtual LAN - IPX Prot. */ -#define IFT_DIGITALPOWERLINE 0x8a /* IP over Power Lines */ -#define IFT_MEDIAMAILOVERIP 0x8b /* (xxx) Multimedia Mail over IP */ -#define IFT_DTM 0x8c /* Dynamic synchronous Transfer Mode */ -#define IFT_DCN 0x8d /* Data Communications Network */ -#define IFT_IPFORWARD 0x8e /* IP Forwarding Interface */ -#define IFT_MSDSL 0x8f /* Multi-rate Symmetric DSL */ -#define IFT_IEEE1394 0x90 /* IEEE1394 High Performance SerialBus*/ -#define IFT_IFGSN 0x91 /* HIPPI-6400 */ -#define IFT_DVBRCCMACLAYER 0x92 /* DVB-RCC MAC Layer */ -#define IFT_DVBRCCDOWNSTREAM 0x93 /* DVB-RCC Downstream Channel */ -#define IFT_DVBRCCUPSTREAM 0x94 /* DVB-RCC Upstream Channel */ -#define IFT_ATMVIRTUAL 0x95 /* ATM Virtual Interface */ -#define IFT_MPLSTUNNEL 0x96 /* MPLS Tunnel Virtual Interface */ -#define IFT_SRP 0x97 /* Spatial Reuse Protocol */ -#define IFT_VOICEOVERATM 0x98 /* Voice over ATM */ -#define IFT_VOICEOVERFRAMERELAY 0x99 /* Voice Over Frame Relay */ -#define IFT_IDSL 0x9a /* Digital Subscriber Loop over ISDN */ -#define IFT_COMPOSITELINK 0x9b /* Avici Composite 
Link Interface */ -#define IFT_SS7SIGLINK 0x9c /* SS7 Signaling Link */ -#define IFT_PROPWIRELESSP2P 0x9d /* Prop. P2P wireless interface */ -#define IFT_FRFORWARD 0x9e /* Frame forward Interface */ -#define IFT_RFC1483 0x9f /* Multiprotocol over ATM AAL5 */ -#define IFT_USB 0xa0 /* USB Interface */ -#define IFT_IEEE8023ADLAG 0xa1 /* IEEE 802.3ad Link Aggregate*/ -#define IFT_BGPPOLICYACCOUNTING 0xa2 /* BGP Policy Accounting */ -#define IFT_FRF16MFRBUNDLE 0xa3 /* FRF.16 Multilik Frame Relay*/ -#define IFT_H323GATEKEEPER 0xa4 /* H323 Gatekeeper */ -#define IFT_H323PROXY 0xa5 /* H323 Voice and Video Proxy */ -#define IFT_MPLS 0xa6 /* MPLS */ -#define IFT_MFSIGLINK 0xa7 /* Multi-frequency signaling link */ -#define IFT_HDSL2 0xa8 /* High Bit-Rate DSL, 2nd gen. */ -#define IFT_SHDSL 0xa9 /* Multirate HDSL2 */ -#define IFT_DS1FDL 0xaa /* Facility Data Link (4Kbps) on a DS1*/ -#define IFT_POS 0xab /* Packet over SONET/SDH Interface */ -#define IFT_DVBASILN 0xac /* DVB-ASI Input */ -#define IFT_DVBASIOUT 0xad /* DVB-ASI Output */ -#define IFT_PLC 0xae /* Power Line Communications */ -#define IFT_NFAS 0xaf /* Non-Facility Associated Signaling */ -#define IFT_TR008 0xb0 /* TROO8 */ -#define IFT_GR303RDT 0xb1 /* Remote Digital Terminal */ -#define IFT_GR303IDT 0xb2 /* Integrated Digital Terminal */ -#define IFT_ISUP 0xb3 /* ISUP */ -#define IFT_PROPDOCSWIRELESSMACLAYER 0xb4 /* prop/Wireless MAC Layer */ -#define IFT_PROPDOCSWIRELESSDOWNSTREAM 0xb5 /* prop/Wireless Downstream */ -#define IFT_PROPDOCSWIRELESSUPSTREAM 0xb6 /* prop/Wireless Upstream */ -#define IFT_HIPERLAN2 0xb7 /* HIPERLAN Type 2 Radio Interface */ -#define IFT_PROPBWAP2MP 0xb8 /* PropBroadbandWirelessAccess P2MP*/ -#define IFT_SONETOVERHEADCHANNEL 0xb9 /* SONET Overhead Channel */ -#define IFT_DIGITALWRAPPEROVERHEADCHANNEL 0xba /* Digital Wrapper Overhead */ -#define IFT_AAL2 0xbb /* ATM adaptation layer 2 */ -#define IFT_RADIOMAC 0xbc /* MAC layer over radio links */ -#define IFT_ATMRADIO 0xbd /* ATM over 
radio links */ -#define IFT_IMT 0xbe /* Inter-Machine Trunks */ -#define IFT_MVL 0xbf /* Multiple Virtual Lines DSL */ -#define IFT_REACHDSL 0xc0 /* Long Reach DSL */ -#define IFT_FRDLCIENDPT 0xc1 /* Frame Relay DLCI End Point */ -#define IFT_ATMVCIENDPT 0xc2 /* ATM VCI End Point */ -#define IFT_OPTICALCHANNEL 0xc3 /* Optical Channel */ -#define IFT_OPTICALTRANSPORT 0xc4 /* Optical Transport */ -#define IFT_INFINIBAND 0xc7 /* Infiniband */ -#define IFT_BRIDGE 0xd1 /* Transparent bridge interface */ +typedef enum { + IFT_OTHER = 0x1, /* none of the following */ + IFT_1822 = 0x2, /* old-style arpanet imp */ + IFT_HDH1822 = 0x3, /* HDH arpanet imp */ + IFT_X25DDN = 0x4, /* x25 to imp */ + IFT_X25 = 0x5, /* PDN X25 interface (RFC877) */ + IFT_ETHER = 0x6, /* Ethernet CSMA/CD */ + IFT_ISO88023 = 0x7, /* CMSA/CD */ + IFT_ISO88024 = 0x8, /* Token Bus */ + IFT_ISO88025 = 0x9, /* Token Ring */ + IFT_ISO88026 = 0xa, /* MAN */ + IFT_STARLAN = 0xb, + IFT_P10 = 0xc, /* Proteon 10MBit ring */ + IFT_P80 = 0xd, /* Proteon 80MBit ring */ + IFT_HY = 0xe, /* Hyperchannel */ + IFT_FDDI = 0xf, + IFT_LAPB = 0x10, + IFT_SDLC = 0x11, + IFT_T1 = 0x12, + IFT_CEPT = 0x13, /* E1 - european T1 */ + IFT_ISDNBASIC = 0x14, + IFT_ISDNPRIMARY = 0x15, + IFT_PTPSERIAL = 0x16, /* Proprietary PTP serial */ + IFT_PPP = 0x17, /* RFC 1331 */ + IFT_LOOP = 0x18, /* loopback */ + IFT_EON = 0x19, /* ISO over IP */ + IFT_XETHER = 0x1a, /* obsolete 3MB experimental ethernet */ + IFT_NSIP = 0x1b, /* XNS over IP */ + IFT_SLIP = 0x1c, /* IP over generic TTY */ + IFT_ULTRA = 0x1d, /* Ultra Technologies */ + IFT_DS3 = 0x1e, /* Generic T3 */ + IFT_SIP = 0x1f, /* SMDS */ + IFT_FRELAY = 0x20, /* Frame Relay DTE only */ + IFT_RS232 = 0x21, + IFT_PARA = 0x22, /* parallel-port */ + IFT_ARCNET = 0x23, + IFT_ARCNETPLUS = 0x24, + IFT_ATM = 0x25, /* ATM cells */ + IFT_MIOX25 = 0x26, + IFT_SONET = 0x27, /* SONET or SDH */ + IFT_X25PLE = 0x28, + IFT_ISO88022LLC = 0x29, + IFT_LOCALTALK = 0x2a, + IFT_SMDSDXI = 0x2b, + 
IFT_FRELAYDCE = 0x2c, /* Frame Relay DCE */ + IFT_V35 = 0x2d, + IFT_HSSI = 0x2e, + IFT_HIPPI = 0x2f, + IFT_MODEM = 0x30, /* Generic Modem */ + IFT_AAL5 = 0x31, /* AAL5 over ATM */ + IFT_SONETPATH = 0x32, + IFT_SONETVT = 0x33, + IFT_SMDSICIP = 0x34, /* SMDS InterCarrier Interface */ + IFT_PROPVIRTUAL = 0x35, /* Proprietary Virtual/internal */ + IFT_PROPMUX = 0x36, /* Proprietary Multiplexing */ + IFT_IEEE80212 = 0x37, /* 100BaseVG */ + IFT_FIBRECHANNEL = 0x38, /* Fibre Channel */ + IFT_HIPPIINTERFACE = 0x39, /* HIPPI interfaces */ + IFT_FRAMERELAYINTERCONNECT = 0x3a, /* Obsolete, use 0x20 either 0x2c */ + IFT_AFLANE8023 = 0x3b, /* ATM Emulated LAN for 802.3 */ + IFT_AFLANE8025 = 0x3c, /* ATM Emulated LAN for 802.5 */ + IFT_CCTEMUL = 0x3d, /* ATM Emulated circuit */ + IFT_FASTETHER = 0x3e, /* Fast Ethernet (100BaseT) */ + IFT_ISDN = 0x3f, /* ISDN and X.25 */ + IFT_V11 = 0x40, /* CCITT V.11/X.21 */ + IFT_V36 = 0x41, /* CCITT V.36 */ + IFT_G703AT64K = 0x42, /* CCITT G703 at 64Kbps */ + IFT_G703AT2MB = 0x43, /* Obsolete see DS1-MIB */ + IFT_QLLC = 0x44, /* SNA QLLC */ + IFT_FASTETHERFX = 0x45, /* Fast Ethernet (100BaseFX) */ + IFT_CHANNEL = 0x46, /* channel */ + IFT_IEEE80211 = 0x47, /* radio spread spectrum */ + IFT_IBM370PARCHAN = 0x48, /* IBM System 360/370 OEMI Channel */ + IFT_ESCON = 0x49, /* IBM Enterprise Systems Connection */ + IFT_DLSW = 0x4a, /* Data Link Switching */ + IFT_ISDNS = 0x4b, /* ISDN S/T interface */ + IFT_ISDNU = 0x4c, /* ISDN U interface */ + IFT_LAPD = 0x4d, /* Link Access Protocol D */ + IFT_IPSWITCH = 0x4e, /* IP Switching Objects */ + IFT_RSRB = 0x4f, /* Remote Source Route Bridging */ + IFT_ATMLOGICAL = 0x50, /* ATM Logical Port */ + IFT_DS0 = 0x51, /* Digital Signal Level 0 */ + IFT_DS0BUNDLE = 0x52, /* group of ds0s on the same ds1 */ + IFT_BSC = 0x53, /* Bisynchronous Protocol */ + IFT_ASYNC = 0x54, /* Asynchronous Protocol */ + IFT_CNR = 0x55, /* Combat Net Radio */ + IFT_ISO88025DTR = 0x56, /* ISO 802.5r DTR */ + IFT_EPLRS = 0x57, /* 
Ext Pos Loc Report Sys */ + IFT_ARAP = 0x58, /* Appletalk Remote Access Protocol */ + IFT_PROPCNLS = 0x59, /* Proprietary Connectionless Protocol*/ + IFT_HOSTPAD = 0x5a, /* CCITT-ITU X.29 PAD Protocol */ + IFT_TERMPAD = 0x5b, /* CCITT-ITU X.3 PAD Facility */ + IFT_FRAMERELAYMPI = 0x5c, /* Multiproto Interconnect over FR */ + IFT_X213 = 0x5d, /* CCITT-ITU X213 */ + IFT_ADSL = 0x5e, /* Asymmetric Digital Subscriber Loop */ + IFT_RADSL = 0x5f, /* Rate-Adapt. Digital Subscriber Loop*/ + IFT_SDSL = 0x60, /* Symmetric Digital Subscriber Loop */ + IFT_VDSL = 0x61, /* Very H-Speed Digital Subscrib. Loop*/ + IFT_ISO88025CRFPINT = 0x62, /* ISO 802.5 CRFP */ + IFT_MYRINET = 0x63, /* Myricom Myrinet */ + IFT_VOICEEM = 0x64, /* voice recEive and transMit */ + IFT_VOICEFXO = 0x65, /* voice Foreign Exchange Office */ + IFT_VOICEFXS = 0x66, /* voice Foreign Exchange Station */ + IFT_VOICEENCAP = 0x67, /* voice encapsulation */ + IFT_VOICEOVERIP = 0x68, /* voice over IP encapsulation */ + IFT_ATMDXI = 0x69, /* ATM DXI */ + IFT_ATMFUNI = 0x6a, /* ATM FUNI */ + IFT_ATMIMA = 0x6b, /* ATM IMA */ + IFT_PPPMULTILINKBUNDLE = 0x6c, /* PPP Multilink Bundle */ + IFT_IPOVERCDLC = 0x6d, /* IBM ipOverCdlc */ + IFT_IPOVERCLAW = 0x6e, /* IBM Common Link Access to Workstn */ + IFT_STACKTOSTACK = 0x6f, /* IBM stackToStack */ + IFT_VIRTUALIPADDRESS = 0x70, /* IBM VIPA */ + IFT_MPC = 0x71, /* IBM multi-protocol channel support */ + IFT_IPOVERATM = 0x72, /* IBM ipOverAtm */ + IFT_ISO88025FIBER = 0x73, /* ISO 802.5j Fiber Token Ring */ + IFT_TDLC = 0x74, /* IBM twinaxial data link control */ + IFT_GIGABITETHERNET = 0x75, /* Gigabit Ethernet */ + IFT_HDLC = 0x76, /* HDLC */ + IFT_LAPF = 0x77, /* LAP F */ + IFT_V37 = 0x78, /* V.37 */ + IFT_X25MLP = 0x79, /* Multi-Link Protocol */ + IFT_X25HUNTGROUP = 0x7a, /* X25 Hunt Group */ + IFT_TRANSPHDLC = 0x7b, /* Transp HDLC */ + IFT_INTERLEAVE = 0x7c, /* Interleave channel */ + IFT_FAST = 0x7d, /* Fast channel */ + IFT_IP = 0x7e, /* IP (for APPN HPR in IP 
networks) */ + IFT_DOCSCABLEMACLAYER = 0x7f, /* CATV Mac Layer */ + IFT_DOCSCABLEDOWNSTREAM = 0x80, /* CATV Downstream interface */ + IFT_DOCSCABLEUPSTREAM = 0x81, /* CATV Upstream interface */ + IFT_A12MPPSWITCH = 0x82, /* Avalon Parallel Processor */ + IFT_TUNNEL = 0x83, /* Encapsulation interface */ + IFT_COFFEE = 0x84, /* coffee pot */ + IFT_CES = 0x85, /* Circiut Emulation Service */ + IFT_ATMSUBINTERFACE = 0x86, /* (x) ATM Sub Interface */ + IFT_L2VLAN = 0x87, /* Layer 2 Virtual LAN using 802.1Q */ + IFT_L3IPVLAN = 0x88, /* Layer 3 Virtual LAN - IP Protocol */ + IFT_L3IPXVLAN = 0x89, /* Layer 3 Virtual LAN - IPX Prot. */ + IFT_DIGITALPOWERLINE = 0x8a, /* IP over Power Lines */ + IFT_MEDIAMAILOVERIP = 0x8b, /* (xxx) Multimedia Mail over IP */ + IFT_DTM = 0x8c, /* Dynamic synchronous Transfer Mode */ + IFT_DCN = 0x8d, /* Data Communications Network */ + IFT_IPFORWARD = 0x8e, /* IP Forwarding Interface */ + IFT_MSDSL = 0x8f, /* Multi-rate Symmetric DSL */ + IFT_IEEE1394 = 0x90, /* IEEE1394 High Performance SerialBus*/ + IFT_IFGSN = 0x91, /* HIPPI-6400 */ + IFT_DVBRCCMACLAYER = 0x92, /* DVB-RCC MAC Layer */ + IFT_DVBRCCDOWNSTREAM = 0x93, /* DVB-RCC Downstream Channel */ + IFT_DVBRCCUPSTREAM = 0x94, /* DVB-RCC Upstream Channel */ + IFT_ATMVIRTUAL = 0x95, /* ATM Virtual Interface */ + IFT_MPLSTUNNEL = 0x96, /* MPLS Tunnel Virtual Interface */ + IFT_SRP = 0x97, /* Spatial Reuse Protocol */ + IFT_VOICEOVERATM = 0x98, /* Voice over ATM */ + IFT_VOICEOVERFRAMERELAY = 0x99, /* Voice Over Frame Relay */ + IFT_IDSL = 0x9a, /* Digital Subscriber Loop over ISDN */ + IFT_COMPOSITELINK = 0x9b, /* Avici Composite Link Interface */ + IFT_SS7SIGLINK = 0x9c, /* SS7 Signaling Link */ + IFT_PROPWIRELESSP2P = 0x9d, /* Prop. 
P2P wireless interface */ + IFT_FRFORWARD = 0x9e, /* Frame forward Interface */ + IFT_RFC1483 = 0x9f, /* Multiprotocol over ATM AAL5 */ + IFT_USB = 0xa0, /* USB Interface */ + IFT_IEEE8023ADLAG = 0xa1, /* IEEE 802.3ad Link Aggregate*/ + IFT_BGPPOLICYACCOUNTING = 0xa2, /* BGP Policy Accounting */ + IFT_FRF16MFRBUNDLE = 0xa3, /* FRF.16 Multilik Frame Relay*/ + IFT_H323GATEKEEPER = 0xa4, /* H323 Gatekeeper */ + IFT_H323PROXY = 0xa5, /* H323 Voice and Video Proxy */ + IFT_MPLS = 0xa6, /* MPLS */ + IFT_MFSIGLINK = 0xa7, /* Multi-frequency signaling link */ + IFT_HDSL2 = 0xa8, /* High Bit-Rate DSL, 2nd gen. */ + IFT_SHDSL = 0xa9, /* Multirate HDSL2 */ + IFT_DS1FDL = 0xaa, /* Facility Data Link (4Kbps) on a DS1*/ + IFT_POS = 0xab, /* Packet over SONET/SDH Interface */ + IFT_DVBASILN = 0xac, /* DVB-ASI Input */ + IFT_DVBASIOUT = 0xad, /* DVB-ASI Output */ + IFT_PLC = 0xae, /* Power Line Communications */ + IFT_NFAS = 0xaf, /* Non-Facility Associated Signaling */ + IFT_TR008 = 0xb0, /* TROO8 */ + IFT_GR303RDT = 0xb1, /* Remote Digital Terminal */ + IFT_GR303IDT = 0xb2, /* Integrated Digital Terminal */ + IFT_ISUP = 0xb3, /* ISUP */ + IFT_PROPDOCSWIRELESSMACLAYER = 0xb4, /* prop/Wireless MAC Layer */ + IFT_PROPDOCSWIRELESSDOWNSTREAM = 0xb5, /* prop/Wireless Downstream */ + IFT_PROPDOCSWIRELESSUPSTREAM = 0xb6, /* prop/Wireless Upstream */ + IFT_HIPERLAN2 = 0xb7, /* HIPERLAN Type 2 Radio Interface */ + IFT_PROPBWAP2MP = 0xb8, /* PropBroadbandWirelessAccess P2MP*/ + IFT_SONETOVERHEADCHANNEL = 0xb9, /* SONET Overhead Channel */ + IFT_DIGITALWRAPPEROVERHEADCHANNEL = 0xba, /* Digital Wrapper Overhead */ + IFT_AAL2 = 0xbb, /* ATM adaptation layer 2 */ + IFT_RADIOMAC = 0xbc, /* MAC layer over radio links */ + IFT_ATMRADIO = 0xbd, /* ATM over radio links */ + IFT_IMT = 0xbe, /* Inter-Machine Trunks */ + IFT_MVL = 0xbf, /* Multiple Virtual Lines DSL */ + IFT_REACHDSL = 0xc0, /* Long Reach DSL */ + IFT_FRDLCIENDPT = 0xc1, /* Frame Relay DLCI End Point */ + IFT_ATMVCIENDPT = 0xc2, /* 
ATM VCI End Point */ + IFT_OPTICALCHANNEL = 0xc3, /* Optical Channel */ + IFT_OPTICALTRANSPORT = 0xc4, /* Optical Transport */ + IFT_INFINIBAND = 0xc7, /* Infiniband */ + IFT_BRIDGE = 0xd1, /* Transparent bridge interface */ + IFT_STF = 0xd7, /* 6to4 interface */ -#define IFT_STF 0xd7 /* 6to4 interface */ + /* + * Not based on IANA assignments. Conflicting with IANA assignments. + * We should make them negative probably. + * This requires changes to struct if_data. + */ + IFT_GIF = 0xf0, /* Generic tunnel interface */ + IFT_PVC = 0xf1, /* Unused */ + IFT_ENC = 0xf4, /* Encapsulating interface */ + IFT_PFLOG = 0xf6, /* PF packet filter logging */ + IFT_PFSYNC = 0xf7, /* PF packet filter synchronization */ +} ifType; + +/* + * Some (broken) software uses #ifdef IFT_TYPE to check whether + * an operating systems supports certain interface type. Lack of + * ifdef leads to a piece of functionality compiled out. + */ +#ifndef BURN_BRIDGES +#define IFT_BRIDGE IFT_BRIDGE +#define IFT_PPP IFT_PPP +#define IFT_PROPVIRTUAL IFT_PROPVIRTUAL +#define IFT_L2VLAN IFT_L2VLAN +#define IFT_L3IPVLAN IFT_L3IPVLAN +#define IFT_IEEE1394 IFT_IEEE1394 +#define IFT_INFINIBAND IFT_INFINIBAND +#endif -/* not based on IANA assignments */ -#define IFT_GIF 0xf0 -#define IFT_PVC 0xf1 -#define IFT_FAITH 0xf2 -#define IFT_ENC 0xf4 -#define IFT_PFLOG 0xf6 -#define IFT_PFSYNC 0xf7 -#define IFT_CARP 0xf8 /* Common Address Redundancy Protocol */ -#define IFT_IPXIP 0xf9 /* IPX over IP tunneling; no longer used. */ #endif /* !_NET_IF_TYPES_H_ */ diff --git a/freebsd/sys/net/if_var.h b/freebsd/sys/net/if_var.h index ee4db195..ec3719d4 100644 --- a/freebsd/sys/net/if_var.h +++ b/freebsd/sys/net/if_var.h @@ -58,58 +58,75 @@ * interfaces. These routines live in the files if.c and route.c */ -#ifdef __STDC__ -/* - * Forward structure declarations for function prototypes [sic]. 
- */ -struct mbuf; -struct thread; -struct rtentry; -struct rt_addrinfo; +struct rtentry; /* ifa_rtrequest */ +struct rt_addrinfo; /* ifa_rtrequest */ struct socket; -struct ether_header; struct carp_if; +struct carp_softc; struct ifvlantrunk; -struct route; +struct route; /* if_output */ struct vnet; -#endif - -#include <sys/queue.h> /* get TAILQ macros */ +struct ifmedia; +struct netmap_adapter; #ifdef _KERNEL -#include <sys/mbuf.h> -#include <sys/eventhandler.h> +#include <sys/mbuf.h> /* ifqueue only? */ #include <sys/buf_ring.h> #include <net/vnet.h> #endif /* _KERNEL */ +#include <sys/counter.h> #include <rtems/bsd/sys/lock.h> /* XXX */ -#include <sys/mutex.h> /* XXX */ +#include <sys/mutex.h> /* struct ifqueue */ #include <sys/rwlock.h> /* XXX */ #include <sys/sx.h> /* XXX */ -#include <sys/event.h> /* XXX */ -#include <sys/_task.h> +#include <sys/_task.h> /* if_link_task */ #define IF_DUNIT_NONE -1 -#include <altq/if_altq.h> +#include <net/altq/if_altq.h> TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ TAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */ -TAILQ_HEAD(ifprefixhead, ifprefix); TAILQ_HEAD(ifmultihead, ifmultiaddr); TAILQ_HEAD(ifgrouphead, ifg_group); -/* - * Structure defining a queue for a network interface. 
- */ -struct ifqueue { - struct mbuf *ifq_head; - struct mbuf *ifq_tail; - int ifq_len; - int ifq_maxlen; - int ifq_drops; - struct mtx ifq_mtx; -}; +#ifdef _KERNEL +VNET_DECLARE(struct pfil_head, link_pfil_hook); /* packet filter hooks */ +#define V_link_pfil_hook VNET(link_pfil_hook) + +#define HHOOK_IPSEC_INET 0 +#define HHOOK_IPSEC_INET6 1 +#define HHOOK_IPSEC_COUNT 2 +VNET_DECLARE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); +VNET_DECLARE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); +#define V_ipsec_hhh_in VNET(ipsec_hhh_in) +#define V_ipsec_hhh_out VNET(ipsec_hhh_out) +#endif /* _KERNEL */ + +typedef enum { + IFCOUNTER_IPACKETS = 0, + IFCOUNTER_IERRORS, + IFCOUNTER_OPACKETS, + IFCOUNTER_OERRORS, + IFCOUNTER_COLLISIONS, + IFCOUNTER_IBYTES, + IFCOUNTER_OBYTES, + IFCOUNTER_IMCASTS, + IFCOUNTER_OMCASTS, + IFCOUNTER_IQDROPS, + IFCOUNTER_OQDROPS, + IFCOUNTER_NOPROTO, + IFCOUNTERS /* Array size. */ +} ift_counter; + +typedef struct ifnet * if_t; + +typedef void (*if_start_fn_t)(if_t); +typedef int (*if_ioctl_fn_t)(if_t, u_long, caddr_t); +typedef void (*if_init_fn_t)(void *); +typedef void (*if_qflush_fn_t)(if_t); +typedef int (*if_transmit_fn_t)(if_t, struct mbuf *); +typedef uint64_t (*if_get_counter_t)(if_t, ift_counter); struct ifnet_hw_tsomax { u_int tsomaxbytes; /* TSO total burst length limit in bytes */ @@ -117,22 +134,99 @@ struct ifnet_hw_tsomax { u_int tsomaxsegsize; /* TSO maximum segment size in bytes */ }; +/* Interface encap request types */ +typedef enum { + IFENCAP_LL = 1 /* pre-calculate link-layer header */ +} ife_type; + /* - * Structure defining a network interface. + * The structure below allows to request various pre-calculated L2/L3 headers + * for different media. Requests varies by type (rtype field). + * + * IFENCAP_LL type: pre-calculates link header based on address family + * and destination lladdr. * - * (Would like to call this struct ``if'', but C isn't PL/1.) 
+ * Input data fields: + * buf: pointer to destination buffer + * bufsize: buffer size + * flags: IFENCAP_FLAG_BROADCAST if destination is broadcast + * family: address family defined by AF_ constant. + * lladdr: pointer to link-layer address + * lladdr_len: length of link-layer address + * hdata: pointer to L3 header (optional, used for ARP requests). + * Output data fields: + * buf: encap data is stored here + * bufsize: resulting encap length is stored here + * lladdr_off: offset of link-layer address from encap hdr start + * hdata: L3 header may be altered if necessary */ +struct if_encap_req { + u_char *buf; /* Destination buffer (w) */ + size_t bufsize; /* size of provided buffer (r) */ + ife_type rtype; /* request type (r) */ + uint32_t flags; /* Request flags (r) */ + int family; /* Address family AF_* (r) */ + int lladdr_off; /* offset from header start (w) */ + int lladdr_len; /* lladdr length (r) */ + char *lladdr; /* link-level address pointer (r) */ + char *hdata; /* Upper layer header data (rw) */ +}; + +#define IFENCAP_FLAG_BROADCAST 0x02 /* Destination is broadcast */ + + +/* + * Structure defining a network interface. + * + * Size ILP32: 592 (approx) + * LP64: 1048 (approx) + */ struct ifnet { + /* General book keeping of interface lists. */ + TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */ + LIST_ENTRY(ifnet) if_clones; /* interfaces of a cloner */ + TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */ + /* protected by if_addr_lock */ + u_char if_alloctype; /* if_type at time of allocation */ + + /* Driver and protocol specific information that remains stable. 
*/ void *if_softc; /* pointer to driver state */ + void *if_llsoftc; /* link layer softc */ void *if_l2com; /* pointer to protocol bits */ - struct vnet *if_vnet; /* pointer to network stack instance */ - TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */ - char if_xname[IFNAMSIZ]; /* external name (name + unit) */ const char *if_dname; /* driver name */ int if_dunit; /* unit or IF_DUNIT_NONE */ + u_short if_index; /* numeric abbreviation for this if */ + short if_index_reserved; /* spare space to grow if_index */ + char if_xname[IFNAMSIZ]; /* external name (name + unit) */ + char *if_description; /* interface description */ + + /* Variable fields that are touched by the stack and drivers. */ + int if_flags; /* up/down, broadcast, etc. */ + int if_drv_flags; /* driver-managed status flags */ + int if_capabilities; /* interface features & capabilities */ + int if_capenable; /* enabled features & capabilities */ + void *if_linkmib; /* link-type-specific MIB data */ + size_t if_linkmiblen; /* length of above data */ u_int if_refcount; /* reference count */ - struct ifaddrhead if_addrhead; /* linked list of addresses per if */ + + /* These fields are shared with struct if_data. */ + uint8_t if_type; /* ethernet, tokenring, etc */ + uint8_t if_addrlen; /* media address length */ + uint8_t if_hdrlen; /* media header length */ + uint8_t if_link_state; /* current link state */ + uint32_t if_mtu; /* maximum transmission unit */ + uint32_t if_metric; /* routing metric (external only) */ + uint64_t if_baudrate; /* linespeed */ + uint64_t if_hwassist; /* HW offload capabilities, see IFCAP */ + time_t if_epoch; /* uptime at attach or stat reset */ + struct timeval if_lastchange; /* time of last administrative change */ + + struct ifaltq if_snd; /* output queue (includes altq) */ + struct task if_linktask; /* task for link change events */ + + /* Addresses of different protocol families assigned to this if. 
*/ + struct rwlock if_addr_lock; /* lock to protect address lists */ /* * if_addrhead is the list of all addresses associated to * an interface. @@ -143,74 +237,53 @@ struct ifnet { * However, access to the AF_LINK address through this * field is deprecated. Use if_addr or ifaddr_byindex() instead. */ - int if_pcount; /* number of promiscuous listeners */ - struct carp_if *if_carp; /* carp interface structure */ - struct bpf_if *if_bpf; /* packet filter structure */ - u_short if_index; /* numeric abbreviation for this if */ - short if_index_reserved; /* spare space to grow if_index */ - struct ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */ - int if_flags; /* up/down, broadcast, etc. */ - int if_capabilities; /* interface features & capabilities */ - int if_capenable; /* enabled features & capabilities */ - void *if_linkmib; /* link-type-specific MIB data */ - size_t if_linkmiblen; /* length of above data */ - struct if_data if_data; + struct ifaddrhead if_addrhead; /* linked list of addresses per if */ struct ifmultihead if_multiaddrs; /* multicast addresses configured */ int if_amcount; /* number of all-multicast requests */ -/* procedure handles */ + struct ifaddr *if_addr; /* pointer to link-level address */ + const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */ + struct rwlock if_afdata_lock; + void *if_afdata[AF_MAX]; + int if_afdata_initialized; + + /* Additional features hung off the interface. 
*/ + u_int if_fib; /* interface FIB */ + struct vnet *if_vnet; /* pointer to network stack instance */ + struct vnet *if_home_vnet; /* where this ifnet originates from */ + struct ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */ + struct bpf_if *if_bpf; /* packet filter structure */ + int if_pcount; /* number of promiscuous listeners */ + void *if_bridge; /* bridge glue */ + void *if_lagg; /* lagg glue */ + void *if_pf_kif; /* pf glue */ + struct carp_if *if_carp; /* carp interface structure */ + struct label *if_label; /* interface MAC label */ + struct netmap_adapter *if_netmap; /* netmap(4) softc */ + + /* Various procedures of the layer2 encapsulation and drivers. */ int (*if_output) /* output routine (enqueue) */ - (struct ifnet *, struct mbuf *, struct sockaddr *, + (struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); void (*if_input) /* input routine (from h/w driver) */ (struct ifnet *, struct mbuf *); - void (*if_start) /* initiate output routine */ - (struct ifnet *); - int (*if_ioctl) /* ioctl routine */ - (struct ifnet *, u_long, caddr_t); - void (*if_init) /* Init routine */ - (void *); + if_start_fn_t if_start; /* initiate output routine */ + if_ioctl_fn_t if_ioctl; /* ioctl routine */ + if_init_fn_t if_init; /* Init routine */ int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); - void (*if_qflush) /* flush any queues */ - (struct ifnet *); - int (*if_transmit) /* initiate output routine */ - (struct ifnet *, struct mbuf *); + if_qflush_fn_t if_qflush; /* flush any queue */ + if_transmit_fn_t if_transmit; /* initiate output routine */ + void (*if_reassign) /* reassign to vnet routine */ (struct ifnet *, struct vnet *, char *); - struct vnet *if_home_vnet; /* where this ifnet originates from */ - struct ifaddr *if_addr; /* pointer to link-level address */ - void *if_llsoftc; /* link layer softc */ - int if_drv_flags; /* driver-managed status flags */ - struct 
ifaltq if_snd; /* output queue (includes altq) */ - const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */ + if_get_counter_t if_get_counter; /* get counter values */ + int (*if_requestencap) /* make link header from request */ + (struct ifnet *, struct if_encap_req *); - void *if_bridge; /* bridge glue */ + /* Statistics. */ + counter_u64_t if_counters[IFCOUNTERS]; - struct label *if_label; /* interface MAC label */ - - /* these are only used by IPv6 */ - struct ifprefixhead if_prefixhead; /* list of prefixes per if */ - void *if_afdata[AF_MAX]; - int if_afdata_initialized; - struct rwlock if_afdata_lock; - struct task if_linktask; /* task for link change events */ - struct mtx if_addr_mtx; /* mutex to protect address lists */ - - LIST_ENTRY(ifnet) if_clones; /* interfaces of a cloner */ - TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */ - /* protected by if_addr_mtx */ - void *if_pf_kif; - void *if_lagg; /* lagg glue */ - char *if_description; /* interface description */ - u_int if_fib; /* interface FIB */ - u_char if_alloctype; /* if_type at time of allocation */ - - /* - * Spare fields are added so that we can modify sensitive data - * structures without changing the kernel binary interface, and must - * be used with care where binary compatibility is required. - */ - char if_cspare[3]; + /* Stuff that's only temporary and doesn't belong here. */ /* * Network adapter TSO limits: @@ -222,50 +295,25 @@ struct ifnet { * count limit does not apply. If all three fields are zero, * there is no TSO limit. * - * NOTE: The TSO limits only apply to the data payload part of - * a TCP/IP packet. That means there is no need to subtract - * space for ethernet-, vlan-, IP- or TCP- headers from the - * TSO limits unless the hardware driver in question requires - * so. - */ - u_int if_hw_tsomax; - int if_ispare[1]; - /* - * TSO fields for segment limits. 
If a field is zero below, - * there is no limit: + * NOTE: The TSO limits should reflect the values used in the + * BUSDMA tag a network adapter is using to load a mbuf chain + * for transmission. The TCP/IP network stack will subtract + * space for all linklevel and protocol level headers and + * ensure that the full mbuf chain passed to the network + * adapter fits within the given limits. */ + u_int if_hw_tsomax; /* TSO maximum size in bytes */ u_int if_hw_tsomaxsegcount; /* TSO maximum segment count */ u_int if_hw_tsomaxsegsize; /* TSO maximum segment size in bytes */ - void *if_pspare[8]; /* 1 netmap, 7 TDB */ -}; - -typedef void if_init_f_t(void *); -/* - * XXX These aliases are terribly dangerous because they could apply - * to anything. - */ -#define if_mtu if_data.ifi_mtu -#define if_type if_data.ifi_type -#define if_physical if_data.ifi_physical -#define if_addrlen if_data.ifi_addrlen -#define if_hdrlen if_data.ifi_hdrlen -#define if_metric if_data.ifi_metric -#define if_link_state if_data.ifi_link_state -#define if_baudrate if_data.ifi_baudrate -#define if_hwassist if_data.ifi_hwassist -#define if_ipackets if_data.ifi_ipackets -#define if_ierrors if_data.ifi_ierrors -#define if_opackets if_data.ifi_opackets -#define if_oerrors if_data.ifi_oerrors -#define if_collisions if_data.ifi_collisions -#define if_ibytes if_data.ifi_ibytes -#define if_obytes if_data.ifi_obytes -#define if_imcasts if_data.ifi_imcasts -#define if_omcasts if_data.ifi_omcasts -#define if_iqdrops if_data.ifi_iqdrops -#define if_noproto if_data.ifi_noproto -#define if_lastchange if_data.ifi_lastchange + /* + * Spare fields to be added before branching a stable branch, so + * that structure can be enhanced without changing the kernel + * binary interface. 
+ */ + void *if_pspare[4]; /* packet pacing / general use */ + int if_ispare[4]; /* packet pacing / general use */ +}; /* for compatibility with other BSDs */ #define if_addrlist if_addrhead @@ -275,18 +323,14 @@ typedef void if_init_f_t(void *); /* * Locks for address lists on the network interface. */ -#define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_mtx, \ - "if_addr_mtx", NULL, MTX_DEF) -#define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_mtx) -#define IF_ADDR_WLOCK(if) mtx_lock(&(if)->if_addr_mtx) -#define IF_ADDR_WUNLOCK(if) mtx_unlock(&(if)->if_addr_mtx) -#define IF_ADDR_RLOCK(if) mtx_lock(&(if)->if_addr_mtx) -#define IF_ADDR_RUNLOCK(if) mtx_unlock(&(if)->if_addr_mtx) -#define IF_ADDR_LOCK_ASSERT(if) mtx_assert(&(if)->if_addr_mtx, MA_OWNED) -#define IF_ADDR_WLOCK_ASSERT(if) mtx_assert(&(if)->if_addr_mtx, MA_OWNED) -/* XXX: Compat. */ -#define IF_ADDR_LOCK(if) IF_ADDR_WLOCK(if) -#define IF_ADDR_UNLOCK(if) IF_ADDR_WUNLOCK(if) +#define IF_ADDR_LOCK_INIT(if) rw_init(&(if)->if_addr_lock, "if_addr_lock") +#define IF_ADDR_LOCK_DESTROY(if) rw_destroy(&(if)->if_addr_lock) +#define IF_ADDR_WLOCK(if) rw_wlock(&(if)->if_addr_lock) +#define IF_ADDR_WUNLOCK(if) rw_wunlock(&(if)->if_addr_lock) +#define IF_ADDR_RLOCK(if) rw_rlock(&(if)->if_addr_lock) +#define IF_ADDR_RUNLOCK(if) rw_runlock(&(if)->if_addr_lock) +#define IF_ADDR_LOCK_ASSERT(if) rw_assert(&(if)->if_addr_lock, RA_LOCKED) +#define IF_ADDR_WLOCK_ASSERT(if) rw_assert(&(if)->if_addr_lock, RA_WLOCKED) /* * Function variations on locking macros intended to be used by loadable @@ -295,100 +339,11 @@ typedef void if_init_f_t(void *); */ void if_addr_rlock(struct ifnet *ifp); /* if_addrhead */ void if_addr_runlock(struct ifnet *ifp); /* if_addrhead */ -void if_maddr_rlock(struct ifnet *ifp); /* if_multiaddrs */ -void if_maddr_runlock(struct ifnet *ifp); /* if_multiaddrs */ - -/* - * Output queues (ifp->if_snd) and slow device input queues (*ifp->if_slowq) - * are queues of messages stored on ifqueue 
structures - * (defined above). Entries are added to and deleted from these structures - * by these macros, which should be called with ipl raised to splimp(). - */ -#define IF_LOCK(ifq) mtx_lock(&(ifq)->ifq_mtx) -#define IF_UNLOCK(ifq) mtx_unlock(&(ifq)->ifq_mtx) -#define IF_LOCK_ASSERT(ifq) mtx_assert(&(ifq)->ifq_mtx, MA_OWNED) -#define _IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen) -#define _IF_DROP(ifq) ((ifq)->ifq_drops++) -#define _IF_QLEN(ifq) ((ifq)->ifq_len) - -#define _IF_ENQUEUE(ifq, m) do { \ - (m)->m_nextpkt = NULL; \ - if ((ifq)->ifq_tail == NULL) \ - (ifq)->ifq_head = m; \ - else \ - (ifq)->ifq_tail->m_nextpkt = m; \ - (ifq)->ifq_tail = m; \ - (ifq)->ifq_len++; \ -} while (0) - -#define IF_ENQUEUE(ifq, m) do { \ - IF_LOCK(ifq); \ - _IF_ENQUEUE(ifq, m); \ - IF_UNLOCK(ifq); \ -} while (0) - -#define _IF_PREPEND(ifq, m) do { \ - (m)->m_nextpkt = (ifq)->ifq_head; \ - if ((ifq)->ifq_tail == NULL) \ - (ifq)->ifq_tail = (m); \ - (ifq)->ifq_head = (m); \ - (ifq)->ifq_len++; \ -} while (0) - -#define IF_PREPEND(ifq, m) do { \ - IF_LOCK(ifq); \ - _IF_PREPEND(ifq, m); \ - IF_UNLOCK(ifq); \ -} while (0) - -#define _IF_DEQUEUE(ifq, m) do { \ - (m) = (ifq)->ifq_head; \ - if (m) { \ - if (((ifq)->ifq_head = (m)->m_nextpkt) == NULL) \ - (ifq)->ifq_tail = NULL; \ - (m)->m_nextpkt = NULL; \ - (ifq)->ifq_len--; \ - } \ -} while (0) - -#define IF_DEQUEUE(ifq, m) do { \ - IF_LOCK(ifq); \ - _IF_DEQUEUE(ifq, m); \ - IF_UNLOCK(ifq); \ -} while (0) - -#define _IF_DEQUEUE_ALL(ifq, m) do { \ - (m) = (ifq)->ifq_head; \ - (ifq)->ifq_head = (ifq)->ifq_tail = NULL; \ - (ifq)->ifq_len = 0; \ -} while (0) - -#define IF_DEQUEUE_ALL(ifq, m) do { \ - IF_LOCK(ifq); \ - _IF_DEQUEUE_ALL(ifq, m); \ - IF_UNLOCK(ifq); \ -} while (0) - -#define _IF_POLL(ifq, m) ((m) = (ifq)->ifq_head) -#define IF_POLL(ifq, m) _IF_POLL(ifq, m) - -#define _IF_DRAIN(ifq) do { \ - struct mbuf *m; \ - for (;;) { \ - _IF_DEQUEUE(ifq, m); \ - if (m == NULL) \ - break; \ - m_freem(m); \ - } \ -} while (0) - 
-#define IF_DRAIN(ifq) do { \ - IF_LOCK(ifq); \ - _IF_DRAIN(ifq); \ - IF_UNLOCK(ifq); \ -} while(0) +void if_maddr_rlock(if_t ifp); /* if_multiaddrs */ +void if_maddr_runlock(if_t ifp); /* if_multiaddrs */ #ifdef _KERNEL +#ifdef _SYS_EVENTHANDLER_H_ /* interface link layer address change event */ typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t); @@ -404,6 +359,7 @@ EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t); /* Interface link state change event */ typedef void (*ifnet_link_event_handler_t)(void *, struct ifnet *, int); EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t); +#endif /* _SYS_EVENTHANDLER_H_ */ /* * interface groups @@ -426,6 +382,7 @@ struct ifg_list { TAILQ_ENTRY(ifg_list) ifgl_next; }; +#ifdef _SYS_EVENTHANDLER_H_ /* group attach event */ typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t); @@ -435,6 +392,7 @@ EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t); /* group change event */ typedef void (*group_change_event_handler_t)(void *, const char *); EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t); +#endif /* _SYS_EVENTHANDLER_H_ */ #define IF_AFDATA_LOCK_INIT(ifp) \ rw_init(&(ifp)->if_afdata_lock, "if_afdata") @@ -453,331 +411,6 @@ EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t); #define IF_AFDATA_WLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_WLOCKED) #define IF_AFDATA_UNLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_UNLOCKED) -int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, - int adjust); -#define IF_HANDOFF(ifq, m, ifp) \ - if_handoff((struct ifqueue *)ifq, m, ifp, 0) -#define IF_HANDOFF_ADJ(ifq, m, ifp, adj) \ - if_handoff((struct ifqueue *)ifq, m, ifp, adj) - -void if_start(struct ifnet *); - -#define IFQ_ENQUEUE(ifq, m, 
err) \ -do { \ - IF_LOCK(ifq); \ - if (ALTQ_IS_ENABLED(ifq)) \ - ALTQ_ENQUEUE(ifq, m, NULL, err); \ - else { \ - if (_IF_QFULL(ifq)) { \ - m_freem(m); \ - (err) = ENOBUFS; \ - } else { \ - _IF_ENQUEUE(ifq, m); \ - (err) = 0; \ - } \ - } \ - if (err) \ - (ifq)->ifq_drops++; \ - IF_UNLOCK(ifq); \ -} while (0) - -#define IFQ_DEQUEUE_NOLOCK(ifq, m) \ -do { \ - if (TBR_IS_ENABLED(ifq)) \ - (m) = tbr_dequeue_ptr(ifq, ALTDQ_REMOVE); \ - else if (ALTQ_IS_ENABLED(ifq)) \ - ALTQ_DEQUEUE(ifq, m); \ - else \ - _IF_DEQUEUE(ifq, m); \ -} while (0) - -#define IFQ_DEQUEUE(ifq, m) \ -do { \ - IF_LOCK(ifq); \ - IFQ_DEQUEUE_NOLOCK(ifq, m); \ - IF_UNLOCK(ifq); \ -} while (0) - -#define IFQ_POLL_NOLOCK(ifq, m) \ -do { \ - if (TBR_IS_ENABLED(ifq)) \ - (m) = tbr_dequeue_ptr(ifq, ALTDQ_POLL); \ - else if (ALTQ_IS_ENABLED(ifq)) \ - ALTQ_POLL(ifq, m); \ - else \ - _IF_POLL(ifq, m); \ -} while (0) - -#define IFQ_POLL(ifq, m) \ -do { \ - IF_LOCK(ifq); \ - IFQ_POLL_NOLOCK(ifq, m); \ - IF_UNLOCK(ifq); \ -} while (0) - -#define IFQ_PURGE_NOLOCK(ifq) \ -do { \ - if (ALTQ_IS_ENABLED(ifq)) { \ - ALTQ_PURGE(ifq); \ - } else \ - _IF_DRAIN(ifq); \ -} while (0) - -#define IFQ_PURGE(ifq) \ -do { \ - IF_LOCK(ifq); \ - IFQ_PURGE_NOLOCK(ifq); \ - IF_UNLOCK(ifq); \ -} while (0) - -#define IFQ_SET_READY(ifq) \ - do { ((ifq)->altq_flags |= ALTQF_READY); } while (0) - -#define IFQ_LOCK(ifq) IF_LOCK(ifq) -#define IFQ_UNLOCK(ifq) IF_UNLOCK(ifq) -#define IFQ_LOCK_ASSERT(ifq) IF_LOCK_ASSERT(ifq) -#define IFQ_IS_EMPTY(ifq) ((ifq)->ifq_len == 0) -#define IFQ_INC_LEN(ifq) ((ifq)->ifq_len++) -#define IFQ_DEC_LEN(ifq) (--(ifq)->ifq_len) -#define IFQ_INC_DROPS(ifq) ((ifq)->ifq_drops++) -#define IFQ_SET_MAXLEN(ifq, len) ((ifq)->ifq_maxlen = (len)) - -/* - * The IFF_DRV_OACTIVE test should really occur in the device driver, not in - * the handoff logic, as that flag is locked by the device driver. 
- */ -#define IFQ_HANDOFF_ADJ(ifp, m, adj, err) \ -do { \ - int len; \ - short mflags; \ - \ - len = (m)->m_pkthdr.len; \ - mflags = (m)->m_flags; \ - IFQ_ENQUEUE(&(ifp)->if_snd, m, err); \ - if ((err) == 0) { \ - (ifp)->if_obytes += len + (adj); \ - if (mflags & M_MCAST) \ - (ifp)->if_omcasts++; \ - if (((ifp)->if_drv_flags & IFF_DRV_OACTIVE) == 0) \ - if_start(ifp); \ - } \ -} while (0) - -#define IFQ_HANDOFF(ifp, m, err) \ - IFQ_HANDOFF_ADJ(ifp, m, 0, err) - -#define IFQ_DRV_DEQUEUE(ifq, m) \ -do { \ - (m) = (ifq)->ifq_drv_head; \ - if (m) { \ - if (((ifq)->ifq_drv_head = (m)->m_nextpkt) == NULL) \ - (ifq)->ifq_drv_tail = NULL; \ - (m)->m_nextpkt = NULL; \ - (ifq)->ifq_drv_len--; \ - } else { \ - IFQ_LOCK(ifq); \ - IFQ_DEQUEUE_NOLOCK(ifq, m); \ - while ((ifq)->ifq_drv_len < (ifq)->ifq_drv_maxlen) { \ - struct mbuf *m0; \ - IFQ_DEQUEUE_NOLOCK(ifq, m0); \ - if (m0 == NULL) \ - break; \ - m0->m_nextpkt = NULL; \ - if ((ifq)->ifq_drv_tail == NULL) \ - (ifq)->ifq_drv_head = m0; \ - else \ - (ifq)->ifq_drv_tail->m_nextpkt = m0; \ - (ifq)->ifq_drv_tail = m0; \ - (ifq)->ifq_drv_len++; \ - } \ - IFQ_UNLOCK(ifq); \ - } \ -} while (0) - -#define IFQ_DRV_PREPEND(ifq, m) \ -do { \ - (m)->m_nextpkt = (ifq)->ifq_drv_head; \ - if ((ifq)->ifq_drv_tail == NULL) \ - (ifq)->ifq_drv_tail = (m); \ - (ifq)->ifq_drv_head = (m); \ - (ifq)->ifq_drv_len++; \ -} while (0) - -#define IFQ_DRV_IS_EMPTY(ifq) \ - (((ifq)->ifq_drv_len == 0) && ((ifq)->ifq_len == 0)) - -#define IFQ_DRV_PURGE(ifq) \ -do { \ - struct mbuf *m, *n = (ifq)->ifq_drv_head; \ - while((m = n) != NULL) { \ - n = m->m_nextpkt; \ - m_freem(m); \ - } \ - (ifq)->ifq_drv_head = (ifq)->ifq_drv_tail = NULL; \ - (ifq)->ifq_drv_len = 0; \ - IFQ_PURGE(ifq); \ -} while (0) - -#ifdef _KERNEL -static __inline int -drbr_enqueue(struct ifnet *ifp, struct buf_ring *br, struct mbuf *m) -{ - int error = 0; - -#ifdef ALTQ - if (ALTQ_IS_ENABLED(&ifp->if_snd)) { - IFQ_ENQUEUE(&ifp->if_snd, m, error); - return (error); - } -#endif - error = 
buf_ring_enqueue(br, m); - if (error) - m_freem(m); - - return (error); -} - -static __inline void -drbr_putback(struct ifnet *ifp, struct buf_ring *br, struct mbuf *new_mbuf) -{ - /* - * The top of the list needs to be swapped - * for this one. - */ -#ifdef ALTQ - if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { - /* - * Peek in altq case dequeued it - * so put it back. - */ - IFQ_DRV_PREPEND(&ifp->if_snd, new_mbuf); - return; - } -#endif - buf_ring_putback_sc(br, new_mbuf); -} - -static __inline struct mbuf * -drbr_peek(struct ifnet *ifp, struct buf_ring *br) -{ -#ifdef ALTQ - struct mbuf *m; - if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { - /* - * Pull it off like a dequeue - * since drbr_advance() does nothing - * for altq and drbr_putback() will - * use the old prepend function. - */ - IFQ_DEQUEUE(&ifp->if_snd, m); - return (m); - } -#endif - return ((struct mbuf *)buf_ring_peek(br)); -} - -static __inline void -drbr_flush(struct ifnet *ifp, struct buf_ring *br) -{ - struct mbuf *m; - -#ifdef ALTQ - if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) - IFQ_PURGE(&ifp->if_snd); -#endif - while ((m = (struct mbuf *)buf_ring_dequeue_sc(br)) != NULL) - m_freem(m); -} - -static __inline void -drbr_free(struct buf_ring *br, struct malloc_type *type) -{ - - drbr_flush(NULL, br); - buf_ring_free(br, type); -} - -static __inline struct mbuf * -drbr_dequeue(struct ifnet *ifp, struct buf_ring *br) -{ -#ifdef ALTQ - struct mbuf *m; - - if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { - IFQ_DEQUEUE(&ifp->if_snd, m); - return (m); - } -#endif - return ((struct mbuf *)buf_ring_dequeue_sc(br)); -} - -static __inline void -drbr_advance(struct ifnet *ifp, struct buf_ring *br) -{ -#ifdef ALTQ - /* Nothing to do here since peek dequeues in altq case */ - if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) - return; -#endif - return (buf_ring_advance_sc(br)); -} - - -static __inline struct mbuf * -drbr_dequeue_cond(struct ifnet *ifp, struct buf_ring *br, - int (*func) 
(struct mbuf *, void *), void *arg) -{ - struct mbuf *m; -#ifdef ALTQ - if (ALTQ_IS_ENABLED(&ifp->if_snd)) { - IFQ_LOCK(&ifp->if_snd); - IFQ_POLL_NOLOCK(&ifp->if_snd, m); - if (m != NULL && func(m, arg) == 0) { - IFQ_UNLOCK(&ifp->if_snd); - return (NULL); - } - IFQ_DEQUEUE_NOLOCK(&ifp->if_snd, m); - IFQ_UNLOCK(&ifp->if_snd); - return (m); - } -#endif - m = (struct mbuf *)buf_ring_peek(br); - if (m == NULL || func(m, arg) == 0) - return (NULL); - - return ((struct mbuf *)buf_ring_dequeue_sc(br)); -} - -static __inline int -drbr_empty(struct ifnet *ifp, struct buf_ring *br) -{ -#ifdef ALTQ - if (ALTQ_IS_ENABLED(&ifp->if_snd)) - return (IFQ_IS_EMPTY(&ifp->if_snd)); -#endif - return (buf_ring_empty(br)); -} - -static __inline int -drbr_needs_enqueue(struct ifnet *ifp, struct buf_ring *br) -{ -#ifdef ALTQ - if (ALTQ_IS_ENABLED(&ifp->if_snd)) - return (1); -#endif - return (!buf_ring_empty(br)); -} - -static __inline int -drbr_inuse(struct ifnet *ifp, struct buf_ring *br) -{ -#ifdef ALTQ - if (ALTQ_IS_ENABLED(&ifp->if_snd)) - return (ifp->if_snd.ifq_len); -#endif - return (buf_ring_count(br)); -} -#endif /* * 72 was chosen below because it is the size of a TCP/IP * header (40) + the minimum mss (32). @@ -787,8 +420,6 @@ drbr_inuse(struct ifnet *ifp, struct buf_ring *br) #define TOEDEV(ifp) ((ifp)->if_llsoftc) -#endif /* _KERNEL */ - /* * The ifaddr structure contains information about one address * of an interface. 
They are maintained by the different address families, @@ -804,46 +435,28 @@ struct ifaddr { struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ #define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ struct sockaddr *ifa_netmask; /* used to determine subnet */ - struct if_data if_data; /* not all members are meaningful */ struct ifnet *ifa_ifp; /* back-pointer to interface */ + struct carp_softc *ifa_carp; /* pointer to CARP data */ TAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ (int, struct rtentry *, struct rt_addrinfo *); u_short ifa_flags; /* mostly rt_flags for cloning */ +#define IFA_ROUTE RTF_UP /* route installed */ +#define IFA_RTSELF RTF_HOST /* loopback route to self installed */ u_int ifa_refcnt; /* references to this structure */ - int ifa_metric; /* cost of going out this interface */ - int (*ifa_claim_addr) /* check if an addr goes to this if */ - (struct ifaddr *, struct sockaddr *); - struct mtx ifa_mtx; + + counter_u64_t ifa_ipackets; + counter_u64_t ifa_opackets; + counter_u64_t ifa_ibytes; + counter_u64_t ifa_obytes; }; -#define IFA_ROUTE RTF_UP /* route installed */ -#define IFA_RTSELF RTF_HOST /* loopback route to self installed */ -/* for compatibility with other BSDs */ +/* For compatibility with other BSDs. SCTP uses it. */ #define ifa_list ifa_link -#ifdef _KERNEL -#define IFA_LOCK(ifa) mtx_lock(&(ifa)->ifa_mtx) -#define IFA_UNLOCK(ifa) mtx_unlock(&(ifa)->ifa_mtx) - +struct ifaddr * ifa_alloc(size_t size, int flags); void ifa_free(struct ifaddr *ifa); -void ifa_init(struct ifaddr *ifa); void ifa_ref(struct ifaddr *ifa); -#endif - -/* - * The prefix structure contains information about one prefix - * of an interface. They are maintained by the different address families, - * are allocated and attached when a prefix or an address is set, - * and are linked together so all prefixes for an interface can be located. 
- */ -struct ifprefix { - struct sockaddr *ifpr_prefix; /* prefix of interface */ - struct ifnet *ifpr_ifp; /* back-pointer to interface */ - TAILQ_ENTRY(ifprefix) ifpr_list; /* queue macro glue */ - u_char ifpr_plen; /* prefix length in bits */ - u_char ifpr_type; /* protocol dependent prefix type */ -}; /* * Multicast address structure. This is analogous to the ifaddr @@ -859,16 +472,9 @@ struct ifmultiaddr { struct ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */ }; -#ifdef _KERNEL - extern struct rwlock ifnet_rwlock; extern struct sx ifnet_sxlock; -#define IFNET_LOCK_INIT() do { \ - rw_init_flags(&ifnet_rwlock, "ifnet_rw", RW_RECURSE); \ - sx_init_flags(&ifnet_sxlock, "ifnet_sx", SX_RECURSE); \ -} while(0) - #define IFNET_WLOCK() do { \ sx_xlock(&ifnet_sxlock); \ rw_wlock(&ifnet_rwlock); \ @@ -915,15 +521,11 @@ VNET_DECLARE(struct ifnethead, ifnet); VNET_DECLARE(struct ifgrouphead, ifg_head); VNET_DECLARE(int, if_index); VNET_DECLARE(struct ifnet *, loif); /* first loopback interface */ -VNET_DECLARE(int, useloopback); #define V_ifnet VNET(ifnet) #define V_ifg_head VNET(ifg_head) #define V_if_index VNET(if_index) #define V_loif VNET(loif) -#define V_useloopback VNET(useloopback) - -extern int ifqmaxlen; int if_addgroup(struct ifnet *, const char *); int if_delgroup(struct ifnet *, const char *); @@ -935,18 +537,15 @@ void if_dead(struct ifnet *); int if_delmulti(struct ifnet *, struct sockaddr *); void if_delmulti_ifma(struct ifmultiaddr *); void if_detach(struct ifnet *); -void if_vmove(struct ifnet *, struct vnet *); void if_purgeaddrs(struct ifnet *); void if_delallmulti(struct ifnet *); void if_down(struct ifnet *); struct ifmultiaddr * - if_findmulti(struct ifnet *, struct sockaddr *); + if_findmulti(struct ifnet *, const struct sockaddr *); void if_free(struct ifnet *); -void if_free_type(struct ifnet *, u_char); void if_initname(struct ifnet *, const char *, int); void if_link_state_change(struct ifnet *, int); int if_printf(struct ifnet 
*, const char *, ...) __printflike(2, 3); -void if_qflush(struct ifnet *); void if_ref(struct ifnet *); void if_rele(struct ifnet *); int if_setlladdr(struct ifnet *, const u_char *, int); @@ -956,23 +555,19 @@ int ifpromisc(struct ifnet *, int); struct ifnet *ifunit(const char *); struct ifnet *ifunit_ref(const char *); -void ifq_init(struct ifaltq *, struct ifnet *ifp); -void ifq_delete(struct ifaltq *); - int ifa_add_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_del_loopback_route(struct ifaddr *, struct sockaddr *); - -struct ifaddr *ifa_ifwithaddr(struct sockaddr *); -int ifa_ifwithaddr_check(struct sockaddr *); -struct ifaddr *ifa_ifwithbroadaddr(struct sockaddr *); -struct ifaddr *ifa_ifwithdstaddr(struct sockaddr *); -struct ifaddr *ifa_ifwithdstaddr_fib(struct sockaddr *, int); -struct ifaddr *ifa_ifwithnet(struct sockaddr *, int); -struct ifaddr *ifa_ifwithnet_fib(struct sockaddr *, int, int); -struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *); -struct ifaddr *ifa_ifwithroute_fib(int, struct sockaddr *, struct sockaddr *, u_int); - -struct ifaddr *ifaof_ifpforaddr(struct sockaddr *, struct ifnet *); +int ifa_switch_loopback_route(struct ifaddr *, struct sockaddr *); + +struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); +int ifa_ifwithaddr_check(const struct sockaddr *); +struct ifaddr *ifa_ifwithbroadaddr(const struct sockaddr *, int); +struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *, int); +struct ifaddr *ifa_ifwithnet(const struct sockaddr *, int, int); +struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, struct sockaddr *, + u_int); +struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); +int ifa_preferred(struct ifaddr *, struct ifaddr *); int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen); @@ -980,22 +575,92 @@ typedef void *if_com_alloc_t(u_char type, struct ifnet *ifp); typedef void if_com_free_t(void *com, u_char type); void 
if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f); void if_deregister_com_alloc(u_char type); +void if_data_copy(struct ifnet *, struct if_data *); +uint64_t if_get_counter_default(struct ifnet *, ift_counter); +void if_inc_counter(struct ifnet *, ift_counter, int64_t); #define IF_LLADDR(ifp) \ LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr)) +uint64_t if_setbaudrate(if_t ifp, uint64_t baudrate); +uint64_t if_getbaudrate(if_t ifp); +int if_setcapabilities(if_t ifp, int capabilities); +int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit); +int if_getcapabilities(if_t ifp); +int if_togglecapenable(if_t ifp, int togglecap); +int if_setcapenable(if_t ifp, int capenable); +int if_setcapenablebit(if_t ifp, int setcap, int clearcap); +int if_getcapenable(if_t ifp); +const char *if_getdname(if_t ifp); +int if_setdev(if_t ifp, void *dev); +int if_setdrvflagbits(if_t ifp, int if_setflags, int clear_flags); +int if_getdrvflags(if_t ifp); +int if_setdrvflags(if_t ifp, int flags); +int if_clearhwassist(if_t ifp); +int if_sethwassistbits(if_t ifp, int toset, int toclear); +int if_sethwassist(if_t ifp, int hwassist_bit); +int if_gethwassist(if_t ifp); +int if_setsoftc(if_t ifp, void *softc); +void *if_getsoftc(if_t ifp); +int if_setflags(if_t ifp, int flags); +int if_setmtu(if_t ifp, int mtu); +int if_getmtu(if_t ifp); +int if_getmtu_family(if_t ifp, int family); +int if_setflagbits(if_t ifp, int set, int clear); +int if_getflags(if_t ifp); +int if_sendq_empty(if_t ifp); +int if_setsendqready(if_t ifp); +int if_setsendqlen(if_t ifp, int tx_desc_count); +int if_input(if_t ifp, struct mbuf* sendmp); +int if_sendq_prepend(if_t ifp, struct mbuf *m); +struct mbuf *if_dequeue(if_t ifp); +int if_setifheaderlen(if_t ifp, int len); +void if_setrcvif(struct mbuf *m, if_t ifp); +void if_setvtag(struct mbuf *m, u_int16_t tag); +u_int16_t if_getvtag(struct mbuf *m); +int if_vlantrunkinuse(if_t ifp); +caddr_t if_getlladdr(if_t ifp); +void 
*if_gethandle(u_char); +void if_bpfmtap(if_t ifp, struct mbuf *m); +void if_etherbpfmtap(if_t ifp, struct mbuf *m); +void if_vlancap(if_t ifp); + +int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max); +int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max); +int if_multiaddr_count(if_t ifp, int max); + +int if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg); +int if_getamcount(if_t ifp); +struct ifaddr * if_getifaddr(if_t ifp); + +/* Functions */ +void if_setinitfn(if_t ifp, void (*)(void *)); +void if_setioctlfn(if_t ifp, int (*)(if_t, u_long, caddr_t)); +void if_setstartfn(if_t ifp, void (*)(if_t)); +void if_settransmitfn(if_t ifp, if_transmit_fn_t); +void if_setqflushfn(if_t ifp, if_qflush_fn_t); +void if_setgetcounterfn(if_t ifp, if_get_counter_t); + +/* Revisit the below. These are inline functions originally */ +int drbr_inuse_drv(if_t ifp, struct buf_ring *br); +struct mbuf* drbr_dequeue_drv(if_t ifp, struct buf_ring *br); +int drbr_needs_enqueue_drv(if_t ifp, struct buf_ring *br); +int drbr_enqueue_drv(if_t ifp, struct buf_ring *br, struct mbuf *m); + +/* TSO */ +void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *); +int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *); + #ifdef DEVICE_POLLING -enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS }; +enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS }; -typedef int poll_handler_t(struct ifnet *ifp, enum poll_cmd cmd, int count); -int ether_poll_register(poll_handler_t *h, struct ifnet *ifp); -int ether_poll_deregister(struct ifnet *ifp); +typedef int poll_handler_t(if_t ifp, enum poll_cmd cmd, int count); +int ether_poll_register(poll_handler_t *h, if_t ifp); +int ether_poll_deregister(if_t ifp); #endif /* DEVICE_POLLING */ -/* TSO */ -void if_hw_tsomax_common(struct ifnet *, struct ifnet_hw_tsomax *); -int if_hw_tsomax_update(struct ifnet *, struct ifnet_hw_tsomax *); - #endif /* _KERNEL */ +#include <net/ifq.h> /* XXXAO: temporary 
unconditional include */ + #endif /* !_NET_IF_VAR_H_ */ diff --git a/freebsd/sys/net/if_vlan.c b/freebsd/sys/net/if_vlan.c index 7d08e298..8a93565b 100644 --- a/freebsd/sys/net/if_vlan.c +++ b/freebsd/sys/net/if_vlan.c @@ -2,6 +2,10 @@ /*- * Copyright 1998 Massachusetts Institute of Technology + * Copyright 2012 ADARA Networks, Inc. + * + * Portions of this software were developed by Robert N. M. Watson under + * contract to ADARA Networks, Inc. * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby @@ -31,8 +35,7 @@ /* * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs. - * Might be extended some day to also handle IEEE 802.1p priority - * tagging. This is sort of sneaky in the implementation, since + * This is sort of sneaky in the implementation, since * we need to pretend to be enough of an Ethernet implementation * to make arp work. The way we do this is by telling everyone * that we are an Ethernet, and then catch the packets that @@ -47,12 +50,14 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_vlan.h> #include <rtems/bsd/sys/param.h> +#include <sys/eventhandler.h> #include <sys/kernel.h> #include <rtems/bsd/sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/module.h> -#include <sys/rwlock.h> +#include <sys/rmlock.h> +#include <sys/priv.h> #include <sys/queue.h> #include <sys/socket.h> #include <sys/sockio.h> @@ -63,6 +68,7 @@ __FBSDID("$FreeBSD$"); #include <net/bpf.h> #include <net/ethernet.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_clone.h> #include <net/if_dl.h> #include <net/if_types.h> @@ -74,7 +80,6 @@ __FBSDID("$FreeBSD$"); #include <netinet/if_ether.h> #endif -#define VLANNAME "vlan" #define VLAN_DEF_HWIDTH 4 #define VLAN_IFFLAGS (IFF_BROADCAST | IFF_MULTICAST) @@ -85,7 +90,7 @@ LIST_HEAD(ifvlanhead, ifvlan); struct ifvlantrunk { struct ifnet *parent; /* parent interface of this trunk */ - struct rwlock rw; + 
struct rmlock lock; #ifdef VLAN_ARRAY #define VLAN_ARRAY_SIZE (EVL_VLID_MASK + 1) struct ifvlan *vlans[VLAN_ARRAY_SIZE]; /* static table */ @@ -105,9 +110,9 @@ struct vlan_mc_entry { struct ifvlan { struct ifvlantrunk *ifv_trunk; struct ifnet *ifv_ifp; - void *ifv_cookie; #define TRUNK(ifv) ((ifv)->ifv_trunk) #define PARENT(ifv) ((ifv)->ifv_trunk->parent) + void *ifv_cookie; int ifv_pflags; /* special flags we have set on parent */ struct ifv_linkmib { int ifvm_encaplen; /* encapsulation length */ @@ -115,6 +120,8 @@ struct ifvlan { int ifvm_mintu; /* min transmission unit */ uint16_t ifvm_proto; /* encapsulation ethertype */ uint16_t ifvm_tag; /* tag to apply on packets leaving if */ + uint16_t ifvm_vid; /* VLAN ID */ + uint8_t ifvm_pcp; /* Priority Code Point (PCP). */ } ifv_mib; SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; #ifndef VLAN_ARRAY @@ -123,6 +130,8 @@ struct ifvlan { }; #define ifv_proto ifv_mib.ifvm_proto #define ifv_tag ifv_mib.ifvm_tag +#define ifv_vid ifv_mib.ifvm_vid +#define ifv_pcp ifv_mib.ifvm_pcp #define ifv_encaplen ifv_mib.ifvm_encaplen #define ifv_mtufudge ifv_mib.ifvm_mtufudge #define ifv_mintu ifv_mib.ifvm_mintu @@ -143,11 +152,22 @@ static SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW, 0, static SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW, 0, "for consistency"); -static int soft_pad = 0; -SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW, &soft_pad, 0, - "pad short frames before tagging"); +static VNET_DEFINE(int, soft_pad); +#define V_soft_pad VNET(soft_pad) +SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW | CTLFLAG_VNET, + &VNET_NAME(soft_pad), 0, "pad short frames before tagging"); + +/* + * For now, make preserving PCP via an mbuf tag optional, as it increases + * per-packet memory allocations and frees. In the future, it would be + * preferable to reuse ether_vtag for this, or similar. 
+ */ +static int vlan_mtag_pcp = 0; +SYSCTL_INT(_net_link_vlan, OID_AUTO, mtag_pcp, CTLFLAG_RW, &vlan_mtag_pcp, 0, + "Retain VLAN PCP information as packets are passed up the stack"); -static MALLOC_DEFINE(M_VLAN, VLANNAME, "802.1Q Virtual LAN Interface"); +static const char vlanname[] = "vlan"; +static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface"); static eventhandler_tag ifdetach_tag; static eventhandler_tag iflladdr_tag; @@ -156,7 +176,7 @@ static eventhandler_tag iflladdr_tag; * We have a global mutex, that is used to serialize configuration * changes and isn't used in normal packet delivery. * - * We also have a per-trunk rwlock, that is locked shared on packet + * We also have a per-trunk rmlock(9), that is locked shared on packet * processing and exclusive when configuration is changed. * * The VLAN_ARRAY substitutes the dynamic hash with a static array @@ -170,14 +190,15 @@ static struct sx ifv_lock; #define VLAN_LOCK_ASSERT() sx_assert(&ifv_lock, SA_LOCKED) #define VLAN_LOCK() sx_xlock(&ifv_lock) #define VLAN_UNLOCK() sx_xunlock(&ifv_lock) -#define TRUNK_LOCK_INIT(trunk) rw_init(&(trunk)->rw, VLANNAME) -#define TRUNK_LOCK_DESTROY(trunk) rw_destroy(&(trunk)->rw) -#define TRUNK_LOCK(trunk) rw_wlock(&(trunk)->rw) -#define TRUNK_UNLOCK(trunk) rw_wunlock(&(trunk)->rw) -#define TRUNK_LOCK_ASSERT(trunk) rw_assert(&(trunk)->rw, RA_WLOCKED) -#define TRUNK_RLOCK(trunk) rw_rlock(&(trunk)->rw) -#define TRUNK_RUNLOCK(trunk) rw_runlock(&(trunk)->rw) -#define TRUNK_LOCK_RASSERT(trunk) rw_assert(&(trunk)->rw, RA_RLOCKED) +#define TRUNK_LOCK_INIT(trunk) rm_init(&(trunk)->lock, vlanname) +#define TRUNK_LOCK_DESTROY(trunk) rm_destroy(&(trunk)->lock) +#define TRUNK_LOCK(trunk) rm_wlock(&(trunk)->lock) +#define TRUNK_UNLOCK(trunk) rm_wunlock(&(trunk)->lock) +#define TRUNK_LOCK_ASSERT(trunk) rm_assert(&(trunk)->lock, RA_WLOCKED) +#define TRUNK_RLOCK(trunk) rm_rlock(&(trunk)->lock, &tracker) +#define TRUNK_RUNLOCK(trunk) rm_runlock(&(trunk)->lock, &tracker) 
+#define TRUNK_LOCK_RASSERT(trunk) rm_assert(&(trunk)->lock, RA_RLOCKED) +#define TRUNK_LOCK_READER struct rm_priotracker tracker #ifndef VLAN_ARRAY static void vlan_inithash(struct ifvlantrunk *trunk); @@ -186,7 +207,7 @@ static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch); static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, - uint16_t tag); + uint16_t vid); #endif static void trunk_destroy(struct ifvlantrunk *trunk); @@ -206,8 +227,7 @@ static void vlan_link_state(struct ifnet *ifp); static void vlan_capabilities(struct ifvlan *ifv); static void vlan_trunk_capabilities(struct ifnet *ifp); -static struct ifnet *vlan_clone_match_ethertag(struct if_clone *, - const char *, int *); +static struct ifnet *vlan_clone_match_ethervid(const char *, int *); static int vlan_clone_match(struct if_clone *, const char *); static int vlan_clone_create(struct if_clone *, char *, size_t, caddr_t); static int vlan_clone_destroy(struct if_clone *, struct ifnet *); @@ -215,11 +235,10 @@ static int vlan_clone_destroy(struct if_clone *, struct ifnet *); static void vlan_ifdetach(void *arg, struct ifnet *ifp); static void vlan_iflladdr(void *arg, struct ifnet *ifp); -static struct if_clone vlan_cloner = IFC_CLONE_INITIALIZER(VLANNAME, NULL, - IF_MAXUNIT, NULL, vlan_clone_match, vlan_clone_create, vlan_clone_destroy); +static struct if_clone *vlan_cloner; #ifdef VIMAGE -static VNET_DEFINE(struct if_clone, vlan_cloner); +static VNET_DEFINE(struct if_clone *, vlan_cloner); #define V_vlan_cloner VNET(vlan_cloner) #endif @@ -274,9 +293,9 @@ vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; - i = HASH(ifv->ifv_tag, trunk->hmask); + i = HASH(ifv->ifv_vid, trunk->hmask); LIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) - if 
(ifv->ifv_tag == ifv2->ifv_tag) + if (ifv->ifv_vid == ifv2->ifv_vid) return (EEXIST); /* @@ -286,7 +305,7 @@ vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) */ if (trunk->refcnt > (b * b) / 2) { vlan_growhash(trunk, 1); - i = HASH(ifv->ifv_tag, trunk->hmask); + i = HASH(ifv->ifv_vid, trunk->hmask); } LIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); trunk->refcnt++; @@ -304,7 +323,7 @@ vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; - i = HASH(ifv->ifv_tag, trunk->hmask); + i = HASH(ifv->ifv_vid, trunk->hmask); LIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv2 == ifv) { trunk->refcnt--; @@ -356,7 +375,7 @@ vlan_growhash(struct ifvlantrunk *trunk, int howmuch) for (i = 0; i < n; i++) while ((ifv = LIST_FIRST(&trunk->hash[i])) != NULL) { LIST_REMOVE(ifv, ifv_list); - j = HASH(ifv->ifv_tag, n2 - 1); + j = HASH(ifv->ifv_vid, n2 - 1); LIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); } free(trunk->hash, M_VLAN); @@ -370,14 +389,14 @@ vlan_growhash(struct ifvlantrunk *trunk, int howmuch) } static __inline struct ifvlan * -vlan_gethash(struct ifvlantrunk *trunk, uint16_t tag) +vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { struct ifvlan *ifv; TRUNK_LOCK_RASSERT(trunk); - LIST_FOREACH(ifv, &trunk->hash[HASH(tag, trunk->hmask)], ifv_list) - if (ifv->ifv_tag == tag) + LIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) + if (ifv->ifv_vid == vid) return (ifv); return (NULL); } @@ -401,19 +420,19 @@ vlan_dumphash(struct ifvlantrunk *trunk) #else static __inline struct ifvlan * -vlan_gethash(struct ifvlantrunk *trunk, uint16_t tag) +vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { - return trunk->vlans[tag]; + return trunk->vlans[vid]; } static __inline int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { - if (trunk->vlans[ifv->ifv_tag] != NULL) + if (trunk->vlans[ifv->ifv_vid] != NULL) return EEXIST; - 
trunk->vlans[ifv->ifv_tag] = ifv; + trunk->vlans[ifv->ifv_vid] = ifv; trunk->refcnt++; return (0); @@ -423,7 +442,7 @@ static __inline int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { - trunk->vlans[ifv->ifv_tag] = NULL; + trunk->vlans[ifv->ifv_vid] = NULL; trunk->refcnt--; return (0); @@ -461,48 +480,48 @@ trunk_destroy(struct ifvlantrunk *trunk) * traffic that it doesn't really want, which ends up being discarded * later by the upper protocol layers. Unfortunately, there's no way * to avoid this: there really is only one physical interface. - * - * XXX: There is a possible race here if more than one thread is - * modifying the multicast state of the vlan interface at the same time. */ static int vlan_setmulti(struct ifnet *ifp) { struct ifnet *ifp_p; - struct ifmultiaddr *ifma, *rifma = NULL; + struct ifmultiaddr *ifma; struct ifvlan *sc; struct vlan_mc_entry *mc; int error; - /*VLAN_LOCK_ASSERT();*/ - /* Find the parent. */ sc = ifp->if_softc; + TRUNK_LOCK_ASSERT(TRUNK(sc)); ifp_p = PARENT(sc); CURVNET_SET_QUIET(ifp_p->if_vnet); /* First, remove any existing filter entries. */ while ((mc = SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { - error = if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr); - if (error) - return (error); SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); + (void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr); free(mc, M_VLAN); } /* Now program new ones. 
*/ + IF_ADDR_WLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT); - if (mc == NULL) + if (mc == NULL) { + IF_ADDR_WUNLOCK(ifp); return (ENOMEM); + } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp_p->if_index; SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); + } + IF_ADDR_WUNLOCK(ifp); + SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) { error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr, - &rifma); + NULL); if (error) return (error); } @@ -625,17 +644,21 @@ vlan_trunkdev(struct ifnet *ifp) } /* - * Return the 16bit vlan tag for this interface. + * Return the 12-bit VLAN VID for this interface, for use by external + * components such as Infiniband. + * + * XXXRW: Note that the function name here is historical; it should be named + * vlan_vid(). */ static int -vlan_tag(struct ifnet *ifp, uint16_t *tagp) +vlan_tag(struct ifnet *ifp, uint16_t *vidp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; - *tagp = ifv->ifv_tag; + *vidp = ifv->ifv_vid; return (0); } @@ -671,20 +694,21 @@ vlan_setcookie(struct ifnet *ifp, void *cookie) } /* - * Return the vlan device present at the specific tag. + * Return the vlan device present at the specific VID. */ static struct ifnet * -vlan_devat(struct ifnet *ifp, uint16_t tag) +vlan_devat(struct ifnet *ifp, uint16_t vid) { struct ifvlantrunk *trunk; struct ifvlan *ifv; + TRUNK_LOCK_READER; trunk = ifp->if_vlantrunk; if (trunk == NULL) return (NULL); ifp = NULL; TRUNK_RLOCK(trunk); - ifv = vlan_gethash(trunk, tag); + ifv = vlan_gethash(trunk, vid); if (ifv) ifp = ifv->ifv_ifp; TRUNK_RUNLOCK(trunk); @@ -692,10 +716,20 @@ vlan_devat(struct ifnet *ifp, uint16_t tag) } /* + * Recalculate the cached VLAN tag exposed via the MIB. 
+ */ +static void +vlan_tag_recalculate(struct ifvlan *ifv) +{ + + ifv->ifv_tag = EVL_MAKETAG(ifv->ifv_vid, ifv->ifv_pcp, 0); +} + +/* * VLAN support can be loaded as a module. The only place in the * system that's intimately aware of this is ether_input. We hook * into this code through vlan_input_p which is defined there and - * set here. Noone else in the system should be aware of this so + * set here. No one else in the system should be aware of this so * we use an explicit reference here. */ extern void (*vlan_input_p)(struct ifnet *, struct mbuf *); @@ -727,7 +761,8 @@ vlan_modevent(module_t mod, int type, void *data) vlan_tag_p = vlan_tag; vlan_devat_p = vlan_devat; #ifndef VIMAGE - if_clone_attach(&vlan_cloner); + vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, + vlan_clone_create, vlan_clone_destroy); #endif if (bootverbose) printf("vlan: initialized, using " @@ -741,7 +776,7 @@ vlan_modevent(module_t mod, int type, void *data) break; case MOD_UNLOAD: #ifndef VIMAGE - if_clone_detach(&vlan_cloner); + if_clone_detach(vlan_cloner); #endif EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag); @@ -777,8 +812,9 @@ static void vnet_vlan_init(const void *unused __unused) { + vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, + vlan_clone_create, vlan_clone_destroy); V_vlan_cloner = vlan_cloner; - if_clone_attach(&V_vlan_cloner); } VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_vlan_init, NULL); @@ -787,46 +823,39 @@ static void vnet_vlan_uninit(const void *unused __unused) { - if_clone_detach(&V_vlan_cloner); + if_clone_detach(V_vlan_cloner); } -VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, +VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_vlan_uninit, NULL); #endif +/* + * Check for <etherif>.<vlan> style interface names. 
+ */ static struct ifnet * -vlan_clone_match_ethertag(struct if_clone *ifc, const char *name, int *tag) +vlan_clone_match_ethervid(const char *name, int *vidp) { - const char *cp; + char ifname[IFNAMSIZ]; + char *cp; struct ifnet *ifp; - int t; + int vid; - /* Check for <etherif>.<vlan> style interface names. */ - IFNET_RLOCK_NOSLEEP(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { - /* - * We can handle non-ethernet hardware types as long as - * they handle the tagging and headers themselves. - */ - if (ifp->if_type != IFT_ETHER && - (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) - continue; - if (strncmp(ifp->if_xname, name, strlen(ifp->if_xname)) != 0) - continue; - cp = name + strlen(ifp->if_xname); - if (*cp++ != '.') - continue; - if (*cp == '\0') - continue; - t = 0; - for(; *cp >= '0' && *cp <= '9'; cp++) - t = (t * 10) + (*cp - '0'); - if (*cp != '\0') - continue; - if (tag != NULL) - *tag = t; - break; - } - IFNET_RUNLOCK_NOSLEEP(); + strlcpy(ifname, name, IFNAMSIZ); + if ((cp = strchr(ifname, '.')) == NULL) + return (NULL); + *cp = '\0'; + if ((ifp = ifunit(ifname)) == NULL) + return (NULL); + /* Parse VID. 
*/ + if (*++cp == '\0') + return (NULL); + vid = 0; + for(; *cp >= '0' && *cp <= '9'; cp++) + vid = (vid * 10) + (*cp - '0'); + if (*cp != '\0') + return (NULL); + if (vidp != NULL) + *vidp = vid; return (ifp); } @@ -836,10 +865,10 @@ vlan_clone_match(struct if_clone *ifc, const char *name) { const char *cp; - if (vlan_clone_match_ethertag(ifc, name, NULL) != NULL) + if (vlan_clone_match_ethervid(name, NULL) != NULL) return (1); - if (strncmp(VLANNAME, name, strlen(VLANNAME)) != 0) + if (strncmp(vlanname, name, strlen(vlanname)) != 0) return (0); for (cp = name + 4; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') @@ -856,7 +885,7 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) int wildcard; int unit; int error; - int tag; + int vid; int ethertag; struct ifvlan *ifv; struct ifnet *ifp; @@ -873,7 +902,10 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) * o specify no parameters and get an unattached device that * must be configured separately. * The first technique is preferred; the latter two are - * supported for backwards compatibilty. + * supported for backwards compatibility. + * + * XXXRW: Note historic use of the word "tag" here. New ioctls may be + * called for. */ if (params) { error = copyin(params, &vlr, sizeof(vlr)); @@ -881,31 +913,18 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) return error; p = ifunit(vlr.vlr_parent); if (p == NULL) - return ENXIO; - /* - * Don't let the caller set up a VLAN tag with - * anything except VLID bits. 
- */ - if (vlr.vlr_tag & ~EVL_VLID_MASK) - return (EINVAL); + return (ENXIO); error = ifc_name2unit(name, &unit); if (error != 0) return (error); ethertag = 1; - tag = vlr.vlr_tag; + vid = vlr.vlr_tag; wildcard = (unit < 0); - } else if ((p = vlan_clone_match_ethertag(ifc, name, &tag)) != NULL) { + } else if ((p = vlan_clone_match_ethervid(name, &vid)) != NULL) { ethertag = 1; unit = -1; wildcard = 0; - - /* - * Don't let the caller set up a VLAN tag with - * anything except VLID bits. - */ - if (tag & ~EVL_VLID_MASK) - return (EINVAL); } else { ethertag = 0; @@ -937,14 +956,13 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) return (ENOSPC); } SLIST_INIT(&ifv->vlan_mc_listhead); - ifp->if_softc = ifv; /* * Set the name manually rather than using if_initname because * we don't conform to the default naming convention for interfaces. */ strlcpy(ifp->if_xname, name, IFNAMSIZ); - ifp->if_dname = ifc->ifc_name; + ifp->if_dname = vlanname; ifp->if_dunit = unit; /* NB: flags are not set here */ ifp->if_linkmib = &ifv->ifv_mib; @@ -966,7 +984,7 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) sdl->sdl_type = IFT_L2VLAN; if (ethertag) { - error = vlan_config(ifv, p, tag); + error = vlan_config(ifv, p, vid); if (error != 0) { /* * Since we've partially failed, we need to back @@ -975,7 +993,7 @@ vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) */ ether_ifdetach(ifp); vlan_unconfig(ifp); - if_free_type(ifp, IFT_ETHER); + if_free(ifp); ifc_free_unit(ifc, unit); free(ifv, M_VLAN); @@ -997,7 +1015,7 @@ vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) ether_ifdetach(ifp); /* first, remove it from system-wide lists */ vlan_unconfig(ifp); /* now it can be unconfigured and freed */ - if_free_type(ifp, IFT_ETHER); + if_free(ifp); free(ifv, M_VLAN); ifc_free_unit(ifc, unit); @@ -1020,6 +1038,8 @@ vlan_transmit(struct ifnet *ifp, struct mbuf *m) { struct ifvlan *ifv; struct 
ifnet *p; + struct m_tag *mtag; + uint16_t tag; int error, len, mcast; ifv = ifp->if_softc; @@ -1035,7 +1055,7 @@ vlan_transmit(struct ifnet *ifp, struct mbuf *m) */ if (!UP_AND_RUNNING(p)) { m_freem(m); - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENETDOWN); } @@ -1051,7 +1071,7 @@ vlan_transmit(struct ifnet *ifp, struct mbuf *m) * devices that just discard such runts instead or mishandle * them somehow. */ - if (soft_pad && p->if_type == IFT_ETHER) { + if (V_soft_pad && p->if_type == IFT_ETHER) { static char pad[8]; /* just zeros */ int n; @@ -1062,7 +1082,7 @@ vlan_transmit(struct ifnet *ifp, struct mbuf *m) if (n > 0) { if_printf(ifp, "cannot pad short frame\n"); - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (0); } @@ -1075,14 +1095,19 @@ vlan_transmit(struct ifnet *ifp, struct mbuf *m) * knows how to find the VLAN tag to use, so we attach a * packet tag that holds it. */ + if (vlan_mtag_pcp && (mtag = m_tag_locate(m, MTAG_8021Q, + MTAG_8021Q_PCP_OUT, NULL)) != NULL) + tag = EVL_MAKETAG(ifv->ifv_vid, *(uint8_t *)(mtag + 1), 0); + else + tag = ifv->ifv_tag; if (p->if_capenable & IFCAP_VLAN_HWTAGGING) { - m->m_pkthdr.ether_vtag = ifv->ifv_tag; + m->m_pkthdr.ether_vtag = tag; m->m_flags |= M_VLANTAG; } else { - m = ether_vlanencap(m, ifv->ifv_tag); + m = ether_vlanencap(m, tag); if (m == NULL) { if_printf(ifp, "unable to prepend VLAN header\n"); - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (0); } } @@ -1091,12 +1116,12 @@ vlan_transmit(struct ifnet *ifp, struct mbuf *m) * Send it, precisely as ether_output() would have. 
*/ error = (p->if_transmit)(p, m); - if (!error) { - ifp->if_opackets++; - ifp->if_omcasts += mcast; - ifp->if_obytes += len; + if (error == 0) { + if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + if_inc_counter(ifp, IFCOUNTER_OBYTES, len); + if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast); } else - ifp->if_oerrors++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } @@ -1113,7 +1138,9 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) { struct ifvlantrunk *trunk = ifp->if_vlantrunk; struct ifvlan *ifv; - uint16_t tag; + TRUNK_LOCK_READER; + struct m_tag *mtag; + uint16_t vid, tag; KASSERT(trunk != NULL, ("%s: no trunk", __func__)); @@ -1122,7 +1149,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) * Packet is tagged, but m contains a normal * Ethernet frame; the tag is stored out-of-band. */ - tag = EVL_VLANOFTAG(m->m_pkthdr.ether_vtag); + tag = m->m_pkthdr.ether_vtag; m->m_flags &= ~M_VLANTAG; } else { struct ether_vlan_header *evl; @@ -1138,7 +1165,7 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) return; } evl = mtod(m, struct ether_vlan_header *); - tag = EVL_VLANOFTAG(ntohs(evl->evl_tag)); + tag = ntohs(evl->evl_tag); /* * Remove the 802.1q header by copying the Ethernet @@ -1157,43 +1184,75 @@ vlan_input(struct ifnet *ifp, struct mbuf *m) __func__, ifp->if_xname, ifp->if_type); #endif m_freem(m); - ifp->if_noproto++; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); return; } } + vid = EVL_VLANOFTAG(tag); + TRUNK_RLOCK(trunk); - ifv = vlan_gethash(trunk, tag); + ifv = vlan_gethash(trunk, vid); if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { TRUNK_RUNLOCK(trunk); m_freem(m); - ifp->if_noproto++; + if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); return; } TRUNK_RUNLOCK(trunk); + if (vlan_mtag_pcp) { + /* + * While uncommon, it is possible that we will find a 802.1q + * packet encapsulated inside another packet that also had an + * 802.1q header. For example, ethernet tunneled over IPSEC + * arriving over ethernet. 
In that case, we replace the + * existing 802.1q PCP m_tag value. + */ + mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL); + if (mtag == NULL) { + mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_IN, + sizeof(uint8_t), M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); + return; + } + m_tag_prepend(m, mtag); + } + *(uint8_t *)(mtag + 1) = EVL_PRIOFTAG(tag); + } + m->m_pkthdr.rcvif = ifv->ifv_ifp; - ifv->ifv_ifp->if_ipackets++; + if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1); /* Pass it back through the parent's input routine. */ (*ifp->if_input)(ifv->ifv_ifp, m); } static int -vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag) +vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) { struct ifvlantrunk *trunk; struct ifnet *ifp; int error = 0; - /* VID numbers 0x0 and 0xFFF are reserved */ - if (tag == 0 || tag == 0xFFF) - return (EINVAL); + /* + * We can handle non-ethernet hardware types as long as + * they handle the tagging and headers themselves. + */ if (p->if_type != IFT_ETHER && (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) return (EPROTONOSUPPORT); if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS) return (EPROTONOSUPPORT); + /* + * Don't let the caller set up a VLAN VID with + * anything except VLID bits. + * VID numbers 0x0 and 0xFFF are reserved. + */ + if (vid == 0 || vid == 0xFFF || (vid & ~EVL_VLID_MASK)) + return (EINVAL); if (ifv->ifv_trunk) return (EBUSY); @@ -1203,7 +1262,7 @@ vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag) vlan_inithash(trunk); VLAN_LOCK(); if (p->if_vlantrunk != NULL) { - /* A race that that is very unlikely to be hit. */ + /* A race that is very unlikely to be hit. 
*/ vlan_freehash(trunk); free(trunk, M_VLAN); goto exists; @@ -1219,7 +1278,9 @@ exists: TRUNK_LOCK(trunk); } - ifv->ifv_tag = tag; /* must set this before vlan_inshash() */ + ifv->ifv_vid = vid; /* must set this before vlan_inshash() */ + ifv->ifv_pcp = 0; /* Default: best effort delivery. */ + vlan_tag_recalculate(ifv); error = vlan_inshash(trunk, ifv); if (error) goto done; @@ -1297,7 +1358,7 @@ exists: done: TRUNK_UNLOCK(trunk); if (error == 0) - EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_tag); + EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid); VLAN_UNLOCK(); return (error); @@ -1366,7 +1427,7 @@ vlan_unconfig_locked(struct ifnet *ifp, int departing) * Check if we were the last. */ if (trunk->refcnt == 0) { - trunk->parent->if_vlantrunk = NULL; + parent->if_vlantrunk = NULL; /* * XXXGL: If some ithread has already entered * vlan_input() and is now blocked on the trunk @@ -1393,7 +1454,7 @@ vlan_unconfig_locked(struct ifnet *ifp, int departing) * to cleanup anyway. */ if (parent != NULL) - EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_tag); + EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_vid); } /* Handle a reference counted flag that should be set on the parent as well */ @@ -1494,7 +1555,7 @@ vlan_capabilities(struct ifvlan *ifv) p->if_capenable & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable = p->if_capenable & IFCAP_HWCSUM; ifp->if_hwassist = p->if_hwassist & (CSUM_IP | CSUM_TCP | - CSUM_UDP | CSUM_SCTP | CSUM_FRAGMENT); + CSUM_UDP | CSUM_SCTP); } else { ifp->if_capenable = 0; ifp->if_hwassist = 0; @@ -1562,6 +1623,7 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) struct ifreq *ifr; struct ifaddr *ifa; struct ifvlan *ifv; + struct ifvlantrunk *trunk; struct vlanreq vlr; int error = 0; @@ -1633,6 +1695,13 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) case SIOCSETVLAN: #ifdef VIMAGE + /* + * XXXRW/XXXBZ: The goal in these checks is to allow a VLAN + * interface to be delegated to a jail without allowing the + * jail to 
change what underlying interface/VID it is + * associated with. We are not entirely convinced that this + * is the right way to accomplish that policy goal. + */ if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; @@ -1650,14 +1719,6 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = ENOENT; break; } - /* - * Don't let the caller set up a VLAN tag with - * anything except VLID bits. - */ - if (vlr.vlr_tag & ~EVL_VLID_MASK) { - error = EINVAL; - break; - } error = vlan_config(ifv, p, vlr.vlr_tag); if (error) break; @@ -1678,7 +1739,7 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) if (TRUNK(ifv) != NULL) { strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname, sizeof(vlr.vlr_parent)); - vlr.vlr_tag = ifv->ifv_tag; + vlr.vlr_tag = ifv->ifv_vid; } VLAN_UNLOCK(); error = copyout(&vlr, ifr->ifr_data, sizeof(vlr)); @@ -1699,8 +1760,40 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) * If we don't have a parent, just remember the membership for * when we do. 
*/ - if (TRUNK(ifv) != NULL) + trunk = TRUNK(ifv); + if (trunk != NULL) { + TRUNK_LOCK(trunk); error = vlan_setmulti(ifp); + TRUNK_UNLOCK(trunk); + } + break; + + case SIOCGVLANPCP: +#ifdef VIMAGE + if (ifp->if_vnet != ifp->if_home_vnet) { + error = EPERM; + break; + } +#endif + ifr->ifr_vlan_pcp = ifv->ifv_pcp; + break; + + case SIOCSVLANPCP: +#ifdef VIMAGE + if (ifp->if_vnet != ifp->if_home_vnet) { + error = EPERM; + break; + } +#endif + error = priv_check(curthread, PRIV_NET_SETVLANPCP); + if (error) + break; + if (ifr->ifr_vlan_pcp > 7) { + error = EINVAL; + break; + } + ifv->ifv_pcp = ifr->ifr_vlan_pcp; + vlan_tag_recalculate(ifv); break; default: diff --git a/freebsd/sys/net/if_vlan_var.h b/freebsd/sys/net/if_vlan_var.h index fd3fc4f3..6b20d142 100644 --- a/freebsd/sys/net/if_vlan_var.h +++ b/freebsd/sys/net/if_vlan_var.h @@ -32,22 +32,6 @@ #ifndef _NET_IF_VLAN_VAR_H_ #define _NET_IF_VLAN_VAR_H_ 1 -struct ether_vlan_header { - u_char evl_dhost[ETHER_ADDR_LEN]; - u_char evl_shost[ETHER_ADDR_LEN]; - u_int16_t evl_encap_proto; - u_int16_t evl_tag; - u_int16_t evl_proto; -}; - -#define EVL_VLID_MASK 0x0FFF -#define EVL_PRI_MASK 0xE000 -#define EVL_VLANOFTAG(tag) ((tag) & EVL_VLID_MASK) -#define EVL_PRIOFTAG(tag) (((tag) >> 13) & 7) -#define EVL_CFIOFTAG(tag) (((tag) >> 12) & 1) -#define EVL_MAKETAG(vlid, pri, cfi) \ - ((((((pri) & 7) << 1) | ((cfi) & 1)) << 12) | ((vlid) & EVL_VLID_MASK)) - /* Set the VLAN ID in an mbuf packet header non-destructively. */ #define EVL_APPLY_VLID(m, vlid) \ do { \ @@ -89,6 +73,23 @@ struct vlanreq { #define SIOCSETVLAN SIOCSIFGENERIC #define SIOCGETVLAN SIOCGIFGENERIC +#define SIOCGVLANPCP _IOWR('i', 152, struct ifreq) /* Get VLAN PCP */ +#define SIOCSVLANPCP _IOW('i', 153, struct ifreq) /* Set VLAN PCP */ + +/* + * Names for 802.1q priorities ("802.1p"). Notice that in this scheme, + * (0 < 1), allowing default 0-tagged traffic to take priority over background + * tagged traffic. 
+ */ +#define IEEE8021Q_PCP_BK 1 /* Background (lowest) */ +#define IEEE8021Q_PCP_BE 0 /* Best effort (default) */ +#define IEEE8021Q_PCP_EE 2 /* Excellent effort */ +#define IEEE8021Q_PCP_CA 3 /* Critical applications */ +#define IEEE8021Q_PCP_VI 4 /* Video, < 100ms latency */ +#define IEEE8021Q_PCP_VO 5 /* Video, < 10ms latency */ +#define IEEE8021Q_PCP_IC 6 /* Internetwork control */ +#define IEEE8021Q_PCP_NC 7 /* Network control (highest) */ + #ifdef _KERNEL /* * Drivers that are capable of adding and removing the VLAN header @@ -108,7 +109,7 @@ struct vlanreq { * received VLAN tag (containing both vlan and priority information) * into the ether_vtag mbuf packet header field: * - * m->m_pkthdr.ether_vtag = vlan_id; // ntohs()? + * m->m_pkthdr.ether_vtag = vtag; // ntohs()? * m->m_flags |= M_VLANTAG; * * to mark the packet m with the specified VLAN tag. @@ -126,6 +127,16 @@ struct vlanreq { * if_capabilities. */ +/* + * The 802.1q code may also tag mbufs with the PCP (priority) field for use in + * other layers of the stack, in which case an m_tag will be used. This is + * semantically quite different from use of the ether_vtag field, which is + * defined only between the device driver and VLAN layer. + */ +#define MTAG_8021Q 1326104895 +#define MTAG_8021Q_PCP_IN 0 /* Input priority. */ +#define MTAG_8021Q_PCP_OUT 1 /* Output priority. */ + #define VLAN_CAPABILITIES(_ifp) do { \ if ((_ifp)->if_vlantrunk != NULL) \ (*vlan_trunk_cap_p)(_ifp); \ @@ -133,15 +144,15 @@ struct vlanreq { #define VLAN_TRUNKDEV(_ifp) \ (_ifp)->if_type == IFT_L2VLAN ? (*vlan_trunkdev_p)((_ifp)) : NULL -#define VLAN_TAG(_ifp, _tag) \ - (_ifp)->if_type == IFT_L2VLAN ? (*vlan_tag_p)((_ifp), (_tag)) : EINVAL +#define VLAN_TAG(_ifp, _vid) \ + (_ifp)->if_type == IFT_L2VLAN ? (*vlan_tag_p)((_ifp), (_vid)) : EINVAL #define VLAN_COOKIE(_ifp) \ (_ifp)->if_type == IFT_L2VLAN ? (*vlan_cookie_p)((_ifp)) : NULL #define VLAN_SETCOOKIE(_ifp, _cookie) \ (_ifp)->if_type == IFT_L2VLAN ? 
\ (*vlan_setcookie_p)((_ifp), (_cookie)) : EINVAL -#define VLAN_DEVAT(_ifp, _tag) \ - (_ifp)->if_vlantrunk != NULL ? (*vlan_devat_p)((_ifp), (_tag)) : NULL +#define VLAN_DEVAT(_ifp, _vid) \ + (_ifp)->if_vlantrunk != NULL ? (*vlan_devat_p)((_ifp), (_vid)) : NULL extern void (*vlan_trunk_cap_p)(struct ifnet *); extern struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); @@ -150,6 +161,14 @@ extern int (*vlan_tag_p)(struct ifnet *, uint16_t *); extern int (*vlan_setcookie_p)(struct ifnet *, void *); extern void *(*vlan_cookie_p)(struct ifnet *); +#ifdef _SYS_EVENTHANDLER_H_ +/* VLAN state change events */ +typedef void (*vlan_config_fn)(void *, struct ifnet *, uint16_t); +typedef void (*vlan_unconfig_fn)(void *, struct ifnet *, uint16_t); +EVENTHANDLER_DECLARE(vlan_config, vlan_config_fn); +EVENTHANDLER_DECLARE(vlan_unconfig, vlan_unconfig_fn); +#endif /* _SYS_EVENTHANDLER_H_ */ + #endif /* _KERNEL */ #endif /* _NET_IF_VLAN_VAR_H_ */ diff --git a/freebsd/sys/net/ifq.h b/freebsd/sys/net/ifq.h new file mode 100644 index 00000000..f0d206d8 --- /dev/null +++ b/freebsd/sys/net/ifq.h @@ -0,0 +1,484 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)if.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NET_IFQ_H_ +#define _NET_IFQ_H_ + +#ifdef _KERNEL +#include <sys/mbuf.h> /* ifqueue only? */ +#include <sys/buf_ring.h> +#include <net/vnet.h> +#endif /* _KERNEL */ +#include <rtems/bsd/sys/lock.h> /* XXX */ +#include <sys/mutex.h> /* struct ifqueue */ + +/* + * Couple of ugly extra definitions that are required since ifq.h + * is splitted from if_var.h. + */ +#define IF_DUNIT_NONE -1 + +#include <net/altq/if_altq.h> + +/* + * Structure defining a queue for a network interface. + */ +struct ifqueue { + struct mbuf *ifq_head; + struct mbuf *ifq_tail; + int ifq_len; + int ifq_maxlen; + struct mtx ifq_mtx; +}; + +#ifdef _KERNEL +/* + * Output queues (ifp->if_snd) and slow device input queues (*ifp->if_slowq) + * are queues of messages stored on ifqueue structures + * (defined above). Entries are added to and deleted from these structures + * by these macros. 
+ */ +#define IF_LOCK(ifq) mtx_lock(&(ifq)->ifq_mtx) +#define IF_UNLOCK(ifq) mtx_unlock(&(ifq)->ifq_mtx) +#define IF_LOCK_ASSERT(ifq) mtx_assert(&(ifq)->ifq_mtx, MA_OWNED) +#define _IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen) +#define _IF_QLEN(ifq) ((ifq)->ifq_len) + +#define _IF_ENQUEUE(ifq, m) do { \ + (m)->m_nextpkt = NULL; \ + if ((ifq)->ifq_tail == NULL) \ + (ifq)->ifq_head = m; \ + else \ + (ifq)->ifq_tail->m_nextpkt = m; \ + (ifq)->ifq_tail = m; \ + (ifq)->ifq_len++; \ +} while (0) + +#define IF_ENQUEUE(ifq, m) do { \ + IF_LOCK(ifq); \ + _IF_ENQUEUE(ifq, m); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define _IF_PREPEND(ifq, m) do { \ + (m)->m_nextpkt = (ifq)->ifq_head; \ + if ((ifq)->ifq_tail == NULL) \ + (ifq)->ifq_tail = (m); \ + (ifq)->ifq_head = (m); \ + (ifq)->ifq_len++; \ +} while (0) + +#define IF_PREPEND(ifq, m) do { \ + IF_LOCK(ifq); \ + _IF_PREPEND(ifq, m); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define _IF_DEQUEUE(ifq, m) do { \ + (m) = (ifq)->ifq_head; \ + if (m) { \ + if (((ifq)->ifq_head = (m)->m_nextpkt) == NULL) \ + (ifq)->ifq_tail = NULL; \ + (m)->m_nextpkt = NULL; \ + (ifq)->ifq_len--; \ + } \ +} while (0) + +#define IF_DEQUEUE(ifq, m) do { \ + IF_LOCK(ifq); \ + _IF_DEQUEUE(ifq, m); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define _IF_DEQUEUE_ALL(ifq, m) do { \ + (m) = (ifq)->ifq_head; \ + (ifq)->ifq_head = (ifq)->ifq_tail = NULL; \ + (ifq)->ifq_len = 0; \ +} while (0) + +#define IF_DEQUEUE_ALL(ifq, m) do { \ + IF_LOCK(ifq); \ + _IF_DEQUEUE_ALL(ifq, m); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define _IF_POLL(ifq, m) ((m) = (ifq)->ifq_head) +#define IF_POLL(ifq, m) _IF_POLL(ifq, m) + +#define _IF_DRAIN(ifq) do { \ + struct mbuf *m; \ + for (;;) { \ + _IF_DEQUEUE(ifq, m); \ + if (m == NULL) \ + break; \ + m_freem(m); \ + } \ +} while (0) + +#define IF_DRAIN(ifq) do { \ + IF_LOCK(ifq); \ + _IF_DRAIN(ifq); \ + IF_UNLOCK(ifq); \ +} while(0) + +int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, + int adjust); +#define 
IF_HANDOFF(ifq, m, ifp) \ + if_handoff((struct ifqueue *)ifq, m, ifp, 0) +#define IF_HANDOFF_ADJ(ifq, m, ifp, adj) \ + if_handoff((struct ifqueue *)ifq, m, ifp, adj) + +void if_start(struct ifnet *); + +#define IFQ_ENQUEUE(ifq, m, err) \ +do { \ + IF_LOCK(ifq); \ + if (ALTQ_IS_ENABLED(ifq)) \ + ALTQ_ENQUEUE(ifq, m, NULL, err); \ + else { \ + if (_IF_QFULL(ifq)) { \ + m_freem(m); \ + (err) = ENOBUFS; \ + } else { \ + _IF_ENQUEUE(ifq, m); \ + (err) = 0; \ + } \ + } \ + IF_UNLOCK(ifq); \ +} while (0) + +#define IFQ_DEQUEUE_NOLOCK(ifq, m) \ +do { \ + if (TBR_IS_ENABLED(ifq)) \ + (m) = tbr_dequeue_ptr(ifq, ALTDQ_REMOVE); \ + else if (ALTQ_IS_ENABLED(ifq)) \ + ALTQ_DEQUEUE(ifq, m); \ + else \ + _IF_DEQUEUE(ifq, m); \ +} while (0) + +#define IFQ_DEQUEUE(ifq, m) \ +do { \ + IF_LOCK(ifq); \ + IFQ_DEQUEUE_NOLOCK(ifq, m); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define IFQ_POLL_NOLOCK(ifq, m) \ +do { \ + if (TBR_IS_ENABLED(ifq)) \ + (m) = tbr_dequeue_ptr(ifq, ALTDQ_POLL); \ + else if (ALTQ_IS_ENABLED(ifq)) \ + ALTQ_POLL(ifq, m); \ + else \ + _IF_POLL(ifq, m); \ +} while (0) + +#define IFQ_POLL(ifq, m) \ +do { \ + IF_LOCK(ifq); \ + IFQ_POLL_NOLOCK(ifq, m); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define IFQ_PURGE_NOLOCK(ifq) \ +do { \ + if (ALTQ_IS_ENABLED(ifq)) { \ + ALTQ_PURGE(ifq); \ + } else \ + _IF_DRAIN(ifq); \ +} while (0) + +#define IFQ_PURGE(ifq) \ +do { \ + IF_LOCK(ifq); \ + IFQ_PURGE_NOLOCK(ifq); \ + IF_UNLOCK(ifq); \ +} while (0) + +#define IFQ_SET_READY(ifq) \ + do { ((ifq)->altq_flags |= ALTQF_READY); } while (0) + +#define IFQ_LOCK(ifq) IF_LOCK(ifq) +#define IFQ_UNLOCK(ifq) IF_UNLOCK(ifq) +#define IFQ_LOCK_ASSERT(ifq) IF_LOCK_ASSERT(ifq) +#define IFQ_IS_EMPTY(ifq) ((ifq)->ifq_len == 0) +#define IFQ_INC_LEN(ifq) ((ifq)->ifq_len++) +#define IFQ_DEC_LEN(ifq) (--(ifq)->ifq_len) +#define IFQ_SET_MAXLEN(ifq, len) ((ifq)->ifq_maxlen = (len)) + +/* + * The IFF_DRV_OACTIVE test should really occur in the device driver, not in + * the handoff logic, as that flag is locked by 
the device driver. + */ +#define IFQ_HANDOFF_ADJ(ifp, m, adj, err) \ +do { \ + int len; \ + short mflags; \ + \ + len = (m)->m_pkthdr.len; \ + mflags = (m)->m_flags; \ + IFQ_ENQUEUE(&(ifp)->if_snd, m, err); \ + if ((err) == 0) { \ + if_inc_counter((ifp), IFCOUNTER_OBYTES, len + (adj)); \ + if (mflags & M_MCAST) \ + if_inc_counter((ifp), IFCOUNTER_OMCASTS, 1); \ + if (((ifp)->if_drv_flags & IFF_DRV_OACTIVE) == 0) \ + if_start(ifp); \ + } else \ + if_inc_counter((ifp), IFCOUNTER_OQDROPS, 1); \ +} while (0) + +#define IFQ_HANDOFF(ifp, m, err) \ + IFQ_HANDOFF_ADJ(ifp, m, 0, err) + +#define IFQ_DRV_DEQUEUE(ifq, m) \ +do { \ + (m) = (ifq)->ifq_drv_head; \ + if (m) { \ + if (((ifq)->ifq_drv_head = (m)->m_nextpkt) == NULL) \ + (ifq)->ifq_drv_tail = NULL; \ + (m)->m_nextpkt = NULL; \ + (ifq)->ifq_drv_len--; \ + } else { \ + IFQ_LOCK(ifq); \ + IFQ_DEQUEUE_NOLOCK(ifq, m); \ + while ((ifq)->ifq_drv_len < (ifq)->ifq_drv_maxlen) { \ + struct mbuf *m0; \ + IFQ_DEQUEUE_NOLOCK(ifq, m0); \ + if (m0 == NULL) \ + break; \ + m0->m_nextpkt = NULL; \ + if ((ifq)->ifq_drv_tail == NULL) \ + (ifq)->ifq_drv_head = m0; \ + else \ + (ifq)->ifq_drv_tail->m_nextpkt = m0; \ + (ifq)->ifq_drv_tail = m0; \ + (ifq)->ifq_drv_len++; \ + } \ + IFQ_UNLOCK(ifq); \ + } \ +} while (0) + +#define IFQ_DRV_PREPEND(ifq, m) \ +do { \ + (m)->m_nextpkt = (ifq)->ifq_drv_head; \ + if ((ifq)->ifq_drv_tail == NULL) \ + (ifq)->ifq_drv_tail = (m); \ + (ifq)->ifq_drv_head = (m); \ + (ifq)->ifq_drv_len++; \ +} while (0) + +#define IFQ_DRV_IS_EMPTY(ifq) \ + (((ifq)->ifq_drv_len == 0) && ((ifq)->ifq_len == 0)) + +#define IFQ_DRV_PURGE(ifq) \ +do { \ + struct mbuf *m, *n = (ifq)->ifq_drv_head; \ + while((m = n) != NULL) { \ + n = m->m_nextpkt; \ + m_freem(m); \ + } \ + (ifq)->ifq_drv_head = (ifq)->ifq_drv_tail = NULL; \ + (ifq)->ifq_drv_len = 0; \ + IFQ_PURGE(ifq); \ +} while (0) + +static __inline int +drbr_enqueue(struct ifnet *ifp, struct buf_ring *br, struct mbuf *m) +{ + int error = 0; + +#ifdef ALTQ + if 
(ALTQ_IS_ENABLED(&ifp->if_snd)) { + IFQ_ENQUEUE(&ifp->if_snd, m, error); + if (error) + if_inc_counter((ifp), IFCOUNTER_OQDROPS, 1); + return (error); + } +#endif + error = buf_ring_enqueue(br, m); + if (error) + m_freem(m); + + return (error); +} + +static __inline void +drbr_putback(struct ifnet *ifp, struct buf_ring *br, struct mbuf *new) +{ + /* + * The top of the list needs to be swapped + * for this one. + */ +#ifdef ALTQ + if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { + /* + * Peek in altq case dequeued it + * so put it back. + */ + IFQ_DRV_PREPEND(&ifp->if_snd, new); + return; + } +#endif + buf_ring_putback_sc(br, new); +} + +static __inline struct mbuf * +drbr_peek(struct ifnet *ifp, struct buf_ring *br) +{ +#ifdef ALTQ + struct mbuf *m; + if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { + /* + * Pull it off like a dequeue + * since drbr_advance() does nothing + * for altq and drbr_putback() will + * use the old prepend function. + */ + IFQ_DEQUEUE(&ifp->if_snd, m); + return (m); + } +#endif + return(buf_ring_peek_clear_sc(br)); +} + +static __inline void +drbr_flush(struct ifnet *ifp, struct buf_ring *br) +{ + struct mbuf *m; + +#ifdef ALTQ + if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) + IFQ_PURGE(&ifp->if_snd); +#endif + while ((m = buf_ring_dequeue_sc(br)) != NULL) + m_freem(m); +} + +static __inline void +drbr_free(struct buf_ring *br, struct malloc_type *type) +{ + + drbr_flush(NULL, br); + buf_ring_free(br, type); +} + +static __inline struct mbuf * +drbr_dequeue(struct ifnet *ifp, struct buf_ring *br) +{ +#ifdef ALTQ + struct mbuf *m; + + if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) { + IFQ_DEQUEUE(&ifp->if_snd, m); + return (m); + } +#endif + return (buf_ring_dequeue_sc(br)); +} + +static __inline void +drbr_advance(struct ifnet *ifp, struct buf_ring *br) +{ +#ifdef ALTQ + /* Nothing to do here since peek dequeues in altq case */ + if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) + return; +#endif + return 
(buf_ring_advance_sc(br)); +} + + +static __inline struct mbuf * +drbr_dequeue_cond(struct ifnet *ifp, struct buf_ring *br, + int (*func) (struct mbuf *, void *), void *arg) +{ + struct mbuf *m; +#ifdef ALTQ + if (ALTQ_IS_ENABLED(&ifp->if_snd)) { + IFQ_LOCK(&ifp->if_snd); + IFQ_POLL_NOLOCK(&ifp->if_snd, m); + if (m != NULL && func(m, arg) == 0) { + IFQ_UNLOCK(&ifp->if_snd); + return (NULL); + } + IFQ_DEQUEUE_NOLOCK(&ifp->if_snd, m); + IFQ_UNLOCK(&ifp->if_snd); + return (m); + } +#endif + m = buf_ring_peek(br); + if (m == NULL || func(m, arg) == 0) + return (NULL); + + return (buf_ring_dequeue_sc(br)); +} + +static __inline int +drbr_empty(struct ifnet *ifp, struct buf_ring *br) +{ +#ifdef ALTQ + if (ALTQ_IS_ENABLED(&ifp->if_snd)) + return (IFQ_IS_EMPTY(&ifp->if_snd)); +#endif + return (buf_ring_empty(br)); +} + +static __inline int +drbr_needs_enqueue(struct ifnet *ifp, struct buf_ring *br) +{ +#ifdef ALTQ + if (ALTQ_IS_ENABLED(&ifp->if_snd)) + return (1); +#endif + return (!buf_ring_empty(br)); +} + +static __inline int +drbr_inuse(struct ifnet *ifp, struct buf_ring *br) +{ +#ifdef ALTQ + if (ALTQ_IS_ENABLED(&ifp->if_snd)) + return (ifp->if_snd.ifq_len); +#endif + return (buf_ring_count(br)); +} + +extern int ifqmaxlen; + +void if_qflush(struct ifnet *); +void ifq_init(struct ifaltq *, struct ifnet *ifp); +void ifq_delete(struct ifaltq *); + +#endif /* _KERNEL */ +#endif /* !_NET_IFQ_H_ */ diff --git a/freebsd/sys/net/iso88025.h b/freebsd/sys/net/iso88025.h index 6edd2e0b..11bd6ec4 100644 --- a/freebsd/sys/net/iso88025.h +++ b/freebsd/sys/net/iso88025.h @@ -162,11 +162,13 @@ struct iso88025_addr { #define ISO88025_BPF_UNSUPPORTED 0 #define ISO88025_BPF_SUPPORTED 1 +#ifdef _KERNEL void iso88025_ifattach (struct ifnet *, const u_int8_t *, int); void iso88025_ifdetach (struct ifnet *, int); int iso88025_ioctl (struct ifnet *, u_long, caddr_t ); -int iso88025_output (struct ifnet *, struct mbuf *, struct sockaddr *, - struct route *); +int iso88025_output (struct 
ifnet *, struct mbuf *, + const struct sockaddr *, struct route *); void iso88025_input (struct ifnet *, struct mbuf *); +#endif /* _KERNEL */ -#endif +#endif /* !_NET_ISO88025_H_ */ diff --git a/freebsd/sys/net/netisr.c b/freebsd/sys/net/netisr.c index f43cffa1..f14b2e95 100644 --- a/freebsd/sys/net/netisr.c +++ b/freebsd/sys/net/netisr.c @@ -72,6 +72,7 @@ __FBSDID("$FreeBSD$"); #include <sys/bus.h> #include <sys/kernel.h> #include <sys/kthread.h> +#include <sys/malloc.h> #include <sys/interrupt.h> #include <rtems/bsd/sys/lock.h> #include <sys/mbuf.h> @@ -131,7 +132,7 @@ static SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr"); /*- * Three global direct dispatch policies are supported: * - * NETISR_DISPATCH_QUEUED: All work is deferred for a netisr, regardless of + * NETISR_DISPATCH_DEFERRED: All work is deferred for a netisr, regardless of * context (may be overriden by protocols). * * NETISR_DISPATCH_HYBRID: If the executing context allows direct dispatch, @@ -151,37 +152,25 @@ static SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr"); #define NETISR_DISPATCH_POLICY_MAXSTR 20 /* Used for temporary buffers. */ static u_int netisr_dispatch_policy = NETISR_DISPATCH_POLICY_DEFAULT; static int sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS); -SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RW | - CTLFLAG_TUN, 0, 0, sysctl_netisr_dispatch_policy, "A", +SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RWTUN, + 0, 0, sysctl_netisr_dispatch_policy, "A", "netisr dispatch policy"); /* - * These sysctls were used in previous versions to control and export - * dispatch policy state. Now, we provide read-only export via them so that - * older netstat binaries work. At some point they can be garbage collected. 
- */ -static int netisr_direct_force; -SYSCTL_INT(_net_isr, OID_AUTO, direct_force, CTLFLAG_RD, - &netisr_direct_force, 0, "compat: force direct dispatch"); - -static int netisr_direct; -SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RD, &netisr_direct, 0, - "compat: enable direct dispatch"); - -/* * Allow the administrator to limit the number of threads (CPUs) to use for * netisr. We don't check netisr_maxthreads before creating the thread for - * CPU 0, so in practice we ignore values <= 1. This must be set at boot. - * We will create at most one thread per CPU. + * CPU 0. This must be set at boot. We will create at most one thread per CPU. + * By default we initialize this to 1 which would assign just 1 cpu (cpu0) and + * therefore only 1 workstream. If set to -1, netisr would use all cpus + * (mp_ncpus) and therefore would have those many workstreams. One workstream + * per thread (CPU). */ -static int netisr_maxthreads = -1; /* Max number of threads. */ -TUNABLE_INT("net.isr.maxthreads", &netisr_maxthreads); +static int netisr_maxthreads = 1; /* Max number of threads. */ SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN, &netisr_maxthreads, 0, "Use at most this many CPUs for netisr processing"); static int netisr_bindthreads = 0; /* Bind threads to CPUs. 
*/ -TUNABLE_INT("net.isr.bindthreads", &netisr_bindthreads); SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN, &netisr_bindthreads, 0, "Bind netisr threads to CPUs."); @@ -192,7 +181,6 @@ SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN, */ #define NETISR_DEFAULT_MAXQLIMIT 10240 static u_int netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT; -TUNABLE_INT("net.isr.maxqlimit", &netisr_maxqlimit); SYSCTL_UINT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN, &netisr_maxqlimit, 0, "Maximum netisr per-protocol, per-CPU queue depth."); @@ -204,7 +192,6 @@ SYSCTL_UINT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN, */ #define NETISR_DEFAULT_DEFAULTQLIMIT 256 static u_int netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT; -TUNABLE_INT("net.isr.defaultqlimit", &netisr_defaultqlimit); SYSCTL_UINT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RDTUN, &netisr_defaultqlimit, 0, "Default netisr per-protocol, per-CPU queue limit if not set by protocol"); @@ -225,6 +212,23 @@ SYSCTL_UINT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD, */ static struct netisr_proto netisr_proto[NETISR_MAXPROT]; +#ifdef VIMAGE +/* + * The netisr_enable array describes a per-VNET flag for registered + * protocols on whether this netisr is active in this VNET or not. + * netisr_register() will automatically enable the netisr for the + * default VNET and all currently active instances. + * netisr_unregister() will disable all active VNETs, including vnet0. + * Individual network stack instances can be enabled/disabled by the + * netisr_(un)register _vnet() functions. + * With this we keep the one netisr_proto per protocol but add a + * mechanism to stop netisr processing for vnet teardown. + * Apart from that we expect a VNET to always be enabled. + */ +static VNET_DEFINE(u_int, netisr_enable[NETISR_MAXPROT]); +#define V_netisr_enable VNET(netisr_enable) +#endif + #ifndef __rtems__ /* * Per-CPU workstream data. See netisr_internal.h for more details. 
@@ -275,10 +279,7 @@ u_int netisr_get_cpuid(u_int cpunumber) { - KASSERT(cpunumber < nws_count, ("%s: %u > %u", __func__, cpunumber, - nws_count)); - - return (nws_array[cpunumber]); + return (nws_array[cpunumber % nws_count]); } /* @@ -308,8 +309,6 @@ static const struct netisr_dispatch_table_entry netisr_dispatch_table[] = { { NETISR_DISPATCH_HYBRID, "hybrid" }, { NETISR_DISPATCH_DIRECT, "direct" }, }; -static const u_int netisr_dispatch_table_len = - (sizeof(netisr_dispatch_table) / sizeof(netisr_dispatch_table[0])); static void netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer, @@ -320,7 +319,7 @@ netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer, u_int i; str = "unknown"; - for (i = 0; i < netisr_dispatch_table_len; i++) { + for (i = 0; i < nitems(netisr_dispatch_table); i++) { ndtep = &netisr_dispatch_table[i]; if (ndtep->ndte_policy == dispatch_policy) { str = ndtep->ndte_policy_str; @@ -336,7 +335,7 @@ netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp) const struct netisr_dispatch_table_entry *ndtep; u_int i; - for (i = 0; i < netisr_dispatch_table_len; i++) { + for (i = 0; i < nitems(netisr_dispatch_table); i++) { ndtep = &netisr_dispatch_table[i]; if (strcmp(ndtep->ndte_policy_str, str) == 0) { *dispatch_policyp = ndtep->ndte_policy; @@ -346,32 +345,6 @@ netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp) return (EINVAL); } -static void -netisr_dispatch_policy_compat(void) -{ - - switch (netisr_dispatch_policy) { - case NETISR_DISPATCH_DEFERRED: - netisr_direct_force = 0; - netisr_direct = 0; - break; - - case NETISR_DISPATCH_HYBRID: - netisr_direct_force = 0; - netisr_direct = 1; - break; - - case NETISR_DISPATCH_DIRECT: - netisr_direct_force = 1; - netisr_direct = 1; - break; - - default: - panic("%s: unknown policy %u", __func__, - netisr_dispatch_policy); - } -} - static int sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS) { @@ -387,10 +360,8 @@ 
sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS) &dispatch_policy); if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT) error = EINVAL; - if (error == 0) { + if (error == 0) netisr_dispatch_policy = dispatch_policy; - netisr_dispatch_policy_compat(); - } } return (error); } @@ -403,6 +374,7 @@ sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS) void netisr_register(const struct netisr_handler *nhp) { + VNET_ITERATOR_DECL(vnet_iter); struct netisr_work *npwp; const char *name; u_int i, proto; @@ -475,6 +447,22 @@ netisr_register(const struct netisr_handler *nhp) bzero(npwp, sizeof(*npwp)); npwp->nw_qlimit = netisr_proto[proto].np_qlimit; } + +#ifdef VIMAGE + /* + * Test that we are in vnet0 and have a curvnet set. + */ + KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__)); + KASSERT(IS_DEFAULT_VNET(curvnet), ("%s: curvnet %p is not vnet0 %p", + __func__, curvnet, vnet0)); + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + V_netisr_enable[proto] = 1; + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +#endif NETISR_WUNLOCK(); } @@ -651,6 +639,7 @@ netisr_drain_proto(struct netisr_work *npwp) void netisr_unregister(const struct netisr_handler *nhp) { + VNET_ITERATOR_DECL(vnet_iter); struct netisr_work *npwp; #ifdef INVARIANTS const char *name; @@ -669,6 +658,16 @@ netisr_unregister(const struct netisr_handler *nhp) ("%s(%u): protocol not registered for %s", __func__, proto, name)); +#ifdef VIMAGE + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + V_netisr_enable[proto] = 0; + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +#endif + netisr_proto[proto].np_name = NULL; netisr_proto[proto].np_handler = NULL; netisr_proto[proto].np_m2flow = NULL; @@ -687,6 +686,97 @@ netisr_unregister(const struct netisr_handler *nhp) NETISR_WUNLOCK(); } +#ifdef VIMAGE +void +netisr_register_vnet(const struct netisr_handler *nhp) +{ + u_int proto; + + proto = nhp->nh_proto; + + 
KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__)); + KASSERT(proto < NETISR_MAXPROT, + ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name)); + NETISR_WLOCK(); + KASSERT(netisr_proto[proto].np_handler != NULL, + ("%s(%u): protocol not registered for %s", __func__, proto, + nhp->nh_name)); + + V_netisr_enable[proto] = 1; + NETISR_WUNLOCK(); +} + +static void +netisr_drain_proto_vnet(struct vnet *vnet, u_int proto) +{ + struct netisr_workstream *nwsp; + struct netisr_work *npwp; + struct mbuf *m, *mp, *n, *ne; + u_int i; + + KASSERT(vnet != NULL, ("%s: vnet is NULL", __func__)); + NETISR_LOCK_ASSERT(); + + CPU_FOREACH(i) { + nwsp = DPCPU_ID_PTR(i, nws); + if (nwsp->nws_intr_event == NULL) + continue; + npwp = &nwsp->nws_work[proto]; + NWS_LOCK(nwsp); + + /* + * Rather than dissecting and removing mbufs from the middle + * of the chain, we build a new chain if the packet stays and + * update the head and tail pointers at the end. All packets + * matching the given vnet are freed. + */ + m = npwp->nw_head; + n = ne = NULL; + while (m != NULL) { + mp = m; + m = m->m_nextpkt; + mp->m_nextpkt = NULL; + if (mp->m_pkthdr.rcvif->if_vnet != vnet) { + if (n == NULL) { + n = ne = mp; + } else { + ne->m_nextpkt = mp; + ne = mp; + } + continue; + } + /* This is a packet in the selected vnet. Free it. 
*/ + npwp->nw_len--; + m_freem(mp); + } + npwp->nw_head = n; + npwp->nw_tail = ne; + NWS_UNLOCK(nwsp); + } +} + +void +netisr_unregister_vnet(const struct netisr_handler *nhp) +{ + u_int proto; + + proto = nhp->nh_proto; + + KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__)); + KASSERT(proto < NETISR_MAXPROT, + ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name)); + NETISR_WLOCK(); + KASSERT(netisr_proto[proto].np_handler != NULL, + ("%s(%u): protocol not registered for %s", __func__, proto, + nhp->nh_name)); + + V_netisr_enable[proto] = 0; + + netisr_drain_proto_vnet(curvnet, proto); + NETISR_WUNLOCK(); +} +#endif + /* * Compose the global and per-protocol policies on dispatch, and return the * dispatch policy to use. @@ -746,22 +836,25 @@ netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy, * dispatch. In the queued case, fall back on the SOURCE * policy. */ - if (*cpuidp != NETISR_CPUID_NONE) + if (*cpuidp != NETISR_CPUID_NONE) { + *cpuidp = netisr_get_cpuid(*cpuidp); return (m); + } if (dispatch_policy == NETISR_DISPATCH_HYBRID) { - *cpuidp = curcpu; + *cpuidp = netisr_get_cpuid(curcpu); return (m); } policy = NETISR_POLICY_SOURCE; } if (policy == NETISR_POLICY_FLOW) { - if (!(m->m_flags & M_FLOWID) && npp->np_m2flow != NULL) { + if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE && + npp->np_m2flow != NULL) { m = npp->np_m2flow(m, source); if (m == NULL) return (NULL); } - if (m->m_flags & M_FLOWID) { + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { *cpuidp = netisr_default_flow2cpu(m->m_pkthdr.flowid); return (m); @@ -984,6 +1077,13 @@ netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m) KASSERT(netisr_proto[proto].np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); +#ifdef VIMAGE + if (V_netisr_enable[proto] == 0) { + m_freem(m); + return (ENOPROTOOPT); + } +#endif + m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED, source, m, &cpuid); if (m != NULL) { @@ -1030,6 +1130,13 @@ 
netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); +#ifdef VIMAGE + if (V_netisr_enable[proto] == 0) { + m_freem(m); + return (ENOPROTOOPT); + } +#endif + dispatch_policy = netisr_get_dispatch(npp); if (dispatch_policy == NETISR_DISPATCH_DEFERRED) return (netisr_queue_src(proto, source, m)); @@ -1215,15 +1322,15 @@ netisr_start_swi(u_int cpuid, struct pcpu *pc) static void netisr_init(void *arg) { - char tmp[NETISR_DISPATCH_POLICY_MAXSTR]; - u_int dispatch_policy; - int error; - - KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__)); +#ifdef EARLY_AP_STARTUP + struct pcpu *pc; +#endif NETISR_LOCK_INIT(); - if (netisr_maxthreads < 1) - netisr_maxthreads = 1; + if (netisr_maxthreads == 0 || netisr_maxthreads < -1 ) + netisr_maxthreads = 1; /* default behavior */ + else if (netisr_maxthreads == -1) + netisr_maxthreads = mp_ncpus; /* use max cpus */ if (netisr_maxthreads > mp_ncpus) { printf("netisr_init: forcing maxthreads from %d to %d\n", netisr_maxthreads, mp_ncpus); @@ -1248,31 +1355,24 @@ netisr_init(void *arg) } #endif -#ifndef __rtems__ - if (TUNABLE_STR_FETCH("net.isr.dispatch", tmp, sizeof(tmp))) { - error = netisr_dispatch_policy_from_str(tmp, - &dispatch_policy); - if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT) - error = EINVAL; - if (error == 0) { - netisr_dispatch_policy = dispatch_policy; - netisr_dispatch_policy_compat(); - } else - printf( - "%s: invalid dispatch policy %s, using default\n", - __func__, tmp); +#ifdef EARLY_AP_STARTUP + STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { + if (nws_count >= netisr_maxthreads) + break; + netisr_start_swi(pc->pc_cpuid, pc); } -#endif /* __rtems__ */ - +#else #ifndef __rtems__ netisr_start_swi(curcpu, pcpu_find(curcpu)); #else /* __rtems__ */ netisr_start_swi(0, NULL); #endif /* __rtems__ */ +#endif } SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL); #ifndef __rtems__ +#ifndef 
EARLY_AP_STARTUP /* * Start worker threads for additional CPUs. No attempt to gracefully handle * work reassignment, we don't yet support dynamic reconfiguration. @@ -1285,9 +1385,6 @@ netisr_start(void *arg) STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (nws_count >= netisr_maxthreads) break; - /* XXXRW: Is skipping absent CPUs still required here? */ - if (CPU_ABSENT(pc->pc_cpuid)) - continue; /* Worker will already be present for boot CPU. */ if (pc->pc_netisr != NULL) continue; @@ -1295,6 +1392,7 @@ netisr_start(void *arg) } } SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL); +#endif #endif /* __rtems__ */ /* diff --git a/freebsd/sys/net/netisr.h b/freebsd/sys/net/netisr.h index 83bf9ce5..63764a74 100644 --- a/freebsd/sys/net/netisr.h +++ b/freebsd/sys/net/netisr.h @@ -52,15 +52,13 @@ #define NETISR_IP 1 #define NETISR_IGMP 2 /* IGMPv3 output queue */ #define NETISR_ROUTE 3 /* routing socket */ -#define NETISR_AARP 4 /* Appletalk ARP */ -#define NETISR_ATALK2 5 /* Appletalk phase 2 */ -#define NETISR_ATALK1 6 /* Appletalk phase 1 */ -#define NETISR_ARP 7 /* same as AF_LINK */ -#define NETISR_IPX 8 /* same as AF_IPX */ -#define NETISR_ETHER 9 /* ethernet input */ -#define NETISR_IPV6 10 -#define NETISR_NATM 11 -#define NETISR_EPAIR 12 /* if_epair(4) */ +#define NETISR_ARP 4 /* same as AF_LINK */ +#define NETISR_ETHER 5 /* ethernet input */ +#define NETISR_IPV6 6 +#define NETISR_NATM 7 +#define NETISR_EPAIR 8 /* if_epair(4) */ +#define NETISR_IP_DIRECT 9 /* direct-dispatch IPv4 */ +#define NETISR_IPV6_DIRECT 10 /* direct-dispatch IPv6 */ /* * Protocol ordering and affinity policy constants. 
See the detailed @@ -212,6 +210,10 @@ void netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp); void netisr_register(const struct netisr_handler *nhp); int netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit); void netisr_unregister(const struct netisr_handler *nhp); +#ifdef VIMAGE +void netisr_register_vnet(const struct netisr_handler *nhp); +void netisr_unregister_vnet(const struct netisr_handler *nhp); +#endif /* * Process a packet destined for a protocol, and attempt direct dispatch. diff --git a/freebsd/sys/net/pfil.c b/freebsd/sys/net/pfil.c index 123d03c4..7fcecc88 100644 --- a/freebsd/sys/net/pfil.c +++ b/freebsd/sys/net/pfil.c @@ -47,6 +47,7 @@ #include <sys/queue.h> #include <net/if.h> +#include <net/if_var.h> #include <net/pfil.h> static struct mtx pfil_global_lock; @@ -54,18 +55,18 @@ static struct mtx pfil_global_lock; MTX_SYSINIT(pfil_heads_lock, &pfil_global_lock, "pfil_head_list lock", MTX_DEF); -static int pfil_list_add(pfil_list_t *, struct packet_filter_hook *, int); - -static int pfil_list_remove(pfil_list_t *, - int (*)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), - void *); +static struct packet_filter_hook *pfil_chain_get(int, struct pfil_head *); +static int pfil_chain_add(pfil_chain_t *, struct packet_filter_hook *, int); +static int pfil_chain_remove(pfil_chain_t *, pfil_func_t, void *); LIST_HEAD(pfilheadhead, pfil_head); VNET_DEFINE(struct pfilheadhead, pfil_head_list); #define V_pfil_head_list VNET(pfil_head_list) +VNET_DEFINE(struct rmlock, pfil_lock); +#define V_pfil_lock VNET(pfil_lock) /* - * pfil_run_hooks() runs the specified packet filter hooks. + * pfil_run_hooks() runs the specified packet filter hook chain. 
*/ int pfil_run_hooks(struct pfil_head *ph, struct mbuf **mp, struct ifnet *ifp, @@ -78,8 +79,8 @@ pfil_run_hooks(struct pfil_head *ph, struct mbuf **mp, struct ifnet *ifp, PFIL_RLOCK(ph, &rmpt); KASSERT(ph->ph_nhooks >= 0, ("Pfil hook count dropped < 0")); - for (pfh = pfil_hook_get(dir, ph); pfh != NULL; - pfh = TAILQ_NEXT(pfh, pfil_link)) { + for (pfh = pfil_chain_get(dir, ph); pfh != NULL; + pfh = TAILQ_NEXT(pfh, pfil_chain)) { if (pfh->pfil_func != NULL) { rv = (*pfh->pfil_func)(pfh->pfil_arg, &m, ifp, dir, inp); @@ -92,6 +93,80 @@ pfil_run_hooks(struct pfil_head *ph, struct mbuf **mp, struct ifnet *ifp, return (rv); } +static struct packet_filter_hook * +pfil_chain_get(int dir, struct pfil_head *ph) +{ + + if (dir == PFIL_IN) + return (TAILQ_FIRST(&ph->ph_in)); + else if (dir == PFIL_OUT) + return (TAILQ_FIRST(&ph->ph_out)); + else + return (NULL); +} + +/* + * pfil_try_rlock() acquires rm reader lock for specified head + * if this is immediately possible. + */ +int +pfil_try_rlock(struct pfil_head *ph, struct rm_priotracker *tracker) +{ + + return (PFIL_TRY_RLOCK(ph, tracker)); +} + +/* + * pfil_rlock() acquires rm reader lock for specified head. + */ +void +pfil_rlock(struct pfil_head *ph, struct rm_priotracker *tracker) +{ + + PFIL_RLOCK(ph, tracker); +} + +/* + * pfil_runlock() releases reader lock for specified head. + */ +void +pfil_runlock(struct pfil_head *ph, struct rm_priotracker *tracker) +{ + + PFIL_RUNLOCK(ph, tracker); +} + +/* + * pfil_wlock() acquires writer lock for specified head. + */ +void +pfil_wlock(struct pfil_head *ph) +{ + + PFIL_WLOCK(ph); +} + +/* + * pfil_wunlock() releases writer lock for specified head. + */ +void +pfil_wunlock(struct pfil_head *ph) +{ + + PFIL_WUNLOCK(ph); +} + +/* + * pfil_wowned() returns a non-zero value if the current thread owns + * an exclusive lock. 
+ */ +int +pfil_wowned(struct pfil_head *ph) +{ + + return (PFIL_WOWNED(ph)); +} + /* * pfil_head_register() registers a pfil_head with the packet filter hook * mechanism. @@ -101,11 +176,11 @@ pfil_head_register(struct pfil_head *ph) { struct pfil_head *lph; - PFIL_LIST_LOCK(); + PFIL_HEADLIST_LOCK(); LIST_FOREACH(lph, &V_pfil_head_list, ph_list) { if (ph->ph_type == lph->ph_type && ph->ph_un.phu_val == lph->ph_un.phu_val) { - PFIL_LIST_UNLOCK(); + PFIL_HEADLIST_UNLOCK(); return (EEXIST); } } @@ -114,7 +189,7 @@ pfil_head_register(struct pfil_head *ph) TAILQ_INIT(&ph->ph_in); TAILQ_INIT(&ph->ph_out); LIST_INSERT_HEAD(&V_pfil_head_list, ph, ph_list); - PFIL_LIST_UNLOCK(); + PFIL_HEADLIST_UNLOCK(); return (0); } @@ -128,12 +203,12 @@ pfil_head_unregister(struct pfil_head *ph) { struct packet_filter_hook *pfh, *pfnext; - PFIL_LIST_LOCK(); + PFIL_HEADLIST_LOCK(); LIST_REMOVE(ph, ph_list); - PFIL_LIST_UNLOCK(); - TAILQ_FOREACH_SAFE(pfh, &ph->ph_in, pfil_link, pfnext) + PFIL_HEADLIST_UNLOCK(); + TAILQ_FOREACH_SAFE(pfh, &ph->ph_in, pfil_chain, pfnext) free(pfh, M_IFADDR); - TAILQ_FOREACH_SAFE(pfh, &ph->ph_out, pfil_link, pfnext) + TAILQ_FOREACH_SAFE(pfh, &ph->ph_out, pfil_chain, pfnext) free(pfh, M_IFADDR); PFIL_LOCK_DESTROY(ph); return (0); @@ -147,11 +222,11 @@ pfil_head_get(int type, u_long val) { struct pfil_head *ph; - PFIL_LIST_LOCK(); + PFIL_HEADLIST_LOCK(); LIST_FOREACH(ph, &V_pfil_head_list, ph_list) if (ph->ph_type == type && ph->ph_un.phu_val == val) break; - PFIL_LIST_UNLOCK(); + PFIL_HEADLIST_UNLOCK(); return (ph); } @@ -164,8 +239,7 @@ pfil_head_get(int type, u_long val) * PFIL_WAITOK OK to call malloc with M_WAITOK. 
*/ int -pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, - struct inpcb *), void *arg, int flags, struct pfil_head *ph) +pfil_add_hook(pfil_func_t func, void *arg, int flags, struct pfil_head *ph) { struct packet_filter_hook *pfh1 = NULL; struct packet_filter_hook *pfh2 = NULL; @@ -191,7 +265,7 @@ pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, if (flags & PFIL_IN) { pfh1->pfil_func = func; pfh1->pfil_arg = arg; - err = pfil_list_add(&ph->ph_in, pfh1, flags & ~PFIL_OUT); + err = pfil_chain_add(&ph->ph_in, pfh1, flags & ~PFIL_OUT); if (err) goto locked_error; ph->ph_nhooks++; @@ -199,10 +273,10 @@ pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, if (flags & PFIL_OUT) { pfh2->pfil_func = func; pfh2->pfil_arg = arg; - err = pfil_list_add(&ph->ph_out, pfh2, flags & ~PFIL_IN); + err = pfil_chain_add(&ph->ph_out, pfh2, flags & ~PFIL_IN); if (err) { if (flags & PFIL_IN) - pfil_list_remove(&ph->ph_in, func, arg); + pfil_chain_remove(&ph->ph_in, func, arg); goto locked_error; } ph->ph_nhooks++; @@ -221,22 +295,21 @@ error: /* * pfil_remove_hook removes a specific function from the packet filter hook - * list. + * chain. */ int -pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, - struct inpcb *), void *arg, int flags, struct pfil_head *ph) +pfil_remove_hook(pfil_func_t func, void *arg, int flags, struct pfil_head *ph) { int err = 0; PFIL_WLOCK(ph); if (flags & PFIL_IN) { - err = pfil_list_remove(&ph->ph_in, func, arg); + err = pfil_chain_remove(&ph->ph_in, func, arg); if (err == 0) ph->ph_nhooks--; } if ((err == 0) && (flags & PFIL_OUT)) { - err = pfil_list_remove(&ph->ph_out, func, arg); + err = pfil_chain_remove(&ph->ph_out, func, arg); if (err == 0) ph->ph_nhooks--; } @@ -244,15 +317,18 @@ pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, return (err); } +/* + * Internal: Add a new pfil hook into a hook chain. 
+ */ static int -pfil_list_add(pfil_list_t *list, struct packet_filter_hook *pfh1, int flags) +pfil_chain_add(pfil_chain_t *chain, struct packet_filter_hook *pfh1, int flags) { struct packet_filter_hook *pfh; /* * First make sure the hook is not already there. */ - TAILQ_FOREACH(pfh, list, pfil_link) + TAILQ_FOREACH(pfh, chain, pfil_chain) if (pfh->pfil_func == pfh1->pfil_func && pfh->pfil_arg == pfh1->pfil_arg) return (EEXIST); @@ -262,26 +338,23 @@ pfil_list_add(pfil_list_t *list, struct packet_filter_hook *pfh1, int flags) * the same path is followed in or out of the kernel. */ if (flags & PFIL_IN) - TAILQ_INSERT_HEAD(list, pfh1, pfil_link); + TAILQ_INSERT_HEAD(chain, pfh1, pfil_chain); else - TAILQ_INSERT_TAIL(list, pfh1, pfil_link); + TAILQ_INSERT_TAIL(chain, pfh1, pfil_chain); return (0); } /* - * pfil_list_remove is an internal function that takes a function off the - * specified list. + * Internal: Remove a pfil hook from a hook chain. */ static int -pfil_list_remove(pfil_list_t *list, - int (*func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), - void *arg) +pfil_chain_remove(pfil_chain_t *chain, pfil_func_t func, void *arg) { struct packet_filter_hook *pfh; - TAILQ_FOREACH(pfh, list, pfil_link) + TAILQ_FOREACH(pfh, chain, pfil_chain) if (pfh->pfil_func == func && pfh->pfil_arg == arg) { - TAILQ_REMOVE(list, pfh, pfil_link); + TAILQ_REMOVE(chain, pfh, pfil_chain); free(pfh, M_IFADDR); return (0); } @@ -292,36 +365,34 @@ pfil_list_remove(pfil_list_t *list, * Stuff that must be initialized for every instance (including the first of * course). */ -static int -vnet_pfil_init(const void *unused) +static void +vnet_pfil_init(const void *unused __unused) { LIST_INIT(&V_pfil_head_list); - return (0); + PFIL_LOCK_INIT_REAL(&V_pfil_lock, "shared"); } /* * Called for the removal of each instance. 
*/ -static int -vnet_pfil_uninit(const void *unused) +static void +vnet_pfil_uninit(const void *unused __unused) { - /* XXX should panic if list is not empty */ - return (0); + KASSERT(LIST_EMPTY(&V_pfil_head_list), + ("%s: pfil_head_list %p not empty", __func__, &V_pfil_head_list)); + PFIL_LOCK_DESTROY_REAL(&V_pfil_lock); } -/* Define startup order. */ -#define PFIL_SYSINIT_ORDER SI_SUB_PROTO_BEGIN -#define PFIL_MODEVENT_ORDER (SI_ORDER_FIRST) /* On boot slot in here. */ -#define PFIL_VNET_ORDER (PFIL_MODEVENT_ORDER + 2) /* Later still. */ - /* * Starting up. * * VNET_SYSINIT is called for each existing vnet and each new vnet. + * Make sure the pfil bits are first before any possible subsystem which + * might piggyback on the SI_SUB_PROTO_PFIL. */ -VNET_SYSINIT(vnet_pfil_init, PFIL_SYSINIT_ORDER, PFIL_VNET_ORDER, +VNET_SYSINIT(vnet_pfil_init, SI_SUB_PROTO_PFIL, SI_ORDER_FIRST, vnet_pfil_init, NULL); /* @@ -329,5 +400,5 @@ VNET_SYSINIT(vnet_pfil_init, PFIL_SYSINIT_ORDER, PFIL_VNET_ORDER, * * VNET_SYSUNINIT is called for each exiting vnet as it exits. */ -VNET_SYSUNINIT(vnet_pfil_uninit, PFIL_SYSINIT_ORDER, PFIL_VNET_ORDER, +VNET_SYSUNINIT(vnet_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_FIRST, vnet_pfil_uninit, NULL); diff --git a/freebsd/sys/net/pfil.h b/freebsd/sys/net/pfil.h index da06dedf..b78023b7 100644 --- a/freebsd/sys/net/pfil.h +++ b/freebsd/sys/net/pfil.h @@ -43,15 +43,18 @@ struct mbuf; struct ifnet; struct inpcb; +typedef int (*pfil_func_t)(void *, struct mbuf **, struct ifnet *, int, + struct inpcb *); + /* * The packet filter hooks are designed for anything to call them to - * possibly intercept the packet. + * possibly intercept the packet. Multiple filter hooks are chained + * together and after each other in the specified order. 
*/ struct packet_filter_hook { - TAILQ_ENTRY(packet_filter_hook) pfil_link; - int (*pfil_func)(void *, struct mbuf **, struct ifnet *, int, - struct inpcb *); - void *pfil_arg; + TAILQ_ENTRY(packet_filter_hook) pfil_chain; + pfil_func_t pfil_func; + void *pfil_arg; }; #define PFIL_IN 0x00000001 @@ -59,63 +62,87 @@ struct packet_filter_hook { #define PFIL_WAITOK 0x00000004 #define PFIL_ALL (PFIL_IN|PFIL_OUT) -typedef TAILQ_HEAD(pfil_list, packet_filter_hook) pfil_list_t; +typedef TAILQ_HEAD(pfil_chain, packet_filter_hook) pfil_chain_t; #define PFIL_TYPE_AF 1 /* key is AF_* type */ #define PFIL_TYPE_IFNET 2 /* key is ifnet pointer */ +#define PFIL_FLAG_PRIVATE_LOCK 0x01 /* Personal lock instead of global */ + +/* + * A pfil head is created by each protocol or packet intercept point. + * For packet is then run through the hook chain for inspection. + */ struct pfil_head { - pfil_list_t ph_in; - pfil_list_t ph_out; - int ph_type; - int ph_nhooks; + pfil_chain_t ph_in; + pfil_chain_t ph_out; + int ph_type; + int ph_nhooks; #if defined( __linux__ ) || defined( _WIN32 ) - rwlock_t ph_mtx; + rwlock_t ph_mtx; #else - struct rmlock ph_lock; + struct rmlock *ph_plock; /* Pointer to the used lock */ + struct rmlock ph_lock; /* Private lock storage */ + int flags; #endif union { - u_long phu_val; - void *phu_ptr; + u_long phu_val; + void *phu_ptr; } ph_un; -#define ph_af ph_un.phu_val -#define ph_ifnet ph_un.phu_ptr +#define ph_af ph_un.phu_val +#define ph_ifnet ph_un.phu_ptr LIST_ENTRY(pfil_head) ph_list; }; -int pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *, - int, struct inpcb *), void *, int, struct pfil_head *); -int pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *, - int, struct inpcb *), void *, int, struct pfil_head *); +/* Public functions for pfil hook management by packet filters. 
*/ +struct pfil_head *pfil_head_get(int, u_long); +int pfil_add_hook(pfil_func_t, void *, int, struct pfil_head *); +int pfil_remove_hook(pfil_func_t, void *, int, struct pfil_head *); +#define PFIL_HOOKED(p) ((p)->ph_nhooks > 0) + +/* Public functions to run the packet inspection by protocols. */ int pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *, int, struct inpcb *inp); +/* Public functions for pfil head management by protocols. */ int pfil_head_register(struct pfil_head *); int pfil_head_unregister(struct pfil_head *); -struct pfil_head *pfil_head_get(int, u_long); - -#define PFIL_HOOKED(p) ((p)->ph_nhooks > 0) -#define PFIL_LOCK_INIT(p) \ - rm_init_flags(&(p)->ph_lock, "PFil hook read/write mutex", RM_RECURSE) -#define PFIL_LOCK_DESTROY(p) rm_destroy(&(p)->ph_lock) -#define PFIL_RLOCK(p, t) rm_rlock(&(p)->ph_lock, (t)) -#define PFIL_WLOCK(p) rm_wlock(&(p)->ph_lock) -#define PFIL_RUNLOCK(p, t) rm_runlock(&(p)->ph_lock, (t)) -#define PFIL_WUNLOCK(p) rm_wunlock(&(p)->ph_lock) -#define PFIL_LIST_LOCK() mtx_lock(&pfil_global_lock) -#define PFIL_LIST_UNLOCK() mtx_unlock(&pfil_global_lock) - -static __inline struct packet_filter_hook * -pfil_hook_get(int dir, struct pfil_head *ph) -{ - - if (dir == PFIL_IN) - return (TAILQ_FIRST(&ph->ph_in)); - else if (dir == PFIL_OUT) - return (TAILQ_FIRST(&ph->ph_out)); - else - return (NULL); -} +/* Public pfil locking functions for self managed locks by packet filters. */ +struct rm_priotracker; /* Do not require including rmlock header */ +int pfil_try_rlock(struct pfil_head *, struct rm_priotracker *); +void pfil_rlock(struct pfil_head *, struct rm_priotracker *); +void pfil_runlock(struct pfil_head *, struct rm_priotracker *); +void pfil_wlock(struct pfil_head *); +void pfil_wunlock(struct pfil_head *); +int pfil_wowned(struct pfil_head *ph); + +/* Internal pfil locking functions. 
*/ +#define PFIL_LOCK_INIT_REAL(l, t) \ + rm_init_flags(l, "PFil " t " rmlock", RM_RECURSE) +#define PFIL_LOCK_DESTROY_REAL(l) \ + rm_destroy(l) +#define PFIL_LOCK_INIT(p) do { \ + if ((p)->flags & PFIL_FLAG_PRIVATE_LOCK) { \ + PFIL_LOCK_INIT_REAL(&(p)->ph_lock, "private"); \ + (p)->ph_plock = &(p)->ph_lock; \ + } else \ + (p)->ph_plock = &V_pfil_lock; \ +} while (0) +#define PFIL_LOCK_DESTROY(p) do { \ + if ((p)->flags & PFIL_FLAG_PRIVATE_LOCK) \ + PFIL_LOCK_DESTROY_REAL((p)->ph_plock); \ +} while (0) + +#define PFIL_TRY_RLOCK(p, t) rm_try_rlock((p)->ph_plock, (t)) +#define PFIL_RLOCK(p, t) rm_rlock((p)->ph_plock, (t)) +#define PFIL_WLOCK(p) rm_wlock((p)->ph_plock) +#define PFIL_RUNLOCK(p, t) rm_runlock((p)->ph_plock, (t)) +#define PFIL_WUNLOCK(p) rm_wunlock((p)->ph_plock) +#define PFIL_WOWNED(p) rm_wowned((p)->ph_plock) + +/* Internal locking macros for global/vnet pfil_head_list. */ +#define PFIL_HEADLIST_LOCK() mtx_lock(&pfil_global_lock) +#define PFIL_HEADLIST_UNLOCK() mtx_unlock(&pfil_global_lock) #endif /* _NET_PFIL_H_ */ diff --git a/freebsd/sys/net/pfkeyv2.h b/freebsd/sys/net/pfkeyv2.h index c45f8b05..c9b27695 100644 --- a/freebsd/sys/net/pfkeyv2.h +++ b/freebsd/sys/net/pfkeyv2.h @@ -218,7 +218,6 @@ struct sadb_x_sa2 { }; /* XXX Policy Extension */ -/* sizeof(struct sadb_x_policy) == 16 */ struct sadb_x_policy { u_int16_t sadb_x_policy_len; u_int16_t sadb_x_policy_exttype; @@ -226,8 +225,10 @@ struct sadb_x_policy { u_int8_t sadb_x_policy_dir; /* direction, see ipsec.h */ u_int8_t sadb_x_policy_reserved; u_int32_t sadb_x_policy_id; - u_int32_t sadb_x_policy_reserved2; + u_int32_t sadb_x_policy_priority; }; +_Static_assert(sizeof(struct sadb_x_policy) == 16, "struct size mismatch"); + /* * When policy_type == IPSEC, it is followed by some of * the ipsec policy request. @@ -256,31 +257,31 @@ struct sadb_x_ipsecrequest { }; /* NAT-Traversal type, see RFC 3948 (and drafts). 
*/ -/* sizeof(struct sadb_x_nat_t_type) == 8 */ struct sadb_x_nat_t_type { u_int16_t sadb_x_nat_t_type_len; u_int16_t sadb_x_nat_t_type_exttype; u_int8_t sadb_x_nat_t_type_type; u_int8_t sadb_x_nat_t_type_reserved[3]; }; +_Static_assert(sizeof(struct sadb_x_nat_t_type) == 8, "struct size mismatch"); /* NAT-Traversal source or destination port. */ -/* sizeof(struct sadb_x_nat_t_port) == 8 */ struct sadb_x_nat_t_port { u_int16_t sadb_x_nat_t_port_len; u_int16_t sadb_x_nat_t_port_exttype; u_int16_t sadb_x_nat_t_port_port; u_int16_t sadb_x_nat_t_port_reserved; }; +_Static_assert(sizeof(struct sadb_x_nat_t_port) == 8, "struct size mismatch"); /* ESP fragmentation size. */ -/* sizeof(struct sadb_x_nat_t_frag) == 8 */ struct sadb_x_nat_t_frag { u_int16_t sadb_x_nat_t_frag_len; u_int16_t sadb_x_nat_t_frag_exttype; u_int16_t sadb_x_nat_t_frag_fraglen; u_int16_t sadb_x_nat_t_frag_reserved; }; +_Static_assert(sizeof(struct sadb_x_nat_t_frag) == 8, "struct size mismatch"); #define SADB_EXT_RESERVED 0 @@ -332,39 +333,47 @@ struct sadb_x_nat_t_frag { #define SADB_SAFLAGS_PFS 1 -/* RFC2367 numbers - meets RFC2407 */ +/* + * Though some of these numbers (both _AALG and _EALG) appear to be + * IKEv2 numbers and others original IKE numbers, they have no meaning. + * These are constants that the various IKE daemons use to tell the kernel + * what cipher to use. + * + * Do not use these constants directly to decide which Transformation ID + * to send. You are responsible for mapping them yourself. 
+ */ #define SADB_AALG_NONE 0 #define SADB_AALG_MD5HMAC 2 #define SADB_AALG_SHA1HMAC 3 #define SADB_AALG_MAX 252 -/* private allocations - based on RFC2407/IANA assignment */ #define SADB_X_AALG_SHA2_256 5 #define SADB_X_AALG_SHA2_384 6 #define SADB_X_AALG_SHA2_512 7 #define SADB_X_AALG_RIPEMD160HMAC 8 -#define SADB_X_AALG_AES_XCBC_MAC 9 /* draft-ietf-ipsec-ciph-aes-xcbc-mac-04 */ -/* private allocations should use 249-255 (RFC2407) */ +#define SADB_X_AALG_AES_XCBC_MAC 9 /* RFC3566 */ +#define SADB_X_AALG_AES128GMAC 11 /* RFC4543 + Errata1821 */ +#define SADB_X_AALG_AES192GMAC 12 +#define SADB_X_AALG_AES256GMAC 13 #define SADB_X_AALG_MD5 249 /* Keyed MD5 */ #define SADB_X_AALG_SHA 250 /* Keyed SHA */ #define SADB_X_AALG_NULL 251 /* null authentication */ #define SADB_X_AALG_TCP_MD5 252 /* Keyed TCP-MD5 (RFC2385) */ -/* RFC2367 numbers - meets RFC2407 */ #define SADB_EALG_NONE 0 #define SADB_EALG_DESCBC 2 #define SADB_EALG_3DESCBC 3 -#define SADB_EALG_NULL 11 -#define SADB_EALG_MAX 250 -/* private allocations - based on RFC2407/IANA assignment */ #define SADB_X_EALG_CAST128CBC 6 #define SADB_X_EALG_BLOWFISHCBC 7 +#define SADB_EALG_NULL 11 #define SADB_X_EALG_RIJNDAELCBC 12 #define SADB_X_EALG_AES 12 -/* private allocations - based on RFC4312/IANA assignment */ -#define SADB_X_EALG_CAMELLIACBC 22 -/* private allocations should use 249-255 (RFC2407) */ -#define SADB_X_EALG_SKIPJACK 249 /*250*/ /* for IPSEC */ -#define SADB_X_EALG_AESCTR 250 /*249*/ /* draft-ietf-ipsec-ciph-aes-ctr-03 */ +#define SADB_X_EALG_AESCTR 13 +#define SADB_X_EALG_AESGCM8 18 /* RFC4106 */ +#define SADB_X_EALG_AESGCM12 19 +#define SADB_X_EALG_AESGCM16 20 +#define SADB_X_EALG_CAMELLIACBC 22 +#define SADB_X_EALG_AESGMAC 23 /* RFC4543 + Errata1821 */ +#define SADB_EALG_MAX 23 /* !!! keep updated !!! 
*/ /* private allocations - based on RFC2407/IANA assignment */ #define SADB_X_CALG_NONE 0 diff --git a/freebsd/sys/net/pfvar.h b/freebsd/sys/net/pfvar.h new file mode 100644 index 00000000..17768e96 --- /dev/null +++ b/freebsd/sys/net/pfvar.h @@ -0,0 +1,1757 @@ +/* + * Copyright (c) 2001 Daniel Hartmeier + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * $OpenBSD: pfvar.h,v 1.282 2009/01/29 15:12:28 pyr Exp $ + * $FreeBSD$ + */ + +#ifndef _NET_PFVAR_H_ +#define _NET_PFVAR_H_ + +#include <rtems/bsd/sys/param.h> +#include <sys/queue.h> +#include <sys/counter.h> +#include <sys/malloc.h> +#include <sys/refcount.h> +#include <sys/tree.h> +#include <vm/uma.h> + +#include <net/radix.h> +#include <netinet/in.h> + +#include <netpfil/pf/pf.h> +#include <netpfil/pf/pf_altq.h> +#include <netpfil/pf/pf_mtag.h> + +struct pf_addr { + union { + struct in_addr v4; + struct in6_addr v6; + u_int8_t addr8[16]; + u_int16_t addr16[8]; + u_int32_t addr32[4]; + } pfa; /* 128-bit address */ +#define v4 pfa.v4 +#define v6 pfa.v6 +#define addr8 pfa.addr8 +#define addr16 pfa.addr16 +#define addr32 pfa.addr32 +}; + +#define PFI_AFLAG_NETWORK 0x01 +#define PFI_AFLAG_BROADCAST 0x02 +#define PFI_AFLAG_PEER 0x04 +#define PFI_AFLAG_MODEMASK 0x07 +#define PFI_AFLAG_NOALIAS 0x08 + +struct pf_addr_wrap { + union { + struct { + struct pf_addr addr; + struct pf_addr mask; + } a; + char ifname[IFNAMSIZ]; + char tblname[PF_TABLE_NAME_SIZE]; + } v; + union { + struct pfi_dynaddr *dyn; + struct pfr_ktable *tbl; + int dyncnt; + int tblcnt; + } p; + u_int8_t type; /* PF_ADDR_* */ + u_int8_t iflags; /* PFI_AFLAG_* */ +}; + +#ifdef _KERNEL + +struct pfi_dynaddr { + TAILQ_ENTRY(pfi_dynaddr) entry; + struct pf_addr pfid_addr4; + struct pf_addr pfid_mask4; + struct pf_addr pfid_addr6; + struct pf_addr pfid_mask6; + struct pfr_ktable *pfid_kt; + struct pfi_kif *pfid_kif; + int pfid_net; /* mask or 128 */ + int pfid_acnt4; /* address count IPv4 */ + int pfid_acnt6; /* address count IPv6 */ + sa_family_t pfid_af; /* rule af */ + u_int8_t pfid_iflags; /* PFI_AFLAG_* */ +}; + +/* + * Address manipulation macros + */ +#define HTONL(x) (x) = htonl((__uint32_t)(x)) +#define HTONS(x) (x) = htons((__uint16_t)(x)) +#define NTOHL(x) (x) = ntohl((__uint32_t)(x)) +#define NTOHS(x) (x) = ntohs((__uint16_t)(x)) + +#define PF_NAME "pf" + +#define PF_HASHROW_ASSERT(h) 
mtx_assert(&(h)->lock, MA_OWNED) +#define PF_HASHROW_LOCK(h) mtx_lock(&(h)->lock) +#define PF_HASHROW_UNLOCK(h) mtx_unlock(&(h)->lock) + +#define PF_STATE_LOCK(s) \ + do { \ + struct pf_idhash *_ih = &V_pf_idhash[PF_IDHASH(s)]; \ + PF_HASHROW_LOCK(_ih); \ + } while (0) + +#define PF_STATE_UNLOCK(s) \ + do { \ + struct pf_idhash *_ih = &V_pf_idhash[PF_IDHASH((s))]; \ + PF_HASHROW_UNLOCK(_ih); \ + } while (0) + +#ifdef INVARIANTS +#define PF_STATE_LOCK_ASSERT(s) \ + do { \ + struct pf_idhash *_ih = &V_pf_idhash[PF_IDHASH(s)]; \ + PF_HASHROW_ASSERT(_ih); \ + } while (0) +#else /* !INVARIANTS */ +#define PF_STATE_LOCK_ASSERT(s) do {} while (0) +#endif /* INVARIANTS */ + +extern struct mtx pf_unlnkdrules_mtx; +#define PF_UNLNKDRULES_LOCK() mtx_lock(&pf_unlnkdrules_mtx) +#define PF_UNLNKDRULES_UNLOCK() mtx_unlock(&pf_unlnkdrules_mtx) + +extern struct rwlock pf_rules_lock; +#define PF_RULES_RLOCK() rw_rlock(&pf_rules_lock) +#define PF_RULES_RUNLOCK() rw_runlock(&pf_rules_lock) +#define PF_RULES_WLOCK() rw_wlock(&pf_rules_lock) +#define PF_RULES_WUNLOCK() rw_wunlock(&pf_rules_lock) +#define PF_RULES_ASSERT() rw_assert(&pf_rules_lock, RA_LOCKED) +#define PF_RULES_RASSERT() rw_assert(&pf_rules_lock, RA_RLOCKED) +#define PF_RULES_WASSERT() rw_assert(&pf_rules_lock, RA_WLOCKED) + +#define PF_MODVER 1 +#define PFLOG_MODVER 1 +#define PFSYNC_MODVER 1 + +#define PFLOG_MINVER 1 +#define PFLOG_PREFVER PFLOG_MODVER +#define PFLOG_MAXVER 1 +#define PFSYNC_MINVER 1 +#define PFSYNC_PREFVER PFSYNC_MODVER +#define PFSYNC_MAXVER 1 + +#ifdef INET +#ifndef INET6 +#define PF_INET_ONLY +#endif /* ! INET6 */ +#endif /* INET */ + +#ifdef INET6 +#ifndef INET +#define PF_INET6_ONLY +#endif /* ! 
INET */ +#endif /* INET6 */ + +#ifdef INET +#ifdef INET6 +#define PF_INET_INET6 +#endif /* INET6 */ +#endif /* INET */ + +#else + +#define PF_INET_INET6 + +#endif /* _KERNEL */ + +/* Both IPv4 and IPv6 */ +#ifdef PF_INET_INET6 + +#define PF_AEQ(a, b, c) \ + ((c == AF_INET && (a)->addr32[0] == (b)->addr32[0]) || \ + (c == AF_INET6 && (a)->addr32[3] == (b)->addr32[3] && \ + (a)->addr32[2] == (b)->addr32[2] && \ + (a)->addr32[1] == (b)->addr32[1] && \ + (a)->addr32[0] == (b)->addr32[0])) \ + +#define PF_ANEQ(a, b, c) \ + ((c == AF_INET && (a)->addr32[0] != (b)->addr32[0]) || \ + (c == AF_INET6 && ((a)->addr32[0] != (b)->addr32[0] || \ + (a)->addr32[1] != (b)->addr32[1] || \ + (a)->addr32[2] != (b)->addr32[2] || \ + (a)->addr32[3] != (b)->addr32[3]))) \ + +#define PF_AZERO(a, c) \ + ((c == AF_INET && !(a)->addr32[0]) || \ + (c == AF_INET6 && !(a)->addr32[0] && !(a)->addr32[1] && \ + !(a)->addr32[2] && !(a)->addr32[3] )) \ + +#define PF_MATCHA(n, a, m, b, f) \ + pf_match_addr(n, a, m, b, f) + +#define PF_ACPY(a, b, f) \ + pf_addrcpy(a, b, f) + +#define PF_AINC(a, f) \ + pf_addr_inc(a, f) + +#define PF_POOLMASK(a, b, c, d, f) \ + pf_poolmask(a, b, c, d, f) + +#else + +/* Just IPv6 */ + +#ifdef PF_INET6_ONLY + +#define PF_AEQ(a, b, c) \ + ((a)->addr32[3] == (b)->addr32[3] && \ + (a)->addr32[2] == (b)->addr32[2] && \ + (a)->addr32[1] == (b)->addr32[1] && \ + (a)->addr32[0] == (b)->addr32[0]) \ + +#define PF_ANEQ(a, b, c) \ + ((a)->addr32[3] != (b)->addr32[3] || \ + (a)->addr32[2] != (b)->addr32[2] || \ + (a)->addr32[1] != (b)->addr32[1] || \ + (a)->addr32[0] != (b)->addr32[0]) \ + +#define PF_AZERO(a, c) \ + (!(a)->addr32[0] && \ + !(a)->addr32[1] && \ + !(a)->addr32[2] && \ + !(a)->addr32[3] ) \ + +#define PF_MATCHA(n, a, m, b, f) \ + pf_match_addr(n, a, m, b, f) + +#define PF_ACPY(a, b, f) \ + pf_addrcpy(a, b, f) + +#define PF_AINC(a, f) \ + pf_addr_inc(a, f) + +#define PF_POOLMASK(a, b, c, d, f) \ + pf_poolmask(a, b, c, d, f) + +#else + +/* Just IPv4 */ +#ifdef 
PF_INET_ONLY + +#define PF_AEQ(a, b, c) \ + ((a)->addr32[0] == (b)->addr32[0]) + +#define PF_ANEQ(a, b, c) \ + ((a)->addr32[0] != (b)->addr32[0]) + +#define PF_AZERO(a, c) \ + (!(a)->addr32[0]) + +#define PF_MATCHA(n, a, m, b, f) \ + pf_match_addr(n, a, m, b, f) + +#define PF_ACPY(a, b, f) \ + (a)->v4.s_addr = (b)->v4.s_addr + +#define PF_AINC(a, f) \ + do { \ + (a)->addr32[0] = htonl(ntohl((a)->addr32[0]) + 1); \ + } while (0) + +#define PF_POOLMASK(a, b, c, d, f) \ + do { \ + (a)->addr32[0] = ((b)->addr32[0] & (c)->addr32[0]) | \ + (((c)->addr32[0] ^ 0xffffffff ) & (d)->addr32[0]); \ + } while (0) + +#endif /* PF_INET_ONLY */ +#endif /* PF_INET6_ONLY */ +#endif /* PF_INET_INET6 */ + +/* + * XXX callers not FIB-aware in our version of pf yet. + * OpenBSD fixed it later it seems, 2010/05/07 13:33:16 claudio. + */ +#define PF_MISMATCHAW(aw, x, af, neg, ifp, rtid) \ + ( \ + (((aw)->type == PF_ADDR_NOROUTE && \ + pf_routable((x), (af), NULL, (rtid))) || \ + (((aw)->type == PF_ADDR_URPFFAILED && (ifp) != NULL && \ + pf_routable((x), (af), (ifp), (rtid))) || \ + ((aw)->type == PF_ADDR_TABLE && \ + !pfr_match_addr((aw)->p.tbl, (x), (af))) || \ + ((aw)->type == PF_ADDR_DYNIFTL && \ + !pfi_match_addr((aw)->p.dyn, (x), (af))) || \ + ((aw)->type == PF_ADDR_RANGE && \ + !pf_match_addr_range(&(aw)->v.a.addr, \ + &(aw)->v.a.mask, (x), (af))) || \ + ((aw)->type == PF_ADDR_ADDRMASK && \ + !PF_AZERO(&(aw)->v.a.mask, (af)) && \ + !PF_MATCHA(0, &(aw)->v.a.addr, \ + &(aw)->v.a.mask, (x), (af))))) != \ + (neg) \ + ) + + +struct pf_rule_uid { + uid_t uid[2]; + u_int8_t op; +}; + +struct pf_rule_gid { + uid_t gid[2]; + u_int8_t op; +}; + +struct pf_rule_addr { + struct pf_addr_wrap addr; + u_int16_t port[2]; + u_int8_t neg; + u_int8_t port_op; +}; + +struct pf_pooladdr { + struct pf_addr_wrap addr; + TAILQ_ENTRY(pf_pooladdr) entries; + char ifname[IFNAMSIZ]; + struct pfi_kif *kif; +}; + +TAILQ_HEAD(pf_palist, pf_pooladdr); + +struct pf_poolhashkey { + union { + u_int8_t key8[16]; + 
u_int16_t key16[8]; + u_int32_t key32[4]; + } pfk; /* 128-bit hash key */ +#define key8 pfk.key8 +#define key16 pfk.key16 +#define key32 pfk.key32 +}; + +struct pf_pool { + struct pf_palist list; + struct pf_pooladdr *cur; + struct pf_poolhashkey key; + struct pf_addr counter; + int tblidx; + u_int16_t proxy_port[2]; + u_int8_t opts; +}; + + +/* A packed Operating System description for fingerprinting */ +typedef u_int32_t pf_osfp_t; +#define PF_OSFP_ANY ((pf_osfp_t)0) +#define PF_OSFP_UNKNOWN ((pf_osfp_t)-1) +#define PF_OSFP_NOMATCH ((pf_osfp_t)-2) + +struct pf_osfp_entry { + SLIST_ENTRY(pf_osfp_entry) fp_entry; + pf_osfp_t fp_os; + int fp_enflags; +#define PF_OSFP_EXPANDED 0x001 /* expanded entry */ +#define PF_OSFP_GENERIC 0x002 /* generic signature */ +#define PF_OSFP_NODETAIL 0x004 /* no p0f details */ +#define PF_OSFP_LEN 32 + char fp_class_nm[PF_OSFP_LEN]; + char fp_version_nm[PF_OSFP_LEN]; + char fp_subtype_nm[PF_OSFP_LEN]; +}; +#define PF_OSFP_ENTRY_EQ(a, b) \ + ((a)->fp_os == (b)->fp_os && \ + memcmp((a)->fp_class_nm, (b)->fp_class_nm, PF_OSFP_LEN) == 0 && \ + memcmp((a)->fp_version_nm, (b)->fp_version_nm, PF_OSFP_LEN) == 0 && \ + memcmp((a)->fp_subtype_nm, (b)->fp_subtype_nm, PF_OSFP_LEN) == 0) + +/* handle pf_osfp_t packing */ +#define _FP_RESERVED_BIT 1 /* For the special negative #defines */ +#define _FP_UNUSED_BITS 1 +#define _FP_CLASS_BITS 10 /* OS Class (Windows, Linux) */ +#define _FP_VERSION_BITS 10 /* OS version (95, 98, NT, 2.4.54, 3.2) */ +#define _FP_SUBTYPE_BITS 10 /* patch level (NT SP4, SP3, ECN patch) */ +#define PF_OSFP_UNPACK(osfp, class, version, subtype) do { \ + (class) = ((osfp) >> (_FP_VERSION_BITS+_FP_SUBTYPE_BITS)) & \ + ((1 << _FP_CLASS_BITS) - 1); \ + (version) = ((osfp) >> _FP_SUBTYPE_BITS) & \ + ((1 << _FP_VERSION_BITS) - 1);\ + (subtype) = (osfp) & ((1 << _FP_SUBTYPE_BITS) - 1); \ +} while(0) +#define PF_OSFP_PACK(osfp, class, version, subtype) do { \ + (osfp) = ((class) & ((1 << _FP_CLASS_BITS) - 1)) << (_FP_VERSION_BITS \ 
+ + _FP_SUBTYPE_BITS); \ + (osfp) |= ((version) & ((1 << _FP_VERSION_BITS) - 1)) << \ + _FP_SUBTYPE_BITS; \ + (osfp) |= (subtype) & ((1 << _FP_SUBTYPE_BITS) - 1); \ +} while(0) + +/* the fingerprint of an OSes TCP SYN packet */ +typedef u_int64_t pf_tcpopts_t; +struct pf_os_fingerprint { + SLIST_HEAD(pf_osfp_enlist, pf_osfp_entry) fp_oses; /* list of matches */ + pf_tcpopts_t fp_tcpopts; /* packed TCP options */ + u_int16_t fp_wsize; /* TCP window size */ + u_int16_t fp_psize; /* ip->ip_len */ + u_int16_t fp_mss; /* TCP MSS */ + u_int16_t fp_flags; +#define PF_OSFP_WSIZE_MOD 0x0001 /* Window modulus */ +#define PF_OSFP_WSIZE_DC 0x0002 /* Window don't care */ +#define PF_OSFP_WSIZE_MSS 0x0004 /* Window multiple of MSS */ +#define PF_OSFP_WSIZE_MTU 0x0008 /* Window multiple of MTU */ +#define PF_OSFP_PSIZE_MOD 0x0010 /* packet size modulus */ +#define PF_OSFP_PSIZE_DC 0x0020 /* packet size don't care */ +#define PF_OSFP_WSCALE 0x0040 /* TCP window scaling */ +#define PF_OSFP_WSCALE_MOD 0x0080 /* TCP window scale modulus */ +#define PF_OSFP_WSCALE_DC 0x0100 /* TCP window scale dont-care */ +#define PF_OSFP_MSS 0x0200 /* TCP MSS */ +#define PF_OSFP_MSS_MOD 0x0400 /* TCP MSS modulus */ +#define PF_OSFP_MSS_DC 0x0800 /* TCP MSS dont-care */ +#define PF_OSFP_DF 0x1000 /* IPv4 don't fragment bit */ +#define PF_OSFP_TS0 0x2000 /* Zero timestamp */ +#define PF_OSFP_INET6 0x4000 /* IPv6 */ + u_int8_t fp_optcnt; /* TCP option count */ + u_int8_t fp_wscale; /* TCP window scaling */ + u_int8_t fp_ttl; /* IPv4 TTL */ +#define PF_OSFP_MAXTTL_OFFSET 40 +/* TCP options packing */ +#define PF_OSFP_TCPOPT_NOP 0x0 /* TCP NOP option */ +#define PF_OSFP_TCPOPT_WSCALE 0x1 /* TCP window scaling option */ +#define PF_OSFP_TCPOPT_MSS 0x2 /* TCP max segment size opt */ +#define PF_OSFP_TCPOPT_SACK 0x3 /* TCP SACK OK option */ +#define PF_OSFP_TCPOPT_TS 0x4 /* TCP timestamp option */ +#define PF_OSFP_TCPOPT_BITS 3 /* bits used by each option */ +#define PF_OSFP_MAX_OPTS \ + (sizeof(((struct 
pf_os_fingerprint *)0)->fp_tcpopts) * 8) \ + / PF_OSFP_TCPOPT_BITS + + SLIST_ENTRY(pf_os_fingerprint) fp_next; +}; + +struct pf_osfp_ioctl { + struct pf_osfp_entry fp_os; + pf_tcpopts_t fp_tcpopts; /* packed TCP options */ + u_int16_t fp_wsize; /* TCP window size */ + u_int16_t fp_psize; /* ip->ip_len */ + u_int16_t fp_mss; /* TCP MSS */ + u_int16_t fp_flags; + u_int8_t fp_optcnt; /* TCP option count */ + u_int8_t fp_wscale; /* TCP window scaling */ + u_int8_t fp_ttl; /* IPv4 TTL */ + + int fp_getnum; /* DIOCOSFPGET number */ +}; + + +union pf_rule_ptr { + struct pf_rule *ptr; + u_int32_t nr; +}; + +#define PF_ANCHOR_NAME_SIZE 64 + +struct pf_rule { + struct pf_rule_addr src; + struct pf_rule_addr dst; +#define PF_SKIP_IFP 0 +#define PF_SKIP_DIR 1 +#define PF_SKIP_AF 2 +#define PF_SKIP_PROTO 3 +#define PF_SKIP_SRC_ADDR 4 +#define PF_SKIP_SRC_PORT 5 +#define PF_SKIP_DST_ADDR 6 +#define PF_SKIP_DST_PORT 7 +#define PF_SKIP_COUNT 8 + union pf_rule_ptr skip[PF_SKIP_COUNT]; +#define PF_RULE_LABEL_SIZE 64 + char label[PF_RULE_LABEL_SIZE]; + char ifname[IFNAMSIZ]; + char qname[PF_QNAME_SIZE]; + char pqname[PF_QNAME_SIZE]; +#define PF_TAG_NAME_SIZE 64 + char tagname[PF_TAG_NAME_SIZE]; + char match_tagname[PF_TAG_NAME_SIZE]; + + char overload_tblname[PF_TABLE_NAME_SIZE]; + + TAILQ_ENTRY(pf_rule) entries; + struct pf_pool rpool; + + u_int64_t evaluations; + u_int64_t packets[2]; + u_int64_t bytes[2]; + + struct pfi_kif *kif; + struct pf_anchor *anchor; + struct pfr_ktable *overload_tbl; + + pf_osfp_t os_fingerprint; + + int rtableid; + u_int32_t timeout[PFTM_MAX]; + u_int32_t max_states; + u_int32_t max_src_nodes; + u_int32_t max_src_states; + u_int32_t max_src_conn; + struct { + u_int32_t limit; + u_int32_t seconds; + } max_src_conn_rate; + u_int32_t qid; + u_int32_t pqid; + u_int32_t rt_listid; + u_int32_t nr; + u_int32_t prob; + uid_t cuid; + pid_t cpid; + + counter_u64_t states_cur; + counter_u64_t states_tot; + counter_u64_t src_nodes; + + u_int16_t return_icmp; + 
u_int16_t return_icmp6; + u_int16_t max_mss; + u_int16_t tag; + u_int16_t match_tag; + u_int16_t scrub_flags; + + struct pf_rule_uid uid; + struct pf_rule_gid gid; + + u_int32_t rule_flag; + u_int8_t action; + u_int8_t direction; + u_int8_t log; + u_int8_t logif; + u_int8_t quick; + u_int8_t ifnot; + u_int8_t match_tag_not; + u_int8_t natpass; + +#define PF_STATE_NORMAL 0x1 +#define PF_STATE_MODULATE 0x2 +#define PF_STATE_SYNPROXY 0x3 + u_int8_t keep_state; + sa_family_t af; + u_int8_t proto; + u_int8_t type; + u_int8_t code; + u_int8_t flags; + u_int8_t flagset; + u_int8_t min_ttl; + u_int8_t allow_opts; + u_int8_t rt; + u_int8_t return_ttl; + u_int8_t tos; + u_int8_t set_tos; + u_int8_t anchor_relative; + u_int8_t anchor_wildcard; + +#define PF_FLUSH 0x01 +#define PF_FLUSH_GLOBAL 0x02 + u_int8_t flush; +#define PF_PRIO_ZERO 0xff /* match "prio 0" packets */ +#define PF_PRIO_MAX 7 + u_int8_t prio; + u_int8_t set_prio[2]; + + struct { + struct pf_addr addr; + u_int16_t port; + } divert; + + uint64_t u_states_cur; + uint64_t u_states_tot; + uint64_t u_src_nodes; +}; + +/* rule flags */ +#define PFRULE_DROP 0x0000 +#define PFRULE_RETURNRST 0x0001 +#define PFRULE_FRAGMENT 0x0002 +#define PFRULE_RETURNICMP 0x0004 +#define PFRULE_RETURN 0x0008 +#define PFRULE_NOSYNC 0x0010 +#define PFRULE_SRCTRACK 0x0020 /* track source states */ +#define PFRULE_RULESRCTRACK 0x0040 /* per rule */ +#define PFRULE_REFS 0x0080 /* rule has references */ + +/* scrub flags */ +#define PFRULE_NODF 0x0100 +#define PFRULE_RANDOMID 0x0800 +#define PFRULE_REASSEMBLE_TCP 0x1000 +#define PFRULE_SET_TOS 0x2000 + +/* rule flags again */ +#define PFRULE_IFBOUND 0x00010000 /* if-bound */ +#define PFRULE_STATESLOPPY 0x00020000 /* sloppy state tracking */ + +#define PFSTATE_HIWAT 10000 /* default state table size */ +#define PFSTATE_ADAPT_START 6000 /* default adaptive timeout start */ +#define PFSTATE_ADAPT_END 12000 /* default adaptive timeout end */ + + +struct pf_threshold { + u_int32_t limit; 
+#define PF_THRESHOLD_MULT 1000 +#define PF_THRESHOLD_MAX 0xffffffff / PF_THRESHOLD_MULT + u_int32_t seconds; + u_int32_t count; + u_int32_t last; +}; + +struct pf_src_node { + LIST_ENTRY(pf_src_node) entry; + struct pf_addr addr; + struct pf_addr raddr; + union pf_rule_ptr rule; + struct pfi_kif *kif; + u_int64_t bytes[2]; + u_int64_t packets[2]; + u_int32_t states; + u_int32_t conn; + struct pf_threshold conn_rate; + u_int32_t creation; + u_int32_t expire; + sa_family_t af; + u_int8_t ruletype; +}; + +#define PFSNODE_HIWAT 10000 /* default source node table size */ + +struct pf_state_scrub { + struct timeval pfss_last; /* time received last packet */ + u_int32_t pfss_tsecr; /* last echoed timestamp */ + u_int32_t pfss_tsval; /* largest timestamp */ + u_int32_t pfss_tsval0; /* original timestamp */ + u_int16_t pfss_flags; +#define PFSS_TIMESTAMP 0x0001 /* modulate timestamp */ +#define PFSS_PAWS 0x0010 /* stricter PAWS checks */ +#define PFSS_PAWS_IDLED 0x0020 /* was idle too long. no PAWS */ +#define PFSS_DATA_TS 0x0040 /* timestamp on data packets */ +#define PFSS_DATA_NOTS 0x0080 /* no timestamp on data packets */ + u_int8_t pfss_ttl; /* stashed TTL */ + u_int8_t pad; + u_int32_t pfss_ts_mod; /* timestamp modulation */ +}; + +struct pf_state_host { + struct pf_addr addr; + u_int16_t port; + u_int16_t pad; +}; + +struct pf_state_peer { + struct pf_state_scrub *scrub; /* state is scrubbed */ + u_int32_t seqlo; /* Max sequence number sent */ + u_int32_t seqhi; /* Max the other end ACKd + win */ + u_int32_t seqdiff; /* Sequence number modulator */ + u_int16_t max_win; /* largest window (pre scaling) */ + u_int16_t mss; /* Maximum segment size option */ + u_int8_t state; /* active state level */ + u_int8_t wscale; /* window scaling factor */ + u_int8_t tcp_est; /* Did we reach TCPS_ESTABLISHED */ + u_int8_t pad[1]; +}; + +/* Keep synced with struct pf_state_key. 
*/ +struct pf_state_key_cmp { + struct pf_addr addr[2]; + u_int16_t port[2]; + sa_family_t af; + u_int8_t proto; + u_int8_t pad[2]; +}; + +struct pf_state_key { + struct pf_addr addr[2]; + u_int16_t port[2]; + sa_family_t af; + u_int8_t proto; + u_int8_t pad[2]; + + LIST_ENTRY(pf_state_key) entry; + TAILQ_HEAD(, pf_state) states[2]; +}; + +/* Keep synced with struct pf_state. */ +struct pf_state_cmp { + u_int64_t id; + u_int32_t creatorid; + u_int8_t direction; + u_int8_t pad[3]; +}; + +struct pf_state { + u_int64_t id; + u_int32_t creatorid; + u_int8_t direction; + u_int8_t pad[3]; + + u_int refs; + TAILQ_ENTRY(pf_state) sync_list; + TAILQ_ENTRY(pf_state) key_list[2]; + LIST_ENTRY(pf_state) entry; + struct pf_state_peer src; + struct pf_state_peer dst; + union pf_rule_ptr rule; + union pf_rule_ptr anchor; + union pf_rule_ptr nat_rule; + struct pf_addr rt_addr; + struct pf_state_key *key[2]; /* addresses stack and wire */ + struct pfi_kif *kif; + struct pfi_kif *rt_kif; + struct pf_src_node *src_node; + struct pf_src_node *nat_src_node; + u_int64_t packets[2]; + u_int64_t bytes[2]; + u_int32_t creation; + u_int32_t expire; + u_int32_t pfsync_time; + u_int16_t tag; + u_int8_t log; + u_int8_t state_flags; +#define PFSTATE_ALLOWOPTS 0x01 +#define PFSTATE_SLOPPY 0x02 +/* was PFSTATE_PFLOW 0x04 */ +#define PFSTATE_NOSYNC 0x08 +#define PFSTATE_ACK 0x10 +#define PFSTATE_SETPRIO 0x0200 +#define PFSTATE_SETMASK (PFSTATE_SETPRIO) + u_int8_t timeout; + u_int8_t sync_state; /* PFSYNC_S_x */ + + /* XXX */ + u_int8_t sync_updates; + u_int8_t _tail[3]; +}; + +/* + * Unified state structures for pulling states out of the kernel + * used by pfsync(4) and the pf(4) ioctl. 
+ */ +struct pfsync_state_scrub { + u_int16_t pfss_flags; + u_int8_t pfss_ttl; /* stashed TTL */ +#define PFSYNC_SCRUB_FLAG_VALID 0x01 + u_int8_t scrub_flag; + u_int32_t pfss_ts_mod; /* timestamp modulation */ +} __packed; + +struct pfsync_state_peer { + struct pfsync_state_scrub scrub; /* state is scrubbed */ + u_int32_t seqlo; /* Max sequence number sent */ + u_int32_t seqhi; /* Max the other end ACKd + win */ + u_int32_t seqdiff; /* Sequence number modulator */ + u_int16_t max_win; /* largest window (pre scaling) */ + u_int16_t mss; /* Maximum segment size option */ + u_int8_t state; /* active state level */ + u_int8_t wscale; /* window scaling factor */ + u_int8_t pad[6]; +} __packed; + +struct pfsync_state_key { + struct pf_addr addr[2]; + u_int16_t port[2]; +}; + +struct pfsync_state { + u_int64_t id; + char ifname[IFNAMSIZ]; + struct pfsync_state_key key[2]; + struct pfsync_state_peer src; + struct pfsync_state_peer dst; + struct pf_addr rt_addr; + u_int32_t rule; + u_int32_t anchor; + u_int32_t nat_rule; + u_int32_t creation; + u_int32_t expire; + u_int32_t packets[2][2]; + u_int32_t bytes[2][2]; + u_int32_t creatorid; + sa_family_t af; + u_int8_t proto; + u_int8_t direction; + u_int8_t __spare[2]; + u_int8_t log; + u_int8_t state_flags; + u_int8_t timeout; + u_int8_t sync_flags; + u_int8_t updates; +} __packed; + +#ifdef _KERNEL +/* pfsync */ +typedef int pfsync_state_import_t(struct pfsync_state *, u_int8_t); +typedef void pfsync_insert_state_t(struct pf_state *); +typedef void pfsync_update_state_t(struct pf_state *); +typedef void pfsync_delete_state_t(struct pf_state *); +typedef void pfsync_clear_states_t(u_int32_t, const char *); +typedef int pfsync_defer_t(struct pf_state *, struct mbuf *); + +extern pfsync_state_import_t *pfsync_state_import_ptr; +extern pfsync_insert_state_t *pfsync_insert_state_ptr; +extern pfsync_update_state_t *pfsync_update_state_ptr; +extern pfsync_delete_state_t *pfsync_delete_state_ptr; +extern pfsync_clear_states_t 
*pfsync_clear_states_ptr; +extern pfsync_defer_t *pfsync_defer_ptr; + +void pfsync_state_export(struct pfsync_state *, + struct pf_state *); + +/* pflog */ +struct pf_ruleset; +struct pf_pdesc; +typedef int pflog_packet_t(struct pfi_kif *, struct mbuf *, sa_family_t, + u_int8_t, u_int8_t, struct pf_rule *, struct pf_rule *, + struct pf_ruleset *, struct pf_pdesc *, int); +extern pflog_packet_t *pflog_packet_ptr; + +#endif /* _KERNEL */ + +#define PFSYNC_FLAG_SRCNODE 0x04 +#define PFSYNC_FLAG_NATSRCNODE 0x08 + +/* for copies to/from network byte order */ +/* ioctl interface also uses network byte order */ +#define pf_state_peer_hton(s,d) do { \ + (d)->seqlo = htonl((s)->seqlo); \ + (d)->seqhi = htonl((s)->seqhi); \ + (d)->seqdiff = htonl((s)->seqdiff); \ + (d)->max_win = htons((s)->max_win); \ + (d)->mss = htons((s)->mss); \ + (d)->state = (s)->state; \ + (d)->wscale = (s)->wscale; \ + if ((s)->scrub) { \ + (d)->scrub.pfss_flags = \ + htons((s)->scrub->pfss_flags & PFSS_TIMESTAMP); \ + (d)->scrub.pfss_ttl = (s)->scrub->pfss_ttl; \ + (d)->scrub.pfss_ts_mod = htonl((s)->scrub->pfss_ts_mod);\ + (d)->scrub.scrub_flag = PFSYNC_SCRUB_FLAG_VALID; \ + } \ +} while (0) + +#define pf_state_peer_ntoh(s,d) do { \ + (d)->seqlo = ntohl((s)->seqlo); \ + (d)->seqhi = ntohl((s)->seqhi); \ + (d)->seqdiff = ntohl((s)->seqdiff); \ + (d)->max_win = ntohs((s)->max_win); \ + (d)->mss = ntohs((s)->mss); \ + (d)->state = (s)->state; \ + (d)->wscale = (s)->wscale; \ + if ((s)->scrub.scrub_flag == PFSYNC_SCRUB_FLAG_VALID && \ + (d)->scrub != NULL) { \ + (d)->scrub->pfss_flags = \ + ntohs((s)->scrub.pfss_flags) & PFSS_TIMESTAMP; \ + (d)->scrub->pfss_ttl = (s)->scrub.pfss_ttl; \ + (d)->scrub->pfss_ts_mod = ntohl((s)->scrub.pfss_ts_mod);\ + } \ +} while (0) + +#define pf_state_counter_hton(s,d) do { \ + d[0] = htonl((s>>32)&0xffffffff); \ + d[1] = htonl(s&0xffffffff); \ +} while (0) + +#define pf_state_counter_from_pfsync(s) \ + (((u_int64_t)(s[0])<<32) | (u_int64_t)(s[1])) + +#define 
pf_state_counter_ntoh(s,d) do { \ + d = ntohl(s[0]); \ + d = d<<32; \ + d += ntohl(s[1]); \ +} while (0) + +TAILQ_HEAD(pf_rulequeue, pf_rule); + +struct pf_anchor; + +struct pf_ruleset { + struct { + struct pf_rulequeue queues[2]; + struct { + struct pf_rulequeue *ptr; + struct pf_rule **ptr_array; + u_int32_t rcount; + u_int32_t ticket; + int open; + } active, inactive; + } rules[PF_RULESET_MAX]; + struct pf_anchor *anchor; + u_int32_t tticket; + int tables; + int topen; +}; + +RB_HEAD(pf_anchor_global, pf_anchor); +RB_HEAD(pf_anchor_node, pf_anchor); +struct pf_anchor { + RB_ENTRY(pf_anchor) entry_global; + RB_ENTRY(pf_anchor) entry_node; + struct pf_anchor *parent; + struct pf_anchor_node children; + char name[PF_ANCHOR_NAME_SIZE]; + char path[MAXPATHLEN]; + struct pf_ruleset ruleset; + int refcnt; /* anchor rules */ + int match; /* XXX: used for pfctl black magic */ +}; +RB_PROTOTYPE(pf_anchor_global, pf_anchor, entry_global, pf_anchor_compare); +RB_PROTOTYPE(pf_anchor_node, pf_anchor, entry_node, pf_anchor_compare); + +#define PF_RESERVED_ANCHOR "_pf" + +#define PFR_TFLAG_PERSIST 0x00000001 +#define PFR_TFLAG_CONST 0x00000002 +#define PFR_TFLAG_ACTIVE 0x00000004 +#define PFR_TFLAG_INACTIVE 0x00000008 +#define PFR_TFLAG_REFERENCED 0x00000010 +#define PFR_TFLAG_REFDANCHOR 0x00000020 +#define PFR_TFLAG_COUNTERS 0x00000040 +/* Adjust masks below when adding flags. 
*/ +#define PFR_TFLAG_USRMASK (PFR_TFLAG_PERSIST | \ + PFR_TFLAG_CONST | \ + PFR_TFLAG_COUNTERS) +#define PFR_TFLAG_SETMASK (PFR_TFLAG_ACTIVE | \ + PFR_TFLAG_INACTIVE | \ + PFR_TFLAG_REFERENCED | \ + PFR_TFLAG_REFDANCHOR) +#define PFR_TFLAG_ALLMASK (PFR_TFLAG_PERSIST | \ + PFR_TFLAG_CONST | \ + PFR_TFLAG_ACTIVE | \ + PFR_TFLAG_INACTIVE | \ + PFR_TFLAG_REFERENCED | \ + PFR_TFLAG_REFDANCHOR | \ + PFR_TFLAG_COUNTERS) + +struct pf_anchor_stackframe; + +struct pfr_table { + char pfrt_anchor[MAXPATHLEN]; + char pfrt_name[PF_TABLE_NAME_SIZE]; + u_int32_t pfrt_flags; + u_int8_t pfrt_fback; +}; + +enum { PFR_FB_NONE, PFR_FB_MATCH, PFR_FB_ADDED, PFR_FB_DELETED, + PFR_FB_CHANGED, PFR_FB_CLEARED, PFR_FB_DUPLICATE, + PFR_FB_NOTMATCH, PFR_FB_CONFLICT, PFR_FB_NOCOUNT, PFR_FB_MAX }; + +struct pfr_addr { + union { + struct in_addr _pfra_ip4addr; + struct in6_addr _pfra_ip6addr; + } pfra_u; + u_int8_t pfra_af; + u_int8_t pfra_net; + u_int8_t pfra_not; + u_int8_t pfra_fback; +}; +#define pfra_ip4addr pfra_u._pfra_ip4addr +#define pfra_ip6addr pfra_u._pfra_ip6addr + +enum { PFR_DIR_IN, PFR_DIR_OUT, PFR_DIR_MAX }; +enum { PFR_OP_BLOCK, PFR_OP_PASS, PFR_OP_ADDR_MAX, PFR_OP_TABLE_MAX }; +#define PFR_OP_XPASS PFR_OP_ADDR_MAX + +struct pfr_astats { + struct pfr_addr pfras_a; + u_int64_t pfras_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + u_int64_t pfras_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + long pfras_tzero; +}; + +enum { PFR_REFCNT_RULE, PFR_REFCNT_ANCHOR, PFR_REFCNT_MAX }; + +struct pfr_tstats { + struct pfr_table pfrts_t; + u_int64_t pfrts_packets[PFR_DIR_MAX][PFR_OP_TABLE_MAX]; + u_int64_t pfrts_bytes[PFR_DIR_MAX][PFR_OP_TABLE_MAX]; + u_int64_t pfrts_match; + u_int64_t pfrts_nomatch; + long pfrts_tzero; + int pfrts_cnt; + int pfrts_refcnt[PFR_REFCNT_MAX]; +}; +#define pfrts_name pfrts_t.pfrt_name +#define pfrts_flags pfrts_t.pfrt_flags + +#ifndef _SOCKADDR_UNION_DEFINED +#define _SOCKADDR_UNION_DEFINED +union sockaddr_union { + struct sockaddr sa; + struct sockaddr_in sin; + struct 
sockaddr_in6 sin6; +}; +#endif /* _SOCKADDR_UNION_DEFINED */ + +struct pfr_kcounters { + u_int64_t pfrkc_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; + u_int64_t pfrkc_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; +}; + +SLIST_HEAD(pfr_kentryworkq, pfr_kentry); +struct pfr_kentry { + struct radix_node pfrke_node[2]; + union sockaddr_union pfrke_sa; + SLIST_ENTRY(pfr_kentry) pfrke_workq; + struct pfr_kcounters *pfrke_counters; + long pfrke_tzero; + u_int8_t pfrke_af; + u_int8_t pfrke_net; + u_int8_t pfrke_not; + u_int8_t pfrke_mark; +}; + +SLIST_HEAD(pfr_ktableworkq, pfr_ktable); +RB_HEAD(pfr_ktablehead, pfr_ktable); +struct pfr_ktable { + struct pfr_tstats pfrkt_ts; + RB_ENTRY(pfr_ktable) pfrkt_tree; + SLIST_ENTRY(pfr_ktable) pfrkt_workq; + struct radix_node_head *pfrkt_ip4; + struct radix_node_head *pfrkt_ip6; + struct pfr_ktable *pfrkt_shadow; + struct pfr_ktable *pfrkt_root; + struct pf_ruleset *pfrkt_rs; + long pfrkt_larg; + int pfrkt_nflags; +}; +#define pfrkt_t pfrkt_ts.pfrts_t +#define pfrkt_name pfrkt_t.pfrt_name +#define pfrkt_anchor pfrkt_t.pfrt_anchor +#define pfrkt_ruleset pfrkt_t.pfrt_ruleset +#define pfrkt_flags pfrkt_t.pfrt_flags +#define pfrkt_cnt pfrkt_ts.pfrts_cnt +#define pfrkt_refcnt pfrkt_ts.pfrts_refcnt +#define pfrkt_packets pfrkt_ts.pfrts_packets +#define pfrkt_bytes pfrkt_ts.pfrts_bytes +#define pfrkt_match pfrkt_ts.pfrts_match +#define pfrkt_nomatch pfrkt_ts.pfrts_nomatch +#define pfrkt_tzero pfrkt_ts.pfrts_tzero + +/* keep synced with pfi_kif, used in RB_FIND */ +struct pfi_kif_cmp { + char pfik_name[IFNAMSIZ]; +}; + +struct pfi_kif { + char pfik_name[IFNAMSIZ]; + union { + RB_ENTRY(pfi_kif) _pfik_tree; + LIST_ENTRY(pfi_kif) _pfik_list; + } _pfik_glue; +#define pfik_tree _pfik_glue._pfik_tree +#define pfik_list _pfik_glue._pfik_list + u_int64_t pfik_packets[2][2][2]; + u_int64_t pfik_bytes[2][2][2]; + u_int32_t pfik_tzero; + u_int pfik_flags; + struct ifnet *pfik_ifp; + struct ifg_group *pfik_group; + u_int pfik_rulerefs; + TAILQ_HEAD(, pfi_dynaddr) 
pfik_dynaddrs; +}; + +#define PFI_IFLAG_REFS 0x0001 /* has state references */ +#define PFI_IFLAG_SKIP 0x0100 /* skip filtering on interface */ + +struct pf_pdesc { + struct { + int done; + uid_t uid; + gid_t gid; + } lookup; + u_int64_t tot_len; /* Make Mickey money */ + union { + struct tcphdr *tcp; + struct udphdr *udp; + struct icmp *icmp; +#ifdef INET6 + struct icmp6_hdr *icmp6; +#endif /* INET6 */ + void *any; + } hdr; + + struct pf_rule *nat_rule; /* nat/rdr rule applied to packet */ + struct pf_addr *src; /* src address */ + struct pf_addr *dst; /* dst address */ + u_int16_t *sport; + u_int16_t *dport; + struct pf_mtag *pf_mtag; + + u_int32_t p_len; /* total length of payload */ + + u_int16_t *ip_sum; + u_int16_t *proto_sum; + u_int16_t flags; /* Let SCRUB trigger behavior in + * state code. Easier than tags */ +#define PFDESC_TCP_NORM 0x0001 /* TCP shall be statefully scrubbed */ +#define PFDESC_IP_REAS 0x0002 /* IP frags would've been reassembled */ + sa_family_t af; + u_int8_t proto; + u_int8_t tos; + u_int8_t dir; /* direction */ + u_int8_t sidx; /* key index for source */ + u_int8_t didx; /* key index for destination */ +}; + +/* flags for RDR options */ +#define PF_DPORT_RANGE 0x01 /* Dest port uses range */ +#define PF_RPORT_RANGE 0x02 /* RDR'ed port uses range */ + +/* UDP state enumeration */ +#define PFUDPS_NO_TRAFFIC 0 +#define PFUDPS_SINGLE 1 +#define PFUDPS_MULTIPLE 2 + +#define PFUDPS_NSTATES 3 /* number of state levels */ + +#define PFUDPS_NAMES { \ + "NO_TRAFFIC", \ + "SINGLE", \ + "MULTIPLE", \ + NULL \ +} + +/* Other protocol state enumeration */ +#define PFOTHERS_NO_TRAFFIC 0 +#define PFOTHERS_SINGLE 1 +#define PFOTHERS_MULTIPLE 2 + +#define PFOTHERS_NSTATES 3 /* number of state levels */ + +#define PFOTHERS_NAMES { \ + "NO_TRAFFIC", \ + "SINGLE", \ + "MULTIPLE", \ + NULL \ +} + +#define ACTION_SET(a, x) \ + do { \ + if ((a) != NULL) \ + *(a) = (x); \ + } while (0) + +#define REASON_SET(a, x) \ + do { \ + if ((a) != NULL) \ + *(a) = (x); 
\ + if (x < PFRES_MAX) \ + counter_u64_add(V_pf_status.counters[x], 1); \ + } while (0) + +struct pf_kstatus { + counter_u64_t counters[PFRES_MAX]; /* reason for passing/dropping */ + counter_u64_t lcounters[LCNT_MAX]; /* limit counters */ + counter_u64_t fcounters[FCNT_MAX]; /* state operation counters */ + counter_u64_t scounters[SCNT_MAX]; /* src_node operation counters */ + uint32_t states; + uint32_t src_nodes; + uint32_t running; + uint32_t since; + uint32_t debug; + uint32_t hostid; + char ifname[IFNAMSIZ]; + uint8_t pf_chksum[PF_MD5_DIGEST_LENGTH]; +}; + +struct pf_divert { + union { + struct in_addr ipv4; + struct in6_addr ipv6; + } addr; + u_int16_t port; +}; + +#define PFFRAG_FRENT_HIWAT 5000 /* Number of fragment entries */ +#define PFR_KENTRY_HIWAT 200000 /* Number of table entries */ + +/* + * ioctl parameter structures + */ + +struct pfioc_pooladdr { + u_int32_t action; + u_int32_t ticket; + u_int32_t nr; + u_int32_t r_num; + u_int8_t r_action; + u_int8_t r_last; + u_int8_t af; + char anchor[MAXPATHLEN]; + struct pf_pooladdr addr; +}; + +struct pfioc_rule { + u_int32_t action; + u_int32_t ticket; + u_int32_t pool_ticket; + u_int32_t nr; + char anchor[MAXPATHLEN]; + char anchor_call[MAXPATHLEN]; + struct pf_rule rule; +}; + +struct pfioc_natlook { + struct pf_addr saddr; + struct pf_addr daddr; + struct pf_addr rsaddr; + struct pf_addr rdaddr; + u_int16_t sport; + u_int16_t dport; + u_int16_t rsport; + u_int16_t rdport; + sa_family_t af; + u_int8_t proto; + u_int8_t direction; +}; + +struct pfioc_state { + struct pfsync_state state; +}; + +struct pfioc_src_node_kill { + sa_family_t psnk_af; + struct pf_rule_addr psnk_src; + struct pf_rule_addr psnk_dst; + u_int psnk_killed; +}; + +struct pfioc_state_kill { + struct pf_state_cmp psk_pfcmp; + sa_family_t psk_af; + int psk_proto; + struct pf_rule_addr psk_src; + struct pf_rule_addr psk_dst; + char psk_ifname[IFNAMSIZ]; + char psk_label[PF_RULE_LABEL_SIZE]; + u_int psk_killed; +}; + +struct pfioc_states { 
+ int ps_len; + union { + caddr_t psu_buf; + struct pfsync_state *psu_states; + } ps_u; +#define ps_buf ps_u.psu_buf +#define ps_states ps_u.psu_states +}; + +struct pfioc_src_nodes { + int psn_len; + union { + caddr_t psu_buf; + struct pf_src_node *psu_src_nodes; + } psn_u; +#define psn_buf psn_u.psu_buf +#define psn_src_nodes psn_u.psu_src_nodes +}; + +struct pfioc_if { + char ifname[IFNAMSIZ]; +}; + +struct pfioc_tm { + int timeout; + int seconds; +}; + +struct pfioc_limit { + int index; + unsigned limit; +}; + +struct pfioc_altq { + u_int32_t action; + u_int32_t ticket; + u_int32_t nr; + struct pf_altq altq; +}; + +struct pfioc_qstats { + u_int32_t ticket; + u_int32_t nr; + void *buf; + int nbytes; + u_int8_t scheduler; +}; + +struct pfioc_ruleset { + u_int32_t nr; + char path[MAXPATHLEN]; + char name[PF_ANCHOR_NAME_SIZE]; +}; + +#define PF_RULESET_ALTQ (PF_RULESET_MAX) +#define PF_RULESET_TABLE (PF_RULESET_MAX+1) +struct pfioc_trans { + int size; /* number of elements */ + int esize; /* size of each element in bytes */ + struct pfioc_trans_e { + int rs_num; + char anchor[MAXPATHLEN]; + u_int32_t ticket; + } *array; +}; + +#define PFR_FLAG_ATOMIC 0x00000001 /* unused */ +#define PFR_FLAG_DUMMY 0x00000002 +#define PFR_FLAG_FEEDBACK 0x00000004 +#define PFR_FLAG_CLSTATS 0x00000008 +#define PFR_FLAG_ADDRSTOO 0x00000010 +#define PFR_FLAG_REPLACE 0x00000020 +#define PFR_FLAG_ALLRSETS 0x00000040 +#define PFR_FLAG_ALLMASK 0x0000007F +#ifdef _KERNEL +#define PFR_FLAG_USERIOCTL 0x10000000 +#endif + +struct pfioc_table { + struct pfr_table pfrio_table; + void *pfrio_buffer; + int pfrio_esize; + int pfrio_size; + int pfrio_size2; + int pfrio_nadd; + int pfrio_ndel; + int pfrio_nchange; + int pfrio_flags; + u_int32_t pfrio_ticket; +}; +#define pfrio_exists pfrio_nadd +#define pfrio_nzero pfrio_nadd +#define pfrio_nmatch pfrio_nadd +#define pfrio_naddr pfrio_size2 +#define pfrio_setflag pfrio_size2 +#define pfrio_clrflag pfrio_nadd + +struct pfioc_iface { + char 
pfiio_name[IFNAMSIZ]; + void *pfiio_buffer; + int pfiio_esize; + int pfiio_size; + int pfiio_nzero; + int pfiio_flags; +}; + + +/* + * ioctl operations + */ + +#define DIOCSTART _IO ('D', 1) +#define DIOCSTOP _IO ('D', 2) +#define DIOCADDRULE _IOWR('D', 4, struct pfioc_rule) +#define DIOCGETRULES _IOWR('D', 6, struct pfioc_rule) +#define DIOCGETRULE _IOWR('D', 7, struct pfioc_rule) +/* XXX cut 8 - 17 */ +#define DIOCCLRSTATES _IOWR('D', 18, struct pfioc_state_kill) +#define DIOCGETSTATE _IOWR('D', 19, struct pfioc_state) +#define DIOCSETSTATUSIF _IOWR('D', 20, struct pfioc_if) +#define DIOCGETSTATUS _IOWR('D', 21, struct pf_status) +#define DIOCCLRSTATUS _IO ('D', 22) +#define DIOCNATLOOK _IOWR('D', 23, struct pfioc_natlook) +#define DIOCSETDEBUG _IOWR('D', 24, u_int32_t) +#define DIOCGETSTATES _IOWR('D', 25, struct pfioc_states) +#define DIOCCHANGERULE _IOWR('D', 26, struct pfioc_rule) +/* XXX cut 26 - 28 */ +#define DIOCSETTIMEOUT _IOWR('D', 29, struct pfioc_tm) +#define DIOCGETTIMEOUT _IOWR('D', 30, struct pfioc_tm) +#define DIOCADDSTATE _IOWR('D', 37, struct pfioc_state) +#define DIOCCLRRULECTRS _IO ('D', 38) +#define DIOCGETLIMIT _IOWR('D', 39, struct pfioc_limit) +#define DIOCSETLIMIT _IOWR('D', 40, struct pfioc_limit) +#define DIOCKILLSTATES _IOWR('D', 41, struct pfioc_state_kill) +#define DIOCSTARTALTQ _IO ('D', 42) +#define DIOCSTOPALTQ _IO ('D', 43) +#define DIOCADDALTQ _IOWR('D', 45, struct pfioc_altq) +#define DIOCGETALTQS _IOWR('D', 47, struct pfioc_altq) +#define DIOCGETALTQ _IOWR('D', 48, struct pfioc_altq) +#define DIOCCHANGEALTQ _IOWR('D', 49, struct pfioc_altq) +#define DIOCGETQSTATS _IOWR('D', 50, struct pfioc_qstats) +#define DIOCBEGINADDRS _IOWR('D', 51, struct pfioc_pooladdr) +#define DIOCADDADDR _IOWR('D', 52, struct pfioc_pooladdr) +#define DIOCGETADDRS _IOWR('D', 53, struct pfioc_pooladdr) +#define DIOCGETADDR _IOWR('D', 54, struct pfioc_pooladdr) +#define DIOCCHANGEADDR _IOWR('D', 55, struct pfioc_pooladdr) +/* XXX cut 55 - 57 */ +#define 
DIOCGETRULESETS _IOWR('D', 58, struct pfioc_ruleset) +#define DIOCGETRULESET _IOWR('D', 59, struct pfioc_ruleset) +#define DIOCRCLRTABLES _IOWR('D', 60, struct pfioc_table) +#define DIOCRADDTABLES _IOWR('D', 61, struct pfioc_table) +#define DIOCRDELTABLES _IOWR('D', 62, struct pfioc_table) +#define DIOCRGETTABLES _IOWR('D', 63, struct pfioc_table) +#define DIOCRGETTSTATS _IOWR('D', 64, struct pfioc_table) +#define DIOCRCLRTSTATS _IOWR('D', 65, struct pfioc_table) +#define DIOCRCLRADDRS _IOWR('D', 66, struct pfioc_table) +#define DIOCRADDADDRS _IOWR('D', 67, struct pfioc_table) +#define DIOCRDELADDRS _IOWR('D', 68, struct pfioc_table) +#define DIOCRSETADDRS _IOWR('D', 69, struct pfioc_table) +#define DIOCRGETADDRS _IOWR('D', 70, struct pfioc_table) +#define DIOCRGETASTATS _IOWR('D', 71, struct pfioc_table) +#define DIOCRCLRASTATS _IOWR('D', 72, struct pfioc_table) +#define DIOCRTSTADDRS _IOWR('D', 73, struct pfioc_table) +#define DIOCRSETTFLAGS _IOWR('D', 74, struct pfioc_table) +#define DIOCRINADEFINE _IOWR('D', 77, struct pfioc_table) +#define DIOCOSFPFLUSH _IO('D', 78) +#define DIOCOSFPADD _IOWR('D', 79, struct pf_osfp_ioctl) +#define DIOCOSFPGET _IOWR('D', 80, struct pf_osfp_ioctl) +#define DIOCXBEGIN _IOWR('D', 81, struct pfioc_trans) +#define DIOCXCOMMIT _IOWR('D', 82, struct pfioc_trans) +#define DIOCXROLLBACK _IOWR('D', 83, struct pfioc_trans) +#define DIOCGETSRCNODES _IOWR('D', 84, struct pfioc_src_nodes) +#define DIOCCLRSRCNODES _IO('D', 85) +#define DIOCSETHOSTID _IOWR('D', 86, u_int32_t) +#define DIOCIGETIFACES _IOWR('D', 87, struct pfioc_iface) +#define DIOCSETIFFLAG _IOWR('D', 89, struct pfioc_iface) +#define DIOCCLRIFFLAG _IOWR('D', 90, struct pfioc_iface) +#define DIOCKILLSRCNODES _IOWR('D', 91, struct pfioc_src_node_kill) +struct pf_ifspeed { + char ifname[IFNAMSIZ]; + u_int32_t baudrate; +}; +#define DIOCGIFSPEED _IOWR('D', 92, struct pf_ifspeed) + +#ifdef _KERNEL +LIST_HEAD(pf_src_node_list, pf_src_node); +struct pf_srchash { + struct 
pf_src_node_list nodes; + struct mtx lock; +}; + +struct pf_keyhash { + LIST_HEAD(, pf_state_key) keys; + struct mtx lock; +}; + +struct pf_idhash { + LIST_HEAD(, pf_state) states; + struct mtx lock; +}; + +extern u_long pf_hashmask; +extern u_long pf_srchashmask; +#define PF_HASHSIZ (32768) +VNET_DECLARE(struct pf_keyhash *, pf_keyhash); +VNET_DECLARE(struct pf_idhash *, pf_idhash); +#define V_pf_keyhash VNET(pf_keyhash) +#define V_pf_idhash VNET(pf_idhash) +VNET_DECLARE(struct pf_srchash *, pf_srchash); +#define V_pf_srchash VNET(pf_srchash) + +#define PF_IDHASH(s) (be64toh((s)->id) % (pf_hashmask + 1)) + +VNET_DECLARE(void *, pf_swi_cookie); +#define V_pf_swi_cookie VNET(pf_swi_cookie) + +VNET_DECLARE(uint64_t, pf_stateid[MAXCPU]); +#define V_pf_stateid VNET(pf_stateid) + +TAILQ_HEAD(pf_altqqueue, pf_altq); +VNET_DECLARE(struct pf_altqqueue, pf_altqs[2]); +#define V_pf_altqs VNET(pf_altqs) +VNET_DECLARE(struct pf_palist, pf_pabuf); +#define V_pf_pabuf VNET(pf_pabuf) + +VNET_DECLARE(u_int32_t, ticket_altqs_active); +#define V_ticket_altqs_active VNET(ticket_altqs_active) +VNET_DECLARE(u_int32_t, ticket_altqs_inactive); +#define V_ticket_altqs_inactive VNET(ticket_altqs_inactive) +VNET_DECLARE(int, altqs_inactive_open); +#define V_altqs_inactive_open VNET(altqs_inactive_open) +VNET_DECLARE(u_int32_t, ticket_pabuf); +#define V_ticket_pabuf VNET(ticket_pabuf) +VNET_DECLARE(struct pf_altqqueue *, pf_altqs_active); +#define V_pf_altqs_active VNET(pf_altqs_active) +VNET_DECLARE(struct pf_altqqueue *, pf_altqs_inactive); +#define V_pf_altqs_inactive VNET(pf_altqs_inactive) + +VNET_DECLARE(struct pf_rulequeue, pf_unlinked_rules); +#define V_pf_unlinked_rules VNET(pf_unlinked_rules) + +void pf_initialize(void); +void pf_mtag_initialize(void); +void pf_mtag_cleanup(void); +void pf_cleanup(void); + +struct pf_mtag *pf_get_mtag(struct mbuf *); + +extern void pf_calc_skip_steps(struct pf_rulequeue *); +#ifdef ALTQ +extern void pf_altq_ifnet_event(struct ifnet *, int); +#endif 
+VNET_DECLARE(uma_zone_t, pf_state_z); +#define V_pf_state_z VNET(pf_state_z) +VNET_DECLARE(uma_zone_t, pf_state_key_z); +#define V_pf_state_key_z VNET(pf_state_key_z) +VNET_DECLARE(uma_zone_t, pf_state_scrub_z); +#define V_pf_state_scrub_z VNET(pf_state_scrub_z) + +extern void pf_purge_thread(void *); +extern void pf_unload_vnet_purge(void); +extern void pf_intr(void *); +extern void pf_purge_expired_src_nodes(void); + +extern int pf_unlink_state(struct pf_state *, u_int); +#define PF_ENTER_LOCKED 0x00000001 +#define PF_RETURN_LOCKED 0x00000002 +extern int pf_state_insert(struct pfi_kif *, + struct pf_state_key *, + struct pf_state_key *, + struct pf_state *); +extern void pf_free_state(struct pf_state *); + +static __inline void +pf_ref_state(struct pf_state *s) +{ + + refcount_acquire(&s->refs); +} + +static __inline int +pf_release_state(struct pf_state *s) +{ + + if (refcount_release(&s->refs)) { + pf_free_state(s); + return (1); + } else + return (0); +} + +extern struct pf_state *pf_find_state_byid(uint64_t, uint32_t); +extern struct pf_state *pf_find_state_all(struct pf_state_key_cmp *, + u_int, int *); +extern struct pf_src_node *pf_find_src_node(struct pf_addr *, + struct pf_rule *, sa_family_t, int); +extern void pf_unlink_src_node(struct pf_src_node *); +extern u_int pf_free_src_nodes(struct pf_src_node_list *); +extern void pf_print_state(struct pf_state *); +extern void pf_print_flags(u_int8_t); +extern u_int16_t pf_cksum_fixup(u_int16_t, u_int16_t, u_int16_t, + u_int8_t); +extern u_int16_t pf_proto_cksum_fixup(struct mbuf *, u_int16_t, + u_int16_t, u_int16_t, u_int8_t); + +VNET_DECLARE(struct ifnet *, sync_ifp); +#define V_sync_ifp VNET(sync_ifp); +VNET_DECLARE(struct pf_rule, pf_default_rule); +#define V_pf_default_rule VNET(pf_default_rule) +extern void pf_addrcpy(struct pf_addr *, struct pf_addr *, + u_int8_t); +void pf_free_rule(struct pf_rule *); + +#ifdef INET +int pf_test(int, struct ifnet *, struct mbuf **, struct inpcb *); +int 
pf_normalize_ip(struct mbuf **, int, struct pfi_kif *, u_short *, + struct pf_pdesc *); +#endif /* INET */ + +#ifdef INET6 +int pf_test6(int, struct ifnet *, struct mbuf **, struct inpcb *); +int pf_normalize_ip6(struct mbuf **, int, struct pfi_kif *, u_short *, + struct pf_pdesc *); +void pf_poolmask(struct pf_addr *, struct pf_addr*, + struct pf_addr *, struct pf_addr *, u_int8_t); +void pf_addr_inc(struct pf_addr *, sa_family_t); +int pf_refragment6(struct ifnet *, struct mbuf **, struct m_tag *); +#endif /* INET6 */ + +u_int32_t pf_new_isn(struct pf_state *); +void *pf_pull_hdr(struct mbuf *, int, void *, int, u_short *, u_short *, + sa_family_t); +void pf_change_a(void *, u_int16_t *, u_int32_t, u_int8_t); +void pf_change_proto_a(struct mbuf *, void *, u_int16_t *, u_int32_t, + u_int8_t); +void pf_change_tcp_a(struct mbuf *, void *, u_int16_t *, u_int32_t); +void pf_send_deferred_syn(struct pf_state *); +int pf_match_addr(u_int8_t, struct pf_addr *, struct pf_addr *, + struct pf_addr *, sa_family_t); +int pf_match_addr_range(struct pf_addr *, struct pf_addr *, + struct pf_addr *, sa_family_t); +int pf_match_port(u_int8_t, u_int16_t, u_int16_t, u_int16_t); + +void pf_normalize_init(void); +void pf_normalize_cleanup(void); +int pf_normalize_tcp(int, struct pfi_kif *, struct mbuf *, int, int, void *, + struct pf_pdesc *); +void pf_normalize_tcp_cleanup(struct pf_state *); +int pf_normalize_tcp_init(struct mbuf *, int, struct pf_pdesc *, + struct tcphdr *, struct pf_state_peer *, struct pf_state_peer *); +int pf_normalize_tcp_stateful(struct mbuf *, int, struct pf_pdesc *, + u_short *, struct tcphdr *, struct pf_state *, + struct pf_state_peer *, struct pf_state_peer *, int *); +u_int32_t + pf_state_expires(const struct pf_state *); +void pf_purge_expired_fragments(void); +int pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *, + int); +int pf_socket_lookup(int, struct pf_pdesc *, struct mbuf *); +struct pf_state_key *pf_alloc_state_key(int); +void 
pfr_initialize(void); +void pfr_cleanup(void); +int pfr_match_addr(struct pfr_ktable *, struct pf_addr *, sa_family_t); +void pfr_update_stats(struct pfr_ktable *, struct pf_addr *, sa_family_t, + u_int64_t, int, int, int); +int pfr_pool_get(struct pfr_ktable *, int *, struct pf_addr *, sa_family_t); +void pfr_dynaddr_update(struct pfr_ktable *, struct pfi_dynaddr *); +struct pfr_ktable * + pfr_attach_table(struct pf_ruleset *, char *); +void pfr_detach_table(struct pfr_ktable *); +int pfr_clr_tables(struct pfr_table *, int *, int); +int pfr_add_tables(struct pfr_table *, int, int *, int); +int pfr_del_tables(struct pfr_table *, int, int *, int); +int pfr_get_tables(struct pfr_table *, struct pfr_table *, int *, int); +int pfr_get_tstats(struct pfr_table *, struct pfr_tstats *, int *, int); +int pfr_clr_tstats(struct pfr_table *, int, int *, int); +int pfr_set_tflags(struct pfr_table *, int, int, int, int *, int *, int); +int pfr_clr_addrs(struct pfr_table *, int *, int); +int pfr_insert_kentry(struct pfr_ktable *, struct pfr_addr *, long); +int pfr_add_addrs(struct pfr_table *, struct pfr_addr *, int, int *, + int); +int pfr_del_addrs(struct pfr_table *, struct pfr_addr *, int, int *, + int); +int pfr_set_addrs(struct pfr_table *, struct pfr_addr *, int, int *, + int *, int *, int *, int, u_int32_t); +int pfr_get_addrs(struct pfr_table *, struct pfr_addr *, int *, int); +int pfr_get_astats(struct pfr_table *, struct pfr_astats *, int *, int); +int pfr_clr_astats(struct pfr_table *, struct pfr_addr *, int, int *, + int); +int pfr_tst_addrs(struct pfr_table *, struct pfr_addr *, int, int *, + int); +int pfr_ina_begin(struct pfr_table *, u_int32_t *, int *, int); +int pfr_ina_rollback(struct pfr_table *, u_int32_t, int *, int); +int pfr_ina_commit(struct pfr_table *, u_int32_t, int *, int *, int); +int pfr_ina_define(struct pfr_table *, struct pfr_addr *, int, int *, + int *, u_int32_t, int); + +MALLOC_DECLARE(PFI_MTYPE); +VNET_DECLARE(struct pfi_kif *, pfi_all); 
+#define V_pfi_all VNET(pfi_all) + +void pfi_initialize(void); +void pfi_initialize_vnet(void); +void pfi_cleanup(void); +void pfi_cleanup_vnet(void); +void pfi_kif_ref(struct pfi_kif *); +void pfi_kif_unref(struct pfi_kif *); +struct pfi_kif *pfi_kif_find(const char *); +struct pfi_kif *pfi_kif_attach(struct pfi_kif *, const char *); +int pfi_kif_match(struct pfi_kif *, struct pfi_kif *); +void pfi_kif_purge(void); +int pfi_match_addr(struct pfi_dynaddr *, struct pf_addr *, + sa_family_t); +int pfi_dynaddr_setup(struct pf_addr_wrap *, sa_family_t); +void pfi_dynaddr_remove(struct pfi_dynaddr *); +void pfi_dynaddr_copyout(struct pf_addr_wrap *); +void pfi_update_status(const char *, struct pf_status *); +void pfi_get_ifaces(const char *, struct pfi_kif *, int *); +int pfi_set_flags(const char *, int); +int pfi_clear_flags(const char *, int); + +int pf_match_tag(struct mbuf *, struct pf_rule *, int *, int); +int pf_tag_packet(struct mbuf *, struct pf_pdesc *, int); +int pf_addr_cmp(struct pf_addr *, struct pf_addr *, + sa_family_t); +void pf_qid2qname(u_int32_t, char *); + +VNET_DECLARE(struct pf_kstatus, pf_status); +#define V_pf_status VNET(pf_status) + +struct pf_limit { + uma_zone_t zone; + u_int limit; +}; +VNET_DECLARE(struct pf_limit, pf_limits[PF_LIMIT_MAX]); +#define V_pf_limits VNET(pf_limits) + +#endif /* _KERNEL */ + +#ifdef _KERNEL +VNET_DECLARE(struct pf_anchor_global, pf_anchors); +#define V_pf_anchors VNET(pf_anchors) +VNET_DECLARE(struct pf_anchor, pf_main_anchor); +#define V_pf_main_anchor VNET(pf_main_anchor) +#define pf_main_ruleset V_pf_main_anchor.ruleset +#endif + +/* these ruleset functions can be linked into userland programs (pfctl) */ +int pf_get_ruleset_number(u_int8_t); +void pf_init_ruleset(struct pf_ruleset *); +int pf_anchor_setup(struct pf_rule *, + const struct pf_ruleset *, const char *); +int pf_anchor_copyout(const struct pf_ruleset *, + const struct pf_rule *, struct pfioc_rule *); +void pf_anchor_remove(struct pf_rule *); +void 
pf_remove_if_empty_ruleset(struct pf_ruleset *); +struct pf_ruleset *pf_find_ruleset(const char *); +struct pf_ruleset *pf_find_or_create_ruleset(const char *); +void pf_rs_initialize(void); + +/* The fingerprint functions can be linked into userland programs (tcpdump) */ +int pf_osfp_add(struct pf_osfp_ioctl *); +#ifdef _KERNEL +struct pf_osfp_enlist * + pf_osfp_fingerprint(struct pf_pdesc *, struct mbuf *, int, + const struct tcphdr *); +#endif /* _KERNEL */ +void pf_osfp_flush(void); +int pf_osfp_get(struct pf_osfp_ioctl *); +int pf_osfp_match(struct pf_osfp_enlist *, pf_osfp_t); + +#ifdef _KERNEL +void pf_print_host(struct pf_addr *, u_int16_t, u_int8_t); + +void pf_step_into_anchor(struct pf_anchor_stackframe *, int *, + struct pf_ruleset **, int, struct pf_rule **, + struct pf_rule **, int *); +int pf_step_out_of_anchor(struct pf_anchor_stackframe *, int *, + struct pf_ruleset **, int, struct pf_rule **, + struct pf_rule **, int *); + +int pf_map_addr(u_int8_t, struct pf_rule *, + struct pf_addr *, struct pf_addr *, + struct pf_addr *, struct pf_src_node **); +struct pf_rule *pf_get_translation(struct pf_pdesc *, struct mbuf *, + int, int, struct pfi_kif *, struct pf_src_node **, + struct pf_state_key **, struct pf_state_key **, + struct pf_addr *, struct pf_addr *, + uint16_t, uint16_t, struct pf_anchor_stackframe *); + +struct pf_state_key *pf_state_key_setup(struct pf_pdesc *, struct pf_addr *, + struct pf_addr *, u_int16_t, u_int16_t); +struct pf_state_key *pf_state_key_clone(struct pf_state_key *); +#endif /* _KERNEL */ + +#endif /* _NET_PFVAR_H_ */ diff --git a/freebsd/sys/net/ppp_defs.h b/freebsd/sys/net/ppp_defs.h index 386a1763..5f6d4106 100644 --- a/freebsd/sys/net/ppp_defs.h +++ b/freebsd/sys/net/ppp_defs.h @@ -31,6 +31,8 @@ #ifndef _PPP_DEFS_H_ #define _PPP_DEFS_H_ +#include <sys/_types.h> + /* * The basic PPP frame. */ @@ -83,7 +85,7 @@ /* * Extended asyncmap - allows any character to be escaped. 
*/ -typedef u_int32_t ext_accm[8]; +typedef __uint32_t ext_accm[8]; /* * What to do with network protocol (NP) packets. @@ -143,8 +145,8 @@ struct ppp_comp_stats { * the last NP packet was sent or received. */ struct ppp_idle { - time_t xmit_idle; /* time since last NP packet sent */ - time_t recv_idle; /* time since last NP packet received */ + __time_t xmit_idle; /* time since last NP packet sent */ + __time_t recv_idle; /* time since last NP packet received */ }; #ifndef __P diff --git a/freebsd/sys/net/radix.c b/freebsd/sys/net/radix.c index ba15eb51..2615de65 100644 --- a/freebsd/sys/net/radix.c +++ b/freebsd/sys/net/radix.c @@ -58,18 +58,15 @@ #include <net/radix.h> #endif /* !_KERNEL */ -static int rn_walktree_from(struct radix_node_head *h, void *a, void *m, - walktree_f_t *f, void *w); -static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *); static struct radix_node - *rn_insert(void *, struct radix_node_head *, int *, + *rn_insert(void *, struct radix_head *, int *, struct radix_node [2]), *rn_newpair(void *, int, struct radix_node[2]), *rn_search(void *, struct radix_node *), *rn_search_m(void *, struct radix_node *, void *); +static struct radix_node *rn_addmask(void *, struct radix_mask_head *, int,int); -static void rn_detachhead_internal(void **head); -static int rn_inithead_internal(void **head, int off); +static void rn_detachhead_internal(struct radix_head *); #define RADIX_MAX_KEY_LEN 32 @@ -81,14 +78,6 @@ static char rn_ones[RADIX_MAX_KEY_LEN] = { -1, -1, -1, -1, -1, -1, -1, -1, }; -/* - * XXX: Compat stuff for old rn_addmask() users - */ -static struct radix_node_head *mask_rnhead_compat; -#ifdef _KERNEL -static struct mtx mask_mtx; -#endif - static int rn_lexobetter(void *m_arg, void *n_arg); static struct radix_mask * @@ -225,7 +214,7 @@ rn_refines(void *m_arg, void *n_arg) * from host routes. 
*/ struct radix_node * -rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head) +rn_lookup(void *v_arg, void *m_arg, struct radix_head *head) { struct radix_node *x; caddr_t netmask; @@ -234,7 +223,7 @@ rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head) /* * Most common case: search exact prefix/mask */ - x = rn_addmask_r(m_arg, head->rnh_masks, 1, + x = rn_addmask(m_arg, head->rnh_masks, 1, head->rnh_treetop->rn_offset); if (x == NULL) return (NULL); @@ -287,7 +276,7 @@ rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip) * Search for longest-prefix match in given @head */ struct radix_node * -rn_match(void *v_arg, struct radix_node_head *head) +rn_match(void *v_arg, struct radix_head *head) { caddr_t v = v_arg; struct radix_node *t = head->rnh_treetop, *x; @@ -436,7 +425,7 @@ rn_newpair(void *v, int b, struct radix_node nodes[2]) } static struct radix_node * -rn_insert(void *v_arg, struct radix_node_head *head, int *dupentry, +rn_insert(void *v_arg, struct radix_head *head, int *dupentry, struct radix_node nodes[2]) { caddr_t v = v_arg; @@ -500,9 +489,9 @@ on1: } struct radix_node * -rn_addmask_r(void *arg, struct radix_node_head *maskhead, int search, int skip) +rn_addmask(void *n_arg, struct radix_mask_head *maskhead, int search, int skip) { - unsigned char *netmask = arg; + unsigned char *netmask = n_arg; unsigned char *cp, *cplim; struct radix_node *x; int b = 0, mlen, j; @@ -515,7 +504,7 @@ rn_addmask_r(void *arg, struct radix_node_head *maskhead, int search, int skip) if (skip == 0) skip = 1; if (mlen <= skip) - return (maskhead->rnh_nodes); + return (maskhead->mask_nodes); bzero(addmask_key, RADIX_MAX_KEY_LEN); if (skip > 1) @@ -528,22 +517,22 @@ rn_addmask_r(void *arg, struct radix_node_head *maskhead, int search, int skip) cp--; mlen = cp - addmask_key; if (mlen <= skip) - return (maskhead->rnh_nodes); + return (maskhead->mask_nodes); *addmask_key = mlen; - x = rn_search(addmask_key, maskhead->rnh_treetop); + x = 
rn_search(addmask_key, maskhead->head.rnh_treetop); if (bcmp(addmask_key, x->rn_key, mlen) != 0) - x = 0; + x = NULL; if (x || search) return (x); R_Zalloc(x, struct radix_node *, RADIX_MAX_KEY_LEN + 2 * sizeof (*x)); - if ((saved_x = x) == 0) + if ((saved_x = x) == NULL) return (0); - netmask = cp = (caddr_t)(x + 2); + netmask = cp = (unsigned char *)(x + 2); bcopy(addmask_key, cp, mlen); - x = rn_insert(cp, maskhead, &maskduplicated, x); + x = rn_insert(cp, &maskhead->head, &maskduplicated, x); if (maskduplicated) { log(LOG_ERR, "rn_addmask: mask impossibly already in tree"); - Free(saved_x); + R_Free(saved_x); return (x); } /* @@ -571,23 +560,6 @@ rn_addmask_r(void *arg, struct radix_node_head *maskhead, int search, int skip) return (x); } -struct radix_node * -rn_addmask(void *n_arg, int search, int skip) -{ - struct radix_node *tt; - -#ifdef _KERNEL - mtx_lock(&mask_mtx); -#endif - tt = rn_addmask_r(&mask_rnhead_compat, n_arg, search, skip); - -#ifdef _KERNEL - mtx_unlock(&mask_mtx); -#endif - - return (tt); -} - static int /* XXX: arbitrary ordering for non-contiguous masks */ rn_lexobetter(void *m_arg, void *n_arg) { @@ -625,11 +597,11 @@ rn_new_radix_mask(struct radix_node *tt, struct radix_mask *next) } struct radix_node * -rn_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, +rn_addroute(void *v_arg, void *n_arg, struct radix_head *head, struct radix_node treenodes[2]) { caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg; - struct radix_node *t, *x = 0, *tt; + struct radix_node *t, *x = NULL, *tt; struct radix_node *saved_tt, *top = head->rnh_treetop; short b = 0, b_leaf = 0; int keyduplicated; @@ -644,7 +616,7 @@ rn_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * nodes and possibly save time in calculating indices. 
*/ if (netmask) { - x = rn_addmask_r(netmask, head->rnh_masks, 0, top->rn_offset); + x = rn_addmask(netmask, head->rnh_masks, 0, top->rn_offset); if (x == NULL) return (0); b_leaf = x->rn_bit; @@ -752,7 +724,7 @@ rn_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) if (m->rm_bit >= b_leaf) break; - t->rn_mklist = m; *mp = 0; + t->rn_mklist = m; *mp = NULL; } on2: /* Add new route to highest possible ancestor's list */ @@ -799,7 +771,7 @@ on2: } struct radix_node * -rn_delete(void *v_arg, void *netmask_arg, struct radix_node_head *head) +rn_delete(void *v_arg, void *netmask_arg, struct radix_head *head) { struct radix_node *t, *p, *x, *tt; struct radix_mask *m, *saved_m, **mp; @@ -815,22 +787,22 @@ rn_delete(void *v_arg, void *netmask_arg, struct radix_node_head *head) vlen = LEN(v); saved_tt = tt; top = x; - if (tt == 0 || + if (tt == NULL || bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off)) return (0); /* * Delete our route from mask lists. 
*/ if (netmask) { - x = rn_addmask_r(netmask, head->rnh_masks, 1, head_off); + x = rn_addmask(netmask, head->rnh_masks, 1, head_off); if (x == NULL) return (0); netmask = x->rn_key; while (tt->rn_mask != netmask) - if ((tt = tt->rn_dupedkey) == 0) + if ((tt = tt->rn_dupedkey) == NULL) return (0); } - if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0) + if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == NULL) goto on1; if (tt->rn_flags & RNF_NORMAL) { if (m->rm_leaf != tt || m->rm_refs > 0) { @@ -856,10 +828,10 @@ rn_delete(void *v_arg, void *netmask_arg, struct radix_node_head *head) for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) if (m == saved_m) { *mp = m->rm_mklist; - Free(m); + R_Free(m); break; } - if (m == 0) { + if (m == NULL) { log(LOG_ERR, "rn_delete: couldn't find our annotation\n"); if (tt->rn_flags & RNF_NORMAL) return (0); /* Dangling ref to us */ @@ -947,7 +919,7 @@ on1: struct radix_mask *mm = m->rm_mklist; x->rn_mklist = 0; if (--(m->rm_refs) < 0) - Free(m); + R_Free(m); m = mm; } if (m) @@ -986,8 +958,8 @@ out: * This is the same as rn_walktree() except for the parameters and the * exit. */ -static int -rn_walktree_from(struct radix_node_head *h, void *a, void *m, +int +rn_walktree_from(struct radix_head *h, void *a, void *m, walktree_f_t *f, void *w) { int error; @@ -998,6 +970,8 @@ rn_walktree_from(struct radix_node_head *h, void *a, void *m, int stopping = 0; int lastb; + KASSERT(m != NULL, ("%s: mask needs to be specified", __func__)); + /* * rn_search_m is sort-of-open-coded here. We cannot use the * function because we need to keep track of the last node seen. @@ -1021,11 +995,11 @@ rn_walktree_from(struct radix_node_head *h, void *a, void *m, /* * Two cases: either we stepped off the end of our mask, * in which case last == rn, or we reached a leaf, in which - * case we want to start from the last node we looked at. - * Either way, last is the node we want to start from. + * case we want to start from the leaf. 
*/ - rn = last; - lastb = rn->rn_bit; + if (rn->rn_bit >= 0) + rn = last; + lastb = last->rn_bit; /* printf("rn %p, lastb %d\n", rn, lastb);*/ @@ -1072,7 +1046,7 @@ rn_walktree_from(struct radix_node_head *h, void *a, void *m, rn = rn->rn_left; next = rn; /* Process leaves */ - while ((rn = base) != 0) { + while ((rn = base) != NULL) { base = rn->rn_dupedkey; /* printf("leaf %p\n", rn); */ if (!(rn->rn_flags & RNF_ROOT) @@ -1090,8 +1064,8 @@ rn_walktree_from(struct radix_node_head *h, void *a, void *m, return (0); } -static int -rn_walktree(struct radix_node_head *h, walktree_f_t *f, void *w) +int +rn_walktree(struct radix_head *h, walktree_f_t *f, void *w) { int error; struct radix_node *base, *next; @@ -1130,82 +1104,94 @@ rn_walktree(struct radix_node_head *h, walktree_f_t *f, void *w) } /* - * Allocate and initialize an empty tree. This has 3 nodes, which are - * part of the radix_node_head (in the order <left,root,right>) and are + * Initialize an empty tree. This has 3 nodes, which are passed + * via base_nodes (in the order <left,root,right>) and are * marked RNF_ROOT so they cannot be freed. * The leaves have all-zero and all-one keys, with significant * bits starting at 'off'. - * Return 1 on success, 0 on error. */ -static int -rn_inithead_internal(void **head, int off) +void +rn_inithead_internal(struct radix_head *rh, struct radix_node *base_nodes, int off) { - struct radix_node_head *rnh; struct radix_node *t, *tt, *ttt; - if (*head) - return (1); - R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh)); - if (rnh == 0) - return (0); -#ifdef _KERNEL - RADIX_NODE_HEAD_LOCK_INIT(rnh); -#endif - *head = rnh; - t = rn_newpair(rn_zeros, off, rnh->rnh_nodes); - ttt = rnh->rnh_nodes + 2; + + t = rn_newpair(rn_zeros, off, base_nodes); + ttt = base_nodes + 2; t->rn_right = ttt; t->rn_parent = t; - tt = t->rn_left; /* ... which in turn is rnh->rnh_nodes */ + tt = t->rn_left; /* ... 
which in turn is base_nodes */ tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE; tt->rn_bit = -1 - off; *ttt = *tt; ttt->rn_key = rn_ones; - rnh->rnh_addaddr = rn_addroute; - rnh->rnh_deladdr = rn_delete; - rnh->rnh_matchaddr = rn_match; - rnh->rnh_lookup = rn_lookup; - rnh->rnh_walktree = rn_walktree; - rnh->rnh_walktree_from = rn_walktree_from; - rnh->rnh_treetop = t; - return (1); + + rh->rnh_treetop = t; } static void -rn_detachhead_internal(void **head) +rn_detachhead_internal(struct radix_head *head) { - struct radix_node_head *rnh; - KASSERT((head != NULL && *head != NULL), + KASSERT((head != NULL), ("%s: head already freed", __func__)); - rnh = *head; /* Free <left,root,right> nodes. */ - Free(rnh); - - *head = NULL; + R_Free(head); } +/* Functions used by 'struct radix_node_head' users */ + int rn_inithead(void **head, int off) { struct radix_node_head *rnh; + struct radix_mask_head *rmh; + + rnh = *head; + rmh = NULL; if (*head != NULL) return (1); - if (rn_inithead_internal(head, off) == 0) + R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh)); + R_Zalloc(rmh, struct radix_mask_head *, sizeof (*rmh)); + if (rnh == NULL || rmh == NULL) { + if (rnh != NULL) + R_Free(rnh); + if (rmh != NULL) + R_Free(rmh); return (0); + } - rnh = (struct radix_node_head *)(*head); + /* Init trees */ + rn_inithead_internal(&rnh->rh, rnh->rnh_nodes, off); + rn_inithead_internal(&rmh->head, rmh->mask_nodes, 0); + *head = rnh; + rnh->rh.rnh_masks = rmh; - if (rn_inithead_internal((void **)&rnh->rnh_masks, 0) == 0) { - rn_detachhead_internal(head); - return (0); - } + /* Finally, set base callbacks */ + rnh->rnh_addaddr = rn_addroute; + rnh->rnh_deladdr = rn_delete; + rnh->rnh_matchaddr = rn_match; + rnh->rnh_lookup = rn_lookup; + rnh->rnh_walktree = rn_walktree; + rnh->rnh_walktree_from = rn_walktree_from; return (1); } +static int +rn_freeentry(struct radix_node *rn, void *arg) +{ + struct radix_head * const rnh = arg; + struct radix_node *x; + + x = (struct radix_node 
*)rn_delete(rn + 2, NULL, rnh); + if (x != NULL) + R_Free(x); + return (0); +} + int rn_detachhead(void **head) { @@ -1214,29 +1200,14 @@ rn_detachhead(void **head) KASSERT((head != NULL && *head != NULL), ("%s: head already freed", __func__)); - rnh = *head; + rnh = (struct radix_node_head *)(*head); - rn_detachhead_internal((void **)&rnh->rnh_masks); - rn_detachhead_internal(head); - return (1); -} + rn_walktree(&rnh->rh.rnh_masks->head, rn_freeentry, rnh->rh.rnh_masks); + rn_detachhead_internal(&rnh->rh.rnh_masks->head); + rn_detachhead_internal(&rnh->rh); -void -rn_init(int maxk) -{ - if ((maxk <= 0) || (maxk > RADIX_MAX_KEY_LEN)) { - log(LOG_ERR, - "rn_init: max_keylen must be within 1..%d\n", - RADIX_MAX_KEY_LEN); - return; - } + *head = NULL; - /* - * XXX: Compat for old rn_addmask() users - */ - if (rn_inithead((void **)(void *)&mask_rnhead_compat, 0) == 0) - panic("rn_init 2"); -#ifdef _KERNEL - mtx_init(&mask_mtx, "radix_mask", NULL, MTX_DEF); -#endif + return (1); } + diff --git a/freebsd/sys/net/radix.h b/freebsd/sys/net/radix.h index 3554c77c..69aad831 100644 --- a/freebsd/sys/net/radix.h +++ b/freebsd/sys/net/radix.h @@ -101,52 +101,61 @@ struct radix_mask { #define rm_mask rm_rmu.rmu_mask #define rm_leaf rm_rmu.rmu_leaf /* extra field would make 32 bytes */ +struct radix_head; + typedef int walktree_f_t(struct radix_node *, void *); +typedef struct radix_node *rn_matchaddr_f_t(void *v, + struct radix_head *head); +typedef struct radix_node *rn_addaddr_f_t(void *v, void *mask, + struct radix_head *head, struct radix_node nodes[]); +typedef struct radix_node *rn_deladdr_f_t(void *v, void *mask, + struct radix_head *head); +typedef struct radix_node *rn_lookup_f_t(void *v, void *mask, + struct radix_head *head); +typedef int rn_walktree_t(struct radix_head *head, walktree_f_t *f, + void *w); +typedef int rn_walktree_from_t(struct radix_head *head, + void *a, void *m, walktree_f_t *f, void *w); +typedef void rn_close_t(struct radix_node *rn, struct 
radix_head *head); + +struct radix_mask_head; + +struct radix_head { + struct radix_node *rnh_treetop; + struct radix_mask_head *rnh_masks; /* Storage for our masks */ +}; struct radix_node_head { - struct radix_node *rnh_treetop; - u_int rnh_gen; /* generation counter */ - int rnh_multipath; /* multipath capable ? */ - int rnh_addrsize; /* permit, but not require fixed keys */ - int rnh_pktsize; /* permit, but not require fixed keys */ - struct radix_node *(*rnh_addaddr) /* add based on sockaddr */ - (void *v, void *mask, - struct radix_node_head *head, struct radix_node nodes[]); - struct radix_node *(*rnh_addpkt) /* add based on packet hdr */ - (void *v, void *mask, - struct radix_node_head *head, struct radix_node nodes[]); - struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */ - (void *v, void *mask, struct radix_node_head *head); - struct radix_node *(*rnh_delpkt) /* remove based on packet hdr */ - (void *v, void *mask, struct radix_node_head *head); - struct radix_node *(*rnh_matchaddr) /* longest match for sockaddr */ - (void *v, struct radix_node_head *head); - struct radix_node *(*rnh_lookup) /*exact match for sockaddr*/ - (void *v, void *mask, struct radix_node_head *head); - struct radix_node *(*rnh_matchpkt) /* locate based on packet hdr */ - (void *v, struct radix_node_head *head); - int (*rnh_walktree) /* traverse tree */ - (struct radix_node_head *head, walktree_f_t *f, void *w); - int (*rnh_walktree_from) /* traverse tree below a */ - (struct radix_node_head *head, void *a, void *m, - walktree_f_t *f, void *w); - void (*rnh_close) /* do something when the last ref drops */ - (struct radix_node *rn, struct radix_node_head *head); + struct radix_head rh; + rn_matchaddr_f_t *rnh_matchaddr; /* longest match for sockaddr */ + rn_addaddr_f_t *rnh_addaddr; /* add based on sockaddr*/ + rn_deladdr_f_t *rnh_deladdr; /* remove based on sockaddr */ + rn_lookup_f_t *rnh_lookup; /* exact match for sockaddr */ + rn_walktree_t *rnh_walktree; /* 
traverse tree */ + rn_walktree_from_t *rnh_walktree_from; /* traverse tree below a */ + rn_close_t *rnh_close; /*do something when the last ref drops*/ struct radix_node rnh_nodes[3]; /* empty tree for common case */ #ifdef _KERNEL struct rwlock rnh_lock; /* locks entire radix tree */ #endif - struct radix_node_head *rnh_masks; /* Storage for our masks */ }; +struct radix_mask_head { + struct radix_head head; + struct radix_node mask_nodes[3]; +}; + +void rn_inithead_internal(struct radix_head *rh, struct radix_node *base_nodes, + int off); + #ifndef _KERNEL #define R_Malloc(p, t, n) (p = (t) malloc((unsigned int)(n))) #define R_Zalloc(p, t, n) (p = (t) calloc(1,(unsigned int)(n))) -#define Free(p) free((char *)p); +#define R_Free(p) free((char *)p); #else #define R_Malloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT)) #define R_Zalloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT | M_ZERO)) -#define Free(p) free((caddr_t)p, M_RTABLE); +#define R_Free(p) free((caddr_t)p, M_RTABLE); #define RADIX_NODE_HEAD_LOCK_INIT(rnh) \ rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0) @@ -162,18 +171,17 @@ struct radix_node_head { #define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED) #endif /* _KERNEL */ -void rn_init(int); int rn_inithead(void **, int); int rn_detachhead(void **); int rn_refines(void *, void *); -struct radix_node - *rn_addmask(void *, int, int), - *rn_addmask_r(void *, struct radix_node_head *, int, int), - *rn_addroute (void *, void *, struct radix_node_head *, - struct radix_node [2]), - *rn_delete(void *, void *, struct radix_node_head *), - *rn_lookup (void *v_arg, void *m_arg, - struct radix_node_head *head), - *rn_match(void *, struct radix_node_head *); +struct radix_node *rn_addroute(void *, void *, struct radix_head *, + struct radix_node[2]); +struct radix_node *rn_delete(void *, void *, struct radix_head *); +struct radix_node *rn_lookup (void *v_arg, void *m_arg, + struct 
radix_head *head); +struct radix_node *rn_match(void *, struct radix_head *); +int rn_walktree_from(struct radix_head *h, void *a, void *m, + walktree_f_t *f, void *w); +int rn_walktree(struct radix_head *, walktree_f_t *, void *); #endif /* _RADIX_H_ */ diff --git a/freebsd/sys/net/radix_mpath.c b/freebsd/sys/net/radix_mpath.c index 1bce388e..f5215205 100644 --- a/freebsd/sys/net/radix_mpath.c +++ b/freebsd/sys/net/radix_mpath.c @@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$"); #include <net/radix.h> #include <net/radix_mpath.h> #include <net/route.h> +#include <net/route_var.h> #include <net/if.h> #include <net/if_var.h> @@ -59,12 +60,19 @@ __FBSDID("$FreeBSD$"); static uint32_t hashjitter; int -rn_mpath_capable(struct radix_node_head *rnh) +rt_mpath_capable(struct rib_head *rnh) { return rnh->rnh_multipath; } +int +rn_mpath_capable(struct radix_head *rh) +{ + + return (rt_mpath_capable((struct rib_head *)rh)); +} + struct radix_node * rn_mpath_next(struct radix_node *rn) { @@ -91,7 +99,7 @@ rn_mpath_count(struct radix_node *rn) while (rn != NULL) { rt = (struct rtentry *)rn; - i += rt->rt_rmx.rmx_weight; + i += rt->rt_weight; rn = rn_mpath_next(rn); } return (i); @@ -165,14 +173,14 @@ rt_mpath_deldup(struct rtentry *headrt, struct rtentry *rt) * Assume @rt rt_key host bits are cleared according to @netmask */ int -rt_mpath_conflict(struct radix_node_head *rnh, struct rtentry *rt, +rt_mpath_conflict(struct rib_head *rnh, struct rtentry *rt, struct sockaddr *netmask) { struct radix_node *rn, *rn1; struct rtentry *rt1; rn = (struct radix_node *)rt; - rn1 = rnh->rnh_lookup(rt_key(rt), netmask, rnh); + rn1 = rnh->rnh_lookup(rt_key(rt), netmask, &rnh->head); if (!rn1 || rn1->rn_flags & RNF_ROOT) return (0); @@ -203,18 +211,50 @@ rt_mpath_conflict(struct radix_node_head *rnh, struct rtentry *rt, return (0); } -void -#ifndef __rtems__ -rtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum) -#else /* __rtems__ */ -rtalloc_mpath_fib(struct route *ro, u_int32_t hash, 
u_int fibnum) -#endif /* __rtems__ */ +static struct rtentry * +rt_mpath_selectrte(struct rtentry *rte, uint32_t hash) { struct radix_node *rn0, *rn; - u_int32_t n; + uint32_t total_weight; struct rtentry *rt; int64_t weight; + /* beyond here, we use rn as the master copy */ + rn0 = rn = (struct radix_node *)rte; + rt = rte; + + /* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */ + total_weight = rn_mpath_count(rn0); + hash += hashjitter; + hash %= total_weight; + for (weight = abs((int32_t)hash); + rt != NULL && weight >= rt->rt_weight; + weight -= (rt == NULL) ? 0 : rt->rt_weight) { + + /* stay within the multipath routes */ + if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask) + break; + rn = rn->rn_dupedkey; + rt = (struct rtentry *)rn; + } + + return (rt); +} + +struct rtentry * +rt_mpath_select(struct rtentry *rte, uint32_t hash) +{ + if (rn_mpath_next((struct radix_node *)rte) == NULL) + return (rte); + + return (rt_mpath_selectrte(rte, hash)); +} + +void +rtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum) +{ + struct rtentry *rt; + /* * XXX we don't attempt to lookup cached route again; what should * be done for sendto(3) case? @@ -232,34 +272,18 @@ rtalloc_mpath_fib(struct route *ro, u_int32_t hash, u_int fibnum) return; } - /* beyond here, we use rn as the master copy */ - rn0 = rn = (struct radix_node *)ro->ro_rt; - n = rn_mpath_count(rn0); - - /* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? 
*/ - hash += hashjitter; - hash %= n; - for (weight = abs((int32_t)hash), rt = ro->ro_rt; - weight >= rt->rt_rmx.rmx_weight && rn; - weight -= rt->rt_rmx.rmx_weight) { - - /* stay within the multipath routes */ - if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask) - break; - rn = rn->rn_dupedkey; - rt = (struct rtentry *)rn; - } + rt = rt_mpath_selectrte(ro->ro_rt, hash); /* XXX try filling rt_gwroute and avoid unreachable gw */ /* gw selection has failed - there must be only zero weight routes */ - if (!rn) { + if (!rt) { RT_UNLOCK(ro->ro_rt); ro->ro_rt = NULL; return; } if (ro->ro_rt != rt) { RTFREE_LOCKED(ro->ro_rt); - ro->ro_rt = (struct rtentry *)rn; + ro->ro_rt = rt; RT_LOCK(ro->ro_rt); RT_ADDREF(ro->ro_rt); @@ -274,11 +298,11 @@ extern int in_inithead(void **head, int off); int rn4_mpath_inithead(void **head, int off) { - struct radix_node_head *rnh; + struct rib_head *rnh; hashjitter = arc4random(); if (in_inithead(head, off) == 1) { - rnh = (struct radix_node_head *)*head; + rnh = (struct rib_head *)*head; rnh->rnh_multipath = 1; return 1; } else @@ -290,11 +314,11 @@ rn4_mpath_inithead(void **head, int off) int rn6_mpath_inithead(void **head, int off) { - struct radix_node_head *rnh; + struct rib_head *rnh; hashjitter = arc4random(); if (in6_inithead(head, off) == 1) { - rnh = (struct radix_node_head *)*head; + rnh = (struct rib_head *)*head; rnh->rnh_multipath = 1; return 1; } else diff --git a/freebsd/sys/net/radix_mpath.h b/freebsd/sys/net/radix_mpath.h index bcb210e3..2b0d442e 100644 --- a/freebsd/sys/net/radix_mpath.h +++ b/freebsd/sys/net/radix_mpath.h @@ -44,16 +44,16 @@ struct route; struct rtentry; struct sockaddr; -int rn_mpath_capable(struct radix_node_head *); +struct rib_head; +int rt_mpath_capable(struct rib_head *); +int rn_mpath_capable(struct radix_head *); struct radix_node *rn_mpath_next(struct radix_node *); u_int32_t rn_mpath_count(struct radix_node *); struct rtentry *rt_mpath_matchgate(struct rtentry *, struct sockaddr 
*); -int rt_mpath_conflict(struct radix_node_head *, struct rtentry *, +int rt_mpath_conflict(struct rib_head *, struct rtentry *, struct sockaddr *); void rtalloc_mpath_fib(struct route *, u_int32_t, u_int); -#define rtalloc_mpath(_route, _hash) rtalloc_mpath_fib((_route), (_hash), 0) -struct radix_node *rn_mpath_lookup(void *, void *, - struct radix_node_head *); +struct rtentry *rt_mpath_select(struct rtentry *, uint32_t); int rt_mpath_deldup(struct rtentry *, struct rtentry *); int rn4_mpath_inithead(void **, int); int rn6_mpath_inithead(void **, int); diff --git a/freebsd/sys/net/raw_cb.c b/freebsd/sys/net/raw_cb.c index 10db8bba..00a199f3 100644 --- a/freebsd/sys/net/raw_cb.c +++ b/freebsd/sys/net/raw_cb.c @@ -46,8 +46,8 @@ #include <sys/systm.h> #include <net/if.h> -#include <net/raw_cb.h> #include <net/vnet.h> +#include <net/raw_cb.h> /* * Routines to manage the raw protocol control blocks. diff --git a/freebsd/sys/net/raw_usrreq.c b/freebsd/sys/net/raw_usrreq.c index 1030526f..e170ad74 100644 --- a/freebsd/sys/net/raw_usrreq.c +++ b/freebsd/sys/net/raw_usrreq.c @@ -48,8 +48,8 @@ #include <sys/systm.h> #include <net/if.h> -#include <net/raw_cb.h> #include <net/vnet.h> +#include <net/raw_cb.h> MTX_SYSINIT(rawcb_mtx, &rawcb_mtx, "rawcb", MTX_DEF); @@ -85,7 +85,7 @@ raw_input_ext(struct mbuf *m0, struct sockproto *proto, struct sockaddr *src, struct mbuf *m = m0; struct socket *last; - last = 0; + last = NULL; mtx_lock(&rawcb_mtx); LIST_FOREACH(rp, &V_rawcb_list, list) { if (rp->rcb_proto.sp_family != proto->sp_family) diff --git a/freebsd/sys/net/route.c b/freebsd/sys/net/route.c index 781d8bb9..3eb05b94 100644 --- a/freebsd/sys/net/route.c +++ b/freebsd/sys/net/route.c @@ -45,7 +45,6 @@ #include <rtems/bsd/sys/param.h> #include <sys/systm.h> -#include <sys/syslog.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/socket.h> @@ -57,8 +56,10 @@ #include <sys/kernel.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_dl.h> #include 
<net/route.h> +#include <net/route_var.h> #include <net/vnet.h> #include <net/flowtable.h> @@ -75,8 +76,7 @@ #include <sys/file.h> #endif /* __rtems__ */ -/* We use 4 bits in the mbuf flags, thus we are limited to 16 FIBS. */ -#define RT_MAXFIBS 16 +#define RT_MAXFIBS UINT16_MAX /* Kernel config default option. */ #ifdef ROUTETABLES @@ -102,17 +102,7 @@ extern void sctp_addr_change(struct ifaddr *ifa, int cmd); /* This is read-only.. */ u_int rt_numfibs = RT_NUMFIBS; -SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, ""); -/* - * Allow the boot code to allow LESS than RT_MAXFIBS to be used. - * We can't do more because storage is statically allocated for now. - * (for compatibility reasons.. this will change. When this changes, code should - * be refactored to protocol independent parts and protocol dependent parts, - * probably hanging of domain(9) specific storage to not need the full - * fib * af RNH allocation etc. but allow tuning the number of tables per - * address family). - */ -TUNABLE_INT("net.fibs", &rt_numfibs); +SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, ""); /* * By default add routes to all fibs for new interfaces. @@ -124,25 +114,20 @@ TUNABLE_INT("net.fibs", &rt_numfibs); * always work given the fib can be overridden and prefixes can be added * from the network stack context. 
*/ -u_int rt_add_addr_allfibs = 1; -SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW, - &rt_add_addr_allfibs, 0, ""); -TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs); +VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1; +SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, + &VNET_NAME(rt_add_addr_allfibs), 0, ""); VNET_DEFINE(struct rtstat, rtstat); #define V_rtstat VNET(rtstat) -VNET_DEFINE(struct radix_node_head *, rt_tables); +VNET_DEFINE(struct rib_head *, rt_tables); #define V_rt_tables VNET(rt_tables) VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ #define V_rttrash VNET(rttrash) -/* compare two sockaddr structures */ -#define sa_equal(a1, a2) (((a1)->sa_len == (a2)->sa_len) && \ - (bcmp((a1), (a2), (a1)->sa_len) == 0)) - /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a @@ -158,6 +143,28 @@ VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. 
*/ #define V_rtzone VNET(rtzone) +static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *, + struct rtentry **, u_int); +static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); +static int rt_ifdelroute(const struct rtentry *rt, void *arg); +static struct rtentry *rt_unlinkrte(struct rib_head *rnh, + struct rt_addrinfo *info, int *perror); +static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info); +#ifdef RADIX_MPATH +static struct radix_node *rt_mpath_unlink(struct rib_head *rnh, + struct rt_addrinfo *info, struct rtentry *rto, int *perror); +#endif +static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, + int flags); + +struct if_mtuinfo +{ + struct ifnet *ifp; + int mtu; +}; + +static int if_updatemtu_cb(struct radix_node *, void *); + /* * handler for net.my_fibnum */ @@ -179,10 +186,10 @@ sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); -static __inline struct radix_node_head ** +static __inline struct rib_head ** rt_tables_get_rnh_ptr(int table, int fam) { - struct radix_node_head **rnh; + struct rib_head **rnh; KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.", __func__)); @@ -190,20 +197,32 @@ rt_tables_get_rnh_ptr(int table, int fam) __func__)); /* rnh is [fib=0][af=0]. */ - rnh = (struct radix_node_head **)V_rt_tables; + rnh = (struct rib_head **)V_rt_tables; /* Get the offset to the requested table and fam. 
*/ rnh += table * (AF_MAX+1) + fam; return (rnh); } -struct radix_node_head * +struct rib_head * rt_tables_get_rnh(int table, int fam) { return (*rt_tables_get_rnh_ptr(table, fam)); } +u_int +rt_tables_get_gen(int table, int fam) +{ + struct rib_head *rnh; + + rnh = *rt_tables_get_rnh_ptr(table, fam); + KASSERT(rnh != NULL, ("%s: NULL rib_head pointer table %d fam %d", + __func__, table, fam)); + return (rnh->rnh_gen); +} + + /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. @@ -211,36 +230,72 @@ rt_tables_get_rnh(int table, int fam) static void route_init(void) { - struct domain *dom; - int max_keylen = 0; /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; +} +SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); - for (dom = domains; dom; dom = dom->dom_next) - if (dom->dom_maxrtkey > max_keylen) - max_keylen = dom->dom_maxrtkey; +static int +rtentry_zinit(void *mem, int size, int how) +{ + struct rtentry *rt = mem; + + rt->rt_pksent = counter_u64_alloc(how); + if (rt->rt_pksent == NULL) + return (ENOMEM); - rn_init(max_keylen); /* init all zeroes, all ones, mask table */ + RT_LOCK_INIT(rt); + + return (0); +} + +static void +rtentry_zfini(void *mem, int size) +{ + struct rtentry *rt = mem; + + RT_LOCK_DESTROY(rt); + counter_u64_free(rt->rt_pksent); +} + +static int +rtentry_ctor(void *mem, int size, void *arg, int how) +{ + struct rtentry *rt = mem; + + bzero(rt, offsetof(struct rtentry, rt_endzero)); + counter_u64_zero(rt->rt_pksent); + rt->rt_chain = NULL; + + return (0); +} + +static void +rtentry_dtor(void *mem, int size, void *arg) +{ + struct rtentry *rt = mem; + + RT_UNLOCK_COND(rt); } -SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); static void vnet_route_init(const void *unused __unused) { struct domain *dom; - struct radix_node_head **rnh; + struct rib_head **rnh; int table; int fam; 
V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * - sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO); + sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO); - V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, 0); + V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), + rtentry_ctor, rtentry_dtor, + rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0); for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach == NULL) continue; @@ -250,15 +305,10 @@ vnet_route_init(const void *unused __unused) if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; - /* - * XXX MRT rtattach will be also called from - * vfs_export.c but the offset will be 0 (only for - * AF_INET and AF_INET6 which don't need it anyhow). - */ rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); - dom->dom_rtattach((void **)rnh, dom->dom_rtoffset); + dom->dom_rtattach((void **)rnh, 0); } } } @@ -272,7 +322,7 @@ vnet_route_uninit(const void *unused __unused) int table; int fam; struct domain *dom; - struct radix_node_head **rnh; + struct rib_head **rnh; for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtdetach == NULL) @@ -287,14 +337,68 @@ vnet_route_uninit(const void *unused __unused) rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); - dom->dom_rtdetach((void **)rnh, dom->dom_rtoffset); + dom->dom_rtdetach((void **)rnh, 0); } } + + free(V_rt_tables, M_RTABLE); + uma_zdestroy(V_rtzone); } -VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, +VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, vnet_route_uninit, 0); #endif +struct rib_head * +rt_table_init(int offset) +{ + struct rib_head *rh; + + rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); + + /* TODO: These details should be hidded inside radix.c */ + /* Init masks tree */ + rn_inithead_internal(&rh->head, rh->rnh_nodes, 
offset); + rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); + rh->head.rnh_masks = &rh->rmhead; + + /* Init locks */ + rw_init(&rh->rib_lock, "rib head lock"); + + /* Finally, set base callbacks */ + rh->rnh_addaddr = rn_addroute; + rh->rnh_deladdr = rn_delete; + rh->rnh_matchaddr = rn_match; + rh->rnh_lookup = rn_lookup; + rh->rnh_walktree = rn_walktree; + rh->rnh_walktree_from = rn_walktree_from; + + return (rh); +} + +static int +rt_freeentry(struct radix_node *rn, void *arg) +{ + struct radix_head * const rnh = arg; + struct radix_node *x; + + x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); + if (x != NULL) + R_Free(x); + return (0); +} + +void +rt_table_destroy(struct rib_head *rh) +{ + + rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); + + /* Assume table is already empty */ + rw_destroy(&rh->rib_lock); + free(rh, M_RTABLE); +} + + #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; @@ -335,35 +439,6 @@ setfib(int fibnum) * Packet routing routines. 
*/ void -rtalloc(struct route *ro) -{ - - rtalloc_ign_fib(ro, 0UL, RT_DEFAULT_FIB); -} - -void -rtalloc_fib(struct route *ro, u_int fibnum) -{ - rtalloc_ign_fib(ro, 0UL, fibnum); -} - -void -rtalloc_ign(struct route *ro, u_long ignore) -{ - struct rtentry *rt; - - if ((rt = ro->ro_rt) != NULL) { - if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) - return; - RTFREE(rt); - ro->ro_rt = NULL; - } - ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, RT_DEFAULT_FIB); - if (ro->ro_rt) - RT_UNLOCK(ro->ro_rt); -} - -void rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) { struct rtentry *rt; @@ -396,49 +471,32 @@ struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { - struct radix_node_head *rnh; + struct rib_head *rh; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; int err = 0, msgtype = RTM_MISS; - int needlock; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); - switch (dst->sa_family) { - case AF_INET6: - case AF_INET: - /* We support multiple FIBs. 
*/ - break; - default: - fibnum = RT_DEFAULT_FIB; - break; - } - rnh = rt_tables_get_rnh(fibnum, dst->sa_family); + rh = rt_tables_get_rnh(fibnum, dst->sa_family); newrt = NULL; - if (rnh == NULL) + if (rh == NULL) goto miss; /* * Look up the address in the table for that Address Family */ - needlock = !(ignflags & RTF_RNH_LOCKED); - if (needlock) - RADIX_NODE_HEAD_RLOCK(rnh); -#ifdef INVARIANTS - else - RADIX_NODE_HEAD_LOCK_ASSERT(rnh); -#endif - rn = rnh->rnh_matchaddr(dst, rnh); + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr(dst, &rh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { newrt = RNTORT(rn); RT_LOCK(newrt); RT_ADDREF(newrt); - if (needlock) - RADIX_NODE_HEAD_RUNLOCK(rnh); - goto done; + RIB_RUNLOCK(rh); + return (newrt); - } else if (needlock) - RADIX_NODE_HEAD_RUNLOCK(rnh); + } else + RIB_RUNLOCK(rh); /* * Either we hit the root or couldn't find any match, @@ -457,10 +515,7 @@ miss: bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg_fib(msgtype, &info, 0, err, fibnum); - } -done: - if (newrt) - RT_LOCK_ASSERT(newrt); + } return (newrt); } @@ -471,7 +526,7 @@ done: void rtfree(struct rtentry *rt) { - struct radix_node_head *rnh; + struct rib_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); @@ -499,7 +554,7 @@ rtfree(struct rtentry *rt) * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rnh->rnh_close) - rnh->rnh_close((struct radix_node *)rt, rnh); + rnh->rnh_close((struct radix_node *)rt, &rnh->head); /* * If we are no longer "up" (and ref == 0) @@ -531,12 +586,11 @@ rtfree(struct rtentry *rt) * This also frees the gateway, as they are always malloc'd * together. */ - Free(rt_key(rt)); + R_Free(rt_key(rt)); /* * and the rtentry itself of course */ - RT_LOCK_DESTROY(rt); uma_zfree(V_rtzone, rt); return; } @@ -552,17 +606,6 @@ done: * message from the network layer. 
*/ void -rtredirect(struct sockaddr *dst, - struct sockaddr *gateway, - struct sockaddr *netmask, - int flags, - struct sockaddr *src) -{ - - rtredirect_fib(dst, gateway, netmask, flags, src, RT_DEFAULT_FIB); -} - -void rtredirect_fib(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, @@ -570,12 +613,12 @@ rtredirect_fib(struct sockaddr *dst, struct sockaddr *src, u_int fibnum) { - struct rtentry *rt, *rt0 = NULL; + struct rtentry *rt; int error = 0; short *stat = NULL; struct rt_addrinfo info; struct ifaddr *ifa; - struct radix_node_head *rnh; + struct rib_head *rnh; ifa = NULL; rnh = rt_tables_get_rnh(fibnum, dst->sa_family); @@ -585,7 +628,7 @@ rtredirect_fib(struct sockaddr *dst, } /* verify the gateway is directly reachable */ - if ((ifa = ifa_ifwithnet_fib(gateway, 0, fibnum)) == NULL) { + if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) { error = ENETUNREACH; goto out; } @@ -596,13 +639,20 @@ rtredirect_fib(struct sockaddr *dst, * we have a routing loop, perhaps as a result of an interface * going down recently. */ - if (!(flags & RTF_DONE) && rt && - (!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa)) - error = EINVAL; - else if (ifa_ifwithaddr_check(gateway)) + if (!(flags & RTF_DONE) && rt) { + if (!sa_equal(src, rt->rt_gateway)) { + error = EINVAL; + goto done; + } + if (rt->rt_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) { + error = EINVAL; + goto done; + } + } + if ((flags & RTF_GATEWAY) && ifa_ifwithaddr_check(gateway)) { error = EHOSTUNREACH; - if (error) goto done; + } /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts @@ -622,36 +672,31 @@ rtredirect_fib(struct sockaddr *dst, * Create new route, rather than smashing route to net. 
*/ create: - rt0 = rt; - rt = NULL; + if (rt != NULL) + RTFREE_LOCKED(rt); - flags |= RTF_GATEWAY | RTF_DYNAMIC; + flags |= RTF_DYNAMIC; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_ifa = ifa; info.rti_flags = flags; - if (rt0 != NULL) - RT_UNLOCK(rt0); /* drop lock to avoid LOR with RNH */ error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); if (rt != NULL) { RT_LOCK(rt); - if (rt0 != NULL) - EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst); flags = rt->rt_flags; } - if (rt0 != NULL) - RTFREE(rt0); stat = &V_rtstat.rts_dynamic; } else { - struct rtentry *gwrt; /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ + if ((flags & RTF_GATEWAY) == 0) + rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &V_rtstat.rts_newgateway; @@ -659,13 +704,10 @@ rtredirect_fib(struct sockaddr *dst, * add the key and gateway (in one malloc'd chunk). */ RT_UNLOCK(rt); - RADIX_NODE_HEAD_LOCK(rnh); + RIB_WLOCK(rnh); RT_LOCK(rt); rt_setgate(rt, rt_key(rt), gateway); - gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED); - RADIX_NODE_HEAD_UNLOCK(rnh); - EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst); - RTFREE_LOCKED(gwrt); + RIB_WUNLOCK(rnh); } } else error = EHOSTUNREACH; @@ -687,13 +729,6 @@ out: ifa_free(ifa); } -int -rtioctl(u_long req, caddr_t data) -{ - - return (rtioctl_fib(req, data, RT_DEFAULT_FIB)); -} - /* * Routing table ioctl interface. */ @@ -715,21 +750,11 @@ rtioctl_fib(u_long req, caddr_t data, u_int fibnum) #endif /* INET */ } -/* - * For both ifa_ifwithroute() routines, 'ifa' is returned referenced. 
- */ struct ifaddr * -ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway) -{ - - return (ifa_ifwithroute_fib(flags, dst, gateway, RT_DEFAULT_FIB)); -} - -struct ifaddr * -ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway, +ifa_ifwithroute(int flags, const struct sockaddr *dst, struct sockaddr *gateway, u_int fibnum) { - register struct ifaddr *ifa; + struct ifaddr *ifa; int not_found = 0; if ((flags & RTF_GATEWAY) == 0) { @@ -742,7 +767,7 @@ ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway, */ ifa = NULL; if (flags & RTF_HOST) - ifa = ifa_ifwithdstaddr_fib(dst, fibnum); + ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { @@ -751,12 +776,12 @@ ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway, * or host, the gateway may still be on the * other end of a pt to pt link. */ - ifa = ifa_ifwithdstaddr_fib(gateway, fibnum); + ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) - ifa = ifa_ifwithnet_fib(gateway, 0, fibnum); + ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { - struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum); + struct rtentry *rt = rtalloc1_fib(gateway, 0, 0, fibnum); if (rt == NULL) return (NULL); /* @@ -800,19 +825,6 @@ ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway, * all the bits of info needed */ int -rtrequest(int req, - struct sockaddr *dst, - struct sockaddr *gateway, - struct sockaddr *netmask, - int flags, - struct rtentry **ret_nrt) -{ - - return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, - RT_DEFAULT_FIB)); -} - -int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, @@ -834,6 +846,443 @@ rtrequest_fib(int req, return rtrequest1_fib(req, &info, ret_nrt, fibnum); } + +/* + * Copy most of @rt data into @info. 
+ * + * If @flags contains NHR_COPY, copies dst,netmask and gw to the + * pointers specified by @info structure. Assume such pointers + * are zeroed sockaddr-like structures with sa_len field initialized + * to reflect size of the provided buffer. if no NHR_COPY is specified, + * point dst,netmask and gw @info fields to appropriate @rt values. + * + * if @flags contains NHR_REF, do refcouting on rt_ifp. + * + * Returns 0 on success. + */ +int +rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags) +{ + struct rt_metrics *rmx; + struct sockaddr *src, *dst; + int sa_len; + + if (flags & NHR_COPY) { + /* Copy destination if dst is non-zero */ + src = rt_key(rt); + dst = info->rti_info[RTAX_DST]; + sa_len = src->sa_len; + if (dst != NULL) { + if (src->sa_len > dst->sa_len) + return (ENOMEM); + memcpy(dst, src, src->sa_len); + info->rti_addrs |= RTA_DST; + } + + /* Copy mask if set && dst is non-zero */ + src = rt_mask(rt); + dst = info->rti_info[RTAX_NETMASK]; + if (src != NULL && dst != NULL) { + + /* + * Radix stores different value in sa_len, + * assume rt_mask() to have the same length + * as rt_key() + */ + if (sa_len > dst->sa_len) + return (ENOMEM); + memcpy(dst, src, src->sa_len); + info->rti_addrs |= RTA_NETMASK; + } + + /* Copy gateway is set && dst is non-zero */ + src = rt->rt_gateway; + dst = info->rti_info[RTAX_GATEWAY]; + if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){ + if (src->sa_len > dst->sa_len) + return (ENOMEM); + memcpy(dst, src, src->sa_len); + info->rti_addrs |= RTA_GATEWAY; + } + } else { + info->rti_info[RTAX_DST] = rt_key(rt); + info->rti_addrs |= RTA_DST; + if (rt_mask(rt) != NULL) { + info->rti_info[RTAX_NETMASK] = rt_mask(rt); + info->rti_addrs |= RTA_NETMASK; + } + if (rt->rt_flags & RTF_GATEWAY) { + info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info->rti_addrs |= RTA_GATEWAY; + } + } + + rmx = info->rti_rmx; + if (rmx != NULL) { + info->rti_mflags |= RTV_MTU; + rmx->rmx_mtu = rt->rt_mtu; + } + + 
info->rti_flags = rt->rt_flags; + info->rti_ifp = rt->rt_ifp; + info->rti_ifa = rt->rt_ifa; + + if (flags & NHR_REF) { + /* Do 'traditional' refcouting */ + if_ref(info->rti_ifp); + } + + return (0); +} + +/* + * Lookups up route entry for @dst in RIB database for fib @fibnum. + * Exports entry data to @info using rt_exportinfo(). + * + * if @flags contains NHR_REF, refcouting is performed on rt_ifp. + * All references can be released later by calling rib_free_info() + * + * Returns 0 on success. + * Returns ENOENT for lookup failure, ENOMEM for export failure. + */ +int +rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, + uint32_t flowid, struct rt_addrinfo *info) +{ + struct rib_head *rh; + struct radix_node *rn; + struct rtentry *rt; + int error; + + KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum")); + rh = rt_tables_get_rnh(fibnum, dst->sa_family); + if (rh == NULL) + return (ENOENT); + + RIB_RLOCK(rh); + rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head); + if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { + rt = RNTORT(rn); + /* Ensure route & ifp is UP */ + if (RT_LINK_IS_UP(rt->rt_ifp)) { + flags = (flags & NHR_REF) | NHR_COPY; + error = rt_exportinfo(rt, info, flags); + RIB_RUNLOCK(rh); + + return (error); + } + } + RIB_RUNLOCK(rh); + + return (ENOENT); +} + +/* + * Releases all references acquired by rib_lookup_info() when + * called with NHR_REF flags. + */ +void +rib_free_info(struct rt_addrinfo *info) +{ + + if_rele(info->rti_ifp); +} + +/* + * Iterates over all existing fibs in system calling + * @setwa_f function prior to traversing each fib. + * Calls @wa_f function for each element in current fib. + * If af is not AF_UNSPEC, iterates over fibs in particular + * address family. 
+ */ +void +rt_foreach_fib_walk(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f, + void *arg) +{ + struct rib_head *rnh; + uint32_t fibnum; + int i; + + for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { + /* Do we want some specific family? */ + if (af != AF_UNSPEC) { + rnh = rt_tables_get_rnh(fibnum, af); + if (rnh == NULL) + continue; + if (setwa_f != NULL) + setwa_f(rnh, fibnum, af, arg); + + RIB_WLOCK(rnh); + rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); + RIB_WUNLOCK(rnh); + continue; + } + + for (i = 1; i <= AF_MAX; i++) { + rnh = rt_tables_get_rnh(fibnum, i); + if (rnh == NULL) + continue; + if (setwa_f != NULL) + setwa_f(rnh, fibnum, i, arg); + + RIB_WLOCK(rnh); + rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); + RIB_WUNLOCK(rnh); + } + } +} + +struct rt_delinfo +{ + struct rt_addrinfo info; + struct rib_head *rnh; + struct rtentry *head; +}; + +/* + * Conditionally unlinks @rn from radix tree based + * on info data passed in @arg. + */ +static int +rt_checkdelroute(struct radix_node *rn, void *arg) +{ + struct rt_delinfo *di; + struct rt_addrinfo *info; + struct rtentry *rt; + int error; + + di = (struct rt_delinfo *)arg; + rt = (struct rtentry *)rn; + info = &di->info; + error = 0; + + info->rti_info[RTAX_DST] = rt_key(rt); + info->rti_info[RTAX_NETMASK] = rt_mask(rt); + info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; + + rt = rt_unlinkrte(di->rnh, info, &error); + if (rt == NULL) { + /* Either not allowed or not matched. Skip entry */ + return (0); + } + + /* Entry was unlinked. Add to the list and return */ + rt->rt_chain = di->head; + di->head = rt; + + return (0); +} + +/* + * Iterates over all existing fibs in system. + * Deletes each element for which @filter_f function returned + * non-zero value. + * If @af is not AF_UNSPEC, iterates over fibs in particular + * address family. 
+ */ +void +rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg) +{ + struct rib_head *rnh; + struct rt_delinfo di; + struct rtentry *rt; + uint32_t fibnum; + int i, start, end; + + bzero(&di, sizeof(di)); + di.info.rti_filter = filter_f; + di.info.rti_filterdata = arg; + + for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { + /* Do we want some specific family? */ + if (af != AF_UNSPEC) { + start = af; + end = af; + } else { + start = 1; + end = AF_MAX; + } + + for (i = start; i <= end; i++) { + rnh = rt_tables_get_rnh(fibnum, i); + if (rnh == NULL) + continue; + di.rnh = rnh; + + RIB_WLOCK(rnh); + rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); + RIB_WUNLOCK(rnh); + + if (di.head == NULL) + continue; + + /* We might have something to reclaim */ + while (di.head != NULL) { + rt = di.head; + di.head = rt->rt_chain; + rt->rt_chain = NULL; + + /* TODO std rt -> rt_addrinfo export */ + di.info.rti_info[RTAX_DST] = rt_key(rt); + di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); + + rt_notifydelete(rt, &di.info); + RTFREE_LOCKED(rt); + } + + } + } +} + +/* + * Delete Routes for a Network Interface + * + * Called for each routing entry via the rnh->rnh_walktree() call above + * to delete all route entries referencing a detaching network interface. + * + * Arguments: + * rt pointer to rtentry + * arg argument passed to rnh->rnh_walktree() - detaching interface + * + * Returns: + * 0 successful + * errno failed - reason indicated + */ +static int +rt_ifdelroute(const struct rtentry *rt, void *arg) +{ + struct ifnet *ifp = arg; + + if (rt->rt_ifp != ifp) + return (0); + + /* + * Protect (sorta) against walktree recursion problems + * with cloned routes + */ + if ((rt->rt_flags & RTF_UP) == 0) + return (0); + + return (1); +} + +/* + * Delete all remaining routes using this interface + * Unfortuneatly the only way to do this is to slog through + * the entire routing table looking for routes which point + * to this interface...oh well... 
+ */ +void +rt_flushifroutes_af(struct ifnet *ifp, int af) +{ + KASSERT((af >= 1 && af <= AF_MAX), ("%s: af %d not >= 1 and <= %d", + __func__, af, AF_MAX)); + + rt_foreach_fib_walk_del(af, rt_ifdelroute, ifp); +} + +void +rt_flushifroutes(struct ifnet *ifp) +{ + + rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); +} + +/* + * Conditionally unlinks rtentry matching data inside @info from @rnh. + * Returns unlinked, locked and referenced @rtentry on success, + * Returns NULL and sets @perror to: + * ESRCH - if prefix was not found, + * EADDRINUSE - if trying to delete PINNED route without appropriate flag. + * ENOENT - if supplied filter function returned 0 (not matched). + */ +static struct rtentry * +rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror) +{ + struct sockaddr *dst, *netmask; + struct rtentry *rt; + struct radix_node *rn; + + dst = info->rti_info[RTAX_DST]; + netmask = info->rti_info[RTAX_NETMASK]; + + rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head); + if (rt == NULL) { + *perror = ESRCH; + return (NULL); + } + + if ((info->rti_flags & RTF_PINNED) == 0) { + /* Check if target route can be deleted */ + if (rt->rt_flags & RTF_PINNED) { + *perror = EADDRINUSE; + return (NULL); + } + } + + if (info->rti_filter != NULL) { + if (info->rti_filter(rt, info->rti_filterdata) == 0) { + /* Not matched */ + *perror = ENOENT; + return (NULL); + } + + /* + * Filter function requested rte deletion. + * Ease the caller work by filling in remaining info + * from that particular entry. + */ + info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; + } + + /* + * Remove the item from the tree and return it. + * Complain if it is not there and do no more processing. 
+ */ + *perror = ESRCH; +#ifdef RADIX_MPATH + if (rt_mpath_capable(rnh)) + rn = rt_mpath_unlink(rnh, info, rt, perror); + else +#endif + rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); + if (rn == NULL) + return (NULL); + + if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) + panic ("rtrequest delete"); + + rt = RNTORT(rn); + RT_LOCK(rt); + RT_ADDREF(rt); + rt->rt_flags &= ~RTF_UP; + + *perror = 0; + + return (rt); +} + +static void +rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) +{ + struct ifaddr *ifa; + + /* + * give the protocol a chance to keep things in sync. + */ + ifa = rt->rt_ifa; + if (ifa != NULL && ifa->ifa_rtrequest != NULL) + ifa->ifa_rtrequest(RTM_DELETE, rt, info); + + /* + * One more rtentry floating around that is not + * linked to the routing table. rttrash will be decremented + * when RTFREE(rt) is eventually called. + */ + V_rttrash++; +} + + /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! @@ -845,13 +1294,6 @@ rtrequest_fib(int req, #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags -int -rt_getifa(struct rt_addrinfo *info) -{ - - return (rt_getifa_fib(info, RT_DEFAULT_FIB)); -} - /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. 
@@ -868,7 +1310,7 @@ rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) */ if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK && - (ifa = ifa_ifwithnet_fib(ifpaddr, 0, fibnum)) != NULL) { + (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) { info->rti_ifp = ifa->ifa_ifp; ifa_free(ifa); } @@ -882,10 +1324,10 @@ rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) if (sa != NULL && info->rti_ifp != NULL) info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); else if (dst != NULL && gateway != NULL) - info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway, + info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) - info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa, + info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if ((ifa = info->rti_ifa) != NULL) { @@ -896,94 +1338,70 @@ rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) return (error); } -/* - * Expunges references to a route that's about to be reclaimed. - * The route must be locked. 
- */ -int -rtexpunge(struct rtentry *rt) +static int +if_updatemtu_cb(struct radix_node *rn, void *arg) { -#if !defined(RADIX_MPATH) - struct radix_node *rn; -#else - struct rt_addrinfo info; - int fib; - struct rtentry *rt0; -#endif - struct radix_node_head *rnh; - struct ifaddr *ifa; - int error = 0; - - /* - * Find the correct routing tree to use for this Address Family - */ - rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); - RT_LOCK_ASSERT(rt); - if (rnh == NULL) - return (EAFNOSUPPORT); - RADIX_NODE_HEAD_LOCK_ASSERT(rnh); + struct rtentry *rt; + struct if_mtuinfo *ifmtu; -#ifdef RADIX_MPATH - fib = rt->rt_fibnum; - bzero(&info, sizeof(info)); - info.rti_ifp = rt->rt_ifp; - info.rti_flags = RTF_RNH_LOCKED; - info.rti_info[RTAX_DST] = rt_key(rt); - info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr; + rt = (struct rtentry *)rn; + ifmtu = (struct if_mtuinfo *)arg; - RT_UNLOCK(rt); - error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib); + if (rt->rt_ifp != ifmtu->ifp) + return (0); - if (error == 0 && rt0 != NULL) { - rt = rt0; - RT_LOCK(rt); - } else if (error != 0) { - RT_LOCK(rt); - return (error); + if (rt->rt_mtu >= ifmtu->mtu) { + /* We have to decrease mtu regardless of flags */ + rt->rt_mtu = ifmtu->mtu; + return (0); } -#else + /* - * Remove the item from the tree; it should be there, - * but when callers invoke us blindly it may not (sigh). + * New MTU is bigger. 
Check if are allowed to alter it */ - rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh); - if (rn == NULL) { - error = ESRCH; - goto bad; + if ((rt->rt_flags & (RTF_FIXEDMTU | RTF_GATEWAY | RTF_HOST)) != 0) { + + /* + * Skip routes with user-supplied MTU and + * non-interface routes + */ + return (0); } - KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0, - ("unexpected flags 0x%x", rn->rn_flags)); - KASSERT(rt == RNTORT(rn), - ("lookup mismatch, rt %p rn %p", rt, rn)); -#endif /* RADIX_MPATH */ - rt->rt_flags &= ~RTF_UP; + /* We are safe to update route MTU */ + rt->rt_mtu = ifmtu->mtu; - /* - * Give the protocol a chance to keep things in sync. - */ - if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) { - struct rt_addrinfo info; + return (0); +} - bzero((caddr_t)&info, sizeof(info)); - info.rti_flags = rt->rt_flags; - info.rti_info[RTAX_DST] = rt_key(rt); - info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; - info.rti_info[RTAX_NETMASK] = rt_mask(rt); - ifa->ifa_rtrequest(RTM_DELETE, rt, &info); - } +void +rt_updatemtu(struct ifnet *ifp) +{ + struct if_mtuinfo ifmtu; + struct rib_head *rnh; + int i, j; + + ifmtu.ifp = ifp; /* - * one more rtentry floating around that is not - * linked to the routing table. + * Try to update rt_mtu for all routes using this interface + * Unfortunately the only way to do this is to traverse all + * routing tables in all fibs/domains. 
*/ - V_rttrash++; -#if !defined(RADIX_MPATH) -bad: -#endif - return (error); + for (i = 1; i <= AF_MAX; i++) { + ifmtu.mtu = if_getmtu_family(ifp, i); + for (j = 0; j < rt_numfibs; j++) { + rnh = rt_tables_get_rnh(j, i); + if (rnh == NULL) + continue; + RIB_WLOCK(rnh); + rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu); + RIB_WUNLOCK(rnh); + } + } } + #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); @@ -1036,26 +1454,32 @@ rt_print(char *buf, int buflen, struct rtentry *rt) #endif #ifdef RADIX_MPATH -static int -rn_mpath_update(int req, struct rt_addrinfo *info, - struct radix_node_head *rnh, struct rtentry **ret_nrt) +/* + * Deletes key for single-path routes, unlinks rtentry with + * gateway specified in @info from multi-path routes. + * + * Returnes unlinked entry. In case of failure, returns NULL + * and sets @perror to ESRCH. + */ +static struct radix_node * +rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, + struct rtentry *rto, int *perror) { /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. */ - struct rtentry *rt, *rto = NULL; - register struct radix_node *rn; - int error = 0; + struct rtentry *rt; // *rto = NULL; + struct radix_node *rn; + struct sockaddr *gw; - rn = rnh->rnh_lookup(dst, netmask, rnh); - if (rn == NULL) - return (ESRCH); - rto = rt = RNTORT(rn); + gw = info->rti_info[RTAX_GATEWAY]; + rt = rt_mpath_matchgate(rto, gw); + if (rt == NULL) { + *perror = ESRCH; + return (NULL); + } - rt = rt_mpath_matchgate(rt, gateway); - if (rt == NULL) - return (ESRCH); /* * this is the first entry in the chain */ @@ -1078,67 +1502,95 @@ rn_mpath_update(int req, struct rt_addrinfo *info, * check the case when there is only * one route in the chain. 
*/ - if (gateway && - (rt->rt_gateway->sa_len != gateway->sa_len || - memcmp(rt->rt_gateway, gateway, gateway->sa_len))) - error = ESRCH; - else { - /* - * remove from tree before returning it - * to the caller - */ - rn = rnh->rnh_deladdr(dst, netmask, rnh); - KASSERT(rt == RNTORT(rn), ("radix node disappeared")); - goto gwdelete; + if (gw && + (rt->rt_gateway->sa_len != gw->sa_len || + memcmp(rt->rt_gateway, gw, gw->sa_len))) { + *perror = ESRCH; + return (NULL); } - } + /* * use the normal delete code to remove * the first entry */ - if (req != RTM_DELETE) - goto nondelete; - - error = ENOENT; - goto done; + rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); + *perror = 0; + return (rn); } /* * if the entry is 2nd and on up */ - if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt)) + if (rt_mpath_deldup(rto, rt) == 0) panic ("rtrequest1: rt_mpath_deldup"); -gwdelete: - RT_LOCK(rt); - RT_ADDREF(rt); - if (req == RTM_DELETE) { - rt->rt_flags &= ~RTF_UP; - /* - * One more rtentry floating around that is not - * linked to the routing table. rttrash will be decremented - * when RTFREE(rt) is eventually called. - */ - V_rttrash++; + *perror = 0; + rn = (struct radix_node *)rt; + return (rn); +} +#endif + +#ifdef FLOWTABLE +static struct rtentry * +rt_flowtable_check_route(struct rib_head *rnh, struct rt_addrinfo *info) +{ +#if defined(INET6) || defined(INET) + struct radix_node *rn; +#endif + struct rtentry *rt0; + + rt0 = NULL; + /* "flow-table" only supports IPv6 and IPv4 at the moment. 
*/ + switch (dst->sa_family) { +#ifdef INET6 + case AF_INET6: +#endif +#ifdef INET + case AF_INET: +#endif +#if defined(INET6) || defined(INET) + rn = rnh->rnh_matchaddr(dst, &rnh->head); + if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { + struct sockaddr *mask; + u_char *m, *n; + int len; + + /* + * compare mask to see if the new route is + * more specific than the existing one + */ + rt0 = RNTORT(rn); + RT_LOCK(rt0); + RT_ADDREF(rt0); + RT_UNLOCK(rt0); + /* + * A host route is already present, so + * leave the flow-table entries as is. + */ + if (rt0->rt_flags & RTF_HOST) { + RTFREE(rt0); + rt0 = NULL; + } else if (!(flags & RTF_HOST) && netmask) { + mask = rt_mask(rt0); + len = mask->sa_len; + m = (u_char *)mask; + n = (u_char *)netmask; + while (len-- > 0) { + if (*n != *m) + break; + n++; + m++; + } + if (len == 0 || (*n < *m)) { + RTFREE(rt0); + rt0 = NULL; + } + } + } +#endif/* INET6 || INET */ } - -nondelete: - if (req != RTM_DELETE) - panic("unrecognized request %d", req); - - /* - * If the caller wants it, then it can have it, - * but it's up to it to free the rtentry as we won't be - * doing it. 
- */ - if (ret_nrt) { - *ret_nrt = rt; - RT_UNLOCK(rt); - } else - RTFREE_LOCKED(rt); -done: - return (error); + return (rt0); } #endif @@ -1146,19 +1598,19 @@ int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { - int error = 0, needlock = 0; - register struct rtentry *rt; + int error = 0; + struct rtentry *rt, *rt_old; #ifdef FLOWTABLE - register struct rtentry *rt0; + struct rtentry *rt0; #endif - register struct radix_node *rn; - register struct radix_node_head *rnh; + struct radix_node *rn; + struct rib_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; struct sockaddr_storage mdst; -#define senderr(x) { error = x ; goto bad; } KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); + KASSERT((flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked")); switch (dst->sa_family) { case AF_INET6: case AF_INET: @@ -1175,12 +1627,7 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) return (EAFNOSUPPORT); - needlock = ((flags & RTF_RNH_LOCKED) == 0); - flags &= ~RTF_RNH_LOCKED; - if (needlock) - RADIX_NODE_HEAD_LOCK(rnh); - else - RADIX_NODE_HEAD_LOCK_ASSERT(rnh); + /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. 
@@ -1194,52 +1641,14 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } -#ifdef RADIX_MPATH - if (rn_mpath_capable(rnh)) { - error = rn_mpath_update(req, info, rnh, ret_nrt); - /* - * "bad" holds true for the success case - * as well - */ - if (error != ENOENT) - goto bad; - error = 0; - } -#endif - if ((flags & RTF_PINNED) == 0) { - /* Check if target route can be deleted */ - rt = (struct rtentry *)rnh->rnh_lookup(dst, - netmask, rnh); - if ((rt != NULL) && (rt->rt_flags & RTF_PINNED)) - senderr(EADDRINUSE); - } - /* - * Remove the item from the tree and return it. - * Complain if it is not there and do no more processing. - */ - rn = rnh->rnh_deladdr(dst, netmask, rnh); - if (rn == NULL) - senderr(ESRCH); - if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) - panic ("rtrequest delete"); - rt = RNTORT(rn); - RT_LOCK(rt); - RT_ADDREF(rt); - rt->rt_flags &= ~RTF_UP; + RIB_WLOCK(rnh); + rt = rt_unlinkrte(rnh, info, &error); + RIB_WUNLOCK(rnh); + if (error != 0) + return (error); - /* - * give the protocol a chance to keep things in sync. - */ - if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) - ifa->ifa_rtrequest(RTM_DELETE, rt, info); - - /* - * One more rtentry floating around that is not - * linked to the routing table. rttrash will be decremented - * when RTFREE(rt) is eventually called. 
- */ - V_rttrash++; + rt_notifydelete(rt, info); /* * If the caller wants it, then it can have it, @@ -1260,37 +1669,32 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, break; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) - senderr(EINVAL); + return (EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) - senderr(EINVAL); + return (EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, fibnum); if (error) - senderr(error); + return (error); } else ifa_ref(info->rti_ifa); ifa = info->rti_ifa; - rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO); + rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { - if (ifa != NULL) - ifa_free(ifa); - senderr(ENOBUFS); + ifa_free(ifa); + return (ENOBUFS); } - RT_LOCK_INIT(rt); rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* * Add the gateway. Possibly re-malloc-ing the storage for it. */ - RT_LOCK(rt); if ((error = rt_setgate(rt, dst, gateway)) != 0) { - RT_LOCK_DESTROY(rt); - if (ifa != NULL) - ifa_free(ifa); + ifa_free(ifa); uma_zfree(V_rtzone, rt); - senderr(error); + return (error); } /* @@ -1313,111 +1717,81 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, */ rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; - rt->rt_rmx.rmx_weight = 1; + rt->rt_weight = 1; + rt_setmetrics(info, rt); + + RIB_WLOCK(rnh); + RT_LOCK(rt); #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ - if (rn_mpath_capable(rnh) && + if (rt_mpath_capable(rnh) && rt_mpath_conflict(rnh, rt, netmask)) { - if (rt->rt_ifa) { - ifa_free(rt->rt_ifa); - } - Free(rt_key(rt)); - RT_LOCK_DESTROY(rt); + RIB_WUNLOCK(rnh); + + ifa_free(rt->rt_ifa); + R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); - senderr(EEXIST); + return (EEXIST); } #endif #ifdef FLOWTABLE - rt0 = NULL; - /* "flow-table" only supports IPv6 and IPv4 at the moment. 
*/ - switch (dst->sa_family) { -#ifdef INET6 - case AF_INET6: -#endif -#ifdef INET - case AF_INET: -#endif -#if defined(INET6) || defined(INET) - rn = rnh->rnh_matchaddr(dst, rnh); - if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { - struct sockaddr *mask; - u_char *m, *n; - int len; - - /* - * compare mask to see if the new route is - * more specific than the existing one - */ - rt0 = RNTORT(rn); - RT_LOCK(rt0); - RT_ADDREF(rt0); - RT_UNLOCK(rt0); - /* - * A host route is already present, so - * leave the flow-table entries as is. - */ - if (rt0->rt_flags & RTF_HOST) { - RTFREE(rt0); - rt0 = NULL; - } else if (!(flags & RTF_HOST) && netmask) { - mask = rt_mask(rt0); - len = mask->sa_len; - m = (u_char *)mask; - n = (u_char *)netmask; - while (len-- > 0) { - if (*n != *m) - break; - n++; - m++; - } - if (len == 0 || (*n < *m)) { - RTFREE(rt0); - rt0 = NULL; - } - } - } -#endif/* INET6 || INET */ - } + rt0 = rt_flowtable_check_route(rnh, info); #endif /* FLOWTABLE */ /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ - rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes); + rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); + + rt_old = NULL; + if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) { + + /* + * Force removal and re-try addition + * TODO: better multipath&pinned support + */ + struct sockaddr *info_dst = info->rti_info[RTAX_DST]; + info->rti_info[RTAX_DST] = ndst; + /* Do not delete existing PINNED(interface) routes */ + info->rti_flags &= ~RTF_PINNED; + rt_old = rt_unlinkrte(rnh, info, &error); + info->rti_flags |= RTF_PINNED; + info->rti_info[RTAX_DST] = info_dst; + if (rt_old != NULL) + rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, + rt->rt_nodes); + } + RIB_WUNLOCK(rnh); + + if (rt_old != NULL) + RT_UNLOCK(rt_old); + /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { - if (rt->rt_ifa) - ifa_free(rt->rt_ifa); - Free(rt_key(rt)); - RT_LOCK_DESTROY(rt); + 
ifa_free(rt->rt_ifa); + R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); #ifdef FLOWTABLE if (rt0 != NULL) RTFREE(rt0); #endif - senderr(EEXIST); + return (EEXIST); } #ifdef FLOWTABLE else if (rt0 != NULL) { - switch (dst->sa_family) { -#ifdef INET6 - case AF_INET6: - flowtable_route_flush(V_ip6_ft, rt0); - break; -#endif -#ifdef INET - case AF_INET: - flowtable_route_flush(V_ip_ft, rt0); - break; -#endif - } + flowtable_route_flush(dst->sa_family, rt0); RTFREE(rt0); } #endif + if (rt_old != NULL) { + rt_notifydelete(rt_old, info); + RTFREE(rt_old); + } + /* * If this protocol has something to add to this then * allow it to do that as well. @@ -1433,16 +1807,19 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, *ret_nrt = rt; RT_ADDREF(rt); } + rnh->rnh_gen++; /* Routing table updated */ RT_UNLOCK(rt); break; + case RTM_CHANGE: + RIB_WLOCK(rnh); + error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum); + RIB_WUNLOCK(rnh); + break; default: error = EOPNOTSUPP; } -bad: - if (needlock) - RADIX_NODE_HEAD_UNLOCK(rnh); + return (error); -#undef senderr } #undef dst @@ -1452,20 +1829,147 @@ bad: #undef ifpaddr #undef flags +static int +rtrequest1_fib_change(struct rib_head *rnh, struct rt_addrinfo *info, + struct rtentry **ret_nrt, u_int fibnum) +{ + struct rtentry *rt = NULL; + int error = 0; + int free_ifa = 0; + int family, mtu; + struct if_mtuinfo ifmtu; + + rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], + info->rti_info[RTAX_NETMASK], &rnh->head); + + if (rt == NULL) + return (ESRCH); + +#ifdef RADIX_MPATH + /* + * If we got multipath routes, + * we require users to specify a matching RTAX_GATEWAY. 
+ */ + if (rt_mpath_capable(rnh)) { + rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); + if (rt == NULL) + return (ESRCH); + } +#endif + + RT_LOCK(rt); + + rt_setmetrics(info, rt); + + /* + * New gateway could require new ifaddr, ifp; + * flags may also be different; ifp may be specified + * by ll sockaddr when protocol address is ambiguous + */ + if (((rt->rt_flags & RTF_GATEWAY) && + info->rti_info[RTAX_GATEWAY] != NULL) || + info->rti_info[RTAX_IFP] != NULL || + (info->rti_info[RTAX_IFA] != NULL && + !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) { + + error = rt_getifa_fib(info, fibnum); + if (info->rti_ifa != NULL) + free_ifa = 1; + + if (error != 0) + goto bad; + } + + /* Check if outgoing interface has changed */ + if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && + rt->rt_ifa != NULL && rt->rt_ifa->ifa_rtrequest != NULL) { + rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info); + ifa_free(rt->rt_ifa); + } + /* Update gateway address */ + if (info->rti_info[RTAX_GATEWAY] != NULL) { + error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]); + if (error != 0) + goto bad; + + rt->rt_flags &= ~RTF_GATEWAY; + rt->rt_flags |= (RTF_GATEWAY & info->rti_flags); + } + + if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) { + ifa_ref(info->rti_ifa); + rt->rt_ifa = info->rti_ifa; + rt->rt_ifp = info->rti_ifp; + } + /* Allow some flags to be toggled on change. 
*/ + rt->rt_flags &= ~RTF_FMASK; + rt->rt_flags |= info->rti_flags & RTF_FMASK; + + if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL) + rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); + + /* Alter route MTU if necessary */ + if (rt->rt_ifp != NULL) { + family = info->rti_info[RTAX_DST]->sa_family; + mtu = if_getmtu_family(rt->rt_ifp, family); + /* Set default MTU */ + if (rt->rt_mtu == 0) + rt->rt_mtu = mtu; + if (rt->rt_mtu != mtu) { + /* Check if we really need to update */ + ifmtu.ifp = rt->rt_ifp; + ifmtu.mtu = mtu; + if_updatemtu_cb(rt->rt_nodes, &ifmtu); + } + } + + if (ret_nrt) { + *ret_nrt = rt; + RT_ADDREF(rt); + } +bad: + RT_UNLOCK(rt); + if (free_ifa != 0) + ifa_free(info->rti_ifa); + return (error); +} + +static void +rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt) +{ + + if (info->rti_mflags & RTV_MTU) { + if (info->rti_rmx->rmx_mtu != 0) { + + /* + * MTU was explicitly provided by user. + * Keep it. + */ + rt->rt_flags |= RTF_FIXEDMTU; + } else { + + /* + * User explicitly sets MTU to 0. + * Assume rollback to default. + */ + rt->rt_flags &= ~RTF_FIXEDMTU; + } + rt->rt_mtu = info->rti_rmx->rmx_mtu; + } + if (info->rti_mflags & RTV_WEIGHT) + rt->rt_weight = info->rti_rmx->rmx_weight; + /* Kernel -> userland timebase conversion. */ + if (info->rti_mflags & RTV_EXPIRE) + rt->rt_expire = info->rti_rmx->rmx_expire ? + info->rti_rmx->rmx_expire - time_second + time_uptime : 0; +} + int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { /* XXX dst may be overwritten, can we move this to below */ int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); -#ifdef INVARIANTS - struct radix_node_head *rnh; - rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family); -#endif - - RT_LOCK_ASSERT(rt); - RADIX_NODE_HEAD_LOCK_ASSERT(rnh); - /* * Prepare to store the gateway in rt->rt_gateway. 
* Both dst and gateway are stored one after the other in the same @@ -1487,7 +1991,7 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) * Free()/free() handle a NULL argument just fine. */ bcopy(dst, new, dlen); - Free(rt_key(rt)); /* free old block, if any */ + R_Free(rt_key(rt)); /* free old block, if any */ rt_key(rt) = (struct sockaddr *)new; rt->rt_gateway = (struct sockaddr *)(new + dlen); } @@ -1503,9 +2007,9 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { - register u_char *cp1 = (u_char *)src; - register u_char *cp2 = (u_char *)dst; - register u_char *cp3 = (u_char *)netmask; + u_char *cp1 = (u_char *)src; + u_char *cp2 = (u_char *)dst; + u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; @@ -1537,7 +2041,7 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) int didwork = 0; int a_failure = 0; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; - struct radix_node_head *rnh; + struct rib_head *rnh; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; @@ -1558,13 +2062,13 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) break; } if (fibnum == RT_ALL_FIBS) { - if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) { + if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) #ifndef __rtems__ startfib = endfib = ifa->ifa_ifp->if_fib; #else /* __rtems__ */ startfib = endfib = BSD_DEFAULT_FIB; #endif /* __rtems__ */ - } else { + else { startfib = 0; endfib = rt_numfibs - 1; } @@ -1609,10 +2113,10 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) if (rnh == NULL) /* this table doesn't exist but others might */ continue; - RADIX_NODE_HEAD_RLOCK(rnh); - rn = rnh->rnh_lookup(dst, netmask, rnh); + RIB_RLOCK(rnh); + rn = rnh->rnh_lookup(dst, netmask, &rnh->head); #ifdef RADIX_MPATH - if (rn_mpath_capable(rnh)) { + if 
(rt_mpath_capable(rnh)) { if (rn == NULL) error = ESRCH; @@ -1635,7 +2139,7 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) error = (rn == NULL || (rn->rn_flags & RNF_ROOT) || RNTORT(rn)->rt_ifa != ifa); - RADIX_NODE_HEAD_RUNLOCK(rnh); + RIB_RUNLOCK(rnh); if (error) { /* this is only an error if bad on ALL tables */ continue; @@ -1660,32 +2164,6 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); - if ((error == EEXIST) && (cmd == RTM_ADD)) { - /* - * Interface route addition failed. - * Atomically delete current prefix generating - * RTM_DELETE message, and retry adding - * interface prefix. - */ - rnh = rt_tables_get_rnh(fibnum, dst->sa_family); - RADIX_NODE_HEAD_LOCK(rnh); - - /* Delete old prefix */ - info.rti_ifa = NULL; - info.rti_flags = RTF_RNH_LOCKED; - - error = rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum); - if (error == 0) { - info.rti_ifa = ifa; - info.rti_flags = flags | RTF_RNH_LOCKED | - (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; - error = rtrequest1_fib(cmd, &info, &rt, fibnum); - } - - RADIX_NODE_HEAD_UNLOCK(rnh); - } - - if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change @@ -1760,15 +2238,6 @@ rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) return (error); } -#ifndef BURN_BRIDGES -/* special one for inet internal use. may not use. */ -int -rtinit_fib(struct ifaddr *ifa, int cmd, int flags) -{ - return (rtinit1(ifa, cmd, flags, RT_ALL_FIBS)); -} -#endif - /* * Set up a routing table entry, normally * for an interface. diff --git a/freebsd/sys/net/route.h b/freebsd/sys/net/route.h index 0baa9a4c..d44dc9d5 100644 --- a/freebsd/sys/net/route.h +++ b/freebsd/sys/net/route.h @@ -33,6 +33,9 @@ #ifndef _NET_ROUTE_H_ #define _NET_ROUTE_H_ +#include <sys/counter.h> +#include <net/vnet.h> + /* * Kernel resident routing tables. 
* @@ -41,32 +44,39 @@ */ /* - * A route consists of a destination address, a reference - * to a routing entry, and a reference to an llentry. - * These are often held by protocols in their control - * blocks, e.g. inpcb. + * Struct route consiste of a destination address, + * a route entry pointer, link-layer prepend data pointer along + * with its length. */ struct route { struct rtentry *ro_rt; struct llentry *ro_lle; - struct in_ifaddr *ro_ia; - int ro_flags; + /* + * ro_prepend and ro_plen are only used for bpf to pass in a + * preformed header. They are not cacheable. + */ + char *ro_prepend; + uint16_t ro_plen; + uint16_t ro_flags; + uint16_t ro_mtu; /* saved ro_rt mtu */ + uint16_t spare; struct sockaddr ro_dst; }; +#define RT_L2_ME_BIT 2 /* dst L2 addr is our address */ +#define RT_MAY_LOOP_BIT 3 /* dst may require loop copy */ +#define RT_HAS_HEADER_BIT 4 /* mbuf already have its header prepended */ + #define RT_CACHING_CONTEXT 0x1 /* XXX: not used anywhere */ #define RT_NORTREF 0x2 /* doesn't hold reference on ro_rt */ +#define RT_L2_ME (1 << RT_L2_ME_BIT) /* 0x0004 */ +#define RT_MAY_LOOP (1 << RT_MAY_LOOP_BIT) /* 0x0008 */ +#define RT_HAS_HEADER (1 << RT_HAS_HEADER_BIT) /* 0x0010 */ -/* - * These numbers are used by reliable protocols for determining - * retransmission behavior and are included in the routing structure. - */ -struct rt_metrics_lite { - u_long rmx_mtu; /* MTU for this path */ - u_long rmx_expire; /* lifetime for route, e.g. 
redirect */ - u_long rmx_pksent; /* packets sent using this route */ - u_long rmx_weight; /* absolute weight */ -}; +#define RT_REJECT 0x0020 /* Destination is reject */ +#define RT_BLACKHOLE 0x0040 /* Destination is blackhole */ +#define RT_HAS_GW 0x0080 /* Destination has GW */ +#define RT_LLE_CACHE 0x0100 /* Cache link layer */ struct rt_metrics { u_long rmx_locks; /* Kernel must leave these values alone */ @@ -91,14 +101,24 @@ struct rt_metrics { #define RTM_RTTUNIT 1000000 /* units for rtt, rttvar, as units per sec */ #define RTTTOPRHZ(r) ((r) / (RTM_RTTUNIT / PR_SLOWHZ)) +/* lle state is exported in rmx_state rt_metrics field */ +#define rmx_state rmx_weight + +/* + * Keep a generation count of routing table, incremented on route addition, + * so we can invalidate caches. This is accessed without a lock, as precision + * is not required. + */ +typedef volatile u_int rt_gen_t; /* tree generation (for adds) */ +#define RT_GEN(fibnum, af) rt_tables_get_gen(fibnum, af) + #define RT_DEFAULT_FIB 0 /* Explicitly mark fib=0 restricted cases */ #define RT_ALL_FIBS -1 /* Announce event for every fib */ +#ifdef _KERNEL extern u_int rt_numfibs; /* number of usable routing tables */ -extern u_int rt_add_addr_allfibs; /* Announce interfaces to all fibs */ -/* - * XXX kernel function pointer `rt_output' is visible to applications. 
- */ -struct mbuf; +VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */ +#define V_rt_add_addr_allfibs VNET(rt_add_addr_allfibs) +#endif /* * We distinguish between routes to hosts and routes to networks, @@ -114,6 +134,8 @@ struct mbuf; #include <net/radix_mpath.h> #endif #endif + +#if defined(_KERNEL) || defined(_WANT_RTENTRY) struct rtentry { struct radix_node rt_nodes[2]; /* tree glue, and other values */ /* @@ -124,33 +146,20 @@ struct rtentry { #define rt_key(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_key))) #define rt_mask(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_mask))) struct sockaddr *rt_gateway; /* value */ - int rt_flags; /* up/down?, host/net */ - int rt_refcnt; /* # held references */ struct ifnet *rt_ifp; /* the answer: interface to use */ struct ifaddr *rt_ifa; /* the answer: interface address to use */ - struct rt_metrics_lite rt_rmx; /* metrics used by rx'ing protocols */ - u_int rt_fibnum; /* which FIB */ -#ifdef _KERNEL - /* XXX ugly, user apps use this definition but don't have a mtx def */ - struct mtx rt_mtx; /* mutex for routing entry */ -#endif + int rt_flags; /* up/down?, host/net */ + int rt_refcnt; /* # held references */ + u_int rt_fibnum; /* which FIB */ + u_long rt_mtu; /* MTU for this path */ + u_long rt_weight; /* absolute weight */ + u_long rt_expire; /* lifetime for route, e.g. redirect */ +#define rt_endzero rt_pksent + counter_u64_t rt_pksent; /* packets sent using this route */ + struct mtx rt_mtx; /* mutex for routing entry */ + struct rtentry *rt_chain; /* pointer to next rtentry to delete */ }; - -/* - * Following structure necessary for 4.3 compatibility; - * We should eventually move it to a compat file. 
- */ -struct ortentry { - u_long rt_hash; /* to speed lookups */ - struct sockaddr rt_dst; /* key */ - struct sockaddr rt_gateway; /* value */ - short rt_flags; /* up/down?, host/net */ - short rt_refcnt; /* # held references */ - u_long rt_use; /* raw # packets forwarded */ - struct ifnet *rt_ifp; /* the answer: interface to use */ -}; - -#define rt_use rt_rmx.rmx_pksent +#endif /* _KERNEL || _WANT_RTENTRY */ #define RTF_UP 0x1 /* route usable */ #define RTF_GATEWAY 0x2 /* destination is a gateway */ @@ -169,15 +178,10 @@ struct ortentry { #define RTF_BLACKHOLE 0x1000 /* just discard pkts (during updates) */ #define RTF_PROTO2 0x4000 /* protocol specific routing flag */ #define RTF_PROTO1 0x8000 /* protocol specific routing flag */ - -/* XXX: temporary to stay API/ABI compatible with userland */ -#ifndef _KERNEL -#define RTF_PRCLONING 0x10000 /* unused, for compatibility */ -#endif - +/* 0x10000 unused, was RTF_PRCLONING */ /* 0x20000 unused, was RTF_WASCLONED */ #define RTF_PROTO3 0x40000 /* protocol specific routing flag */ -/* 0x80000 unused */ +#define RTF_FIXEDMTU 0x80000 /* MTU was explicitly specified */ #define RTF_PINNED 0x100000 /* route is immutable */ #define RTF_LOCAL 0x200000 /* route represents a local address */ #define RTF_BROADCAST 0x400000 /* route represents a bcast address */ @@ -185,7 +189,10 @@ struct ortentry { /* 0x8000000 and up unassigned */ #define RTF_STICKY 0x10000000 /* always route dst->src */ -#define RTF_RNH_LOCKED 0x40000000 /* radix node head is locked */ +#define RTF_RNH_LOCKED 0x40000000 /* unused */ + +#define RTF_GWFLAG_COMPAT 0x80000000 /* a compatibility bit for interacting + with existing routing apps */ /* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */ #define RTF_FMASK \ @@ -193,6 +200,40 @@ struct ortentry { RTF_REJECT | RTF_STATIC | RTF_STICKY) /* + * fib_ nexthop API flags. 
+ */ + +/* Consumer-visible nexthop info flags */ +#define NHF_REJECT 0x0010 /* RTF_REJECT */ +#define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */ +#define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */ +#define NHF_DEFAULT 0x0080 /* Default route */ +#define NHF_BROADCAST 0x0100 /* RTF_BROADCAST */ +#define NHF_GATEWAY 0x0200 /* RTF_GATEWAY */ + +/* Nexthop request flags */ +#define NHR_IFAIF 0x01 /* Return ifa_ifp interface */ +#define NHR_REF 0x02 /* For future use */ + +/* Control plane route request flags */ +#define NHR_COPY 0x100 /* Copy rte data */ + +#ifdef _KERNEL +/* rte<>ro_flags translation */ +static inline void +rt_update_ro_flags(struct route *ro) +{ + int rt_flags = ro->ro_rt->rt_flags; + + ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW); + + ro->ro_flags |= (rt_flags & RTF_REJECT) ? RT_REJECT : 0; + ro->ro_flags |= (rt_flags & RTF_BLACKHOLE) ? RT_BLACKHOLE : 0; + ro->ro_flags |= (rt_flags & RTF_GATEWAY) ? RT_HAS_GW : 0; +} +#endif + +/* * Routing statistics. */ struct rtstat { @@ -233,8 +274,8 @@ struct rt_msghdr { #define RTM_REDIRECT 0x6 /* Told to use different route */ #define RTM_MISS 0x7 /* Lookup failed on this address */ #define RTM_LOCK 0x8 /* fix specified metrics */ -#define RTM_OLDADD 0x9 /* caused by SIOCADDRT */ -#define RTM_OLDDEL 0xa /* caused by SIOCDELRT */ + /* 0x9 */ + /* 0xa */ #define RTM_RESOLVE 0xb /* req to resolve dst to LL addr */ #define RTM_NEWADDR 0xc /* address being added to iface */ #define RTM_DELADDR 0xd /* address being removed from iface */ @@ -282,12 +323,19 @@ struct rt_msghdr { #define RTAX_BRD 7 /* for NEWADDR, broadcast or p-p dest addr */ #define RTAX_MAX 8 /* size of array to allocate */ +typedef int rt_filter_f_t(const struct rtentry *, void *); + struct rt_addrinfo { - int rti_addrs; - struct sockaddr *rti_info[RTAX_MAX]; - int rti_flags; - struct ifaddr *rti_ifa; - struct ifnet *rti_ifp; + int rti_addrs; /* Route RTF_ flags */ + int rti_flags; /* Route RTF_ flags */ + struct sockaddr 
*rti_info[RTAX_MAX]; /* Sockaddr data */ + struct ifaddr *rti_ifa; /* value of rt_ifa addr */ + struct ifnet *rti_ifp; /* route interface */ + rt_filter_f_t *rti_filter; /* filter function */ + void *rti_filterdata; /* filter paramenters */ + u_long rti_mflags; /* metrics RTV_ flags */ + u_long rti_spare; /* Will be used for fib */ + struct rt_metrics *rti_rmx; /* Pointer to route metrics */ }; /* @@ -302,17 +350,25 @@ struct rt_addrinfo { sizeof(long) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(long) - 1) ) ) +#define sa_equal(a, b) ( \ + (((const struct sockaddr *)(a))->sa_len == ((const struct sockaddr *)(b))->sa_len) && \ + (bcmp((a), (b), ((const struct sockaddr *)(b))->sa_len) == 0)) + #ifdef _KERNEL #define RT_LINK_IS_UP(ifp) (!((ifp)->if_capabilities & IFCAP_LINKSTATE) \ || (ifp)->if_link_state == LINK_STATE_UP) #define RT_LOCK_INIT(_rt) \ - mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK) + mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK | MTX_NEW) #define RT_LOCK(_rt) mtx_lock(&(_rt)->rt_mtx) #define RT_UNLOCK(_rt) mtx_unlock(&(_rt)->rt_mtx) #define RT_LOCK_DESTROY(_rt) mtx_destroy(&(_rt)->rt_mtx) #define RT_LOCK_ASSERT(_rt) mtx_assert(&(_rt)->rt_mtx, MA_OWNED) +#define RT_UNLOCK_COND(_rt) do { \ + if (mtx_owned(&(_rt)->rt_mtx)) \ + mtx_unlock(&(_rt)->rt_mtx); \ +} while (0) #define RT_ADDREF(_rt) do { \ RT_LOCK_ASSERT(_rt); \ @@ -349,6 +405,7 @@ struct rt_addrinfo { if ((_ro)->ro_flags & RT_NORTREF) { \ (_ro)->ro_flags &= ~RT_NORTREF; \ (_ro)->ro_rt = NULL; \ + (_ro)->ro_lle = NULL; \ } else { \ RT_LOCK((_ro)->ro_rt); \ RTFREE_LOCKED((_ro)->ro_rt); \ @@ -356,9 +413,24 @@ struct rt_addrinfo { } \ } while (0) -struct radix_node_head *rt_tables_get_rnh(int, int); +/* + * Validate a cached route based on a supplied cookie. If there is an + * out-of-date cache, simply free it. 
Update the generation number + * for the new allocation + */ +#define RT_VALIDATE(ro, cookiep, fibnum) do { \ + rt_gen_t cookie = RT_GEN(fibnum, (ro)->ro_dst.sa_family); \ + if (*(cookiep) != cookie) { \ + if ((ro)->ro_rt != NULL) { \ + RTFREE((ro)->ro_rt); \ + (ro)->ro_rt = NULL; \ + } \ + *(cookiep) = cookie; \ + } \ +} while (0) struct ifmultiaddr; +struct rib_head; void rt_ieee80211msg(struct ifnet *, int, void *, size_t); void rt_ifannouncemsg(struct ifnet *, int); @@ -372,6 +444,9 @@ int rt_routemsg(int, struct ifnet *ifp, int, struct rtentry *, int); void rt_newmaddrmsg(int, struct ifmultiaddr *); int rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *); void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); +struct rib_head *rt_table_init(int); +void rt_table_destroy(struct rib_head *); +u_int rt_tables_get_gen(int table, int fam); int rtsock_addrmsg(int, struct ifaddr *, int); int rtsock_routemsg(int, struct ifnet *ifp, int, struct rtentry *, int); @@ -379,8 +454,6 @@ int rtsock_routemsg(int, struct ifnet *ifp, int, struct rtentry *, int); /* * Note the following locking behavior: * - * rtalloc_ign() and rtalloc() return ro->ro_rt unlocked - * * rtalloc1() returns a locked rtentry * * rtfree() and RTFREE_LOCKED() require a locked rtentry @@ -388,27 +461,20 @@ int rtsock_routemsg(int, struct ifnet *ifp, int, struct rtentry *, int); * RTFREE() uses an unlocked entry. 
*/ -int rtexpunge(struct rtentry *); void rtfree(struct rtentry *); -int rt_check(struct rtentry **, struct rtentry **, struct sockaddr *); +void rt_updatemtu(struct ifnet *); + +typedef int rt_walktree_f_t(struct rtentry *, void *); +typedef void rt_setwarg_t(struct rib_head *, uint32_t, int, void *); +void rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *); +void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg); +void rt_flushifroutes_af(struct ifnet *, int); +void rt_flushifroutes(struct ifnet *ifp); /* XXX MRT COMPAT VERSIONS THAT SET UNIVERSE to 0 */ /* Thes are used by old code not yet converted to use multiple FIBS */ -int rt_getifa(struct rt_addrinfo *); -void rtalloc_ign(struct route *ro, u_long ignflags); -void rtalloc(struct route *ro); /* XXX deprecated, use rtalloc_ign(ro, 0) */ struct rtentry *rtalloc1(struct sockaddr *, int, u_long); int rtinit(struct ifaddr *, int, int); -int rtioctl(u_long, caddr_t); -void rtredirect(struct sockaddr *, struct sockaddr *, - struct sockaddr *, int, struct sockaddr *); -int rtrequest(int, struct sockaddr *, - struct sockaddr *, struct sockaddr *, int, struct rtentry **); - -#ifndef BURN_BRIDGES -/* defaults to "all" FIBs */ -int rtinit_fib(struct ifaddr *, int, int); -#endif /* XXX MRT NEW VERSIONS THAT USE FIBs * For now the protocol indepedent versions are the same as the AF_INET ones @@ -416,7 +482,6 @@ int rtinit_fib(struct ifaddr *, int, int); */ int rt_getifa_fib(struct rt_addrinfo *, u_int fibnum); void rtalloc_ign_fib(struct route *ro, u_long ignflags, u_int fibnum); -void rtalloc_fib(struct route *ro, u_int fibnum); struct rtentry *rtalloc1_fib(struct sockaddr *, int, u_long, u_int); int rtioctl_fib(u_long, caddr_t, u_int); void rtredirect_fib(struct sockaddr *, struct sockaddr *, @@ -424,13 +489,10 @@ void rtredirect_fib(struct sockaddr *, struct sockaddr *, int rtrequest_fib(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int); 
int rtrequest1_fib(int, struct rt_addrinfo *, struct rtentry **, u_int); +int rib_lookup_info(uint32_t, const struct sockaddr *, uint32_t, uint32_t, + struct rt_addrinfo *); +void rib_free_info(struct rt_addrinfo *info); -#include <sys/eventhandler.h> -typedef void (*rtevent_arp_update_fn)(void *, struct rtentry *, uint8_t *, struct sockaddr *); -typedef void (*rtevent_redirect_fn)(void *, struct rtentry *, struct rtentry *, struct sockaddr *); -/* route_arp_update_event is no longer generated; see arp_update_event */ -EVENTHANDLER_DECLARE(route_arp_update_event, rtevent_arp_update_fn); -EVENTHANDLER_DECLARE(route_redirect_event, rtevent_redirect_fn); #endif #endif diff --git a/freebsd/sys/net/route_var.h b/freebsd/sys/net/route_var.h new file mode 100644 index 00000000..a8ef56a5 --- /dev/null +++ b/freebsd/sys/net/route_var.h @@ -0,0 +1,76 @@ +/*- + * Copyright (c) 2015-2016 + * Alexander V. Chernikov <melifaro@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NET_ROUTE_VAR_H_ +#define _NET_ROUTE_VAR_H_ + +struct rib_head { + struct radix_head head; + rn_matchaddr_f_t *rnh_matchaddr; /* longest match for sockaddr */ + rn_addaddr_f_t *rnh_addaddr; /* add based on sockaddr*/ + rn_deladdr_f_t *rnh_deladdr; /* remove based on sockaddr */ + rn_lookup_f_t *rnh_lookup; /* exact match for sockaddr */ + rn_walktree_t *rnh_walktree; /* traverse tree */ + rn_walktree_from_t *rnh_walktree_from; /* traverse tree below a */ + rn_close_t *rnh_close; /*do something when the last ref drops*/ + rt_gen_t rnh_gen; /* generation counter */ + int rnh_multipath; /* multipath capable ? */ + struct radix_node rnh_nodes[3]; /* empty tree for common case */ + struct rwlock rib_lock; /* config/data path lock */ + struct radix_mask_head rmhead; /* masks radix head */ +}; + +#define RIB_RLOCK(rh) rw_rlock(&(rh)->rib_lock) +#define RIB_RUNLOCK(rh) rw_runlock(&(rh)->rib_lock) +#define RIB_WLOCK(rh) rw_wlock(&(rh)->rib_lock) +#define RIB_WUNLOCK(rh) rw_wunlock(&(rh)->rib_lock) +#define RIB_LOCK_ASSERT(rh) rw_assert(&(rh)->rib_lock, RA_LOCKED) +#define RIB_WLOCK_ASSERT(rh) rw_assert(&(rh)->rib_lock, RA_WLOCKED) + +struct rib_head *rt_tables_get_rnh(int fib, int family); + +/* rte<>nhop translation */ +static inline uint16_t +fib_rte_to_nh_flags(int rt_flags) +{ + uint16_t res; + + res = (rt_flags & RTF_REJECT) ? NHF_REJECT : 0; + res |= (rt_flags & RTF_BLACKHOLE) ? 
NHF_BLACKHOLE : 0; + res |= (rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) ? NHF_REDIRECT : 0; + res |= (rt_flags & RTF_BROADCAST) ? NHF_BROADCAST : 0; + res |= (rt_flags & RTF_GATEWAY) ? NHF_GATEWAY : 0; + + return (res); +} + + +#endif diff --git a/freebsd/sys/net/rss_config.h b/freebsd/sys/net/rss_config.h new file mode 100644 index 00000000..2ab32a43 --- /dev/null +++ b/freebsd/sys/net/rss_config.h @@ -0,0 +1,138 @@ +/*- + * Copyright (c) 2010-2011 Juniper Networks, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NET_RSS_CONFIG_H_ +#define _NET_RSS_CONFIG_H_ + +#include <netinet/in.h> /* in_addr_t */ + +/* + * Supported RSS hash functions. + */ +#define RSS_HASH_NAIVE 0x00000001 /* Poor but fast hash. */ +#define RSS_HASH_TOEPLITZ 0x00000002 /* Required by RSS. */ +#define RSS_HASH_CRC32 0x00000004 /* Future; some NICs do it. */ + +#define RSS_HASH_MASK (RSS_HASH_NAIVE | RSS_HASH_TOEPLITZ) + +/* + * Instances of struct inpcbinfo declare an RSS hash type indicating what + * header fields are covered. + */ +#define RSS_HASHFIELDS_NONE 0 +#define RSS_HASHFIELDS_4TUPLE 1 +#define RSS_HASHFIELDS_2TUPLE 2 + +/* + * Define RSS representations of the M_HASHTYPE_* values, representing + * which particular bits are supported. The NICs can then use this to + * calculate which hash types to enable and which not to enable. + * + * The fact that these line up with M_HASHTYPE_* is not to be relied + * upon. + */ +#define RSS_HASHTYPE_RSS_IPV4 (1 << 1) /* IPv4 2-tuple */ +#define RSS_HASHTYPE_RSS_TCP_IPV4 (1 << 2) /* TCPv4 4-tuple */ +#define RSS_HASHTYPE_RSS_IPV6 (1 << 3) /* IPv6 2-tuple */ +#define RSS_HASHTYPE_RSS_TCP_IPV6 (1 << 4) /* TCPv6 4-tuple */ +#define RSS_HASHTYPE_RSS_IPV6_EX (1 << 5) /* IPv6 2-tuple + ext hdrs */ +#define RSS_HASHTYPE_RSS_TCP_IPV6_EX (1 << 6) /* TCPv6 4-tiple + ext hdrs */ +#define RSS_HASHTYPE_RSS_UDP_IPV4 (1 << 7) /* IPv4 UDP 4-tuple */ +#define RSS_HASHTYPE_RSS_UDP_IPV4_EX (1 << 8) /* IPv4 UDP 4-tuple + ext hdrs */ +#define RSS_HASHTYPE_RSS_UDP_IPV6 (1 << 9) /* IPv6 UDP 4-tuple */ +#define RSS_HASHTYPE_RSS_UDP_IPV6_EX (1 << 10) /* IPv6 UDP 4-tuple + ext hdrs */ + +/* + * Compile-time limits on the size of the indirection table. + */ +#define RSS_MAXBITS 7 +#define RSS_TABLE_MAXLEN (1 << RSS_MAXBITS) + +/* + * Maximum key size used throughout. It's OK for hardware to use only the + * first 16 bytes, which is all that's required for IPv4. 
+ */ +#define RSS_KEYSIZE 40 + +/* + * For RSS hash methods that do a software hash on an mbuf, the packet + * direction (ingress / egress) is required. + * + * The default direction (INGRESS) is the "receive into the NIC" - ie, + * what the hardware is hashing on. + */ +#define RSS_HASH_PKT_INGRESS 0 +#define RSS_HASH_PKT_EGRESS 1 + +/* + * Rate limited debugging routines. + */ +#define RSS_DEBUG(format, ...) do { \ + if (rss_debug) { \ + static struct timeval lastfail; \ + static int curfail; \ + if (ppsratecheck(&lastfail, &curfail, 5)) \ + printf("RSS (%s:%u): " format, __func__, __LINE__,\ + ##__VA_ARGS__); \ + } \ +} while (0) + +extern int rss_debug; + +/* + * Device driver interfaces to query RSS properties that must be programmed + * into hardware. + */ +u_int rss_getbits(void); +u_int rss_getbucket(u_int hash); +u_int rss_get_indirection_to_bucket(u_int index); +u_int rss_getcpu(u_int bucket); +void rss_getkey(uint8_t *key); +u_int rss_gethashalgo(void); +u_int rss_getnumbuckets(void); +u_int rss_getnumcpus(void); +u_int rss_gethashconfig(void); + +/* + * Hash calculation functions. + */ +uint32_t rss_hash(u_int datalen, const uint8_t *data); + +/* + * Network stack interface to query desired CPU affinity of a packet. 
+ */ +struct mbuf * rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid); +u_int rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type); +int rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, + uint32_t *bucket_id); +int rss_m2bucket(struct mbuf *m, uint32_t *bucket_id); + +#endif /* !_NET_RSS_CONFIG_H_ */ diff --git a/freebsd/sys/net/rtsock.c b/freebsd/sys/net/rtsock.c index e768e17b..1e69bcdf 100644 --- a/freebsd/sys/net/rtsock.c +++ b/freebsd/sys/net/rtsock.c @@ -54,17 +54,21 @@ #include <sys/systm.h> #include <net/if.h> +#include <net/if_var.h> #include <net/if_dl.h> #include <net/if_llatbl.h> #include <net/if_types.h> #include <net/netisr.h> #include <net/raw_cb.h> #include <net/route.h> +#include <net/route_var.h> #include <net/vnet.h> #include <netinet/in.h> #include <netinet/if_ether.h> +#include <netinet/ip_carp.h> #ifdef INET6 +#include <netinet6/ip6_var.h> #include <netinet6/scope6_var.h> #endif @@ -72,34 +76,6 @@ #include <sys/mount.h> #include <compat/freebsd32/freebsd32.h> -struct if_data32 { - uint8_t ifi_type; - uint8_t ifi_physical; - uint8_t ifi_addrlen; - uint8_t ifi_hdrlen; - uint8_t ifi_link_state; - uint8_t ifi_spare_char1; - uint8_t ifi_spare_char2; - uint8_t ifi_datalen; - uint32_t ifi_mtu; - uint32_t ifi_metric; - uint32_t ifi_baudrate; - uint32_t ifi_ipackets; - uint32_t ifi_ierrors; - uint32_t ifi_opackets; - uint32_t ifi_oerrors; - uint32_t ifi_collisions; - uint32_t ifi_ibytes; - uint32_t ifi_obytes; - uint32_t ifi_imcasts; - uint32_t ifi_omcasts; - uint32_t ifi_iqdrops; - uint32_t ifi_noproto; - uint32_t ifi_hwassist; - int32_t ifi_epoch; - struct timeval32 ifi_lastchange; -}; - struct if_msghdr32 { uint16_t ifm_msglen; uint8_t ifm_version; @@ -107,7 +83,7 @@ struct if_msghdr32 { int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; - struct if_data32 ifm_data; + struct if_data ifm_data; }; struct if_msghdrl32 { @@ -120,7 +96,7 @@ struct if_msghdrl32 { uint16_t _ifm_spare1; uint16_t ifm_len; uint16_t ifm_data_off; - 
struct if_data32 ifm_data; + struct if_data ifm_data; }; struct ifa_msghdrl32 { @@ -134,7 +110,7 @@ struct ifa_msghdrl32 { uint16_t ifam_len; uint16_t ifam_data_off; int32_t ifam_metric; - struct if_data32 ifam_data; + struct if_data ifam_data; }; #endif /* COMPAT_FREEBSD32 */ @@ -144,18 +120,22 @@ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; +/* These are external hooks for CARP. */ +int (*carp_get_vhid_p)(struct ifaddr *); + /* * Used by rtsock/raw_input callback code to decide whether to filter the update * notification to a socket bound to a particular FIB. */ #define RTS_FILTER_FIB M_PROTO8 -static struct { +typedef struct { int ip_count; /* attached w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ - int ipx_count; /* attached w/ AF_IPX */ int any_count; /* total attached */ -} route_cb; +} route_cb_t; +static VNET_DEFINE(route_cb_t, route_cb); +#define V_route_cb VNET(route_cb) struct mtx rtsock_mtx; MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); @@ -174,20 +154,19 @@ struct walkarg { }; static void rts_input(struct mbuf *m); -static struct mbuf *rt_msg1(int type, struct rt_addrinfo *rtinfo); -static int rt_msg2(int type, struct rt_addrinfo *rtinfo, - caddr_t cp, struct walkarg *w); +static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo); +static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, + struct walkarg *w, int *plen); static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo); static int sysctl_dumpentry(struct radix_node *rn, void *vw); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_ifmalist(int af, struct walkarg *w); -static int route_output(struct mbuf *m, struct socket *so); -static void rt_setmetrics(u_long which, const struct rt_metrics *in, - struct rt_metrics_lite *out); -static void rt_getmetrics(const struct 
rt_metrics_lite *in, - struct rt_metrics *out); +static int route_output(struct mbuf *m, struct socket *so, ...); +static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out); static void rt_dispatch(struct mbuf *, sa_family_t); +static struct sockaddr *rtsock_fix_netmask(struct sockaddr *dst, + struct sockaddr *smask, struct sockaddr_storage *dmask); static struct netisr_handler rtsock_nh = { .nh_name = "rtsock", @@ -214,17 +193,35 @@ SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW, "maximum routing socket dispatch queue length"); static void -rts_init(void) +vnet_rts_init(void) { int tmp; + if (IS_DEFAULT_VNET(curvnet)) { #ifndef __rtems__ - if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) - rtsock_nh.nh_qlimit = tmp; + if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) + rtsock_nh.nh_qlimit = tmp; +#endif /* __rtems__ */ + netisr_register(&rtsock_nh); + } +#ifdef VIMAGE + else + netisr_register_vnet(&rtsock_nh); #endif - netisr_register(&rtsock_nh); } -SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0); +VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, + vnet_rts_init, 0); + +#ifdef VIMAGE +static void +vnet_rts_uninit(void) +{ + + netisr_unregister_vnet(&rtsock_nh); +} +VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, + vnet_rts_uninit, 0); +#endif static int raw_input_rts_cb(struct mbuf *m, struct sockproto *proto, struct sockaddr *src, @@ -294,23 +291,13 @@ static int rts_attach(struct socket *so, int proto, struct thread *td) { struct rawcb *rp; - int s, error; + int error; KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL")); /* XXX */ rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO); - if (rp == NULL) - return ENOBUFS; - /* - * The splnet() is necessary to block protocols from sending - * error notifications (like RTM_REDIRECT or RTM_LOSING) while - * this PCB is extant but incompletely initialized. 
- * Probably we should try to do more of this work beforehand and - * eliminate the spl. - */ - s = splnet(); so->so_pcb = (caddr_t)rp; #ifndef __rtems__ so->so_fibnum = td->td_proc->p_fibnum; @@ -320,7 +307,6 @@ rts_attach(struct socket *so, int proto, struct thread *td) error = raw_attach(so, proto); rp = sotorawcb(so); if (error) { - splx(s); so->so_pcb = NULL; free(rp, M_PCB); return error; @@ -328,20 +314,16 @@ rts_attach(struct socket *so, int proto, struct thread *td) RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: - route_cb.ip_count++; + V_route_cb.ip_count++; break; case AF_INET6: - route_cb.ip6_count++; - break; - case AF_IPX: - route_cb.ipx_count++; + V_route_cb.ip6_count++; break; } - route_cb.any_count++; + V_route_cb.any_count++; RTSOCK_UNLOCK(); soisconnected(so); so->so_options |= SO_USELOOPBACK; - splx(s); return 0; } @@ -372,16 +354,13 @@ rts_detach(struct socket *so) RTSOCK_LOCK(); switch(rp->rcb_proto.sp_protocol) { case AF_INET: - route_cb.ip_count--; + V_route_cb.ip_count--; break; case AF_INET6: - route_cb.ip6_count--; - break; - case AF_IPX: - route_cb.ipx_count--; + V_route_cb.ip6_count--; break; } - route_cb.any_count--; + V_route_cb.any_count--; RTSOCK_UNLOCK(); raw_usrreqs.pru_detach(so); } @@ -562,17 +541,25 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, /*ARGSUSED*/ static int -route_output(struct mbuf *m, struct socket *so) +route_output(struct mbuf *m, struct socket *so, ...) 
{ -#define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0) struct rt_msghdr *rtm = NULL; struct rtentry *rt = NULL; - struct radix_node_head *rnh; + struct rib_head *rnh; struct rt_addrinfo info; - int len, error = 0; + struct sockaddr_storage ss; +#ifdef INET6 + struct sockaddr_in6 *sin6; + int i, rti_need_deembed = 0; +#endif + int alloc_len = 0, len, error = 0, fibnum; struct ifnet *ifp = NULL; union sockaddr_union saun; sa_family_t saf = AF_UNSPEC; + struct rawcb *rp = NULL; + struct walkarg w; + + fibnum = so->so_fibnum; #define senderr(e) { error = e; goto flush;} if (m == NULL || ((m->m_len < sizeof(long)) && @@ -582,31 +569,53 @@ route_output(struct mbuf *m, struct socket *so) panic("route_output"); len = m->m_pkthdr.len; if (len < sizeof(*rtm) || - len != mtod(m, struct rt_msghdr *)->rtm_msglen) { - info.rti_info[RTAX_DST] = NULL; + len != mtod(m, struct rt_msghdr *)->rtm_msglen) senderr(EINVAL); - } - R_Malloc(rtm, struct rt_msghdr *, len); - if (rtm == NULL) { - info.rti_info[RTAX_DST] = NULL; + + /* + * Most of current messages are in range 200-240 bytes, + * minimize possible re-allocation on reply using larger size + * buffer aligned on 1k boundaty. + */ + alloc_len = roundup2(len, 1024); + if ((rtm = malloc(alloc_len, M_TEMP, M_NOWAIT)) == NULL) senderr(ENOBUFS); - } + m_copydata(m, 0, len, (caddr_t)rtm); + bzero(&info, sizeof(info)); + bzero(&w, sizeof(w)); + if (rtm->rtm_version != RTM_VERSION) { - info.rti_info[RTAX_DST] = NULL; + /* Do not touch message since format is unknown */ + free(rtm, M_TEMP); + rtm = NULL; senderr(EPROTONOSUPPORT); } + + /* + * Starting from here, it is possible + * to alter original message and insert + * caller PID and error value. 
+ */ + #ifndef __rtems__ rtm->rtm_pid = curproc->p_pid; #else /* __rtems__ */ rtm->rtm_pid = BSD_DEFAULT_PID; #endif /* __rtems__ */ - bzero(&info, sizeof(info)); info.rti_addrs = rtm->rtm_addrs; - if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) { - info.rti_info[RTAX_DST] = NULL; + + info.rti_mflags = rtm->rtm_inits; + info.rti_rmx = &rtm->rtm_rmx; + + /* + * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6 + * link-local address because rtrequest requires addresses with + * embedded scope id. + */ + if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) senderr(EINVAL); - } + info.rti_flags = rtm->rtm_flags; if (info.rti_info[RTAX_DST] == NULL || info.rti_info[RTAX_DST]->sa_family >= AF_MAX || @@ -634,11 +643,16 @@ route_output(struct mbuf *m, struct socket *so) */ if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) { - struct route gw_ro; + struct rt_addrinfo ginfo; + struct sockaddr *gdst; + + bzero(&ginfo, sizeof(ginfo)); + bzero(&ss, sizeof(ss)); + ss.ss_len = sizeof(ss); + + ginfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&ss; + gdst = info.rti_info[RTAX_GATEWAY]; - bzero(&gw_ro, sizeof(gw_ro)); - gw_ro.ro_dst = *info.rti_info[RTAX_GATEWAY]; - rtalloc_ign_fib(&gw_ro, 0, so->so_fibnum); /* * A host route through the loopback interface is * installed for each interface adddress. In pre 8.0 @@ -649,18 +663,21 @@ route_output(struct mbuf *m, struct socket *so) * AF_LINK sa_family type of the rt_gateway, and the * rt_ifp has the IFF_LOOPBACK flag set. 
*/ - if (gw_ro.ro_rt != NULL && - gw_ro.ro_rt->rt_gateway->sa_family == AF_LINK && - gw_ro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) - info.rti_flags &= ~RTF_GATEWAY; - if (gw_ro.ro_rt != NULL) - RTFREE(gw_ro.ro_rt); + if (rib_lookup_info(fibnum, gdst, NHR_REF, 0, &ginfo) == 0) { + if (ss.ss_family == AF_LINK && + ginfo.rti_ifp->if_flags & IFF_LOOPBACK) { + info.rti_flags &= ~RTF_GATEWAY; + info.rti_flags |= RTF_GWFLAG_COMPAT; + } + rib_free_info(&ginfo); + } } switch (rtm->rtm_type) { struct rtentry *saved_nrt; case RTM_ADD: + case RTM_CHANGE: if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); saved_nrt = NULL; @@ -669,14 +686,19 @@ route_output(struct mbuf *m, struct socket *so) if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); +#ifdef INET6 + if (error == 0) + rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; +#endif break; } - error = rtrequest1_fib(RTM_ADD, &info, &saved_nrt, - so->so_fibnum); - if (error == 0 && saved_nrt) { + error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt, + fibnum); + if (error == 0 && saved_nrt != NULL) { +#ifdef INET6 + rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; +#endif RT_LOCK(saved_nrt); - rt_setmetrics(rtm->rtm_inits, - &rtm->rtm_rmx, &saved_nrt->rt_rmx); rtm->rtm_index = saved_nrt->rt_ifp->if_index; RT_REMREF(saved_nrt); RT_UNLOCK(saved_nrt); @@ -690,26 +712,30 @@ route_output(struct mbuf *m, struct socket *so) (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) && (rtm->rtm_flags & RTF_LLDATA) != 0) { error = lla_rt_output(rtm, &info); +#ifdef INET6 + if (error == 0) + rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; +#endif break; } - error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, - so->so_fibnum); + error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum); if (error == 0) { RT_LOCK(saved_nrt); rt = saved_nrt; goto report; } +#ifdef INET6 + /* rt_msg2() will not be used when RTM_DELETE fails. 
*/ + rti_need_deembed = (V_deembed_scopeid) ? 1 : 0; +#endif break; case RTM_GET: - case RTM_CHANGE: - case RTM_LOCK: - rnh = rt_tables_get_rnh(so->so_fibnum, - info.rti_info[RTAX_DST]->sa_family); + rnh = rt_tables_get_rnh(fibnum, saf); if (rnh == NULL) senderr(EAFNOSUPPORT); - RADIX_NODE_HEAD_RLOCK(rnh); + RIB_RLOCK(rnh); if (info.rti_info[RTAX_NETMASK] == NULL && rtm->rtm_type == RTM_GET) { @@ -719,14 +745,14 @@ route_output(struct mbuf *m, struct socket *so) * 'route -n get addr' */ rt = (struct rtentry *) rnh->rnh_matchaddr( - info.rti_info[RTAX_DST], rnh); + info.rti_info[RTAX_DST], &rnh->head); } else rt = (struct rtentry *) rnh->rnh_lookup( info.rti_info[RTAX_DST], - info.rti_info[RTAX_NETMASK], rnh); + info.rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) { - RADIX_NODE_HEAD_RUNLOCK(rnh); + RIB_RUNLOCK(rnh); senderr(ESRCH); } #ifdef RADIX_MPATH @@ -738,11 +764,11 @@ route_output(struct mbuf *m, struct socket *so) * if gate == NULL the first match is returned. * (no need to call rt_mpath_matchgate if gate == NULL) */ - if (rn_mpath_capable(rnh) && + if (rt_mpath_capable(rnh) && (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) { rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]); if (!rt) { - RADIX_NODE_HEAD_RUNLOCK(rnh); + RIB_RUNLOCK(rnh); senderr(ESRCH); } } @@ -760,7 +786,8 @@ route_output(struct mbuf *m, struct socket *so) rt->rt_ifp->if_type == IFT_PROPVIRTUAL) { struct ifaddr *ifa; - ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1); + ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1, + RT_ALL_FIBS); if (ifa != NULL) rt_maskedcopy(ifa->ifa_addr, &laddr, @@ -772,139 +799,81 @@ route_output(struct mbuf *m, struct socket *so) /* * refactor rt and no lock operation necessary */ - rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, rnh); + rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, + &rnh->head); if (rt == NULL) { - RADIX_NODE_HEAD_RUNLOCK(rnh); + RIB_RUNLOCK(rnh); senderr(ESRCH); } } RT_LOCK(rt); RT_ADDREF(rt); - 
RADIX_NODE_HEAD_RUNLOCK(rnh); - - switch(rtm->rtm_type) { - - case RTM_GET: - report: - RT_LOCK_ASSERT(rt); - if ((rt->rt_flags & RTF_HOST) == 0 - ? jailed_without_vnet(curthread->td_ucred) - : prison_if(curthread->td_ucred, - rt_key(rt)) != 0) { - RT_UNLOCK(rt); - senderr(ESRCH); - } - info.rti_info[RTAX_DST] = rt_key(rt); - info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; - info.rti_info[RTAX_NETMASK] = rt_mask(rt); - info.rti_info[RTAX_GENMASK] = 0; - if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { - ifp = rt->rt_ifp; - if (ifp) { - info.rti_info[RTAX_IFP] = - ifp->if_addr->ifa_addr; - error = rtm_get_jailed(&info, ifp, rt, - &saun, curthread->td_ucred); - if (error != 0) { - RT_UNLOCK(rt); - senderr(error); - } - if (ifp->if_flags & IFF_POINTOPOINT) - info.rti_info[RTAX_BRD] = - rt->rt_ifa->ifa_dstaddr; - rtm->rtm_index = ifp->if_index; - } else { - info.rti_info[RTAX_IFP] = NULL; - info.rti_info[RTAX_IFA] = NULL; - } - } else if ((ifp = rt->rt_ifp) != NULL) { - rtm->rtm_index = ifp->if_index; - } - len = rt_msg2(rtm->rtm_type, &info, NULL, NULL); - if (len > rtm->rtm_msglen) { - struct rt_msghdr *new_rtm; - R_Malloc(new_rtm, struct rt_msghdr *, len); - if (new_rtm == NULL) { - RT_UNLOCK(rt); - senderr(ENOBUFS); - } - bcopy(rtm, new_rtm, rtm->rtm_msglen); - Free(rtm); rtm = new_rtm; - } - (void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, NULL); - rtm->rtm_flags = rt->rt_flags; - rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx); - rtm->rtm_addrs = info.rti_addrs; - break; - - case RTM_CHANGE: - /* - * New gateway could require new ifaddr, ifp; - * flags may also be different; ifp may be specified - * by ll sockaddr when protocol address is ambiguous - */ - if (((rt->rt_flags & RTF_GATEWAY) && - info.rti_info[RTAX_GATEWAY] != NULL) || - info.rti_info[RTAX_IFP] != NULL || - (info.rti_info[RTAX_IFA] != NULL && - !sa_equal(info.rti_info[RTAX_IFA], - rt->rt_ifa->ifa_addr))) { - RT_UNLOCK(rt); - RADIX_NODE_HEAD_LOCK(rnh); - error = rt_getifa_fib(&info, rt->rt_fibnum); - /* - * 
XXXRW: Really we should release this - * reference later, but this maintains - * historical behavior. - */ - if (info.rti_ifa != NULL) - ifa_free(info.rti_ifa); - RADIX_NODE_HEAD_UNLOCK(rnh); - if (error != 0) - senderr(error); - RT_LOCK(rt); - } - if (info.rti_ifa != NULL && - info.rti_ifa != rt->rt_ifa && - rt->rt_ifa != NULL && - rt->rt_ifa->ifa_rtrequest != NULL) { - rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, - &info); - ifa_free(rt->rt_ifa); - } - if (info.rti_info[RTAX_GATEWAY] != NULL) { - RT_UNLOCK(rt); - RADIX_NODE_HEAD_LOCK(rnh); - RT_LOCK(rt); - - error = rt_setgate(rt, rt_key(rt), - info.rti_info[RTAX_GATEWAY]); - RADIX_NODE_HEAD_UNLOCK(rnh); + RIB_RUNLOCK(rnh); + +report: + RT_LOCK_ASSERT(rt); + if ((rt->rt_flags & RTF_HOST) == 0 + ? jailed_without_vnet(curthread->td_ucred) + : prison_if(curthread->td_ucred, + rt_key(rt)) != 0) { + RT_UNLOCK(rt); + senderr(ESRCH); + } + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), + rt_mask(rt), &ss); + info.rti_info[RTAX_GENMASK] = 0; + if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { + ifp = rt->rt_ifp; + if (ifp) { + info.rti_info[RTAX_IFP] = + ifp->if_addr->ifa_addr; + error = rtm_get_jailed(&info, ifp, rt, + &saun, curthread->td_ucred); if (error != 0) { RT_UNLOCK(rt); senderr(error); } - rt->rt_flags |= (RTF_GATEWAY & info.rti_flags); + if (ifp->if_flags & IFF_POINTOPOINT) + info.rti_info[RTAX_BRD] = + rt->rt_ifa->ifa_dstaddr; + rtm->rtm_index = ifp->if_index; + } else { + info.rti_info[RTAX_IFP] = NULL; + info.rti_info[RTAX_IFA] = NULL; } - if (info.rti_ifa != NULL && - info.rti_ifa != rt->rt_ifa) { - ifa_ref(info.rti_ifa); - rt->rt_ifa = info.rti_ifa; - rt->rt_ifp = info.rti_ifp; + } else if ((ifp = rt->rt_ifp) != NULL) { + rtm->rtm_index = ifp->if_index; + } + + /* Check if we need to realloc storage */ + rtsock_msg_buffer(rtm->rtm_type, &info, NULL, &len); + if (len > alloc_len) { + struct rt_msghdr *new_rtm; + 
new_rtm = malloc(len, M_TEMP, M_NOWAIT); + if (new_rtm == NULL) { + RT_UNLOCK(rt); + senderr(ENOBUFS); } - /* Allow some flags to be toggled on change. */ - rt->rt_flags = (rt->rt_flags & ~RTF_FMASK) | - (rtm->rtm_flags & RTF_FMASK); - rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, - &rt->rt_rmx); - rtm->rtm_index = rt->rt_ifp->if_index; - if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, &info); - /* FALLTHROUGH */ - case RTM_LOCK: - /* We don't support locks anymore */ - break; + bcopy(rtm, new_rtm, rtm->rtm_msglen); + free(rtm, M_TEMP); + rtm = new_rtm; + alloc_len = len; } + + w.w_tmem = (caddr_t)rtm; + w.w_tmemsize = alloc_len; + rtsock_msg_buffer(rtm->rtm_type, &info, &w, &len); + + if (rt->rt_flags & RTF_GWFLAG_COMPAT) + rtm->rtm_flags = RTF_GATEWAY | + (rt->rt_flags & ~RTF_GWFLAG_COMPAT); + else + rtm->rtm_flags = rt->rt_flags; + rt_getmetrics(rt, &rtm->rtm_rmx); + rtm->rtm_addrs = info.rti_addrs; + RT_UNLOCK(rt); break; @@ -913,39 +882,55 @@ route_output(struct mbuf *m, struct socket *so) } flush: - if (rtm) { - if (error) - rtm->rtm_errno = error; - else - rtm->rtm_flags |= RTF_DONE; - } - if (rt) /* XXX can this be true? */ + if (rt != NULL) RTFREE(rt); - { - struct rawcb *rp = NULL; /* * Check to see if we don't want our own messages. */ if ((so->so_options & SO_USELOOPBACK) == 0) { - if (route_cb.any_count <= 1) { - if (rtm) - Free(rtm); + if (V_route_cb.any_count <= 1) { + if (rtm != NULL) + free(rtm, M_TEMP); m_freem(m); return (error); } /* There is another listener, so construct message */ rp = sotorawcb(so); } - if (rtm) { + + if (rtm != NULL) { +#ifdef INET6 + if (rti_need_deembed) { + /* sin6_scope_id is recovered before sending rtm. 
*/ + sin6 = (struct sockaddr_in6 *)&ss; + for (i = 0; i < RTAX_MAX; i++) { + if (info.rti_info[i] == NULL) + continue; + if (info.rti_info[i]->sa_family != AF_INET6) + continue; + bcopy(info.rti_info[i], sin6, sizeof(*sin6)); + if (sa6_recoverscope(sin6) == 0) + bcopy(sin6, info.rti_info[i], + sizeof(*sin6)); + } + } +#endif + if (error != 0) + rtm->rtm_errno = error; + else + rtm->rtm_flags |= RTF_DONE; + m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); + + free(rtm, M_TEMP); } - if (m) { - M_SETFIB(m, so->so_fibnum); + if (m != NULL) { + M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; if (rp) { /* @@ -959,43 +944,21 @@ flush: } else rt_dispatch(m, saf); } - /* info.rti_info[RTAX_DST] (used above) can point inside of rtm */ - if (rtm) - Free(rtm); - } + return (error); -#undef sa_equal } static void -rt_setmetrics(u_long which, const struct rt_metrics *in, - struct rt_metrics_lite *out) +rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out) { -#define metric(f, e) if (which & (f)) out->e = in->e; - /* - * Only these are stored in the routing entry since introduction - * of tcp hostcache. The rest is ignored. - */ - metric(RTV_MTU, rmx_mtu); - metric(RTV_WEIGHT, rmx_weight); - /* Userland -> kernel timebase conversion. */ - if (which & RTV_EXPIRE) - out->rmx_expire = in->rmx_expire ? - in->rmx_expire - time_second + time_uptime : 0; -#undef metric -} -static void -rt_getmetrics(const struct rt_metrics_lite *in, struct rt_metrics *out) -{ -#define metric(e) out->e = in->e; bzero(out, sizeof(*out)); - metric(rmx_mtu); - metric(rmx_weight); + out->rmx_mtu = rt->rt_mtu; + out->rmx_weight = rt->rt_weight; + out->rmx_pksent = counter_u64_fetch(rt->rt_pksent); /* Kernel -> userland timebase conversion. */ - out->rmx_expire = in->rmx_expire ? 
- in->rmx_expire - time_uptime + time_second : 0; -#undef metric + out->rmx_expire = rt->rt_expire ? + rt->rt_expire - time_uptime + time_second : 0; } /* @@ -1030,6 +993,11 @@ rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) return (0); /* should be EINVAL but for compat */ } /* accept it */ +#ifdef INET6 + if (sa->sa_family == AF_INET6) + sa6_embedscope((struct sockaddr_in6 *)sa, + V_ip6_use_defzone); +#endif rtinfo->rti_info[i] = sa; cp += SA_SIZE(sa); } @@ -1037,15 +1005,42 @@ rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) } /* - * Used by the routing socket. + * Fill in @dmask with valid netmask leaving original @smask + * intact. Mostly used with radix netmasks. + */ +static struct sockaddr * +rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask, + struct sockaddr_storage *dmask) +{ + if (dst == NULL || smask == NULL) + return (NULL); + + memset(dmask, 0, dst->sa_len); + memcpy(dmask, smask, smask->sa_len); + dmask->ss_len = dst->sa_len; + dmask->ss_family = dst->sa_family; + + return ((struct sockaddr *)dmask); +} + +/* + * Writes information related to @rtinfo object to newly-allocated mbuf. + * Assumes MCLBYTES is enough to construct any message. + * Used for OS notifications of vaious events (if/ifa announces,etc) + * + * Returns allocated mbuf or NULL on failure. 
*/ static struct mbuf * -rt_msg1(int type, struct rt_addrinfo *rtinfo) +rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo) { struct rt_msghdr *rtm; struct mbuf *m; int i; struct sockaddr *sa; +#ifdef INET6 + struct sockaddr_storage ss; + struct sockaddr_in6 *sin6; +#endif int len, dlen; switch (type) { @@ -1072,20 +1067,17 @@ rt_msg1(int type, struct rt_addrinfo *rtinfo) default: len = sizeof(struct rt_msghdr); } - if (len > MCLBYTES) - panic("rt_msg1"); - m = m_gethdr(M_DONTWAIT, MT_DATA); - if (m && len > MHLEN) { - MCLGET(m, M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { - m_free(m); - m = NULL; - } - } + + /* XXXGL: can we use MJUMPAGESIZE cluster here? */ + KASSERT(len <= MCLBYTES, ("%s: message too big", __func__)); + if (len > MHLEN) + m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); + else + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (m); + m->m_pkthdr.len = m->m_len = len; - m->m_pkthdr.rcvif = NULL; rtm = mtod(m, struct rt_msghdr *); bzero((caddr_t)rtm, len); for (i = 0; i < RTAX_MAX; i++) { @@ -1093,6 +1085,14 @@ rt_msg1(int type, struct rt_addrinfo *rtinfo) continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); +#ifdef INET6 + if (V_deembed_scopeid && sa->sa_family == AF_INET6) { + sin6 = (struct sockaddr_in6 *)&ss; + bcopy(sa, sin6, sizeof(*sin6)); + if (sa6_recoverscope(sin6) == 0) + sa = (struct sockaddr *)sin6; + } +#endif m_copyback(m, len, dlen, (caddr_t)sa); len += dlen; } @@ -1107,17 +1107,26 @@ rt_msg1(int type, struct rt_addrinfo *rtinfo) } /* - * Used by the sysctl code and routing socket. + * Writes information related to @rtinfo object to preallocated buffer. + * Stores needed size in @plen. If @w is NULL, calculates size without + * writing. + * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation. + * + * Returns 0 on success. 
+ * */ static int -rt_msg2(int type, struct rt_addrinfo *rtinfo, caddr_t cp, struct walkarg *w) +rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen) { int i; - int len, dlen, second_time = 0; - caddr_t cp0; + int len, buflen = 0, dlen; + caddr_t cp = NULL; + struct rt_msghdr *rtm = NULL; +#ifdef INET6 + struct sockaddr_storage ss; + struct sockaddr_in6 *sin6; +#endif - rtinfo->rti_addrs = 0; -again: switch (type) { case RTM_DELADDR: @@ -1156,9 +1165,14 @@ again: default: len = sizeof(struct rt_msghdr); } - cp0 = cp; - if (cp0) - cp += len; + + if (w != NULL) { + rtm = (struct rt_msghdr *)w->w_tmem; + buflen = w->w_tmemsize - len; + cp = (caddr_t)w->w_tmem + len; + } + + rtinfo->rti_addrs = 0; for (i = 0; i < RTAX_MAX; i++) { struct sockaddr *sa; @@ -1166,45 +1180,56 @@ again: continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); - if (cp) { + if (cp != NULL && buflen >= dlen) { +#ifdef INET6 + if (V_deembed_scopeid && sa->sa_family == AF_INET6) { + sin6 = (struct sockaddr_in6 *)&ss; + bcopy(sa, sin6, sizeof(*sin6)); + if (sa6_recoverscope(sin6) == 0) + sa = (struct sockaddr *)sin6; + } +#endif bcopy((caddr_t)sa, cp, (unsigned)dlen); cp += dlen; + buflen -= dlen; + } else if (cp != NULL) { + /* + * Buffer too small. Count needed size + * and return with error. 
+ */ + cp = NULL; } + len += dlen; } - len = ALIGN(len); - if (cp == NULL && w != NULL && !second_time) { - struct walkarg *rw = w; - - if (rw->w_req) { - if (rw->w_tmemsize < len) { - if (rw->w_tmem) - free(rw->w_tmem, M_RTABLE); - rw->w_tmem = (caddr_t) - malloc(len, M_RTABLE, M_NOWAIT); - if (rw->w_tmem) - rw->w_tmemsize = len; - } - if (rw->w_tmem) { - cp = rw->w_tmem; - second_time = 1; - goto again; - } - } + + if (cp != NULL) { + dlen = ALIGN(len) - len; + if (buflen < dlen) + cp = NULL; + else + buflen -= dlen; } - if (cp) { - struct rt_msghdr *rtm = (struct rt_msghdr *)cp0; + len = ALIGN(len); + if (cp != NULL) { + /* fill header iff buffer is large enough */ rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; rtm->rtm_msglen = len; } - return (len); + + *plen = len; + + if (w != NULL && cp == NULL) + return (ENOBUFS); + + return (0); } /* * This routine is called to generate a message from the routing - * socket indicating that a redirect has occured, a routing lookup + * socket indicating that a redirect has occurred, a routing lookup * has failed, or that a protocol has detected timeouts to a particular * destination. 
*/ @@ -1216,9 +1241,9 @@ rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error, struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; - if (route_cb.any_count == 0) + if (V_route_cb.any_count == 0) return; - m = rt_msg1(type, rtinfo); + m = rtsock_msg_mbuf(type, rtinfo); if (m == NULL) return; @@ -1254,16 +1279,16 @@ rt_ifmsg(struct ifnet *ifp) struct mbuf *m; struct rt_addrinfo info; - if (route_cb.any_count == 0) + if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); - m = rt_msg1(RTM_IFINFO, &info); + m = rtsock_msg_mbuf(RTM_IFINFO, &info); if (m == NULL) return; ifm = mtod(m, struct if_msghdr *); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; - ifm->ifm_data = ifp->if_data; + if_data_copy(ifp, &ifm->ifm_data); ifm->ifm_addrs = 0; rt_dispatch(m, AF_UNSPEC); } @@ -1283,8 +1308,9 @@ rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) struct mbuf *m; struct ifa_msghdr *ifam; struct ifnet *ifp = ifa->ifa_ifp; + struct sockaddr_storage ss; - if (route_cb.any_count == 0) + if (V_route_cb.any_count == 0) return (0); ncmd = cmd == RTM_ADD ? 
RTM_NEWADDR : RTM_DELADDR; @@ -1292,13 +1318,14 @@ rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; - info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; + info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( + info.rti_info[RTAX_IFP], ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; - if ((m = rt_msg1(ncmd, &info)) == NULL) + if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL) return (ENOBUFS); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; - ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_addrs = info.rti_addrs; @@ -1331,15 +1358,16 @@ rtsock_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, struct sockaddr *sa; struct mbuf *m; struct rt_msghdr *rtm; + struct sockaddr_storage ss; - if (route_cb.any_count == 0) + if (V_route_cb.any_count == 0) return (0); bzero((caddr_t)&info, sizeof(info)); - info.rti_info[RTAX_NETMASK] = rt_mask(rt); info.rti_info[RTAX_DST] = sa = rt_key(rt); + info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(sa, rt_mask(rt), &ss); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; - if ((m = rt_msg1(cmd, &info)) == NULL) + if ((m = rtsock_msg_mbuf(cmd, &info)) == NULL) return (ENOBUFS); rtm = mtod(m, struct rt_msghdr *); rtm->rtm_index = ifp->if_index; @@ -1370,7 +1398,7 @@ rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; - if (route_cb.any_count == 0) + if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); @@ -1381,7 +1409,7 @@ rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) * (similarly to how ARP entries, e.g., are presented). 
*/ info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr; - m = rt_msg1(cmd, &info); + m = rtsock_msg_mbuf(cmd, &info); if (m == NULL) return; ifmam = mtod(m, struct ifma_msghdr *); @@ -1399,10 +1427,10 @@ rt_makeifannouncemsg(struct ifnet *ifp, int type, int what, struct if_announcemsghdr *ifan; struct mbuf *m; - if (route_cb.any_count == 0) + if (V_route_cb.any_count == 0) return NULL; bzero((caddr_t)info, sizeof(*info)); - m = rt_msg1(type, info); + m = rtsock_msg_mbuf(type, info); if (m != NULL) { ifan = mtod(m, struct if_announcemsghdr *); ifan->ifan_index = ifp->if_index; @@ -1509,6 +1537,7 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) struct rtentry *rt = (struct rtentry *)rn; int error = 0, size; struct rt_addrinfo info; + struct sockaddr_storage ss; if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) return 0; @@ -1519,7 +1548,8 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; - info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), + rt_mask(rt), &ss); info.rti_info[RTAX_GENMASK] = 0; if (rt->rt_ifp) { info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr; @@ -1527,16 +1557,17 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) if (rt->rt_ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr; } - size = rt_msg2(RTM_GET, &info, NULL, w); + if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0) + return (error); if (w->w_req && w->w_tmem) { struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; - rtm->rtm_flags = rt->rt_flags; - /* - * let's be honest about this being a retarded hack - */ - rtm->rtm_fmask = rt->rt_rmx.rmx_pksent; - rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx); + if (rt->rt_flags & RTF_GWFLAG_COMPAT) + rtm->rtm_flags = RTF_GATEWAY | + (rt->rt_flags & ~RTF_GWFLAG_COMPAT); + else + rtm->rtm_flags = rt->rt_flags; + 
rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_index = rt->rt_ifp->if_index; rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; rtm->rtm_addrs = info.rti_addrs; @@ -1546,70 +1577,40 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) return (error); } -#ifdef COMPAT_FREEBSD32 -static void -copy_ifdata32(struct if_data *src, struct if_data32 *dst) -{ - - bzero(dst, sizeof(*dst)); - CP(*src, *dst, ifi_type); - CP(*src, *dst, ifi_physical); - CP(*src, *dst, ifi_addrlen); - CP(*src, *dst, ifi_hdrlen); - CP(*src, *dst, ifi_link_state); - dst->ifi_datalen = sizeof(struct if_data32); - CP(*src, *dst, ifi_mtu); - CP(*src, *dst, ifi_metric); - CP(*src, *dst, ifi_baudrate); - CP(*src, *dst, ifi_ipackets); - CP(*src, *dst, ifi_ierrors); - CP(*src, *dst, ifi_opackets); - CP(*src, *dst, ifi_oerrors); - CP(*src, *dst, ifi_collisions); - CP(*src, *dst, ifi_ibytes); - CP(*src, *dst, ifi_obytes); - CP(*src, *dst, ifi_imcasts); - CP(*src, *dst, ifi_omcasts); - CP(*src, *dst, ifi_iqdrops); - CP(*src, *dst, ifi_noproto); - CP(*src, *dst, ifi_hwassist); - CP(*src, *dst, ifi_epoch); - TV_CP(*src, *dst, ifi_lastchange); -} -#endif - static int sysctl_iflist_ifml(struct ifnet *ifp, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdrl *ifm; + struct if_data *ifd; + + ifm = (struct if_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdrl32 *ifm32; - ifm32 = (struct if_msghdrl32 *)w->w_tmem; + ifm32 = (struct if_msghdrl32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifm32->_ifm_spare1 = 0; ifm32->ifm_len = sizeof(*ifm32); ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data); - - copy_ifdata32(&ifp->if_data, &ifm32->ifm_data); - - return (SYSCTL_OUT(w->w_req, (caddr_t)ifm32, len)); - } + ifd = &ifm32->ifm_data; + } else #endif - ifm = (struct if_msghdrl *)w->w_tmem; - ifm->ifm_addrs = info->rti_addrs; - ifm->ifm_flags = ifp->if_flags | 
ifp->if_drv_flags; - ifm->ifm_index = ifp->if_index; - ifm->_ifm_spare1 = 0; - ifm->ifm_len = sizeof(*ifm); - ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data); + { + ifm->ifm_addrs = info->rti_addrs; + ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; + ifm->ifm_index = ifp->if_index; + ifm->_ifm_spare1 = 0; + ifm->ifm_len = sizeof(*ifm); + ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data); + ifd = &ifm->ifm_data; + } - ifm->ifm_data = ifp->if_data; + if_data_copy(ifp, ifd); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } @@ -1619,27 +1620,29 @@ sysctl_iflist_ifm(struct ifnet *ifp, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdr *ifm; + struct if_data *ifd; + + ifm = (struct if_msghdr *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdr32 *ifm32; - ifm32 = (struct if_msghdr32 *)w->w_tmem; + ifm32 = (struct if_msghdr32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; - - copy_ifdata32(&ifp->if_data, &ifm32->ifm_data); - - return (SYSCTL_OUT(w->w_req, (caddr_t)ifm32, len)); - } + ifd = &ifm32->ifm_data; + } else #endif - ifm = (struct if_msghdr *)w->w_tmem; - ifm->ifm_addrs = info->rti_addrs; - ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; - ifm->ifm_index = ifp->if_index; + { + ifm->ifm_addrs = info->rti_addrs; + ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; + ifm->ifm_index = ifp->if_index; + ifd = &ifm->ifm_data; + } - ifm->ifm_data = ifp->if_data; + if_data_copy(ifp, ifd); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } @@ -1649,12 +1652,15 @@ sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdrl *ifam; + struct if_data *ifd; + + ifam = (struct ifa_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct ifa_msghdrl32 *ifam32; - ifam32 = (struct ifa_msghdrl32 *)w->w_tmem; + ifam32 = (struct 
ifa_msghdrl32 *)ifam; ifam32->ifam_addrs = info->rti_addrs; ifam32->ifam_flags = ifa->ifa_flags; ifam32->ifam_index = ifa->ifa_ifp->if_index; @@ -1662,24 +1668,31 @@ sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info, ifam32->ifam_len = sizeof(*ifam32); ifam32->ifam_data_off = offsetof(struct ifa_msghdrl32, ifam_data); - ifam32->ifam_metric = ifa->ifa_metric; - - copy_ifdata32(&ifa->ifa_ifp->if_data, &ifam32->ifam_data); - - return (SYSCTL_OUT(w->w_req, (caddr_t)ifam32, len)); - } + ifam32->ifam_metric = ifa->ifa_ifp->if_metric; + ifd = &ifam32->ifam_data; + } else #endif + { + ifam->ifam_addrs = info->rti_addrs; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_index = ifa->ifa_ifp->if_index; + ifam->_ifam_spare1 = 0; + ifam->ifam_len = sizeof(*ifam); + ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data); + ifam->ifam_metric = ifa->ifa_ifp->if_metric; + ifd = &ifam->ifam_data; + } - ifam = (struct ifa_msghdrl *)w->w_tmem; - ifam->ifam_addrs = info->rti_addrs; - ifam->ifam_flags = ifa->ifa_flags; - ifam->ifam_index = ifa->ifa_ifp->if_index; - ifam->_ifam_spare1 = 0; - ifam->ifam_len = sizeof(*ifam); - ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data); - ifam->ifam_metric = ifa->ifa_metric; + bzero(ifd, sizeof(*ifd)); + ifd->ifi_datalen = sizeof(struct if_data); + ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets); + ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets); + ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes); + ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes); - ifam->ifam_data = ifa->if_data; + /* Fixup if_data carp(4) vhid. 
*/ + if (carp_get_vhid_p != NULL) + ifd->ifi_vhid = (*carp_get_vhid_p)(ifa); return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } @@ -1694,7 +1707,7 @@ sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info, ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; - ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_metric = ifa->ifa_ifp->if_metric; return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } @@ -1706,16 +1719,19 @@ sysctl_iflist(int af, struct walkarg *w) struct ifaddr *ifa; struct rt_addrinfo info; int len, error = 0; + struct sockaddr_storage ss; bzero((caddr_t)&info, sizeof(info)); - IFNET_RLOCK(); + IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa->ifa_addr; - len = rt_msg2(RTM_IFINFO, &info, NULL, w); + error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len); + if (error != 0) + goto done; info.rti_info[RTAX_IFP] = NULL; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) @@ -1732,9 +1748,12 @@ sysctl_iflist(int af, struct walkarg *w) ifa->ifa_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifa->ifa_addr; - info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; + info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( + ifa->ifa_addr, ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; - len = rt_msg2(RTM_NEWADDR, &info, NULL, w); + error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len); + if (error != 0) + goto done; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifaml(ifa, &info, @@ -1747,13 +1766,14 @@ sysctl_iflist(int af, struct walkarg *w) } } IF_ADDR_RUNLOCK(ifp); - info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] = - info.rti_info[RTAX_BRD] = NULL; + info.rti_info[RTAX_IFA] = NULL; + info.rti_info[RTAX_NETMASK] = NULL; + info.rti_info[RTAX_BRD] = NULL; } done: if (ifp != NULL) IF_ADDR_RUNLOCK(ifp); - 
IFNET_RUNLOCK(); + IFNET_RUNLOCK_NOSLEEP(); return (error); } @@ -1767,7 +1787,7 @@ sysctl_ifmalist(int af, struct walkarg *w) struct ifaddr *ifa; bzero((caddr_t)&info, sizeof(info)); - IFNET_RLOCK(); + IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; @@ -1784,7 +1804,9 @@ sysctl_ifmalist(int af, struct walkarg *w) info.rti_info[RTAX_GATEWAY] = (ifma->ifma_addr->sa_family != AF_LINK) ? ifma->ifma_lladdr : NULL; - len = rt_msg2(RTM_NEWMADDR, &info, NULL, w); + error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len); + if (error != 0) + goto done; if (w->w_req && w->w_tmem) { struct ifma_msghdr *ifmam; @@ -1802,7 +1824,7 @@ sysctl_ifmalist(int af, struct walkarg *w) IF_ADDR_RUNLOCK(ifp); } done: - IFNET_RUNLOCK(); + IFNET_RUNLOCK_NOSLEEP(); return (error); } @@ -1811,7 +1833,7 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; - struct radix_node_head *rnh = NULL; /* silence compiler. */ + struct rib_head *rnh = NULL; /* silence compiler. */ int i, lim, error = EINVAL; int fib = 0; u_char af; @@ -1852,6 +1874,14 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) error = sysctl_wire_old_buffer(req, 0); if (error) return (error); + + /* + * Allocate reply buffer in advance. + * All rtsock messages has maximum length of u_short. 
+ */ + w.w_tmemsize = 65536; + w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK); + switch (w.w_op) { case NET_RT_DUMP: @@ -1880,10 +1910,10 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) for (error = 0; error == 0 && i <= lim; i++) { rnh = rt_tables_get_rnh(fib, i); if (rnh != NULL) { - RADIX_NODE_HEAD_RLOCK(rnh); - error = rnh->rnh_walktree(rnh, + RIB_RLOCK(rnh); + error = rnh->rnh_walktree(&rnh->head, sysctl_dumpentry, &w); - RADIX_NODE_HEAD_RUNLOCK(rnh); + RIB_RUNLOCK(rnh); } else if (af != 0) error = EAFNOSUPPORT; } @@ -1898,8 +1928,8 @@ sysctl_rtsock(SYSCTL_HANDLER_ARGS) error = sysctl_ifmalist(af, &w); break; } - if (w.w_tmem) - free(w.w_tmem, M_RTABLE); + + free(w.w_tmem, M_TEMP); return (error); } @@ -1927,7 +1957,7 @@ static struct domain routedomain = { .dom_family = PF_ROUTE, .dom_name = "route", .dom_protosw = routesw, - .dom_protoswNPROTOSW = &routesw[sizeof(routesw)/sizeof(routesw[0])] + .dom_protoswNPROTOSW = &routesw[nitems(routesw)] }; VNET_DOMAIN_SET(route); diff --git a/freebsd/sys/net/sff8436.h b/freebsd/sys/net/sff8436.h new file mode 100644 index 00000000..3399cce5 --- /dev/null +++ b/freebsd/sys/net/sff8436.h @@ -0,0 +1,213 @@ +/*- + * Copyright (c) 2014 Yandex LLC. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * The following set of constants are from Document SFF-8436 + * "QSFP+ 10 Gbs 4X PLUGGABLE TRANSCEIVER" revision 4.8 dated October 31, 2013 + * + * This SFF standard defines the following QSFP+ memory address module: + * + * 1) 256-byte addressable block and 128-byte pages + * 2) Lower 128-bytes addresses always refer to the same page + * 3) Upper address space may refer to different pages depending on + * "page select" byte value. + * + * Map description: + * + * Serial address 0xA02: + * + * Lower bits + * 0-127 Monitoring data & page select byte + * 128-255: + * + * Page 00: + * 128-191 Base ID Fields + * 191-223 Extended ID + * 223-255 Vendor Specific ID + * + * Page 01 (optional): + * 128-255 App-specific data + * + * Page 02 (optional): + * 128-255 User EEPROM Data + * + * Page 03 (optional for Cable Assmeblies) + * 128-223 Thresholds + * 225-237 Vendor Specific + * 238-253 Channel Controls/Monitor + * 254-255 Reserverd + * + * All these values are read across an I2C (i squared C) bus. 
+ */ + +#define SFF_8436_BASE 0xA0 /* Base address for all requests */ + +/* Table 17 - Lower Memory Map */ +enum { + SFF_8436_MID = 0, /* Copy of SFF_8436_ID field */ + SFF_8436_STATUS = 1, /* 2-bytes status (Table 18) */ + SFF_8436_INTR_START = 3, /* Interrupt flags (Tables 19-21) */ + SFF_8436_INTR_END = 21, + SFF_8436_MODMON_START = 22, /* Module monitors (Table 22 */ + SFF_8436_TEMP = 22, /* Internally measured module temp */ + SFF_8436_VCC = 26, /* Internally mesasure module + * supplied voltage */ + SFF_8436_MODMON_END = 33, + SFF_8436_CHMON_START = 34, /* Channel monitors (Table 23) */ + SFF_8436_RX_CH1_MSB = 34, /* Internally measured RX input power */ + SFF_8436_RX_CH1_LSB = 35, /* for channel 1 */ + SFF_8436_RX_CH2_MSB = 36, /* Internally measured RX input power */ + SFF_8436_RX_CH2_LSB = 37, /* for channel 2 */ + SFF_8436_RX_CH3_MSB = 38, /* Internally measured RX input power */ + SFF_8436_RX_CH3_LSB = 39, /* for channel 3 */ + SFF_8436_RX_CH4_MSB = 40, /* Internally measured RX input power */ + SFF_8436_RX_CH4_LSB = 41, /* for channel 4 */ + SFF_8436_TX_CH1_MSB = 42, /* Internally measured TX bias */ + SFF_8436_TX_CH1_LSB = 43, /* for channel 1 */ + SFF_8436_TX_CH2_MSB = 44, /* Internally measured TX bias */ + SFF_8436_TX_CH2_LSB = 45, /* for channel 2 */ + SFF_8436_TX_CH3_MSB = 46, /* Internally measured TX bias */ + SFF_8436_TX_CH3_LSB = 47, /* for channel 3 */ + SFF_8436_TX_CH4_MSB = 48, /* Internally measured TX bias */ + SFF_8436_TX_CH4_LSB = 49, /* for channel 4 */ + SFF_8436_CHANMON_END = 81, + SFF_8436_CONTROL_START = 86, /* Control (Table 24) */ + SFF_8436_CONTROL_END = 97, + SFF_8436_MASKS_START = 100, /* Module/channel masks (Table 25) */ + SFF_8436_MASKS_END = 106, + SFF_8436_CHPASSWORD = 119, /* Password change entry (4 bytes) */ + SFF_8436_PASSWORD = 123, /* Password entry area (4 bytes) */ + SFF_8436_PAGESEL = 127, /* Page select byte */ +}; + +/* Table 18 - Status Indicators bits */ +/* Byte 1: all bits reserved */ + +/* Byte 2 bits */ 
+#define SFF_8436_STATUS_FLATMEM (1 << 2) /* Upper memory flat or paged + * 0 = paging, 1=Page 0 only */ +#define SFF_8436_STATUS_INTL (1 << 1) /* Digital state of the intL + * Interrupt output pin */ +#define SFF_8436_STATUS_NOTREADY 1 /* Module has not yet achieved + * power up and memory data is not + * ready. 0=data is ready */ +/* + * Upper page 0 definitions: + * Table 29 - Serial ID: Data fields. + * + * Note that this table is mostly the same as used in SFF-8472. + * The only differenee is address shift: +128 bytes. + */ +enum { + SFF_8436_ID = 128, /* Module Type (defined in sff8472.h) */ + SFF_8436_EXT_ID = 129, /* Extended transceiver type + * (Table 31) */ + SFF_8436_CONNECTOR = 130, /* Connector type (Table 32) */ + SFF_8436_TRANS_START = 131, /* Electric or Optical Compatibility + * (Table 33) */ + SFF_8436_CODE_E1040100G = 131, /* 10/40/100G Ethernet Compliance Code */ + SFF_8436_CODE_SONET = 132, /* SONET Compliance codes */ + SFF_8436_CODE_SATA = 133, /* SAS/SATA compliance codes */ + SFF_8436_CODE_E1G = 134, /* Gigabit Ethernet Compliant codes */ + SFF_8436_CODE_FC_START = 135, /* FC link/media/speed */ + SFF_8436_CODE_FC_END = 138, + SFF_8436_TRANS_END = 138, + SFF_8436_ENCODING = 139, /* Encoding Code for high speed + * serial encoding algorithm (see + * Table 34) */ + SFF_8436_BITRATE = 140, /* Nominal signaling rate, units + * of 100MBd. */ + SFF_8436_RATEID = 141, /* Extended RateSelect Compliance + * (see Table 35) */ + SFF_8436_LEN_SMF_KM = 142, /* Link length supported for single + * mode fiber, units of km */ + SFF_8436_LEN_OM3 = 143, /* Link length supported for 850nm + * 50um multimode fiber, units of 2 m */ + SFF_8436_LEN_OM2 = 144, /* Link length supported for 50 um + * OM2 fiber, units of 1 m */ + SFF_8436_LEN_OM1 = 145, /* Link length supported for 1310 nm + * 50um multi-mode fiber, units of 1m*/ + SFF_8436_LEN_ASM = 144, /* Link length of passive cable assembly + * Length is specified as in the INF + * 8074, units of 1m. 
0 means this is + * not value assembly. Value of 255 + * means thet the Module supports length + * greater than 254 m. */ + SFF_8436_DEV_TECH = 147, /* Device/transmitter technology, + * see Table 36/37 */ + SFF_8436_VENDOR_START = 148, /* Vendor name, 16 bytes, padded + * right with 0x20 */ + SFF_8436_VENDOR_END = 163, + SFF_8436_EXTMODCODE = 164, /* Extended module code, Table 164 */ + SFF_8436_VENDOR_OUI_START = 165 , /* Vendor OUI SFP vendor IEEE + * company ID */ + SFF_8436_VENDOR_OUI_END = 167, + SFF_8436_PN_START = 168, /* Vendor PN, padded right with 0x20 */ + SFF_8436_PN_END = 183, + SFF_8436_REV_START = 184, /* Vendor Revision, padded right 0x20 */ + SFF_8436_REV_END = 185, + SFF_8436_WAVELEN_START = 186, /* Wavelength Laser wavelength + * (Passive/Active Cable + * Specification Compliance) */ + SFF_8436_WAVELEN_END = 189, + SFF_8436_MAX_CASE_TEMP = 190, /* Allows to specify maximum temp + * above 70C. Maximum case temperature is + * an 8-bit value in Degrees C. A value + *of 0 implies the standard 70C rating.*/ + SFF_8436_CC_BASE = 191, /* CC_BASE Check code for Base ID + * Fields (first 63 bytes) */ + /* Extended ID fields */ + SFF_8436_OPTIONS_START = 192, /* Options Indicates which optional + * transceiver signals are + * implemented (see Table 39) */ + SFF_8436_OPTIONS_END = 195, + SFF_8436_SN_START = 196, /* Vendor SN, riwght padded with 0x20 */ + SFF_8436_SN_END = 211, + SFF_8436_DATE_START = 212, /* Vendor’s manufacturing date code + * (see Table 40) */ + SFF_8436_DATE_END = 219, + SFF_8436_DIAG_TYPE = 220, /* Diagnostic Monitoring Type + * Indicates which type of + * diagnostic monitoring is + * implemented (if any) in the + * transceiver (see Table 41) */ + + SFF_8436_ENHANCED = 221, /* Enhanced Options Indicates which + * optional features are implemented + * (if any) in the transceiver + * (see Table 42) */ + SFF_8636_BITRATE = 222, /* Nominal bit rate per channel, units + * of 250 Mbps */ + SFF_8436_CC_EXT = 223, /* Check code for the 
Extended ID + * Fields (bytes 192-222 incl) */ + SFF_8436_VENDOR_RSRVD_START = 224, + SFF_8436_VENDOR_RSRVD_END = 255, +}; + + diff --git a/freebsd/sys/net/sff8472.h b/freebsd/sys/net/sff8472.h new file mode 100644 index 00000000..5c50ea46 --- /dev/null +++ b/freebsd/sys/net/sff8472.h @@ -0,0 +1,508 @@ +/*- + * Copyright (c) 2013 George V. Neville-Neil + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +/* + * The following set of constants are from Document SFF-8472 + * "Diagnostic Monitoring Interface for Optical Transceivers" revision + * 11.3 published by the SFF Committee on June 11, 2013 + * + * The SFF standard defines two ranges of addresses, each 255 bytes + * long for the storage of data and diagnostics on cables, such as + * SFP+ optics and TwinAx cables. The ranges are defined in the + * following way: + * + * Base Address 0xa0 (Identification Data) + * 0-95 Serial ID Defined by SFP MSA + * 96-127 Vendor Specific Data + * 128-255 Reserved + * + * Base Address 0xa2 (Diagnostic Data) + * 0-55 Alarm and Warning Thresholds + * 56-95 Cal Constants + * 96-119 Real Time Diagnostic Interface + * 120-127 Vendor Specific + * 128-247 User Writable EEPROM + * 248-255 Vendor Specific + * + * Note that not all addresses are supported. Where support is + * optional this is noted and instructions for checking for the + * support are supplied. + * + * All these values are read across an I2C (i squared C) bus. Any + * device wishing to read these addresses must first have support for + * i2c calls. The Chelsio T4/T5 driver (dev/cxgbe) is one such + * driver. + */ + + +/* Table 3.1 Two-wire interface ID: Data Fields */ + +enum { + SFF_8472_BASE = 0xa0, /* Base address for all our queries. */ + SFF_8472_ID = 0, /* Transceiver Type (Table 3.2) */ + SFF_8472_EXT_ID = 1, /* Extended transceiver type (Table 3.3) */ + SFF_8472_CONNECTOR = 2, /* Connector type (Table 3.4) */ + SFF_8472_TRANS_START = 3, /* Elec or Optical Compatibility + * (Table 3.5) */ + SFF_8472_TRANS_END = 10, + SFF_8472_ENCODING = 11, /* Encoding Code for high speed + * serial encoding algorithm (see + * Table 3.6) */ + SFF_8472_BITRATE = 12, /* Nominal signaling rate, units + * of 100MBd. 
(see details for + * rates > 25.0Gb/s) */ + SFF_8472_RATEID = 13, /* Type of rate select + * functionality (see Table + * 3.6a) */ + SFF_8472_LEN_SMF_KM = 14, /* Link length supported for single + * mode fiber, units of km */ + SFF_8472_LEN_SMF = 15, /* Link length supported for single + * mode fiber, units of 100 m */ + SFF_8472_LEN_50UM = 16, /* Link length supported for 50 um + * OM2 fiber, units of 10 m */ + SFF_8472_LEN_625UM = 17, /* Link length supported for 62.5 + * um OM1 fiber, units of 10 m */ + SFF_8472_LEN_OM4 = 18, /* Link length supported for 50um + * OM4 fiber, units of 10m. + * Alternatively copper or direct + * attach cable, units of m */ + SFF_8472_LEN_OM3 = 19, /* Link length supported for 50 um OM3 fiber, units of 10 m */ + SFF_8472_VENDOR_START = 20, /* Vendor name [Address A0h, Bytes + * 20-35] */ + SFF_8472_VENDOR_END = 35, + SFF_8472_TRANS = 36, /* Transceiver Code for electronic + * or optical compatibility (see + * Table 3.5) */ + SFF_8472_VENDOR_OUI_START = 37, /* Vendor OUI SFP vendor IEEE + * company ID */ + SFF_8472_VENDOR_OUI_END = 39, + SFF_8472_PN_START = 40, /* Vendor PN */ + SFF_8472_PN_END = 55, + SFF_8472_REV_START = 56, /* Vendor Revision */ + SFF_8472_REV_END = 59, + SFF_8472_WAVELEN_START = 60, /* Wavelength Laser wavelength + * (Passive/Active Cable + * Specification Compliance) */ + SFF_8472_WAVELEN_END = 61, + SFF_8472_CC_BASE = 63, /* CC_BASE Check code for Base ID + * Fields (addresses 0 to 62) */ + +/* + * Extension Fields (optional) check the options before reading other + * addresses. 
+ */ + SFF_8472_OPTIONS_MSB = 64, /* Options Indicates which optional + * transceiver signals are + * implemented */ + SFF_8472_OPTIONS_LSB = 65, /* (see Table 3.7) */ + SFF_8472_BR_MAX = 66, /* BR max Upper bit rate margin, + * units of % (see details for + * rates > 25.0Gb/s) */ + SFF_8472_BR_MIN = 67, /* Lower bit rate margin, units of + * % (see details for rates > + * 25.0Gb/s) */ + SFF_8472_SN_START = 68, /* Vendor SN [Address A0h, Bytes 68-83] */ + SFF_8472_SN_END = 83, + SFF_8472_DATE_START = 84, /* Date code Vendor’s manufacturing + * date code (see Table 3.8) */ + SFF_8472_DATE_END = 91, + SFF_8472_DIAG_TYPE = 92, /* Diagnostic Monitoring Type + * Indicates which type of + * diagnostic monitoring is + * implemented (if any) in the + * transceiver (see Table 3.9) + */ + + SFF_8472_ENHANCED = 93, /* Enhanced Options Indicates which + * optional enhanced features are + * implemented (if any) in the + * transceiver (see Table 3.10) */ + SFF_8472_COMPLIANCE = 94, /* SFF-8472 Compliance Indicates + * which revision of SFF-8472 the + * transceiver complies with. (see + * Table 3.12)*/ + SFF_8472_CC_EXT = 95, /* Check code for the Extended ID + * Fields (addresses 64 to 94) + */ + + SFF_8472_VENDOR_RSRVD_START = 96, + SFF_8472_VENDOR_RSRVD_END = 127, + + SFF_8472_RESERVED_START = 128, + SFF_8472_RESERVED_END = 255 +}; + +#define SFF_8472_DIAG_IMPL (1 << 6) /* Required to be 1 */ +#define SFF_8472_DIAG_INTERNAL (1 << 5) /* Internal measurements. */ +#define SFF_8472_DIAG_EXTERNAL (1 << 4) /* External measurements. */ +#define SFF_8472_DIAG_POWER (1 << 3) /* Power measurement type */ +#define SFF_8472_DIAG_ADDR_CHG (1 << 2) /* Address change required. + * See SFF-8472 doc. */ + + /* + * Diagnostics are available at the two wire address 0xa2. All + * diagnostics are OPTIONAL so you should check 0xa0 registers 92 to + * see which, if any are supported. + */ + +enum {SFF_8472_DIAG = 0xa2}; /* Base address for diagnostics. 
*/ + + /* + * Table 3.15 Alarm and Warning Thresholds All values are 2 bytes + * and MUST be read in a single read operation starting at the MSB + */ + +enum { + SFF_8472_TEMP_HIGH_ALM = 0, /* Temp High Alarm */ + SFF_8472_TEMP_LOW_ALM = 2, /* Temp Low Alarm */ + SFF_8472_TEMP_HIGH_WARN = 4, /* Temp High Warning */ + SFF_8472_TEMP_LOW_WARN = 6, /* Temp Low Warning */ + SFF_8472_VOLTAGE_HIGH_ALM = 8, /* Voltage High Alarm */ + SFF_8472_VOLTAGE_LOW_ALM = 10, /* Voltage Low Alarm */ + SFF_8472_VOLTAGE_HIGH_WARN = 12, /* Voltage High Warning */ + SFF_8472_VOLTAGE_LOW_WARN = 14, /* Voltage Low Warning */ + SFF_8472_BIAS_HIGH_ALM = 16, /* Bias High Alarm */ + SFF_8472_BIAS_LOW_ALM = 18, /* Bias Low Alarm */ + SFF_8472_BIAS_HIGH_WARN = 20, /* Bias High Warning */ + SFF_8472_BIAS_LOW_WARN = 22, /* Bias Low Warning */ + SFF_8472_TX_POWER_HIGH_ALM = 24, /* TX Power High Alarm */ + SFF_8472_TX_POWER_LOW_ALM = 26, /* TX Power Low Alarm */ + SFF_8472_TX_POWER_HIGH_WARN = 28, /* TX Power High Warning */ + SFF_8472_TX_POWER_LOW_WARN = 30, /* TX Power Low Warning */ + SFF_8472_RX_POWER_HIGH_ALM = 32, /* RX Power High Alarm */ + SFF_8472_RX_POWER_LOW_ALM = 34, /* RX Power Low Alarm */ + SFF_8472_RX_POWER_HIGH_WARN = 36, /* RX Power High Warning */ + SFF_8472_RX_POWER_LOW_WARN = 38, /* RX Power Low Warning */ + + SFF_8472_RX_POWER4 = 56, /* Rx_PWR(4) Single precision + * floating point calibration data + * - Rx optical power. Bit 7 of + * byte 56 is MSB. Bit 0 of byte + * 59 is LSB. Rx_PWR(4) should be + * set to zero for “internally + * calibrated” devices. */ + SFF_8472_RX_POWER3 = 60, /* Rx_PWR(3) Single precision + * floating point calibration data + * - Rx optical power. Bit 7 of + * byte 60 is MSB. Bit 0 of byte 63 + * is LSB. Rx_PWR(3) should be set + * to zero for “internally + * calibrated” devices.*/ + SFF_8472_RX_POWER2 = 64, /* Rx_PWR(2) Single precision + * floating point calibration data, + * Rx optical power. Bit 7 of byte + * 64 is MSB, bit 0 of byte 67 is + * LSB. 
Rx_PWR(2) should be set to + * zero for “internally calibrated” + * devices. */ + SFF_8472_RX_POWER1 = 68, /* Rx_PWR(1) Single precision + * floating point calibration data, + * Rx optical power. Bit 7 of byte + * 68 is MSB, bit 0 of byte 71 is + * LSB. Rx_PWR(1) should be set to + * 1 for “internally calibrated” + * devices. */ + SFF_8472_RX_POWER0 = 72, /* Rx_PWR(0) Single precision + * floating point calibration data, + * Rx optical power. Bit 7 of byte + * 72 is MSB, bit 0 of byte 75 is + * LSB. Rx_PWR(0) should be set to + * zero for “internally calibrated” + * devices. */ + SFF_8472_TX_I_SLOPE = 76, /* Tx_I(Slope) Fixed decimal + * (unsigned) calibration data, + * laser bias current. Bit 7 of + * byte 76 is MSB, bit 0 of byte 77 + * is LSB. Tx_I(Slope) should be + * set to 1 for “internally + * calibrated” devices. */ + SFF_8472_TX_I_OFFSET = 78, /* Tx_I(Offset) Fixed decimal + * (signed two’s complement) + * calibration data, laser bias + * current. Bit 7 of byte 78 is + * MSB, bit 0 of byte 79 is + * LSB. Tx_I(Offset) should be set + * to zero for “internally + * calibrated” devices. */ + SFF_8472_TX_POWER_SLOPE = 80, /* Tx_PWR(Slope) Fixed decimal + * (unsigned) calibration data, + * transmitter coupled output + * power. Bit 7 of byte 80 is MSB, + * bit 0 of byte 81 is LSB. + * Tx_PWR(Slope) should be set to 1 + * for “internally calibrated” + * devices. */ + SFF_8472_TX_POWER_OFFSET = 82, /* Tx_PWR(Offset) Fixed decimal + * (signed two’s complement) + * calibration data, transmitter + * coupled output power. Bit 7 of + * byte 82 is MSB, bit 0 of byte 83 + * is LSB. Tx_PWR(Offset) should be + * set to zero for “internally + * calibrated” devices. */ + SFF_8472_T_SLOPE = 84, /* T (Slope) Fixed decimal + * (unsigned) calibration data, + * internal module temperature. Bit + * 7 of byte 84 is MSB, bit 0 of + * byte 85 is LSB. T(Slope) should + * be set to 1 for “internally + * calibrated” devices. 
*/ + SFF_8472_T_OFFSET = 86, /* T (Offset) Fixed decimal (signed + * two’s complement) calibration + * data, internal module + * temperature. Bit 7 of byte 86 is + * MSB, bit 0 of byte 87 is LSB. + * T(Offset) should be set to zero + * for “internally calibrated” + * devices. */ + SFF_8472_V_SLOPE = 88, /* V (Slope) Fixed decimal + * (unsigned) calibration data, + * internal module supply + * voltage. Bit 7 of byte 88 is + * MSB, bit 0 of byte 89 is + * LSB. V(Slope) should be set to 1 + * for “internally calibrated” + * devices. */ + SFF_8472_V_OFFSET = 90, /* V (Offset) Fixed decimal (signed + * two’s complement) calibration + * data, internal module supply + * voltage. Bit 7 of byte 90 is + * MSB. Bit 0 of byte 91 is + * LSB. V(Offset) should be set to + * zero for “internally calibrated” + * devices. */ + SFF_8472_CHECKSUM = 95, /* Checksum Byte 95 contains the + * low order 8 bits of the sum of + * bytes 0 – 94. */ + /* Internal measurements. */ + + SFF_8472_TEMP = 96, /* Internally measured module temperature. */ + SFF_8472_VCC = 98, /* Internally measured supply + * voltage in transceiver. + */ + SFF_8472_TX_BIAS = 100, /* Internally measured TX Bias Current. */ + SFF_8472_TX_POWER = 102, /* Measured TX output power. */ + SFF_8472_RX_POWER = 104, /* Measured RX input power. */ + + SFF_8472_STATUS = 110 /* See below */ +}; + /* Status Bits Described */ + +/* + * TX Disable State Digital state of the TX Disable Input Pin. Updated + * within 100ms of change on pin. + */ +#define SFF_8472_STATUS_TX_DISABLE (1 << 7) + +/* + * Select Read/write bit that allows software disable of + * laser. Writing ‘1’ disables laser. See Table 3.11 for + * enable/disable timing requirements. This bit is “OR”d with the hard + * TX_DISABLE pin value. Note, per SFP MSA TX_DISABLE pin is default + * enabled unless pulled low by hardware. If Soft TX Disable is not + * implemented, the transceiver ignores the value of this bit. Default + * power up value is zero/low. 
+ */ +#define SFF_8472_STATUS_SOFT_TX_DISABLE (1 << 6) + +/* + * RS(1) State Digital state of SFP input pin AS(1) per SFF-8079 or + * RS(1) per SFF-8431. Updated within 100ms of change on pin. See A2h + * Byte 118, Bit 3 for Soft RS(1) Select control information. + */ +#define SFF_8472_RS_STATE (1 << 5) + +/* + * Rate_Select State [aka. “RS(0)”] Digital state of the SFP + * Rate_Select Input Pin. Updated within 100ms of change on pin. Note: + * This pin is also known as AS(0) in SFF-8079 and RS(0) in SFF-8431. + */ +#define SFF_8472_STATUS_SELECT_STATE (1 << 4) + +/* + * Read/write bit that allows software rate select control. Writing + * ‘1’ selects full bandwidth operation. This bit is “OR’d with the + * hard Rate_Select, AS(0) or RS(0) pin value. See Table 3.11 for + * timing requirements. Default at power up is logic zero/low. If Soft + * Rate Select is not implemented, the transceiver ignores the value + * of this bit. Note: Specific transceiver behaviors of this bit are + * identified in Table 3.6a and referenced documents. See Table 3.18a, + * byte 118, bit 3 for Soft RS(1) Select. + */ +#define SFF_8472_STATUS_SOFT_RATE_SELECT (1 << 3) + +/* + * TX Fault State Digital state of the TX Fault Output Pin. Updated + * within 100ms of change on pin. + */ +#define SFF_8472_STATUS_TX_FAULT_STATE (1 << 2) + +/* + * Digital state of the RX_LOS Output Pin. Updated within 100ms of + * change on pin. + */ +#define SFF_8472_STATUS_RX_LOS (1 << 1) + +/* + * Indicates transceiver has achieved power up and data is ready. Bit + * remains high until data is ready to be read at which time the + * device sets the bit low. + */ +#define SFF_8472_STATUS_DATA_READY (1 << 0) + +/* + * Table 3.2 Identifier values. 
+ * Identifier constants has taken from SFF-8024 rev 2.9 table 4.1 + * (as referenced by table 3.2 footer) + * */ +enum { + SFF_8024_ID_UNKNOWN = 0x0, /* Unknown or unspecified */ + SFF_8024_ID_GBIC = 0x1, /* GBIC */ + SFF_8024_ID_SFF = 0x2, /* Module soldered to motherboard (ex: SFF)*/ + SFF_8024_ID_SFP = 0x3, /* SFP or SFP “Plus” */ + SFF_8024_ID_XBI = 0x4, /* 300 pin XBI */ + SFF_8024_ID_XENPAK = 0x5, /* Xenpak */ + SFF_8024_ID_XFP = 0x6, /* XFP */ + SFF_8024_ID_XFF = 0x7, /* XFF */ + SFF_8024_ID_XFPE = 0x8, /* XFP-E */ + SFF_8024_ID_XPAK = 0x9, /* XPAk */ + SFF_8024_ID_X2 = 0xA, /* X2 */ + SFF_8024_ID_DWDM_SFP = 0xB, /* DWDM-SFP */ + SFF_8024_ID_QSFP = 0xC, /* QSFP */ + SFF_8024_ID_QSFPPLUS = 0xD, /* QSFP+ */ + SFF_8024_ID_CXP = 0xE, /* CXP */ + SFF_8024_ID_HD4X = 0xF, /* Shielded Mini Multilane HD 4X */ + SFF_8024_ID_HD8X = 0x10, /* Shielded Mini Multilane HD 8X */ + SFF_8024_ID_QSFP28 = 0x11, /* QSFP28 */ + SFF_8024_ID_CXP2 = 0x12, /* CXP2 (aka CXP28) */ + SFF_8024_ID_CDFP = 0x13, /* CDFP (Style 1/Style 2) */ + SFF_8024_ID_SMM4 = 0x14, /* Shielded Mini Multilate HD 4X Fanout */ + SFF_8024_ID_SMM8 = 0x15, /* Shielded Mini Multilate HD 8X Fanout */ + SFF_8024_ID_CDFP3 = 0x16, /* CDFP (Style3) */ + SFF_8024_ID_LAST = SFF_8024_ID_CDFP3 + }; + +static const char *sff_8024_id[SFF_8024_ID_LAST + 1] = {"Unknown", + "GBIC", + "SFF", + "SFP/SFP+/SFP28", + "XBI", + "Xenpak", + "XFP", + "XFF", + "XFP-E", + "XPAK", + "X2", + "DWDM-SFP/SFP+", + "QSFP", + "QSFP+", + "CXP", + "HD4X", + "HD8X", + "QSFP28", + "CXP2", + "CDFP", + "SMM4", + "SMM8", + "CDFP3"}; + +/* Keep compatibility with old definitions */ +#define SFF_8472_ID_UNKNOWN SFF_8024_ID_UNKNOWN +#define SFF_8472_ID_GBIC SFF_8024_ID_GBIC +#define SFF_8472_ID_SFF SFF_8024_ID_SFF +#define SFF_8472_ID_SFP SFF_8024_ID_SFP +#define SFF_8472_ID_XBI SFF_8024_ID_XBI +#define SFF_8472_ID_XENPAK SFF_8024_ID_XENPAK +#define SFF_8472_ID_XFP SFF_8024_ID_XFP +#define SFF_8472_ID_XFF SFF_8024_ID_XFF +#define SFF_8472_ID_XFPE 
SFF_8024_ID_XFPE +#define SFF_8472_ID_XPAK SFF_8024_ID_XPAK +#define SFF_8472_ID_X2 SFF_8024_ID_X2 +#define SFF_8472_ID_DWDM_SFP SFF_8024_ID_DWDM_SFP +#define SFF_8472_ID_QSFP SFF_8024_ID_QSFP +#define SFF_8472_ID_LAST SFF_8024_ID_LAST + +#define sff_8472_id sff_8024_id + +/* + * Table 3.9 Diagnostic Monitoring Type (byte 92) + * bits described. + */ + +/* + * Digital diagnostic monitoring implemented. + * Set to 1 for transceivers implementing DDM. + */ +#define SFF_8472_DDM_DONE (1 << 6) + +/* + * Measurements are internally calibrated. + */ +#define SFF_8472_DDM_INTERNAL (1 << 5) + +/* + * Measurements are externally calibrated. + */ +#define SFF_8472_DDM_EXTERNAL (1 << 4) + +/* + * Received power measurement type + * 0 = OMA, 1 = average power + */ +#define SFF_8472_DDM_PMTYPE (1 << 3) + +/* Table 3.13 and 3.14 Temperature Conversion Values */ +#define SFF_8472_TEMP_SIGN (1 << 15) +#define SFF_8472_TEMP_SHIFT 8 +#define SFF_8472_TEMP_MSK 0xEF00 +#define SFF_8472_TEMP_FRAC 0x00FF + +/* Internal Callibration Conversion factors */ + +/* + * Represented as a 16 bit unsigned integer with the voltage defined + * as the full 16 bit value (0 – 65535) with LSB equal to 100 uVolt, + * yielding a total range of 0 to +6.55 Volts. + */ +#define SFF_8472_VCC_FACTOR 10000.0 + +/* + * Represented as a 16 bit unsigned integer with the current defined + * as the full 16 bit value (0 – 65535) with LSB equal to 2 uA, + * yielding a total range of 0 to 131 mA. + */ + +#define SFF_8472_BIAS_FACTOR 2000.0 + +/* + * Represented as a 16 bit unsigned integer with the power defined as + * the full 16 bit value (0 – 65535) with LSB equal to 0.1 uW, + * yielding a total range of 0 to 6.5535 mW (~ -40 to +8.2 dBm). 
+ */ + +#define SFF_8472_POWER_FACTOR 10000.0 diff --git a/freebsd/sys/net/vnet.h b/freebsd/sys/net/vnet.h index 01e26cdb..3e186c12 100644 --- a/freebsd/sys/net/vnet.h +++ b/freebsd/sys/net/vnet.h @@ -70,6 +70,7 @@ struct vnet { u_int vnet_magic_n; u_int vnet_ifcnt; u_int vnet_sockcnt; + u_int vnet_state; /* SI_SUB_* */ void *vnet_data_mem; uintptr_t vnet_data_base; }; @@ -85,6 +86,61 @@ struct vnet { #ifdef _KERNEL +#define VNET_PCPUSTAT_DECLARE(type, name) \ + VNET_DECLARE(counter_u64_t, name[sizeof(type) / sizeof(uint64_t)]) + +#define VNET_PCPUSTAT_DEFINE(type, name) \ + VNET_DEFINE(counter_u64_t, name[sizeof(type) / sizeof(uint64_t)]) + +#define VNET_PCPUSTAT_ALLOC(name, wait) \ + COUNTER_ARRAY_ALLOC(VNET(name), \ + sizeof(VNET(name)) / sizeof(counter_u64_t), (wait)) + +#define VNET_PCPUSTAT_FREE(name) \ + COUNTER_ARRAY_FREE(VNET(name), sizeof(VNET(name)) / sizeof(counter_u64_t)) + +#define VNET_PCPUSTAT_ADD(type, name, f, v) \ + counter_u64_add(VNET(name)[offsetof(type, f) / sizeof(uint64_t)], (v)) + +#define VNET_PCPUSTAT_FETCH(type, name, f) \ + counter_u64_fetch(VNET(name)[offsetof(type, f) / sizeof(uint64_t)]) + +#define VNET_PCPUSTAT_SYSINIT(name) \ +static void \ +vnet_##name##_init(const void *unused) \ +{ \ + VNET_PCPUSTAT_ALLOC(name, M_WAITOK); \ +} \ +VNET_SYSINIT(vnet_ ## name ## _init, SI_SUB_INIT_IF, \ + SI_ORDER_FIRST, vnet_ ## name ## _init, NULL) + +#define VNET_PCPUSTAT_SYSUNINIT(name) \ +static void \ +vnet_##name##_uninit(const void *unused) \ +{ \ + VNET_PCPUSTAT_FREE(name); \ +} \ +VNET_SYSUNINIT(vnet_ ## name ## _uninit, SI_SUB_INIT_IF, \ + SI_ORDER_FIRST, vnet_ ## name ## _uninit, NULL) + +#ifdef SYSCTL_OID +#define SYSCTL_VNET_PCPUSTAT(parent, nbr, name, type, array, desc) \ +static int \ +array##_sysctl(SYSCTL_HANDLER_ARGS) \ +{ \ + type s; \ + CTASSERT((sizeof(type) / sizeof(uint64_t)) == \ + (sizeof(VNET(array)) / sizeof(counter_u64_t))); \ + COUNTER_ARRAY_COPY(VNET(array), &s, sizeof(type) / sizeof(uint64_t));\ + if (req->newptr) \ 
+ COUNTER_ARRAY_ZERO(VNET(array), \ + sizeof(type) / sizeof(uint64_t)); \ + return (SYSCTL_OUT(req, &s, sizeof(type))); \ +} \ +SYSCTL_PROC(parent, nbr, name, CTLFLAG_VNET | CTLTYPE_OPAQUE | CTLFLAG_RW, \ + NULL, 0, array ## _sysctl, "I", desc) +#endif /* SYSCTL_OID */ + #ifdef VIMAGE #include <rtems/bsd/sys/lock.h> #include <sys/proc.h> /* for struct thread */ @@ -233,53 +289,6 @@ void vnet_data_copy(void *start, int size); void vnet_data_free(void *start_arg, int size); /* - * Sysctl variants for vnet-virtualized global variables. Include - * <sys/sysctl.h> to expose these definitions. - * - * Note: SYSCTL_PROC() handler functions will need to resolve pointer - * arguments themselves, if required. - */ -#ifdef SYSCTL_OID -int vnet_sysctl_handle_int(SYSCTL_HANDLER_ARGS); -int vnet_sysctl_handle_opaque(SYSCTL_HANDLER_ARGS); -int vnet_sysctl_handle_string(SYSCTL_HANDLER_ARGS); -int vnet_sysctl_handle_uint(SYSCTL_HANDLER_ARGS); - -#define SYSCTL_VNET_INT(parent, nbr, name, access, ptr, val, descr) \ - SYSCTL_OID(parent, nbr, name, \ - CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_VNET|(access), \ - ptr, val, vnet_sysctl_handle_int, "I", descr) -#define SYSCTL_VNET_PROC(parent, nbr, name, access, ptr, arg, handler, \ - fmt, descr) \ - CTASSERT(((access) & CTLTYPE) != 0); \ - SYSCTL_OID(parent, nbr, name, CTLFLAG_VNET|(access), ptr, arg, \ - handler, fmt, descr) -#define SYSCTL_VNET_OPAQUE(parent, nbr, name, access, ptr, len, fmt, \ - descr) \ - SYSCTL_OID(parent, nbr, name, \ - CTLTYPE_OPAQUE|CTLFLAG_VNET|(access), ptr, len, \ - vnet_sysctl_handle_opaque, fmt, descr) -#define SYSCTL_VNET_STRING(parent, nbr, name, access, arg, len, descr) \ - SYSCTL_OID(parent, nbr, name, \ - CTLTYPE_STRING|CTLFLAG_VNET|(access), \ - arg, len, vnet_sysctl_handle_string, "A", descr) -#define SYSCTL_VNET_STRUCT(parent, nbr, name, access, ptr, type, descr) \ - SYSCTL_OID(parent, nbr, name, \ - CTLTYPE_OPAQUE|CTLFLAG_VNET|(access), ptr, \ - sizeof(struct type), vnet_sysctl_handle_opaque, "S," #type, 
\ - descr) -#define SYSCTL_VNET_UINT(parent, nbr, name, access, ptr, val, descr) \ - SYSCTL_OID(parent, nbr, name, \ - CTLTYPE_UINT|CTLFLAG_MPSAFE|CTLFLAG_VNET|(access), \ - ptr, val, vnet_sysctl_handle_uint, "IU", descr) -#define VNET_SYSCTL_ARG(req, arg1) do { \ - if (arg1 != NULL) \ - arg1 = (void *)(TD_TO_VNET((req)->td)->vnet_data_base + \ - (uintptr_t)(arg1)); \ -} while (0) -#endif /* SYSCTL_OID */ - -/* * Virtual sysinit mechanism, allowing network stack components to declare * startup and shutdown methods to be run when virtual network stack * instances are created and destroyed. @@ -402,29 +411,6 @@ do { \ #define VNET(n) (n) /* - * When VIMAGE isn't compiled into the kernel, virtaulized SYSCTLs simply - * become normal SYSCTLs. - */ -#ifdef SYSCTL_OID -#define SYSCTL_VNET_INT(parent, nbr, name, access, ptr, val, descr) \ - SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) -#define SYSCTL_VNET_PROC(parent, nbr, name, access, ptr, arg, handler, \ - fmt, descr) \ - SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt, \ - descr) -#define SYSCTL_VNET_OPAQUE(parent, nbr, name, access, ptr, len, fmt, \ - descr) \ - SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr) -#define SYSCTL_VNET_STRING(parent, nbr, name, access, arg, len, descr) \ - SYSCTL_STRING(parent, nbr, name, access, arg, len, descr) -#define SYSCTL_VNET_STRUCT(parent, nbr, name, access, ptr, type, descr) \ - SYSCTL_STRUCT(parent, nbr, name, access, ptr, type, descr) -#define SYSCTL_VNET_UINT(parent, nbr, name, access, ptr, val, descr) \ - SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr) -#define VNET_SYSCTL_ARG(req, arg1) -#endif /* SYSCTL_OID */ - -/* * When VIMAGE isn't compiled into the kernel, VNET_SYSINIT/VNET_SYSUNINIT * map into normal sysinits, which have the same ordering properties. */ |